爬取结果:
热播榜:
动画片:
电视剧:
纪录片:
特别节目:
代码部分:
import re
import pymongo
import requests
# Scraper for the CCTV "top" ranking page.
#
# The page is downloaded once, split into its <ul>...</ul> lists, and each
# requested ranking list is parsed with a regular expression and stored as a
# single MongoDB document of the form {"name": ..., "items": [...]}.

PAGE_URL = 'https://tv.cctv.com/top/index.shtml?spm=C28340.PdNvWY0LYxCP.EtmP5mypaGE4.11'
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get() may block forever
DATABASE_NAME = "cctv"

# Regex fragments, one capture group each, shared by the per-section patterns.
_IMG = r'<li.*?lazy="(.*?)"'
_NUMBER = (
    r'<span class="number"><i class="icon_l"></i>'
    r'<i class="txt">(.*?)</i><i class="icon_r"></i></span>'
)
_TITLE = r'<div class="text"><a href=".*?" target="_blank">(.*?)</a></div>'
_SUMMARY = r'<p><a href=".*?" target="_blank">(.*?)</a></p>'
_COLUMN = (
    r'<div class="column"><i class="icon_l"></i>'
    r'<a href=".*?" target="_blank">(.*?)</a><i class="icon_r"></i>.*?</div>'
)


def _item_pattern(*fragments):
    """Join single-field fragments into one item pattern with lazy gaps."""
    return ".*?".join(fragments)


# One entry per ranking list. The dict key doubles as the MongoDB collection
# name; "index" is the position of the list among the page's <ul> blocks;
# "fields" names the pattern's capture groups in order.
SECTIONS = {
    "hot_show": {
        "index": 1,
        "name": "热播榜",
        "fields": ("img", "title", "category"),
        "pattern": _item_pattern(_IMG, _TITLE, _COLUMN),
    },
    "donghua": {
        "index": 2,
        "name": "动画片",
        "fields": ("img", "title", "type"),
        "pattern": _item_pattern(_IMG, _TITLE, _SUMMARY),
    },
    # NOTE(review): the previous (commented-out) code for this section stored
    # only the first three of the four captured groups, under the keys
    # "jishu", "title", "jieshao", which put the image URL under "jishu".
    # The mapping below aligns one key per group -- confirm before enabling.
    "dianshiju": {
        "index": 3,
        "name": "电视剧",
        "fields": ("img", "jishu", "title", "jieshao"),
        "pattern": _item_pattern(_IMG, _NUMBER, _TITLE, _SUMMARY),
    },
    # NOTE(review): the previous (commented-out) code spelled the last key
    # "jiehsao"; it is normalised to "jieshao" here -- confirm that no
    # existing consumer depends on the misspelled key.
    "jilupian": {
        "index": 4,
        "name": "纪录片",
        "fields": ("img", "type", "name", "jieshao"),
        "pattern": _item_pattern(_IMG, _NUMBER, _TITLE, _SUMMARY),
    },
    "tebie": {
        "index": 5,
        "name": "特别",
        "fields": ("img", "pindao", "title", "jieshao"),
        "pattern": _item_pattern(_IMG, _NUMBER, _TITLE, _SUMMARY),
    },
}

# The only section the script scraped before it was made table-driven.
DEFAULT_SECTIONS = ("tebie",)


def fetch_page(url=PAGE_URL, timeout=REQUEST_TIMEOUT):
    """Download the ranking page and return its body decoded as UTF-8.

    Raises:
        requests.RequestException: on connection problems, timeout, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of running the regexes over an error page.
    response.raise_for_status()
    return response.content.decode("utf8")


def parse_section(html, key="tebie"):
    """Extract one ranking list from the page HTML.

    Args:
        html: Full page source as text.
        key: A key of ``SECTIONS`` selecting which list to parse.

    Returns:
        ``{"name": <section name>, "items": [<dict per entry>, ...]}`` where
        every item maps the section's field names to the captured strings.
        ``items`` is empty when the list contains no matching entries.

    Raises:
        KeyError: if ``key`` is not in ``SECTIONS``.
        ValueError: if the page has fewer ``<ul>`` blocks than expected.
    """
    spec = SECTIONS[key]
    # re.S lets "." match newlines, since each list spans many lines.
    lists = re.findall(r"<ul>.*?</ul>", html, re.S)
    if len(lists) <= spec["index"]:
        raise ValueError(
            "expected at least %d <ul> blocks for section %r, found %d"
            % (spec["index"] + 1, key, len(lists))
        )
    rows = re.findall(spec["pattern"], lists[spec["index"]], re.S)
    return {
        "name": spec["name"],
        "items": [dict(zip(spec["fields"], row)) for row in rows],
    }


def save_section(result, collection_name, database_name=DATABASE_NAME):
    """Insert ``result`` as one document into the given MongoDB collection.

    The client is created with default connection settings and is closed even
    if the insert fails.
    """
    with pymongo.MongoClient() as client:
        database = client.get_database(database_name)
        database.get_collection(collection_name).insert_one(result)


def main(section_keys=DEFAULT_SECTIONS):
    """Fetch the page once, then parse and store every requested section."""
    html = fetch_page()
    for key in section_keys:
        save_section(parse_section(html, key), key)


if __name__ == "__main__":
    main()