源码下载链接:ppt.rar - 蓝奏云
PPT下载链接:https://pan.baidu.com/s/1oOIO76xhSw283aHTDhBcPg?pwd=dydk
提取码:dydk
采集的参数
# Crawl configuration and running state. The crawl methods read and update
# these through ``self.<name>``.

# --- target site -----------------------------------------------------------
# Site root. A sample listing page is
# https://sc.chinaz.com/tag_ppt/zhongguofeng.html
base_url = "https://sc.chinaz.com/"
# First path segment of the listing URL (overridden per column by sprider()).
first_column_name = "ppt"
# Number of resources shown on one listing page.
max_pager = 20

# --- output ----------------------------------------------------------------
# Root directory that downloads are written under.
save_path = "D:\\Sprider\\ChinaZ\\"
# Display name of the column being crawled; filled in by sprider().
folder_name = ""

# --- limits ----------------------------------------------------------------
# How many resources to collect before stopping.
sprider_count = 110
# Ordinal of the first resource to collect. Just change the number: it is
# divided by max_pager to find the page to start on.
# (Sample debug line: 正在采集第32页的第16个资源)
sprider_start_count = 800

# --- running state ---------------------------------------------------------
# Listing page currently being crawled. (The original note here also quoted
# the site's meta tag: content="text/html; charset=gb2312".)
page_count = 1
# Number of resources collected so far.
haved_sprider_count = 0
# One [title, cover image path] record per successfully collected resource.
word_content_list = []
采集主体代码
def sprider(self, second_column_name):
    """Crawl the listing pages of one ChinaZ PPT column.

    Resolves the column's folder name and URL section, fetches the first
    listing page to discover the last page number, then walks the pages
    starting at the page that contains ``self.sprider_start_count`` and hands
    every page's items to ``self.sprider_detail``. Once
    ``self.haved_sprider_count`` reaches ``self.sprider_count`` the collected
    records are passed to ``self.builder_word`` and the walk stops.

    Side effects: updates ``self.folder_name``, ``self.first_column_name``,
    ``self.save_path`` (the "PPT/<count>个<folder>" suffix is appended) and
    ``self.page_count``.

    :param second_column_name: column slug used in the URL, e.g.
        "zhongguofeng", "xiaoqingxin" or "kejian". Any other value leaves
        ``folder_name`` / ``first_column_name`` unchanged.
    :return: None
    """
    # Known columns: (display folder name, URL section they are listed under).
    column_settings = {
        "zhongguofeng": ("中国风", "tag_ppt"),
        "xiaoqingxin": ("小清新", "tag_ppt"),
        "kejian": ("课件", "ppt"),
    }
    if second_column_name in column_settings:
        self.folder_name, self.first_column_name = column_settings[second_column_name]

    def fetch_soup(url):
        # Download one page and parse it; the response is decoded as UTF-8.
        response = requests.get(url, timeout=10, headers=UserAgent().get_random_header(self.base_url))
        response.encoding = 'UTF-8'
        return BeautifulSoup(response.text, "html5lib")

    def extract_items(page_soup):
        # Return the resource entries of a listing page, or an empty list when
        # the page has no 'ppt-list' container (find() returns None then).
        container = page_soup.find('div', attrs={"class": 'ppt-list'})
        if container is None:
            return []
        return container.find_all('div', attrs={"class": 'item'})

    # Page that contains the configured start ordinal (pages are 1-based).
    first_page = int(self.sprider_start_count) // int(self.max_pager) + 1
    second_folder_name = str(self.sprider_count) + "个" + self.folder_name
    self.save_path = self.save_path + os.sep + "PPT" + os.sep + second_folder_name
    BaseFrame().debug("开始采集ChinaZPPT...")

    # base_url may already end with "/"; strip it so the listing URL does not
    # contain "//" after the host.
    column_url = self.base_url.rstrip("/") + "/" + self.first_column_name + "/" + second_column_name
    soup = fetch_soup(column_url + ".html")
    div_list = extract_items(soup)

    # The pager link just before the "nextpage" link carries the last page
    # number, e.g. <a href="zhongguofeng_89.html"><b>89</b></a>. When the pager
    # is missing or has an unexpected shape, treat the column as one page.
    page_end_number = 1
    next_link = soup.find('a', attrs={"class": 'nextpage'})
    if next_link is not None:
        # find_previous_sibling skips text nodes between the links.
        last_page_link = next_link.find_previous_sibling('a')
        last_page_label = last_page_link.find('b') if last_page_link is not None else None
        if last_page_label is not None and last_page_label.string is not None:
            label_text = str(last_page_label.string).strip()
            if label_text.isdigit():
                page_end_number = int(label_text)

    self.page_count = first_page
    while self.page_count <= page_end_number:  # stop after the last page
        try:
            if self.page_count == 1:
                # Page 1 was already downloaded above.
                self.sprider_detail(div_list, self.page_count, page_end_number)
            elif self.haved_sprider_count == self.sprider_count:
                BaseFrame().debug("采集到达数量采集停止...")
                BaseFrame().debug("开始写文章...")
                self.builder_word(self.folder_name, self.save_path, self.word_content_list)
                BaseFrame().debug("文件编写完毕,请到对应的磁盘查看word文件和下载文件!")
                break
            else:
                # Later pages are named <column>_<page>.html.
                next_url = "{0}_{1}.html".format(column_url, self.page_count)
                div_list = extract_items(fetch_soup(next_url))
                self.sprider_detail(div_list, self.page_count, page_end_number)
        except Exception as e:
            # Best effort: report the failure and move on to the next page.
            print("sprider()执行过程出现错误" + str(e))
        self.page_count = self.page_count + 1  # advance to the next page
def sprider_detail(self, element_list, page_count, max_page):
    """Download the resources listed on one listing page.

    For each entry the detail page is fetched, the download link and cover
    image are located, and the file is downloaded through ``DownLoad``. Every
    successful download appends a ``[title, cover image path]`` record to
    ``self.word_content_list`` and increments ``self.haved_sprider_count``.
    Detail pages without a 'download-url' block are skipped (the log message
    attributes this to paid resources). Processing stops early once
    ``self.haved_sprider_count`` equals ``self.sprider_count``.

    When ``page_count`` equals ``max_page`` the collected records are passed
    to ``self.builder_word`` after the page has been processed.

    Any exception is caught and printed, which abandons the remainder of the
    page (best-effort behaviour kept from the original code).

    :param element_list: sequence of listing entries (BeautifulSoup tags).
    :param page_count: 1-based number of the page ``element_list`` came from.
    :param max_page: number of the last listing page.
    :return: None
    """
    try:
        element_length = len(element_list)
        page_size = int(self.max_pager)
        start_count = int(self.sprider_start_count)
        # Offset of the start ordinal inside its page; kept on self as before.
        self.sprider_start_index = start_count % page_size
        # The offset only makes sense on the page that contains the start
        # ordinal. Every later page must be read from its first entry,
        # otherwise its leading entries would be skipped.
        first_page = start_count // page_size + 1
        if int(page_count) == first_page:
            index = self.sprider_start_index
        else:
            index = 0
        while index < element_length:
            item = element_list[index]
            if self.haved_sprider_count == self.sprider_count:
                BaseFrame().debug("采集到达数量采集停止...")
                break
            # Advance before any `continue` below so the loop always progresses;
            # from here on `index` is the 1-based position used in the log.
            index = index + 1
            sprider_info = "正在采集第" + str(page_count) + "页的第" + str(index) + "个资源"
            BaseFrame().debug(sprider_info)
            title_image_obj = item.find('img', attrs={"class": 'lazy'})
            url_a_obj = item.find('a', attrs={"class": 'name'})
            next_url = self.base_url + url_a_obj.get("href")
            coder_title = title_image_obj.get("alt")
            response = requests.get(next_url, timeout=10, headers=UserAgent().get_random_header(self.base_url))
            response.encoding = 'UTF-8'
            soup = BeautifulSoup(response.text, "html5lib")
            down_load_file_div = soup.find('div', attrs={"class": 'download-url'})
            if down_load_file_div is None:
                BaseFrame().debug("需要花钱无法下载因此跳过哦....")
                continue
            down_load_file_url = down_load_file_div.find('a').get("href")
            image_obj = soup.find('div', attrs={"class": "one-img-box"}).find('img')
            # The cover URL is concatenated after "https:".
            image_src = "https:" + image_obj.get("data-original")
            if DownLoad(self.save_path).__down_load_file__(down_load_file_url, coder_title, self.folder_name):
                DownLoad(self.save_path).down_cover_image__(image_src, coder_title)  # cover image of the resource
                # Record of a successfully collected resource.
                sprider_content = [coder_title,
                                   self.save_path + os.sep + "image" + os.sep + coder_title + ".jpg"]
                self.word_content_list.append(sprider_content)
                self.haved_sprider_count = self.haved_sprider_count + 1
                BaseFrame().debug("已经采集完成第" + str(self.haved_sprider_count) + "个")
        # Last page reached: write out whatever has been collected.
        if int(page_count) == int(max_page):
            self.builder_word(self.folder_name, self.save_path, self.word_content_list)
            BaseFrame().debug("文件编写完毕,请到对应的磁盘查看word文件和下载文件!")
    except Exception as e:
        print("sprider_detail:" + str(e))
采集的文件名
初中化学实验课件ppt模板
开学第一课开学季ppt模板设计
大学生情绪压力管理ppt模板课件
简约风格幼小衔接ppt课件免费下载
高考填报志愿课件免费ppt模板下载
岳阳楼记教学设计ppt课件
岳阳楼记ppt课件免费下载第3课时
岳阳楼记ppt课件免费下载第2课时
岳阳楼记ppt课件免费下载第1课时
岳阳楼记译文ppt课件