需求:
1.利用requests库爬取i.news.qq.com网站的数据(包括新闻标题和对应链接)
2.实现翻页的爬取
3.将爬取下来的数据保存在excel文件中
4.利用jsonpath来解析获取的数据
5.使用openpyxl库处理 Excel 文件
注意:
1.如果报以下错误:
AttributeError: module 'numpy' has no attribute 'short'
且代码中并不需要用到numpy这个模块,则更新openpyxl即可:
pip install --upgrade openpyxl
2.因为数据是实时更新的,虽然共有161页,但最后一页往往没有数据,这会导致解析数据时出错,所以需要添加异常处理。
获取网页中
try:
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
    r = requests.get(url, headers=headers, params=data)
    if r.status_code == 200:
        return r.json()
    else:
        print(f"请求失败,状态码:{r.status_code}")
        return None
except Exception as e:
    print(f"请求异常:{e}")
    return None
解析中
try:
    title = jsonpath(data, '$..title')
    url = jsonpath(data, '$..url')
    for titles, urls in zip(title, url):
        save_data(titles, urls)
except Exception as e:
    print(f"解析异常:{e}")
    sys.exit(1)
示例代码:
import requests
from jsonpath import jsonpath
from openpyxl import Workbook
import sys
# Workbook shared by the whole script; save_data() appends rows to its
# active worksheet.
wb = Workbook()
ws = wb.active
# First row is the column header: "title" and "link".
ws.append(('标题', '链接'))
def get_data(url, data, timeout=10):
    """Fetch one page of results and return the decoded JSON body.

    Args:
        url: Endpoint the GET request is sent to.
        data: Query-string parameters, forwarded to ``requests.get`` as
            ``params``.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on an
            unresponsive server.

    Returns:
        The parsed JSON body when the response status is 200. ``None`` when
        the status is anything else, the request itself fails, or the body
        cannot be decoded as JSON; a message is printed in those cases.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
    try:
        r = requests.get(url, headers=headers, params=data, timeout=timeout)
        if r.status_code != 200:
            print(f"请求失败,状态码:{r.status_code}")
            return None
        return r.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection failures and timeouts;
        # ValueError covers a response body that is not valid JSON.
        print(f"请求异常:{e}")
        return None
def save_data(title, link):
    """Append one ``[title, link]`` row to the module-level worksheet ``ws``."""
    row = [title, link]
    ws.append(row)
def parse_data(data):
    """Extract title/url pairs from a decoded response and store each one.

    Every ``title`` and every ``url`` value found anywhere in ``data`` is
    collected with a recursive JSONPath query, then the two result lists
    are paired positionally and each pair is handed to ``save_data``.

    A page that yields no titles or no urls is skipped with a message
    instead of terminating the script, because an empty page is an
    expected condition rather than an error.

    Args:
        data: Decoded JSON response (as returned by ``get_data``).

    Raises:
        SystemExit: With status 1 if an unexpected error occurs while
            parsing or saving.
    """
    try:
        titles = jsonpath(data, '$..title')
        urls = jsonpath(data, '$..url')
        # jsonpath() returns False (not an empty list) when nothing matches,
        # and False cannot be passed to zip().
        if not titles or not urls:
            print("未解析到数据,跳过该页")
            return
        # NOTE(review): the two queries are independent, so pairs only line
        # up if every matched object carries both keys -- confirm against
        # the actual API response.
        for title, url in zip(titles, urls):
            save_data(title, url)
    except Exception as e:
        print(f"解析异常:{e}")
        sys.exit(1)
if __name__ == '__main__':
    url = "https://i.news.qq.com/trpc.qqnews_web.kv_srv.kv_srv_http_proxy/list"
    try:
        # Offsets 0, 20, ..., 160: one request per page of 20 items.
        for i in range(0, 161, 20):
            print(f'当前翻页参数为{i}')
            data = {
                'sub_srv_id': '24hours',
                'srv_id': 'pc',
                'offset': i,
                'limit': '20',
                'strategy': '1',
                'ext': '{"pool":["top","hot"],"is_filter":7,"check_type":true}',
            }
            data_json = get_data(url, data)
            if data_json:
                parse_data(data_json)
    finally:
        # parse_data() terminates the process with sys.exit() on a parsing
        # error; saving in ``finally`` keeps the rows collected so far
        # instead of discarding them.
        wb.save('腾讯新闻.xlsx')
运行结果: