Case requirements:
1. Scrape Tencent's job-posting data (搜索 | 腾讯招聘), including job title + link + time + company name.
2. Scrape all pages (pagination).
3. Parse the data with jsonpath.
4. Save the data in two forms: a txt text file and an Excel file.
Analysis:
1. Is the site rendered synchronously or asynchronously? — Asynchronously (check the XHR tab in DevTools).
2. Find the correct data packet — inspect the response bodies.
3. Copy the request URL:
https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1727929418908&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=3&pageSize=10&language=zh-cn&area=cn
4. Strip the unnecessary query parameters down to the base endpoint (trimming is optional):
https://careers.tencent.com/tencentcareer/api/post/Query?
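A quick way to sanity-check step 4 is to let requests rebuild the query string from a params dict and compare it with the captured URL. A minimal sketch (the parameter values are copied from the URL above):

import requests

req = requests.Request(
    'GET',
    'https://careers.tencent.com/tencentcareer/api/post/Query?',
    params={'pageIndex': 3, 'pageSize': 10, 'language': 'zh-cn', 'area': 'cn'}
)
print(req.prepare().url)  # should match the captured URL minus the trimmed params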
5. The site's anti-scraping measures are fairly strong, so disguise the request:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
}
data = {
    'timestamp': '1648355434381',
    'countryId': '',
    'cityId': '',
    'bgIds': '',
    'productId': '',
    'categoryId': '',
    'parentCategoryId': '40001',
    'attrId': '',
    'keyword': '',
    'pageIndex': i,
    'pageSize': '10',
    'language': 'zh-cn',
    'area': 'cn'
}
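Before wiring everything up, a quick smoke test helps confirm the disguised request actually returns JSON. A minimal sketch reusing the headers and data dicts above, with pageIndex pinned to a concrete value (the top-level 'Code' field is an assumption about the response envelope and should be verified in DevTools):

import requests

url = 'https://careers.tencent.com/tencentcareer/api/post/Query?'
resp = requests.get(url, headers=headers, params=dict(data, pageIndex=1), timeout=10)
print(resp.status_code)         # expect 200
print(resp.json().get('Code'))  # assumed status field in the JSON envelope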
6. To save into an Excel file, create the workbook objects:
wb = Workbook()  # create the Excel workbook object
ws = wb.active   # activate the current sheet
ws.append(['职称', '链接', '时间', '公司名称'])  # header row: title, link, time, company
Saving to Excel:
def save_excel(z, l, s, g):
    my_list = [z, l, s, g]  # write the row as a list
    ws.append(my_list)
    wb.save('腾讯社招.xlsx')
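One design note: wb.save() rewrites the whole workbook on every row, which is fine for a small crawl but wasteful at scale. An equivalent variant (a sketch, reusing the wb and ws objects above) only appends inside the loop and saves once at the end:

def save_excel(z, l, s, g):
    ws.append([z, l, s, g])  # only append here

# ... then, after the paging loop finishes:
wb.save('腾讯社招.xlsx')      # write the file to disk once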
Saving to a local text file:
def save_text(n, u, t, p):
    with open('腾讯社招.txt', 'a', encoding='utf-8') as f:
        f.write(n + '\n')
        f.write(u + '\n')
        f.write(t + '\n')
        f.write(p + '\n')
7. Parse the data with jsonpath:
names = jsonpath(r, '$..RecruitPostName')
urls = jsonpath(r, '$..PostURL')
times = jsonpath(r, '$..LastUpdateTime')
pronames = jsonpath(r, '$..ProductName')
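For orientation, here is a trimmed, hypothetical sketch of the JSON shape these recursive-descent ($..) queries match against. The envelope keys (Code, Data, Posts) are assumptions inferred from the captured response, while the leaf field names come straight from the expressions above:

from jsonpath import jsonpath

sample = {
    'Code': 0,                     # assumed status field
    'Data': {                      # assumed envelope
        'Count': 8000,             # assumed total hit count
        'Posts': [{
            'RecruitPostName': 'Backend Engineer',
            'PostURL': 'http://careers.tencent.com/jobdesc.html?postId=...',
            'LastUpdateTime': 'October 03, 2024',
            'ProductName': 'WeChat'
        }]
    }
}
print(jsonpath(sample, '$..RecruitPostName'))       # ['Backend Engineer']
print(jsonpath(sample, '$.Data.Posts[*].PostURL'))  # same result via an explicit path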
8. Process the parsed data:
for name, url, time, proname in zip(names, urls, times, pronames):
    # print(name, url, time, proname)
    save_text(name, url, time, proname)
    save_excel(name, url, time, proname)
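One caveat: the jsonpath library returns False (not an empty list) when an expression matches nothing, so zip() would raise a TypeError on a bad response. A defensive variant guards before the loop:

if not all([names, urls, times, pronames]):
    print('Parsing failed - check the response body')  # jsonpath() returned False
else:
    for name, url, time, proname in zip(names, urls, times, pronames):
        save_text(name, url, time, proname)
        save_excel(name, url, time, proname)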
9. Pagination analysis — pageIndex is the only query parameter that changes from page to page, so loop over it:
for i in range(1, 6):
    # url = 'https://careers.tencent.com/search.html'
    data = {
        'timestamp': '1648355434381',
        'countryId': '',
        'cityId': '',
        'bgIds': '',
        'productId': '',
        'categoryId': '',
        'parentCategoryId': '40001',
        'attrId': '',
        'keyword': '',
        'pageIndex': i,  # the only parameter that changes between pages
        'pageSize': '10',
        'language': 'zh-cn',
        'area': 'cn'
    }
    # ... fetch, parse and save this page, then report progress:
    print("Page {} saved!".format(i))
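The hardcoded range(1, 6) only walks five pages, while requirement 2 asks for all of them. If the response carries a total hit count (the Data.Count field is an assumption here; check it against the captured JSON), the page count can be derived instead of guessed. A sketch reusing get_data() and the globals from the full example below:

import math

first_page = get_data()                    # fetch with pageIndex = 1 first
counts = jsonpath(first_page, '$..Count')  # assumed total-count field
if counts:
    total_pages = math.ceil(counts[0] / 10)  # pageSize is 10 per request
    print('pages to crawl:', total_pages)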
Full example code:
import requests
from jsonpath import jsonpath
from openpyxl import Workbook  # import the Workbook class directly
import time
#"http://careers.tencent.com/jobdesc.html?postId=1685827130673340416"
def get_data():
    # url, headers and data are module-level globals defined under __main__
    response = requests.get(url, headers=headers, params=data)
    r = response.json()
    return r
def parse_data(r):
    names = jsonpath(r, '$..RecruitPostName')
    urls = jsonpath(r, '$..PostURL')
    times = jsonpath(r, '$..LastUpdateTime')
    pronames = jsonpath(r, '$..ProductName')
    for name, url, time, proname in zip(names, urls, times, pronames):
        # print(name, url, time, proname)
        save_text(name, url, time, proname)
        save_excel(name, url, time, proname)
# save the data
def save_text(n, u, t, p):
    with open('腾讯社招.txt', 'a', encoding='utf-8') as f:
        f.write(n + '\n')
        f.write(u + '\n')
        f.write(t + '\n')
        f.write(p + '\n')
def save_excel(z, l, s, g):
    my_list = [z, l, s, g]  # write the row as a list
    ws.append(my_list)
    wb.save('腾讯社招.xlsx')
if __name__ == '__main__':
    wb = Workbook()  # create the Excel workbook object
    ws = wb.active   # activate the current sheet
    ws.append(['职称', '链接', '时间', '公司名称'])  # header: title, link, time, company
    url = 'https://careers.tencent.com/tencentcareer/api/post/Query?'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36'
    }
    for i in range(1, 6):
        # url = 'https://careers.tencent.com/search.html'
        data = {
            'timestamp': '1648355434381',
            'countryId': '',
            'cityId': '',
            'bgIds': '',
            'productId': '',
            'categoryId': '',
            'parentCategoryId': '40001',
            'attrId': '',
            'keyword': '',
            'pageIndex': i,
            'pageSize': '10',
            'language': 'zh-cn',
            'area': 'cn'
        }
        time.sleep(2)  # be polite: pause between requests
        h = get_data()
        parse_data(h)
        print("Page {} saved!".format(i))  # report progress after the page is saved
Run result: (output screenshot omitted)
You can also route the requests through a proxy.
Adding a proxy:
zhima_api = 'http://http.tiqu.letecs.com/getip3?num=1&type=1&pro=&city=0&yys=0&port=1&pack=225683&ts=0&ys=0&cs=0&lb=1&sb=0&pb=4&mr=1&regions=&gm=4'
proxie_ip = requests.get(zhima_api).json()['data'][0]
print(proxie_ip)
# shape the extracted IP into a dict to build the full HTTP proxy
proxies = {
    'http': 'http://' + str(proxie_ip['ip']) + ':' + str(proxie_ip['port']),
    # 'https': 'https://' + str(proxie_ip['ip']) + ':' + str(proxie_ip['port'])
}
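Building the proxies dict alone does nothing; it has to be passed to the request call. A minimal sketch reusing the url, headers and data defined earlier:

response = requests.get(url, headers=headers, params=data,
                        proxies=proxies, timeout=10)
print(response.status_code)  # expect 200 if the proxy IP is alive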