一、数据来源分析
1. 明确需求——网址:https://v.qq.com/x/cover/mzc002006n62s11/e0045s2g2eg.html,数据:视频。
2. 抓包分析——按 F12 打开开发者工具,刷新网站,搜索关键字找到对应的数据位置(M3U8 播放列表),对应接口:https://vd6.l.qq.com/proxyhttp
二、代码实现步骤
发送请求 获取数据 解析数据 保存数据
三、正常爬取代码
import requests
import re
from pprint import pprint
import json
# HTTP request headers copied from the browser's DevTools for the
# v.qq.com `proxyhttp` request. The `cookie` and `referer` entries are
# session-specific values captured at recording time — presumably the
# server rejects or degrades responses without them (TODO: confirm which
# headers are actually required).
headers = {
'accept': '*/*',
'accept-language': 'zh-CN,zh;q=0.9',
'content-type': 'text/plain;charset=UTF-8',
# Session cookie captured from the browser; will expire and need refreshing.
'cookie': 'qq_domain_video_guid_verify=507c92025bbf667c; vversion_name=8.2.95; video_omgid=507c92025bbf667c; _qimei_uuid42=1881811391a10031d240126f353cd310f5cbf9f774; pgv_pvid=1833209344; _qimei_fingerprint=28e27f13e024b4e72ed41740c05847af; _qimei_q36=; _qimei_h38=2db8f8c1d240126f353cd31002000004d18818; o_minduid=fEZEgSlaTS1EadavpD_tVESHLMB8xDIk; appuser=6E4931CCE5CB1885; pgv_info=ssid=s6672934747; LPDFturn=798; LPPBturn=773; LKBturn=898; LPLFturn=175; LPSJturn=904; LBSturn=975; LVINturn=259; LPHLSturn=587; LHTturn=445; Lturn=172; LPVLturn=760; LZTturn=929; LDERturn=900',
'referer': 'https://v.qq.com/',
'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'empty',
'sec-fetch-mode': 'cors',
'sec-fetch-site': 'same-site',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
}
import os

# NOTE(review): `data` was referenced below but never defined in the
# original, so the script crashed with a NameError. The proxyhttp
# endpoint expects the JSON request payload captured from DevTools
# (the POST body of the `proxyhttp` request) — paste it here.
data = None  # TODO: paste the captured proxyhttp POST body (a JSON string)
if data is None:
    raise SystemExit('Fill in `data` with the proxyhttp POST payload captured from DevTools.')

# Fetch the video metadata. The response is JSON, and its `vinfo` field
# is itself a JSON-encoded string, hence the second json.loads pass.
response = requests.post('https://vd6.l.qq.com/proxyhttp', headers=headers, data=data)
context = response.json()
voad = context['vinfo']
voaddata = json.loads(voad)
pprint(voaddata)

# ul.ui lists mirror URLs for the playlist; the last entry is used here
# (presumably the preferred CDN — confirm against the captured traffic).
m3u8_url = voaddata['vl']['vi'][0]['ul']['ui'][-1]['url']
print(m3u8_url)

# Download the m3u8 playlist and extract the .ts segment names: each
# segment line sits between an EXTINF line (ending with ',') and the
# next '#' directive.
m3u8 = requests.get(url=m3u8_url, headers=headers).text
ts_list = re.findall(r",\n(.*?)\n#", m3u8, re.S)

# Base URL the segment names are relative to (captured from DevTools).
tsbase = 'https://defaultts.tc.qq.com/vipts.tc.qq.com/AF8H7lDpwYdrycx9TJOH7gFbhFzqHlzIaYXsBfV6ujAk/B_efeEBb4uHJ8TOTkZIB0ooo9N3FBiMa0i4vWoGPCx-MBjTgrF9Gkl8W5Wef7bGPCS/svp_50112/_DGlnT9NscaHvtg1cgDcBjlfH5xG2zUzW2NmMUNUtynTPxeoaA7MHYiUuyjty5_8rEA4fNZmxeV0BI4I-oVG76Z6jQAZmEc7f-gBRCo3zh6gBCRuZBAS-cTfa-9KnKAffogGYhcqpJxmlBE2_oWxeI2io5c8KWb0XX8bWB36KEQykgiHkds7KKMoUe6eWuhYM4l6JW-96NpM0gB1AByhKVjypXNwxYqZVsc1uo2kq2_AT6GZ-w0p_Q/'

# Ensure the output directory exists (the original crashed if ./video
# was missing), then download and append each segment in playlist order;
# concatenated MPEG-TS segments form a playable file.
os.makedirs('./video', exist_ok=True)
for ts in ts_list:
    ts_url = tsbase + ts
    print(ts_url)
    res1 = requests.get(url=ts_url)
    with open('./video/第2话.mp4', 'ab') as f:
        f.write(res1.content)
四、自动化爬取视频
import requests
import re
import time
import tqdm #进度条
from pprint import pprint
import json
import pandas as pd
"""
一、数据来源分析
网址:https://v.qq.com/x/cover/mzc002006n62s11/e0045s2g2eg.html
https://vd6.l.qq.com/proxyhttp
"""
from DrissionPage import ChromiumPage,ChromiumOptions
import os

# Drive a real Chrome instance so the page itself performs the signed
# proxyhttp request, then intercept that response instead of forging it.
co = ChromiumOptions().set_paths(browser_path=r"C:\Users\lenovo\AppData\Local\Google\Chrome\Application\chrome.exe")
page = ChromiumPage(co)
page.listen.start('proxyhttp')  # capture any request whose URL contains 'proxyhttp'
page.get("https://v.qq.com/x/cover/mzc002006n62s11/z0045grsctu.html")
time.sleep(10)  # give the player time to issue the metadata request
res = page.listen.wait()
context = res.response.body

# `vinfo` is a JSON-encoded string nested inside the JSON body.
voad = context['vinfo']
voaddata = json.loads(voad)
# pprint(voaddata)

# ul.ui lists mirror URLs for the playlist; the last entry is used here.
m3u8_url = voaddata['vl']['vi'][0]['ul']['ui'][-1]['url']
# Segment names inside the playlist are relative to the playlist's directory.
tsbase = '/'.join(m3u8_url.split('/')[:-1]) + '/'

# Download the playlist and extract the .ts segment names: each segment
# line sits between an EXTINF line (ending with ',') and the next '#'.
m3u8 = requests.get(url=m3u8_url).text
print(m3u8)
ts_list = re.findall(r",\n(.*?)\n#", m3u8, re.S)

# Ensure the output directory exists (the original crashed if ./video
# was missing), then append each segment in playlist order — concatenated
# MPEG-TS segments form a playable file.
os.makedirs('./video', exist_ok=True)
for ts in ts_list:
    ts_url = tsbase + ts
    print(ts_url)
    res1 = requests.get(url=ts_url)
    with open('./video/第1话.mp4', 'ab') as f:
        f.write(res1.content)