通过输入(或文件导入)公众号名称,即可爬取该公众号所有历史文章。
通过微信公众平台官方网站(mp.weixin.qq.com)的接口来获取数据:在后台搜索公众号时打开浏览器的开发者工具,可以发现一个 `searchbiz` 请求。
打开搜索请求(`searchbiz`)的响应后,可以在搜索结果中找到 fakeid,这是每个公众号的标识。
点击某个公众号后,会出现一个 `appmsgpublish` 请求。
这是具体公众号文章信息,它需要的就是fakeid,其中token是你自己公众号独有的。
打开文章列表请求(`appmsgpublish`)的响应后,可以看到文章数据位于 `publish_page` 字段中,其中包含各篇文章的 link。
这样就可以通过正则表达式找到所有的 link,再将其中用于转义的反斜杠 `\` 去掉,即可得到所有文章的链接。至于具体爬取文章的哪些内容,由于各篇文章的 HTML 格式各不相同,需要根据实际页面自行处理。
其中header和cookie按照自己浏览器填写即可。
"""Look up a WeChat official account by name and list links to its articles.

The script talks to the mp.weixin.qq.com backend endpoints ``searchbiz`` and
``appmsgpublish`` using a logged-in browser session (token + cookie).
"""
from concurrent.futures import ThreadPoolExecutor
import json
import os
import re
import sys
import time
from urllib.parse import quote, unquote

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests

# NOTE(security): the token and cookie below are credentials of one specific
# logged-in session. Replace them with the values from your own browser and
# avoid committing real ones.
TOKEN = '726980468'
HEADER = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0'}
COOKIE = 'RK=dp90GcuG9p; ptcz=d746c192ffbb523199183ab352d5fa9e9c910a4f3b54760f12b45adc13ae240b; qq_domain_video_guid_verify=990a13b1b7b9eee5; _qimei_uuid42=1811d111e17100b9c82a6d970b40848157bb8f5a7f; pgv_pvid=4349444734; _qimei_fingerprint=ef24df0d3d1526c851b8fd8e3e5046e1; _qimei_q36=; _qimei_h38=19c5e2dfc82a6d970b40848102000008e1811d; o_cookie=3260693694; ua_id=PE7tWJm65TXEuv2NAAAAAEH8DLqAC6U64zQAeHCWDHU=; wxuin=06761475252682; mm_lang=zh_CN; qz_gdt=s4h4gzicaaam2skd47cq; _clck=3935648545|1|fj2|0; uuid=936e5d75bad14672d72ecd9f1cddb465; rand_info=CAESIHrpTUh/dWE97s4zYr+5JmdooNE2+xgIQ7iSiyNoJNlP; slave_bizuin=3935648545; data_bizuin=3935648545; bizuin=3935648545; data_ticket=jlULryJfNpiYkVXO817h9zgmLsmmrPR9XkC+UPIpVXh6BVxUpj+NeDYKN0Fx1Hj4; slave_sid=UVVHcFduVmRmSFp5NUxxY3RSc3kxVFdGMVlGaVl0R01wR0E5UnlvSmFQb1VTeWlkSjBNVFpYc0hQdW9wc3lzWFJKNTNZNXc3bXY0dXFQWVE3dkMyU2ZlRnJNcUlsSWhDT0FsVGxJRDB5RGVXM2NYREl2aVZBZmQyS01QTktGbTFyVHQ2alZTcEpRMllLSE12; slave_user=gh_042488a75457; xid=31ef245af21c80ddc82bb852f22c5f87; rewardsn=; wxtokenkey=777; _clsk=doakr4|1707289611534|4|1|mp.weixin.qq.com/weheat-agent/payload/record'

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Matches from a "link" key up to the "#rd" suffix of an http:// article URL.
# NOTE(review): only "http:" URLs are matched; confirm whether the API can
# also return "https:" links.
_LINK_PATTERN = re.compile(r'link.*?http:.*?#rd')


def _abort(url, reason=None):
    """Print the error banner (plus an optional reason) and the URL, then exit with status 1."""
    print('error!')
    if reason:
        print(reason)
    print(url)
    sys.exit(1)


def _get_json(url, headers):
    """GET ``url`` and return the decoded JSON body; abort on a non-200 status."""
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if response.status_code != 200:
        _abort(url)
    return response.json()


def get_articles(search_name, begin, count):
    """Print and return the article links of the first account matching ``search_name``.

    Args:
        search_name: Account name to search for; it is URL-quoted before use.
        begin: Offset sent as the ``begin`` query parameter of the article
            list request.
        count: Page size sent as the ``count`` query parameter of the article
            list request.

    Returns:
        A list of article URLs extracted from the ``publish_page`` field of
        the response (empty when the pattern matches nothing).

    Raises:
        SystemExit: with status 1 when a request does not return HTTP 200,
            when the search yields no account, or when the article response
            has no ``publish_page`` field.
    """
    # The cookie string is a complete ``Cookie`` header value, so it is sent
    # as a header. Passing it through ``cookies=`` would wrap the whole string
    # in a single cookie literally named "Cookie".
    headers = {**HEADER, 'Cookie': COOKIE}

    search_url = f'https://mp.weixin.qq.com/cgi-bin/searchbiz?action=search_biz&begin=0&count=5&query={quote(search_name)}&token={TOKEN}&lang=zh_CN&f=json&ajax=1'
    accounts = _get_json(search_url, headers).get('list')
    if not accounts:
        _abort(search_url, 'search returned no account')
    # Only the first search hit is used.
    fakeid = accounts[0]['fakeid']

    article_url = f'https://mp.weixin.qq.com/cgi-bin/appmsgpublish?sub=list&search_field=null&begin={begin}&count={count}&query=&fakeid={fakeid}&type=101_1&free_publish_type=1&sub_action=list_ex&token={TOKEN}&lang=zh_CN&f=json&ajax=1'
    publish_page = _get_json(article_url, headers).get('publish_page')
    if not publish_page:
        _abort(article_url, 'response has no publish_page')

    # Strip the escaping backslashes, then drop the first 7 characters of each
    # match: the "link" key plus the separator before the URL (presumably
    # `link":"` -- confirm against a real response).
    links = [match.replace('\\', '')[7:] for match in _LINK_PATTERN.findall(publish_page)]
    print(links)
    return links


def main():
    """Prompt for an account name and list the links of its first page of articles."""
    print('输入想要搜索的公众号名称:')
    search_name = input()
    # begin=0, count=5 is the request the script has always sent; the values
    # passed by the caller were previously ignored.
    get_articles(search_name, 0, 5)


if __name__ == '__main__':
    main()