import re
import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook
# Scrape the Baidu real-time hot-search board and export each entry
# (title, hot index, detail link, thumbnail) to an Excel workbook.

# Board page to fetch.
base_url = "https://top.baidu.com/board?tab=realtime"
# Request headers: present a desktop-browser User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'
}
# Output workbook path.
save_file = "热搜.xlsx"

# Prepare the workbook and its single sheet.
wb = Workbook()
ws = wb.active
ws.title = '热搜榜'
# Header row; the order here must match the row appended for each entry below.
ws.append(["标题", "热搜指数", "详情链接", "缩略图"])

# Fetch the page. The timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an
# HTTP error page as if it were the board.
response = requests.get(base_url, headers=headers, timeout=10)
response.raise_for_status()
# Let requests guess the encoding from the body instead of the headers.
response.encoding = response.apparent_encoding
soup = BeautifulSoup(response.text, 'lxml')

# Match elements by class-name prefix ('category-wrap_...', 'hot-index_...').
# Both patterns are compiled once, outside the loop.
pattern = re.compile(r'^category-wrap_')
hot_index_pattern = re.compile(r'^hot-index_')

# One matching element per hot-search entry.
matching_elements = soup.find_all(class_=pattern)

for element in matching_elements:
    # Title: without one the entry is not a usable row, so skip it rather
    # than crash the whole export.
    title_tag = element.select_one(".c-single-text-ellipsis")
    if title_tag is None:
        continue
    title = title_tag.get_text(strip=True)

    # Hot index; empty when the entry carries no such element.
    hot_div = element.find(class_=hot_index_pattern)
    hot_index = hot_div.get_text(strip=True) if hot_div is not None else ""

    # Detail link: the first <a> inside the entry. The thumbnail is the
    # first <img> nested in that same link.
    link_tag = element.find('a')
    hot_url = link_tag.get('href', '') if link_tag is not None else ""
    img_tag = link_tag.find('img') if link_tag is not None else None
    hot_img_url = img_tag.get('src', '') if img_tag is not None else ""

    print("=====================================================")
    print(f"标题:{title}")
    print(f"热搜指数:{hot_index}")
    print(f"详情链接:{hot_url}")
    print(f"缩略图:{hot_img_url}")

    # Column order follows the header row. The second column is the hot
    # index (the original wrote the detail link here twice by mistake).
    ws.append([title, hot_index, hot_url, hot_img_url])

# Save once, after all rows have been collected.
wb.save(save_file)
print(f"热搜数据已保存到 {save_file}")
# Disclaimer: this code is provided for learning and reference only; do not use it for any unlawful purpose.