1. 需要的类包
import pandas as pd
import requests
2. 请求地址
通过分析,数据可以直接从接口获取,无需解析页面标签,直接取出我们需要的数据即可。
def fetch_hot_news(api_url):
response = requests.get(api_url)
if response.status_code == 200:
data = response.json()
hot_news = data.get("data", {}).get("hotNews", [])
return hot_news
else:
print(f"Failed to retrieve data. Status code: {response.status_code}")
return []
3. 导出表格
def export_to_excel(hot_news_data):
if not hot_news_data:
return
# Add the missing URL field
base_url = "https://www.xxx.cn/newsDetail_forward_" #澎某pai
hot_news_data = [{
**news,
"URL": f"{base_url}{news['contId']}"
} for news in hot_news_data]
# Create a DataFrame
df = pd.DataFrame(hot_news_data)
# Choose only relevant columns
relevant_columns = ["contId", "name", "pubTime", "URL"]
df = df[relevant_columns]
# Export to Excel
df.to_excel("pengpai-top.xlsx", index=False)
print("Data exported to Excel successfully.")
处理url
def main():
api_url = "https:/xx/wwwIndex/xxx" #分析得到的需要请求的接口地址
hot_news_data = fetch_hot_news(api_url)
if hot_news_data:
export_to_excel(hot_news_data)
else:
print("No hot news data found.")