Python Crawler Code
import requests
import re
import time
import json
from bs4 import BeautifulSoup
import pandas as pd
import pymysql
# Fetch a page and return its HTML source, or None on a non-200 response.
def get_page(url, headers):
    html = requests.get(url, headers=headers)
    if html.status_code == 200:
        # Let requests guess the encoding so Chinese text decodes correctly
        html.encoding = html.apparent_encoding
        return html.text
    else:
        print("Failed to fetch page:", url)  # debug output
        return None
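# A minimal usage sketch (the URL here is hypothetical; real URLs are built in
# crawl_weather below from a city code and a YYYYMM month):
#   html = get_page("http://lishi.tianqi.com/beijing/202301.html",
#                   {"User-Agent": "Mozilla/5.0"})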
# Parse the page HTML and return the extracted records as a DataFrame.
def parse_page(html):
    # Empty lists to collect each column
    date_box = []
    max_temp = []
    min_temp = []
    weh = []
    wind = []
    # Parse the page with BeautifulSoup
    bs = BeautifulSoup(html, 'html.parser')
    # Locate the tag that holds the data table
    data = bs.find_all(class_='thrui')
    # Extract the fields with regular expressions
    date = re.compile('class="th200">(.*?)</div>')
    tem = re.compile('class="th140">(.*?)</div>')
    dates = re.findall(date, str(data))
    for item in dates:
        # Keep only the date part; drop the weekday suffix
        date_box.append(item[:10])
    temp = re.findall(tem, str(data))
    # Each day contributes four th140 fields: high, low, weather, wind
    for i in range(len(temp) // 4):
        max_temp.append(temp[i * 4 + 0])
        min_temp.append(temp[i * 4 + 1])
        weh.append(temp[i * 4 + 2])
        wind.append(temp[i * 4 + 3])
    # Assemble the DataFrame (no weekday column)
    datas = pd.DataFrame({'日期': date_box, '最高温度': max_temp, '最低温度': min_temp, '天气': weh, '风向': wind})
    return datas
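# An offline sanity check for parse_page. The sample markup is an assumption
# reconstructed from the regexes above (one class="th200" date div plus four
# class="th140" divs per day); the live page on lishi.tianqi.com may differ:
#   sample = ('<ul class="thrui"><li><div class="th200">2023-01-01 周日</div>'
#             '<div class="th140">5℃</div><div class="th140">-3℃</div>'
#             '<div class="th140">晴</div><div class="th140">西北风 2级</div></li></ul>')
#   print(parse_page(sample))  # -> a one-row DataFrame for 2023-01-01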
# Write each row of the DataFrame into the lishiweathers MySQL table.
def save_to_mysql(datas, city, db, cursor):  # city is stored alongside each row
    for i in range(len(datas)):
        date = datas.iloc[i, 0]
        # Keep the numeric part of the temperature, including the minus sign
        # for sub-zero values, and drop the ℃ suffix
        max_temp = float(re.findall(r'-?\d+', datas.iloc[i, 1])[0])
        min_temp = float(re.findall(r'-?\d+', datas.iloc[i, 2])[0])
        weh = datas.iloc[i, 3]
        wind = datas.iloc[i, 4]
        # Parameterized query: safer than string formatting and unaffected by
        # quotes in the scraped values
        sql = "INSERT INTO lishiweathers (城市, 日期, 最高温度, 最低温度, 天气, 风向) VALUES (%s, %s, %s, %s, %s, %s)"
        cursor.execute(sql, (city, date, max_temp, min_temp, weh, wind))
    db.commit()
    print("Saved the data to the lishiweathers table")
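# For large batches, cursor.executemany with the same parameterized SQL is a
# drop-in speedup:
#   cursor.executemany(sql, rows)  # rows: a list of 6-tuples in column order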
# Crawl one city's history for one month (YYYYMM) and save it to MySQL.
# Note: the month list below always uses 6-character YYYYMM entries, so the
# original start/end split is collapsed into a single month parameter.
def crawl_weather(city, code, month, db, cursor):
    # Build the page URL from the city code and the month
    url = "http://lishi.tianqi.com/{}/{}.html".format(code, month)
    # Request headers that mimic a normal logged-in browser session
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Cookie': '**************************************************'  # value redacted in the original
    }
    try:
        # Fetch the page source
        html = get_page(url, headers)
        if html is None:
            raise ValueError("empty response for " + url)
        # Parse the page into a DataFrame
        datas = parse_page(html)
        # Persist the DataFrame to the MySQL table
        save_to_mysql(datas, city, db, cursor)
        print("Crawled the historical weather data of {} for {}".format(city, month))
    except Exception as e:
        print("Failed to crawl the historical weather data of {} for {}: {}".format(city, month, e))
# Read the city names and codes from city.json in the same folder
with open('city.json', 'r', encoding='utf-8') as f:
    city_dict = json.load(f)  # parse the JSON file into a dict
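# A hypothetical city.json, inferred from how `code` is used in the URL above
# (lishi.tianqi.com addresses cities by a pinyin slug); adapt to your file:
#   {"北京": "beijing", "上海": "shanghai", "广州": "guangzhou"}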
# The list of months (YYYYMM) to crawl
# Modify or extend the months to suit your needs
#time_list = ["202201","202202","202203","202204","202205","202206","202207","202208","202209","202210","202211","202212","202301","202302","202303","202304","202305","202306","202307","202308","202309","202310","202311","202312","202401","202402"]
time_list = ["202301","202302","202303","202304","202305","202306","202307","202308","202309","202310","202311","202312","202401","202402","202403","202404","202405","202406","202407","202408","202409","202410","202411","202412","202501","202502"]
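# The month list can also be generated instead of written by hand, e.g.:
#   time_list = [p.strftime("%Y%m") for p in pd.period_range("2023-01", "2025-02", freq="M")]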
# Open the database connection
# Supply your own host name, user name, password, port and database name
db = pymysql.connect(host="localhost", user="root", passwd="123456", port=3306, db="tianqi")
# Create a cursor object
cursor = db.cursor()
# Iterate over the city dict and the month list, crawling each combination
for city, code in city_dict.items():
    for month in time_list:
        crawl_weather(city, code, month, db, cursor)
        time.sleep(5)  # wait 5 seconds between requests to avoid hammering the site
# Close the database connection
db.close()
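The INSERT in save_to_mysql assumes that a lishiweathers table already exists in the tianqi database. The sketch below creates a matching table; the column types are assumptions inferred from the values the crawler stores, and the authoritative schema follows in the Database Design section.

import pymysql

# Hypothetical DDL inferred from the INSERT in save_to_mysql; see the
# Database Design section for the actual schema.
DDL = """
CREATE TABLE IF NOT EXISTS lishiweathers (
    id INT AUTO_INCREMENT PRIMARY KEY,
    城市 VARCHAR(32) NOT NULL,
    日期 DATE NOT NULL,
    最高温度 FLOAT,
    最低温度 FLOAT,
    天气 VARCHAR(64),
    风向 VARCHAR(64)
) DEFAULT CHARSET = utf8mb4
"""

db = pymysql.connect(host="localhost", user="root", passwd="123456",
                     port=3306, db="tianqi")
with db.cursor() as cursor:
    cursor.execute(DDL)
db.commit()
db.close()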
Database Design