Python Crawler Code
import requests
import re
import time
import json
from bs4 import BeautifulSoup
import pandas as pd
import pymysql
# Fetch a page and return its HTML source, or None on a non-200 response.
def get_page(url, headers):
    html = requests.get(url, headers=headers)
    if html.status_code == 200:
        # Let requests guess the encoding so Chinese text decodes correctly
        html.encoding = html.apparent_encoding
        return html.text
    else:
        print("Failed to fetch page:", url)  # debug output
        return None
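# A minimal usage sketch (the URL here is hypothetical; real URLs are built in
# crawl_weather below from a city code and a YYYYMM month):
#   html = get_page("http://lishi.tianqi.com/beijing/202301.html",
#                   {"User-Agent": "Mozilla/5.0"})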
# Parse the page HTML and return the extracted records as a DataFrame.
def parse_page(html):
    # Empty lists to collect each column
    date_box = []
    max_temp = []
    min_temp = []
    weh = []
    wind = []
    # Parse the page with BeautifulSoup
    bs = BeautifulSoup(html, 'html.parser')
    # Locate the tag that holds the data table
    data = bs.find_all(class_='thrui')
    # Extract the fields with regular expressions
    date = re.compile('class="th200">(.*?)</div>')
    tem = re.compile('class="th140">(.*?)</div>')
    dates = re.findall(date, str(data))
    for item in dates:
        # Keep only the date part; drop the weekday suffix
        date_box.append(item[:10])
    temp = re.findall(tem, str(data))
    # Each day contributes four th140 fields: high, low, weather, wind
    for i in range(len(temp) // 4):
        max_temp.append(temp[i * 4 + 0])
        min_temp.append(temp[i * 4 + 1])
        weh.append(temp[i * 4 + 2])
        wind.append(temp[i * 4 + 3])
    # Assemble the DataFrame (no weekday column)
    datas = pd.DataFrame({'日期': date_box, '最高温度': max_temp, '最低温度': min_temp, '天气': weh, '风向': wind})
    return datas
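# An offline sanity check for parse_page. The sample markup is an assumption
# reconstructed from the regexes above (one class="th200" date div plus four
# class="th140" divs per day); the live page on lishi.tianqi.com may differ:
#   sample = ('<ul class="thrui"><li><div class="th200">2023-01-01 周日</div>'
#             '<div class="th140">5℃</div><div class="th140">-3℃</div>'
#             '<div class="th140">晴</div><div class="th140">西北风 2级</div></li></ul>')
#   print(parse_page(sample))  # -> a one-row DataFrame for 2023-01-01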
# Write each row of the DataFrame into the lishiweathers MySQL table.
def save_to_mysql(datas, city, db, cursor):  # city is stored alongside each row
    for i in range(len(datas)):
        date = datas.iloc[i, 0]
        # Keep the numeric part of the temperature, including the minus sign
        # for sub-zero values, and drop the ℃ suffix
        max_temp = float(re.findall(r'-?\d+', datas.iloc[i, 1])[0])
        min_temp = float(re.findall(r'-?\d+', datas.iloc[i, 2])[0])
        weh = datas.iloc[i, 3]
        wind = datas.iloc[i, 4]
        # Parameterized query: safer than string formatting and unaffected by
        # quotes in the scraped values
        sql = "INSERT INTO lishiweathers (城市, 日期, 最高温度, 最低温度, 天气, 风向) VALUES (%s, %s, %s, %s, %s, %s)"
        cursor.execute(sql, (city, date, max_temp, min_temp, weh, wind))
    db.commit()
    print("Saved the data to the lishiweathers table")
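# For large batches, cursor.executemany with the same parameterized SQL is a
# drop-in speedup:
#   cursor.executemany(sql, rows)  # rows: a list of 6-tuples in column order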
# Crawl one city's history for one month (YYYYMM) and save it to MySQL.
# Note: the month list below always uses 6-character YYYYMM entries, so the
# original start/end split is collapsed into a single month parameter.
def crawl_weather(city, code, month, db, cursor):
    # Build the page URL from the city code and the month
    url = "http://lishi.tianqi.com/{}/{}.html".format(code, month)
    # Request headers that mimic a normal logged-in browser session
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        'Cookie': '**************************************************'  # value redacted in the original
    }
    try:
        # Fetch the page source
        html = get_page(url, headers)
        if html is None:
            raise ValueError("empty response for " + url)
        # Parse the page into a DataFrame
        datas = parse_page(html)
        # Persist the DataFrame to the MySQL table
        save_to_mysql(datas, city, db, cursor)
        print("Crawled the historical weather data of {} for {}".format(city, month))
    except Exception as e:
        print("Failed to crawl the historical weather data of {} for {}: {}".format(city, month, e))
# Read the city names and codes from city.json in the same folder
with open('city.json', 'r', encoding='utf-8') as f:
    city_dict = json.load(f)  # parse the JSON file into a dict
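# A hypothetical city.json, inferred from how `code` is used in the URL above
# (lishi.tianqi.com addresses cities by a pinyin slug); adapt to your file:
#   {"北京": "beijing", "上海": "shanghai", "广州": "guangzhou"}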
# The list of months (YYYYMM) to crawl
# Modify or extend the months to suit your needs
#time_list = ["202201","202202","202203","202204","202205","202206","202207","202208","202209","202210","202211","202212","202301","202302","202303","202304","202305","202306","202307","202308","202309","202310","202311","202312","202401","202402"]
time_list = ["202301","202302","202303","202304","202305","202306","202307","202308","202309","202310","202311","202312","202401","202402","202403","202404","202405","202406","202407","202408","202409","202410","202411","202412","202501","202502"]
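# The month list can also be generated instead of written by hand, e.g.:
#   time_list = [p.strftime("%Y%m") for p in pd.period_range("2023-01", "2025-02", freq="M")]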
# Open the database connection
# Supply your own host name, user name, password, port and database name
db = pymysql.connect(host="localhost", user="root", passwd="123456", port=3306, db="tianqi")
# Create a cursor object
cursor = db.cursor()
# Iterate over the city dict and the month list, crawling each combination
for city, code in city_dict.items():
    for month in time_list:
        crawl_weather(city, code, month, db, cursor)
        time.sleep(5)  # wait 5 seconds between requests to avoid hammering the site
# Close the database connection
db.close()
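The INSERT in save_to_mysql assumes that a lishiweathers table already exists in the tianqi database. The sketch below creates a matching table; the column types are assumptions inferred from the values the crawler stores, and the authoritative schema follows in the Database Design section.

import pymysql

# Hypothetical DDL inferred from the INSERT in save_to_mysql; see the
# Database Design section for the actual schema.
DDL = """
CREATE TABLE IF NOT EXISTS lishiweathers (
    id INT AUTO_INCREMENT PRIMARY KEY,
    城市 VARCHAR(32) NOT NULL,
    日期 DATE NOT NULL,
    最高温度 FLOAT,
    最低温度 FLOAT,
    天气 VARCHAR(64),
    风向 VARCHAR(64)
) DEFAULT CHARSET = utf8mb4
"""

db = pymysql.connect(host="localhost", user="root", passwd="123456",
                     port=3306, db="tianqi")
with db.cursor() as cursor:
    cursor.execute(DDL)
db.commit()
db.close()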
Database Design