一、优化版:输入城市代码,获取该城市的历史天气数据
# coding=utf-8
import requests
import re
import csv
import datetime
class WeatherForecast(object):
    """Scrape monthly weather history from tianqi.2345.com and save it as CSV.

    The history endpoint is queried once per month for every month from
    January of ``start_year`` up to and including ``end_month`` of
    ``end_year``. Each response carries an HTML fragment in its ``data``
    field; the table rows in that fragment are extracted with a regular
    expression and finally written to ``<city_code><YYYYMMDD>.csv`` in the
    current working directory.

    Args:
        city_code: Area id used by the site (the original notes mention
            57036 for Xi'an and 57048 for Xianyang as examples).
        start_year: First year to download (always starts in January).
        end_year: Last year to download.
        end_month: Last month (1-12) to download within ``end_year``.
    """

    # Seconds to wait for the server before giving up on a request; without
    # a timeout a stalled connection would hang the whole run forever.
    REQUEST_TIMEOUT = 30

    # Header row of the CSV. The columns line up, in order, with the named
    # groups of ROW_PATTERN: date, max, min, weather, cloud, sky. Note that
    # the "cloud" group therefore holds the wind column and "sky" holds the
    # air-quality column.
    CSV_HEADER = ['日期', '最高温度', '最低温度', '天气', '风力风向', '空气质量']

    # One match per table row. Compiled once here instead of on every
    # parse_data() call. re.S lets ".*?" span the line breaks between cells.
    ROW_PATTERN = re.compile(
        r'<td>(?P<date>.*?)</td>.*?<td style="color:#ff5040;">(?P<max>.*?)</td>'
        r'.*?<td style="color:#3097fd;" >(?P<min>.*?)</td>.*?<td>(?P<weather>.*?)</td>'
        r'.*?<td>(?P<cloud>.*?)</td>.*?<td><span class="history-aqi wea-aqi.*?>(?P<sky>.*?)</span></td>',
        re.S)

    def __init__(self, city_code, start_year, end_year, end_month):
        self.city_code = city_code
        self.start_year = start_year
        self.end_year = end_year
        self.end_month = end_month
        self.headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.24',
            # "br" is deliberately not advertised: Requests/urllib3 can only
            # decode Brotli when an optional extra package is installed, and
            # an undecoded body would make res.json() fail.
            'accept-encoding': 'gzip, deflate'
        }
        # Every parsed row of every month, in download order.
        self.data_list = []

    def get_content(self, url):
        """Fetch ``url`` and return the ``data`` field of its JSON body.

        Raises:
            requests.RequestException: On connection problems, timeouts or
                a non-2xx HTTP status.
            ValueError: If the body is not valid JSON.
            KeyError: If the JSON object has no ``data`` field.
        """
        res = requests.get(url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        res.raise_for_status()
        return res.json()['data']

    def parse_data(self, content):
        """Append every table row found in ``content`` to ``self.data_list``.

        Args:
            content: HTML fragment as returned by :meth:`get_content`.
                An empty or ``None`` fragment adds nothing.

        Returns:
            ``self.data_list`` itself (the accumulated rows of all calls so
            far, not only the rows of this call). Each row is a dict with
            the keys ``date``, ``max``, ``min``, ``weather``, ``cloud`` and
            ``sky``.
        """
        if not content:
            return self.data_list
        self.data_list.extend(
            match.groupdict() for match in self.ROW_PATTERN.finditer(content))
        return self.data_list

    def write_csv(self, data_list):
        """Write ``data_list`` to ``<city_code><YYYYMMDD>.csv`` in the working directory.

        An existing file of the same name is overwritten. The file is
        encoded as UTF-8 with a BOM so the Chinese header is written
        correctly regardless of the platform's default encoding and is
        recognised by spreadsheet programs.

        Args:
            data_list: Iterable of row dicts; the values of each dict are
                written in the dict's own order.
        """
        current_date = datetime.datetime.now().strftime("%Y%m%d")
        save_name = str(self.city_code) + current_date + '.csv'
        with open(save_name, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(self.CSV_HEADER)
            for record in data_list:
                writer.writerow(record.values())
                print(record.values())

    def run(self):
        """Download every requested month, then write the CSV once.

        All years before ``end_year`` are fetched in full; ``end_year``
        itself is fetched only up to ``end_month``. The CSV is written in a
        ``finally`` clause so the rows collected so far are still saved if
        a request fails part-way through.
        """
        try:
            for year in range(self.start_year, self.end_year + 1):
                # Only the final year is cut short; never go past December.
                last_month = min(self.end_month, 12) if year == self.end_year else 12
                for month in range(1, last_month + 1):
                    url = f'https://tianqi.2345.com/Pc/GetHistory?areaInfo%5BareaId%5D={self.city_code}&areaInfo%5BareaType%5D=2&date%5Byear%5D={year}&date%5Bmonth%5D={month}'
                    print('正在获取第{0}年{1}月的天气!'.format(year, month))
                    self.parse_data(self.get_content(url))
        finally:
            # Writing once here replaces rewriting the whole, ever-growing
            # file after every single month.
            self.write_csv(self.data_list)
        print('全部获取完毕,请在程序目录获取下载xxxx.csv!')
# 使用办法:运行程序后按提示输入城市代码、起始年、结束年和结束月即可,不需要再手动修改url地址
# 具体内容查看 “数据分析.txt”里面的信息
# 天气预报查询接口
# https://tianqi.2345.com/wea_history/57036.htm
if __name__ == '__main__':
    # Interactive entry point: ask for the city code and the date range,
    # one integer per prompt and in this fixed order, then start the scraper.
    print("输入起始年,结束年,结束月")
    prompts = ("起始城市代码:", "起始年:", "结束年:", "结束年月:")
    city_code, start_year, end_year, end_month = [int(input(text)) for text in prompts]
    weather = WeatherForecast(city_code, start_year, end_year, end_month)
    weather.run()
# 调整获取指定年份的指定月份之前的数据
二、代码分析
本节对数据提取部分,也就是数据解析函数 parse_data 进行分析:
def parse_data(self, content):
    """Collect every weather table row found in ``content``.

    Each regex match becomes one dict keyed by the pattern's named groups
    (``date``, ``max``, ``min``, ``weather``, ``cloud``, ``sky``) and is
    appended to ``self.data_list``.

    Returns:
        ``self.data_list`` itself, i.e. everything accumulated so far and
        not only the rows added by this call.
    """
    # re.S makes "." match newlines, so the lazy ".*?" gaps can skip over
    # the markup and line breaks that sit between two cells.
    row_pattern = re.compile(
        r'<td>(?P<date>.*?)</td>.*?<td style="color:#ff5040;">(?P<max>.*?)</td>'
        r'.*?<td style="color:#3097fd;" >(?P<min>.*?)</td>.*?<td>(?P<weather>.*?)</td>'
        r'.*?<td>(?P<cloud>.*?)</td>.*?<td><span class="history-aqi wea-aqi.*?>(?P<sky>.*?)</span></td>',
        re.S)
    self.data_list.extend(match.groupdict() for match in row_pattern.finditer(content))
    return self.data_list
get_content 请求接口后返回的内容是 JSON,其中 data 字段就是需要的数据(一段包含天气表格的 HTML 字符串),parse_data 解析的正是它
那么这个数据内容是什么呢?我们打印出来看下
# Row pattern, written as one literal per table column. The literals are
# joined by implicit string concatenation, so the compiled pattern text is
# unchanged; re.DOTALL is the long name of re.S.
result = re.compile(
    r'<td>(?P<date>.*?)</td>'                                # first plain cell
    r'.*?<td style="color:#ff5040;">(?P<max>.*?)</td>'       # cell styled #ff5040
    r'.*?<td style="color:#3097fd;" >(?P<min>.*?)</td>'      # cell styled #3097fd
    r'.*?<td>(?P<weather>.*?)</td>'                          # next plain cell
    r'.*?<td>(?P<cloud>.*?)</td>'                            # plain cell after that
    r'.*?<td><span class="history-aqi wea-aqi.*?>(?P<sky>.*?)</span></td>',  # AQI span
    re.DOTALL,
)
通过上面的正则,就可以把其中每一天的日期、最高温度、最低温度、天气、风力风向和空气质量提取出来了