思路
写文章的时间2023-8-4,大部分网页设置的区域都是先是省,然后通过省获取对应的市,再通过市获取对应的区,以此类推。所以模拟的请求也是按照这个逻辑,先获取所有的省,再获取所有的市,最后获取所有的区。
实现方法
博主其实是需要根据用户指定的地区做一些其他的请求,但是用户不可能知道这个地区对应的id,所以直观的展示就想到:用区域的名作为键,id作为值,但是区域名字可能会相同,例如不同的市但存在同名的区,所以最终想到用嵌套字典的方法来进行存储,因为不可能省市区都是一样的。
所以最终字典的样子是这样子的:
代码
代码解释
获取区域对应的id和名字,走了三次循环进行获取,定义一个字典,将获取到的区域的名字作为键,值也是一个字典,这个字典就包含当前区域的id,然后下一级的区域名字,如此嵌套。字典的样子:
{
'广东':{
"id": "19",
"潮州市": {
"id": "1705",
"潮安区": {
"id": "19992",
},
},
"东莞市": {
"id": "1655",
"常平镇": {
"id": "4886",
},
"长安镇": {
"id": "4760",
},
},
},
'其他省份':{'以此类推'}
}
具体详细代码
对于数据解析部分,由于每个网页返回的结构不一样,需要自己去解析对应的数据
# -*- coding: utf-8 -*-
# @Time : 2023/7/13 15:05
# @Author : gongzairen
# @File : get_areas_id_dict.py
import json
import random
import time
import requests
class GetAreasId:
def __init__(self,sku_id):
self.sku_id = sku_id
self.data_dict = {}
# 获取区域id
def get_areas_id(self):
"""
:return: 返回的是包含省,市,区的id 的dict
逻辑很混乱多层嵌套,效果并不好
样例:
{'北京': {'id': '2', '北京市': {'id': '36', '东城区': {'id': '377'}, }},'天津': {'id': '3', '天津市': {'id': '37', '和平区': {'id': '395'},}}}
"""
data_dict = {
"data": "",
"error_info": "无",
'warning_info': "无",
}
url = '对应的url'
headers = HEADERS
headers['Referer'] = f'对应的referer'
headers['Accept'] = '对应的Accept'
all_areas_id_dict = {}
# 获取省id
province_payload = {
'请求的参数': '请求的参数',
}
province_response = requests.get(url,params=province_payload,headers=headers)
province_json = province_response.json()
province_response.close()
# print(province_json)
try:
id_lists = province_json['rows']
for id_dict in id_lists:
province_name = id_dict['name']
province_id_dict = {
'id': str(id_dict['id']),
'level': str(id_dict['level']),
}
all_areas_id_dict[province_name] = province_id_dict
except Exception as e:
data_dict['error_info'] = '区域id结果已发生改变,请联系开发人员'
print('区域id结果已发生改变,请联系开发人员')
print(e)
return data_dict
# 获取市id
for province_name in all_areas_id_dict:
time.sleep(random.uniform(0,2.2))
city_payload = {
'请求的参数': '请求的参数',
}
city_response = requests.get(url, params=city_payload, headers=headers)
city_json = city_response.json()
city_response.close()
# print(city_json)
try:
id_lists = city_json['rows']
for id_dict in id_lists:
city_name = id_dict['name']
city_id_dict = {
'id': str(id_dict['id']),
'level': str(id_dict['level']),
'parentId':str(id_dict['parentId'])
}
all_areas_id_dict[province_name][city_name] = city_id_dict
except Exception as e:
data_dict['error_info'] = '区域id结果已发生改变,请联系开发人员'
print('区域id结果已发生改变,请联系开发人员')
print(e)
return data_dict
# 获取区id
for province_name in all_areas_id_dict:
city_list = list(all_areas_id_dict[province_name].keys())
for city_name in city_list:
if city_name == "id" or city_name == "level":
continue
time.sleep(random.uniform(1, 3))
area_payload = {
'请求的参数': '请求的参数',
}
area_response = requests.get(url, params=area_payload, headers=headers)
area_json = area_response.json()
area_response.close()
print(area_json)
try:
id_lists = area_json['rows']
for id_dict in id_lists:
area_name = id_dict['name']
area_id_dict = {
'id': str(id_dict['id']),
'level': str(id_dict['level']),
'parentId':str(id_dict['parentId'])
}
all_areas_id_dict[province_name][city_name][area_name] = area_id_dict
except Exception as e:
data_dict['error_info'] = '区域id结果已发生改变,请联系开发人员'
print('区域id结果已发生改变,请联系开发人员')
print(e)
return data_dict
data_dict['data] = all_areas_id_dict
print(all_areas_id_dict)
return data_dict
test_sku_id = 'xxxxxx'
HEADERS = {
'Cookie': 'xxxxxx',
}
get_areas_obj = GetAreasId(test_sku_id)
get_areas_result_dict = get_areas_obj.get_areas_id()
if get_areas_result_dict ['error_info'] != "无":
print('获取失败')
else:
all_areas_id_dict = get_areas_result_dict['data']
生成对应的表
data = "上面生成的字典" # all_areas_id_dict
id_lists = []
for province_name in data:
city_list = list(data[province_name].keys())
for city_name in city_list:
if city_name == "id" or city_name == "level" or city_name == 'parentId':
continue
area_list = list(data[province_name][city_name].keys())
for area_name in area_list:
if area_name == "id" or area_name == "level" or area_name == 'parentId':
continue
id_lists.append([province_name,city_name,area_name])
print(id_lists)
df = pd.DataFrame(id_lists, columns=['省', '市', '区'])
file_name = 'xxxx地区全表.xlsx'
df.to_excel(file_name, index=False)
print(f"Excel文件已生成: {file_name}")
效果如下