# 地址 (source page):
# 服务机构-苏州工业园区企业服务超市 (Service Organizations - Suzhou Industrial Park Enterprise Service Supermarket)
import os
from datetime import datetime
from urllib import request
import pandas as pd
import re
import requests
from lxml import etree
from bs4 import BeautifulSoup
import csv
import codecs
# 2023-05-21: fetch company information from each district's enterprise
# service center.

# Request body passed as ``data=`` on every GET; intentionally empty.
payload = ""

# HTTP headers sent with every request made by this script.
headers = {
    "Accept": "*/*",
    "Accept-Encoding": "gzip,deflate,br",
    # The cookie is a single header value.  It is written as two adjacent
    # string literals (joined by the parser) because a raw line break inside
    # one literal is a syntax error, and a newline inside a header value is
    # not valid either.
    # NOTE(review): this value contains what look like account/session tokens
    # (UserToken, SESSION, ...) captured from a browser -- consider loading it
    # from the environment instead of keeping it in source control.
    "Cookie": (
        "uuid_tt_dd=10_18804907310-1584181525441-469327; UN=zhaomengszu; Hm_ct_6bcd52f51e9b3dce32bec4a3997715ac=6525*1*10_18804907310-1584181525441-469327!5744*1*zhaomengszu; __gads=ID=b56ad0a2297d6442-2288891c22c40062:T=1602996290:RT=1602996290:S=ALNI_MZ2SaewhN41t8fSXV6-zdnXJRSixQ; dc_session_id=10_1619259388646.896265; c_segment=15; c_first_page=https%3A//www.csdn.net/; c_first_ref=default; dc_sid=7286c32f348ba7b1edaae69ba6ef526c; Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac=1619259411; SESSION=2957f398-b82d-4e4a-a6e7-721cbe55d68d; ssxmod_itna=YqRxcQqeuDy7D8DzxabP0KPO7YcxDCDGx7KEb2dD/WQmDnqD=GFDK40oo8YbEhr1KPf23qIU0OWLhdyDCPhbPHt++eDHxY=DUpb4KoD445GwD0eG+DD4DW0x03DoxGYg+HHKiODQ40kDY5DwEHzDic837RYLrvFDfvGDiHonEG4LxG1DQ5DsOivQ4DCz4SYDmR3pgvsDCKDjaACCoYDUlqeLQ0eFEboSfiqxriYme47=GhxhGrqiiGezAiY3WRYtsheFO2YDDp+FQh44D===; ssxmod_itna2=YqRxcQqeuDy7D8DzxabP0KPO7YcxDCDGx7KEbrG9WdGDBTrFx7PmrM22ijyb8C38BqY=43+aQndCD8EIiInzbLebfBYlfgNYnWQ8kwXV+/QAhXqk+gkSQkdNiZ98eAwM1X8uw4PVHs6HG/rBiBBPCVPPtZcuCZ1AhbaImo9TTuZ98upB8lo71x8pA=4qFs=0FEPeIjbHTwPn8ypojzrPC=hkp2amOcmdIcGqKvKC=stPs5LnFupku4daU3NdcQw6r+F690gW=pCmyKmWUY2d1czOQx5OQhQcIXQik=PNCEQpvX7i82ftMaIzGh9lUKnIMePQGXFZ4YnIgo+o1vLrjihfPCpgBikhwpG3PWtUlLceXt9ILcm34dOlLhhqaIb72rQzrGam2Pe8DxapLfeIQFU1RDi/v4QnDaD6nAWcw1UGR1r5AfkKI1+hNYPMYE8Dhbja8e47jTqncfBhmWwkA4DQKeKrqbiQiuKArx+5Cpkah+xjdz1q1V+oUwK4+o4xbCo7e5=Qj8KDDFqD+=4xD===; UserName=zhaomengszu; UserInfo=6607995659ed4ff0a41eeb8eec07896f; UserToken=6607995659ed4ff0a41eeb8eec07896f; UserNick=%E8%BF%99%E5%AD%A9%E5%AD%90%E8%B0%81%E6%87%82%E5%93%88; AU=3C1; BT=1619259410818; p_uid=U010000; Hm_up_6bcd52f51e9b3dce32bec4a3997715ac=%7B%22islogin%22%3A%7B%22value%22%3A%221%22%2C%22scope%22%3A1%7D%2C%22isonline%22%3A%7B%22value%22%3A%221%22%2C%22scope%22%3A1%7D%2C%22isvip%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22uid_%22%3A%7B%22value%22%3A%22zhaomengszu%22%2C%22scope%22%3A1%7D%7D; c_page_id=default; log_Id_view=471; mp_sidebar_feedback_flag=block; log_Id_click=71; "
        "c_ref=https%3A//mp.csdn.net/editor/html/114404385; c_pref=https%3A//mp.csdn.net/editor/html/114404385; Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac=1619259750; announcement-new=%7B%22isLogin%22%3Atrue%2C%22announcementUrl%22%3A%22https%3A%2F%2Fblog.csdn.net%2Fblogdevteam%2Farticle%2Fdetails%2F112280974%3Futm_source%3Dgonggao_0107%22%2C%22announcementCount%22%3A0%2C%22announcementExpire%22%3A3600000%7D; dc_tos=qs2cti; log_Id_pv=200"
    ),
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36",
}
# Crawl the organisation listing page by page, open every organisation's
# detail page, extract a fixed set of fields and export them all to one
# GBK-encoded CSV file.

# Number of listing pages to walk (hard-coded page count of the listing).
PAGE_COUNT = 65
# Seconds to wait for a response before giving up, so a stalled connection
# cannot hang the whole crawl.
REQUEST_TIMEOUT = 30

origin = 'https://sme.sipac.gov.cn'
# Listing URL; ``{page}`` is the 1-based page number.
LIST_URL = (
    "https://sme.sipac.gov.cn/epservice/techsub/Apps/epssm/index.php"
    "?s=/OrgSpace/index/qb/1/order//province/320000/city/320500/p/{page}"
    "/checkno/8B651777CD66FBCB720A940F890579CD"
)
# Captures everything between ``window.open`` and the closing ``">`` of the
# card's onclick attribute; the caller strips the surrounding 2 characters
# on each side with ``[2:-2]`` (presumably ``('`` and ``')`` -- verify against
# the live markup).
# NOTE(review): only cards carrying the ``noorglogo`` class are matched.
ONCLICK_PATTERN = re.compile(
    r'<div class="list-imgleft-container noorglogo" onclick="window.open(.*?)">'
)
# (CSV column title, XPath of the value on the detail page), in output order.
# Keeping titles and XPaths side by side guarantees the header row and the
# data rows always have the same number of columns.
# The staffing table's third row (tr[3]) is not collected.
FIELDS = (
    ("企业名称", '/html/body/div[2]/div[2]/div/div[2]/div[1]/div/div[1]/div[2]/div[1]/span/text()'),
    ("主营业务", '//*[@id="items-my-basic"]/table/tr[2]/td[2]/div/text()'),
    ("机构简介", '//*[@id="items-my-basic"]/table/tr[3]/td[2]/div/text()'),
    ("员工人数", '//*[@id="items-my-three"]/div[1]/table/tr[1]/td[2]/text()'),
    ("执业人员人数", '//*[@id="items-my-three"]/div[1]/table/tr[1]/td[4]/text()'),
    ("本科", '//*[@id="items-my-three"]/div[1]/table/tr[2]/td[2]/text()'),
    ("硕士", '//*[@id="items-my-three"]/div[1]/table/tr[2]/td[4]/text()'),
    ("姓名", '//*[@id="items-my-two"]/div[1]/table/tr/td[2]/text()'),
    ("职务", '//*[@id="items-my-two"]/div[1]/table/tr/td[4]/text()'),
    ("手机", '//*[@id="items-my-two"]/div[1]/table/tr/td[6]/text()'),
    ("办公地址", '//*[@id="items-my-two"]/div[2]/table/tr[2]/td[2]/p/text()'),
)


def _fetch_text(url):
    """GET *url* with the shared headers/payload and return the decoded body.

    Raises ``requests.RequestException`` on connection errors or timeouts.
    """
    resp = requests.request(
        "GET", url, data=payload, headers=headers, timeout=REQUEST_TIMEOUT
    )
    # Let requests guess the charset from the body rather than the headers.
    resp.encoding = resp.apparent_encoding
    return resp.text


def _first_text(tree, xpath):
    """Return the first node matched by *xpath* in *tree*, or "" if none.

    A missing field yields an empty cell instead of an IndexError that would
    abort the crawl and discard every row collected so far.
    """
    if tree is None:
        return ""
    matches = tree.xpath(xpath)
    return matches[0] if matches else ""


def _scrape_company(detail_url):
    """Download one detail page and return its row, ordered like FIELDS."""
    tree = etree.HTML(_fetch_text(detail_url))
    return tuple(_first_text(tree, xpath) for _, xpath in FIELDS)


rows = []
for i in range(PAGE_COUNT):
    # str() is required: concatenating the int page number directly to a
    # string raises TypeError.
    print("正在爬取第" + str(i + 1) + "页")
    url = LIST_URL.format(page=i + 1)
    try:
        html_source = _fetch_text(url)
    except requests.RequestException as exc:
        print("skipping listing page {}: {}".format(i + 1, exc))
        continue

    for onclick in ONCLICK_PATTERN.findall(html_source):
        detail_url = origin + onclick[2:-2]
        try:
            rows.append(_scrape_company(detail_url))
        except requests.RequestException as exc:
            print("skipping {}: {}".format(detail_url, exc))

# newline='' lets the csv module control line endings itself;
# errors='ignore' drops characters that GBK cannot represent.
with open('company20230521.csv', 'w', newline='', encoding='gbk', errors='ignore') as f:
    writer = csv.writer(f)
    writer.writerow([title for title, _ in FIELDS])
    writer.writerows(rows)