源码如下:
import asyncio
import aiohttp
from lxml import etree
import logging
import datetime
import openpyxl
# Configure root logging so every message carries a timestamp and its level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s: %(message)s',
    level=logging.INFO,
)

# In-memory workbook; its active sheet receives one row per scraped listing.
# The first row written here holds the column titles.
wb = openpyxl.Workbook()
sheet = wb.active
sheet.append(['房源', '房子信息', '所在区域', '单价', '关注人数和发布时间', '标签'])

# Wall-clock time captured when the module is loaded; the script entry point
# subtracts it from the finish time to report the elapsed duration.
start = datetime.datetime.now()
class Spider(object):
    """Crawl Lianjia Shanghai second-hand housing index pages.

    Each index page is downloaded with aiohttp and parsed with lxml; every
    listing found is appended as one row to the module-level ``sheet`` and
    written to the log.
    """

    def __init__(self, concurrency=6, delay=3):
        """Set up the request throttle and the HTTP request headers.

        Args:
            concurrency: Maximum number of pages fetched at the same time.
            delay: Seconds each fetch waits, after it has acquired the
                semaphore, before the request is sent.
        """
        self.concurrency = concurrency
        self.delay = delay
        # The semaphore caps the number of concurrent fetches so the crawl
        # is not fast enough to trigger the site's anti-crawling measures.
        self.semaphore = asyncio.Semaphore(concurrency)
        self.header = {
            "Host": "sh.lianjia.com",
            "Referer": "https://sh.lianjia.com/ershoufang/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }

    @staticmethod
    def _first(values, default=''):
        """Return the first item of ``values``, or ``default`` when it is empty."""
        return values[0] if values else default

    async def scrape(self, url):
        """Download ``url`` and return the response body as text.

        The request is throttled by ``self.semaphore`` and preceded by a
        pause of ``self.delay`` seconds.

        Args:
            url: Address of the page to fetch.

        Returns:
            The decoded response body.
        """
        async with self.semaphore:
            await asyncio.sleep(self.delay)  # pause before each request
            # Context managers close the session and release the response
            # even when the request or the body read raises.
            async with aiohttp.ClientSession(headers=self.header) as session:
                async with session.get(url) as response:
                    return await response.text()

    async def scrape_index(self, page):
        """Fetch listing index page number ``page`` and parse its listings.

        Args:
            page: Page number inserted into the index URL.
        """
        url = f'https://sh.lianjia.com/ershoufang/pg{page}/'
        text = await self.scrape(url)
        await self.parse(text)

    async def parse(self, text):
        """Extract every listing from an index page and record it.

        One row per listing is appended to the module-level ``sheet`` and
        logged at INFO level. Fields that are absent from a listing are
        recorded as empty strings instead of raising ``IndexError``.

        Args:
            text: HTML source of one index page.
        """
        # NOTE(review): etree.HTML may return None (or raise) for input it
        # cannot build a tree from -- empty input is short-circuited here.
        html = etree.HTML(text) if text else None
        if html is None:
            logging.warning('page is empty or could not be parsed; skipped')
            return

        lis = html.xpath('//*[@id="content"]/div[1]/ul/li')
        if not lis:
            # No listing nodes at all usually means the layout differs from
            # what the XPath expects (e.g. a block or verification page).
            logging.warning('no listing items found on page')
            return

        for li in lis:
            # Listing title
            title = self._first(li.xpath('.//div[@class="title"]/a/text()'), None)
            if title is None:
                # NOTE(review): an <li> without a title link is presumably an
                # ad/placeholder item rather than a listing -- confirm
                # against the live page.
                continue
            # House details
            house_info = self._first(li.xpath('.//div[@class="houseInfo"]/text()'))
            # Location
            address = ' '.join(li.xpath('.//div[@class="positionInfo"]/a/text()'))
            # Unit price (yuan per square metre)
            price = self._first(li.xpath('.//div[@class="priceInfo"]/div[2]/span/text()'))
            # Follower count and publication time
            attention_num = self._first(li.xpath('.//div[@class="followInfo"]/text()'))
            # Tags
            tag = ' '.join(li.xpath('.//div[@class="tag"]/span/text()'))

            row = [title, house_info, address, price, attention_num, tag]
            sheet.append(row)
            logging.info(row)

    async def _crawl(self, pages):
        """Fetch and parse all ``pages`` concurrently, logging any failures."""
        # On Python < 3.10 asyncio primitives bind to the event loop that is
        # current when they are constructed; rebuild the semaphore inside
        # the running loop so it belongs to the loop that actually uses it.
        self.semaphore = asyncio.Semaphore(self.concurrency)

        pages = list(pages)
        # return_exceptions=True: one failing page is reported below instead
        # of aborting the crawl and discarding rows from the other pages.
        results = await asyncio.gather(
            *(self.scrape_index(page) for page in pages),
            return_exceptions=True,
        )
        for page, result in zip(pages, results):
            if isinstance(result, BaseException):
                logging.error('page %s failed: %r', page, result)

    def main(self, pages=None):
        """Run the crawl to completion on a fresh event loop.

        Args:
            pages: Iterable of page numbers to crawl. Defaults to pages
                1 through 100.
        """
        if pages is None:
            pages = range(1, 101)  # 100 pages of data
        # Coroutines are created inside the running loop; scheduling tasks
        # before a loop exists is deprecated on newer Python versions.
        asyncio.run(self._crawl(pages))
if __name__ == '__main__':
    # Script entry point: crawl, save the workbook, report the elapsed time.
    spider = Spider()
    try:
        spider.main()
    finally:
        # Save in `finally` so rows collected before a failure or a manual
        # interrupt are still written to disk instead of being lost.
        wb.save('house2.xlsx')
    delta = (datetime.datetime.now() - start).total_seconds()
    print("用时:{:.3f}s".format(delta))
这个代码会触发链家的反爬机制,等3个小时就可以继续使用。代码后续再修改。