In this section we parse the attraction data crawled earlier and save it to Excel for later use. This section covers:
(1) Parsing the attraction data
(2) Saving the data to Excel
1 Writing the spider
We keep improving the spider from section 2. Create a new spider file called spiders/qiongyou_3.py.
Starting with this section we parse the returned response directly, so there is no longer any need to save the HTML to disk.
We write a custom method that parses the page source and extracts the information we need:
while True:
    # Crawl only 3 pages for now (testing)
    if page_number > 3:
        break
    # Parse the current page content
    page_source = self.driver.page_source
    for item in self.parse_page(page_source):
        yield item
The parse_page method looks like this:
# Parse one page of results
def parse_page(self, page_source):
    response = scrapy.Selector(text=page_source)
    print(response)
    sights = response.xpath('//ul[@id="poiLists"]/li')
    print('sights=', sights)
    for sight in sights:
        item = TourItem()
        item['title'] = sight.xpath('.//h3[@class="title fontYaHei"]/a/text()').get().strip()
        item['title_en'] = (sight.xpath('.//h3[@class="title fontYaHei"]/a/span/text()').get() or '').strip()
        print(f"\033[92m{item['title']}\033[0m")
        print(f"\033[92m{item['title_en']}\033[0m")
        yield item
To get the whole flow working first, we extract only two fields: the attraction's Chinese name and its English name.
title_en is written this way because an attraction may have no foreign-language name. Extracting it without a guard would raise an error (.get() returns None, and None has no .strip()) and abort the crawl.
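If this (... or '').strip() pattern gets repetitive as more fields are added, one option is a small helper function. This is just a sketch; the name safe_get is our own invention, not part of Scrapy:
# Hypothetical helper: returns '' instead of raising when the XPath matches nothing
def safe_get(selector, xpath):
    return (selector.xpath(xpath).get() or '').strip()

# Inside parse_page it would be used like this:
#   item['title_en'] = safe_get(sight, './/h3[@class="title fontYaHei"]/a/span/text()')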
2 Modifying items
Edit items.py to define our own data structure:
# Define the data structure
class TourItem(scrapy.Item):
    title = scrapy.Field()
    title_en = scrapy.Field()
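As a side note, a scrapy.Item supports dict-style access on its declared fields, which is why the pipeline in the next step can simply call dict(item) to turn each item into a row. A quick illustration:
item = TourItem()
item['title'] = '东京塔'
item['title_en'] = 'Tokyo Tower'
print(dict(item))  # {'title': '东京塔', 'title_en': 'Tokyo Tower'}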
3 Modifying pipelines
We use a pipeline to save the data. It buffers every item in memory and writes the whole table once when the spider closes:
import pandas as pd

class TourPipeline:
    def __init__(self):
        self.data = []

    def process_item(self, item, spider):
        self.data.append(dict(item))
        return item

    def close_spider(self, spider):
        df = pd.DataFrame(self.data)
        # Use pandas to save the Tokyo attractions to an Excel file
        df.to_excel('tokyo_sights.xlsx', index=False)
        spider.log('Saved data to tokyo_sights.xlsx')
pandas needs to be installed, along with openpyxl, the engine pandas uses to write .xlsx files:
pip install pandas openpyxl
The pipeline also has to be enabled in settings.py (300 is the pipeline's order value; pipelines with lower values run first):
ITEM_PIPELINES = {
    'tutorial2.pipelines.TourPipeline': 300,
}
4 Running the code
scrapy crawl qys3
While testing we noticed that the driver usually starts slowly on the first run (we'll explain why in the next installment), while later runs are much faster, and the data does get saved to Excel.
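To double-check the output, you can read the file back with pandas. A minimal check, assuming the file was written to the directory the crawl was run from:
import pandas as pd

df = pd.read_excel('tokyo_sights.xlsx')
print(df.head())        # first few attractions
print(len(df), 'rows')  # total number of rows saved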
5 Complete qiongyou_3.py and items.py code
We went on to parse the remaining fields in the same way; the final code is below.
# qiongyou_3.py
import re
import scrapy
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.remote.remote_connection import LOGGER
import logging
import time
from tutorial2.items import TourItem

class QiongyouSpider(scrapy.Spider):
    name = 'qys3'
    allowed_domains = ['qyer.com']
    start_urls = ['https://place.qyer.com/tokyo/sight/']

    def __init__(self, *args, **kwargs):
        super(QiongyouSpider, self).__init__(*args, **kwargs)
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        LOGGER.setLevel(logging.WARNING)
        self.driver = webdriver.Chrome(options=options)  # assumes ChromeDriver is on PATH; point to its actual path if not

    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        # Load the page in the Selenium-driven browser
        self.driver.get(response.url)
        # Wait for the attraction list to appear
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//ul[@id="poiLists"]'))
        )
        page_number = 1
        while True:
            # Crawl up to 40 pages
            if page_number > 40:
                break
            # Parse the current page content
            page_source = self.driver.page_source
            for item in self.parse_page(page_source):
                yield item
            # Find and click the "next page" (下一页) button
            try:
                # Click via JavaScript
                next_button = self.driver.find_element(By.XPATH, '//a[@title="下一页"]')
                self.driver.execute_script("arguments[0].click();", next_button)
                page_number += 1
                time.sleep(2)  # Wait for the page to load
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((By.XPATH, '//ul[@id="poiLists"]'))
                )
            except Exception as e:
                self.log(f"No more pages or failed to load next page: {e}")
                break
        self.driver.quit()

    # Parse one page of results
    def parse_page(self, page_source):
        response = scrapy.Selector(text=page_source)
        print(response)
        sights = response.xpath('//ul[@id="poiLists"]/li')
        print('sights=', sights)
        for sight in sights:
            item = TourItem()
            item['title'] = sight.xpath('.//h3[@class="title fontYaHei"]/a/text()').get().strip()
            item['title_en'] = (sight.xpath('.//h3[@class="title fontYaHei"]/a/span/text()').get() or '').strip()
            item['img'] = (sight.xpath('.//p[@class="pics"]/a/img/@src').get() or '').strip()
            item['score'] = (sight.xpath('.//div[@class="info"]/span[@class="grade"]/text()').get() or '').strip()
            comment = (sight.xpath('.//span[@class="dping"]/a/text()').get() or '').strip()
            item['comment_url'] = 'https:' + (sight.xpath('.//span[@class="dping"]/a/@href').get() or '').strip()
            item['rank_title'] = (sight.xpath('.//div[@class="info"]//span[@class="infoSide"]/text()').get() or '').strip()
            item['select_user'] = (sight.xpath('.//p[@class="user"]/a/img/@src').get() or '').strip()
            item['select_comment'] = (sight.xpath('.//div[@class="txt"]/text() | .//p[@class="txt"]/text()').get() or '').strip()
            rank = (sight.xpath('.//div[@class="info"]//em[@class="rank orange"]/text()').get() or '').strip()
            # Extract the review count from text like "1234人点评"
            review_count_pattern = r'(\d+)人点评'
            review_count_match = re.search(review_count_pattern, comment)
            if review_count_match:
                item['comment'] = int(review_count_match.group(1))
            else:
                item['comment'] = 0
            # Extract the numeric rank from text like "第5位"
            rank_pattern = r'第(\d+)位'
            rank_match = re.search(rank_pattern, rank)
            if rank_match:
                item['rank'] = int(rank_match.group(1))
            else:
                item['rank'] = 0
            print(f"\033[92m{item['title']}\033[0m")
            print(f"\033[92m{item['title_en']}\033[0m")
            print(f"\033[92m{item['img']}\033[0m")
            print(f"\033[92m{item['score']}\033[0m")
            print(f"\033[92m{item['comment_url']}\033[0m")
            print(f"\033[92m{item['comment']}\033[0m")
            print(f"\033[92m{item['rank_title']}\033[0m")
            print(f"\033[92m{item['rank']}\033[0m")
            print(f"\033[92m{item['select_user']}\033[0m")
            print(f"\033[92m{item['select_comment']}\033[0m")
            yield item
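The two regular expressions above pull the numbers out of text like "1234人点评" (review count) and "第5位" (ranking). A standalone check of the same patterns on hypothetical sample strings:
import re

print(re.search(r'(\d+)人点评', '1234人点评').group(1))       # -> 1234
print(re.search(r'第(\d+)位', '东京景点排名第5位').group(1))  # -> 5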
# items.py
import scrapy

# Define the data structure
class TourItem(scrapy.Item):
    title = scrapy.Field()
    title_en = scrapy.Field()
    img = scrapy.Field()
    score = scrapy.Field()
    comment = scrapy.Field()
    comment_url = scrapy.Field()
    rank_title = scrapy.Field()
    rank = scrapy.Field()
    select_user = scrapy.Field()
    select_comment = scrapy.Field()
Here is what the crawled Excel file looks like:
In the next installment we'll store the data in MySQL.