feapder Framework
1. Introduction

Overview

feapder is an easy-to-learn yet powerful Python crawler framework:
- Four built-in spider types (AirSpider, Spider, TaskSpider, BatchSpider) cover the needs of different scenarios
- Supports resumable crawling, monitoring and alerting, browser rendering, large-scale data deduplication, and more
- Ships with feaplat, a powerful crawler management system that provides convenient deployment and scheduling
Documentation and Environment Setup

Official docs: Introduction & Installation - feapder official documentation | feapder-document

# Before installing, it is recommended to create a fresh virtual environment with miniconda3
conda create -n feapder_base python=3.9
conda activate feapder_base
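With the environment activated, install feapder itself from PyPI. The framework publishes a lite and a full install; the full one pulls in the optional dependencies (such as browser rendering) used later in these notes:

# lite install: core framework only
pip install feapder
# full install: all optional dependencies included
pip install "feapder[all]"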
Execution Flow

Core: spider

spider scheduler
-> start_request (packs Request objects) and submits them to
-> request_buffer (request buffer), which pushes the Request objects to
-> the task queue
-> collector (task collector)
-> parser_control (parser controller)
-> downloader (wraps the result into a Response)
-> parser_control distinguishes the object type
-> the Response is handed to the parsers to extract page data
-> cleaning finished
-> parser_control
-> parsing finished
-> item_buffer (item buffer)
-> data written to the database (db)
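Only a few of these stages are hooks you implement yourself; the rest are framework internals. A minimal sketch of where the user-facing hooks sit in this flow (the URL is a placeholder):

import feapder


class MinimalSpider(feapder.AirSpider):
    def start_requests(self):
        # packs Request objects and hands them to request_buffer / the task queue
        yield feapder.Request("https://example.com")

    def download_midware(self, request):
        # runs just before the downloader sends the request
        return request

    def parse(self, request, response):
        # called by parser_control with the downloaded Response;
        # yielding an Item here would route it through item_buffer into the db
        print(response.xpath("//title/text()").extract_first())


if __name__ == "__main__":
    MinimalSpider().start()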
2. Basic Usage of feapder

Commands

# Help
feapder create -h

Common flags:
-p  create a project
-s  create a spider
-i  create an item (with field validation)
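For example (the names here are placeholders):

feapder create -p my_project   # scaffold a complete project (see section 8)
feapder create -s my_spider    # create a single spider file
feapder create -i my_table     # generate an item from an existing database table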
Creating a spider

feapder create -s douban

After running the command, you must manually pick the spider template to use:
- AirSpider: lightweight spider, low learning curve, quick to pick up
- Spider: distributed spider supporting resumable crawling, crawler alerting, automatic data persistence, and more
- TaskSpider: distributed spider that encapsulates the logic for fetching seed tasks, with built-in support for pulling tasks from redis or mysql; fetching tasks from other sources can be implemented via customization
- BatchSpider: batch spider that collects data periodically, automatically partitioning the data by the configured collection cycle (e.g., a full refresh of product sales every 7 days)
Usage
AirSpider
# -*- coding: utf-8 -*-
"""
Created on 2025-02-16 10:10:31
---------
@summary:
---------
@author: hp
"""
import feapder
from feapder import Request


class Douban(feapder.AirSpider):
    def start_requests(self):
        for page in range(10):
            yield feapder.Request(f'https://movie.douban.com/top250?start={page * 25}&filter=')
            # yield feapder.Request("https://movie.douban.com/top250?start=25&filter=")

    # def download_midware(self, request: Request):
    #     if request.url.startswith("https://movie.douban.com/top250"):
    #         request.proxies = {"http": "http://127.0.0.1:7897", "https": "https://127.0.0.1:7897"}
    #     return request

    def parse(self, request, response):
        li_list = response.xpath("//ol/li/div[@class='item']")
        for li in li_list:
            item = dict()
            item['title'] = li.xpath(".//div[@class='hd']/a/span[1]/text()").extract_first()
            item['detail_url'] = li.xpath(".//div[@class='hd']/a/@href").extract_first()
            item['score'] = li.xpath(".//div[@class='star']/span[2]/text()").extract_first()
            # follow the detail page; carry the partially-filled item along on the request
            yield feapder.Request(item['detail_url'], callback=self.detail_parse, item=item)

    def detail_parse(self, request, response):
        # long synopses live in a hidden <span class="all hidden">; fall back to the first <span>
        if response.xpath("//div[@class='indent']/span[@class='all hidden']/text()"):
            request.item['detail_text'] = response.xpath("//div[@class='indent']/span[@class='all hidden']/text()").extract_first().strip()
        else:
            request.item['detail_text'] = response.xpath("//div[@class='indent']/span[1]/text()").extract_first().strip()
        print(request.item)


if __name__ == "__main__":
    Douban().start()
Storing Data in MySQL

feapder create --setting           # generate setting.py in the current directory
feapder create -i douban_feapder   # the item name must match the name of the database table
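Both commands assume a reachable MySQL instance configured in setting.py, and `feapder create -i` reads the column names from an existing table, so the douban_feapder table (with columns title, detail_url, score, detail_text matching the parse below) must be created first. A minimal sketch of the connection settings (the values are placeholders for your own environment):

# setting.py
MYSQL_IP = "localhost"
MYSQL_PORT = 3306
MYSQL_DB = "feapder_test"   # database that contains the douban_feapder table
MYSQL_USER_NAME = "root"
MYSQL_USER_PASS = "your_password"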
# -*- coding: utf-8 -*-
"""
Created on 2025-02-16 10:10:31
---------
@summary:
---------
@author: hp
"""
import feapder
from douban_feapder_item import DoubanFeapderItem


class Douban(feapder.AirSpider):
    def start_requests(self):
        for page in range(10):
            yield feapder.Request(f'https://movie.douban.com/top250?start={page * 25}&filter=')
            # yield feapder.Request("https://movie.douban.com/top250?start=25&filter=")

    def parse(self, request, response):
        li_list = response.xpath("//ol/li/div[@class='item']")
        for li in li_list:
            item = DoubanFeapderItem()
            item['title'] = li.xpath(".//div[@class='hd']/a/span[1]/text()").extract_first()
            item['detail_url'] = li.xpath(".//div[@class='hd']/a/@href").extract_first()
            item['score'] = li.xpath(".//div[@class='star']/span[2]/text()").extract_first()
            yield feapder.Request(item['detail_url'], callback=self.detail_parse, item=item)

    def detail_parse(self, request, response):
        # long synopses live in a hidden <span class="all hidden">; fall back to the first <span>
        if response.xpath("//div[@class='indent']/span[@class='all hidden']/text()"):
            request.item['detail_text'] = response.xpath("//div[@class='indent']/span[@class='all hidden']/text()").extract_first().strip()
        else:
            request.item['detail_text'] = response.xpath("//div[@class='indent']/span[1]/text()").extract_first().strip()
        # yield the item so it is written to the database automatically
        yield request.item


if __name__ == "__main__":
    Douban().start()
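For reference, the generated douban_feapder_item.py looks roughly like the sketch below (the exact template may differ by feapder version): one attribute per table column, which is what makes the dict-style item['...'] assignments above work.

from feapder import Item


class DoubanFeapderItem(Item):
    def __init__(self, *args, **kwargs):
        # one attribute per column of the douban_feapder table
        self.title = None
        self.detail_url = None
        self.score = None
        self.detail_text = None
        super().__init__(**kwargs)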
3. Using Download Middleware

- Download middleware is used to process a request before it is sent, e.g., to add cookies, headers, etc.
- By default, requests from every parse function pass through this download middleware before being sent
# Default middleware
# def download_midware(self, request: Request):
#     request.headers = {
#         "User-Agent": "abc",
#     }
#     request.proxies = {
#         'http': 'http://127.0.0.1:7897'
#     }
#     request.cookies = {
#         'a': 'b',
#     }
#     return request

# Custom middleware: wire it up via the download_midware parameter of the
# request (the parameter also accepts a list)
def custom_download_midware(self, request: Request):
    request.headers = {
        "User-Agent": "abc",
    }
    return request

def start_requests(self):
    # A request can carry multiple custom middlewares: set download_midware to a list.
    # In practice this is rarely needed; configuring everything in one middleware is enough.
    yield feapder.Request("https://movie.douban.com/top250?start=0&filter=",
                          download_midware=self.custom_download_midware)
4. Validating the Response

def validate(self, request, response):
    print('status code:', response.status_code)
    if response.status_code != 200:
        print('unexpected status code:', response.status_code)
        # return False  # discard the current request
        raise Exception('retry')  # retry the current request
    # returning True or None lets the response pass through to parse
5. Browser Rendering

import feapder
from selenium.webdriver.common.by import By
from feapder.utils.webdriver import WebDriver


class Baidu(feapder.AirSpider):
    def start_requests(self):
        # render=True makes feapder fetch the page with a real browser
        yield feapder.Request("https://www.baidu.com", render=True)

    def parse(self, request, response):
        browser: WebDriver = response.browser
        browser.find_element(By.ID, "kw").send_keys("feapder")
        browser.find_element(By.ID, "su").click()


if __name__ == "__main__":
    Baidu().start()

Requires enabling the browser-rendering section in setting.py.
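Concretely, that means uncommenting the WEBDRIVER block in setting.py. A minimal sketch (the full reference block appears in section 7; auto_install_driver=True is an optional convenience, not part of the original notes):

# setting.py
WEBDRIVER = dict(
    pool_size=1,               # number of browsers
    headless=False,            # set True to run without a visible window
    driver_type="CHROME",
    auto_install_driver=True,  # let feapder download a matching driver
)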
6. Using Browser Rendering to Capture Dynamic API Responses

import feapder
from feapder.utils.webdriver import WebDriver


class TestXhrInfo(feapder.AirSpider):
    def start_requests(self):
        yield feapder.Request("https://spidertools.cn", render=True)

    def parse(self, request, response):
        browser: WebDriver = response.browser
        # ad = browser.xhr_text('/ad')
        # print(ad)
        xhr_response = browser.xhr_response("/ad")
        print("request url:", xhr_response.request.url)
        print("request headers:", xhr_response.request.headers)
        print("request body:", xhr_response.request.data)
        print("response headers:", xhr_response.headers)
        print("response url:", xhr_response.url)
        print("response content:", xhr_response.content)


if __name__ == "__main__":
    TestXhrInfo().start()

Enable this in the browser-rendering section of setting.py:

xhr_url_regexes=['/ad'],  # XHR endpoints to intercept; regex supported; list type
7. Using feapder to Scrape Fresh-Graduate Job Listings
# -*- coding: utf-8 -*-
"""
Created on 2025-02-16 14:40:06
---------
@summary:
---------
@author: hp
"""
import time

import feapder
from feapder.utils.webdriver import WebDriver


class JobInfo(feapder.AirSpider):
    def start_requests(self):
        yield feapder.Request("https://q.yingjiesheng.com/jobs/search/Python", render=True)

    def parse(self, request, response):
        browser: WebDriver = response.browser
        time.sleep(3)  # give the page time to fire its XHR requests
        # parse the intercepted search API response as JSON
        json_data = browser.xhr_json('open/noauth/job/search')
        for temp in json_data['resultbody']['searchData']['joblist']['items']:
            item = dict()
            item['jobname'] = temp['jobname']
            item['coname'] = temp['coname']
            item['jobarea'] = temp['jobarea']
            item['issuedate'] = temp['issuedate']
            item['jobtag'] = temp['jobtag']
            item['providesalary'] = temp['providesalary']
            print(item)


if __name__ == "__main__":
    JobInfo().start()
# setting.py
# Browser rendering
WEBDRIVER = dict(
    pool_size=1,  # number of browsers
    load_images=True,  # whether to load images
    user_agent=None,  # a string, or a no-arg function that returns the user agent
    proxy=None,  # xxx.xxx.xxx.xxx:xxxx, or a no-arg function that returns a proxy address
    headless=False,  # whether to run headless
    driver_type="CHROME",  # CHROME, EDGE, PHANTOMJS, FIREFOX
    timeout=30,  # request timeout
    window_size=(1024, 800),  # window size
    executable_path=None,  # driver path; None uses the default location
    render_time=0,  # render time: wait this long after opening the page before grabbing the source
    custom_argument=[
        "--ignore-certificate-errors",
        "--disable-blink-features=AutomationControlled",
    ],  # custom browser launch arguments
    xhr_url_regexes=['open/noauth/job/search'],  # XHR endpoints to intercept; regex supported; list type
    auto_install_driver=False,  # auto-download the browser driver (chrome and firefox supported)
    download_path=None,  # file download directory
    use_stealth_js=False,  # use stealth.min.js to hide browser automation fingerprints
)
8. Creating a Full feapder Project

Command

feapder create -p wp_spider
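This scaffolds the project skeleton. From memory of the generated layout (details may vary by feapder version), it looks roughly like:

wp_spider
├── items/       # item definitions (feapder create -i ...)
├── spiders/     # spider code (feapder create -s ...)
├── main.py      # entry point for launching the spiders
└── setting.py   # project-wide configuration (db, WEBDRIVER, ...)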
# -*- coding: utf-8 -*-
"""
Created on 2025-02-16 15:15:35
---------
@summary:
---------
@author: hp
"""
import time
from random import randint

import feapder
from feapder.utils.webdriver import WebDriver
from selenium.webdriver.common.by import By


class WpShopInfo(feapder.AirSpider):
    def start_requests(self):
        yield feapder.Request("https://category.vip.com/suggest.php?keyword=%E7%94%B5%E8%84%91&ff=235|12|1|1&tfs_url=%2F%2Fmapi-pc.vip.com%2Fvips-mobile%2Frest%2Fshopping%2Fpc%2Fsearch%2Fproduct%2Frank", render=True)

    def parse(self, request, response):
        browser: WebDriver = response.browser
        time.sleep(20)  # leave time to log in manually so the test can proceed
        # scroll down so the page lazy-loads its data
        self.drop_down(browser)
        # grab all product divs on the page
        div_list = browser.find_elements(By.XPATH, '//section[@id="J_searchCatList"]/div[@class="c-goods-item J-goods-item c-goods-item--auto-width"]')
        # print(div_list)
        # iterate over div_list and extract each product's info
        for div in div_list:
            title = div.find_element(By.XPATH, './/div[2]/div[2]').text
            price = div.find_element(By.XPATH, './/div[@class="c-goods-item__sale-price J-goods-item__sale-price"]').text
            print(title, price)
        # pagination
        self.next_page(browser)
        next_url = browser.current_url  # URL after turning the page
        print(next_url)
        yield feapder.Request(next_url, render=True, callback=self.parse)

    def drop_down(self, browser):
        for i in range(1, 13):
            js_code = f'document.documentElement.scrollTop={i * 1000}'
            browser.execute_script(js_code)
            time.sleep(randint(1, 2))

    def next_page(self, browser):
        # find_element raises when the button is missing, so the except
        # branch is what actually handles the last page
        try:
            next_page = browser.find_element(By.XPATH, "//a[@class='cat-paging-next']/i")
            next_page.click()
        except Exception as e:
            print('last page:', e)
            browser.close()


if __name__ == "__main__":
    WpShopInfo().start()