Note: The following content is for technical research only. Please comply with the target site's robots.txt rules and throttle your request frequency to avoid putting excessive load on the target server.
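As one way to honor that reminder programmatically, the minimal sketch below checks a site's robots.txt with the standard-library `urllib.robotparser` before crawling a path; the `example.com` URLs are placeholders, not the actual target site.

```python
from urllib.robotparser import RobotFileParser

# Placeholder robots.txt location; substitute the real target site's domain
rp = RobotFileParser()
rp.set_url('https://example.com/robots.txt')
rp.read()

# Only crawl paths that robots.txt allows for generic crawlers ('*')
if rp.can_fetch('*', 'https://example.com/category/index_2.html'):
    print('Path allowed by robots.txt')
else:
    print('Path disallowed by robots.txt; skip it')
```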
1. Environment Setup and Anti-Scraping Strategy
```python
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import random
from fake_useragent import UserAgent  # Install with: pip install fake-useragent

# Initialize a random User-Agent generator
ua = UserAgent()
headers = {'User-Agent': ua.random}

# Example proxy pool (maintain your own list of working proxies)
proxies = [
    {'http': 'http://123.45.67.89:8080'},
    {'http': 'http://112.114.96.34:3128'}
]
```
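To make the pieces above easier to reuse, here is a minimal helper sketch that combines a freshly rotated User-Agent, an optionally chosen random proxy, and a randomized delay into a single request call. The `polite_get` name and the 1-3 second delay range are illustrative assumptions, not part of the original code; the scraper in the next section performs the same steps inline.

```python
def polite_get(url, timeout=10):
    """Fetch a URL with a rotated User-Agent, a random proxy (if configured),
    and a randomized pause so the request rate stays low."""
    time.sleep(random.uniform(1, 3))                     # illustrative 1-3 s throttle between requests
    proxy = random.choice(proxies) if proxies else None  # reuse the proxy pool defined above
    return requests.get(
        url,
        headers={'User-Agent': ua.random},               # fresh random User-Agent per request
        proxies=proxy,
        timeout=timeout,
    )
```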
2. Core Scraper Code (with Category Traversal)
```python
def fetch_book_info(base_url, max_pages=3):
    all_books = []

    # Step 1: collect all category links from the navigation bar
    try:
        response = requests.get(base_url, headers={'User-Agent': ua.random}, timeout=10)
        soup = BeautifulSoup(response.content.decode('gbk'), 'html.parser')  # note the GBK encoding
        # Extract the category navigation links (example selector; adjust to the actual page)
        category_links = {
            item.text: item['href']
            for item in soup.select('.nav li a')[1:-1]  # skip the first and last non-category items
        }
    except Exception as e:
        print("Failed to fetch categories:", e)
        return []

    # Step 2: iterate over each category
    for category_name, category_url in category_links.items():
        print(f"Scraping category: {category_name}")
        for page in range(1, max_pages + 1):
            # Build the paginated URL (example format; adjust to the actual site)
            page_url = f"{category_url}index_{page}.html" if page > 1 else category_url
            try:
                # Pick a random proxy for this request
                proxy = random.choice(proxies) if proxies else None
                page_res = requests.get(page_url, headers={'User-Agent': ua.random},
                                        proxies=proxy, timeout=10)