注意:以下内容仅供技术研究,请遵守目标网站的 robots.txt 规定,控制请求频率,避免对目标服务器造成过大压力!
1. 环境准备
python
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
import matplotlib.pyplot as plt
2. 爬虫核心代码(带反爬策略)
python
def fetch_51job_data(keyword, max_pages=5):
jobs = []
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Accept-Language': 'zh-CN,zh;q=0.9'
}
for page in range(1, max_pages+1):
url = f'https://search.51job.com/list/000000,000000,0000,00,9,99,{keyword},2,{page}.html'
try:
response = requests.get(url, headers=headers, timeout=10)
response.encoding = 'gbk'