"""
(一)显示影片基本信息
访问豆瓣电影Top250(豆瓣电影 Top 250),获取每部电影的中文片名、排名、评分及其对应的链接,按照“排名-中文片名-评分-链接”的格式显示在屏幕上。
(二)存储影片详细信息
访问豆瓣电影Top250(豆瓣电影 Top 250),在问题1的基础上,获取每部电影的导演、编剧、主演、类型、上映时间、片长、评分人数以及剧情简介等信息,并将获取到的信息保存至本地文件中。
"""
import requests
from bs4 import BeautifulSoup
import json
import time
# Fetch the basic info for every movie on one listing page.
def get_movies_info_from_page(url):
    """Scrape one page of the Douban Top 250 listing.

    For each movie entry on the page, prints a line in the form
    "rank - title - rating - link" and collects the same four fields.

    Returns a list of dicts with keys 'rank', 'title', 'rating', 'link';
    an empty list if the page could not be retrieved.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve data from {url}")
        return []
    page = BeautifulSoup(response.text, 'html.parser')
    results = []
    for entry in page.find_all('div', class_='item'):
        basics = {
            'rank': entry.find('em').get_text(),                          # list position
            'title': entry.find('span', class_='title').get_text(),       # first title span = Chinese title
            'rating': entry.find('span', class_='rating_num').get_text(), # score
            'link': entry.find('a')['href'],                              # detail-page URL
        }
        print(f"{basics['rank']} - {basics['title']} - {basics['rating']} - {basics['link']}")
        results.append(basics)
    return results
# Fetch one movie's detail page and enrich the movie dict in place.
def get_movie_detail(movie):
    """Fetch the detail page at movie['link'] and add detail fields.

    Adds the keys 'director', 'scriptwriters', 'actors', 'genre',
    'release_date', 'duration', 'votes' and 'summary' to *movie* and
    returns it. If the HTTP request fails, the dict is returned without
    the detail fields.

    NOTE(review): field extraction assumes Douban's RDFa markup
    (rel="v:directedBy", property="v:genre", ...) — verify against a
    live page if the site layout changes.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36"
    }
    response = requests.get(movie['link'], headers=headers)
    if response.status_code != 200:
        print(f"Failed to retrieve details from {movie['link']}")
        return movie
    soup = BeautifulSoup(response.text, 'html.parser')
    info = soup.find('div', id='info')
    if info:
        # A film can have several directors — join them all (the old code
        # kept only the first one).
        movie['director'] = ','.join(
            a.get_text() for a in info.find_all('a', rel='v:directedBy'))  # 导演
        # BUGFIX: scriptwriter links carry no rel attribute, and the
        # "编剧" label text is in a sibling <span class="pl">, never inside
        # the <a> tags — so the old rel=False + "'编剧' in str(a)" test
        # matched nothing and 'scriptwriters' was always empty. Locate the
        # label span and read the anchors from its "attrs" sibling instead.
        writers = []
        label = info.find('span', class_='pl', string='编剧')
        if label:
            attrs = label.find_next_sibling('span', class_='attrs')
            if attrs:
                writers = [a.get_text() for a in attrs.find_all('a')]
        movie['scriptwriters'] = ','.join(writers)  # 编剧
        movie['actors'] = ','.join(
            a.get_text() for a in info.find_all('a', rel='v:starring'))  # 主演
        movie['genre'] = ','.join(
            s.get_text() for s in info.find_all('span', property='v:genre'))  # 类型
        # Join every release date (mainland premiere, festival dates, ...);
        # the old code silently dropped all but the first.
        movie['release_date'] = ','.join(
            s.get_text() for s in info.find_all('span', property='v:initialReleaseDate'))  # 上映时间
        runtime = info.find('span', property='v:runtime')
        movie['duration'] = runtime.get_text() if runtime else ''  # 片长
    votes = soup.find('span', property='v:votes')
    movie['votes'] = votes.get_text() if votes else ''  # 评分人数
    summary = soup.find('span', property='v:summary')
    movie['summary'] = summary.get_text().strip() if summary else ''  # 剧情简介
    return movie
# Crawl the full Top 250 (10 listing pages x 25 movies) and save to disk.
def scrape_douban_top_250():
    """Crawl all Douban Top 250 movies and write them to a JSON file.

    Walks the ten listing pages, fetches each movie's detail page, and
    dumps the accumulated records to 'douban_top_250_movies.json'
    (UTF-8, human-readable).
    """
    base_url = 'https://movie.douban.com/top250?start='
    all_movies = []
    for start in range(0, 250, 25):
        movies_info = get_movies_info_from_page(base_url + str(start))
        for movie in movies_info:
            all_movies.append(get_movie_detail(movie))
            # BUGFIX: throttle per *detail* request, not per listing page —
            # the old placement fired 25 detail fetches back-to-back, which
            # defeats the stated purpose of avoiding a ban.
            time.sleep(2)
    # Persist everything once the crawl completes.
    with open('douban_top_250_movies.json', 'w', encoding='utf-8') as f:
        json.dump(all_movies, f, ensure_ascii=False, indent=4)


# Guard the entry point so importing this module doesn't start a crawl.
if __name__ == "__main__":
    scrape_douban_top_250()