豆瓣Top250电影

news2026/2/8 13:39:13

爬取电影名字、年份、评分、评价人数


import requests
import re
import csv

"""1、拿到页面源代码"""

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.57'
}

# Step 2: parse the data.
# A single regex pulls the movie name, year, score and number of ratings out
# of each <li> entry. It is identical for every page, so compile it once
# instead of on every loop iteration.
obj = re.compile(r'<li>.*?<span class="title">(?P<name>.*?)</span>.*?<div class="bd">.*? '
                 r'<p class="">.*?<br>.*?(?P<year>.*?)&nbsp;/&nbsp;.*?'
                 r' <div class="star">.*?<span class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
                 r'<span>(?P<people>.*?)人评价</span>', re.S)

# Open the CSV exactly once, before paging. Opening it with mode="w" inside
# the page loop truncated the file on every page, so only the last page's
# 25 rows survived and the earlier file handles were never closed.
# newline="" is what the csv module expects (avoids blank rows on Windows);
# utf-8-sig keeps non-ASCII titles intact and lets spreadsheet tools detect
# the encoding.
with open("data.csv", mode="w", encoding="utf-8-sig", newline="") as f:
    csvwriter = csv.writer(f)
    # The listing is paged 25 entries at a time: start=0, 25, ..., 225.
    for title in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={title}&filter='
        # A timeout keeps the script from hanging forever on a stalled
        # connection.
        resp = requests.get(url, headers=headers, timeout=10)
        html = resp.text

        result = obj.finditer(html)
        for it in result:
            # Collect the named groups as a dict; the year capture carries
            # surrounding whitespace/newlines from the HTML, so strip it.
            dic = it.groupdict()
            dic['year'] = dic['year'].strip()
            # Row order follows the group order: name, year, score, people.
            csvwriter.writerow(dic.values())

print("over!")

运行后打开data.csv