Python_使用requests获取当当网榜单中的图书数据

news2025/10/24 4:34:36

使用requests获取当当网榜单中的图书数据

- 使用到的库
- 概述
- 发送请求
- xpath解析提取数据
- - tips
- 完整代码

使用到的库

requests
etree

概述

主要抓取目标为

当当网图书畅销榜中近24小时畅销的图书信息

主要提取的数据为

排名、书名、评论数、推荐度、作者信息、出版时间、出版社、折扣价、原价、折扣率、电子书价格、详情页链接

在这里插入图片描述

发送请求

# 设置cookie
cookies = {
    'ddscreen': '2',
    'dest_area': 'country_id%3D9000%26province_id%3D111%26city_id%20%3D0%26district_id%3D0%26town_id%3D0',
    '__permanent_id': '20230727111738725280659109780638954',
    '__visit_id': '20230727111738728252759597714636934',
    '__out_refer': '',
    '__trace_id': '20230727111738729253018287016217767'
}

# 设置请求头
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    # Requests sorts cookies= alphabetically
    # 'Cookie': 'ddscreen=2; dest_area=country_id%3D9000%26province_id%3D111%26city_id%20%3D0%26district_id%3D0%26town_id%3D0; __permanent_id=20230311201918630282239304091816610; __visit_id=20230613202543057355948286659409526; __out_refer=; __rpm=...1686659146436%7C...1686659147726; __trace_id=20230613202548496123352493475398790',
    'Pragma': 'no-cache',
    'Referer': 'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-2-1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
}

...

# 发送请求
response = requests.get(url, cookies=cookies, headers=headers, verify=False)

xpath解析提取数据

本案例中主要使用Xpath来提取数据，也可以使用其他方法提取数据

# 使用XPATH解析提取数据
        lists = html.xpath('//div//ul[@class="bang_list clearfix bang_list_mode"]//li')

        # 提取：排名、书名、评论数、推荐度、作者信息、出版时间、出版社、折扣价、原价、折扣率、电子书价格
        for li in lists:
            book_info = {
                "排名": "".join(li.xpath('./div[1]/text()')).replace(".", ""),
                "书名": "".join(li.xpath('.//div[3]/a/@title')),
                "评论数": "".join(li.xpath('.//div[4]/a/text()')).replace("条评论", ""),
                "推荐度": "".join(li.xpath('.//div[4]/span/text()')).replace("推荐", ""),
                "作者信息": "".join(li.xpath('.//div[5]/a/text()')),
                "出版时间": "".join(li.xpath('.//div[6]/span/text()')),
                "出版社": "".join(li.xpath('.//div[6]/a/text()')),
                "折扣价": "".join(li.xpath('.//div[@class="price"]//span[@class="price_n"]/text()')),
                "原价": "".join(li.xpath('.//div[@class="price"]//span[@class="price_r"]/text()')),
                "折扣率": "".join(li.xpath('.//div[@class="price"]//span[@class="price_s"]/text()')),
                "电子书价格": "".join(li.xpath('.//div[@class="price"]//p[@class="price_e"]/span/text()')),
                "详情页链接": "".join(li.xpath('.//div[@class="name"]/a/@href'))
            }

tips

提取到的数据可变量直接接收然后存放到列表中，如下

# ranking = "".join(li.xpath('./div[1]/text()')).replace(".", "")
# name = "".join(li.xpath('.//div[3]/a/@title'))
# comments = "".join(li.xpath('.//div[4]/a/text()')).replace("条评论", "")
# recommend_level = "".join(li.xpath('.//div[4]/span/text()')).replace("推荐", "")
# author = "".join(li.xpath('.//div[5]/a/text()'))
# publish_time = "".join(li.xpath('.//div[6]/span/text()'))
# press = "".join(li.xpath('.//div[6]/a/text()'))
# discount_price = "".join(li.xpath('.//div[@class="price"]//span[@class="price_n"]/text()'))
# original_price = "".join(li.xpath('.//div[@class="price"]//span[@class="price_r"]/text()'))
# discount_rate = "".join(li.xpath('.//div[@class="price"]//span[@class="price_s"]/text()'))
# ebook_Price = "".join(li.xpath('.//div[@class="price"]//p[@class="price_e"]/span/text()'))
# details_url = "".join(li.xpath('.//div[@class="name"]/a/@href'))
#
# result_list = [ranking, name, comments, recommend_level, author, publish_time, press, discount_price, original_price, discount_rate, ebook_Price, details_url]

完整代码

import requests
from lxml import etree


# 获取当当网数据
def crawl_dd():
    # 设置cookie
    cookies = {
        'ddscreen': '2',
        'dest_area': 'country_id%3D9000%26province_id%3D111%26city_id%20%3D0%26district_id%3D0%26town_id%3D0',
        '__permanent_id': '20230727111738725280659109780638954',
        '__visit_id': '20230727111738728252759597714636934',
        '__out_refer': '',
        '__trace_id': '20230727111738729253018287016217767'
    }

    # 设置请求头
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Referer': 'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-2-1',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    }

    # url地址
    # url = 'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-1'

    # 抓取指定页数的数据
    for i in range(1, 5):
    	# 请求地址
        url = f'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-{i}'

        # 发送请求
        response = requests.get(url, cookies=cookies, headers=headers, verify=False)

        # 将网页响应结果转换为文本
        content = response.text

        # 解析字符串格式的HTML文档对象，将传进去的字符串转变成_Element对象
        html = etree.HTML(content)

        # 使用XPATH解析提取数据
        lists = html.xpath('//div//ul[@class="bang_list clearfix bang_list_mode"]//li')

        # 提取：排名、书名、评论数、推荐度、作者信息、出版时间、出版社、折扣价、原价、折扣率、电子书价格
        for li in lists:
            book_info = {
                "排名": "".join(li.xpath('./div[1]/text()')).replace(".", ""),
                "书名": "".join(li.xpath('.//div[3]/a/@title')),
                "评论数": "".join(li.xpath('.//div[4]/a/text()')).replace("条评论", ""),
                "推荐度": "".join(li.xpath('.//div[4]/span/text()')).replace("推荐", ""),
                "作者信息": "".join(li.xpath('.//div[5]/a/text()')),
                "出版时间": "".join(li.xpath('.//div[6]/span/text()')),
                "出版社": "".join(li.xpath('.//div[6]/a/text()')),
                "折扣价": "".join(li.xpath('.//div[@class="price"]//span[@class="price_n"]/text()')),
                "原价": "".join(li.xpath('.//div[@class="price"]//span[@class="price_r"]/text()')),
                "折扣率": "".join(li.xpath('.//div[@class="price"]//span[@class="price_s"]/text()')),
                "电子书价格": "".join(li.xpath('.//div[@class="price"]//p[@class="price_e"]/span/text()')),
                "详情页链接": "".join(li.xpath('.//div[@class="name"]/a/@href'))
            }
			
			# 测试输出
            print(book_info)
            
if __name__ == '__main__':
    crawl_dd()