Getting Started with Python Web Scraping (Hands-On)
1. Analyzing the Target Website
2. Crawling the Blog
- Fetch the URLs of all h2 post titles on the blog
- Identify the target and inspect the page source
- Code implementation
""" 获取博客所有h2标题的路由 """ url = "http://www.crazyant.net" import requests from bs4 import BeautifulSoup #发送请求,获取页面所有内容 r = requests.get(url) if r.status_code != 200: raise Exception("请求失败") # 抛出异常 html_doc = r.text # 解析html,获取对应信息 soup = BeautifulSoup(html_doc,"html.parser") h2_nodes = soup.find_all("h2",class_="entry-title") for h2_node in h2_nodes: link = h2_node.find("a") print(link["href"],link.get_text())
- Crawl all blog posts via the title links
""" 爬取所有博客文章 """ import re from utils import url_manager import requests from bs4 import BeautifulSoup root_url="http://www.crazyant.net" # 将root_url添加到urls中 urls = url_manager.UrlManager() urls.add_new_url(root_url) # 获取所有页面内容,并保存到文件 fout = open("craw_all_pages.txt","w",encoding="utf-8") while urls.has_new_url(): curr_url = urls.get_url() r = requests.get(curr_url,timeout=2) if r.status_code != 200: print("请求失败",curr_url) continue soup = BeautifulSoup(r.text,"html.parser") title = soup.title.string # 获取标题 fout.write('%s\t%s\n' % (curr_url, title))# 写入文件 fout.flush()# 刷新缓冲区,直接写入文件 print("success: %s, %s, %d"%(curr_url,title,len(urls.new_urls))) # 获取所有链接, 并添加到urls中 links = soup.find_all("a") for link in links: href = link.get("href") if href is None: continue pattern = r"^http://www.crazyant.net/\d+.html$" # 匹配规则,匹配以http://www.crazyant.net/开头,并且以.html结尾的url # 正则匹配, 返回一个匹配对象,如果没有匹配到,返回None if re.match(pattern,href): urls.add_new_url(href) fout.close()
- Run results