Getting Started with Python Web Scraping (Hands-On)
1. Analyzing the Target Website
2. Crawling the Blog
- Fetch the URLs of all h2 post titles on the blog
- Identify the target and inspect the page source
- Code implementation
""" 获取博客所有h2标题的路由 """ url = "http://www.crazyant.net" import requests from bs4 import BeautifulSoup #发送请求,获取页面所有内容 r = requests.get(url) if r.status_code != 200: raise Exception("请求失败") # 抛出异常 html_doc = r.text # 解析html,获取对应信息 soup = BeautifulSoup(html_doc,"html.parser") h2_nodes = soup.find_all("h2",class_="entry-title") for h2_node in h2_nodes: link = h2_node.find("a") print(link["href"],link.get_text())
- Crawl all blog posts via the title links
""" 爬取所有博客文章 """ import re from utils import url_manager import requests from bs4 import BeautifulSoup root_url="http://www.crazyant.net" # 将root_url添加到urls中 urls = url_manager.UrlManager() urls.add_new_url(root_url) # 获取所有页面内容,并保存到文件 fout = open("craw_all_pages.txt","w",encoding="utf-8") while urls.has_new_url(): curr_url = urls.get_url() r = requests.get(curr_url,timeout=2) if r.status_code != 200: print("请求失败",curr_url) continue soup = BeautifulSoup(r.text,"html.parser") title = soup.title.string # 获取标题 fout.write('%s\t%s\n' % (curr_url, title))# 写入文件 fout.flush()# 刷新缓冲区,直接写入文件 print("success: %s, %s, %d"%(curr_url,title,len(urls.new_urls))) # 获取所有链接, 并添加到urls中 links = soup.find_all("a") for link in links: href = link.get("href") if href is None: continue pattern = r"^http://www.crazyant.net/\d+.html$" # 匹配规则,匹配以http://www.crazyant.net/开头,并且以.html结尾的url # 正则匹配, 返回一个匹配对象,如果没有匹配到,返回None if re.match(pattern,href): urls.add_new_url(href) fout.close()
- Run results