爬虫基本 + re/etree/beautifulsoup+保存本地/连接数据库

基本

lxml/etree

beautifulsoup

保存到本地

传入数据库

大致分为

爬虫基本 + re/etree/beautifulsoup+保存本地/连接数据库

基本

爬一个很简单的百度新闻热搜

爬排名 热搜名 和热搜指数

百度热搜

我们直接开始分析

其实这个页面很简单，我们要的数据就直接写在页面自身的源代码里

也不需要什么分析直接爬源代码即可

这里就是很简单的爬源代码

import requests
# Address of the Baidu real-time hot-search board fetched by content().
url='https://top.baidu.com/board?tab=realtime'
# Request headers sent together with ``url``.
headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
    } # spoofed User-Agent: presents the client as desktop Firefox

def content():
    """Download the page at the module-level ``url``, print it and return it.

    The request is sent with the module-level ``headers`` and the body is
    decoded as UTF-8.

    Returns:
        str: The decoded response body.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Force UTF-8 instead of relying on the encoding requests would guess.
    response.encoding = 'utf-8'
    html = response.text
    print(html)
    return html
if __name__ == "__main__":
    # Use a separate name so the ``content`` function is not overwritten
    # by the string it returns.
    html=content()

我们爬完的数据

这里就分为3个模块

re

把上面爬到的源代码粘贴到下面这个正则测试网站里开始分析

regex101: build, test, and debug regex

<div class="c-single-text-ellipsis">  (.*?) </div>

如果网站因为 `/` 没有转义而报错，就改用下面转义后的写法：
<div class="c-single-text-ellipsis">  (.*?) <\/div>

这里我们就能发现规律

标题都是在 <div></div>中的

<div class="c-single-text-ellipsis">  (.*?) </div>
如果网站因为 `/` 没有转义而报错，就改用下面转义后的写法：
<div class="c-single-text-ellipsis">  (.*?) <\/div>

这里我们就匹配了我们的标题

接下来要匹配热搜指数

同样的道理进行匹配

 <div class="hot-index_1Bl1a"> (.*?) <\/div>

这里我就得出了匹配代码我们可以继续开始写了

import requests
import re
# Address of the Baidu real-time hot-search board fetched by content().
url='https://top.baidu.com/board?tab=realtime'
# Request headers sent together with ``url``.
headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
    } # spoofed User-Agent: presents the client as desktop Firefox

def content():
    """Download the page at the module-level ``url`` and return its text.

    The request is sent with the module-level ``headers`` and the body is
    decoded as UTF-8.

    Returns:
        str: The decoded response body.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Force UTF-8 instead of relying on the encoding requests would guess.
    response.encoding = 'utf-8'
    return response.text
def re_text(content):
    """Extract hot-index values and titles from the page with regexes.

    Prints both lists and their lengths, then returns them so callers can
    store the data.

    Args:
        content: HTML source to search.

    Returns:
        tuple[list[str], list[str]]: ``(fire, title)`` - the captured text
        of every hot-index ``<div>`` and every title ``<div>``.
    """
    # ``\s*`` accepts any amount of padding around the text, so the match
    # no longer depends on the exact number of spaces in the markup.
    fire = re.findall(r'<div class="hot-index_1Bl1a">\s*(.*?)\s*</div>', content)
    title = re.findall(r'<div class="c-single-text-ellipsis">\s*(.*?)\s*</div>', content)
    print(fire, title)
    print(len(fire), len(title))
    return fire, title
if __name__ == "__main__":
    # Use a separate name so the ``content`` function is not overwritten
    # by the string it returns.
    html=content()
    re_text(html)

这里就得出来数据

lxml/etree

这里我们使用工具xpath helper

shift+ctrl+x打开工具

按住shift

能提取出路径

然后我们开始删除前面的路径看看能不能进行贪婪提取

发现我们提取出来了所有标题

//div[@class='c-single-text-ellipsis']

那我们要提取热搜指数

也是一样的

//div[@class='hot-index_1Bl1a']

我们开始写代码

import requests
import re
from lxml import etree
# Address of the Baidu real-time hot-search board fetched by content().
url='https://top.baidu.com/board?tab=realtime'
# Request headers sent together with ``url``.
headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
    } # spoofed User-Agent: presents the client as desktop Firefox

def content():
    """Download the page at the module-level ``url`` and return its text.

    The request is sent with the module-level ``headers`` and the body is
    decoded as UTF-8.

    Returns:
        str: The decoded response body.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Force UTF-8 instead of relying on the encoding requests would guess.
    response.encoding = 'utf-8'
    return response.text
def etree_text(content):
    """Extract hot-index values and titles from the page with lxml XPath.

    Prints both lists, then returns them so callers can store the data.

    Args:
        content: HTML source to parse.

    Returns:
        tuple[list, list]: ``(fire, title)`` - the text nodes found under
        the hot-index ``<div>`` elements and the title ``<div>`` elements.
    """
    tree = etree.HTML(content)
    fire = tree.xpath("""//div[@class='hot-index_1Bl1a']/text()""")
    title = tree.xpath("""//div[@class='c-single-text-ellipsis']/text()""")
    print(fire, title)
    return fire, title
if __name__ == "__main__":
    # Use a separate name so the ``content`` function is not overwritten
    # by the string it returns.
    html=content()
    etree_text(html)

beautifulsoup

这个工具我自己感觉很好用

我们直接打开网站

对我们想要的数据进行检查

然后选择右键复制 css路径

复制之后，把这条 CSS 路径传给 BeautifulSoup 的 select() 函数进行查找，再对返回的每个节点调用 .get_text() 方法取得文本内容

标题和指数都是这样

import requests
from bs4 import BeautifulSoup
# Address of the Baidu real-time hot-search board fetched by content().
url='https://top.baidu.com/board?tab=realtime'
# Request headers sent together with ``url``.
headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
    } # spoofed User-Agent: presents the client as desktop Firefox

def content():
    """Download the page at the module-level ``url`` and return its text.

    The request is sent with the module-level ``headers`` and the body is
    decoded as UTF-8.

    Returns:
        str: The decoded response body.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Force UTF-8 instead of relying on the encoding requests would guess.
    response.encoding = 'utf-8'
    return response.text
def beau_text(content):
    """Extract hot-index values and titles from the page with BeautifulSoup.

    Args:
        content: HTML source to parse.

    Returns:
        tuple[list[str], list[str]]: ``(fire, title)`` - the text of every
        node matched by the hot-index selector and by the title selector.
    """
    soup = BeautifulSoup(content, 'html.parser')  # parser bundled with Python
    # Full CSS paths copied from the browser's inspector.
    title_nodes = soup.select('html body div div#sanRoot.wrapper.c-font-normal.rel main.rel.container_2VTvm div.container.right-container_2EFJr div.container-bg_lQ801 div div.category-wrap_iQLoo.horizontal_1eKyQ div.content_1YWBm a.title_dIF3B div.c-single-text-ellipsis')
    fire_nodes = soup.select('html body div div#sanRoot.wrapper.c-font-normal.rel main.rel.container_2VTvm div.container.right-container_2EFJr div.container-bg_lQ801 div div.category-wrap_iQLoo.horizontal_1eKyQ div.trend_2RttY.hide-icon div.hot-index_1Bl1a')
    titles = [node.get_text() for node in title_nodes]  # hot-search names
    fires = [node.get_text() for node in fire_nodes]    # hot-search index values
    # Return the data so callers can unpack ``fire, title``.
    return fires, titles

if __name__ == "__main__":
    # Use a separate name so the ``content`` function is not overwritten
    # by the string it returns.
    html=content()
    beau_text(html)

这里三大项就结束了

我们开始保存本地和数据库

保存到本地

import requests
import re
from lxml import etree
from bs4 import BeautifulSoup
# Address of the Baidu real-time hot-search board fetched by content().
url='https://top.baidu.com/board?tab=realtime'
# Request headers sent together with ``url``.
headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
    } # spoofed User-Agent: presents the client as desktop Firefox

def content():
    """Download the page at the module-level ``url`` and return its text.

    The request is sent with the module-level ``headers`` and the body is
    decoded as UTF-8.

    Returns:
        str: The decoded response body.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Force UTF-8 instead of relying on the encoding requests would guess.
    response.encoding = 'utf-8'
    return response.text
def to_txt(fire,title):
    """Write the scraped entries to ``百度新闻.txt``, one entry per line.

    Each line is the 0-based position, the hot index and the title joined
    with no separator. The file is overwritten on every call.

    Args:
        fire: Hot-index strings, indexed in parallel with ``title``.
        title: Title strings; its length decides how many lines are written.
    """
    # Build every line first, so an index error surfaces before the file
    # is opened for writing.
    rows = [str(pos) + fire[pos] + title[pos] for pos, _ in enumerate(title)]
    with open('百度新闻.txt', 'w', encoding='utf-8') as out:
        out.writelines(row + '\n' for row in rows)

if __name__ == "__main__":
    # Use a separate name so the ``content`` function is not overwritten
    # by the string it returns.
    html=content()
    # NOTE(review): beau_text must be defined in this script and must
    # return ``(fire, title)`` for this unpacking to work.
    fire,title=beau_text(html)
    to_txt(fire,title)

传入数据库

我们先要在数据库中设立好

这里我的数据库是spider 表名为百度新闻

里面的字段

# Open a connection to the local MySQL database "spider".
db=pymysql.connect(host="localhost",port=3306,user="root",passwd="111111",db="spider",charset="utf8")

连接数据库


# Create the cursor used to execute SQL statements on this connection.
cursor=db.cursor()
设置游标

全部代码

import requests
from bs4 import BeautifulSoup
import pymysql
# Address of the Baidu real-time hot-search board fetched by content().
url='https://top.baidu.com/board?tab=realtime'
# Request headers sent together with ``url``.
headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
    } # spoofed User-Agent: presents the client as desktop Firefox
# Connection to the local MySQL database "spider", opened at import time.
# NOTE(review): the root password is hard-coded here; consider loading it
# from the environment or a config file kept out of version control.
db=pymysql.connect(host="localhost",port=3306,user="root",passwd="214253551",db="spider",charset="utf8")
cursor=db.cursor()  # cursor used by to_data() to run SQL statements
def content():
    """Download the page at the module-level ``url`` and return its text.

    The request is sent with the module-level ``headers`` and the body is
    decoded as UTF-8.

    Returns:
        str: The decoded response body.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Force UTF-8 instead of relying on the encoding requests would guess.
    response.encoding = 'utf-8'
    return response.text
def beau_text(content):
    """Extract hot-index values and titles from the page with BeautifulSoup.

    Args:
        content: HTML source to parse.

    Returns:
        tuple[list[str], list[str]]: ``(fire, title)`` - the text of every
        node matched by the hot-index selector and by the title selector.
    """
    soup = BeautifulSoup(content, 'html.parser')  # parser bundled with Python
    # Full CSS paths copied from the browser's inspector.
    title_nodes = soup.select('html body div div#sanRoot.wrapper.c-font-normal.rel main.rel.container_2VTvm div.container.right-container_2EFJr div.container-bg_lQ801 div div.category-wrap_iQLoo.horizontal_1eKyQ div.content_1YWBm a.title_dIF3B div.c-single-text-ellipsis')
    fire_nodes = soup.select('html body div div#sanRoot.wrapper.c-font-normal.rel main.rel.container_2VTvm div.container.right-container_2EFJr div.container-bg_lQ801 div div.category-wrap_iQLoo.horizontal_1eKyQ div.trend_2RttY.hide-icon div.hot-index_1Bl1a')
    titles = [node.get_text() for node in title_nodes]  # hot-search names
    fires = [node.get_text() for node in fire_nodes]    # hot-search index values
    return fires, titles
def to_data(fire,title):
    """Insert the scraped entries into the ``百度新闻`` table.

    Each row receives a 1-based ``id``, the title and the hot index. Rows
    are committed one at a time through the module-level ``db`` and
    ``cursor``. A row whose key already exists is skipped; any other
    database error rolls that row back and the loop moves on. The
    connection is closed once, after all rows have been attempted.

    Args:
        fire: Hot-index strings, paired positionally with ``title``.
        title: Title strings. If the two lists differ in length, the
            surplus items of the longer one are ignored.
    """
    # Placeholders let the driver escape the values, so a title containing
    # a quote can neither break nor alter the statement.
    sql = "INSERT INTO 百度新闻 (id,title,fire) VALUES (%s, %s, %s)"
    duplicate_entry = 1062  # MySQL error code ER_DUP_ENTRY

    # Re-open the module-level connection if it has been closed or dropped.
    db.ping(reconnect=True)
    try:
        for row_id, (name, heat) in enumerate(zip(title, fire), start=1):
            try:
                cursor.execute(sql, (row_id, name, heat))
                db.commit()
            except pymysql.MySQLError as err:
                # The row is already stored: nothing to undo.
                if err.args and err.args[0] == duplicate_entry:
                    continue
                # Unknown database failure: discard the failed statement.
                db.rollback()
    finally:
        # Close after the loop, not inside it, so the connection is not
        # torn down and re-opened for every row.
        db.close()
if __name__ == "__main__":
    # Use a separate name so the ``content`` function is not overwritten
    # by the string it returns.
    html=content()
    fire,title=beau_text(html)    # BeautifulSoup extraction
    to_data(fire,title)

以上就是写入数据库的全部代码

下面是把三种解析方式、保存到本地和写入数据库整合在一起的完整代码；这只是一个简单的爬虫

import requests
import re
from lxml import etree
from bs4 import BeautifulSoup
import pymysql
# Address of the Baidu real-time hot-search board fetched by content().
url='https://top.baidu.com/board?tab=realtime'
# Request headers sent together with ``url``.
headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/109.0'
    } # spoofed User-Agent: presents the client as desktop Firefox
# Connection to the local MySQL database "spider", opened at import time.
# NOTE(review): the root password is hard-coded here; consider loading it
# from the environment or a config file kept out of version control.
db=pymysql.connect(host="localhost",port=3306,user="root",passwd="214253551",db="spider",charset="utf8")
# Cursor used by to_data() to run SQL statements.
cursor=db.cursor()
def content():
    """Download the page at the module-level ``url`` and return its text.

    The request is sent with the module-level ``headers`` and the body is
    decoded as UTF-8.

    Returns:
        str: The decoded response body.

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    # Without a timeout requests waits indefinitely on a stalled server.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Force UTF-8 instead of relying on the encoding requests would guess.
    response.encoding = 'utf-8'
    return response.text
# def re_text(content):   #re正则
#     fire=re.findall("""<div class="hot-index_1Bl1a"> (.*?) </div>""",content)
#     title=re.findall("""<div class="c-single-text-ellipsis">  (.*?) </div>""",content)
#     print(fire,title)
#     print(len(fire),len(title))
# def etree_text(content):   #lxml etree
#     new_content=etree.HTML(content)
#     fire=new_content.xpath("""//div[@class='hot-index_1Bl1a']/text()""")
#     title=new_content.xpath("""//div[@class='c-single-text-ellipsis']/text()""")
#     print(fire,title)





def beau_text(content):
    """Extract hot-index values and titles from the page with BeautifulSoup.

    Args:
        content: HTML source to parse.

    Returns:
        tuple[list[str], list[str]]: ``(fire, title)`` - the text of every
        node matched by the hot-index selector and by the title selector.
    """
    soup = BeautifulSoup(content, 'html.parser')  # parser bundled with Python
    # Full CSS paths copied from the browser's inspector.
    title_nodes = soup.select('html body div div#sanRoot.wrapper.c-font-normal.rel main.rel.container_2VTvm div.container.right-container_2EFJr div.container-bg_lQ801 div div.category-wrap_iQLoo.horizontal_1eKyQ div.content_1YWBm a.title_dIF3B div.c-single-text-ellipsis')
    fire_nodes = soup.select('html body div div#sanRoot.wrapper.c-font-normal.rel main.rel.container_2VTvm div.container.right-container_2EFJr div.container-bg_lQ801 div div.category-wrap_iQLoo.horizontal_1eKyQ div.trend_2RttY.hide-icon div.hot-index_1Bl1a')
    titles = [node.get_text() for node in title_nodes]  # hot-search names
    fires = [node.get_text() for node in fire_nodes]    # hot-search index values
    return fires, titles

# def to_txt(fire,title):    #保存到本地
#     content=[]
#     for i in range(0,len(title)):
#         content.append(str(i)+fire[i]+title[i])
#     with open('百度新闻.txt','w',encoding='utf-8')as fp:
#         for i in content:
#             fp.write(i+'\n')
def to_data(fire,title):
    """Insert the scraped entries into the ``百度新闻`` table.

    Each row receives a 1-based ``id``, the title and the hot index. Rows
    are committed one at a time through the module-level ``db`` and
    ``cursor``. A row whose key already exists is skipped; any other
    database error rolls that row back and the loop moves on. The
    connection is closed once, after all rows have been attempted.

    Args:
        fire: Hot-index strings, paired positionally with ``title``.
        title: Title strings. If the two lists differ in length, the
            surplus items of the longer one are ignored.
    """
    # Placeholders let the driver escape the values, so a title containing
    # a quote can neither break nor alter the statement.
    sql = "INSERT INTO 百度新闻 (id,title,fire) VALUES (%s, %s, %s)"
    duplicate_entry = 1062  # MySQL error code ER_DUP_ENTRY

    # Re-open the module-level connection if it has been closed or dropped.
    db.ping(reconnect=True)
    try:
        for row_id, (name, heat) in enumerate(zip(title, fire), start=1):
            try:
                cursor.execute(sql, (row_id, name, heat))
                db.commit()
            except pymysql.MySQLError as err:
                # The row is already stored: nothing to undo.
                if err.args and err.args[0] == duplicate_entry:
                    continue
                # Unknown database failure: discard the failed statement.
                db.rollback()
    finally:
        # Close after the loop, not inside it, so the connection is not
        # torn down and re-opened for every row.
        db.close()
if __name__ == "__main__":
    # Use a separate name so the ``content`` function is not overwritten
    # by the string it returns.
    html=content()
    # re_text(html)     # regular-expression extraction
    # etree_text(html)   # lxml etree extraction
    fire,title=beau_text(html)    # BeautifulSoup extraction
    # to_txt(fire,title)          # save to a local file
    to_data(fire,title)