爬虫案例二———爬取豆瓣网TOP250的电影信息,并存入MySQL数据库
前提准备
- 安装pymysql库
pip install pymysql -i https://pypi.tuna.tsinghua.edu.cn/simple
- Python连接MySQL数据库,并进行增删改查基本操作
- 连接数据库
"""
连接MySQL数据库,并进行增删改查,同时查询了MySQL版本号,并做了动态注册的账号,实现过程:先向userinfo当中添加account、password新字段,接着通过insert插入数据,要注意的是,若是其他字段设置为not null,则必须插入内容。
"""
import pymysql.cursors

# Connection settings for the local MySQL server.
_conn_params = dict(
    host='localhost',
    port=3307,
    user='root',
    password='123456',
    db='school',
    charset='utf8',
)

# Attempt to open the database connection.
try:
    connect = pymysql.Connect(**_conn_params)
except pymysql.err.OperationalError as e:
    print(f"数据库连接失败: {e}")
else:
    print("数据库连接成功")
- 创建数据表
# Grab a cursor and create the demo table if it does not exist yet.
cursor = connect.cursor()
create_table_sql = (
    "CREATE TABLE IF NOT EXISTS userinfo("
    "    uid INT PRIMARY KEY AUTO_INCREMENT,"
    "    uname VARCHAR(20) NOT NULL,"
    "    uage INT"
    ");"
)
cursor.execute(create_table_sql)
print("表创建成功")
- 插入数据
# Parameterised INSERT — values are bound by the driver, not interpolated.
sql2 = "INSERT INTO userinfo(uname, uage) VALUES (%s, %s)"
records = (
    ('ann', 18),
    ('alice', 28),
    ('rose', 20),
)
for record in records:
    cursor.execute(sql2, record)
# Commit once after all rows are queued.
connect.commit()
print('数据插入成功')
- 查询数据
# Fetch every row and print it, unpacking the columns by position.
sql4 = "SELECT * FROM userinfo"
cursor.execute(sql4)
for uid, uname, uage in cursor.fetchall():
    print("ID:%s\t姓名:%s\t年龄:%s\t" % (uid, uname, uage))
- 补充查看MySQL数据库的版本号
# Query the server version; fetchone() returns a 1-tuple which the
# %-operator unpacks directly into the format string.
cursor.execute("SELECT VERSION()")
server_version = cursor.fetchone()
print("MySQL数据库版本是:%s" % server_version)
- 关闭数据的连接释放资源
# Release resources: close the cursor first, then the connection.
for resource in (cursor, connect):
    resource.close()
print("数据库连接已关闭")
案例代码
"""
本程序采用面向对象的方式,旨在爬取豆瓣网电影排行榜的电影信息,主要包含电影名称,电影宣传语,其中存入数据库的ID值即为电影排名
在使用本程序时,需要 1.输入端口号、2.输入你数据库密码、3.输入你需要新建的数据库名、4.输入你需要新建的数据表名
在完成以上4步操作后,会自动连接数据库(前提你MySQL服务已经打开),自动创建数据库、表,自动解析项目下douban10.html
的内容,最后将数据插入数据库并展示在控制台,如果需要清除数据库的数据,请在db.close()前调用db.clear_data()
最后该程序还扩充了线上URL爬取,选择爬取数据条数等功能
"""
import pymysql
from bs4 import BeautifulSoup
import re
import requests
class MovieDatabase:
    """Crawl Douban Top250 movie info and persist it into MySQL.

    Each stored row holds the movie name, its tagline ("inq"), the
    Douban rating and the number of raters; the AUTO_INCREMENT id
    doubles as the ranking position when pages are inserted in order.
    """

    # Desktop Chrome User-Agent so Douban serves the normal HTML page
    # instead of rejecting the request as a bot.
    _HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36'}

    def __init__(self, host, port, db, user, password, charset, tName, newDB, getNum):
        """Store the settings and eagerly open a connection to `db`.

        getNum selects what to crawl: -25 means "all 250 entries";
        any other value n selects the single 25-item page starting
        at offset n * 25.
        """
        self.host = host
        self.port = port
        self.db = db
        self.user = user
        self.password = password
        self.charset = charset
        self.tableName = tName
        self.newDB = newDB
        self.getNum = getNum
        # Eager connection so create_database()/create_table() work
        # even before connect() is called.
        self.conn = pymysql.connect(
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            charset=self.charset,
            db=self.db,
        )
        self.cursor = self.conn.cursor()

    def connect(self):
        """(Re)connect to MySQL; on failure create the database/table first.

        NOTE(review): the target schema is hard-coded to "test" here
        (self.db is ignored) — kept for backward compatibility; confirm
        this is intentional before changing.
        """
        try:
            self.conn = pymysql.connect(
                host=self.host,
                port=self.port,
                db="test",
                user=self.user,
                password=self.password,
                charset=self.charset,
            )
        except pymysql.MySQLError as e:
            print(f'数据库连接失败,尝试创建数据库和数据表: {e}')
            # Fall back to creating the schema objects with the
            # connection opened in __init__, then reconnect.
            self.create_database()
            self.create_table()
            self.conn = pymysql.connect(
                host=self.host,
                port=self.port,
                db="test",
                user=self.user,
                password=self.password,
                charset=self.charset,
            )
        self.cursor = self.conn.cursor()
        print('数据库连接成功')

    def create_database(self):
        """Create the user-chosen database (self.newDB) if it is missing."""
        # Run from the always-present "test" schema.
        self.cursor.execute("use test")
        # NOTE(review): identifier comes from user input and cannot be
        # parameterised by the driver; assumes a trusted operator.
        self.cursor.execute(f"CREATE DATABASE IF NOT EXISTS {self.newDB}")
        print('数据库创建成功')

    def create_table(self):
        """Create the movie table inside self.newDB if it is missing."""
        self.cursor.execute(f"use {self.newDB}")
        # NOTE(review): table name is interpolated from user input —
        # trusted-operator assumption, same as create_database().
        self.cursor.execute(f"""
            CREATE TABLE IF NOT EXISTS {self.tableName} (
                id INT AUTO_INCREMENT PRIMARY KEY,
                moviename VARCHAR(255),
                inq VARCHAR(255),
                fenshu VARCHAR(255),
                likeNum VARCHAR(255)
            ) ENGINE=INNODB DEFAULT CHARSET=utf8;
        """)
        print('数据表创建成功')

    def _parse_movies(self, soup):
        """Extract (name, tagline, score, raters) tuples from one page's soup."""
        names = [t.get_text() for t in soup.select('a>span.title:nth-of-type(1)')]
        taglines = []
        scores = []
        for bd in soup.select("div.info>div.bd"):
            inq = bd.select("span.inq")
            # Some movies have no tagline; keep the original "NUll" marker.
            taglines.append(inq[0].get_text() if inq else "NUll")
            scores.append(bd.select("span.rating_num")[0].get_text())
        # The 4th span inside div.star holds e.g. "123456人评价" — keep digits.
        raters = [re.findall(r'\d+', n.get_text())[0]
                  for n in soup.select("div.star>span:nth-of-type(4)")]
        return list(zip(names, taglines, scores, raters))

    def _insert_rows(self, rows):
        """Insert parsed movie tuples and commit once for the batch."""
        sql = (f"INSERT INTO {self.tableName}(moviename, inq, fenshu, likeNum) "
               f"VALUES (%s, %s, %s, %s)")
        for row in rows:
            self.cursor.execute(sql, row)
        self.conn.commit()
        print('数据插入成功')

    def _scrape_page(self, start):
        """Download, parse and store one result page; False on HTTP error."""
        response = requests.get(
            f'https://movie.douban.com/top250?start={start}&filter=',
            headers=self._HEADERS)
        if response.status_code != 200:
            print(f'请求异常,状态码: {response.status_code}')
            return False
        soup = BeautifulSoup(response.text, 'html.parser')
        self._insert_rows(self._parse_movies(soup))
        return True

    def insert_data_online(self):
        """Crawl Douban online according to self.getNum (see __init__)."""
        if self.getNum == -25:
            # Sentinel -25: crawl every page (offsets 0, 25, ..., 225).
            for start in range(0, 226, 25):
                if not self._scrape_page(start):
                    return  # abort on HTTP error, skip remaining pages
                print(start + 25)  # progress indicator
        else:
            # Any other value selects the single page at offset getNum * 25.
            self._scrape_page(self.getNum * 25)

    def insert_data_localhost(self):
        """Parse a locally saved page (douban10.html) instead of fetching online."""
        with open('douban10.html', 'r', encoding='utf-8') as file:
            soup = BeautifulSoup(file.read(), 'html.parser')
        self._insert_rows(self._parse_movies(soup))

    def clear_data(self):
        """Delete every row from the movie table."""
        self.cursor.execute(f"DELETE FROM {self.tableName}")
        self.conn.commit()
        print('已清除数据')

    def close(self):
        """Close the cursor and the connection if they were opened."""
        if self.cursor:
            self.cursor.close()
        if self.conn:
            self.conn.close()
        print('数据库连接已关闭')

    def select_all(self):
        """Print every stored movie row to the console."""
        self.cursor.execute(f"SELECT * FROM {self.tableName}")
        rows = self.cursor.fetchall()
        print("所有电影信息如下:")
        for row in rows:
            print(f"电影名称《{row[1]}》\t宣传语:{row[2]}\t豆瓣评分:{row[3]}\t打分人数:{row[4]}")
if __name__ == "__main__":
    # Gather connection and crawl settings interactively.
    aport = int(input("1.请输入数据库端口号:"))
    apassword = input("2.请输入你数据库的密码:")
    NDB = input("3.请输入你需要创建的数据库名称:")
    atableName = input("4.请输入你新建表名:")  # fixed typo: 表明 -> 表名
    GN = int(input("1.请输入你需要爬取多少条数据(只能是0到250且为25的整数倍,例如输入0则是获取25条数据,输入1获取50条)\n2.若是输入-25则爬取全部数据:"))
    db = MovieDatabase(host='localhost', port=aport, db="test", user='root',
                       password=apassword, charset='utf8', tName=atableName,
                       newDB=NDB, getNum=GN)
    db.connect()
    db.create_database()
    db.create_table()
    # Crawl the pages online and insert the rows.
    db.insert_data_online()
    db.select_all()
    db.close()
    # To wipe the table, call db.clear_data() before db.close().
    # If the site cannot be reached, save the page locally next to this
    # script and call db.insert_data_localhost() before db.close() instead.
- 效果展示
前往数据库查询验证
如上图所示,数据已经成功插入!!!