爬取音乐,并将音乐信息储存到数据库中
- 确定音乐网站的url并分析网站
- 分析二级页面
- 创建数据库
- 使用Xpath解析,进行多层爬取
- 保存信息
- 完整代码
- 结果
确定音乐网站的url并分析网站
分析二级页面
创建数据库
# 创建一个链接对象
conn = pymysql.connect(host='master', user='root', password='123456', port=3306, db='spider')
# 创建游标
cur = conn.cursor()
sql_createTb = """CREATE TABLE music (
id INT NOT NULL AUTO_INCREMENT,
title VARCHAR(255),
signer VARCHAR(255),
zuoci VARCHAR(255),
zuoqu VARCHAR(255),
album VARCHAR(255),
PRIMARY KEY(id))
"""
使用Xpath解析,进行多层爬取
print("开始爬取欧美音乐榜单")
url = 'https://music.xxxxxxx.cn/v3/music/top/eur_usa'
driver_chom = webdriver.Chrome()
driver_chom.get(url)
# 使用xpath解析获取音乐的榜单
music_list = driver_chom.find_elements(By.XPATH, '//div[@id="js_songlist"]/div')
print(music_list)
# 进一步获取单个音乐的连接 进入详细页面 获得歌词 歌名 歌手 等信息
for url in music_list:
detail_url = url.find_element(By.XPATH,'div[3]/span/a').get_attribute('href')
print(detail_url)
driver_edge = webdriver.Edge()
driver_edge.get(detail_url)
time.sleep(6)
try:
title = driver_edge.find_element(By.XPATH,"//div[@class='info_contain']/h2").text
print(title)
singer = driver_edge.find_element(By.XPATH, "//div[@class='info_singer']/a").text
print(singer)
zuoci = driver_edge.find_element(By.XPATH,"//div[@class='info_about']/p[1]/span").text
print(zuoci)
zuoqu = driver_edge.find_element(By.XPATH,"//div[@class='info_about']/p[2]/span").text
print(zuoqu)
album = driver_edge.find_element(By.XPATH, "//div[@class='info_about']/p[3]/span/a").text
print(album)
保存信息
# 将歌曲的信息写入到数据库中
print("将歌曲的信息写入到数据库中!")
number = 0
insert_sql = f"insert into music() values({number},'{title}','{singer}','{zuoci}','{zuoqu}','{album}')"
try:
cur.execute(insert_sql)
except Exception as e:
# 回滚事件
conn.rollback()
conn.commit()
print("写入完成!")
# 数据库的信息写入完毕开始保存歌曲的歌词
file = open(f'./output/歌词信息/{title}_{singer}.txt', 'w',encoding='utf-8')
try:
geci = driver_edge.find_elements(By.XPATH,"/html/body/div[3]/div/div/div/p")
for i in geci:
file.write(i.text+'\n')
except Exception as e:
geci = driver_edge.find_element(By.XPATH, "/html/body/div[3]/div/div/div/p")
file.write(geci.text+'\n')
# 关闭歌词文件写入
file.close()
完整代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# time: 2023/12/7 19:32
import time
import pymysql
from selenium import webdriver
from selenium.webdriver.common.by import By
# 创建一个链接对象
conn = pymysql.connect(host='master', user='root', password='123456', port=3306, db='spider')
# 创建游标
cur = conn.cursor()
sql_createTb = """CREATE TABLE music (
id INT NOT NULL AUTO_INCREMENT,
title VARCHAR(255),
signer VARCHAR(255),
zuoci VARCHAR(255),
zuoqu VARCHAR(255),
album VARCHAR(255),
PRIMARY KEY(id))
"""
try:
cur.execute(sql_createTb)
except Exception as e:
# 回滚事件
conn.rollback()
conn.commit()
print("数据库建立完毕!")
# 爬取咪咕音乐 并且保存到数据库中
# 1.先确当url
print("开始爬取欧美音乐榜单")
url = 'https://music.migu.cn/v3/music/top/eur_usa'
driver_chom = webdriver.Chrome()
driver_chom.get(url)
# 使用xpath解析获取音乐的榜单
music_list = driver_chom.find_elements(By.XPATH, '//div[@id="js_songlist"]/div')
print(music_list)
# 进一步获取单个音乐的连接 进入详细页面 获得歌词 歌名 歌手 等信息
for url in music_list:
detail_url = url.find_element(By.XPATH,'div[3]/span/a').get_attribute('href')
print(detail_url)
driver_edge = webdriver.Edge()
driver_edge.get(detail_url)
time.sleep(6)
try:
title = driver_edge.find_element(By.XPATH,"//div[@class='info_contain']/h2").text
print(title)
singer = driver_edge.find_element(By.XPATH, "//div[@class='info_singer']/a").text
print(singer)
zuoci = driver_edge.find_element(By.XPATH,"//div[@class='info_about']/p[1]/span").text
print(zuoci)
zuoqu = driver_edge.find_element(By.XPATH,"//div[@class='info_about']/p[2]/span").text
print(zuoqu)
album = driver_edge.find_element(By.XPATH, "//div[@class='info_about']/p[3]/span/a").text
print(album)
# 将歌曲的信息写入到数据库中
print("将歌曲的信息写入到数据库中!")
number = 0
insert_sql = f"insert into music() values({number},'{title}','{singer}','{zuoci}','{zuoqu}','{album}')"
try:
cur.execute(insert_sql)
except Exception as e:
# 回滚事件
conn.rollback()
conn.commit()
print("写入完成!")
# 数据库的信息写入完毕开始保存歌曲的歌词
file = open(f'./output/歌词信息/{title}_{singer}.txt', 'w',encoding='utf-8')
try:
geci = driver_edge.find_elements(By.XPATH,"/html/body/div[3]/div/div/div/p")
for i in geci:
file.write(i.text+'\n')
except Exception as e:
geci = driver_edge.find_element(By.XPATH, "/html/body/div[3]/div/div/div/p")
file.write(geci.text+'\n')
# 关闭歌词文件写入
file.close()
except Exception as f:
print("*********************Error*********************")
continue
# 关闭数据库访问
cur.close()
conn.close()
结果
欢迎学习指正!!!!!