1. Requirements analysis
Target address:
https://www.sou-yun.cn/Query.aspx?type=poem&id=×××××
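Before writing any extraction code, it helps to see what the page actually returns for a given id and to locate the tags that wrap the title and verses. A minimal sketch, assuming the poem1 query type used in the snippets below and a browser-like User-Agent:

import requests

poem_id = 36647  # example id; substitute any poem id from the site
url = f'https://www.sou-yun.cn/Query.aspx?type=poem1&id={poem_id}'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}

response = requests.get(url, headers=headers)
print(response.status_code)
# Print a slice of the raw HTML to inspect the markup around the title, author and verses.
print(response.text[:2000])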
2. Extracting the verses
import re
import requests

#url = 'https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie'
url = 'https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
response = requests.get(url=url, headers=headers)
html_content = response.text
#print(response.text)

# Match the verse containers with a regular expression
poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)</div>", html_content, re.DOTALL)

# Clean up and print the extracted verses
for sentence in poem_sentences:
    # Strip the remaining HTML tags
    clean_sentence = re.sub(r"<.*?>", "", sentence).strip()
    if clean_sentence:  # skip empty lines
        print(clean_sentence)
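The regular expressions above do the job, but a CSS-selector based parser such as parsel is usually less brittle when the markup shifts slightly. A hedged sketch of the same extraction, assuming the verse containers really carry the poemSentence class and reusing html_content from the snippet above:

import parsel

selector = parsel.Selector(text=html_content)

# Select every <div class='poemSentence'> element, join its text nodes, drop empties.
for div in selector.css("div.poemSentence"):
    text = "".join(div.css("::text").getall()).strip()
    if text:
        print(text)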
3. Other information
Extract all the required information: title + author + verses.
import re
import requests

#url = 'https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie'
url = 'https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
response = requests.get(url=url, headers=headers)
html_content = response.text
#print(response.text)

# Extract the title and author
title_match = re.search(r"<span class='bold'><span class='wordLink'[^>]*>(.*?)</span></span>\s*<span[^>]*>(.*?)</span>\s*<span class='poemAuthor'[^>]*>(.*?)</span>", html_content)
if title_match:
    title = title_match.group(1) + title_match.group(2)  # join the two parts of the title
    author = re.sub(r"<.*?>", "", title_match.group(3)).strip()  # strip tags from the author

# Match the verses with a regular expression
poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)</div>", html_content, re.DOTALL)

# Clean up and print the extracted information
print("Title:", title)
print("Author:", author)
print("Verses:")
for sentence in poem_sentences:
    # Strip the remaining HTML tags
    clean_sentence = re.sub(r"<.*?>", "", sentence).strip()
    if clean_sentence:  # skip empty lines
        print(clean_sentence)
Tweaking the output format:
import re
import requests

#url = 'https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie'
url = 'https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
response = requests.get(url=url, headers=headers)
html_content = response.text
#print(response.text)

# Extract the title and author
title_match = re.search(r"<span class='bold'><span class='wordLink'[^>]*>(.*?)</span></span>\s*<span[^>]*>(.*?)</span>\s*<span class='poemAuthor'[^>]*>(.*?)</span>", html_content)
if title_match:
    title = title_match.group(1) + title_match.group(2)  # join the two parts of the title
    author = re.sub(r"<.*?>", "", title_match.group(3)).strip()  # strip tags from the author

# Match the verses with a regular expression
poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)</div>", html_content, re.DOTALL)

# Clean up and print the extracted information
print(f"《 {title}》 ({author})")
#print("Author:", author)
#print("Verses:")
for sentence in poem_sentences:
    # Strip the remaining HTML tags
    clean_sentence = re.sub(r"<.*?>", "", sentence).strip()
    if clean_sentence:  # skip empty lines
        print(clean_sentence)
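The snippets above repeat the same fetch-and-extract steps. Since the later sections reuse this logic, it could also be wrapped in one helper that returns the title, author and verses for a page. This is a sketch under the assumption that the regexes above match the page structure; parse_poem is a hypothetical helper name, not anything provided by the site or a library:

import re
import requests

HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}

TITLE_RE = re.compile(r"<span class='bold'><span class='wordLink'[^>]*>(.*?)</span></span>\s*"
                      r"<span[^>]*>(.*?)</span>\s*<span class='poemAuthor'[^>]*>(.*?)</span>")
SENTENCE_RE = re.compile(r"<div class='poemSentence'[^>]*>(.*?)</div>", re.DOTALL)


def parse_poem(html_content):
    """Return (title, author, verses) for one poem page, or None if no title is found."""
    title_match = TITLE_RE.search(html_content)
    if not title_match:
        return None
    title = title_match.group(1) + title_match.group(2)
    author = re.sub(r"<.*?>", "", title_match.group(3)).strip()
    verses = [re.sub(r"<.*?>", "", s).strip() for s in SENTENCE_RE.findall(html_content)]
    return title, author, [v for v in verses if v]


# Usage: fetch one page and print it in the same format as above.
html = requests.get('https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647', headers=HEADERS).text
parsed = parse_poem(html)
if parsed:
    title, author, verses = parsed
    print(f"《 {title}》 ({author})")
    print("\n".join(verses))

With this in place, each later step only has to decide how to format the returned values and where to write them.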
4. Saving to a file
Save a single poem to a txt file.
import re
import requests

#url = 'https://www.sou-yun.cn/PoemIndex.aspx?dynasty=Tang&author=14976&type=Jie'
url = 'https://www.sou-yun.cn/Query.aspx?type=poem1&id=36647'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
response = requests.get(url=url, headers=headers)
html_content = response.text
#print(response.text)

# Extract the title and author
title_match = re.search(r"<span class='bold'><span class='wordLink'[^>]*>(.*?)</span></span>\s*<span[^>]*>(.*?)</span>\s*<span class='poemAuthor'[^>]*>(.*?)</span>", html_content)
if title_match:
    title = title_match.group(1) + title_match.group(2)  # join the two parts of the title
    author = re.sub(r"<.*?>", "", title_match.group(3)).strip()  # strip tags from the author

# Match the verses with a regular expression
poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)</div>", html_content, re.DOTALL)

# Clean up and build the text to write to the file
output = f"《 {title}》 ({author})\n"
print(f"《 {title}》 ({author})")
for sentence in poem_sentences:
    # Strip the remaining HTML tags
    clean_sentence = re.sub(r"<.*?>", "", sentence).strip()
    if clean_sentence:  # skip empty lines
        output += clean_sentence + "\n"
        print(clean_sentence)

# Write the result to a text file
with open('poem.txt', 'w', encoding='utf-8') as file:
    file.write(output)
print("Saved to poem.txt")
5. Continuing with multiple poems
The results are not guaranteed to meet expectations, because some of these IDs cannot be retrieved for now (a page-structure issue).
import re
import requests

# Path of the output file
output_file_path = 'all_poems.txt'

# Create the target file, or truncate it if it already exists
with open(output_file_path, 'w', encoding='utf-8') as file:
    file.write("")

# Download each poem in the ID range
for poem_id in range(36647, 36848):
    url = f'https://www.sou-yun.cn/Query.aspx?type=poem1&id={poem_id}'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
    response = requests.get(url=url, headers=headers)
    if response.status_code == 200:
        html_content = response.text
        # Extract the title and author
        title_match = re.search(
            r"<span class='bold'><span class='wordLink'[^>]*>(.*?)</span></span>\s*<span[^>]*>(.*?)</span>\s*<span class='poemAuthor'[^>]*>(.*?)</span>",
            html_content)
        if title_match:
            title = title_match.group(1) + title_match.group(2)  # join the two parts of the title
            author = re.sub(r"<.*?>", "", title_match.group(3)).strip()  # strip tags from the author
            # Match the verses with a regular expression
            poem_sentences = re.findall(r"<div class='poemSentence'[^>]*>(.*?)</div>", html_content, re.DOTALL)
            # Clean up and build the text to write for this poem
            output = f"《 {title}》 ({author})\n"
            for sentence in poem_sentences:
                # Strip the remaining HTML tags
                clean_sentence = re.sub(r"<.*?>", "", sentence).strip()
                if clean_sentence:  # skip empty lines
                    output += clean_sentence + "\n"
            # Add a separator line between poems
            output += "\n" + "=" * 50 + "\n\n"
            # Append the result to the text file
            with open(output_file_path, 'a', encoding='utf-8') as file:
                file.write(output)
            print(f"Saved poem {poem_id} to {output_file_path}")
        else:
            print(f"Could not find the poem title or author on the page for ID {poem_id}.")
    else:
        print(f"Failed to fetch the page for ID {poem_id}, status code: {response.status_code}")
Run results:
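The loop above also fires two hundred requests back to back, which the site may throttle. A hedged sketch of a small wrapper that adds a timeout, a short pause and a few retries around each fetch; the one-second delay, ten-second timeout and retry count are arbitrary choices, not values the site requires:

import time
import requests

def fetch_page(url, headers, retries=3, delay=1.0):
    """Fetch a URL with a timeout and a short pause; return the HTML or None after a few retries."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response.text
            print(f"Status {response.status_code} for {url}, attempt {attempt + 1} of {retries}")
        except requests.RequestException as exc:
            print(f"Request failed ({exc}), attempt {attempt + 1} of {retries}")
        time.sleep(delay)  # stay polite between attempts and between consecutive poems
    return None

Inside the loop, html_content = fetch_page(url, headers) would then replace the direct requests.get call, and a None return can simply be skipped.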