基础知识:
# Word-cloud basics: jieba segments Chinese text, wordcloud renders it.
# 1. Imports: jieba + wordcloud.
import jieba
from wordcloud import WordCloud

# Sample text to visualise.
data = '全年经济社会发展主要目标任务圆满完成'

# jieba.cut returns a generator, so materialise it into a list.
data_list = list(jieba.cut(data))
# print(data_list)

# 2. Build the word-cloud style (a "virtual" cloud: nothing rendered yet).
#    A CJK-capable font is required or Chinese glyphs render as boxes;
#    Microsoft YaHei (msyh.ttc) ships with Windows at this path.
wb = WordCloud(
    width=500,
    height=500,
    background_color='white',
    font_path=r'C:\Windows\Fonts\msyh.ttc',  # raw string: keep backslashes literal
)

# 3. Add the data: WordCloud expects one whitespace-separated string,
#    so join the already-segmented tokens with spaces.
wb.generate(' '.join(data_list))

# 4. Save the cloud locally. Use PNG (lossless); JPG is lossy compression.
wb.to_file('xxx.png')
案例实战:
# Source: scrape the text of the government work report and turn it into a word cloud.
import requests, os, jieba, numpy
from lxml import etree
from wordcloud import WordCloud
from PIL import Image  # install with: pip install pillow


class OneSpider(object):
    """Fetch one news article, save its text locally, then build a word-cloud PNG.

    Workflow (see main()): the first run downloads and saves the article;
    a second run reads the saved text and renders the word cloud.
    """

    def __init__(self):
        pass

    def request_start_url(self):
        """Crawler part: download the article page and pass the HTML to the parser."""
        start_url = 'https://www.ynbdm.cn/news.php'
        cookies = {
            'PHPSESSID': 'rpkr2o2rots8pe0mr9dp0kn0d1',
        }
        headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'accept-language': 'zh-CN,zh;q=0.9',
            'cache-control': 'max-age=0',
            # 'cookie': 'PHPSESSID=rpkr2o2rots8pe0mr9dp0kn0d1',
            'priority': 'u=0, i',
            'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '?1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
        }
        params = {
            'id': '31039',
        }
        response = requests.get(start_url, params=params, cookies=cookies, headers=headers).text
        self.parse_response(response)

    def parse_response(self, response):
        """Parse the response: extract the article body and save it to a text file."""
        A = etree.HTML(response)
        # bt = A.xpath('//title/text()')[0].replace('!', '')
        nr = A.xpath('//div[@class="content_show"]//text()')
        nr = ''.join(nr)
        # 'with' guarantees the file handle is closed even if write() raises.
        with open('政府工作报告.txt', 'w', encoding='utf-8') as f:
            f.write(nr)
        print('ok -- 政府工作报告.txt')

    def show_image(self):
        """Word-cloud part: read the saved text, filter it, render and save the PNG."""
        # --------1. Read the text (with-block closes the file; the original leaked the handle) ----
        with open('政府工作报告.txt', 'r', encoding='utf-8') as f:
            data = f.read()
        # --------2. Segment with jieba -----------------------------------------
        data_list = list(jieba.cut(data))
        # --------3. Rough cleanup: drop single-character tokens (mostly punctuation/particles) ----
        data_list = [i for i in data_list if len(i) != 1]
        # --------4. Fine cleanup: drop stop words listed one-per-line in stop_words.txt ----
        with open('../stop_words.txt', 'r', encoding='utf-8') as f:
            tyc = f.read()
        tyc = tyc.split('\n')
        data_list = [i for i in data_list if i not in tyc]
        # print(data_list)
        # --------5. Tokens -> one whitespace-separated string for WordCloud ----
        TEXT = ' '.join(data_list)
        # --------6. Background/mask image: non-white pixels define the cloud's shape ----
        img = Image.open('../Y.jpg')  # PIL Image object
        mask = numpy.array(img)  # image as an ndarray of RGB values, e.g. [255 255 255 ...]
        # --------7. Build the word-cloud style ---------------------------------
        wb = WordCloud(
            width=500,
            height=500,
            background_color='white',
            mask=mask,
            font_path=r'C:\Windows\Fonts\msyh.ttc',  # CJK-capable font (Microsoft YaHei)
        )
        # --------8. Add the data ------------------------------------------------
        wb.generate(TEXT)
        # --------9. Write the result locally (PNG = lossless) -------------------
        wb.to_file('第二个.png')
        print('------词云图生成完毕-----------')

    def main(self):
        # First run: the text file does not exist yet -> crawl and save it.
        # Second run: the file exists -> build the word cloud from it.
        # NOTE(review): this means the script must be run twice end-to-end.
        if not os.path.exists('政府工作报告.txt'):
            self.request_start_url()
        else:
            self.show_image()


if __name__ == '__main__':
    on = OneSpider()
    on.main()
运行效果:
# 样式