1. Code
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter
import jieba  # jieba segmentation is needed when processing Chinese text
import re
# Stopword list; this is only an example, extend or modify it as needed
stopwords = ['的', '是', '在', '了', '有', '和', '人', '我', '他', '她', '它', '们', '...','0','1','2','3','4','5','6','7','8','9','10','12','20','30']
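# Optional sketch (an assumption, not part of the original script): load a
# larger stopword list from a one-word-per-line UTF-8 file, here a
# hypothetical 'stopwords.txt':
# with open('stopwords.txt', encoding='utf-8') as f:
#     stopwords = [line.strip() for line in f if line.strip()]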
# Read the CSV file
df = pd.read_csv('word.csv', encoding='gbk')
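# Assumption: the file is GBK-encoded; if reading fails with a
# UnicodeDecodeError, try another encoding, e.g.:
# df = pd.read_csv('word.csv', encoding='utf-8')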
# The text data is assumed to be in a column named 'text';
# drop missing values and coerce to str so re.sub below does not fail on NaN
texts = df['text'].dropna().astype(str).tolist()
# Clean and segment the text
cleaned_texts = []
for text in texts:
    # Strip punctuation and other non-word characters
    # (Chinese characters, letters, and digits are kept)
    cleaned_text = re.sub(r'[^\u4e00-\u9fa5\w]', '', text)
    # Segment the text with jieba
    words = jieba.cut(cleaned_text)
    # Drop stopwords
    filtered_words = [word for word in words if word not in stopwords]
    cleaned_texts.append(' '.join(filtered_words))
# Build the word-frequency counter
word_freq = Counter()
for text in cleaned_texts:
    word_freq.update(text.split())
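# Optional sanity check (not in the original script): print the 20 most
# frequent words before plotting to verify the cleaning worked as expected
# print(word_freq.most_common(20))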
# Generate the word cloud
wordcloud = WordCloud(font_path='simhei.ttf',  # font that supports Chinese, required for correct rendering
                      background_color='white',
                      stopwords=None,  # stopwords were already removed above
                      min_font_size=10).generate_from_frequencies(word_freq)
plt.figure(figsize=(10, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
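# Optional (not in the original script): persist the result. WordCloud
# provides to_file(); 'wordcloud.png' is an assumed output filename
# wordcloud.to_file('wordcloud.png')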
2. Output