Gradio 案例——将文本文件转为词云图

文章目录

Gradio 案例——将文本文件转为词云图
- 界面截图
- 依赖安装
- 项目目录结构
- 代码

Gradio 案例——将文本文件转为词云图

利用 word_cloud 库，将文本文件转为词云图
更完整、丰富的示例项目见 GitHub - AlionSSS/wordcloud-webui: The web UI for word_cloud(text to word cloud picture converter)

界面截图

依赖安装

新建一个 Python 3.9.16 的虚拟环境
依赖
- $ pip install gradio==4.29 -i "https://pypi.doubanio.com/simple/"
- $ pip install wordcloud==1.9.3 -i "https://pypi.doubanio.com/simple/"
- $ pip install jieba==0.42.1 -i "https://pypi.doubanio.com/simple/"

项目目录结构

wordcloud-webui         # 目录
--/resources             # 资源目录
--/consts.py             # py文件，常量
--/gradio_interfaces.py  # py文件，Gradio视图
--/jieba_util.py         # py文件，工具库文件
--/lib_word_cloud.py     # py文件，工具库文件
--/main.py               # py文件，入口

代码

main.py

from gradio_interfaces import iface

# Script entry point: launch the UI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    # `iface` is the object imported from gradio_interfaces above.
    iface.launch()

lib_word_cloud.py

from wordcloud import WordCloud, ImageColorGenerator
import numpy as np
from PIL import Image

from consts import *

def text2wordcount_normal(
    text: str,
    background_color: str = "white",
    margin = 2,
    min_font_size = 4,
    max_font_size = 200,
    font_path = None,
    width: int = 400,
    height: int = 200,
):
    """Render ``text`` as a rectangular word cloud image.

    Every optional argument is normalised first, so missing or out-of-range
    values fall back to the defaults from the signature instead of reaching
    ``WordCloud`` unchecked.

    Args:
        text: Text to render; words are expected to be separable by WordCloud.
        background_color: Colour name for the canvas; blank falls back to
            ``"white"``.
        margin: Gap kept around each word; ``None`` or negative falls back
            to 2.
        min_font_size: Smallest font size; missing or below 1 falls back to 4.
        max_font_size: Largest font size; missing or below 4 falls back to 200.
        font_path: Path to a font file; blank falls back to
            ``DEFAULT_FONT_PATH``.
        width: Canvas width in pixels; missing or below 1 falls back to 400.
        height: Canvas height in pixels; missing or below 1 falls back to 200.

    Returns:
        The value of ``WordCloud.to_image()`` for the generated cloud.
    """
    if not background_color or not str(background_color).strip():
        background_color = "white"
    # 0 is a legitimate margin, so only None and negatives are replaced.
    if margin is None or margin < 0:
        margin = 2
    if not min_font_size or min_font_size < 1:
        min_font_size = 4
    if not max_font_size or max_font_size < 4:
        max_font_size = 200
    if not font_path or not str(font_path).strip():
        font_path = DEFAULT_FONT_PATH
    if not width or width < 1:
        width = 400
    if not height or height < 1:
        height = 200

    # Numeric UI fields can deliver floats (e.g. 400.0), while WordCloud
    # documents these parameters as integers, so coerce them here.
    margin = int(margin)
    min_font_size = int(min_font_size)
    max_font_size = int(max_font_size)
    width = int(width)
    height = int(height)

    # Fixed random_state keeps the layout reproducible for identical input.
    wordcloud = WordCloud(
        font_path=font_path,
        width=width,
        height=height,
        background_color=background_color,
        max_words=2000,
        margin=margin,
        min_font_size=min_font_size,
        max_font_size=max_font_size,
        random_state=42,
    ).generate(text)
    return wordcloud.to_image()

def text2wordcount_mask(
    text: str,
    background_color: str = "white",
    margin = 2,
    min_font_size = 4,
    max_font_size = 200,
    font_path = None,
    mask_image = None,
    mask_color = None,
    contour_width=3,
    contour_color="steelblue",
):
    """Render ``text`` as a word cloud shaped by a mask image.

    Words are coloured from ``mask_color`` when it is given, otherwise from
    ``mask_image`` itself. When neither image is available, WordCloud's own
    default colouring is used.

    Args:
        text: Text to render.
        background_color: Colour name for the canvas; blank falls back to
            ``"white"``.
        margin: Gap kept around each word; ``None`` or negative falls back
            to 2.
        min_font_size: Smallest font size; missing or below 1 falls back to 4.
        max_font_size: Largest font size; missing or below 4 falls back to 200.
        font_path: Path to a font file; blank falls back to
            ``DEFAULT_FONT_PATH``.
        mask_image: Image array passed to WordCloud as ``mask``.
        mask_color: Optional image array that overrides the colour source.
        contour_width: Outline thickness. 0 is kept as-is (no outline);
            ``None`` or negative falls back to 3.
        contour_color: Outline colour name; blank falls back to
            ``"steelblue"``.

    Returns:
        The value of ``WordCloud.to_image()`` for the generated cloud.
    """
    if not background_color or not str(background_color).strip():
        background_color = "white"
    if margin is None or margin < 0:
        margin = 2
    if not min_font_size or min_font_size < 1:
        min_font_size = 4
    if not max_font_size or max_font_size < 4:
        max_font_size = 200
    if not font_path or not str(font_path).strip():
        font_path = DEFAULT_FONT_PATH
    # Explicit None check: a truthiness test would silently turn the valid
    # value 0 ("draw no contour") into 3.
    if contour_width is None or contour_width < 0:
        contour_width = 3
    if not contour_color or not str(contour_color).strip():
        contour_color = "steelblue"

    # A dedicated colour image wins over the mask's own colours.
    color_source = mask_color if mask_color is not None else mask_image
    if color_source is None:
        # Nothing to sample colours from; let WordCloud pick its defaults.
        image_colors = None
    else:
        # default_color must be an (r, g, b) tuple: it is the colour used for
        # words placed outside the bounds of the colour image.
        image_colors = ImageColorGenerator(color_source, default_color=(0, 0, 0))

    # Fixed random_state keeps the layout reproducible for identical input.
    wordcloud = WordCloud(
        font_path=font_path,
        mask=mask_image,
        background_color=background_color,
        color_func=image_colors,
        contour_width=contour_width,
        contour_color=contour_color,
        max_words=2000,
        margin=margin,
        min_font_size=min_font_size,
        max_font_size=max_font_size,
        random_state=42,
    ).generate(text)

    return wordcloud.to_image()

jieba_util.py

import jieba
# jieba.enable_parallel(4)

from consts import *

# The function for processing text with Jieba
def jieba_processing_txt(text, userdict_list=('阿Ｑ', '孔乙己', '单四嫂子')):
    """Segment ``text`` with Jieba and drop stop words and short tokens.

    Args:
        text: Raw text to segment.
        userdict_list: Extra words registered with Jieba before cutting so
            they are kept as single tokens. ``None`` registers nothing; blank
            entries are ignored.

    Returns:
        The surviving words joined by single spaces. A word survives when,
        after stripping whitespace, it is longer than one character and is
        not a line of the stop-word file at ``STOPWORDS_PATH``.
    """
    if userdict_list is not None:
        for word in userdict_list:
            # Skip blanks so an empty entry is never added to the dictionary.
            if word and word.strip():
                jieba.add_word(word)

    # A set gives constant-time membership tests for every token.
    with open(STOPWORDS_PATH, encoding='utf-8') as f_stop:
        stopwords = set(f_stop.read().splitlines())

    kept_words = []
    for token in jieba.cut(text, cut_all=False):
        word = token.strip()
        if len(word) > 1 and word not in stopwords:
            kept_words.append(word)
    return ' '.join(kept_words)

gradio_interfaces.py

import gradio as gr

import lib_word_cloud
import jieba_util

from consts import *

def service_text2wc(
    text_file,
    text_lang,
    text_dict: str,
    background_color,
    margin,
    max_font_size,
    min_font_size,
    font_file,
    width,
    height,
    mask_image,
    mask_color,
    contour_width,
    contour_color,
):
    """Handle the submit button: read the uploaded text and build a word cloud.

    Validation failures show a ``gr.Warning`` and return ``None``. Any error
    raised while reading or rendering is re-raised as ``gr.Error``.

    Args:
        text_file: Uploaded text file; either a path string or an object
            exposing the path as ``.name``.
        text_lang: Language mode; ``'中文'`` enables Jieba segmentation.
        text_dict: Comma-separated custom words for Jieba (may be empty).
        background_color: Background colour name.
        margin: Gap between words; must be a non-negative number.
        max_font_size: Largest font size; must not be below ``min_font_size``.
        min_font_size: Smallest font size; must be non-negative.
        font_file: Optional uploaded font file (same shape as ``text_file``).
        width: Canvas width, used when no mask image is supplied.
        height: Canvas height, used when no mask image is supplied.
        mask_image: Optional mask image; its presence selects mask mode.
        mask_color: Optional colour image for mask mode.
        contour_width: Outline thickness for mask mode.
        contour_color: Outline colour for mask mode.

    Returns:
        The image produced by ``lib_word_cloud``, or ``None`` when validation
        fails.

    Raises:
        gr.Error: If reading the file or generating the image fails.
    """
    if not text_file:
        gr.Warning("请传入正确的文本文件！")
        return
    # A cleared numeric field arrives as None; reject it here rather than
    # letting the comparison below raise TypeError.
    if margin is None or margin < 0:
        gr.Warning("字体间隔配置不合法！")
        return
    if (
        min_font_size is None
        or max_font_size is None
        or min_font_size < 0
        or max_font_size < 0
        or min_font_size > max_font_size
    ):
        gr.Warning("字体大小配置不合法！")
        return

    try:
        # Accept both a plain path string and an object carrying `.name`.
        text_path = getattr(text_file, "name", text_file)
        with open(file=text_path, encoding="utf-8") as file:
            text = file.read()

        if text_lang == '中文':
            gr.Info("选择了中文，将使用Jieba库解析文本！")
            userdict_list = []
            if text_dict:
                # Drop blank entries left by empty input or stray commas.
                userdict_list = [
                    w.strip() for w in text_dict.split(",") if w.strip()
                ]
            text = jieba_util.jieba_processing_txt(text, userdict_list)

        font_path = getattr(font_file, "name", font_file) if font_file else None

        if mask_image is not None:
            return lib_word_cloud.text2wordcount_mask(
                text,
                background_color,
                margin,
                min_font_size,
                max_font_size,
                font_path,
                mask_image,
                mask_color,
                contour_width,
                contour_color,
            )
        return lib_word_cloud.text2wordcount_normal(
            text,
            background_color,
            margin,
            min_font_size,
            max_font_size,
            font_path,
            width,
            height,
        )
    except Exception as e:
        print(e)
        # Chain the cause so the original traceback stays available.
        raise gr.Error("文本转词云图时，发生异常：" + str(e)) from e

# JavaScript source handed to gr.Blocks(js=...) below. It defines
# createGradioAnimation(), which builds a centred, bold banner <div>, adds the
# greeting one character at a time (a new <span> every 200 ms, each faded in
# through a 0.5 s opacity transition), and inserts the banner as the first
# child of the '.gradio-container' element.
# NOTE(review): presumably Gradio runs this function when the page loads --
# confirm against the documentation of the gr.Blocks `js` parameter.
js = """
function createGradioAnimation() {
    var container = document.createElement('div');
    container.id = 'gradio-animation';
    container.style.fontSize = '2em';
    container.style.fontWeight = 'bold';
    container.style.textAlign = 'center';
    container.style.marginBottom = '20px';

    var text = '欢迎使用“词云转换器”!';
    for (var i = 0; i < text.length; i++) {
        (function(i){
            setTimeout(function(){
                var letter = document.createElement('span');
                letter.style.opacity = '0';
                letter.style.transition = 'opacity 0.5s';
                letter.innerText = text[i];

                container.appendChild(letter);

                setTimeout(function() {
                    letter.style.opacity = '1';
                }, 50);
            }, i * 200);
        })(i);
    }

    var gradioContainer = document.querySelector('.gradio-container');
    gradioContainer.insertBefore(container, gradioContainer.firstChild);

    return 'Animation created';
}
"""

# Build the whole UI. `iface` is the object that main.py imports and launches.
# Layout: one row with two columns -- text input and mode-specific options on
# the left; shared styling options, the submit button and the result image on
# the right.
with gr.Blocks(title="词云转换器", js=js) as iface:
    with gr.Row():
        # ---- Left column: source text and per-mode settings ----
        with gr.Column():
            with gr.Group():
                with gr.Row():
                    # Required upload: the text to turn into a word cloud.
                    input_text_file = gr.File(label="待处理的文本文件（必填）")
                    with gr.Column():
                        gr.Label(label="Tips", value="请传入正常可读的文本文件，如以.txt结尾的文档", color="#fee2e2")
                        # Sample text file, pre-filled from consts.
                        gr.File(value=EXAMPLE_TEXT_FILE, label="文本文件的样例")
                        # service_text2wc runs Jieba only when "中文" is selected.
                        input_text_lang = gr.Radio(label="文本语言模式", choices=["中文", "英文"], value="中文")
                # Comma-separated custom words forwarded to the handler.
                input_text_dict = gr.Textbox(label="自定义分词词典（可选）", info="中文模式使用，多个词之间用英文逗号分隔，例如'阿Ｑ, 孔乙己, 单四嫂子'")
            # Normal mode: plain rectangular canvas with explicit width/height.
            with gr.Tab("普通模式"):
                with gr.Row():
                    input_width = gr.Number(value=400, label="生成图像的宽", minimum=1)
                    input_height = gr.Number(value=200, label="生成图像的高", minimum=1)
                gr.Label(label="Tips", value="使用该模式时，记得清理掉“Mask模式”下的“Mask图像”", color="#fee2e2")
            # Mask mode: the handler switches to it whenever a mask image is set.
            with gr.Tab("Mask模式"):
                with gr.Row():
                    input_contour_width = gr.Number(value=3, label="轮廓线的粗细", minimum=0)
                    input_contour_color = gr.Textbox(value="steelblue", label="轮廓线的颜色")
                with gr.Row():
                    input_mask_image = gr.Image(label="Mask图像（决定词云的形状、颜色、宽高）")
                    input_mask_color = gr.Image(label="若传入该图，则词云的颜色由该图决定")
                # gr.Image(value=EXAMPLE_MASK_IMAGE_PATH, label="Mask图像的样例", interactive=False)
                # Read-only gallery showing the same sample mask three times.
                gr.Gallery(value=[EXAMPLE_MASK_IMAGE_PATH, EXAMPLE_MASK_IMAGE_PATH, EXAMPLE_MASK_IMAGE_PATH], label="Mask图像的样例", interactive=False)
        # ---- Right column: shared styling, submit button, output ----
        with gr.Column():
            with gr.Group():
                with gr.Row():
                    with gr.Group():
                        input_bg_color = gr.Textbox(value="white", label="词云图的背景色（默认为'white'）")
                        input_margin = gr.Number(value=2, label="字体间隔（默认为'2'）", minimum=0)
                        with gr.Row():
                            input_min_font_size = gr.Number(value=4, label="字体大小-最小值", minimum=1)
                            input_max_font_size = gr.Number(value=200, label="字体大小-最大值", minimum=4)    
                    input_font_file = gr.File(label="词云图的字体文件（可选，如otf文件）")
                format_radio = gr.Radio(choices=["png", "jpeg", "webp", "bmp", "tiff"], label="词云图像格式", value="png")
            submit_button = gr.Button("开始处理", variant="primary")
            output_image = gr.Image(label="词云图", format="png")

    def fix_format(x):
        # Store the selected format on the output component.
        # NOTE(review): this mutates the component object after construction;
        # whether Gradio applies the new value to later results is not visible
        # in this file -- confirm.
        output_image.format = x 
        return None

    # No outputs are wired: the callback only has the side effect above.
    format_radio.change(fn=fix_format, inputs=format_radio)

    # The order of `inputs` must match the positional parameters of
    # service_text2wc; note that max_font_size comes before min_font_size there.
    submit_button.click(
        fn=service_text2wc,
        inputs=[
            input_text_file,
            input_text_lang,
            input_text_dict,
            input_bg_color,
            input_margin,
            input_max_font_size,
            input_min_font_size,
            input_font_file,
            input_width,
            input_height,
            input_mask_image,
            input_mask_color,
            input_contour_width,
            input_contour_color,
        ],
        outputs=output_image,
    )

consts.py（记得修改下面各文件的路径，使其与 resources 目录对应）

# All paths are Windows-style and relative to the working directory; adjust
# them so they point into the project's resources directory.

# Sample text file
EXAMPLE_TEXT_FILE = r".\wordcloud-webui\resources\CalltoArms.txt"
# Sample mask image (no trailing space: a stray blank inside the literal
# would make the path refer to a different, non-existent file name)
EXAMPLE_MASK_IMAGE_PATH = r".\wordcloud-webui\resources\parrot_mask.png"
# Stop-word list used by the tokenizer
STOPWORDS_PATH = r".\wordcloud-webui\resources\stopwords_cn_en.txt"
# Default font for the word cloud
DEFAULT_FONT_PATH = r".\wordcloud-webui\resources\SourceHanSerifK-Light.otf"

resources 目录
- parrot_mask.png
- CalltoArms.txt https://github.com/amueller/word_cloud/blob/main/examples/wc_cn/CalltoArms.txt
- SourceHanSerifK-Light.otf https://github.com/amueller/word_cloud/blob/main/examples/fonts/SourceHanSerif/SourceHanSerifK-Light.otf
- stopwords_cn_en.txt https://github.com/amueller/word_cloud/blob/main/examples/wc_cn/stopwords_cn_en.txt