智能问答支持自定义问答

news2025/7/7 18:54:55

# -*- coding: utf-8 -*-
# @Time    : 2023-5-12  14:15
# @Author  : shenzh
# @FileName: chat_bot_v1.py
# @Software: PyCharm
"""
    Description:一期智能机器人设计完成，支持自定义问题和答案随时增加功能
        
"""
import json
import jieba


from sklearn.feature_extraction.text import TfidfVectorizer
import pysparnn.cluster_index as ci

import pickle
import pandas as pd


def increase_other_qa(txt_path):

    dict_list = []
    txt_qa = pd.read_csv(txt_path)
    ques_list = txt_qa['question']
    answer_list = txt_qa['answer']
    for index in range(len(ques_list)):
        txt_dict = {}
        txt_dict['q'] = ques_list[index]
        txt_dict['a'] = answer_list[index]
        dict_list.append(txt_dict)
    return dict_list


def get_tfidf_index(qa_path, txt_path, tv_path, cp_path):
    # 数据保存路径

    qa = json.load(open(qa_path, encoding='utf-8'))
    dict_list = increase_other_qa(txt_path)

    for dict_res in dict_list:
        json_str = dict(dict_res)
        # print(json_str)
        qa.append(json_str)

    corpus = []
    for id, item in enumerate(qa):
        tmp = item['q'] + item['a']
        # print(tmp)
        tmp = jieba.cut(tmp)
        tmp = ' '.join(tmp)
        corpus.append(tmp)
    # print(corpus)
    # Generate bag of word
    # TfidfVectorizer is a combination of CountVectorizer and TfidfTransformer
    # Here we use TfidfVectorizer
    tv = TfidfVectorizer()

    # deal with corpus
    tv.fit(corpus)

    # get all words
    # 词典
    tv.get_feature_names()

    # get feature
    # 获取每对 QA 的TF-IDF
    tfidf = tv.transform(corpus)

    # build index
    # 创建索引
    cp = ci.MultiClusterIndex(tfidf, range(len(corpus)))

    # save
    pickle.dump(tv, open(tv_path, 'wb'))
    pickle.dump(cp, open(cp_path, 'wb'))
    return qa, tv_path, cp_path


def predict_ans(question, qa, tv_path, cp_path, answer_num, distance_flag):
    # 分词
    cutted_qustion = jieba.cut(question)
    cutted_qustion = ' '.join(cutted_qustion)

    # retrieve qa, tv and cp built in gen.pysparnn
    # 加载之前保存的数据
    # qa = json.load(open(qa_path))
    tv = pickle.load(open(tv_path, 'rb'))
    cp = pickle.load(open(cp_path, 'rb'))

    # construct search data
    # 构造搜索数据
    search_data = [cutted_qustion]
    search_tfidf = tv.transform(search_data)

    # search from cp, k is the number of matched qa that you need
    # 搜索数据，会获取到前 k 个匹配的 QA
    result_array = cp.search(search_tfidf, k=int(answer_num), k_clusters=2, return_distance=distance_flag)
    # print(result_array)
    result = result_array[0]
    # print(result)
    # print("Top matched QA:")
    # print('====================='

    '''
     [{
        "que": "空气净化器pro噪音大吗",
        "ans": "您好，可以开启睡眠模式，几乎没有噪音",
        "sim_value": 1
    }]
    
    '''
    faq_list = []
    if distance_flag:
        for distance_value, id in result:
            faq_dict = {}
            distance_value2 = 1 - round(distance_value, 2)
            # print('ORI_Q:' + question)
            # print('Q:' + qa[int(id)]['q'])
            # print('A:' + qa[int(id)]['a'])
            # print('DV:' + str(distance_value2))
            # print('=====================')

            faq_dict['que'] = qa[int(id)]['q']
            faq_dict['ans'] = qa[int(id)]['a']
            faq_dict['sim_value'] = str(distance_value2)
            faq_list.append(faq_dict)

    else:
        for id in result:
            faq_dict = {}
            # print('ORI_Q:' + question)
            # print('Q:' + qa[int(id)]['q'])
            # print('A:' + qa[int(id)]['a'])
            # print('=====================')

            faq_dict['que'] = qa[int(id)]['q']
            faq_dict['ans'] = qa[int(id)]['a']
            faq_dict['sim_value'] = ""
            faq_list.append(faq_dict)

    return faq_list
if __name__ == '__main__':
    qa_path = '../data/qa.json'  ##通用型的问题和答案
    txt_path = '../data/qa_.csv'  ##外加指定的问题和答案
    tv_path = '../data/tv.pkl'
    cp_path = '../data/cp.pkl'

    qa, tv_path, cp_path = get_tfidf_index(qa_path, txt_path, tv_path, cp_path)
    question = '需要配备电源插座吗?'  ##问题
    answer_num = 2  ##返回推荐结果的个数
    distance_flag = True  ##结果的相似度的值
    predict_ans(question, qa, tv_path, cp_path, answer_num, distance_flag)

qa.json样例数据