# -*- coding: utf-8 -*- # @Time : 2023-5-12 14:15 # @Author : shenzh # @FileName: chat_bot_v1.py # @Software: PyCharm """ Description:一期智能机器人设计完成,支持自定义问题和答案随时增加功能 """ import json import jieba from sklearn.feature_extraction.text import TfidfVectorizer import pysparnn.cluster_index as ci import pickle import pandas as pd def increase_other_qa(txt_path): dict_list = [] txt_qa = pd.read_csv(txt_path) ques_list = txt_qa['question'] answer_list = txt_qa['answer'] for index in range(len(ques_list)): txt_dict = {} txt_dict['q'] = ques_list[index] txt_dict['a'] = answer_list[index] dict_list.append(txt_dict) return dict_list def get_tfidf_index(qa_path, txt_path, tv_path, cp_path): # 数据保存路径 qa = json.load(open(qa_path, encoding='utf-8')) dict_list = increase_other_qa(txt_path) for dict_res in dict_list: json_str = dict(dict_res) # print(json_str) qa.append(json_str) corpus = [] for id, item in enumerate(qa): tmp = item['q'] + item['a'] # print(tmp) tmp = jieba.cut(tmp) tmp = ' '.join(tmp) corpus.append(tmp) # print(corpus) # Generate bag of word # TfidfVectorizer is a combination of CountVectorizer and TfidfTransformer # Here we use TfidfVectorizer tv = TfidfVectorizer() # deal with corpus tv.fit(corpus) # get all words # 词典 tv.get_feature_names() # get feature # 获取每对 QA 的TF-IDF tfidf = tv.transform(corpus) # build index # 创建索引 cp = ci.MultiClusterIndex(tfidf, range(len(corpus))) # save pickle.dump(tv, open(tv_path, 'wb')) pickle.dump(cp, open(cp_path, 'wb')) return qa, tv_path, cp_path def predict_ans(question, qa, tv_path, cp_path, answer_num, distance_flag): # 分词 cutted_qustion = jieba.cut(question) cutted_qustion = ' '.join(cutted_qustion) # retrieve qa, tv and cp built in gen.pysparnn # 加载之前保存的数据 # qa = json.load(open(qa_path)) tv = pickle.load(open(tv_path, 'rb')) cp = pickle.load(open(cp_path, 'rb')) # construct search data # 构造搜索数据 search_data = [cutted_qustion] search_tfidf = tv.transform(search_data) # search from cp, k is the number of matched qa that you need # 搜索数据,会获取到前 k 个匹配的 QA result_array = cp.search(search_tfidf, k=int(answer_num), k_clusters=2, return_distance=distance_flag) # print(result_array) result = result_array[0] # print(result) # print("Top matched QA:") # print('=====================' ''' [{ "que": "空气净化器pro噪音大吗", "ans": "您好,可以开启睡眠模式,几乎没有噪音", "sim_value": 1 }] ''' faq_list = [] if distance_flag: for distance_value, id in result: faq_dict = {} distance_value2 = 1 - round(distance_value, 2) # print('ORI_Q:' + question) # print('Q:' + qa[int(id)]['q']) # print('A:' + qa[int(id)]['a']) # print('DV:' + str(distance_value2)) # print('=====================') faq_dict['que'] = qa[int(id)]['q'] faq_dict['ans'] = qa[int(id)]['a'] faq_dict['sim_value'] = str(distance_value2) faq_list.append(faq_dict) else: for id in result: faq_dict = {} # print('ORI_Q:' + question) # print('Q:' + qa[int(id)]['q']) # print('A:' + qa[int(id)]['a']) # print('=====================') faq_dict['que'] = qa[int(id)]['q'] faq_dict['ans'] = qa[int(id)]['a'] faq_dict['sim_value'] = "" faq_list.append(faq_dict) return faq_list if __name__ == '__main__': qa_path = '../data/qa.json' ##通用型的问题和答案 txt_path = '../data/qa_.csv' ##外加指定的问题和答案 tv_path = '../data/tv.pkl' cp_path = '../data/cp.pkl' qa, tv_path, cp_path = get_tfidf_index(qa_path, txt_path, tv_path, cp_path) question = '需要配备电源插座吗?' ##问题 answer_num = 2 ##返回推荐结果的个数 distance_flag = True ##结果的相似度的值 predict_ans(question, qa, tv_path, cp_path, answer_num, distance_flag)
qa.json样例数据
qa_.csv