一、知识图谱简单介绍
二、知识图谱的构建
三、知识图谱问答方案
NL2SQL:自然语言转为SQL语句
bulid_graph.py
"""知识图谱"""
#三元组:实体-关系-实体 实体-属性-属性值
import re,json
from py2neo import Graph
from collections import defaultdict
"""读取三元组,并将数据写入neo4j"""
#连接图数据库
graph=Graph("http://localhost:7474",auth=("neo4j","Zmj123456!"))
attribute_data=defaultdict(dict)
relation_data=defaultdict(dict)
label_data={}
#有的实体后面有括号,里面的内容可以作为标签
#提取到标签后,把括号部分删除
def get_label_then_clean(x,label_data):
if re.search("(.+)",x):
label_string=re.search("(.+)",x).group()
for label in ["歌曲","专辑","电影","电视剧"]:
if label in label_string:
x=re.sub("(.+)","",x)#括号内的内容删除掉,因为括号里面是特殊字符会影响cypher的语句运行
label_data[x]=label
else:
x=re.sub("(.+)","",x)
return x
#读取实体-关系-实体三元组文件
with open("data/01test.doc",encoding="utf8") as f:
for line in f:
head,relation,tail=line.strip().split('\t')#取出三元组
head=get_label_then_clean(head,label_data)
relation_data[head][relation]=tail
#读取实体-属性-属性值三元组
with open("data/01triplets_enti_attr_value.doc",encoding='utf8') as f:
for line in f:
entity,attribute,value=line.strip().split('\t')#取出三元组
entity=get_label_then_clean(entity,label_data)
attribute_data[entity][attribute]=value
#构建cypher语句
cypher=""
in_graph_entity=set()
for i,entity in enumerate(attribute_data):
#为所有的实体增加一个名字属性
attribute_data[entity]["NAME"]=entity
#将一个实体的所有的属性拼接成一个类似于字典的表达式
text="{"
for attribute,value in attribute_data[entity].items():
text+="%s:\'%s\',"%(attribute,value)
text=text[:-1]+"}"#最后一个逗号替换成大括号
if entity in label_data:
label=label_data[entity]
#带标签的实体构造语句
cypher+="CREATE (%s:%s %s)"%(entity,label,text)+"\n"
else:
"不带标签的实体构造语句"
cypher+="CREATE (%s %s)"%(entity,text)+"\n"
in_graph_entity.add(entity)
#构造关系语句
for i in enumerate(relation_data):
#有可能实体只有和其他实体的关系,但没有属性,为这样的实体增加一个名称属性,便于在图上认出
if head not in in_graph_entity:
cypher += "CREATE (%s {NAME:'%s'})"%(head,head)+'\n'
in_graph_entity.add(head)
for relation,tail in relation_data[head].items():
#有可能实体只有和其他实体的关系,但没有属性,为这样的实体增加一个名称属性,便于在图上认出
if tail not in in_graph_entity:
cypher +="CREATE (%s {NAME:'%s'})"%(tail,tail)+"\n"
in_graph_entity.add(tail)
#关系语句
cypher +="CREATE (%s)-[:%s]->(%s)"%(head,relation,tail)+"\n"
print(cypher)
#执行建表脚本
graph.run(cypher)
#记录我们图谱里都有哪些实体,哪些属性,哪些关系,哪些标签
data=defaultdict(set)
for head in relation_data:
data["entitys"].add(head)
for relation,tail in relation_data[head].items():
data["relations"].add(relation)
data["entitys"].add(tail)
for enti,label in label_data.items():
data["entitys"].add(enti)
data['labels'].add(label)
for enti in attribute_data:
for attr,value in attribute_data[enti].items():
data['entitys'].add(enti)
data['attributes'].add(attr)
data=dict((x,list(y))for x,y in data.items())
with open('data/kg_schema.json','w',encoding='utf8') as f:
f.write(json.dumps(data,ensure_ascii=False,indent=2))
graph_qa_base_on_sentence_match.py
"""使用文本匹配方式进行知识图谱的应用"""
import itertools,json
import re
import pandas
from py2neo import Graph
from collections import defaultdict
class GraphQA:
def __init__(self):
self.graph=Graph("http://localhost:7474",auth=("neo4j","Zmj123456!"))
schema_path="kg_schema.json"
templet_path="question_templet.xlsx"
self.load(schema_path,templet_path)
print('知识图谱问答系统加载完毕!\n=================')
#加载模板
def load(self,schema_path,templet_path):
self.load_kg_schema(schema_path)
self.load_question_templet(templet_path)
return
#加载图谱信息
def load_kg_schema(self,path):
with open(path,encoding='utf8') as f:
schema=json.load(f)
self.relation_set=set(schema['relations'])
self.entity_set=set(schema['entitys'])
self.label_set=set(schema['labels'])
self.attribute_set=set(schema['attributes'])
return
#加载模板信息
def load_question_templet(self,templet_path):
dataframe=pandas.read_excel(templet_path)
self.question_templet=[]
for index in range(len(dataframe)):
question=dataframe["question"][index]
cypher=dataframe['cypher'][index]
cypher_check=dataframe["check"][index]
answer=dataframe["answer"][index]
self.question_templet.append([question,cypher,json.loads(cypher_check),answer])
return
#获取问题中谈到的实体,可以使用基于词表的方式,也可以使用NER模型
def get_mention_entitys(self,sentence):
return re.findall("|".join(self.entity_set),sentence)
#获取问题中谈到的关系,也可以使用各种文本分类模型
def get_mention_relations(self,sentence):
return re.findall("|".join(self.relation_set),sentence)
#获取问题中谈到的属性
def get_mention_attributes(self,sentence):
return re.findall("|".join(self.attribute_set),sentence)
#获取问题中的谈到的标签
def get_mention_labels(self,sentence):
return re.findall("|".join(self.label_set),sentence)
#对问题进行预处理,提取需要的信息
def parse_sentence(self,sentence):
entitys=self.get_mention_entitys(sentence)
relations=self.get_mention_relations(sentence)
labels=self.get_mention_labels(sentence)
attributes=self.get_mention_attributes(sentence)
return{
"%ENT%":entitys,
"%REL":relations,
"%LAB%":labels,
"%ATT%":attributes
}
#将提取到的值分配到键上
def decode_value_combination(self,value_combination,cypher_check):
res={}
for index,(key,required_count) in enumerate(cypher_check.items()):
if required_count==1:
res[key]=value_combination[index][0]
else:
for i in range(required_count):
key_num=key[:-1]+str(i)+"%"
res[key_num]=value_combination[index][i]
return res
#对于找到了超过模板中需求的实体数量的情况,需要进行排列组合
#info:{"%ENT%":["周杰伦","方文山"],“%REL%”:["作曲"]}
def get_combinations(self,cypher_check,info):
slot_values=[]
for key,required_count in cypher_check.items():
slot_values.append(itertools.combinations(info[key],required_count))
value_combinations=itertools.product(*slot_values)
combinations=[]
for value_combination in value_combinations:
combinations.append(self.decode_value_combination(value_combination,cypher_check))
return combinations
#将带有token的模板替换成真实词
#string:%ENT1%和%ENT2%是%REL%关系吗
#combination:{“%ENT1%”:"word1","%ENT2%":"word2"}
def replace_token_in_string(self,string,combination):
for key,value in combination.items():
string = string.replace(key,value)
return string
#对于单条模板,根据抽取到的实体属性信息扩展,形成一个列表
#info:{"%ENT%":["周杰伦","方文山"],“%REL%”:["作曲"]}
def expend_templet(self,templet,cypher,cypher_check,info,answer):
combinations=self.get_combinations(cypher_check,info)
templet_cypher_pair=[]
for combination in combinations:
replaced_templet=self.replace_token_in_string(templet,combination)
replaced_cypher=self.replace_token_in_string(cypher,combination)
replaced_answer=self.replace_token_in_string(answer,combination)
templet_cypher_pair.append([replaced_templet,replaced_cypher,replaced_answer])
return templet_cypher_pair
#验证从文本中提取到的信息是否足够填充模板,如果不够就跳过,节省运算速度。
#如模板:%ENT%和%ENT%是什么关系? 这句话需要两个实体才能填充,如果问题中只有一个,该模板无法匹配
def check_cypher_info_valid(self,info,cypher_check):
for key,required_count in cypher_check.items():
if len(info.get(key,[]))<required_count:
return False
return True
#根据提取到的实体,关系等信息,将模板展开成待匹配的问题文本
def expand_question_and_cypher(self,info):
templet_cypher_pair=[]
for templet,cypher,cypher_check,answer in self.question_templet:
if self.check_cypher_info_valid(info,cypher_check):
templet_cypher_pair+=self.expand_templet(templet,cypher,cypher_check,info,answer)
return templet_cypher_pair
#距离函数,文本匹配的所有方法都可以使用
def sentence_similarity_function(self,string1,string2):
print("计算 %s %s"%(string1,string2))
jaccard_distance=len(set(string1)&set(string2))/len(set(string1)|set(string2))
return jaccard_distance
#通过问题匹配的方式确定匹配的cypher
def cypher_match(self,sentence,info):
templet_cypher_pair=self.expand_question_and_cypher(info)
result=[]
for templet,cypher,answer in templet_cypher_pair:
score=self.sentence_similarity_function(sentence,templet)
result.append([templet,cypher,score,answer])
result=sorted(result,reverse=True,key=lambda x:x[2])
return result
#解析结果
def parse_result(self,graph_search_result,answer,info):
graph_search_result=graph_search_result[0]
#关系查找返回的结果形式较为特殊,单独处理
if "REL" in graph_search_result:
graph_search_result['REL']=list(graph_search_result["REL"].types())[0]
answer=self.replace_token_in_string(answer,graph_search_result)
return answer
#对外提供问答接口
def query(self,sentence):#sentence:谁导演的不能说的秘密
info=self.parse_sentence(sentence)#信息抽取
templet_cypher_score=self.cypher_match(sentence,info)
for templet,cypher,score,answer in templet_cypher_score:
graph_search_result=self.graph.run(cypher).data()
#最高分命中的模板不一定在图上能找到答案,当不能找到答案时,运行一个搜案语句,找答案时停止查找后面的模板
if graph_search_result:
break
answer=self.parse_resule(graph_search_result,answer,info)
#通过问题匹配的方式确定匹配的cypher
def cypher_match(self,sentence,info):
templet_cypher_pair=self.expand_question_and_cypher(info)
result=[]
for templet,cypher,answer in templet_cypher_pair:
score=self.sentence_similarity_function(sentence,templet)
result.append([templet,cypher,score,answer])
result=sorted(result,reverse=True,key=lambda x:x[2])
return result
if __name__=="__main__":
graph=GraphQA()
res=graph.query("谁导演的不能说的秘密")
print(res)