# Core code: initialization step that loads and returns the dictionaries used later.
def _load_lines(path):
    """Return the set of lines of the UTF-8 text file at *path*, newline removed."""
    with open(path, encoding='utf-8') as handle:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally would truncate a final line that has no trailing
        # newline.
        return {line.rstrip('\n') for line in handle}


def my_initial():
    """Load the lookup dictionaries stored under ``../data/dict``.

    Returns:
        A tuple ``(d_4_delete, stop_word, d_city_province)`` of sets:
        the company suffixes, the stop words, and the union of the city
        and province names.

    Raises:
        OSError: if any of the four dictionary files cannot be opened.
    """
    # City names and province names are merged into a single set.
    d_city_province = _load_lines("../data/dict/co_City_Dim.txt")
    d_city_province |= _load_lines("../data/dict/co_Province_Dim.txt")
    # Company suffixes.
    d_4_delete = _load_lines(r"../data/dict/company_suffix.txt")
    # Stop words.
    stop_word = _load_lines(r"../data/dict/stopwords.txt")
    return d_4_delete, stop_word, d_city_province
二、使用 TF-IDF 完成实体消歧
# Build the keyword list: entity names that need disambiguation are stored in keyword_list.
import collections

# Every name is followed by '|' and the joined string is split on '|' below,
# so a cell that itself contains '|' contributes several names.
# str.join avoids the quadratic cost of growing the string with += in a loop.
s = ''.join(name + '|' for name in entity_data['entity_name'].values.tolist())
# Keep names that occur more than once. The trailing '|' (and any empty or
# '|'-terminated cell) produces empty tokens; those are not entity names and
# are skipped so '' can never end up as a keyword.
keyword_list = [
    name
    for name, count in collections.Counter(s.split('|')).items()
    if count > 1 and name
]
# Build the TF-IDF matrix.
from sklearn.feature_extraction.text import TfidfVectorizer

# Segment each description with jieba and join the tokens with spaces before
# handing the text to the vectorizer.
train_sentence = [' '.join(jieba.cut(desc)) for desc in entity_data['desc'].values]
# NOTE(review): TfidfVectorizer's default token_pattern keeps only tokens of
# two or more word characters, so single-character jieba tokens are dropped --
# confirm this is intended.
vectorizer = TfidfVectorizer()
# X has one row per description, in the same order as entity_data['desc'].
X = vectorizer.fit_transform(train_sentence)
#获取包含关键词的句子中关键词所属的entity_id
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
def get_entityid(sentence):
    """Return the id of the entity whose description best matches *sentence*.

    The sentence is segmented with jieba, transformed with the module-level
    ``vectorizer`` and compared by cosine similarity against every row of
    the module-level matrix ``X``. The result is 1001 plus the index of the
    highest-scoring row.
    """
    # Offset added to the row index.
    # NOTE(review): presumably entity ids start at 1001 in row order -- confirm.
    id_start = 1001
    segmented = ' '.join(jieba.cut(sentence))
    query_vec = vectorizer.transform([segmented])
    # cosine_similarity returns a 1 x n_rows matrix; take its only row.
    scores = cosine_similarity(query_vec, X)[0]
    # Ascending argsort: the last position holds the highest-scoring row.
    ranking = np.argsort(scores)
    best_row = ranking[-1]
    return id_start + best_row
代码
public static boolean isAnagram(String s,String t){char[] x s.toCharArray();char[] y t.toCharArray();Arrays.sort(x);Arrays.sort(y);boolean val Arrays.equals(x, y);return val;}
测试如下
public static void main(String[] args) {String s "anag…