目录
GloVe简介
1.使用预训练的GloVe的词向量(英文文本用得最多)
2.自己训练Glove词向量
3. 知识点
GloVe简介
GloVe的全称叫Global Vectors for Word Representation,它是一个基于全局词频统计(count-based & overall statistics)的词表征(word representation)工具, 是斯坦福大学在2014年提出的模型. 论文地址: https://aclanthology.org/D14-1162.pdf
1.使用预训练的GloVe的词向量(英文文本用得最多)
#导包
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
# Locate the pre-trained GloVe vector file and convert it to word2vec text format.
# The path must be absolute.
# The r prefix makes this a raw string, so a backslash is taken literally and does not need to be doubled.
glove_file = datapath(r'E:/ALOT/10_deep_learning/data/glove.6B.100d.txt')
# Get a path in the temporary folder for the converted file.
# Per the gensim docs, get_tmpfile only returns the path; it does not create the file.
# word2vec_glove_file = get_tmpfile('glove.6B.100d.txt')
# NOTE(review): the name below is spelled "worde2vec" (sic). The later call that loads the
# model uses the same spelling, so rename both together if this is cleaned up.
worde2vec_glove_file = get_tmpfile('glove.6B.100d.word2vec.txt')
# Convert the GloVe file to word2vec format and write the result to the temporary path.
# NOTE(review): gensim 4.x documents glove2word2vec as deprecated in favour of
# KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True) - confirm
# against the installed gensim version.
glove2word2vec(glove_file, worde2vec_glove_file)
# The conversion call reports the size of the vector table: (400000, 100),
# i.e. 400000 word vectors with 100 dimensions each.
model = KeyedVectors.load_word2vec_format(worde2vec_glove_file)  # load the GloVe vectors converted to word2vec format
model.most_similar('apple')  # find the words most similar to 'apple'
# Output:
[('microsoft', 0.7449405193328857), ('ibm', 0.6821643114089966), ('intel', 0.6778088212013245), ('software', 0.6775422692298889), ('dell', 0.6741442680358887), ('pc', 0.6678153276443481), ('macintosh', 0.6617538332939148), ('iphone', 0.6595612168312073), ('ipod', 0.6534676551818848), ('hewlett', 0.6516579389572144)]
# Analogy query: 'king' corresponds to 'man'; following the same relation,
# find the words that correspond to 'woman'.
model.most_similar(positive=['woman', 'king'], negative=['man'])
[('queen', 0.7698540687561035), ('monarch', 0.6843381524085999), ('throne', 0.6755736470222473), ('daughter', 0.6594556570053101), ('princess', 0.6520534157752991), ('prince', 0.6517034769058228), ('elizabeth', 0.6464517712593079), ('mother', 0.631171703338623), ('emperor', 0.6106470823287964), ('wife', 0.6098655462265015)]
2.自己训练Glove词向量
#在环境中安装glove包
!pip install glove-python-binary -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install pkuseg -i https://pypi.tuna.tsinghua.edu.cn/simple #北京大学研发的分词库
#导包
import gensim
from glove import Glove
from glove import Corpus
import pkuseg
# Word segmentation: segment the raw text file and write the result to a second file.
# Both paths are relative paths.
# nthread=5: number of processes to run in parallel for segmentation
pkuseg.test(r'../data/不要等到毕业以后.txt', r'../data/不要等到毕业以后_分词.txt', nthread=5)
# Prepare the dataset: read the segmented file into a list of token lists.
with open(r'../data/不要等到毕业以后_分词.txt', 'r', encoding='utf-8') as f:
    # Skip blank lines (line.strip() removes leading/trailing whitespace), drop the
    # newline, and split each remaining line on single spaces.
    # Iterating the file object directly avoids building the intermediate list that
    # f.readlines() would create.
    sentences = [line.replace('\n', '').split(' ') for line in f if line.strip()]  # sentences is a 2-D list
sentences
#输出太长,部分展示:
[['迄今', '最', '实用', '的', '大学生', '人生', '规划', '工具书', '。'], ['这是', '一', '本', '用心', '打磨', '的', '书', '。'], ['完全', '不同于', '那些', '东拼西凑', '的', '大学生', '人生', '指南', ';'], ['没有', '理论', '的', '说教', ',', '只有', '苦口婆心', '地', '规劝', '。'], ['57', '条', '人生', '成长', '建议', '、', '57', '个',]
# Create the corpus model
corpus_model = Corpus() # create a Corpus instance
# Build the co-occurrence matrix from the tokenised sentences
corpus_model.fit(sentences, window=10)
corpus_model.save('../data/corpus_model') # save the fitted corpus model so it can be loaded again later
corpus_model.dictionary # inspect the word-to-index mapping
输出部分展示:
{'迄今': 0, '最': 1, '实用': 2, '的': 3, '大学生': 4, '人生': 5, '规划': 6, '工具书': 7, '。': 8, '这是': 9,}
corpus_model.matrix.nnz # number of stored non-zero entries in the co-occurrence matrix (not the number of words)
16184
# Training
# no_components: dimensionality of the word vectors to learn; rule of thumb is to stay below 300
glove = Glove(no_components=100, learning_rate=0.05)
# no_threads=1: number of threads used for training
# verbose=True: print progress logs during training
glove.fit(corpus_model.matrix, epochs=10, no_threads=1, verbose=True)
Performing 10 training epochs with 1 threads Epoch 0 Epoch 1 Epoch 2 Epoch 3 Epoch 4 Epoch 5 Epoch 6 Epoch 7 Epoch 8 Epoch 9
# Attach the corpus word-to-index dictionary to the trained model so that
# vectors can be looked up by word.
glove.add_dictionary(corpus_model.dictionary)
glove.dictionary # inspect the dictionary now stored on the model
输出部分展示:
{'迄今': 0, '最': 1, '实用': 2, '的': 3, '大学生': 4, '人生': 5, '规划': 6, '工具书': 7, '。': 8, '这是': 9, '一': 10,}
# Look up the vector of a single word: map the word to its index via the
# dictionary, then use that index to select its row from word_vectors.
glove.word_vectors[glove.dictionary['我']]
array([-3.85271677e-03, 6.03527930e-03, -6.46135204e-03, 7.04933438e-04, -3.00269815e-04, -6.35298652e-03, -5.04293920e-03, -2.87305933e-03, 9.96562326e-06, -1.28239511e-03, 2.32297241e-03, 1.67665561e-03, -4.88860444e-03, 2.11428079e-03, 4.59453142e-04, -9.32589063e-03, -4.38411281e-03, -2.58539354e-03, -1.21059775e-02, -3.58457042e-03, 4.42692637e-04, 4.25907889e-03, 8.23455533e-04, -7.69576469e-03, 4.53250350e-04, 1.43330507e-03, -1.93765326e-03, -5.62252827e-03, -6.46001688e-04, 1.00060915e-03, -6.56430222e-03, -9.70502832e-03, 4.73688765e-03, -8.92992657e-04, 3.56808700e-03, 9.79184843e-03, 9.15090144e-03, 4.93799065e-04, -5.08780746e-03, -2.17941323e-04, 2.21646527e-04, 3.65010548e-03, 4.65875282e-03, -3.11105523e-03, 3.47489254e-03, 1.28445053e-03, -7.86476014e-03, -3.82036168e-03, -4.68850099e-03, -2.73481768e-03, 6.43765283e-03, 3.74767271e-04, -3.61486712e-03, -2.67486806e-03, 2.20565042e-03, -7.80749700e-04, -6.51855200e-03, -1.89578758e-03, 1.97327613e-03, -5.57302319e-03, -2.51269656e-03, 1.28111552e-03, 3.78035822e-06, -7.56995098e-03, 3.88149824e-03, -2.34932018e-03, -5.40425079e-03, -1.80393452e-03, 5.03049188e-03, -5.58385635e-03, 3.37829026e-03, -2.29710432e-03, 9.08578290e-03, -4.35930405e-03, 9.61504496e-03, 1.15320991e-02, 3.32939316e-03, 9.24636042e-03, -9.36855207e-03, 1.15861179e-02, -5.50878460e-03, 8.79297798e-03, -3.17951379e-04, 4.44059952e-03, -2.52683441e-03, -3.06109383e-03, 7.93721916e-04, -2.56597966e-03, -5.98975245e-05, -2.69277017e-03, 8.00875023e-03, -8.21644481e-03, 1.72800689e-03, 1.52961102e-03, -1.60708166e-04, 4.23559648e-03, -1.08741576e-03, -5.77238705e-03, 8.43079576e-03, 3.88278545e-03])
# Find the words most similar to a given word
# NOTE(review): the sample output below lists 9 entries for number=10 - presumably the
# query word itself counts toward `number`; confirm against the glove-python source.
glove.most_similar('专业',number=10)
[('的', 0.8430016342292636), (',', 0.8206669513798159), ('是', 0.6985497214327246), ('。', 0.6971518842587385), ('我', 0.6554453225214771), ('你', 0.6365204850038154), ('不', 0.5187983190436662), ('自己', 0.49869570695139864), ('一', 0.47569447514853663)]
# Shape of the full word-vector matrix
glove.word_vectors.shape
(939, 100)
glove.word_vectors # display the full word-vector matrix
array([[ 2.22664080e-03, -4.40492727e-04, 1.56658813e-03, ..., -3.70435501e-03, 2.13948809e-03, -3.67479285e-03], [-2.59109652e-03, 4.25853419e-03, -2.38113383e-03, ..., 2.06839387e-03, -4.34270506e-03, 2.74836134e-03], [-5.90079017e-04, 4.30749793e-03, 3.15428115e-03, ..., 1.83901467e-03, -2.14947886e-03, 6.05205719e-05], ..., [ 3.74231450e-04, 2.05554199e-03, 1.57123457e-03, ..., -2.67770537e-05, 2.63345084e-03, 1.49179696e-03], [-8.56595044e-04, -4.88608984e-03, -1.88634857e-03, ..., 6.74149560e-04, 3.00811616e-03, -4.58626075e-03], [-3.12888567e-03, 8.80138770e-05, -4.06701493e-03, ..., 1.17048113e-03, -2.72183947e-04, 2.05452380e-03]])
# Shape of the co-occurrence matrix (for a large corpus this matrix is large too).
# Read .shape directly from the sparse matrix: calling .todense() first would
# allocate the full dense vocabulary-by-vocabulary array just to report the same shape.
corpus_model.matrix.shape
(939, 939)
3. 知识点
GloVe的全称叫Global Vectors for Word Representation,它是一个基于全局词频统计(count-based & overall statistics)的词表征(word representation)工具, 是斯坦福大学在2014年提出的模型. 论文地址: https://aclanthology.org/D14-1162.pdf