This post works through the following questions:
- Why word2vec?
- Why does semantic word2vec outperform non-semantic word2vec?
- Results from the CBOW word2vec model
- Implementing TF-IDF, a co-occurrence matrix, CBOW, and skip-gram in TensorFlow
- Retrieving similar words with the trained word embeddings
1. Why word2vec?
- This blog post explains it well (I may write up my own notes later): http://www.cnblogs.com/pinard/p/7160330.html
2. Why does semantic word2vec outperform non-semantic word2vec?
- It yields better vector representations of words
- Judging by results when comparing several word2vec approaches, CBOW also does better on small datasets
3. Results from the CBOW word2vec model (there is still a lot of room for optimization, and the training data is limited)
enter a word to search:父亲
('母亲', 0.5006737825399452)
('姐姐', 0.5005755597664082)
('政变', 0.5005573175296762)
('那一年', 0.5005278451741275)
('都回', 0.5005222345952383)
('孙子', 0.5005130261421589)
('竟被', 0.5005126096655758)
('8岁', 0.5005020163846302)
('姨妈', 0.5005019204893351)
('恩人', 0.5004979418283539)
enter a word to search:清华大学
('双保险', 0.5005635965752527)
('校名', 0.5005391632766295)
('反响', 0.5005145346754163)
('前几年', 0.500504904257443)
('学术会议', 0.5004996103585474)
('访问学者', 0.5004951920034038)
('其父', 0.5004868092502031)
('候机楼', 0.5004797690068876)
('前门', 0.5004781277454816)
('艺术设计', 0.5004773543246614)
enter a word to search:母亲
('父亲', 0.500673782524676)
('丈夫', 0.5005681116695313)
('孙儿', 0.5005407456399389)
('正牌', 0.5005407121014431)
('姐姐', 0.5005404523776542)
('考官', 0.5005350019987418)
('专供', 0.5005300052825112)
('to.', 0.5005233894343813)
('没门', 0.5005207594453653)
('生意人', 0.500518060889508)
enter a word to search:教育
('高等教育', 0.5007404247842682)
('性教育', 0.5006342135348033)
('严正声明', 0.5005783842259223)
('不仅是', 0.5005504417322837)
('教养', 0.5005198775820252)
('精彩演出', 0.5005141053295005)
('医疗', 0.5005133085493552)
('教职员工', 0.5005078418282736)
('教学', 0.5005068915769201)
('医疗卫生', 0.5004921731608394)
4. Implementing TF-IDF, a co-occurrence matrix, CBOW, and skip-gram in TensorFlow
Data: https://pan.baidu.com/s/1v-7aaAHWsx7NZ5d3IdWbiQ (password: k5tx)
load_data.py (data loading)
#!/usr/bin/env python3
# coding: utf-8


class DataLoader:
    def __init__(self):
        self.datafile = 'data/data.txt'
        self.dataset = self.load_data()

    '''Load the dataset'''
    def load_data(self):
        dataset = []
        for line in open(self.datafile):
            line = line.strip().split(',')
            # keep the segmented tokens, dropping 'nbsp' artifacts and overly long tokens
            dataset.append([word for word in line[1].split(' ') if 'nbsp' not in word and len(word) < 11])
        return dataset
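A minimal usage sketch of the loader, assuming each line of data/data.txt has the form label,word word word ... (only the second comma-separated field, the pre-segmented text, is used):
# Hypothetical quick check; the file path and line format are assumptions based on load_data above.
loader = DataLoader()
print(len(loader.dataset))       # number of documents
print(loader.dataset[0][:10])    # first ten tokens of the first document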
word2doc.py (TF-IDF)
#!/usr/bin/env python3
from load_data import *
import collections
import math
import numpy as np
from sklearn.decomposition import PCA


class WordVector:
    def __init__(self):
        self.dataset = DataLoader().dataset
        self.min_count = 5
        self.window_size = 5
        self.word_demension = 200
        self.embedding_path = 'model/word2doc_wordvec.bin'

    # Count word frequencies and keep the words above the frequency threshold
    def build_word_dict(self):
        words = []
        for data in self.dataset:
            words.extend(data)
        reserved_words = [item for item in collections.Counter(words).most_common() if item[1] >= self.min_count]
        word_dict = {item[0]: item[1] for item in reserved_words}
        return word_dict

    # Compute the IDF of each word
    def build_wordidf_dict(self):
        df_dict = {}
        sum_df = len(self.dataset)
        for data in self.dataset:
            for word in set(data):
                if word not in df_dict:
                    df_dict[word] = 1
                else:
                    df_dict[word] += 1
        idf_dict = {word: math.log(sum_df / word_df + 1) for word, word_df in df_dict.items()}
        return idf_dict

    # Compute per-document TF-IDF weights for each word
    def build_wordtfidf_dict(self):
        wordidf_dict = self.build_wordidf_dict()
        doctfidf_dict = {}
        for index, data in enumerate(self.dataset):
            doc_words = {item[0]: item[1] for item in collections.Counter(data).most_common()}
            sum_tf = sum(doc_words.values())
            doc_wordtf = {word: word_count / sum_tf for word, word_count in doc_words.items()}
            doc_wordtfidf = {word: word_tf * wordidf_dict[word] for word, word_tf in doc_wordtf.items()}
            doctfidf_dict[index] = doc_wordtfidf
        return doctfidf_dict

    # Build the word-document matrix of TF-IDF weights
    def build_worddoc_matrix(self):
        worddoc_matrix = []
        doctfidf_dict = self.build_wordtfidf_dict()
        word_list = list(self.build_word_dict().keys())
        word_all = len(word_list)
        word_dict = {index: word for index, word in enumerate(word_list)}
        count = 0
        for word_id, word in word_dict.items():
            tmp = []
            for doc_index, doc_tfidf in doctfidf_dict.items():
                weight = doc_tfidf.get(word, 0)
                tmp.append(weight)
            count += 1
            print(count, '/', word_all)
            worddoc_matrix.append(tmp)
        worddoc_matrix = np.array(worddoc_matrix)
        return worddoc_matrix

    # Reduce dimensionality with PCA
    def low_dimension(self):
        worddoc_matrix = self.build_worddoc_matrix()
        pca = PCA(n_components=self.word_demension)
        low_embedding = pca.fit_transform(worddoc_matrix)
        return low_embedding

    # Save the model
    def train_embedding(self):
        print('training.....')
        word_list = list(self.build_word_dict().keys())
        word_dict = {index: word for index, word in enumerate(word_list)}
        word_embedding_dict = {index: embedding for index, embedding in enumerate(self.low_dimension())}
        print('saving models.....')
        with open(self.embedding_path, 'w+') as f:
            for word_index, word_embedding in word_embedding_dict.items():
                word_word = word_dict[word_index]
                word_embedding = [str(item) for item in word_embedding]
                f.write(word_word + '\t' + ','.join(word_embedding) + '\n')
        print('done.....')


vec = WordVector()
vec.train_embedding()
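For reference, the weighting implemented above is tf(w, d) = count(w, d) / len(d) and idf(w) = log(N / df(w) + 1). A tiny hand-checkable sketch with made-up numbers:
import math

N = 1000         # assumed number of documents
df = 50          # assumed number of documents containing the word
tf = 3 / 200     # the word appears 3 times in a 200-token document
idf = math.log(N / df + 1)   # the same smoothing used in build_wordidf_dict
print(tf * idf)  # the weight this word would get in build_wordtfidf_dict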
word2word.py (co-occurrence matrix)
#!/usr/bin/env python3
# coding: utf-8
from load_data import *
import collections
from sklearn.decomposition import PCA


class WordVector:
    def __init__(self):
        self.dataset = DataLoader().dataset[:1000]
        self.min_count = 5
        self.window_size = 5
        self.word_demension = 200
        self.embedding_path = 'model/word2word_wordvec.bin'

    # Count word frequencies and keep the words above the frequency threshold
    def build_word_dict(self):
        words = []
        for data in self.dataset:
            words.extend(data)
        reserved_words = [item for item in collections.Counter(words).most_common() if item[1] >= self.min_count]
        word_dict = {item[0]: item[1] for item in reserved_words}
        return word_dict

    # Build context windows and count word-word co-occurrences
    def build_word2word_dict(self):
        word2word_dict = {}
        for data_index, data in enumerate(self.dataset):
            for index in range(len(data)):
                if index < self.window_size:
                    left = data[:index]
                else:
                    left = data[index - self.window_size: index]
                if index + self.window_size > len(data):
                    right = data[index + 1:]
                else:
                    right = data[index + 1: index + self.window_size + 1]
                context = left + [data[index]] + right
                for word in context:
                    if word not in word2word_dict:
                        word2word_dict[word] = {}
                    for co_word in context:
                        if co_word != word:
                            if co_word not in word2word_dict[word]:
                                word2word_dict[word][co_word] = 1
                            else:
                                word2word_dict[word][co_word] += 1
            print(data_index)
        return word2word_dict

    # Build the word-word co-occurrence matrix, normalizing each row by its total count
    def build_word2word_matrix(self):
        word2word_dict = self.build_word2word_dict()
        word_dict = self.build_word_dict()
        word_list = list(word_dict)
        word2word_matrix = []
        words_all = len(word_list)
        count = 0
        for word1 in word_list:
            count += 1
            print(count, '/', words_all)
            tmp = []
            sum_tf = sum(word2word_dict[word1].values())
            for word2 in word_list:
                weight = word2word_dict[word1].get(word2, 0) / sum_tf
                tmp.append(weight)
            word2word_matrix.append(tmp)
        return word2word_matrix

    # Reduce dimensionality with PCA
    def low_dimension(self):
        worddoc_matrix = self.build_word2word_matrix()
        pca = PCA(n_components=self.word_demension)
        low_embedding = pca.fit_transform(worddoc_matrix)
        return low_embedding

    # Save the model
    def train_embedding(self):
        print('training.....')
        word_list = list(self.build_word_dict().keys())
        word_dict = {index: word for index, word in enumerate(word_list)}
        word_embedding_dict = {index: embedding for index, embedding in enumerate(self.low_dimension())}
        print('saving models.....')
        with open(self.embedding_path, 'w+') as f:
            for word_index, word_embedding in word_embedding_dict.items():
                word_word = word_dict[word_index]
                word_embedding = [str(item) for item in word_embedding]
                f.write(word_word + '\t' + ','.join(word_embedding) + '\n')
        print('done.....')


vec = WordVector()
vec.train_embedding()
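To make the windowing in build_word2word_dict concrete, here is the same symmetric window run standalone on a toy token list (window_size is set to 2 here purely for illustration):
# Toy illustration of the context window used above; the tokens and window_size are made up.
tokens = ['我', '爱', '北京', '天安门', '广场']
window_size = 2
for index in range(len(tokens)):
    left = tokens[max(0, index - window_size): index]
    right = tokens[index + 1: index + window_size + 1]
    print(tokens[index], '->', left + right)   # the co-occurring words counted for this position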
cbow.py
#!/usr/bin/env python3
# coding: utf-8
import math
import numpy as np
import tensorflow as tf
import collections
from load_data import *


class CBOW:
    def __init__(self):
        self.data_index = 0
        self.min_count = 5         # minimum word frequency to keep a word in the vocabulary
        self.batch_size = 200      # number of samples per training step
        self.embedding_size = 200  # dimensionality of the word vectors
        self.window_size = 1       # number of context words considered on each side of the target
        self.num_steps = 1000000   # maximum number of training steps
        self.num_sampled = 100     # number of negative examples to sample
        self.trainfilepath = './data/data'
        self.modelpath = './model/cbow_wordvec.bin'
        self.dataset = DataLoader().dataset
        self.words = self.read_data(self.dataset)

    # Read the dataset and flatten it into a single list of words
    def read_data(self, dataset):
        words = []
        for data in dataset:
            words.extend(data)
        return words

    # Build the dataset: vocabulary, word IDs, and reverse lookup
    def build_dataset(self, words, min_count):
        # Build the vocabulary and filter low-frequency words (min_count >= 5 here);
        # everything else is mapped to 'UNK' with ID 0, mirroring the min_count option
        # in gensim's word2vec.
        # Each word in the original list is then replaced by its vocabulary ID and
        # stored in data, while UNK occurrences are counted.
        count = [['UNK', -1]]
        reserved_words = [item for item in collections.Counter(words).most_common() if item[1] >= min_count]
        count.extend(reserved_words)
        dictionary = dict()
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = list()
        unk_count = 0
        for word in words:
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0  # dictionary['UNK']
                unk_count = unk_count + 1
            data.append(index)
        count[0][1] = unk_count
        print(len(count))
        reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        return data, count, dictionary, reverse_dictionary

    # Generate a training batch; the assert verifies the batch has the expected shape
    def generate_batch(self, batch_size, skip_window, data):
        # Walk over the corpus in order, pairing each target word with its surrounding context.
        # batch_size: number of samples per batch
        # skip_window: how far a context word may be from the target word
        span = 2 * skip_window + 1  # [ skip_window target skip_window ]
        batch = np.ndarray(shape=(batch_size, span - 1), dtype=np.int32)
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
        buffer = collections.deque(maxlen=span)
        for _ in range(span):
            buffer.append(data[self.data_index])
            self.data_index = (self.data_index + 1) % len(data)
        for i in range(batch_size):
            target = skip_window
            target_to_avoid = [skip_window]
            col_idx = 0
            for j in range(span):
                if j == span // 2:
                    continue
                batch[i, col_idx] = buffer[j]
                col_idx += 1
            labels[i, 0] = buffer[target]
            buffer.append(data[self.data_index])
            self.data_index = (self.data_index + 1) % len(data)
        assert batch.shape[0] == batch_size and batch.shape[1] == span - 1
        return batch, labels

    def train_wordvec(self, vocabulary_size, batch_size, embedding_size, window_size, num_sampled, num_steps, data):
        # Define the network structure of the CBOW word2vec model
        graph = tf.Graph()
        with graph.as_default(), tf.device('/cpu:0'):
            train_dataset = tf.placeholder(tf.int32, shape=[batch_size, 2 * window_size])
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
            softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
            softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
            # Unlike skip-gram, the CBOW input is the mean of the context vectors
            context_embeddings = []
            for i in range(2 * window_size):
                context_embeddings.append(tf.nn.embedding_lookup(embeddings, train_dataset[:, i]))
            avg_embed = tf.reduce_mean(tf.stack(axis=0, values=context_embeddings), 0, keep_dims=False)
            loss = tf.reduce_mean(
                tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=avg_embed,
                                           labels=train_labels, num_sampled=num_sampled,
                                           num_classes=vocabulary_size))
            optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
            normalized_embeddings = embeddings / norm
        with tf.Session(graph=graph) as session:
            tf.global_variables_initializer().run()
            print('Initialized')
            average_loss = 0
            for step in range(num_steps):
                batch_data, batch_labels = self.generate_batch(batch_size, window_size, data)
                feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
                _, l = session.run([optimizer, loss], feed_dict=feed_dict)
                average_loss += l
                if step % 2000 == 0:
                    if step > 0:
                        average_loss = average_loss / 2000
                    print('Average loss at step %d: %f' % (step, average_loss))
                    average_loss = 0
            final_embeddings = normalized_embeddings.eval()
        return final_embeddings

    # Save the embedding file
    def save_embedding(self, final_embeddings, model_path, reverse_dictionary):
        f = open(model_path, 'w+')
        for index, item in enumerate(final_embeddings):
            f.write(reverse_dictionary[index] + '\t' + ','.join([str(vec) for vec in item]) + '\n')
        f.close()

    # Main training routine
    def train(self):
        data, count, dictionary, reverse_dictionary = self.build_dataset(self.words, self.min_count)
        vocabulary_size = len(count)
        final_embeddings = self.train_wordvec(vocabulary_size, self.batch_size, self.embedding_size, self.window_size, self.num_sampled, self.num_steps, data)
        self.save_embedding(final_embeddings, self.modelpath, reverse_dictionary)


def test():
    vector = CBOW()
    vector.train()


test()
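A quick sanity check of the batch layout that generate_batch produces, assuming the CBOW class above is available (e.g. with the trailing test() call commented out); the ID sequence is made up. With window_size = 1, each row of batch holds the two context IDs and labels holds the center ID:
# Hypothetical toy check of CBOW.generate_batch.
cbow = CBOW.__new__(CBOW)   # bypass __init__ so no data file is needed
cbow.data_index = 0
toy_data = [3, 7, 1, 9, 4, 2, 8, 5]
batch, labels = cbow.generate_batch(batch_size=4, skip_window=1, data=toy_data)
print(batch)    # shape (4, 2): left and right context IDs per row
print(labels)   # shape (4, 1): center word IDs, here 7, 1, 9, 4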
skipgram.py
#!/usr/bin/env python3
# coding: utf-8
import collections
import math
import random
import numpy as np
import tensorflow as tf
from load_data import *


class SkipGram:
    def __init__(self):
        self.data_index = 0
        self.trainpath = './data/data.txt'
        self.modelpath = './model/skipgram_wordvec.bin'
        self.min_count = 5         # minimum word frequency to keep a word in the vocabulary
        self.batch_size = 200      # number of samples per training step
        self.embedding_size = 200  # dimensionality of the word vectors
        self.window_size = 5       # context window size; each center word yields window_size * 2 (center, context) pairs
        self.num_sampled = 100     # number of negative examples to sample
        self.num_steps = 1000000   # maximum number of training steps
        self.dataset = DataLoader().dataset
        self.words = self.read_data(self.dataset)

    # Read the dataset and flatten it into a single list of words
    def read_data(self, dataset):
        words = []
        for data in dataset:
            words.extend(data)
        return words

    # Build the dataset: vocabulary, word IDs, and reverse lookup
    def build_dataset(self, words, min_count):
        # Build the vocabulary and filter low-frequency words (min_count >= 5 here);
        # everything else is mapped to 'UNK' with ID 0, mirroring the min_count option
        # in gensim's word2vec.
        # Each word in the original list is then replaced by its vocabulary ID and
        # stored in data, while UNK occurrences are counted.
        count = [['UNK', -1]]
        count.extend([item for item in collections.Counter(words).most_common() if item[1] >= min_count])
        dictionary = dict()
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = list()
        unk_count = 0
        for word in words:
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count
        # Invert the dictionary so an ID can be mapped back to its word
        reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        return data, count, dictionary, reverse_dictionary

    # Generate a training batch
    def generate_batch(self, batch_size, window_size, data):
        # Walk over the corpus in order and pair each center word with words drawn from
        # its context window; e.g. for data[1] the pairs [data[1], data[0]] and
        # [data[1], data[2]] are generated: the center word goes into batch, the context
        # words go into labels.
        # batch_size: number of samples per batch; it must be a multiple of
        #   window_size * 2 so that all pairs from one center word land in the same batch
        # window_size: how far a context word may be from the center word
        '''
        e.g. (with the TensorFlow tutorial data):
        batch, labels = generate_batch(batch_size=8, window_size=1)
        # Sample data [0, 5241, 3082, 12, 6, 195, 2, 3137, 46, 59]
        #             ['UNK', 'anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used']
        # print(batch)  -> [5241 5241 3082 3082 12 12 6 6]   (each center word repeated window_size * 2 times)
        # print(labels) -> [[0] [3082] [12] [5241] [6] [3082] [12] [195]]  (one context word per entry)
        '''
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)      # holds the center words
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)  # holds one context word per center word, forming a (center, context) pair
        span = 2 * window_size + 1  # full window: [ window_size target window_size ]
        buffer = collections.deque(maxlen=span)  # sliding window over the data, used to build batch and labels
        for _ in range(span):
            buffer.append(data[self.data_index])
            self.data_index = (self.data_index + 1) % len(data)
        # batch_size must be a multiple of window_size * 2 so every center word is fully used
        for i in range(batch_size // (window_size * 2)):
            target = window_size
            targets_to_avoid = [window_size]
            for j in range(window_size * 2):
                while target in targets_to_avoid:
                    target = random.randint(0, span - 1)
                targets_to_avoid.append(target)
                batch[i * window_size * 2 + j] = buffer[window_size]
                labels[i * window_size * 2 + j, 0] = buffer[target]
            buffer.append(data[self.data_index])
            self.data_index = (self.data_index + 1) % len(data)
        return batch, labels

    def train_wordvec(self, vocabulary_size, batch_size, embedding_size, window_size, num_sampled, num_steps, data):
        # Define the network structure of the skip-gram word2vec model
        graph = tf.Graph()
        with graph.as_default():
            # Input center-word IDs, one batch at a time
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            # Target context-word IDs, shape [batch_size, 1]
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            # Train on the CPU
            with tf.device('/cpu:0'):
                # Embedding matrix of shape vocabulary_size x embedding_size, i.e. one
                # embedding_size-dimensional vector per vocabulary word, initialized
                # uniformly at random in [-1, 1)
                embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
                # tf.nn.embedding_lookup selects the rows of the embedding matrix indexed
                # by train_inputs, i.e. it vectorizes the input word IDs
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)
                # Output-layer weights (vocabulary_size x embedding_size), initialized
                # from a truncated normal distribution
                nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
                # Output-layer biases, one per vocabulary word
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
            # NCE loss (negative sampling), averaged over the batch
            loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,           # output weights
                                                 biases=nce_biases,             # output biases
                                                 labels=train_labels,           # target labels
                                                 inputs=embed,                  # input vectors
                                                 num_sampled=num_sampled,       # number of negative samples
                                                 num_classes=vocabulary_size))  # number of classes
            # Optimizer: plain gradient descent
            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
            # L2-normalize each word vector
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
            normalized_embeddings = embeddings / norm
            # Variable initializer
            init = tf.global_variables_initializer()
        # Train the network defined above
        with tf.Session(graph=graph) as session:
            # Initialize all variables
            init.run()
            # Running average of the loss
            average_loss = 0
            # Training loop
            for step in range(num_steps):
                batch_inputs, batch_labels = self.generate_batch(batch_size, window_size, data)
                # feed_dict supplies a value for every placeholder used in this step
                feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
                # Run one optimization step and fetch the loss
                _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
                # Accumulate the loss
                average_loss += loss_val
                if step % 2000 == 0:
                    if step > 0:
                        average_loss /= 2000
                    print("Average loss at step ", step, ":", average_loss)
                    average_loss = 0
            final_embeddings = normalized_embeddings.eval()
        return final_embeddings

    # Save the embedding file
    def save_embedding(self, final_embeddings, reverse_dictionary):
        f = open(self.modelpath, 'w+')
        for index, item in enumerate(final_embeddings):
            f.write(reverse_dictionary[index] + '\t' + ','.join([str(vec) for vec in item]) + '\n')
        f.close()

    # Main training routine
    def train(self):
        data, count, dictionary, reverse_dictionary = self.build_dataset(self.words, self.min_count)
        vocabulary_size = len(count)
        final_embeddings = self.train_wordvec(vocabulary_size, self.batch_size, self.embedding_size, self.window_size, self.num_sampled, self.num_steps, data)
        self.save_embedding(final_embeddings, reverse_dictionary)


def test():
    vector = SkipGram()
    vector.train()


test()
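For contrast with the CBOW batch above, the skip-gram batching can be checked the same way on a made-up ID sequence (again assuming the class is available with the trailing test() call commented out). Context positions are drawn at random, so the exact pairs vary between runs:
# Hypothetical toy check of SkipGram.generate_batch.
sg = SkipGram.__new__(SkipGram)   # bypass __init__ so no data file is needed
sg.data_index = 0
toy_data = [3, 7, 1, 9, 4, 2, 8, 5, 6, 0]
batch, labels = sg.generate_batch(batch_size=8, window_size=2, data=toy_data)
for center, context in zip(batch, labels[:, 0]):
    print(center, '->', context)   # each center word appears window_size * 2 = 4 times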
5. Retrieving similar words with the trained word embeddings
word_cluster.py
#!/usr/bin/env python3
# coding: utf-8
import numpy as np


class WordCluster:
    def __init__(self):
        # Pick one of the trained embedding files; uncomment the one you want to use
        # self.embedding_path = 'model/word2word_wordvec.bin'
        # self.embedding_path = 'model/word2doc_wordvec.bin'
        # self.embedding_path = 'model/skipgram_wordvec.bin'
        self.embedding_path = 'model/cbow_wordvec.bin'
        self.word_embedding_dict, self.word_dict, self.word_embeddings = self.load_model(self.embedding_path)
        self.similar_num = 10

    # Load the word-vector file
    def load_model(self, embedding_path):
        print('loading models....')
        word_embedding_dict = {}
        word_embeddings = []
        word_dict = {}
        index = 0
        for line in open(embedding_path):
            line = line.strip().split('\t')
            word = line[0]
            word_embedding = np.array([float(item) for item in line[1].split(',') if item])
            word_embedding_dict[word] = word_embedding
            word_embeddings.append(word_embedding)
            word_dict[index] = word
            index += 1
        return word_embedding_dict, word_dict, np.array(word_embeddings)

    # Compute the cosine similarity between a word and every other word
    def similarity_cosine(self, word):
        A = self.word_embedding_dict[word]
        B = (self.word_embeddings).T
        dot_num = np.dot(A, B)
        # per-word norms (axis=0 of the transposed matrix) so each cosine is properly scaled
        denom = np.linalg.norm(A) * np.linalg.norm(B, axis=0)
        cos = dot_num / denom
        sims = 0.5 + 0.5 * cos  # map cosine from [-1, 1] to [0, 1]
        sim_dict = {self.word_dict[index]: sim for index, sim in enumerate(sims.tolist()) if word != self.word_dict[index]}
        sim_words = sorted(sim_dict.items(), key=lambda asd: asd[1], reverse=True)[:self.similar_num]
        return sim_words

    # Get the most similar words
    def get_similar_words(self, word):
        if word in self.word_embedding_dict:
            return self.similarity_cosine(word)
        else:
            return []


def test():
    vec = WordCluster()
    while 1:
        word = input('enter a word to search:').strip()
        simi_words = vec.get_similar_words(word)
        for word in simi_words:
            print(word)


test()
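A non-interactive usage sketch of the same class (assuming the interactive test() call above is skipped); the query word is just an example and must exist in the loaded vocabulary:
# Hypothetical one-off query instead of the interactive loop above.
vec = WordCluster()
for word, score in vec.get_similar_words('父亲'):
    print(word, round(score, 4))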