DL notes: word2vec (TF-IDF, co-occurrence matrix, CBOW, skip-gram)

2019-05-26 17:31:46

Copyright notice: this is an original post by the author and may not be reposted without permission. Questions: WeChat lp9628 (mention CSDN). https://cloud.tencent.com/developer/article/1435841

This post addresses the following questions:

  • Why word2vec?
  • Why does semantics-aware word2vec beat word2vec without semantics?
  • CBOW word2vec results demo
  • Implementing TF-IDF, a co-occurrence matrix, CBOW, and skip-gram (with TensorFlow)
  • Retrieving similar words with the trained word embeddings

1. Why word2vec?

  • This post explains it well (I may write up my own notes later): http://www.cnblogs.com/pinard/p/7160330.html

2. Why does semantics-aware word2vec beat word2vec without semantics?

  • It represents words with better vectors.
  • Going by the results below when comparing the several word2vec approaches, CBOW also does better on small datasets.

3. CBOW word2vec results (there is still plenty of room for optimization, and the training data is fairly small)

enter an word to search:父亲
('母亲', 0.5006737825399452)
('姐姐', 0.5005755597664082)
('政变', 0.5005573175296762)
('那一年', 0.5005278451741275)
('都回', 0.5005222345952383)
('孙子', 0.5005130261421589)
('竟被', 0.5005126096655758)
('8岁', 0.5005020163846302)
('姨妈', 0.5005019204893351)
('恩人', 0.5004979418283539)
enter an word to search:清华大学
('双保险', 0.5005635965752527)
('校名', 0.5005391632766295)
('反响', 0.5005145346754163)
('前几年', 0.500504904257443)
('学术会议', 0.5004996103585474)
('访问学者', 0.5004951920034038)
('其父', 0.5004868092502031)
('候机楼', 0.5004797690068876)
('前门', 0.5004781277454816)
('艺术设计', 0.5004773543246614)
enter an word to search:母亲
('父亲', 0.500673782524676)
('丈夫', 0.5005681116695313)
('孙儿', 0.5005407456399389)
('正牌', 0.5005407121014431)
('姐姐', 0.5005404523776542)
('考官', 0.5005350019987418)
('专供', 0.5005300052825112)
('to.', 0.5005233894343813)
('没门', 0.5005207594453653)
('生意人', 0.500518060889508)
enter an word to search:教育
('高等教育', 0.5007404247842682)
('性教育', 0.5006342135348033)
('严正声明', 0.5005783842259223)
('不仅是', 0.5005504417322837)
('教养', 0.5005198775820252)
('精彩演出', 0.5005141053295005)
('医疗', 0.5005133085493552)
('教职员工', 0.5005078418282736)
('教学', 0.5005068915769201)
('医疗卫生', 0.5004921731608394)
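Note that all scores sit just above 0.5: the retrieval code in section 5 maps cosine similarity into [0, 1] via sim = 0.5 + 0.5 * cos, and it normalizes by the Frobenius norm of the whole embedding matrix rather than by per-word norms, which compresses the raw cosine values toward zero.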

4. Implementing TF-IDF, the co-occurrence matrix, CBOW, and skip-gram (TensorFlow/Python)

Data: https://pan.baidu.com/s/1v-7aaAHWsx7NZ5d3IdWbiQ (password: k5tx)

load_data.py (load the data)

#!/usr/bin/env python3
# coding: utf-8


class DataLoader:
    def __init__(self):
        self.datafile = 'data/data.txt'
        self.dataset = self.load_data()

    '''Load the dataset: one document per line, comma-separated, with the tokenized text in the second field'''
    def load_data(self):
        dataset = []
        for line in open(self.datafile):
            line = line.strip().split(',')
            # keep space-separated tokens, dropping 'nbsp' artifacts and over-long tokens
            dataset.append([word for word in line[1].split(' ') if 'nbsp' not in word and len(word) < 11])
        return dataset
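From the loader above, each line of data.txt appears to be an id/label followed by pre-tokenized text, separated by a comma; this is an assumption inferred from the code, not verified against the file behind the Baidu link. A minimal sketch of parsing one such hypothetical line:

# Hypothetical line in the assumed "<id>,<space-separated tokens>" format
line = '1,父亲 和 母亲 一起 去 清华大学'
fields = line.strip().split(',')
tokens = [w for w in fields[1].split(' ') if 'nbsp' not in w and len(w) < 11]
print(tokens)  # ['父亲', '和', '母亲', '一起', '去', '清华大学']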

word2doc.py (TF-IDF word-document matrix)

#!/usr/bin/env python3
from load_data import *
import collections
import math
import numpy as np
from sklearn.decomposition import PCA

class WordVector:
    def __init__(self):
        self.dataset = DataLoader().dataset
        self.min_count = 5
        self.window_size = 5
        self.word_demension = 200
        self.embedding_path = 'model/word2doc_wordvec.bin'
    # Count word frequencies over the whole corpus and keep words with frequency >= min_count
    def build_word_dict(self):
        words = []
        for data in self.dataset:
            words.extend(data)
        reserved_words = [item for item in collections.Counter(words).most_common() if item[1] >= self.min_count]
        word_dict = {item[0]:item[1] for item in reserved_words}
        return word_dict

    # Compute each word's document frequency (DF) and inverse document frequency (IDF)
    def build_wordidf_dict(self):
        df_dict = {}
        sum_df = len(self.dataset)
        for data in self.dataset:
            for word in set(data):
                if word not in df_dict:
                    df_dict[word] = 1
                else:
                    df_dict[word] += 1
        idf_dict = {word: math.log(sum_df / word_df + 1) for word, word_df in df_dict.items()}
        return idf_dict

    # Compute per-document TF-IDF weights for every word
    def build_wordtfidf_dict(self):
        wordidf_dict = self.build_wordidf_dict()
        doctfidf_dict = {}
        for index, data in enumerate(self.dataset):
            doc_words = dict(collections.Counter(data))
            sum_tf = sum(doc_words.values())
            doc_wordtf = {word: word_count/sum_tf for word, word_count in doc_words.items()}
            doc_wordtfidf = {word: word_tf*wordidf_dict[word] for word, word_tf in doc_wordtf.items()}
            doctfidf_dict[index] = doc_wordtfidf
        return doctfidf_dict

    # Build the word-document TF-IDF matrix (one row per word, one column per document)
    def build_worddoc_matrix(self):
        worddoc_matrix = []
        doctfidf_dict = self.build_wordtfidf_dict()
        word_list = list(self.build_word_dict().keys())
        word_all = len(word_list)
        word_dict = {index : word for index, word in enumerate(word_list)}
        count = 0
        for word_id, word in word_dict.items():
            tmp = []
            for doc_index, doc_word_tfidf in doctfidf_dict.items():
                weight = doc_word_tfidf.get(word, 0)
                tmp.append(weight)
            count += 1
            print(count, '/', word_all)
            worddoc_matrix.append(tmp)
        worddoc_matrix = np.array(worddoc_matrix)
        return worddoc_matrix

    # Reduce dimensionality with PCA
    def low_dimension(self):
        worddoc_matrix = self.build_worddoc_matrix()
        pca = PCA(n_components=self.word_demension)
        low_embedding = pca.fit_transform(worddoc_matrix)
        return low_embedding

    # Train and save the embeddings (one line per word: word<TAB>comma-separated values)
    def train_embedding(self):
        print('training.....')
        word_list = list(self.build_word_dict().keys())
        word_dict = {index: word for index, word in enumerate(word_list)}
        word_embedding_dict = {index: embedding for index, embedding in enumerate(self.low_dimension())}
        print('saving models.....')
        with open(self.embedding_path, 'w+') as f:
            for word_index, word_embedding in word_embedding_dict.items():
                word_word = word_dict[word_index]
                word_embedding = [str(item) for item in word_embedding]
                f.write(word_word + '\t' + ','.join(word_embedding) + '\n')
        print('done.....')

vec = WordVector()
vec.train_embedding()
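To make the weighting concrete, here is a minimal standalone sketch of the same formulas the class above computes (idf = log(N / df + 1), tf = count / doc_length, weight = tf * idf), run on a made-up toy corpus:

import collections
import math

docs = [['猫', '喜欢', '鱼'], ['狗', '喜欢', '肉'], ['猫', '追', '猫']]  # toy corpus, made up

# document frequency and IDF, mirroring build_wordidf_dict
df = collections.Counter(w for doc in docs for w in set(doc))
idf = {w: math.log(len(docs) / c + 1) for w, c in df.items()}

# per-document TF-IDF weights, mirroring build_wordtfidf_dict
for i, doc in enumerate(docs):
    tf = {w: c / len(doc) for w, c in collections.Counter(doc).items()}
    print(i, {w: round(t * idf[w], 3) for w, t in tf.items()})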

word2word.py (word-word co-occurrence matrix)

#!/usr/bin/env python3
# coding: utf-8

from load_data import *
import collections
from sklearn.decomposition import PCA

class WordVector:
    def __init__(self):
        self.dataset = DataLoader().dataset[:1000]
        self.min_count = 5
        self.window_size = 5
        self.word_demension = 200
        self.embedding_path = 'model/word2word_wordvec.bin'
    # Count word frequencies over the whole corpus and keep words with frequency >= min_count
    def build_word_dict(self):
        words = []
        for data in self.dataset:
            words.extend(data)
        reserved_words = [item for item in collections.Counter(words).most_common() if item[1] >= self.min_count]
        word_dict = {item[0]:item[1] for item in reserved_words}
        return word_dict
    # Build word-to-context co-occurrence counts with a sliding window
    def build_word2word_dict(self):
        word2word_dict = {}
        for data_index, data in enumerate(self.dataset):
            for index in range(len(data)):
                if index < self.window_size:
                    left = data[:index]
                else:
                    left = data[index - self.window_size: index]
                if index + self.window_size > len(data):
                    right = data[index + 1:]
                else:
                    right = data[index + 1: index + self.window_size + 1]
                context = left + [data[index]] + right
                for word in context:
                    if word not in word2word_dict:
                        word2word_dict[word] = {}
                    # count every other word in the window as a co-occurrence
                    for co_word in context:
                        if co_word != word:
                            if co_word not in word2word_dict[word]:
                                word2word_dict[word][co_word] = 1
                            else:
                                word2word_dict[word][co_word] += 1
            print(data_index)
        return word2word_dict

    # Build the word-word co-occurrence matrix, normalizing each row by the word's total co-occurrence count
    def build_word2word_matrix(self):
        word2word_dict = self.build_word2word_dict()
        word_dict = self.build_word_dict()
        word_list = list(word_dict)
        word2word_matrix = []
        words_all = len(word_list)
        count = 0
        for word1 in word_list:
            count += 1
            print(count, '/', words_all)
            tmp = []
            sum_tf = sum(word2word_dict[word1].values())
            for word2 in word_list:
                weight = word2word_dict[word1].get(word2, 0) / sum_tf
                tmp.append(weight)
            word2word_matrix.append(tmp)

        return word2word_matrix

    # Reduce dimensionality with PCA
    def low_dimension(self):
        worddoc_matrix = self.build_word2word_matrix()
        pca = PCA(n_components=self.word_demension)
        low_embedding = pca.fit_transform(worddoc_matrix)
        return low_embedding

    # Train and save the embeddings (one line per word: word<TAB>comma-separated values)
    def train_embedding(self):
        print('training.....')
        word_list = list(self.build_word_dict().keys())
        word_dict = {index: word for index, word in enumerate(word_list)}
        word_embedding_dict = {index: embedding for index, embedding in enumerate(self.low_dimension())}
        print('saving models.....')
        with open(self.embedding_path, 'w+') as f:
            for word_index, word_embedding in word_embedding_dict.items():
                word_word = word_dict[word_index]
                word_embedding = [str(item) for item in word_embedding]
                f.write(word_word + '\t' + ','.join(word_embedding) + '\n')
        print('done.....')

vec = WordVector()
vec.train_embedding()
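As a quick sanity check of the windowing logic above, here is a minimal standalone sketch of the same counting on one made-up toy sentence (window_size = 2 for brevity):

import collections

sentence = ['我', '爱', '北京', '天安门', '广场']  # toy sentence, made up
window_size = 2
cooc = collections.defaultdict(collections.Counter)
for i, word in enumerate(sentence):
    left = sentence[max(0, i - window_size): i]
    right = sentence[i + 1: i + window_size + 1]
    for co_word in left + right:
        cooc[word][co_word] += 1
print(dict(cooc['北京']))  # {'我': 1, '爱': 1, '天安门': 1, '广场': 1}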

cbow.py

#!/usr/bin/env python3
# coding: utf-8


import math
import numpy as np
import tensorflow as tf
import collections
from load_data import *

class CBOW:
    def __init__(self):
        self.data_index = 0
        self.min_count = 5  # minimum word frequency to keep in the vocabulary
        self.batch_size = 200  # number of samples per training step
        self.embedding_size = 200  # dimensionality of the learned word vectors
        self.window_size = 1  # how many words on each side of the target to use as context
        self.num_steps = 1000000  # maximum number of training steps
        self.num_sampled = 100  # number of negative examples to sample
        self.trainfilepath = './data/data'
        self.modelpath = './model/cbow_wordvec.bin'
        self.dataset = DataLoader().dataset
        self.words = self.read_data(self.dataset)
    # Read the dataset and flatten it into a single list of words
    def read_data(self, dataset):
        words = []
        for data in dataset:
            words.extend(data)
        return words

    # Build the dataset: vocabulary, word-to-id mapping, and the corpus as a list of ids
    def build_dataset(self, words, min_count):
        # Keep words with frequency >= min_count; everything else maps to 'UNK' with id 0
        # (gensim's word2vec applies the same min_count idea)
        # Then encode the original word list as integer ids and count UNK occurrences
        count = [['UNK', -1]]
        reserved_words = [item for item in collections.Counter(words).most_common() if item[1] >= min_count]
        count.extend(reserved_words)
        dictionary = dict()
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = list()
        unk_count = 0
        for word in words:
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0  # dictionary['UNK']
                unk_count = unk_count + 1
            data.append(index)
        count[0][1] = unk_count
        print(len(count))
        reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        return data, count, dictionary, reverse_dictionary

    # Generate a training batch; the assert at the end checks the batch has the expected shape
    def generate_batch(self, batch_size, skip_window, data):
        # Slide a window over the corpus to build (context words -> centre word) samples
        # batch_size: number of samples per batch
        # skip_window: how far a context word may be from the centre word
        span = 2 * skip_window + 1  # [ skip_window target skip_window ]
        batch = np.ndarray(shape=(batch_size, span - 1), dtype=np.int32)
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
        buffer = collections.deque(maxlen=span)

        for _ in range(span):
            buffer.append(data[self.data_index])
            self.data_index = (self.data_index + 1) % len(data)

        for i in range(batch_size):
            target = skip_window  # the centre position holds the label word
            col_idx = 0
            for j in range(span):
                if j == span // 2:
                    continue  # skip the centre word itself; only context words go into the batch
                batch[i, col_idx] = buffer[j]
                col_idx += 1
            labels[i, 0] = buffer[target]

            buffer.append(data[self.data_index])
            self.data_index = (self.data_index + 1) % len(data)

        assert batch.shape[0] == batch_size and batch.shape[1] == span - 1

        return batch, labels

    def train_wordvec(self, vocabulary_size, batch_size, embedding_size, window_size, num_sampled, num_steps, data):
        # Define the CBOW word2vec network
        graph = tf.Graph()
        with graph.as_default(), tf.device('/cpu:0'):
            train_dataset = tf.placeholder(tf.int32, shape=[batch_size, 2 * window_size])
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
            softmax_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size],stddev=1.0 / math.sqrt(embedding_size)))
            softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
            # Unlike skip-gram, CBOW feeds in the mean of the context-word embeddings
            context_embeddings = []
            for i in range(2 * window_size):
                context_embeddings.append(tf.nn.embedding_lookup(embeddings, train_dataset[:, i]))
            avg_embed = tf.reduce_mean(tf.stack(axis=0, values=context_embeddings), 0, keep_dims=False)
            loss = tf.reduce_mean(
                tf.nn.sampled_softmax_loss(weights=softmax_weights, biases=softmax_biases, inputs=avg_embed,
                                           labels=train_labels, num_sampled=num_sampled,
                                           num_classes=vocabulary_size))
            optimizer = tf.train.AdagradOptimizer(1.0).minimize(loss)
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
            normalized_embeddings = embeddings / norm

        with tf.Session(graph=graph) as session:
            tf.global_variables_initializer().run()
            print('Initialized')
            average_loss = 0
            for step in range(num_steps):
                batch_data, batch_labels = self.generate_batch(batch_size, window_size, data)
                feed_dict = {train_dataset: batch_data, train_labels: batch_labels}
                _, l = session.run([optimizer, loss], feed_dict=feed_dict)
                average_loss += l
                if step % 2000 == 0:
                    if step > 0:
                        average_loss = average_loss / 2000
                    print('Average loss at step %d: %f' % (step, average_loss))
                    average_loss = 0
            final_embeddings = normalized_embeddings.eval()
        return final_embeddings

    # Save the embeddings to a text file (one line per word: word<TAB>comma-separated values)
    def save_embedding(self, final_embeddings, model_path, reverse_dictionary):
        f = open(model_path, 'w+')
        for index, item in enumerate(final_embeddings):
            f.write(reverse_dictionary[index] + '\t' + ','.join([str(vec) for vec in item]) + '\n')
        f.close()

    # Main training entry point
    def train(self):
        data, count, dictionary, reverse_dictionary = self.build_dataset(self.words, self.min_count)
        vocabulary_size = len(count)
        final_embeddings = self.train_wordvec(vocabulary_size, self.batch_size, self.embedding_size, self.window_size, self.num_sampled, self.num_steps, data)
        self.save_embedding(final_embeddings, self.modelpath, reverse_dictionary)

def test():
    vector = CBOW()
    vector.train()

test()
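For intuition about what generate_batch produces: with window_size = 1, each sample's input row holds the left and right neighbour ids and the label holds the centre-word id. A minimal standalone sketch of that pairing on a made-up toy id sequence:

data = [5, 12, 7, 3, 9]  # toy word-id sequence, made up
window_size = 1
for i in range(window_size, len(data) - window_size):
    context = data[i - window_size:i] + data[i + 1:i + window_size + 1]  # left + right neighbours
    target = data[i]                                                     # centre word
    print(context, '->', target)
# [5, 7] -> 12
# [12, 3] -> 7
# [7, 9] -> 3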

skipgram.py

#!/usr/bin/env python3
# coding: utf-8


import collections
import math
import random
import numpy as np
import tensorflow as tf
from load_data import *

class SkipGram:
    def __init__(self):
        self.data_index = 0
        self.trainpath = './data/data.txt'
        self.modelpath = './model/skipgram_wordvec.bin'
        self.min_count = 5  # minimum word frequency to keep in the vocabulary
        self.batch_size = 200  # number of samples per training step
        self.embedding_size = 200  # dimensionality of the learned word vectors
        self.window_size = 5  # window size; each centre word yields window_size * 2 (centre, context) pairs
        self.num_sampled = 100  # number of negative examples to sample
        self.num_steps = 1000000  # maximum number of training steps
        self.dataset = DataLoader().dataset
        self.words = self.read_data(self.dataset)
    # Read the dataset and flatten it into a single list of words
    def read_data(self, dataset):
        words = []
        for data in dataset:
            words.extend(data)
        return words
    # Build the dataset: vocabulary, word-to-id mapping, and the corpus as a list of ids
    def build_dataset(self, words, min_count):
        # Keep words with frequency >= min_count; everything else maps to 'UNK' with id 0
        # (gensim's word2vec applies the same min_count idea)
        # Then encode the original word list as integer ids and count UNK occurrences
        count = [['UNK', -1]]
        count.extend([item for item in collections.Counter(words).most_common() if item[1] >= min_count])
        dictionary = dict()
        for word, _ in count:
            dictionary[word] = len(dictionary)
        data = list()
        unk_count = 0
        for word in words:
            if word in dictionary:
                index = dictionary[word]
            else:
                index = 0
                unk_count += 1
            data.append(index)
        count[0][1] = unk_count
        # Invert the dictionary so ids map back to words
        reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
        return data, count, dictionary, reverse_dictionary

    # Generate a training batch of (centre word, context word) pairs
    def generate_batch(self, batch_size, window_size, data):
        # Each centre word is paired with window_size * 2 words drawn from its window,
        # so batch_size should be a multiple of window_size * 2.
        # batch holds the centre-word ids; labels holds the matching context-word ids.
        '''
        e.g. with window_size = 1 and batch_size = 8:
        Sample data [0, 5241, 3082, 12, 6, 195, 2, 3137, 46, 59] ['UNK', 'anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used']
        print(batch)   # [5242 5242 3084 3084   12   12    6    6], 8 entries
        print(labels)  # [[   0] [3082] [  12] [5242] [   6] [3082] [  12] [ 195]], 8 entries
        '''
        batch = np.ndarray(shape=(batch_size), dtype=np.int32)  # centre-word ids
        labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)  # one context-word id per row
        span = 2 * window_size + 1  # window structure: [ window_size target window_size ]
        buffer = collections.deque(maxlen=span)  # sliding window over the id sequence
        for _ in range(span):
            buffer.append(data[self.data_index])
            self.data_index = (self.data_index + 1) % len(data)
        # batch_size must be a multiple of window_size * 2 so every centre word's samples fit in one batch
        for i in range(batch_size // (window_size * 2)):
            target = window_size
            targets_to_avoid = [window_size]
            for j in range(window_size * 2):
                while target in targets_to_avoid:
                    target = random.randint(0, span - 1)
                targets_to_avoid.append(target)
                batch[i * window_size * 2 + j] = buffer[window_size]
                labels[i * window_size * 2 + j, 0] = buffer[target]
            buffer.append(data[self.data_index])
            self.data_index = (self.data_index + 1) % len(data)

        return batch, labels

    def train_wordvec(self, vocabulary_size, batch_size, embedding_size, window_size, num_sampled, num_steps, data):
        # Define the skip-gram word2vec network
        graph = tf.Graph()
        with graph.as_default():
            # Input centre-word ids for one batch
            train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
            # Target context-word ids, shape [batch_size, 1]
            train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
            # Train on the CPU
            with tf.device('/cpu:0'):
                # Embedding matrix of shape vocabulary_size x embedding_size,
                # initialized uniformly at random in [-1, 1)
                embeddings = tf.Variable(tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
                # Look up the embedding of each input word (tf.nn.embedding_lookup selects rows by index)
                embed = tf.nn.embedding_lookup(embeddings, train_inputs)
                # Output layer (Wx + b): weight matrix of shape vocabulary_size x embedding_size,
                # initialized from a truncated normal distribution
                nce_weights = tf.Variable(tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
                # Output layer (Wx + b): one bias per vocabulary entry
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
            # NCE loss (negative sampling), averaged over the batch
            loss = tf.reduce_mean(tf.nn.nce_loss(weights=nce_weights,           # output weights
                                                 biases=nce_biases,             # output biases
                                                 labels=train_labels,           # target context words
                                                 inputs=embed,                  # centre-word embeddings
                                                 num_sampled=num_sampled,       # number of negative samples
                                                 num_classes=vocabulary_size))  # vocabulary size
            # Optimizer: plain gradient descent
            optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)
            # L2-normalize each word vector so similarities can be computed with a dot product
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
            normalized_embeddings = embeddings / norm
            # Variable initializer
            init = tf.global_variables_initializer()

        # Train with the graph built above
        with tf.Session(graph=graph) as session:
            # Run variable initialization
            init.run()
            average_loss = 0
            # Iterate over training steps
            for step in range(num_steps):
                batch_inputs, batch_labels = self.generate_batch(batch_size, window_size, data)
                # feed_dict supplies a value for every placeholder used in this step
                feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
                # Run one optimization step and get this step's loss
                _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
                # Accumulate the loss and report the running average every 2000 steps
                average_loss += loss_val
                if step % 2000 == 0:
                    if step > 0:
                        average_loss /= 2000
                    print("Average loss at step ", step, ":", average_loss)
                    average_loss = 0
            final_embeddings = normalized_embeddings.eval()

        return final_embeddings
    # Save the embeddings to a text file (one line per word: word<TAB>comma-separated values)
    def save_embedding(self, final_embeddings, reverse_dictionary):
        f = open(self.modelpath, 'w+')
        for index, item in enumerate(final_embeddings):
            f.write(reverse_dictionary[index] + '\t' + ','.join([str(vec) for vec in item]) + '\n')
        f.close()
    # Main training entry point
    def train(self):
        data, count, dictionary, reverse_dictionary = self.build_dataset(self.words, self.min_count)
        vocabulary_size = len(count)
        final_embeddings = self.train_wordvec(vocabulary_size, self.batch_size, self.embedding_size, self.window_size, self.num_sampled, self.num_steps, data)
        self.save_embedding(final_embeddings, reverse_dictionary)

def test():

    vector = SkipGram()
    vector.train()

test()
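In contrast to CBOW, generate_batch here emits (centre word, context word) pairs: each centre word appears window_size * 2 times in batch, each time paired in labels with a different word from its window. A minimal standalone sketch of that pairing on a made-up toy id sequence:

import random

data = [5, 12, 7, 3, 9]       # toy word-id sequence, made up
window_size = 2
span = 2 * window_size + 1    # [ window_size target window_size ]
window = data[:span]
centre = window[window_size]  # 7, the centre of the first window
positions = [p for p in range(span) if p != window_size]  # never pair the centre with itself
pairs = [(centre, window[p]) for p in random.sample(positions, window_size * 2)]
print(pairs)  # e.g. [(7, 3), (7, 12), (7, 9), (7, 5)] in some order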

5. Retrieving similar words with the trained word embeddings (the code below does a nearest-neighbour lookup by cosine similarity over the whole embedding matrix)

word_cluster.py

#!/usr/bin/env python3
# coding: utf-8

import numpy as np

class WordCluster:
    def __init__(self):
        # Pick which trained embedding file to load; only the last assignment takes effect,
        # so comment out all but the model you want to query
        self.embedding_path = 'model/word2word_wordvec.bin'
        self.embedding_path = 'model/word2doc_wordvec.bin'
        self.embedding_path = 'model/skipgram_wordvec.bin'
        self.embedding_path = 'model/cbow_wordvec.bin'
        self.word_embedding_dict, self.word_dict, self.word_embeddings = self.load_model(self.embedding_path)
        self.similar_num = 10

    # Load the word-vector file (one line per word: word<TAB>comma-separated values)
    def load_model(self, embedding_path):
        print('loading models....')
        word_embedding_dict = {}
        word_embeddings = []
        word_dict = {}
        index = 0
        for line in open(embedding_path):
            line = line.strip().split('\t')
            word = line[0]
            word_embedding = np.array([float(item) for item in line[1].split(',') if item])
            word_embedding_dict[word] = word_embedding
            word_embeddings.append(word_embedding)
            word_dict[index] = word
            index += 1
        return word_embedding_dict, word_dict, np.array(word_embeddings)
    # Score the query word against every word in the vocabulary
    def similarity_cosine(self, word):
        A = self.word_embedding_dict[word]
        B = (self.word_embeddings).T
        dot_num = np.dot(A, B)
        # Note: np.linalg.norm(B) is the Frobenius norm of the whole matrix, not per-word norms,
        # so the raw cosine values are strongly scaled down (hence the ~0.5 scores in section 3);
        # use np.linalg.norm(B, axis=0) for true per-word cosine similarity
        denom = np.linalg.norm(A) * np.linalg.norm(B)
        cos = dot_num / denom
        sims = 0.5 + 0.5 * cos  # map into [0, 1]
        sim_dict = {self.word_dict[index]: sim for index, sim in enumerate(sims.tolist()) if word != self.word_dict[index]}
        sim_words = sorted(sim_dict.items(), key=lambda asd: asd[1], reverse=True)[:self.similar_num]
        return sim_words
    # Return the most similar words for a query word, or [] if it is out of vocabulary
    def get_similar_words(self, word):
        if word in self.word_embedding_dict:
            return self.similarity_cosine(word)
        else:
            return []

def test():
    vec = WordCluster()
    while 1:
        word = input('enter an word to search:').strip()
        simi_words = vec.get_similar_words(word)
        for word in simi_words:
            print(word)

test()
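If true per-word cosine similarity is wanted instead of the whole-matrix scaling used above, the denominator can use per-row norms; a minimal sketch (not what produced the output in section 3):

import numpy as np

def cosine_sims(query_vec, embeddings):
    # embeddings has shape (vocab_size, dim); returns the cosine of query_vec with every row
    dots = embeddings.dot(query_vec)
    norms = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_vec)
    return dots / norms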
