NLP学习3-基于计数方法的改进

基于计数方法的改进

本文记录的是鱼书第3章：如何对原有的计数方法进行改进。

基于统计方法函数

下面介绍的是传统的基于统计的方法。

预处理

代码语言：javascript复制

# 常用函数

def preprocess(text):
    text = text.lower()  # 转成小写
    text = text.replace('.', ' .')  # 增加空格
    words = text.split(' ')  #  切割

    # 单词和单词ID的对应关系
    word_to_id = {}
    id_to_word = {}
    for word in words:
        if word not in word_to_id.keys():   # 原文 if word not in word_to_id:
            new_id = len(word_to_id)
            word_to_id[word] = new_id
            id_to_word[new_id] = word
    # 单词列表-----> 单词ID列表
    corpus = np.array([word_to_id[w] for w in words])

    return corpus, word_to_id, id_to_word

共现矩阵

代码语言：javascript复制

def create_to_matrix(corpus, vocab_size,window_size=1):
    """
    corpus：单词ID列表
    vocab_size：词汇个数
    window_size：窗口大小
    """

    corpus_size =  len(corpus)
    # 全0矩阵初始化
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.int32)

    for idx, word_id in enumerate(corpus):  # 遍历语料库中的每个单词
        for i in range(1, window_size   1):  # 遍历窗口中的数据；是否超出语料库的左右端
            left_idx = idx - i  # 左右索引值
            right_idx = idx   i

            if left_idx >= 0:  # 判断左索引大于0的时候
                left_word_id = corpus[left_idx]  # 取出索引对应的word_id
                co_matrix[word_id, left_word_id]  = 1  # 对应的位置赋值1

            if right_idx < corpus_size:  # 右索引小于整体的语料库长度
                right_word_id = corpus[right_idx]
                co_matrix[word_id, right_word_id]  = 1

    return co_matrix

相似度计算

代码语言：javascript复制

def cos_similarity(x, y, eps=1e-8):
    """
    优化版本
    """
    nx = x / (np.sqrt(np.sum(x ** 2))   eps) # x的正规化
    ny = y / (np.sqrt(np.sum(y ** 2))   eps) # y的正规化
    return np.dot(nx,ny)

相似单词的降序

代码语言：javascript复制

def most_similar(query, word_to_id, id_to_word,word_matrix, top=5):
    """
    query：查询单词
    word_to_id：单词到单词ID
    id_to_word：单词ID到单词
    word_matrix：汇总了单词向量的矩阵，假定保存了与各行对应的单词向量（共现矩阵）
    top：显示到前几位
    """

    if query not in word_to_id:   # 不存在查询词的处理
        print(f"{query} is not found")
        return

    print(f'{query}')

    query_id = word_to_id[query]  # 先找到查询词的id
    query_vec = word_matrix[query_id]  # 从共现矩阵中找出对应id的向量

    # 计算相似度
    vocab_size = len(id_to_word)  # 词汇总长度
    similarity = np.zeros(vocab_size)  # 相似度初始值；全0

    for i in range(vocab_size):  # 循环计算余弦相似度；
        similarity[i] = cos_similarity(word_matrix[i], query_vec)  # 赋值给对应的similarity的位置

    # 基于余弦相似度降序输出值
    count = 0
    for i in (-1 * similarity).argsort():  # argsort是返回索引值
        if id_to_word[i] == query:
            continue

        print(f'{id_to_word[i]}: {similarity[i]}')

        count  = 1
        if count >= top:
            return

调用函数

代码语言：javascript复制

import numpy as np
import sys
sys.path.append('..')

text = "you say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)

C = create_to_matrix(corpus, vocab_size)  # 共现矩阵

most_similar("you", word_to_id, id_to_word, C, top=5)

代码语言：javascript复制

you
goodbye: 0.7071067691154799
i: 0.7071067691154799
hello: 0.7071067691154799
say: 0.0
and: 0.0

基于【计数】存在问题

比如，我们来考虑某个语料库中the和car共现的情况：

在这种情况下，我们会看到很多...the car...这样的短语。

因此，它们的共现次数将会很大。另外，car和drive也明显有很强的相关性。

但是，如果只看单词的出现次数，那么与drive相比，the和car的相关性更强。

这意味着，仅仅因为the是个常用词，它就被认为与car有很强的相关性

解决方法

点互信息PMI

使用点互信息Pointwise Mutual Information,PMI；PMI值越高表示相关性越强

定义为：

PMI(x,y)=log_2 frac{P(x,y)}{P{(x)}{P(y)}}

P(x)：表示x发生的概率
P(x,y)：表示x和y同时发生的概率

使用共现矩阵来重写上面的式子：

PMI(x,y)=log_2 frac{P(x,y)}{P{(x)}{P(y)}} = log_2frac{frac{C{(x,y)}}{N}}{frac{C{(x)}}{N} frac{C{(y)}}{N}} = log_2 frac{C{(x,y)} * N}{C{(x)}C{(y)}}

其中：N表示语料库的单词数量N

优化方案PPMI

上面基于点的互信息的方法有个缺点：当两个单词的共现次数为0时，会出现log_2{0}= infty

使用正的点互信息Positive Pointwise Mutual Information,PPMI

PPMI(x,y) = max(0, PMI(x,y))

代码实现PPMI

代码语言：javascript复制

def ppmi(C, verbose=False, eps=1e-8):

    """
    基于共现矩阵C求解PPMI
    """

    M = np.zeros_like(C, dtype=np.float32)  # 类比C共现矩阵的全0矩阵；后面进行更新
    N = np.sum(C)  # 全部数据求和：共现单词总个数
    S = np.sum(C,axis=0)  #  行方向求和
    #print("C: n", C)  #  共现矩阵
    #print("初始化M: n", M)  #  和共现矩阵行列数相同的全0矩阵（方阵）
    #print("N: n", N)  #  共现矩阵中所有数之和
    #print("S: n", S)  #  共现矩阵在每行上的求和

    total = C.shape[0] * C.shape[1]
    cnt = 0

    for i in range(C.shape[0]):  # 行
        for j in range(C.shape[1]):  # 列
            pmi = np.log2(C[i,j] * N / (S[j] * S[i])   eps)  # 计算pmi和ppmi
            #print("pmi: ",pmi)
            #print(max(0, pmi))
            M[i,j] = max(0, pmi)

            if verbose:
                cnt  = 1
                if cnt % (total // 100   1) == 0:
                    print('%.1f%% done' % (100 * cnt / total))

   #print("PPMI矩阵: n", M)
    return M

代码语言：javascript复制

import numpy as np
# numpy输出有效位数
np.set_printoptions(precision=3)

import sys
sys.path.append('..')

text = "you say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)

C = create_to_matrix(corpus, vocab_size)  # 共现矩阵
M = ppmi(C)

代码语言：javascript复制

array([[0.   , 1.807, 0.   , 0.   , 0.   , 0.   , 0.   ],
       [1.807, 0.   , 0.807, 0.   , 0.807, 0.807, 0.   ],
       [0.   , 0.807, 0.   , 1.807, 0.   , 0.   , 0.   ],
       [0.   , 0.   , 1.807, 0.   , 1.807, 0.   , 0.   ],
       [0.   , 0.807, 0.   , 1.807, 0.   , 0.   , 0.   ],
       [0.   , 0.807, 0.   , 0.   , 0.   , 0.   , 2.807],
       [0.   , 0.   , 0.   , 0.   , 0.   , 2.807, 0.   ]], dtype=float32)

降维-dimensionality reduction

PPMI存在问题

PPMI矩阵存在的问题：

维度爆炸：随着语料库词汇量的增加，各个单词向量的维度也会随着增加
矩阵稀疏：在PPMI矩阵中存在很多的元素都是0，这表明向量中的很多元素是不重要的
向量中的大多数元素为0的矩阵（向量）称为稀疏矩阵（稀疏向量）
从稀疏向量中找出重要的轴，用更少的维度对其重新表示；稀疏矩阵转化为密集矩阵

奇异值分解SVD-Singular Value Decomposition

SVD基本原理：

SVD可以将任意矩阵分解为3个矩阵的乘积：

X = USV^T

UV是列向量彼此正交的正交矩阵；U矩阵构成了一些空间的基轴（基向量），看做是"单词空间"。
S是除了对角线元素外其他元素均为0的对角矩阵；奇异值在对角线上降序排列
S中奇异值越小，对应的基轴的重要性越低；因此通过去除U中多余的列向量来近似原始矩阵

基于SVD的降维

代码语言：javascript复制

import numpy as np
# numpy输出有效位数
np.set_printoptions(precision=3)

import sys
sys.path.append('..')

text = "you say goodbye and I say hello."
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)

C = create_to_matrix(corpus, vocab_size)  # 共现矩阵
M = ppmi(C)

# 降维
U,S,V = np.linalg.svd(M)

对比3大矩阵

对比原共现矩阵、PPMI矩阵、经过SVD降维后的密集U

代码语言：javascript复制

C[0]  # 共现矩阵

代码语言：javascript复制

array([0, 1, 0, 0, 0, 0, 0], dtype=int32)

代码语言：javascript复制

M[0]  # PPMI矩阵

代码语言：javascript复制

array([0.   , 1.807, 0.   , 0.   , 0.   , 0.   , 0.   ], dtype=float32)

代码语言：javascript复制

U[0] # PPMI矩阵的稀疏向量转成了密集向量U

代码语言：javascript复制

array([ 3.409e-01, -1.110e-16, -1.205e-01, -4.163e-16, -9.323e-01,
       -1.110e-16, -2.426e-17], dtype=float32)

比如现在我们将这个密集向量降到二维，只要取出前面两个元素：

代码语言：javascript复制

print(U)

代码语言：javascript复制

[[ 3.409e-01 -1.110e-16 -1.205e-01 -4.163e-16 -9.323e-01 -1.110e-16
  -2.426e-17]
 [ 0.000e 00 -5.976e-01  0.000e 00  1.802e-01  0.000e 00 -7.812e-01
   0.000e 00]
 [ 4.363e-01 -5.551e-17 -5.088e-01 -2.220e-16  2.253e-01 -1.388e-17
  -7.071e-01]
 [ 1.665e-16 -4.978e-01  2.776e-17  6.804e-01 -1.110e-16  5.378e-01
   7.467e-17]
 [ 4.363e-01 -3.124e-17 -5.088e-01 -1.600e-16  2.253e-01 -1.302e-17
   7.071e-01]
 [ 7.092e-01 -3.124e-17  6.839e-01 -1.600e-16  1.710e-01 -1.302e-17
   2.314e-17]
 [-1.665e-16 -6.285e-01 -4.163e-17 -7.103e-01  2.220e-16  3.169e-01
  -9.614e-17]]

代码语言：javascript复制

U[0, :2]

代码语言：javascript复制

array([ 3.409e-01, -1.110e-16], dtype=float32)

降维可视化

代码语言：javascript复制

import matplotlib.pyplot as plt
%matplotlib inline

# 和书中降维后的二维系数是反的；
# 在绘图的时候数据也是先取第1个，再取第0个

for word, word_id in word_to_id.items():
    plt.annotate(word, (U[word_id, 1], U[word_id, 0]))

plt.scatter(U[:,1], U[:, 0], alpha=0.5)
plt.show()

降维进阶-Truncated SVD

如果矩阵大小是N,SVD的计算复杂度将达到N^3；计算量大大增加，现实中无法达到。通常使用Truncated SVD等方法。Truncated SVD通过截去奇异值较小的部分，从而实现高速化。

PTB数据集(略)

PTB语料库是以文件文本的形式提供，一行保存一个句子。实战案例略。

nlp size word 数据索引

0 人点赞