Common drawbacks of forward maximum matching, backward maximum matching, and bidirectional maximum matching
- Heavy dependence on the dictionary: without one, segmentation is impossible, and if the dictionary lacks a needed word the result will be wrong (see the sketch after this list)
- The segmentation process ignores what the sentence as a whole means; it only treats the sentence as a sequence of fragments
- A single typo in the text can trigger a chain of segmentation errors
- Entity words that cannot be exhaustively enumerated, such as person names, cannot be handled effectively
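To make the dictionary dependence concrete, below is a minimal sketch of forward maximum matching; the toy vocabulary, the window size max_len, and the example sentence are illustrative assumptions, not from the original notes:

def forward_max_match(sentence, vocab, max_len=4):
    # Greedily take the longest dictionary word starting at position i;
    # anything not in the vocabulary degrades into single characters.
    words = []
    i = 0
    while i < len(sentence):
        for length in range(min(max_len, len(sentence) - i), 0, -1):
            candidate = sentence[i:i + length]
            if candidate in vocab or length == 1:
                words.append(candidate)
                i += length
                break
    return words

print(forward_max_match("北京大学生前来报到", {"北京", "北京大学", "大学生", "前来", "报到"}))
# -> ['北京大学', '生', '前来', '报到']: the greedy longest-first choice of 北京大学
#    strands 生, even though 北京 / 大学生 reads more naturally

This also illustrates the second drawback: the matcher never considers the sentence as a whole, so the greedy local choice wins even when it produces a worse global reading.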
New word discovery
A word is essentially a fixed collocation
- The inside of a word should be stable
Internal cohesion (mutual information)
Formula: for a candidate word $w$ of length $|w|$,
$\mathrm{PMI}(w) = \frac{1}{|w|}\log_{10}\frac{p(w)}{\prod_{c\in w}p(c)}$
- p(w): probability of the combination (= occurrences of the combination / total occurrences of combinations of the same length)
- p(c): probability of a single character c, computed the same way; for multi-character combinations the single-character probabilities are multiplied (worked example below)
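A worked toy example (all counts are hypothetical): suppose the bigram 天气 occurs 5 times among 1,000 bigram occurrences, and 天 and 气 occur 20 and 30 times among 2,000 single-character occurrences; then $\mathrm{PMI} = \frac{1}{2}\log_{10}\frac{5/1000}{(20/2000)(30/2000)} \approx \frac{1}{2}\log_{10}33.3 \approx 0.76$, a strongly positive score indicating the pair co-occurs far more often than chance.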
- The outside of a word should be variable
Left entropy and right entropy
Measure how unpredictable the characters to the left and right of a combination are: if both the left and right entropy are large, the combination appears in many different contexts on both sides, which is strong evidence that it is a standalone word
Formula: $H = -\sum_{c} p(c)\log_{10} p(c)$, where $p(c)$ is the count of neighbor character $c$ divided by the total neighbor count, computed separately for the left and right neighbors (worked example below)
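A worked toy example (the counts are hypothetical): if a combination's left neighbors were observed as {的: 2, 了: 1, 在: 1}, then the total is 4 and $H = -\frac{1}{2}\log_{10}\frac{1}{2} - 2\cdot\frac{1}{4}\log_{10}\frac{1}{4} \approx 0.45$; a combination that is always preceded by the same character has left entropy 0 and is unlikely to be a free-standing word.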
Code:
- Compute mutual information (internal cohesion)
- Compute left and right entropy
- Score and rank candidate words using the mutual information and the left/right entropy
import math
from collections import defaultdict

class NewWordDetect:
    def __init__(self, corpus_path):
        self.max_word_length = 5
        self.word_count = defaultdict(int)       # occurrence count of every 1- to 4-character combination
        self.left_neighbor = defaultdict(dict)   # counts of each combination's left-neighbor characters
        self.right_neighbor = defaultdict(dict)  # counts of each combination's right-neighbor characters
        self.load_corpus(corpus_path)   # load the corpus
        self.calc_pmi()                 # compute mutual information / internal cohesion
        self.calc_entropy()             # compute left and right entropy
        self.calc_word_values()         # combine the statistics into word scores

    # Load the corpus and collect the statistics.
    def load_corpus(self, path):
        with open(path, encoding="utf8") as f:
            for line in f:
                sentence = line.strip()
                for word_length in range(1, self.max_word_length):
                    self.ngram_count(sentence, word_length)  # per line, count n-grams of each length
        return

    # Slide a window of the given length over the sentence and record left/right neighbors.
    def ngram_count(self, sentence, word_length):
        for i in range(len(sentence) - word_length + 1):
            word = sentence[i:i + word_length]
            self.word_count[word] += 1
            if i - 1 >= 0:  # the combination has a left neighbor
                char = sentence[i - 1]
                self.left_neighbor[word][char] = self.left_neighbor[word].get(char, 0) + 1
            if i + word_length < len(sentence):  # the combination has a right neighbor
                char = sentence[i + word_length]
                self.right_neighbor[word][char] = self.right_neighbor[word].get(char, 0) + 1
        return

    # Compute the left and right entropy of every combination.
    def calc_entropy(self):
        self.word_left_entropy = {}   # left entropy of each combination
        self.word_right_entropy = {}  # right entropy of each combination
        for word, count_dict in self.left_neighbor.items():
            self.word_left_entropy[word] = self.calc_entropy_by_word_count_dict(count_dict)
        for word, count_dict in self.right_neighbor.items():
            self.word_right_entropy[word] = self.calc_entropy_by_word_count_dict(count_dict)

    # Entropy of a neighbor-count distribution.
    def calc_entropy_by_word_count_dict(self, word_count_dict):
        total = sum(word_count_dict.values())  # total number of left/right neighbor occurrences
        entropy = sum([-(c / total) * math.log((c / total), 10) for c in word_count_dict.values()])
        return entropy

    # Total number of n-gram occurrences for each word length.
    def calc_total_count_by_length(self):
        self.word_count_by_length = defaultdict(int)
        for word, count in self.word_count.items():
            self.word_count_by_length[len(word)] += count
        return

    # Compute mutual information (internal cohesion).
    def calc_pmi(self):
        self.calc_total_count_by_length()  # total n-gram occurrences per word length
        self.pmi = {}
        for word, count in self.word_count.items():
            p_word = count / self.word_count_by_length[len(word)]
            p_chars = 1
            for char in word:
                p_chars *= self.word_count[char] / self.word_count_by_length[1]  # probability of each single character; multiplied for multi-character words
            self.pmi[word] = math.log(p_word / p_chars, 10) / len(word)
        return

    # Score each candidate: PMI * left entropy * right entropy.
    def calc_word_values(self):
        self.word_values = {}
        for word in self.pmi:
            if len(word) < 2 or "," in word:
                continue
            pmi = self.pmi.get(word, 1e-3)
            le = self.word_left_entropy.get(word, 1e-3)
            re = self.word_right_entropy.get(word, 1e-3)
            self.word_values[word] = pmi * le * re

if __name__ == '__main__':
    nwd = NewWordDetect("sample_corpus.txt")
    value_sort = sorted([(word, count) for word, count in nwd.word_values.items()], key=lambda x: x[1], reverse=True)
    print([x for x, c in value_sort if len(x) == 2][:10])  # top 10 two-character candidates by score
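Multiplying PMI by the left and right entropy means a candidate must score well on all three signals at once: a combination with high internal cohesion but an almost fixed neighbor on one side is suppressed. In practice a score threshold may work better than a fixed top-k; a hedged sketch (the cutoff 0.5 is an arbitrary assumption, to be tuned per corpus):

new_words = [w for w, v in nwd.word_values.items() if v > 0.5]  # 0.5 is an assumed cutoff, not from the original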