在自然语言处理中,我们经常需要用到n元语法模型。
其中,有关中文分词的一些概念是我们需要掌握的,譬如:
unigram 一元分词,把句子分成一个一个的汉字 bigram 二元分词,把句子从头到尾每两个字组成一个词语 trigram 三元分词,把句子从头到尾每三个字组成一个词语.
我们来简单的做个练习:
输入的是断好词的文本,每个句子一行。 统计词unigram和bigram的频次,并将它们分别输出到`data.uni`和`data.bi`两个文件中。
下面代码为网络资源
代码语言:python

#!/usr/bin/env python
class NGram(object):
    """Count unigram/bigram frequencies over pre-segmented sentences.

    Depending on the model order ``n`` (1 or 2), counts accumulate in
    ``self.unigram`` or ``self.bigram``; :meth:`scan` additionally dumps
    them to the hard-coded files ``data.uni`` / ``data.bi``.
    """

    def __init__(self, n):
        # n is the order of the n-gram language model (1 or 2).
        self.n = n
        self.unigram = {}
        self.bigram = {}

    def scan(self, sentence):
        """Scan sentences, update n-gram counts, and dump them to disk.

        @param sentence: list[str] -- one pre-segmented (whitespace
            tokenized) sentence per element
        @return: None
        """
        for line in sentence:
            self.ngram(line.split())
        # NOTE(review): output filenames are hard-coded, as in the
        # original listing.  ``with`` closes the file even on error;
        # an unwritable path now raises instead of crashing later on
        # an unbound file handle.
        if self.n == 1:
            with open("data.uni", "w") as fip:
                for word, count in self.unigram.items():
                    fip.write("%s %d\n" % (word, count))
        if self.n == 2:
            with open("data.bi", "w") as fip:
                for pair, count in self.bigram.items():
                    fip.write("%s %d\n" % (pair, count))

    def ngram(self, words):
        """Update n-gram counts for one tokenized sentence.

        @param words: list[str]
        @return: None
        """
        if self.n == 1:
            for word in words:
                self.unigram[word] = self.unigram.get(word, 0) + 1
        if self.n == 2:
            # Non-overlapping consecutive pairs, space-joined; a trailing
            # odd word is dropped (this matches the original listing,
            # whose pair counter resets after every recorded bigram).
            for i in range(0, len(words) - 1, 2):
                pair = words[i] + " " + words[i + 1]
                self.bigram[pair] = self.bigram.get(pair, 0) + 1
if __name__ == "__main__":
    import sys

    # Usage: script.py <segmented-text-file>  (one sentence per line)
    try:
        fip = open(sys.argv[1], "r")
    except (IndexError, IOError):
        # Original printed the error and then used the unbound handle;
        # exit instead so the failure is immediate and clear.
        print("failed to open input file", file=sys.stderr)
        sys.exit(1)
    with fip:
        # Keep only non-empty lines; each is one pre-segmented sentence.
        sentence = [line.strip() for line in fip if line.strip()]
    uni = NGram(1)
    bi = NGram(2)
    uni.scan(sentence)
    bi.scan(sentence)