下载了一个微信聊天的语料库,大概11万条记录,采用问答方式,中间以“|”分割,用gensim做了个简单的检索聊天机器人,目前基本可用。还有个地方需要进一步优化,1万语料生成的模型库通过自动应答效率还可以,11万语料自动应答效率非常低,还需要进一步改进。
文本示例
敢不敢说句话 | 为什么不敢,胆小鬼
那重点是什么 | 好话不分轻重!
是程序吧?你不是人 | 就你是人?
代码示例
from gensim import corpora
from gensim import similarities
from gensim import models
import jieba
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
class myCorpus(object):
    """Build the retrieval artifacts for a Q/A chat corpus.

    The corpus file contains one record per line in the form
    ``question | answer``. This class segments the questions with jieba
    and persists, step by step: a gensim Dictionary (``dictionary.dict``),
    a bag-of-words corpus (``dictionary.mm``), a TF-IDF model
    (``dictionary.tfidf``) and a similarity index (``dictionary.index``).
    """

    def __init__(self, corpusfile, stopwordsfile='', userdictfile=''):
        """corpusfile: path to the '|'-separated Q/A file.
        stopwordsfile / userdictfile: reserved for future use (not read yet).
        """
        self.corpusfile = corpusfile
        self.stopwordsfile = stopwordsfile
        self.userdictfile = userdictfile
        # question -> answer
        self.questionanswerdict = {}
        # space-joined segmented question -> answer
        self.questionseganswerdict = {}
        # question -> list of jieba tokens
        self.questionsegdict = {}
        # token lists, index-aligned with questionlist
        self.questionseglist = []
        self.questionlist = []
        # where the segmented questions are written, one per line
        self.corpussegmentfile = 'chatcorpus.out'

    def createcorpus(self):
        """Read the raw corpus, segment each question, fill the lookup
        structures and write the segmented questions to
        ``self.corpussegmentfile``.

        Returns (questionanswerdict, questionlist).
        """
        print('-------------------------createcorpus--------------------------')
        with open(self.corpusfile, 'r', encoding='utf-8', errors='ignore') as f:
            documents = f.readlines()
        with open(self.corpussegmentfile, 'w', encoding='utf-8', errors='ignore') as f:
            for document in documents:
                documentlist = document.split('|')
                # Skip malformed lines that do not contain a '|' separator
                # (the original code raised IndexError on them).
                if len(documentlist) < 2:
                    continue
                question, answer = documentlist[0].strip(), documentlist[1].strip()
                questionseg = jieba.lcut(question)
                # Reuse the segmentation above instead of calling
                # jieba.lcut a second time on the same text.
                questionsegstr = ' '.join(questionseg).strip()
                # 暂不做停用词处理 (no stop-word filtering yet)
                # 暂不做词频过滤 (no frequency filtering yet)
                self.questionanswerdict[question] = answer
                self.questionseganswerdict[questionsegstr] = answer
                self.questionsegdict[question] = questionseg
                self.questionseglist.append(questionseg)
                self.questionlist.append(question)
                # Fixed: the original `f.write(questionsegstr 'n')` was a
                # garbled/invalid concatenation; write one question per line.
                f.write(questionsegstr + '\n')
        return self.questionanswerdict, self.questionlist

    def createvector(self):
        """Build the gensim Dictionary from the segmented questions and
        save it to ``dictionary.dict``. Requires createcorpus() first."""
        print('--------------------------createvector-------------------------')
        dictionary = corpora.Dictionary(self.questionseglist)
        # -----------dictionary useful methods and attributes----------------
        # dictionary.token2id: token -> id mapping (sorted by token string)
        # dictionary.dfs: token id -> number of documents containing it
        # dictionary.num_docs: number of processed documents
        # dictionary.num_nnz: sum of unique-token counts over all documents
        # dictionary.filter_n_most_frequent(N): drop the N most frequent tokens
        # dictionary.filter_tokens(bad_ids=None, good_ids=None):
        #   drop bad_ids, or keep only good_ids (both are lists of ids)
        # dictionary.compactify(): re-number ids to remove gaps after filtering
        # dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000):
        #   1. drop tokens appearing in fewer than no_below documents
        #   2. drop tokens appearing in more than no_above (fraction) of documents
        #   3. then keep only the keep_n most frequent tokens
        dictionary.save('dictionary.dict')

    def createmodel(self):
        """Vectorize each segmented question to bag-of-words and serialize
        the corpus to ``dictionary.mm`` for later reuse."""
        print('--------------------------createmodel-------------------------')
        dictionary = corpora.Dictionary.load('dictionary.dict')
        corpus = [dictionary.doc2bow(text) for text in self.questionseglist]
        corpora.MmCorpus.serialize('dictionary.mm', corpus)  # store to disk, for later use

    def createtfidf(self):
        """Train a TF-IDF model on the serialized bow corpus and save it."""
        print('-------------------------createtfidf--------------------------')
        corpus = corpora.MmCorpus('dictionary.mm')
        tfidf = models.TfidfModel(corpus)
        tfidf.save("dictionary.tfidf")

    def createsimilarities(self):
        """Build the dense cosine-similarity index over the TF-IDF corpus
        and save it to ``dictionary.index``."""
        print('-------------------------createsimilarities--------------------------')
        tfidf = models.TfidfModel.load("dictionary.tfidf")
        corpus = corpora.MmCorpus('dictionary.mm')
        index = similarities.MatrixSimilarity(tfidf[corpus])
        index.save('dictionary.index')
class myQuestion(object):
    """Answer a free-text question by TF-IDF cosine similarity lookup.

    Loads the artifacts produced by myCorpus (dictionary, TF-IDF model,
    similarity index) and rebuilds the question/answer mappings from the
    raw corpus file.
    """

    def __init__(self, corpusfile):
        # Load the persisted dictionary, TF-IDF model and similarity index.
        print('-------------------------myQuestion--------------------------')
        self.dictionary = corpora.Dictionary.load('dictionary.dict')
        self.tfidf = models.TfidfModel.load("dictionary.tfidf")
        self.index = similarities.MatrixSimilarity.load('dictionary.index')
        # Re-parse the corpus to recover question -> answer lookups.
        self.questionanswerdict, self.questionlist = myCorpus(corpusfile).createcorpus()

    def creatematchdocment(self, query_document):
        """Return the stored answer whose question is most similar to
        *query_document* under the TF-IDF similarity index."""
        tokens = jieba.lcut(query_document.strip())
        bow = self.dictionary.doc2bow(tokens)
        scores = self.index[self.tfidf[bow]]
        # Pick the corpus row with the highest similarity score.
        best_row, _best_score = max(enumerate(scores), key=lambda pair: pair[1])
        matched_question = self.questionlist[best_row]
        return self.questionanswerdict[matched_question]
        # for document_number, score in sorted(enumerate(sims), key=lambda x: x[1], reverse=True):
        #     print(document_number, score)
if __name__ == "__main__":
    # Build every artifact from the raw corpus, then start a REPL that
    # answers questions until the user types 'quit'.
    filename = 'chatcorpus.txt'
    mycorpus = myCorpus(filename)
    mycorpus.createcorpus()
    mycorpus.createvector()
    mycorpus.createmodel()
    mycorpus.createtfidf()
    mycorpus.createsimilarities()
    myquestion = myQuestion(filename)
    prompt = "问题:"
    while True:
        message = input(prompt)
        # Fixed: the original loop still matched and answered the literal
        # text 'quit' before exiting; exit before doing the lookup.
        if message == 'quit':
            break
        print('答案:', myquestion.creatematchdocment(message))
运行结果如下: