lucene的分组查询(Grouping)是在org.apache.lucene的lucene-grouping包下边,提供了组查询功能的支持。
简介
Grouping主要为用户提供了对lucene索引中含有相同field值的不同doc进行分组统计的功能。
<!--组查询-->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-grouping</artifactId>
<version>5.5.2</version>
</dependency>
Grouping
1. Grouping参数
groupField:要分组的字段(如果groupField在doc中不存在,会返回一个null的分组)
groupSort:分组的排序规则,排序字段决定了分组内容展示的先后顺序
topNGroup:分组展示的数量,只计算0到topNGroup条记录
groupOffset:从第几个TopGroup开始计算 例:groupOffset为3的话,会展示从3到topNGroup对应的记录,此数值我们可以用于分页查询
withinGroupSort:每组内怎么排序
maxDocsPerGroup:每组处理多少个doc
withinGroupOffset:每组显示的doc初始位置
2.Grouping实现
grouping实现需要两步:
- 利用TermFirstPassGroupingCollector来收集top groups
- 用TermSecondPassGroupingCollector处理每个group对应的doc
3. Grouping查询
对要搜索的信息创建Query查询对象,Lucene会根据Query查询对象生成最终的查询语法。与关系数据库的SQL语法类似,Lucene也有自己的查询语法,比如:"word:北京长安中西医结合医院"表示查询Field的word为"北京长安中西医结合医院"的文档信息。
可通过两种方法创建查询对象:
- 使用Lucene提供Query子类
Query是一个抽象类,lucene提供了很多查询对象,比如TermQuery项精确查询,NumericRangeQuery数字范围查询等。
Query query = new TermQuery(new Term("word", "北京市鼓楼中医院"));
//参数1:要查询的域 参数2:最小值 参数3:最大值 参数4:是否包含最小值 参数5:是否包含最大值
Query query =NumericRangeQuery.newLongRange("size", 0, 1000, true, true);
扩展:
BooleanQuery query = new BooleanQuery();
Query query1 =NumericRangeQuery.newLongRange("size", 0, 1000, true, true);
Query query2 = new TermQuery(new Term("word","北京市鼓楼中医院"));
//添加到BooleanQuery对象中
query.add(query1,Occur.MUST);
query.add(query2,Occur.MUST);
备注:
Occur.MUST:必须满足此条件
Occur.SHOULD:应该满足,但是不满足也可以
Occur.MUST_NOT:必须不满足
- 使用QueryParser解析查询表达式
QueryParser会将用户输入的查询表达式解析成Query对象实例。
QueryParser queryParser =new QueryParser("word", new StandardAnalyzer());
Query query = queryParser.parse("北京市鼓楼中医院");
Grouping用例
1. Controller层
package com.spring.master.lucene.group.controller;
import com.spring.master.lucene.group.service.GroupService;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import javax.annotation.Resource;
/**
 * REST controller mapped under {@code /group}. Every endpoint is a thin
 * delegation to {@link GroupService}; the two search endpoints do not return
 * the search result, they only acknowledge that the call was made.
 *
 * @author Huan Lee
 * @version 1.0
 * @date 2020-09-14 14:19
 */
@RestController
@RequestMapping("/group")
public class GroupController {

    /** Fixed body returned by the search endpoints once the service call has returned. */
    private static final String ACKNOWLEDGED = "true";

    @Resource
    private GroupService groupService;

    /**
     * Builds the index documents by delegating to the service.
     *
     * @return whatever string the service reports
     */
    @GetMapping("/createIndex")
    public String createIndex() {
        String outcome = groupService.createIndex();
        return outcome;
    }

    /**
     * Runs the grouped search labelled "lucene 5" in the service.
     *
     * @param keyword query text handed to the service unchanged
     * @return always {@code "true"}
     */
    @GetMapping("/searchGroup5")
    public String searchGroup5(String keyword) {
        groupService.searchGroup5(keyword);
        return ACKNOWLEDGED;
    }

    /**
     * Runs the grouped search labelled "lucene 7" in the service.
     *
     * @param keyword query text handed to the service unchanged
     * @return always {@code "true"}
     */
    @GetMapping("/searchGroup7")
    public String searchGroup7(String keyword) {
        groupService.searchGroup7(keyword);
        return ACKNOWLEDGED;
    }
}
访问地址:
localhost:2000/spring-master/group/createIndex
localhost:2000/spring-master/group/searchGroup5?keyword=北京长安中西医结合医院
localhost:2000/spring-master/group/searchGroup7?keyword=北京长安中西医结合医院
2. Service层
package com.spring.master.lucene.group.impl;
import com.spring.master.global.Identities;
import com.spring.master.lucene.group.constant.SystemConstants;
import com.spring.master.lucene.group.service.GroupService;
import com.spring.master.lucene.suggest.constant.SuggestConstants;
import com.spring.master.lucene.suggest.util.FileUtils;
import com.spring.master.lucene.suggest.vo.DictionaryVO;
import lombok.extern.slf4j.Slf4j;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.xml.builders.MatchAllDocsQueryBuilder;
import org.apache.lucene.search.*;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.GroupingSearch;
import org.apache.lucene.search.grouping.SearchGroup;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.search.grouping.term.TermAllGroupsCollector;
import org.apache.lucene.search.grouping.term.TermFirstPassGroupingCollector;
import org.apache.lucene.search.grouping.term.TermSecondPassGroupingCollector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;
import org.nlpcn.commons.lang.util.MD5;
import org.springframework.stereotype.Service;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Random;
/**
* @author Huan Lee
* @version 1.0
* @date 2020-09-14 18:45
* @describtion 业精于勤,荒于嬉;行成于思,毁于随。
*/
@Service
@Slf4j
public class GroupServiceImpl implements GroupService {
/**
* 指定在哪个索引上进行分组
*/
static String groupField = "sourceType";
/**
* 标准分词器
*/
private static Analyzer analyzer = new StandardAnalyzer();
@Override
public String createIndex() {
try {
Directory directory = FSDirectory.open(Paths.get(SystemConstants.indexDir));
IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
IndexWriter writer = new IndexWriter(directory, indexWriterConfig);
// 读DictionaryVO数据
List<DictionaryVO> diseases = FileUtils.readCsv(SuggestConstants.disease);
List<DictionaryVO> doctors = FileUtils.readCsv(SuggestConstants.doctor);
List<DictionaryVO> facultys = FileUtils.readCsv(SuggestConstants.faculty);
List<DictionaryVO> hospitals = FileUtils.readCsv(SuggestConstants.hospital);
List<DictionaryVO> drugcatalogues = FileUtils.readCsv(SuggestConstants.drugcatalogue);
diseases.forEach(disease -> {
Document doc = new Document();
// 进行分组的域上建立的必须是SortedDocValuesField类型
doc.add(new SortedDocValuesField(groupField, new BytesRef("Disease")));
doc.add(new StringField("id", Identities.uuid(), Field.Store.YES));
doc.add(new StringField("sourceType", "Disease", Field.Store.YES));
doc.add(new TextField("word", disease.getWord(), Field.Store.YES));
try {
writer.addDocument(doc);
} catch (Exception e) {
log.error(e.getMessage());
}
});
hospitals.forEach(hospital -> {
Document doc = new Document();
// 进行分组的域上建立的必须是SortedDocValuesField类型
doc.add(new SortedDocValuesField(groupField, new BytesRef("Hospital")));
doc.add(new StringField("id", Identities.uuid(), Field.Store.YES));
doc.add(new StringField("sourceType", "Hospital", Field.Store.YES));
doc.add(new TextField("word", hospital.getWord(), Field.Store.YES));
try {
writer.addDocument(doc);
} catch (Exception e) {
log.error(e.getMessage());
}
});
facultys.forEach(faculty -> {
Document doc = new Document();
// 进行分组的域上建立的必须是SortedDocValuesField类型
doc.add(new SortedDocValuesField(groupField, new BytesRef("Faculty")));
doc.add(new StringField("id", Identities.uuid(), Field.Store.YES));
doc.add(new StringField("sourceType", "Faculty", Field.Store.YES));
doc.add(new TextField("word", faculty.getWord(), Field.Store.YES));
try {
writer.addDocument(doc);
} catch (Exception e) {
log.error(e.getMessage());
}
});
drugcatalogues.forEach(drugcatalogue -> {
Document doc = new Document();
// 进行分组的域上建立的必须是SortedDocValuesField类型
doc.add(new SortedDocValuesField(groupField, new BytesRef("Drugcatalogue")));
doc.add(new StringField("id", Identities.uuid(), Field.Store.YES));
doc.add(new StringField("sourceType", "Drugcatalogue", Field.Store.YES));
doc.add(new TextField("word", drugcatalogue.getWord(), Field.Store.YES));
try {
writer.addDocument(doc);
} catch (Exception e) {
log.error(e.getMessage());
}
});
doctors.forEach(doctor -> {
Document doc = new Document();
// 进行分组的域上建立的必须是SortedDocValuesField类型
doc.add(new SortedDocValuesField(groupField, new BytesRef("Doctor")));
doc.add(new StringField("id", Identities.uuid(), Field.Store.YES));
doc.add(new StringField("sourceType", "Doctor", Field.Store.YES));
doc.add(new TextField("word", doctor.getWord(), Field.Store.YES));
try {
writer.addDocument(doc);
} catch (Exception e) {
log.error(e.getMessage());
}
});
// 6 -- no sourceType field
Document doc = new Document();
doc.add(new StringField("sourceType", "Faculty", Field.Store.YES));
doc.add(new TextField("word", "高压氧室", Field.Store.YES));
doc.add(new StringField("id", "0", Field.Store.YES));
writer.addDocument(doc);
writer.commit();
writer.close();
} catch (Exception e) {
log.error(e.getMessage());
}
return "true";
}
@Override
public void searchGroup5(String keyword) {
try {
Directory directory = FSDirectory.open(Paths.get(SystemConstants.indexDir));
IndexReader reader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(reader);
// term查询
// Query query = new TermQuery(new Term("word", keyword));
QueryParser queryParser =new QueryParser("word", new StandardAnalyzer());
Query query = queryParser.parse(keyword);
// 每个分组内部的排序规则
Sort groupSort = Sort.RELEVANCE;
// 前N条中分组
int topNGroups = 10;
// 分组起始偏移量
int groupOffset = 0;
// 是否填充SearchGroup的sortValues
boolean fillFields = true;
// groupSort用于对组进行排序,docSort用于对组内记录进行排序,多数情况下两者是相同的,但也可不同
Sort docSort = groupSort;
// 用于组内分页,起始偏移量
int docOffset = 0;
// 每组返回多少条结果
int docsPerGroup = 5;
// 是否需要计算总的分组数量
boolean requiredTotalGroupCount = true;
// 是否需要缓存评分
boolean cacheScores = true;
// 第一次查询缓存容量的大小:设置为16M
double maxCacheRAMMB = 16.0;
// 支持lucene5.x版本
TermFirstPassGroupingCollector c1 = new TermFirstPassGroupingCollector("sourceType", groupSort, groupOffset topNGroups);
/** 将TermFirstPassGroupingCollector包装成CachingCollector,为第一次查询加缓存,避免重复评分
* CachingCollector就是用来为结果收集器添加缓存功能的
*/
CachingCollector cachedCollector = CachingCollector.create(c1, cacheScores, maxCacheRAMMB);
// 开始第一次分组统计
searcher.search(query, cachedCollector);
// 第一次查询返回的结果集TopGroups中只有分组域值以及每组总的评分,至于每个分组里有几条,分别哪些索引文档,则需要进行第二次查询获取
Collection<SearchGroup<BytesRef>> topGroups = c1.getTopGroups(groupOffset, fillFields);
if (topGroups == null) {
System.out.println("No groups matched ");
return;
}
// 是否获取每个分组内部每个索引的评分
boolean getScores = true;
// 是否计算最大评分
boolean getMaxScores = true;
// 如果需要对Lucene的score进行修正,则需要重载TermSecondPassGroupingCollector
TermSecondPassGroupingCollector c2 = new TermSecondPassGroupingCollector("sourceType", topGroups, groupSort, docSort, docOffset docsPerGroup, getScores, getMaxScores, fillFields);
// 如果需要计算总的分组数量,则需要把TermSecondPassGroupingCollector包装成TermAllGroupsCollector
// TermAllGroupsCollector就是用来收集总分组数量的
TermAllGroupsCollector allGroupsCollector = null;
Collector secondPassCollector = null;
//若需要统计总的分组数量
if (requiredTotalGroupCount) {
allGroupsCollector = new TermAllGroupsCollector("sourceType");
secondPassCollector = MultiCollector.wrap(c2, allGroupsCollector);
} else {
secondPassCollector = c2;
}
// 如果第一次查询已经加了缓存,则直接从缓存中取
if (cachedCollector.isCached()) {
// 第二次查询直接从缓存中取
cachedCollector.replay(secondPassCollector);
} else {
// 开始第二次分组查询
searcher.search(query, secondPassCollector);
}
// 所有组的数量
int totalGroupCount = 0;
// 所有满足条件的记录数
int totalHitCount = 0;
// 所有组内的满足条件的记录数(通常该值与totalHitCount是一致的)
int totalGroupedHitCount = -1;
if (requiredTotalGroupCount) {
totalGroupCount = allGroupsCollector.getGroupCount();
}
TopGroups<BytesRef> groupsResult = c2.getTopGroups(docOffset);
// 这里打印的3项信息就是第一次查询的统计结果
totalHitCount = groupsResult.totalHitCount;
totalGroupedHitCount = groupsResult.totalGroupedHitCount;
// 打印总的分组数量
log.info("groupCount: {}", totalGroupCount);
log.info("groupsResult.totalHitCount: {}", totalHitCount);
log.info("groupsResult.totalGroupedHitCount: {}", totalGroupedHitCount);
log.info("************************************");
int groupIdx = 0;
// 下面打印的是第二次查询的统计结果,如果你仅仅只需要第一次查询的统计结果信息,不需要每个分组内部的详细信息,则不需要进行第二次查询,请知晓
// 迭代组
for (GroupDocs<BytesRef> groupDocs : groupsResult.groups) {
groupIdx ;
String groupVL = groupDocs.groupValue == null ? "分组域的域值为空" : new String(groupDocs.groupValue.bytes);
// 分组域的域值,groupIdx表示组的索引即第几组
log.info("group[{}].groupFieldValue: {}", groupIdx, groupVL);
// 当前分组内命中的总记录数
log.info("group[{}].totalHits: {}", groupIdx, groupDocs.totalHits);
int docIdx = 0;
// 迭代组内的记录
for (ScoreDoc scoreDoc : groupDocs.scoreDocs) {
docIdx ;
// 打印分组内部每条记录的索引文档ID及其评分
log.info("group[" groupIdx "][" docIdx "]{docID:Score}:" scoreDoc.doc "/" scoreDoc.score);
// 根据docID可以获取到整个Document对象,通过doc.get(fieldName)可以获取某个存储域的域值
// 注意searcher.doc根据docID返回的document对象中不包含docValuesField域的域值,只包含非docValuesField域的域值,请知晓
Document doc = searcher.doc(scoreDoc.doc);
log.info("group[" groupIdx "][" docIdx "]{docID:author}:" doc.get("id") ":" doc.get("word"));
}
log.info("*****************************************");
}
} catch (Exception e) {
log.error(e.getMessage());
}
}
@Override
public void searchGroup7(String keyword) {
try {
Directory directory = FSDirectory.open(Paths.get(SystemConstants.indexDir));
IndexReader reader = DirectoryReader.open(directory);
IndexSearcher searcher = new IndexSearcher(reader);
// 标准分词
Analyzer analyzer = new StandardAnalyzer();
// 指定要进行分组的索引
GroupingSearch groupingSearch = new GroupingSearch(groupField);
// 指定分组排序规则
groupingSearch.setGroupSort(new Sort(SortField.FIELD_SCORE));
// 是否填充SearchGroup的sortValues
groupingSearch.setFillSortFields(true);
groupingSearch.setCachingInMB(4.0, true);
groupingSearch.setAllGroups(true);
// groupingSearch.setAllGroupHeads(true);
// 限制分组个数
groupingSearch.setGroupDocsLimit(10);
QueryParser parser = new QueryParser("word", analyzer);
Query query = parser.parse(keyword);
TopGroups<BytesRef> result = groupingSearch.search(searcher, query, 0, 1000);
// 总命中数
log.info("总命中数: {}", result.totalHitCount);
//
log.info("分组数:{}", result.groups.length);
// 按照分组打印查询结果
for (GroupDocs<BytesRef> groupDocs : result.groups){
if (groupDocs != null) {
if (groupDocs.groupValue != null) {
log.info("分组:{}", groupDocs.groupValue.utf8ToString());
}else{
// 由于建立索引时有一条数据没有在分组索引上建立SortedDocValued索引,因此这个分组的groupValue为null
log.info("分组:{}", "unknow");
}
log.info("组内数据条数:{}", groupDocs.totalHits);
for(ScoreDoc scoreDoc : groupDocs.scoreDocs){
log.info("sourceType:{}", searcher.doc(scoreDoc.doc).get("sourceType"));
log.info("word:{}", searcher.doc(scoreDoc.doc).get("word"));
log.info("*****************************");
}
System.out.println("=====================================");
}
}
} catch (Exception e) {
log.error(e.getMessage());
}
}
}
3. Util
package com.spring.master.lucene.util;
import com.spring.master.lucene.suggest.vo.DictionaryVO;
import lombok.extern.slf4j.Slf4j;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
/**
* @author Huan Lee
* @version 1.0
* @date 2020-09-11 09:57
* @describtion 业精于勤,荒于嬉;行成于思,毁于随。
*/
@Slf4j
public class FileUtils {
/**
* 读取词典csv文件
* @param fileNamePath
* @return
*/
public static List<DictionaryVO> readCsv(String fileNamePath) {
List<DictionaryVO> dictionarys = new ArrayList<>();
try {
// 换成你的文件名
BufferedReader reader = new BufferedReader(new FileReader(fileNamePath));
String line;
while ((line = reader.readLine()) != null) {
// CSV格式文件为逗号分隔符文件,这里根据逗号切分
String[] item = line.split(",");
dictionarys.add(new DictionaryVO(item[0], item[1], Long.parseLong(item[2]), Long.parseLong(item[3])));
}
} catch (Exception e) {
e.printStackTrace();
log.error(e.getMessage());
}
return dictionarys;
}
}
4. Constant
package com.spring.master.lucene.group.constant;
/**
 * Constants holder for the grouping example. Not meant to be instantiated or
 * extended.
 *
 * @author Huan Lee
 * @version 1.0
 * @date 2020-09-14 14:27
 */
public final class SystemConstants {

    /**
     * Index directory.
     * NOTE(review): absolute path into one developer's home directory; it has to
     * be adapted before the example runs on another machine.
     */
    public static final String indexDir = "/Users/lihuan/Documents/projects/git/me/dictionary";

    private SystemConstants() {
        // constants only
    }
}