最近一段时间由于公司需要对相似的关键词做模糊搜索,所以直接考虑使用了 Lucene。
lucene允许你往程序中添加搜索功能,lucene能够把你从文本中解析出来的数据进行索引和搜索 ,lucene不关心数据来源 甚至不关心语种,不过你需要把它转换成文本格式。也就是说你可以搜索 html网页,文本文档,word文档 ,pdf,或者其他一些 总之 只要能够提取出文本信息的即可。同样你也可以利用lucene来索引存储在数据库中的数据,以给你的用户提供一些 比如 全文搜索功能等 ,反正lucene的功能很是强大。里面还有很多开源的对不同语言进行分析的插件等。
下面我介绍一个例子,这里我对一个 txt 文档的每一行进行了索引的添加,也就是说把每一行当作一个 Document 对象来处理。实际上在 Lucene 中,每一个 Document 相当于数据库表中的一条记录(行),而每个 Field 相当于记录中的一个字段(列)。Lucene 能够对文本进行自动处理,去掉里面的一些语气词,并把你规定的域当作关键词来建立索引,以备查询时使用。Lucene 比较容易使用,虽然不如数据库灵活,但速度很快。下面我用一个例子来说明(这里我用的是 lucene 4.7.2,你需要注意把需要的一些 jar 包引入到你的工程中,使用 maven 可直接引入依赖 http://mvnrepository.com/artifact/org.apache.lucene ,需要的全部引入)。我这里写了一个实例,你可以进行参考,学习使用方法。
package lucene.home.clq;
/**
* @author chenlongquan
* Copyright Manning Publications Co..com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
//创建索引
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
* This code was originally build for the index
*
*/
public class Indexer {
public static void main(String[] args) throws Exception {
String indexDir = "f:\index"; //1
String dataDir = "f:\baidu"; //2
long start = System.currentTimeMillis();
Indexer indexer = new Indexer(indexDir);
int numIndexed;
try {
numIndexed = indexer.index(dataDir, new TextFilesFilter());
} finally {
indexer.close();
}
long end = System.currentTimeMillis();
System.out.println("Indexing " numIndexed " files took "
(end - start) " milliseconds");
}
private IndexWriter writer;
public Indexer(String indexDir) throws IOException {
Directory dir = FSDirectory.open(new File(indexDir));
writer = new IndexWriter(dir,indexWriterConfig());
//在这里进行索引的调试
}
public void close() throws IOException {
writer.close(); //4
}
private IndexWriterConfig indexWriterConfig()
{
Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_47);
IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer);
return config;
}
public int index(String dataDir, FileFilter filter)
throws Exception {
File[] files = new File(dataDir).listFiles();
for (File f: files) {
if (!f.isDirectory() &&
!f.isHidden() &&
f.exists() &&
f.canRead() &&
(filter == null || filter.accept(f))) {
indexFile(f);
}
}
return writer.numDocs(); //5
}
private static class TextFilesFilter implements FileFilter {
public boolean accept(File path) {
return path.getName().toLowerCase() //6
.endsWith(".txt"); //6
}
}
/**
* 遍历每一个文件,然后读出文件中的每一行数据,当成一个document来处理
* @param f
* @throws Exception
*/
private void indexFile(File f) throws Exception {
System.out.println("Indexing " f.getCanonicalPath());
// Document doc = getDocument(f);
List<String> lists = readFileNoDup(f);
for(String list:lists){
Document doc = new Document();
doc.add(new Field("contents",list,TextField.TYPE_STORED));
writer.addDocument(doc);
}
//10
}
//读取一个文件
private List<String> readFile(File filePathAndName)throws IOException {
FileInputStream fis = new FileInputStream(filePathAndName);
InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
BufferedReader br = new BufferedReader(isr);
LineNumberReader lnr = new LineNumberReader(br);
List<String> returnValue = new ArrayList<String>();
int cnt = 0;
while (true) {
cnt ;
String tempStr = lnr.readLine();
if (tempStr == null)
break;
if (tempStr.length() < 2)
continue;
returnValue.add(tempStr);
}
lnr.close();
br.close();
isr.close();
fis.close();
return returnValue;
}
//读取一个文件并排重后返回
public static List<String> readFileNoDup(File filePathAndName)
throws IOException {
FileInputStream fis = new FileInputStream(filePathAndName);
InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
BufferedReader br = new BufferedReader(isr);
LineNumberReader lnr = new LineNumberReader(br);
Set<String> set = new HashSet<String>();
while (true) {
String tempStr = lnr.readLine();
if (tempStr == null)
break;
if (tempStr.length() < 2)
continue;
set.add(tempStr.trim());
}
lnr.close();
br.close();
isr.close();
fis.close();
List<String> returnValue = new ArrayList<String>(set.size());
returnValue.addAll(set);
return returnValue;
}
}
// 对刚才已经建好的索引进行搜索 (search the index built above)
package lucene.home.clq;
/**
* Copyright Manning Publications Co.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollector;
import org.apache.lucene.search.TopFieldDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
// From chapter 1
/**
* This code was originally written for searcher
*
*/
public class Searcher {
public static void main(String[] args) throws IllegalArgumentException,
IOException, ParseException {
final String indexDir = "e:\soso\soso";
String q = " ";//输入你添加的所以 进行模糊搜索
docs = query(indexDir, q)
}
public static void search(String indexDir, String q)
throws IOException, ParseException {
IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));
// Directory dir = FSDirectory.open(new File(indexDir)); //3
IndexSearcher is = new IndexSearcher(reader); //3
QueryParser parser = new QueryParser(Version.LUCENE_47,"contents",new SmartChineseAnalyzer(Version.LUCENE_47));
Query query = parser.parse(q); //4
long start = System.currentTimeMillis();
TopDocs hits = is.search(query, 500); //5
//ScoreDoc[] hits = is.search(query, null, 10).scoreDocs;
long end = System.currentTimeMillis();
System.err.println("Found " hits.totalHits //6
" document(s) (in " (end - start) // 6
" milliseconds) that matched query '" // 6
q "':"); // 6
for(ScoreDoc scoreDoc : hits.scoreDocs) {
Document doc = is.doc(scoreDoc.doc); //7
System.out.println(doc.get("contents"));
}
reader.close();
}
private static List<String> query(String indexDir, String searcher) throws IOException, ParseException{
if (searcher == null || searcher.length() == -1) {
return null;
}
searcher = searcher.trim();
if (searcher.length() == 0) {
return null;
}
IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexDir)));//open the index
//IndexReader reader = DirectoryReader.open(SimpleFSDirectory.open(new File(indexDir)));//open the index
IndexSearcher is = new IndexSearcher(reader);//find the content
QueryParser parser = new QueryParser(Version.LUCENE_47, "contents", new SmartChineseAnalyzer(Version.LUCENE_47));//parser the content
Query query = parser.parse(searcher);
TopFieldDocs hits = is.search(query, 100, new Sort(new SortField("contents", SortField.Type.SCORE, false)));
TopDocs hits1 = is.search(query, 200);//搜索出前200条数据 按照评分进行排序
List<String> list = new ArrayList<String>();
for(ScoreDoc scoreDoc : hits.scoreDocs){
Document doc = is.doc(scoreDoc.doc);
list.add(doc.get("contents"));
}
reader.close();
return list;
}
}
// 这里我主要给文档中的文本添加了索引,你也可以在 Field 中对路径等一些属性添加索引。
具体可以查阅 Lucene API 文档,了解里面的一些方法。我这里说得比较粗,有问题欢迎讨论。