/* ==================================================================
* Created [2009-4-27 下午11:32:55] by Jon.King
* ==================================================================
* TSS
* ==================================================================
* mailTo:jinpujun@hotmail.com
* Copyright (c) Jon.King, 2009-2012
* ==================================================================
*/
package com.jinhe.tss.cms.lucene.analyzer;
import java.io.IOException;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import com.jinhe.tss.cms.lucene.AnalyzerFactory;
/**
 * Manual smoke tests / usage examples for the je-analysis {@link MMAnalyzer}
 * Chinese tokenizer in combination with Lucene.
 *
 * <p>Notes carried over from the original author:</p>
 * <ul>
 * <li>Efficiency: the first segmentation takes 1-2 seconds (the dictionary is
 * loaded); afterwards the speed is roughly on par with Lucene's built-in
 * analyzers. Memory consumption: 30M+.</li>
 * <li>MMAnalyzer overview:
 * <ol>
 * <li>supports mixed English, digit and (simplified) Chinese segmentation;</li>
 * <li>matches common quantities and person names;</li>
 * <li>ships a curated lexicon of more than 220,000 words;</li>
 * <li>implements the forward maximum matching algorithm;</li>
 * <li>allows the dictionary to be extended dynamically.</li>
 * </ol></li>
 * <li>Commonly used API:
 * <ol>
 * <li>{@code new MMAnalyzer()} - forward maximum matching, equivalent to a
 * segmentation granularity of 0;</li>
 * <li>{@code new MMAnalyzer(2)} - the argument is the granularity: once the
 * number of characters reaches it and they form a word, that word is emitted;</li>
 * <li>{@code MMAnalyzer.addDictionary(reader)} - adds a dictionary, one word per line;</li>
 * <li>{@code MMAnalyzer.addWord(newWord)} - adds a single new word.</li>
 * </ol></li>
 * <li>Segmentation approach: read one character and keep extending while a
 * longer candidate is possible. If the current candidate is a word, return it
 * as a token; otherwise backtrack to the most recent position that formed a
 * word and return that. In the worst case the first single character is
 * returned. Segmentation then restarts at the character following the
 * returned token.</li>
 * </ul>
 */
public class TestMMAnalyzer {

    /** Runs the plain index-and-search example. */
    public static void main(String[] args) {
        testSegment2();
    }

    /** Sample content that gets segmented, indexed and searched by the examples below. */
    static String text = "据路透社报道,印度尼西亚社会事务部一官员星期二(29日)表示,"
            + "日惹市附近当地时间27日晨5时53分 发生 的里氏6.2级地震已经造成至少5427人死亡,"
            + "20000余人受伤,近20万人无家可归。三季度。金普俊。我是额飞挺阿什顿";

    /**
     * Segments {@link #text} with the analyzer returned by
     * {@code AnalyzerFactory.createAnalyzer} and prints the tokens joined by
     * {@code " | "}.
     */
    public static void testSegment() {
        // NOTE(review): the cast assumes the factory hands back an MMAnalyzer - confirm.
        MMAnalyzer analyzer = (MMAnalyzer) AnalyzerFactory.createAnalyzer("额飞挺");
        try {
            System.out.println(analyzer.segment(text, " | "));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Indexes {@link #text} into an in-memory index, runs a query against it
     * and prints the number of hits followed by the stored content of each hit.
     */
    public static void testSegment2() {
        String fieldName = "text";
        // Forward maximum matching Chinese segmentation.
        Analyzer analyzer = new MMAnalyzer();
        try {
            // In-memory index; an FSDirectory could be used instead to persist it.
            Directory directory = new RAMDirectory();
            try {
                indexSingleField(directory, analyzer,
                        new Field(fieldName, text, Field.Store.YES, Field.Index.TOKENIZED));

                IndexSearcher isearcher = new IndexSearcher(directory);
                try {
                    QueryParser parser = new QueryParser(fieldName, analyzer);
                    Query query = parser.parse("印度尼西亚 6.2级地震"); // search terms
                    Hits hits = isearcher.search(query);
                    System.out.println("命中:" + hits.length());
                    for (int i = 0; i < hits.length(); i++) {
                        Document hitDoc = hits.doc(i);
                        System.out.println("内容:" + hitDoc.get(fieldName));
                    }
                } finally {
                    // Release the searcher even when parsing or searching fails.
                    isearcher.close();
                }
            } finally {
                directory.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Same as {@link #testSegment2()}, but prints the best fragments of each
     * hit as produced by the Lucene {@link Highlighter} for the query terms.
     */
    public static void testSegment3() {
        String fieldName = "text";
        // Forward maximum matching Chinese segmentation.
        Analyzer analyzer = new MMAnalyzer();
        try {
            // In-memory index; an FSDirectory could be used instead to persist it.
            Directory directory = new RAMDirectory();
            try {
                // Positions and offsets are stored so the token stream can be
                // rebuilt from the term vector when highlighting.
                indexSingleField(directory, analyzer, new Field(fieldName, text, Field.Store.YES,
                        Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS));

                // One reader is opened up front and shared by the searcher and the
                // term-vector lookups, instead of opening (and leaking) a new
                // reader for every hit.
                IndexReader reader = IndexReader.open(directory);
                try {
                    IndexSearcher isearcher = new IndexSearcher(reader);
                    try {
                        QueryParser parser = new QueryParser(fieldName, analyzer);
                        Query query = parser.parse("印度尼西亚 6.2级地震"); // search terms
                        Hits hits = isearcher.search(query);
                        System.out.println("命中:" + hits.length());
                        Highlighter highlighter = new Highlighter(new QueryScorer(query));
                        for (int i = 0; i < hits.length(); i++) {
                            // Local variable: the shared static sample text must not be overwritten.
                            String storedText = hits.doc(i).get(fieldName);
                            TermPositionVector tpv =
                                    (TermPositionVector) reader.getTermFreqVector(hits.id(i), fieldName);
                            TokenStream tokenStream = TokenSources.getTokenStream(tpv);
                            String result = highlighter.getBestFragments(tokenStream, storedText, 3, "...");
                            System.out.println("内容:" + result);
                        }
                    } finally {
                        isearcher.close();
                    }
                } finally {
                    // A searcher built on an existing reader does not own it, so close it here.
                    reader.close();
                }
            } finally {
                directory.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates a fresh index in {@code directory} containing one document that
     * holds only {@code field}. The writer is always closed, also on failure.
     *
     * @param directory target index directory (any existing index is overwritten)
     * @param analyzer  analyzer used to tokenize the field
     * @param field     the single field of the single indexed document
     * @throws IOException if the index cannot be written
     */
    private static void indexSingleField(Directory directory, Analyzer analyzer, Field field)
            throws IOException {
        IndexWriter iwriter = new IndexWriter(directory, analyzer, true);
        try {
            iwriter.setMaxFieldLength(25000);
            Document doc = new Document();
            doc.add(field);
            iwriter.addDocument(doc);
        } finally {
            iwriter.close();
        }
    }
}