package me.test;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.util.Version;
/**
 * Prints the tokenization results of several Lucene analyzers for a
 * mixed Chinese/English sample string, to compare how each one segments text.
 * http://www.iteye.com/news/9637
 *
 * @author zll
 *
 */
public class AnalyzerCN {

    /** Mixed Chinese/English sample text fed to every analyzer under test. */
    static final String str = "Lucene是apache软件基金会4 jakarta项目组的一个子项目,"
            + "是一个开放源代码的全文检索引擎工具包,即它不是一个完整的全文检索引擎,"
            + "而是一个全文检索引擎的架构,提供了完整的查询引擎和索引引擎,"
            + "部分文本分析引擎(英文与德文两种西方语言)。"
            + "Apache LuceneTM is a high-performance, "
            + "full-featured text search engine library written entirely in Java. "
            + "It is a technology suitable for nearly any application "
            + "that requires full-text search, especially cross-platform.";

    public static void main(String[] args) {
        System.out.println(" : " + str);
        testWhitespaceAnalyzer();
        testSimpleAnalyzer();
        testStopAnalyzer();
        testStandardAnalyzer();
        testCJKAnalyzer();
        testSmartChineseAnalyzer();
    }

    /**
     * Runs {@link #str} through the given analyzer and collects every emitted
     * term as a string.
     * <p>
     * The token stream is consumed per the TokenStream contract
     * ({@code reset} → {@code incrementToken} loop → {@code end} →
     * {@code close}), and the analyzer itself is always closed afterwards —
     * the six call sites in this class previously duplicated this loop and
     * never closed their analyzers.
     *
     * @param analyzer the analyzer to exercise; closed before returning
     * @return the terms produced, in emission order (possibly partial if an
     *         {@link IOException} occurred — it is printed, matching the
     *         original best-effort behavior)
     */
    private static List<String> tokenize(Analyzer analyzer) {
        List<String> result = new ArrayList<String>();
        try {
            TokenStream tokenStream = analyzer.tokenStream("field", str);
            CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
            tokenStream.reset();
            while (tokenStream.incrementToken()) {
                result.add(term.toString());
            }
            tokenStream.end();
            tokenStream.close();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Analyzer is Closeable; closing it also releases its reused
            // TokenStream components even if the loop above threw.
            analyzer.close();
        }
        return result;
    }

    /**
     * WhitespaceAnalyzer: splits on whitespace only. Not very practical.
     */
    private static void testWhitespaceAnalyzer() {
        System.out.println("testWhitespaceAnalyzer : "
                + tokenize(new WhitespaceAnalyzer(Version.LUCENE_47)));
    }

    /**
     * SimpleAnalyzer: splits on non-letter characters, lowercases tokens and
     * drops digits. Clearly unsuitable for Chinese text.
     */
    private static void testSimpleAnalyzer() {
        System.out.println("testSimpleAnalyzer : "
                + tokenize(new SimpleAnalyzer(Version.LUCENE_47)));
    }

    /**
     * StopAnalyzer: removes common English stop words (a, the, an, ...);
     * custom stop words are also supported. Unsuitable for Chinese text.
     */
    private static void testStopAnalyzer() {
        System.out.println("testStopAnalyzer : "
                + tokenize(new StopAnalyzer(Version.LUCENE_47)));
    }

    /**
     * StandardAnalyzer: Lucene's built-in analyzer; lowercases tokens and
     * removes stop words and punctuation. Also unsuitable for Chinese text.
     */
    private static void testStandardAnalyzer() {
        System.out.println("testStandardAnalyzer : "
                + tokenize(new StandardAnalyzer(Version.LUCENE_47)));
    }

    /**
     * CJKAnalyzer: handles Chinese, Japanese and Korean, but its Chinese
     * segmentation (bigrams) is mediocre, so it is rarely used for Chinese.
     */
    private static void testCJKAnalyzer() {
        System.out.println("testCJKAnalyzer : "
                + tokenize(new CJKAnalyzer(Version.LUCENE_47)));
    }

    /**
     * SmartChineseAnalyzer: based on a Hidden Markov Model. Better Chinese
     * support, but hard to extend (custom dictionaries, stop-word lists and
     * synonym lists are awkward to plug in).
     */
    private static void testSmartChineseAnalyzer() {
        // Custom stop words, merged with the analyzer's built-in defaults.
        String[] myStopWords = { "的", "了", "呢", ",", "0", ":", ",", "是", "流" };
        CharArraySet cas = new CharArraySet(Version.LUCENE_47, myStopWords.length, true);
        for (String stopWord : myStopWords) {
            cas.add(stopWord);
        }
        Iterator<Object> itor = SmartChineseAnalyzer.getDefaultStopSet().iterator();
        while (itor.hasNext()) {
            cas.add(itor.next());
        }
        System.out.println("testSmartChineseAnalyzer : "
                + tokenize(new SmartChineseAnalyzer(Version.LUCENE_47, cas)));
    }
}