package org.fnlp.app.lucene;
import java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.util.Version;
import edu.fudan.nlp.cn.CNFactory;
import edu.fudan.nlp.cn.CNFactory.Models;
import edu.fudan.util.exception.LoadModelException;
/**
 * Lucene analyzer based on FudanNLP.
 *
 * <p>The analysis chain built by {@link #createComponents(String, Reader)} is a
 * {@code SentenceTokenizer} followed by a {@code WordTokenFilter} and, when
 * {@code usePOSFilter} is enabled, a {@code POSTaggingFilter}.
 *
 * @author xpqiu
 */
public final class FNLPAnalyzer extends Analyzer {

    /** Lucene compatibility version supplied by the caller; stored but not read in this class. */
    private final Version matchVersion;

    /**
     * Whether the part-of-speech filter is appended to the token stream.
     * Kept package-private and non-final so same-package code that touches it keeps compiling.
     */
    boolean usePOSFilter;

    /**
     * Creates an analyzer and initialises {@code CNFactory} from the given path
     * with the {@code SEG_TAG} model set.
     *
     * @param matchVersion Lucene version to match
     * @param path         CNFactory path
     * @param usePOSFilter whether to use part-of-speech tags as a stop-word filter
     * @throws LoadModelException propagated from {@code CNFactory.getInstance}
     */
    public FNLPAnalyzer(Version matchVersion, String path, boolean usePOSFilter) throws LoadModelException {
        this.matchVersion = matchVersion;
        CNFactory.getInstance(path, Models.SEG_TAG);
        this.usePOSFilter = usePOSFilter;
    }

    /**
     * Creates an analyzer and initialises {@code CNFactory} from the given path,
     * with the part-of-speech filter enabled (the same default as
     * {@link #FNLPAnalyzer(Version)}).
     *
     * @param matchVersion Lucene version to match
     * @param path         CNFactory path
     * @throws LoadModelException propagated from {@code CNFactory.getInstance}
     */
    public FNLPAnalyzer(Version matchVersion, String path) throws LoadModelException {
        // Previously the flag was left at its implicit false here while the filter was
        // nevertheless always installed; defaulting to true keeps that effective behaviour.
        this(matchVersion, path, true);
    }

    /**
     * Creates an analyzer without touching {@code CNFactory}; the caller must have
     * set up {@code CNFactory} beforehand. The part-of-speech filter is enabled.
     *
     * @param matchVersion Lucene version to match
     */
    public FNLPAnalyzer(Version matchVersion) {
        this.matchVersion = matchVersion;
        this.usePOSFilter = true;
    }

    /**
     * Builds the token stream components for a field.
     *
     * @param fieldName name of the field being analysed (not used by this implementation)
     * @param reader    source of the text to analyse
     * @return components whose source is a {@code SentenceTokenizer} over {@code reader}
     *         and whose sink is the filtered token stream
     */
    @Override
    public TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new SentenceTokenizer(reader);
        TokenStream result = new WordTokenFilter(tokenizer);
        // Honour the constructor flag: the POS filter used to be installed unconditionally,
        // which made usePOSFilter = false a no-op.
        if (usePOSFilter) {
            // NOTE(review): the boolean argument is kept as the original literal; its meaning
            // is defined by POSTaggingFilter, which is not visible here - confirm.
            result = new POSTaggingFilter(true, result);
        }
        return new TokenStreamComponents(tokenizer, result);
    }
}