package org.ansj.lucene.util;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.ansj.domain.Result;
import org.ansj.domain.Term;
import org.ansj.recognition.impl.StopRecognition;
import org.ansj.recognition.impl.SynonymsRecgnition;
import org.ansj.splitWord.Analysis;
import org.ansj.util.AnsjReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
/**
 * Lucene {@link Tokenizer} that emits the terms produced by an ansj {@link Analysis}.
 *
 * <p>Terms matched by any of the configured {@link StopRecognition} filters are skipped; each
 * skipped term adds one to the position increment of the next emitted token. When a term carries
 * synonyms, the first synonym replaces the term text and the remaining synonyms are emitted as
 * extra tokens with a position increment of 0 and the same offsets and type as the term.
 */
public final class AnsjTokenizer extends Tokenizer {
    // Text of the current token.
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    // Start/end character offsets of the current token.
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
    // Position increment relative to the previously emitted token.
    private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);
    // Part-of-speech (nature) of the current token.
    private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);

    protected Analysis ta = null;

    // Pending output: Term objects from the analysis, plus String synonyms pushed to the front
    // while the term that owns them is being emitted.
    private LinkedList<Object> result;

    // Stop-word filters; may be null.
    private final List<StopRecognition> stops;
    // Synonym recognizers applied to every parse result; may be null.
    private final List<SynonymsRecgnition> synonyms;

    // Offsets and type of the most recently emitted Term, reused for its queued synonyms.
    private int lastStartOffset;
    private int lastEndOffset;
    private String lastType;

    /**
     * @param ta       the ansj analysis that performs the segmentation
     * @param stops    stop-word filters, or {@code null} for none
     * @param synonyms synonym recognizers, or {@code null} for none
     */
    public AnsjTokenizer(Analysis ta, List<StopRecognition> stops, List<SynonymsRecgnition> synonyms) {
        this.ta = ta;
        this.stops = stops;
        this.synonyms = synonyms;
    }

    /**
     * Advances to the next token.
     *
     * @return {@code true} if a token was produced, {@code false} once the pending queue is empty
     * @throws IOException if parsing the input fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (result == null) {
            parse();
        }

        Object next = result.pollFirst();
        if (next == null) {
            result = null;
            return false;
        }

        // Always start from a clean attribute state so values from the previous token do not leak.
        clearAttributes();

        if (!(next instanceof Term)) {
            // A queued synonym: same offsets and type as the term it belongs to, stacked at the
            // same position (increment 0).
            offsetAtt.setOffset(lastStartOffset, lastEndOffset);
            typeAtt.setType(lastType);
            positionAttr.setPositionIncrement(0);
            termAtt.setEmpty().append(next.toString());
            return true;
        }

        Term term = (Term) next;
        int position = 1;
        // Skip stop words. Synonym strings are only queued after a term has been emitted and are
        // drained before the next term is polled, so everything polled here is a Term.
        while (filterTerm(term)) {
            term = (Term) result.pollFirst();
            if (term == null) {
                result = null;
                return false;
            }
            position++;
        }

        List<String> termSynonyms = term.getSynonyms();
        String rName;
        if (termSynonyms != null && !termSynonyms.isEmpty()) {
            // addFirst in ascending index order means the extra synonyms come out in reverse
            // list order after the first one.
            for (int i = 1; i < termSynonyms.size(); i++) {
                result.addFirst(termSynonyms.get(i));
            }
            rName = termSynonyms.get(0);
        } else {
            // No synonyms (null or empty list): emit the term's own text.
            rName = term.getName();
        }

        // Map offsets back through any CharFilter wrapped around the input.
        lastStartOffset = correctOffset(term.getOffe());
        lastEndOffset = correctOffset(term.getOffe() + term.getName().length());
        lastType = term.getNatureStr();

        offsetAtt.setOffset(lastStartOffset, lastEndOffset);
        typeAtt.setType(lastType);
        positionAttr.setPositionIncrement(position);
        termAtt.setEmpty().append(rName);
        return true;
    }

    /**
     * Tells whether the given term is rejected by any configured stop-word filter.
     *
     * @param term the term to check
     * @return {@code true} if at least one filter matches the term
     */
    private boolean filterTerm(Term term) {
        if (stops != null && stops.size() > 0) {
            for (StopRecognition filterRecognition : stops) {
                if (filterRecognition.filter(term)) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Points the analysis at the current input and parses it.
     *
     * <p>This method must be overridden; otherwise indexing fails when files are indexed in
     * batches.
     *
     * @throws IOException if resetting or parsing the input fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        ta.resetContent(new AnsjReader(this.input));
        parse();
    }

    /**
     * Runs the analysis, applies every synonym recognizer to the result, and refills the pending
     * queue with the resulting terms.
     *
     * @throws IOException if the analysis fails to read its input
     */
    private void parse() throws IOException {
        Result parse = ta.parse();
        if (synonyms != null) {
            for (SynonymsRecgnition sr : synonyms) {
                parse.recognition(sr);
            }
        }
        result = new LinkedList<Object>(parse.getTerms());
    }
}