package org.ansj.lucene.util;

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.ansj.domain.Term;
import org.ansj.domain.TermNature;
import org.ansj.splitWord.Analysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class AnsjTokenizer extends Tokenizer {

    // text of the current term
    private final CharTermAttribute termAtt;
    // start/end character offsets of the current term
    private final OffsetAttribute offsetAtt;
    // position increment (grows by one for every skipped term)
    private final PositionIncrementAttribute positionAttr;

    // underlying ansj segmenter
    private Analysis ta = null;
    // stop-word set; terms contained in it are skipped
    private final Set<String> filter;
    // whether to apply Porter stemming to English terms
    private final boolean pstemming;
    private final PorterStemmer stemmer = new PorterStemmer();

    public AnsjTokenizer(Analysis analysis, Reader input, Set<String> filter, boolean pstemming) {
        super(input);
        this.ta = analysis;
        this.termAtt = addAttribute(CharTermAttribute.class);
        this.offsetAtt = addAttribute(OffsetAttribute.class);
        this.positionAttr = addAttribute(PositionIncrementAttribute.class);
        this.filter = filter;
        this.pstemming = pstemming;
    }

    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();
        int position = 0;
        Term term = null;
        int length = 0;
        do {
            term = ta.next();
            if (term == null) {
                break;
            }
            length = term.getName().length();
            // stem English terms when Porter stemming is enabled
            if (pstemming && term.termNatures().termNatures[0] == TermNature.EN) {
                term.setName(stemmer.stem(term.getName()));
            }
            position++;
            // keep looping past stop words and over-long terms (> 30 chars)
        } while (filter != null && term != null
                && (filter.contains(term.getName()) || term.getName().length() > 30));

        if (term != null) {
            positionAttr.setPositionIncrement(position);
            termAtt.copyBuffer(term.getName().toCharArray(), 0, term.getName().length());
            offsetAtt.setOffset(term.getOffe(), term.getOffe() + length);
            return true;
        } else {
            end();
            return false;
        }
    }
}
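
/*
 * Usage sketch (not part of the original source): shows how this Tokenizer is
 * typically driven. It assumes a Lucene version whose Tokenizer constructor still
 * accepts a Reader (as the super(input) call above implies) and an ansj release in
 * which ToAnalysis can be constructed from the same Reader; names of anything not
 * defined in this file are illustrative and may need adjusting to the versions in use.
 *
 *   Reader reader = new StringReader("ansj tokenizer test 中文分词");
 *   AnsjTokenizer tokenizer = new AnsjTokenizer(new ToAnalysis(reader), reader, null, true);
 *   tokenizer.reset();
 *   while (tokenizer.incrementToken()) {
 *       CharTermAttribute term = tokenizer.getAttribute(CharTermAttribute.class);
 *       OffsetAttribute offset = tokenizer.getAttribute(OffsetAttribute.class);
 *       System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
 *   }
 *   tokenizer.end();
 *   tokenizer.close();
 */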