package org.ansj.lucene.util;

import java.io.IOException;
import java.io.Reader;
import java.util.Set;

import org.ansj.domain.Term;
import org.ansj.domain.TermNatures;
import org.ansj.splitWord.Analysis;
import org.ansj.util.AnsjReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

/**
 * Lucene {@link Tokenizer} that delegates segmentation to an Ansj {@link Analysis}
 * instance and exposes each resulting {@link Term} as a Lucene token.
 *
 * <p>Terms whose (possibly stemmed) text is contained in the optional filter set
 * are skipped. When stemming is enabled, terms whose natures are
 * {@link TermNatures#EN} are run through a {@link PorterStemmer} before filtering.
 */
public class AnsjTokenizer extends Tokenizer {

    /** Text of the current token. */
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    /** Start/end character offsets of the current token. */
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    /** Position increment of the current token relative to the previous one. */
    private final PositionIncrementAttribute positionAttr = addAttribute(PositionIncrementAttribute.class);

    /** Segmenter that produces the terms; re-pointed at the current input in {@link #reset()}. */
    protected Analysis ta = null;

    /** Term texts to drop from the stream; {@code null} disables filtering. */
    private final Set<String> filter;

    /** Whether terms with {@link TermNatures#EN} natures are stemmed. */
    private final boolean pstemming;

    private final PorterStemmer stemmer = new PorterStemmer();

    /**
     * Creates a tokenizer over {@code input}.
     *
     * @param ta        segmenter used to split the input into terms
     * @param input     character source handed to the Lucene {@link Tokenizer} base class
     * @param filter    term texts to skip, or {@code null} to keep every term
     * @param pstemming {@code true} to stem terms whose natures are {@link TermNatures#EN}
     */
    public AnsjTokenizer(Analysis ta, Reader input, Set<String> filter, boolean pstemming) {
        super(input);
        this.ta = ta;
        this.filter = filter;
        this.pstemming = pstemming;
    }

    /**
     * Advances to the next term that is not filtered out and publishes it through
     * the term, offset and position-increment attributes.
     *
     * @return {@code true} if a token was produced, {@code false} once the segmenter
     *         returns no more terms
     * @throws IOException if the underlying segmenter fails to read its input
     */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();

        Term term;
        int length = 0;
        while ((term = ta.next()) != null) {
            String name = term.getName();
            // Captured before stemming so the end offset is start + the pre-stemming length.
            length = name.length();
            if (pstemming && term.termNatures() == TermNatures.EN) {
                name = stemmer.stem(name);
                term.setName(name);
            }
            if (filter == null || !filter.contains(name)) {
                break;
            }
            // Filtered term: fall through and pull the next one.
        }

        if (term == null) {
            return false;
        }

        // Skipped terms do not widen the gap: every emitted token advances by exactly one position.
        positionAttr.setPositionIncrement(1);
        termAtt.setEmpty().append(term.getName());

        // Map offsets back through any CharFilter wrapping the input so that they
        // refer to the original text rather than the filtered character stream.
        int start = term.getOffe();
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
        return true;
    }

    /**
     * Re-points the segmenter at the tokenizer's current input.
     *
     * <p>This override is required; without it, indexing fails when files are
     * indexed in batches.
     *
     * @throws IOException if the superclass reset fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        ta.resetContent(new AnsjReader(this.input));
    }
}