package tv.dyndns.kishibe.qmaclone.server.relevance; import java.io.IOException; import java.io.Reader; import java.util.List; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; import tv.dyndns.kishibe.qmaclone.server.util.Normalizer; import com.google.common.base.Preconditions; import com.google.common.base.Throwables; import com.google.common.collect.Lists; import com.google.common.io.CharStreams; import com.google.inject.Inject; import com.google.inject.assistedinject.Assisted; public final class ViterbiTokenizer extends Tokenizer { public interface Factory { ViterbiTokenizer create(Reader input); } private final WordSegmenter wordSegmenter; private final String input; private final char[] buffer; private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); private List<Integer> offsets; private List<Integer> lengths; private int wordIndex = 0; @Inject public ViterbiTokenizer(WordSegmenter wordSegmenter, @Assisted Reader input) { this.wordSegmenter = Preconditions.checkNotNull(wordSegmenter); try { this.input = Normalizer.normalize(CharStreams.toString(input)); } catch (Exception e) { throw Throwables.propagate(e); } this.buffer = this.input.toCharArray(); } @Override public void reset() throws IOException { super.reset(); clearAttributes(); offsets = Lists.newArrayList(); lengths = Lists.newArrayList(); wordSegmenter.parse(input, null, offsets, lengths); wordIndex = 0; } @Override public boolean incrementToken() throws IOException { clearAttributes(); if (wordIndex == offsets.size()) { return false; } else { int offset = offsets.get(wordIndex); int length = lengths.get(wordIndex); ++wordIndex; termAtt.copyBuffer(buffer, offset, length); offsetAtt.setOffset(offset, offset + length); return true; } } @Override public void end() throws IOException { super.end(); offsetAtt.setOffset(buffer.length, buffer.length); } }