package org.xbib.elasticsearch.index.analysis.decompound.fst;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.xbib.elasticsearch.common.decompound.fst.FstDecompounder;

import java.io.IOException;
import java.util.LinkedList;

/**
 * A {@link TokenFilter} that splits compound words into their parts using an
 * {@link FstDecompounder}.
 *
 * <p>For each input token the original token is emitted first; any decompounded
 * parts are then emitted at the same position (position increment 0), so a
 * query can match either the full compound or any of its parts.
 */
public class FstDecompoundTokenFilter extends TokenFilter {

    /** Parts of the current token that still have to be emitted. */
    protected final LinkedList<DecompoundToken> tokens;

    /** The decompounder that proposes split points for compound words. */
    protected final FstDecompounder fstDecompounder;

    protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    /**
     * Attribute state captured when the original token was emitted; restored
     * before emitting each queued part so the parts inherit all other attributes.
     */
    private State current;

    /**
     * @param input the upstream token stream
     * @param fstDecompounder the decompounder to apply to each term
     */
    protected FstDecompoundTokenFilter(TokenStream input, FstDecompounder fstDecompounder) {
        super(input);
        this.tokens = new LinkedList<>();
        this.fstDecompounder = fstDecompounder;
    }

    @Override
    public final boolean incrementToken() throws IOException {
        if (!tokens.isEmpty()) {
            if (current == null) {
                // Invariant: "current" is captured whenever parts are queued, so a
                // null here is an internal state error, not a caller error.
                // (was: IllegalArgumentException, which misstates the failure)
                throw new IllegalStateException("current is null");
            }
            DecompoundToken token = tokens.removeFirst();
            restoreState(current);
            termAtt.setEmpty().append(token.txt);
            offsetAtt.setOffset(token.startOffset, token.endOffset);
            // stack the part on the same position as the original token
            posIncAtt.setPositionIncrement(0);
            return true;
        }
        if (input.incrementToken()) {
            decompound();
            if (!tokens.isEmpty()) {
                // remember the original token's attributes for the queued parts
                current = captureState();
            }
            return true;
        }
        return false;
    }

    /**
     * Runs the decompounder on the current term and queues one
     * {@link DecompoundToken} per suggested part.
     */
    protected void decompound() {
        int tokenStart = offsetAtt.startOffset();
        int tokenEnd = offsetAtt.endOffset();
        int termLength = termAtt.length();
        // was: a CharSequence built via new String(...) followed by toString() —
        // one copy of the term text is enough
        String term = new String(termAtt.buffer(), 0, termLength);
        // If the token's offsets do not span exactly the term text (e.g. the term
        // was rewritten upstream), computed sub-offsets would be meaningless;
        // fall back to the original token offsets for every part. Hoisted out of
        // the loops: these attribute values are constant for the whole call.
        boolean offsetsMatchTerm = tokenEnd - tokenStart == termLength;
        int start = tokenStart;
        for (String suggestions : fstDecompounder.decompound(term)) {
            for (String suggestion : suggestions.split(",")) {
                int off = start;
                int maxlen = -1;
                for (String s : suggestion.split("\\.")) {
                    int len = s.length();
                    // NOTE(review): "off" is never advanced between the parts of a
                    // suggestion, so all parts share the same start offset; kept
                    // as-is to preserve the filter's existing offset behavior.
                    if (offsetsMatchTerm) {
                        tokens.add(new DecompoundToken(s, off, off + len));
                    } else {
                        tokens.add(new DecompoundToken(s, tokenStart, tokenEnd));
                    }
                    if (len > maxlen) {
                        maxlen = len;
                    }
                }
                start += maxlen;
            }
        }
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        tokens.clear();
        current = null;
    }

    /**
     * Equality is defined by the decompounder configuration only; the wrapped
     * input stream is deliberately not part of the comparison.
     */
    @Override
    public boolean equals(Object object) {
        return object instanceof FstDecompoundTokenFilter
                && fstDecompounder.equals(((FstDecompoundTokenFilter) object).fstDecompounder);
    }

    @Override
    public int hashCode() {
        return fstDecompounder.hashCode();
    }

    /**
     * Immutable holder for one decompounded part and its offsets. Made a
     * {@code static} nested class (no hidden reference to the enclosing filter);
     * the offset-sanity decision formerly taken in this constructor now lives in
     * {@link #decompound()}, where it is computed once per token.
     */
    private static class DecompoundToken {

        final CharSequence txt;

        final int startOffset;

        final int endOffset;

        DecompoundToken(CharSequence txt, int startOffset, int endOffset) {
            this.txt = txt;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
        }
    }
}