package org.xbib.elasticsearch.index.analysis.lemmatize;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.KeywordAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.AttributeSource;
import org.xbib.elasticsearch.common.fsa.Dictionary;
import java.io.IOException;
import java.nio.charset.CharacterCodingException;
import java.util.LinkedList;
/**
 * Token filter that looks up the term of every incoming token in a {@link Dictionary}
 * and emits the value returned by the dictionary (the lemma).
 *
 * <p>Two output modes exist:
 * <ul>
 * <li>{@code lemmaOnly == true}: the term of the incoming token is replaced in place by
 * the looked-up value; all other attributes of the token are left untouched.</li>
 * <li>{@code lemmaOnly == false}: the incoming token is emitted unchanged and the
 * looked-up value is emitted right after it as an additional token with position
 * increment {@code 0}; apart from term and position increment, the additional token
 * carries the attribute state of the original token.</li>
 * </ul>
 *
 * <p>Tokens for which the dictionary returns {@code null} pass through unchanged. When
 * {@code respectKeywords} is set, tokens flagged through {@link KeywordAttribute} pass
 * through without any dictionary lookup.
 */
public class LemmatizeTokenFilter extends TokenFilter {

    /** Lemmas that still have to be emitted for the most recently consumed input token. */
    private final LinkedList<String> tokens;

    private final Dictionary dictionary;

    private final boolean respectKeywords;

    private final boolean lemmaOnly;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    private final KeywordAttribute keywordAtt = addAttribute(KeywordAttribute.class);

    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    /** Attribute state of the input token the pending lemmas in {@link #tokens} belong to. */
    private AttributeSource.State current;

    /**
     * Creates a new lemmatizing filter.
     *
     * @param input the token stream to read from
     * @param dictionary the dictionary queried for every non-skipped term; it is
     *        dereferenced unconditionally during lookup, so it must not be {@code null}
     * @param respectKeywords if {@code true}, tokens marked as keyword are not looked up
     * @param lemmaOnly if {@code true}, the lemma replaces the original term; otherwise
     *        the lemma is emitted in addition to the original token
     */
    protected LemmatizeTokenFilter(TokenStream input, Dictionary dictionary,
                                   boolean respectKeywords, boolean lemmaOnly) {
        super(input);
        this.tokens = new LinkedList<>();
        this.dictionary = dictionary;
        this.respectKeywords = respectKeywords;
        this.lemmaOnly = lemmaOnly;
    }

    /**
     * Advances to the next token: either a lemma still pending for the previous input
     * token, or the next input token (with its term replaced when {@code lemmaOnly} is set).
     *
     * @return {@code false} when the input is exhausted, {@code true} otherwise
     * @throws IOException if the input stream or the dictionary lookup fails
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!tokens.isEmpty()) {
            emitPendingLemma();
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        if (respectKeywords && keywordAtt.isKeyword()) {
            return true;
        }
        String lemma = lookupLemma();
        if (lemma == null) {
            // Unknown to the dictionary: pass the token through untouched.
            return true;
        }
        if (lemmaOnly) {
            // Replace the term in place. No state snapshot is required because every
            // other attribute of the current token is meant to stay exactly as it is.
            termAtt.setEmpty().append(lemma);
        } else {
            // Emit the original token now and remember its state so that the lemma can
            // be emitted with the same attributes on the next call.
            tokens.add(lemma);
            current = captureState();
        }
        return true;
    }

    /**
     * Emits the next queued lemma as a token stacked on the position of the original token.
     * Lemmas are only queued when {@code lemmaOnly} is {@code false}, hence the position
     * increment is always set to {@code 0} here.
     */
    private void emitPendingLemma() {
        if (current == null) {
            // Internal invariant: a lemma is never queued without a captured state.
            throw new IllegalArgumentException("current is null");
        }
        String lemma = tokens.removeFirst();
        restoreState(current);
        termAtt.setEmpty().append(lemma);
        posIncAtt.setPositionIncrement(0);
    }

    /**
     * Looks up the current term in the dictionary.
     *
     * @return the dictionary value for the current term, or {@code null} if the
     *         dictionary returned {@code null}
     * @throws CharacterCodingException if the dictionary lookup fails with it
     */
    private String lookupLemma() throws CharacterCodingException {
        CharSequence lemma = dictionary.lookup(termAtt.toString());
        return lemma == null ? null : lemma.toString();
    }

    /**
     * Resets the input and drops any pending lemma together with its captured state.
     *
     * @throws IOException if resetting the input fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        tokens.clear();
        current = null;
    }

    /**
     * Two filters are equal when their pending lemmas, dictionaries and both mode flags
     * are equal. Note that the pending lemmas are transient state, so the result (and
     * {@link #hashCode()}) can change while a stream is being consumed.
     */
    @Override
    public boolean equals(Object object) {
        if (!(object instanceof LemmatizeTokenFilter)) {
            return false;
        }
        LemmatizeTokenFilter other = (LemmatizeTokenFilter) object;
        return tokens.equals(other.tokens)
                && dictionary.equals(other.dictionary)
                && respectKeywords == other.respectKeywords
                && lemmaOnly == other.lemmaOnly;
    }

    @Override
    public int hashCode() {
        return tokens.hashCode() ^ dictionary.hashCode()
                ^ Boolean.hashCode(respectKeywords) ^ Boolean.hashCode(lemmaOnly);
    }
}