package org.apache.solr.analysis.author;

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Harvests ("eats") the alternative spellings and variations of author
 * names so that those variations do not get indexed.
 *
 * <p>Selection is driven purely by the token type: a set of type names is
 * registered through {@link #setTokenTypes(List)}, and
 * {@link #setEmitTokens(boolean)} decides which side of that set survives:
 * <ul>
 *   <li>{@code emitTokens == false} (the default): tokens whose type is in
 *       the set are dropped, everything else is passed through;</li>
 *   <li>{@code emitTokens == true}: only tokens whose type is in the set
 *       are passed through, everything else is dropped.</li>
 * </ul>
 *
 * <p>Design note: the tokenizer chain does more work this way than if the
 * generation and the harvesting were done in a single tokenizer; clarity
 * was deliberately preferred over speed. Should this prove too slow, the
 * change should be reverted.
 */
public final class AuthorCollectorFilter extends TokenFilter {

  public static final Logger log = LoggerFactory.getLogger(AuthorCollectorFilter.class);

  // Registered on the stream in the constructor; not read by this filter.
  private final CharTermAttribute termAtt;
  // Type of the token currently held by the upstream.
  private final TypeAttribute typeAtt;
  // Token types this filter acts upon; starts empty and only ever grows.
  private final Set<String> tokenTypes = new HashSet<String>();
  // false: drop the configured types; true: keep only the configured types.
  private boolean emitTokens = false;

  /**
   * Wraps the given stream; no token types are configured initially.
   *
   * @param input the upstream token stream to filter
   */
  public AuthorCollectorFilter(TokenStream input) {
    super(input);
    termAtt = addAttribute(CharTermAttribute.class);
    typeAtt = addAttribute(TypeAttribute.class);
  }

  /**
   * Advances to the next token that passes the type filter.
   *
   * <p>Upstream tokens are consumed until one is found whose membership in
   * the configured type set equals the {@code emitTokens} flag; rejected
   * tokens are silently skipped.
   *
   * @return {@code true} if an accepted token is now current,
   *         {@code false} once the upstream is exhausted
   * @throws IOException if the upstream stream fails
   * @see org.apache.lucene.analysis.TokenStream#incrementToken()
   */
  @Override
  public boolean incrementToken() throws IOException {
    while (input.incrementToken()) {
      boolean isConfiguredType = tokenTypes.contains(typeAtt.type());
      // Accept when "is a configured type" agrees with "emit configured types".
      if (isConfiguredType == emitTokens) {
        return true;
      }
    }
    return false;
  }

  /**
   * Chooses the filtering mode.
   *
   * @param b {@code true} to pass only tokens of the configured types,
   *          {@code false} to drop tokens of the configured types
   */
  public void setEmitTokens(boolean b) {
    emitTokens = b;
  }

  /**
   * Adds the given type names to the set of configured token types.
   * Previously registered types are kept; repeated calls accumulate.
   *
   * @param tokenTypes type names to add
   */
  public void setTokenTypes(List<String> tokenTypes) {
    this.tokenTypes.addAll(tokenTypes);
  }
}