package experiments.collective.entdoccentric.query;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;

/**
 * Analyzer whose token stream consists solely of a {@link CalbCTokenizer}.
 *
 * <p>
 * NOTE(review): the stop word set given to the constructors is handed to
 * {@link StopwordAnalyzerBase}, and the maximum token length can be configured
 * through {@link #setMaxTokenLength(int)}, but
 * {@link #createComponents(String, Reader)} as written uses neither of them:
 * no filter is chained behind the tokenizer and the length is not forwarded to
 * it. Confirm whether this is intentional before relying on either setting.
 */
public final class CalbCAnalyzer extends StopwordAnalyzerBase {

    /** Default maximum allowed token length. */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

    /**
     * An unmodifiable set containing some common English words that are usually
     * not useful for searching. This is the same instance as
     * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}.
     */
    public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    /** Configured maximum token length; see {@link #setMaxTokenLength(int)}. */
    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param matchVersion
     *            Lucene version to match
     * @param stopWords
     *            stop words
     */
    public CalbCAnalyzer(Version matchVersion, CharArraySet stopWords) {
        super(matchVersion, stopWords);
    }

    /**
     * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
     *
     * @param matchVersion
     *            Lucene version to match
     */
    public CalbCAnalyzer(Version matchVersion) {
        this(matchVersion, STOP_WORDS_SET);
    }

    /**
     * Builds an analyzer with the stop words read from the given reader. The
     * reader is passed to {@code loadStopwordSet(Reader, Version)} inherited
     * from {@link StopwordAnalyzerBase}.
     *
     * @see org.apache.lucene.analysis.util.WordlistLoader#getWordSet(Reader, Version)
     * @param matchVersion
     *            Lucene version to match
     * @param stopwords
     *            Reader to read stop words from
     * @throws IOException
     *             if loading the stop word set fails
     */
    public CalbCAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
        this(matchVersion, loadStopwordSet(stopwords, matchVersion));
    }

    /**
     * Sets the maximum allowed token length.
     *
     * <p>
     * NOTE(review): the value is only stored in this analyzer.
     * {@link #createComponents(String, Reader)} does not pass it on to the
     * tokenizer, so as written it has no influence on tokenization.
     *
     * @param length
     *            the new maximum token length
     */
    public void setMaxTokenLength(int length) {
        maxTokenLength = length;
    }

    /**
     * Returns the configured maximum token length.
     *
     * @return the value last given to {@link #setMaxTokenLength(int)}, or
     *         {@link #DEFAULT_MAX_TOKEN_LENGTH} if it was never called
     * @see #setMaxTokenLength
     */
    public int getMaxTokenLength() {
        return maxTokenLength;
    }

    /**
     * Creates the analysis chain for a field: a single {@link CalbCTokenizer}
     * over the supplied reader, with no further token filters.
     *
     * @param fieldName
     *            name of the field being analyzed (not used here)
     * @param reader
     *            source of the text to tokenize
     * @return components wrapping the tokenizer
     */
    @Override
    protected TokenStreamComponents createComponents(final String fieldName, final Reader reader) {
        // The former anonymous subclass only delegated setReader to super, so a
        // plain TokenStreamComponents behaves identically.
        return new TokenStreamComponents(new CalbCTokenizer(matchVersion, reader));
    }
}