package doser.lucene.analysis;

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.analysis.util.WordlistLoader;
import org.apache.lucene.util.Version;

public final class DoserStandardAnalyzer extends StopwordAnalyzerBase {

    /** Default maximum allowed token length. */
    public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;

    /**
     * An unmodifiable set containing some common English words that are
     * usually not useful for searching.
     */
    public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

    /**
     * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
     */
    public DoserStandardAnalyzer() {
        this(STOP_WORDS_SET);
    }

    /**
     * Builds an analyzer with the given stop words.
     *
     * @param stopWords
     *            stop words
     */
    public DoserStandardAnalyzer(CharArraySet stopWords) {
        super(stopWords);
    }

    /**
     * Builds an analyzer with the stop words from the given reader.
     *
     * @see WordlistLoader#getWordSet(Reader, Version)
     * @param stopwords
     *            Reader to read stop words from
     */
    public DoserStandardAnalyzer(Reader stopwords) throws IOException {
        this(loadStopwordSet(stopwords));
    }

    @Override
    protected TokenStreamComponents createComponents(final String fieldName,
            final Reader reader) {
        final DoserStandardTokenizer src = new DoserStandardTokenizer(reader);
        // Propagate the configured limit to the tokenizer; without this call,
        // setMaxTokenLength() on the analyzer silently had no effect. Assumes
        // DoserStandardTokenizer exposes setMaxTokenLength(int) like Lucene's
        // StandardTokenizer, which this class mirrors.
        src.setMaxTokenLength(maxTokenLength);
        TokenStream tok = new StandardFilter(src);
        tok = new LowerCaseFilter(tok);
        // Apply the configured stop words; the stopwords field inherited from
        // StopwordAnalyzerBase was previously accepted but never used.
        tok = new StopFilter(tok, stopwords);
        return new TokenStreamComponents(src, tok) {
            @Override
            protected void setReader(final Reader reader) throws IOException {
                // Re-apply the (possibly updated) limit when the analyzer
                // reuses this component chain with a new reader.
                src.setMaxTokenLength(DoserStandardAnalyzer.this.maxTokenLength);
                super.setReader(reader);
            }
        };
    }

    /**
     * @see #setMaxTokenLength
     */
    public int getMaxTokenLength() {
        return maxTokenLength;
    }

    /**
     * Sets the maximum allowed token length. If a token is seen that exceeds
     * this length then it is discarded. This setting only takes effect the
     * next time {@link #tokenStream} is called.
     */
    public void setMaxTokenLength(int length) {
        maxTokenLength = length;
    }
}
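
/*
 * A minimal usage sketch, not part of the original class: it runs the
 * analyzer over a sample string and prints each emitted token, illustrating
 * the tokenize -> StandardFilter -> lowercase -> stop-word pipeline built in
 * createComponents above. The field name "body" and the sample text are
 * arbitrary; a Lucene 4.10.x classpath is assumed, along with a
 * DoserStandardTokenizer that behaves like Lucene's StandardTokenizer.
 */
class DoserStandardAnalyzerDemo {

    public static void main(String[] args) throws IOException {
        DoserStandardAnalyzer analyzer = new DoserStandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("body",
                new java.io.StringReader("The Quick Brown Fox jumped"))) {
            org.apache.lucene.analysis.tokenattributes.CharTermAttribute term = stream
                    .addAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
            stream.reset(); // required before the first incrementToken()
            while (stream.incrementToken()) {
                // With the default stop set, prints: quick, brown, fox, jumped
                System.out.println(term.toString());
            }
            stream.end(); // finalize end-of-stream state
        }
    }
}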