/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package di.uniba.it.tri.tokenizer; import java.io.Reader; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.PorterStemFilter; import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.util.CharArraySet; import org.apache.lucene.analysis.util.StopwordAnalyzerBase; import org.apache.lucene.util.Version; /** * * @author pierpaolo */ public class EnglishNoStemAnalyzer extends StopwordAnalyzerBase { /** * Returns an unmodifiable instance of the default stop words set. * * @return default stop words set. */ public static CharArraySet getDefaultStopSet() { return DefaultSetHolder.DEFAULT_STOP_SET; } /** * Atomically loads the DEFAULT_STOP_SET in a lazy fashion once the outer * class accesses the static final set the first time.; */ private static class DefaultSetHolder { static final CharArraySet DEFAULT_STOP_SET = StandardAnalyzer.STOP_WORDS_SET; } public EnglishNoStemAnalyzer() { this(DefaultSetHolder.DEFAULT_STOP_SET); } /** * Builds an analyzer with the given stop words. * * @param stopwords a stopword set */ public EnglishNoStemAnalyzer(CharArraySet stopwords) { super(stopwords); } /** * Creates a * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} which * tokenizes all the text in the provided {@link Reader}. * * @param fieldName * @param reader * @return A * {@link org.apache.lucene.analysis.Analyzer.TokenStreamComponents} built * from an {@link StandardTokenizer} filtered with null null null null {@link StandardFilter}, {@link EnglishPossessiveFilter}, * {@link LowerCaseFilter}, {@link StopFilter} * , {@link SetKeywordMarkerFilter} if a stem exclusion set is provided and * {@link PorterStemFilter}. */ @Override protected TokenStreamComponents createComponents(String fieldName, Reader reader) { final Tokenizer source = new StandardTokenizer(reader); TokenStream result = new StandardFilter(source); // prior to this we get the classic behavior, standardfilter does it for us. result = new EnglishPossessiveFilter(Version.LATEST, result); result = new LowerCaseFilter(result); result = new StopFilter(result, stopwords); return new TokenStreamComponents(source, result); } }