package plugins.LuceneIndex;
import java.io.Reader;
//import java.util.Hashtable;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
//import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.StopFilter;
//import org.apache.lucene.analysis.LowerCaseTokenizer;
import org.apache.lucene.analysis.PorterStemFilter;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
/**
* PorterStemAnalyzer processes input text by stemming English words to their
* roots. This Analyzer also converts the input to lower case and removes stop
* words. A small set of default stop words is defined in the STOP_WORDS array,
* but a caller can specify an alternative set of stop words by calling
* non-default constructor.
*/
public class PorterStemAnalyzer extends Analyzer
{
private Set<Object> stopSet;
public PorterStemAnalyzer()
{
this(STOP_WORDS);
}
public static final String[] STOP_WORDS =
{ "a", "you", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "aren't",
"as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "can't",
"cannot", "could", "couldn't", "did", "didn't", "do", "does", "doesn't", "doing", "don't", "down",
"during", "each", "few", "for", "from", "further", "had", "hadn't", "has", "hasn't", "have", "haven't",
"having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself",
"his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "isn't", "it", "it's",
"its", "itself", "let's", "me", "more", "most", "mustn't", "my", "myself", "no", "nor", "not", "of", "off",
"on", "once", "only", "or", "other", "ought", "our", "ours ", " ourselves", "out", "over", "own", "same",
"shan't", "she", "she'd", "she'll", "she's", "should", "shouldn't", "so", "some", "such", "than", "that",
"that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they",
"they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
"very", "was", "wasn't", "we", "we'd", "we'll", "we're", "we've", "were", "weren't", "what", "what's",
"when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with",
"won't", "would", "wouldn't", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself",
"yourselves" }; // StopAnalyzer.ENGLISH_STOP_WORDS;
public PorterStemAnalyzer(String[] stopWords)
{
stopSet = StopFilter.makeStopSet(stopWords);
}
public TokenStream tokenStream(String fieldName, Reader reader)
{
TokenStream result = new StandardTokenizer(Version.LUCENE_30, reader);
result = new StandardFilter(result);
result = new LowerCaseFilter(result);
result = new PorterStemFilter(result);
result = new StopFilter(true, result, stopSet, true);
return result;
}
}