package edu.wiki.index; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.util.ArrayList; import java.util.Set; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.CustomFilter; import org.apache.lucene.analysis.CustomTokenizer; import org.apache.lucene.analysis.LengthFilter; import org.apache.lucene.analysis.LetterTokenizer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.PorterStemFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.standard.StandardFilter; public class WikipediaAnalyzer extends Analyzer { /** An unmodifiable set containing some common English words that are not usually useful for searching.*/ public final Set<?> ENGLISH_STOP_WORDS_SET; public WikipediaAnalyzer() throws IOException { // read stop words InputStream is = ESAWikipediaIndexer.class.getResourceAsStream("/config/stopwords.txt"); BufferedReader br = new BufferedReader(new InputStreamReader(is)); ArrayList<String> stopWords = new ArrayList<String>(500); String line; while((line = br.readLine()) != null){ line = line.trim(); if(!line.equals("")){ stopWords.add(line.trim()); } } br.close(); final CharArraySet stopSet = new CharArraySet(stopWords.size(), false); stopSet.addAll(stopWords); ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); } public TokenStream reusableTokenStream( String fieldName, Reader reader) throws IOException { SavedStreams streams = (SavedStreams) getPreviousTokenStream(); if (streams == null) { streams = new SavedStreams(); setPreviousTokenStream(streams); // streams.tokenizer = new LetterTokenizer(reader); streams.tokenizer = new CustomTokenizer(reader); streams.stream = new StandardFilter(streams.tokenizer); streams.stream = new LengthFilter(streams.stream, 3, 100); streams.stream = new LowerCaseFilter(streams.stream); // streams.stream = new StopFilter(true, streams.stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET); streams.stream = new StopFilter(true, streams.stream, ENGLISH_STOP_WORDS_SET); streams.stream = new CustomFilter(streams.stream); streams.stream = new PorterStemFilter(streams.stream); streams.stream = new PorterStemFilter(streams.stream); streams.stream = new PorterStemFilter(streams.stream); } else { streams.tokenizer.reset(reader); } return streams.stream; } private class SavedStreams { Tokenizer tokenizer; TokenStream stream; } public TokenStream tokenStream( String fieldName, Reader reader) { // Tokenizer tokenizer = new LetterTokenizer(reader); Tokenizer tokenizer = new CustomTokenizer(reader); TokenStream stream = new StandardFilter(tokenizer); stream = new LengthFilter(stream, 3, 100); stream = new LowerCaseFilter(stream); // stream = new StopFilter(true, stream, StopAnalyzer.ENGLISH_STOP_WORDS_SET); stream = new StopFilter(true, stream, ENGLISH_STOP_WORDS_SET); stream = new CustomFilter(stream); stream = new PorterStemFilter(stream); stream = new PorterStemFilter(stream); stream = new PorterStemFilter(stream); return stream; } }