/* * Source code for Listing 12.3 * */ package mia.clustering.ch12; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import org.apache.commons.codec.language.DoubleMetaphone; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.PorterStemFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.Version; @SuppressWarnings("deprecation") public class TwitterAnalyzer extends Analyzer { private DoubleMetaphone filter = new DoubleMetaphone(); @Override public TokenStream tokenStream(String fieldName, Reader reader) { final TokenStream result = new PorterStemFilter(new StopFilter( true, new StandardTokenizer(Version.LUCENE_CURRENT, reader), StandardAnalyzer.STOP_WORDS_SET)); TermAttribute termAtt = (TermAttribute) result .addAttribute(TermAttribute.class); StringBuilder buf = new StringBuilder(); try { while (result.incrementToken()) { String word = new String(termAtt.termBuffer(), 0, termAtt .termLength()); buf.append(filter.encode(word)).append(" "); } } catch (IOException e) { e.printStackTrace(); } return new WhitespaceTokenizer(new StringReader(buf.toString())); } }