/* * Source code for Listing 9.5 * */ package mia.clustering.ch09; import java.io.IOException; import java.io.Reader; import java.io.StringReader; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.LowerCaseFilter; import org.apache.lucene.analysis.StopFilter; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.WhitespaceTokenizer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardFilter; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.lucene.util.Version; public class MyAnalyzer extends Analyzer { private final Pattern alphabets = Pattern.compile("[a-z]+"); @Override public TokenStream tokenStream(String fieldName, Reader reader) { TokenStream result = new StandardTokenizer(Version.LUCENE_CURRENT, reader); result = new StandardFilter(result); result = new LowerCaseFilter(result); result = new StopFilter(true, result, StandardAnalyzer.STOP_WORDS_SET); TermAttribute termAtt = (TermAttribute) result.addAttribute(TermAttribute.class); StringBuilder buf = new StringBuilder(); try { while (result.incrementToken()) { if (termAtt.termLength() < 3) continue; String word = new String(termAtt.termBuffer(), 0, termAtt.termLength()); Matcher m = alphabets.matcher(word); if (m.matches()) { buf.append(word).append(" "); } } } catch (IOException e) { e.printStackTrace(); } return new WhitespaceTokenizer(new StringReader(buf.toString())); } }