package storm.cookbook.tfidf.functions;

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import backtype.storm.tuple.Values;
import edu.washington.cs.knowitall.morpha.MorphaStemmer;
import storm.cookbook.tfidf.TfidfTopologyFields;
import storm.trident.operation.BaseFunction;
import storm.trident.operation.TridentCollector;
import storm.trident.tuple.TridentTuple;

public class DocumentTokenizer extends BaseFunction {

    private static final long serialVersionUID = 1L;

    // Static so the (non-serializable) logger is not serialized along with the function.
    private static final Logger LOG = LoggerFactory.getLogger(DocumentTokenizer.class);

    @Override
    public void execute(TridentTuple tuple, TridentCollector collector) {
        String documentContents = tuple.getStringByField(TfidfTopologyFields.DOCUMENT);
        TokenStream ts = null;
        try {
            // Tokenize the document and filter out English stop words.
            ts = new StopFilter(Version.LUCENE_30,
                    new StandardTokenizer(Version.LUCENE_30,
                            new StringReader(documentContents)),
                    StopAnalyzer.ENGLISH_STOP_WORDS_SET);
            CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
            while (ts.incrementToken()) {
                // Reduce each token to its lemma, strip line breaks, and emit it.
                String lemma = MorphaStemmer.stemToken(termAtt.toString());
                lemma = lemma.trim().replaceAll("\n", "").replaceAll("\r", "");
                collector.emit(new Values(lemma));
            }
        } catch (IOException e) {
            LOG.error(e.toString());
        } finally {
            // The stream is closed once here; the extra close inside the try block was redundant.
            if (ts != null) {
                try {
                    ts.close();
                } catch (IOException e) {
                    // Ignore failures while closing the token stream.
                }
            }
        }
    }
}
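
/*
 * A minimal usage sketch, assuming a Trident topology along the lines of this recipe.
 * The spout variable, the stream name "documents", and the output field name "dirtyTerm"
 * are illustrative assumptions, not the recipe's exact wiring.
 *
 *   TridentTopology topology = new TridentTopology();
 *   topology.newStream("documents", documentSpout)
 *           .each(new Fields(TfidfTopologyFields.DOCUMENT),
 *                 new DocumentTokenizer(),
 *                 new Fields("dirtyTerm"));
 *
 * Each input tuple's document field is split into stemmed, stop-word-free terms,
 * with one output tuple emitted per term.
 */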