package storm.cookbook.tfidf.functions; import java.io.File; import java.net.URL; import java.util.Arrays; import java.util.List; import java.util.Map; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.search.spell.PlainTextDictionary; import org.apache.lucene.search.spell.SpellChecker; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import storm.trident.operation.BaseFunction; import storm.trident.operation.TridentCollector; import storm.trident.operation.TridentOperationContext; import storm.trident.tuple.TridentTuple; public class TermFilter extends BaseFunction { private SpellChecker spellchecker; private List<String> filterTerms = Arrays.asList(new String[] { "http" }); Logger LOG = LoggerFactory.getLogger(TermFilter.class); private static final long serialVersionUID = 1L; @Override public void prepare(Map conf, TridentOperationContext context) { super.prepare(conf, context); File dir = new File(System.getProperty("user.home") + "/dictionaries"); Directory directory; try { directory = FSDirectory.open(dir); spellchecker = new SpellChecker(directory); StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer); URL dictionaryFile = TermFilter.class.getResource("/dictionaries/fulldictionary00.txt"); spellchecker.indexDictionary(new PlainTextDictionary(new File(dictionaryFile.toURI())), config, true); } catch (Exception e) { LOG.error(e.toString()); } } private boolean shouldKeep(String stem) { if (stem == null) return false; if (stem.equals("")) return false; if (filterTerms.contains(stem)) return false; // we don't want integers try { Integer.parseInt(stem); return false; } catch (Exception e) { } // or floating point numbers try { Double.parseDouble(stem); return false; } catch (Exception e) { } try { return spellchecker.exist(stem); } catch (Exception e) { LOG.error(e.toString()); return false; } } public boolean isKeep(TridentTuple tuple) { LOG.debug("Filtering Tuple"); return shouldKeep(tuple.getString(0)); } public void execute(TridentTuple tuple, TridentCollector collector) { if (isKeep(tuple)) { collector.emit(tuple); } } }