package com.produban.openbus.analysis;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import storm.trident.operation.BaseFilter;
import storm.trident.tuple.TridentTuple;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

import static com.produban.openbus.util.Common.join;

/**
 * Trident filter that allows only messages that contain at least one keyword.
 *
 * <p>This filter expects the text to be present in the first field of the input tuple.
 *
 * <p>Matching is case-insensitive and works on whole keywords: a keyword matches when it is
 * not directly preceded or followed by a word character ({@code [a-zA-Z_0-9]}). A keyword at
 * the very start or end of the text, or inside multi-line text, therefore matches too.
 * Keywords are treated as literal text, never as regular expressions.
 *
 * <p>If the text contains one or more keywords, it will pass. If no usable keyword was
 * configured, or the tuple carries no text, the tuple is filtered out.
 */
public class KeywordsFilter extends BaseFilter {

    private static final Logger logger = LoggerFactory.getLogger(KeywordsFilter.class);

    /** Empty negative lookahead: a regex that can never match any input. */
    private static final String MATCH_NOTHING = "(?!)";

    /** Source of {@link #pattern}; kept package-visible as in earlier versions of this class. */
    String regex;

    /**
     * Compiled once so the regex is not recompiled for every tuple.
     * {@link Pattern} implements {@link java.io.Serializable}.
     * NOTE(review): presumably the filter instance gets serialized by Storm - confirm.
     */
    private final Pattern pattern;

    /**
     * Builds a filter for the given keywords.
     *
     * <p>The supplied list is only read, never modified. {@code null} and empty entries are
     * ignored.
     *
     * @param keywords keywords to look for; must not be {@code null}
     * @throws NullPointerException if {@code keywords} is {@code null}
     */
    public KeywordsFilter(List<String> keywords) {
        if (keywords == null) {
            throw new NullPointerException("keywords");
        }

        List<String> lowered = new ArrayList<String>(keywords.size());
        List<String> quoted = new ArrayList<String>(keywords.size());
        for (String keyword : keywords) {
            // An empty alternative would match everywhere, so skip unusable entries.
            if (keyword == null || keyword.isEmpty()) {
                continue;
            }
            // Locale.ROOT keeps the lowercasing independent of the JVM default locale.
            String lower = keyword.toLowerCase(Locale.ROOT);
            lowered.add(lower);
            // Quote so that regex metacharacters in a keyword (e.g. "c++") are matched literally.
            quoted.add(Pattern.quote(lower));
        }
        logger.info("filter keywords: {}", lowered);

        if (quoted.isEmpty()) {
            this.regex = MATCH_NOTHING;
        } else {
            // Lookarounds instead of \W...\W so that keywords at the start or end of the text
            // match as well. join is assumed to concatenate the elements with the separator.
            this.regex = "(?<!\\w)(?:" + join(quoted, "|") + ")(?!\\w)";
        }
        this.pattern = Pattern.compile(this.regex);
    }

    /**
     * Keeps a tuple only when its first field contains at least one keyword.
     *
     * @param objects input tuple; the text is read from field 0
     * @return {@code true} if the text contains a keyword, {@code false} if it does not or if
     *         the field is {@code null}
     */
    @Override
    public boolean isKeep(TridentTuple objects) {
        String text = objects.getString(0);
        // Check for null before touching the string; lowercasing first would throw.
        if (text == null) {
            logger.info("no text could be retrieved from tuple");
            return false;
        }

        // find() scans the whole text, including across line breaks.
        boolean matched = pattern.matcher(text.toLowerCase(Locale.ROOT)).find();
        // Per-tuple outcome is logged at debug level to avoid flooding the log.
        logger.debug(matched ? "matches!" : "filtered!");
        return matched;
    }
}