package qa.qcri.aidr.collector.collectors; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.json.JsonObject; import org.apache.log4j.Logger; import qa.qcri.aidr.collector.beans.CollectionTask; import qa.qcri.aidr.collector.java7.Predicate; /** * Main class to implement everything related to keywords filtering and validations. * */ public class TrackFilter implements Predicate<JsonObject> { private static Logger logger = Logger.getLogger(TrackFilter.class.getName()); private String[] toTrack = null; private Set<KeywordPredicate> simpleWordBasedPredicates = null; private Set<KeywordPredicate> phraseBasedPredicates = null; private String patternString = "([^\"]\\S*|\".+?\")\\s*"; private String phrasePatternString = ".*\".*\".*"; private Pattern pattern = null; public TrackFilter() { pattern = Pattern.compile(patternString); } public TrackFilter(CollectionTask task){ this(); if (task != null) { this.setToTrack(task.getToTrack()); } else { logger.error("Collection can't be null!"); } } public TrackFilter(String keywords) { this(); this.setToTrack(keywords); } public String[] getToTrack() { return this.toTrack; } public void setToTrack(final String keywords) { if (keywords != null && !keywords.isEmpty()) { simpleWordBasedPredicates = new HashSet<KeywordPredicate>(); phraseBasedPredicates = new HashSet<KeywordPredicate>(); this.toTrack = keywords.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)", -1); // split on comma, ignoring those within double quotations for (int i = 0; i < toTrack.length;i++) { this.toTrack[i] = this.toTrack[i].trim().toLowerCase(); // remove leading and trailing whitespaces, make all lower-case KeywordPredicate pred = new KeywordPredicate(this.toTrack[i]); // Now divide the list into two: simpleWordBased (strings without any quotes) and phraseBased (string of form "*"*) if (this.toTrack[i].matches(phrasePatternString)) { //System.out.println("Matched phrase pattern, string = " + this.toTrack[i]); this.phraseBasedPredicates.add(pred); } else { // simpleWord: either single or multi-word keyword simpleWordBasedPredicates.add(pred); } } } else { this.toTrack = null; } } private Set<String> createTweetSetOfWords(String tweetText) { Set<String> tweetTextSet = toLowerCase(new HashSet<String>(splitOnWhitespace(tweetText))); // first remove all punctuation Set<String> strippedPuncts = new HashSet<String>(); Iterator<String> itr = tweetTextSet.iterator(); while (itr.hasNext()) { String word = itr.next(); int start = word.indexOf("\""); int end = word.lastIndexOf("\""); if (start != -1) { if (end > start) { String strippedWord = word.substring(start+1, end); strippedPuncts.add(strippedWord); } else { // Handle the special case of malformed quotations: runaway quote in text String strippedWord = word.substring(start+1); strippedPuncts.add(strippedWord); } itr.remove(); } } tweetTextSet.addAll(strippedPuncts); // Next, handle all #-tagged words in the tweet text // For each #-tagged word, add also the word following the #-tag to the tweetTextSet Set<String> hashWordSet = new HashSet<String>(); for (String w: tweetTextSet) { /* String[] hashSplit = w.split("#"); for (int i = 0;i < hashSplit.length;i++) { hashSplit[i] = hashSplit[i].trim(); } Set<String> tokensSet = new HashSet<String>(Arrays.asList(hashSplit)); tokensSet.removeAll(Collections.singleton("")); hashWordSet.addAll(tokensSet); */ if (w.startsWith("#")) { // This is stricter check than the above commented code String strippedHash = w.substring(1); hashWordSet.add(strippedHash); } } tweetTextSet.addAll(hashWordSet); return tweetTextSet; } private Set<String> toLowerCase(Set<String> wordSet) { Set<String> toLower = new HashSet<String>(); for (String word : wordSet) { toLower.add(word.toLowerCase()); } return toLower; } @Override public boolean test(JsonObject t) { if (null == toTrack) return true; String tweetText = t.get("text").toString(); if (null == tweetText) { return false; // there are filter-keywords but no text and hence reject tweet } // Otherwise test the tweet text for matching at least one of the keywords boolean result = hasKeyWords(tweetText); //logger.info("Filtering result for tweet text : \"" + tweetText + "\": " + result); return result; } public boolean test(String text) { if (null == toTrack) return true; String tweetText = text.replaceAll("\"", ""); //System.out.println("Unquoted tweet text: " + tweetText); if (null == tweetText) { return false; // there are filter-keywords but no text and hence reject tweet } boolean result = hasKeyWords(tweetText); //logger.info("Filtering result for text : \"" + tweetText + "\": " + result); return result; } private boolean matchSimplePredicates(Set<String> tweetTextSet) { for (KeywordPredicate predicate: this.simpleWordBasedPredicates) { boolean flag = true; for (String word: predicate.getUnorderedWords()) { //System.out.println("For keyword in simple predicate = " + word + ", contained in = " + tweetTextSet.contains(word)); if (!tweetTextSet.contains(word)) { flag = false; break; } } if (flag) { //System.out.println("Simple Predicate match found: " + predicate); return true; // found a match! } } return false; } private boolean matchPhrasePredicates(String tweetText, Set<String> tweetTextSet) { for (KeywordPredicate predicate: this.phraseBasedPredicates) { boolean flag = true; for (String word: predicate.getUnorderedWords()) { //System.out.println("For unordered keyword in phrase predicate = " + word + ", contained in = " + tweetTextSet.contains(word)); if (!tweetTextSet.contains(word)) { flag = false; break; } } if (!flag) { //System.out.println("Simple word Predicate match NOT found "); return false; // Didn't find a match } // Otherwise, check for phrases too, in original tweet text for (String phrase: predicate.getPhraseSet()) { flag = false; //System.out.println("For phrase = " + phrase + ", contained in = " + tweetText.contains(phrase)); if (tweetText.contains(phrase)) { //System.out.println("For phrase = " + phrase + " match found: " + tweetText.contains(phrase)); flag = true; break; } } if (flag) { //System.out.println("Phrase Predicate match found "); return true; // found a match! } } return false; } private boolean hasKeyWords(final String tweetText) { Set<String> tweetTextSet = createTweetSetOfWords(tweetText); // first test simplePredicates boolean result = matchSimplePredicates(tweetTextSet); if (result) return result; // Found a match // Otherwise, we need to check for phrasePredicates result = !this.phraseBasedPredicates.isEmpty() ? matchPhrasePredicates(tweetText, tweetTextSet) : false; return result; } /** * * @param str string to split * @return */ private List<String> splitOnWhitespace(String str) { List<String> list = new ArrayList<String>(); Matcher m = pattern.matcher(str); while (m.find()) { list.add(m.group(1)); // Add .replace("\"", "") to remove surrounding quotes. } return list; } public static void main(String args[]) throws Exception { String tweet = "The quick brown fox jumped over the internet #fence #Fox @google \"yeah right!\""; String keywords = "hello, brown Internets, \"yeah, babby\", \"yeah right\", \"yeah right!\""; TrackFilter filter = new TrackFilter(keywords); System.out.println("Keyword List: "); for (String w: filter.getToTrack()) { System.out.println(w); } System.out.println("Match result = " + filter.test(tweet)); String hashString = "abc#a ab ##a # #a#b"; String[] tokens = hashString.split("#"); for (int i = 0; i < tokens.length;i++) { tokens[i] = tokens[i].trim(); System.out.println("<start>" + tokens[i] + "<end>"); } Set<String> tokensList = new HashSet<String>(Arrays.asList(tokens)); tokensList.removeAll(Collections.singleton("")); for (String s: tokensList) { System.out.println("<START>" + s + "<END>"); } } @Override public String getFilterName() { return this.getClass().getSimpleName(); } }