package qa.qcri.aidr.collector.collectors;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.json.JsonObject;
import org.apache.log4j.Logger;
import qa.qcri.aidr.collector.beans.CollectionTask;
import qa.qcri.aidr.collector.java7.Predicate;
/**
 * Keyword filter for tweets: decides whether a tweet text matches at least one of the
 * configured tracking keywords.
 *
 * <p>The keyword string is split on commas that are not enclosed in double quotes. Every
 * keyword is trimmed, lower-cased and wrapped in a {@link KeywordPredicate}. Keywords that
 * contain a pair of double quotes are treated as phrase-based predicates, all others as
 * simple word-based predicates.
 *
 * <p>When no keywords are configured (null or empty keyword string) every tweet is accepted.
 */
public class TrackFilter implements Predicate<JsonObject> {

    private static final Logger logger = Logger.getLogger(TrackFilter.class.getName());

    /**
     * Tokenizer for tweet text: either a run of non-whitespace characters that does not start
     * with a double quote, or a double-quoted span (which may contain whitespace). The first
     * alternative excludes whitespace as its first character so that a token can never begin
     * with a blank.
     */
    private static final Pattern TOKEN_PATTERN = Pattern.compile("([^\"\\s]\\S*|\".+?\")\\s*");

    /** Recognises keywords containing at least one pair of double quotes (phrase keywords). */
    private static final Pattern PHRASE_PATTERN = Pattern.compile(".*\".*\".*");

    /** Splits on commas, ignoring commas that sit inside double quotes. */
    private static final String KEYWORD_SEPARATOR_REGEX = ",(?=([^\"]*\"[^\"]*\")*[^\"]*$)";

    /** Normalised (trimmed, lower-cased) keywords; {@code null} when no filtering is configured. */
    private String[] toTrack = null;

    /** Predicates built from keywords without quotes. Never {@code null}. */
    private Set<KeywordPredicate> simpleWordBasedPredicates = new HashSet<KeywordPredicate>();

    /** Predicates built from keywords containing a quoted phrase. Never {@code null}. */
    private Set<KeywordPredicate> phraseBasedPredicates = new HashSet<KeywordPredicate>();

    /** Creates a filter without keywords; such a filter accepts every tweet. */
    public TrackFilter() {
    }

    /**
     * Creates a filter from the keywords of a collection task.
     *
     * @param task the collection task; when {@code null} an error is logged and the filter
     *             is left without keywords
     */
    public TrackFilter(CollectionTask task) {
        this();
        if (task != null) {
            this.setToTrack(task.getToTrack());
        } else {
            logger.error("Collection can't be null!");
        }
    }

    /**
     * Creates a filter from a comma-separated keyword string.
     *
     * @param keywords comma-separated keywords; may be {@code null} or empty
     */
    public TrackFilter(String keywords) {
        this();
        this.setToTrack(keywords);
    }

    /**
     * @return the normalised keywords, or {@code null} when no keywords are configured
     */
    public String[] getToTrack() {
        return this.toTrack;
    }

    /**
     * Replaces the configured keywords and rebuilds both predicate sets.
     *
     * @param keywords comma-separated keywords (commas inside double quotes do not split);
     *                 {@code null} or empty disables filtering
     */
    public void setToTrack(final String keywords) {
        // Always start from empty sets so predicates of an earlier call never linger.
        simpleWordBasedPredicates = new HashSet<KeywordPredicate>();
        phraseBasedPredicates = new HashSet<KeywordPredicate>();

        if (keywords == null || keywords.isEmpty()) {
            this.toTrack = null;
            return;
        }

        String[] parts = keywords.split(KEYWORD_SEPARATOR_REGEX, -1);
        for (int i = 0; i < parts.length; i++) {
            // remove leading and trailing whitespace, make all lower-case
            parts[i] = parts[i].trim().toLowerCase();
            KeywordPredicate predicate = new KeywordPredicate(parts[i]);
            if (PHRASE_PATTERN.matcher(parts[i]).matches()) {
                phraseBasedPredicates.add(predicate);
            } else {
                // simple keyword: either a single word or several unquoted words
                simpleWordBasedPredicates.add(predicate);
            }
        }
        this.toTrack = parts;
    }

    /**
     * Builds the set of lower-cased words of a tweet. Tokens containing a double quote are
     * replaced by the text between their first and last quote (or the text after a lone
     * quote), and for every word starting with {@code #} the word without that leading
     * {@code #} is added as well.
     *
     * @param tweetText the tweet text, not {@code null}
     * @return the mutable set of words used for keyword matching
     */
    private Set<String> createTweetSetOfWords(String tweetText) {
        Set<String> words = new HashSet<String>();
        for (String token : tokenize(tweetText)) {
            words.add(stripQuotes(token.toLowerCase()));
        }

        // Collect into a separate set first: the word set must not grow while being iterated.
        Set<String> withoutHash = new HashSet<String>();
        for (String word : words) {
            if (word.startsWith("#")) {
                withoutHash.add(word.substring(1));
            }
        }
        words.addAll(withoutHash);
        return words;
    }

    /**
     * Removes quotation from a single token.
     *
     * @param word a token of the tweet text
     * @return the token itself when it holds no double quote; the text between the first and
     *         the last double quote when there are at least two; otherwise (a single runaway
     *         quote) the text following that quote
     */
    private static String stripQuotes(String word) {
        int start = word.indexOf('"');
        if (start == -1) {
            return word;
        }
        int end = word.lastIndexOf('"');
        return end > start ? word.substring(start + 1, end) : word.substring(start + 1);
    }

    /**
     * Tests the {@code "text"} field of a tweet against the configured keywords.
     *
     * @param t the tweet as JSON object
     * @return {@code true} when no keywords are configured or the text matches at least one
     *         keyword; {@code false} when keywords are configured and the tweet is
     *         {@code null}, has no string-valued {@code "text"} field, or does not match
     */
    @Override
    public boolean test(JsonObject t) {
        if (null == toTrack) {
            return true;
        }
        if (null == t) {
            return false;
        }
        // Read the decoded string value. JsonValue.toString() would yield the JSON form with
        // surrounding quotes and escape sequences, which the tokenizer would then treat as one
        // single quoted token. The default also covers a missing or non-string "text" field.
        String tweetText = t.getString("text", null);
        if (null == tweetText) {
            return false; // there are filter-keywords but no text and hence reject tweet
        }
        return hasKeyWords(tweetText);
    }

    /**
     * Tests a plain text against the configured keywords. All double quotes are removed from
     * the text before matching.
     *
     * @param text the text to test; may be {@code null}
     * @return {@code true} when no keywords are configured or the text matches at least one
     *         keyword; {@code false} when keywords are configured and the text is
     *         {@code null} or does not match
     */
    public boolean test(String text) {
        if (null == toTrack) {
            return true;
        }
        if (null == text) {
            return false; // there are filter-keywords but no text and hence reject tweet
        }
        return hasKeyWords(text.replace("\"", ""));
    }

    /**
     * @param predicate    the keyword predicate to check
     * @param tweetTextSet the words of the tweet
     * @return {@code true} when every unordered word of the predicate is in the tweet words
     */
    private static boolean containsAllWords(KeywordPredicate predicate, Set<String> tweetTextSet) {
        for (String word : predicate.getUnorderedWords()) {
            if (!tweetTextSet.contains(word)) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param tweetTextSet the words of the tweet
     * @return {@code true} when at least one simple predicate has all its words in the tweet
     */
    private boolean matchSimplePredicates(Set<String> tweetTextSet) {
        for (KeywordPredicate predicate : this.simpleWordBasedPredicates) {
            if (containsAllWords(predicate, tweetTextSet)) {
                return true; // found a match!
            }
        }
        return false;
    }

    /**
     * Checks the phrase-based predicates. A predicate matches when all its unordered words
     * are among the tweet words and, if it has phrases, at least one phrase occurs in the
     * tweet text.
     *
     * @param tweetText    the tweet text the phrases are searched in
     * @param tweetTextSet the words of the tweet
     * @return {@code true} when at least one phrase-based predicate matches
     */
    private boolean matchPhrasePredicates(String tweetText, Set<String> tweetTextSet) {
        for (KeywordPredicate predicate : this.phraseBasedPredicates) {
            if (!containsAllWords(predicate, tweetTextSet)) {
                // This keyword does not match, but one of the remaining keywords still may.
                continue;
            }
            // A predicate without any phrase is satisfied by its words alone.
            boolean phraseMatched = true;
            for (String phrase : predicate.getPhraseSet()) {
                phraseMatched = tweetText.contains(phrase);
                if (phraseMatched) {
                    break;
                }
            }
            if (phraseMatched) {
                return true; // found a match!
            }
        }
        return false;
    }

    /**
     * @param tweetText the tweet text, not {@code null}
     * @return {@code true} when the text matches at least one simple or phrase-based keyword
     */
    private boolean hasKeyWords(final String tweetText) {
        Set<String> tweetTextSet = createTweetSetOfWords(tweetText);
        // first test simplePredicates
        if (matchSimplePredicates(tweetTextSet)) {
            return true;
        }
        if (this.phraseBasedPredicates.isEmpty()) {
            return false;
        }
        // Keywords are lower-cased before the predicates are built, so the phrases are
        // presumably lower-case as well; search them in the lower-cased text to keep phrase
        // matching case-insensitive like the word matching.
        return matchPhrasePredicates(tweetText.toLowerCase(), tweetTextSet);
    }

    /**
     * Splits a text into tokens: whitespace-separated words, with a double-quoted span kept
     * together as one token (including its quotes).
     *
     * @param str string to split
     * @return the tokens in order of appearance
     */
    private List<String> tokenize(String str) {
        List<String> tokens = new ArrayList<String>();
        Matcher m = TOKEN_PATTERN.matcher(str);
        while (m.find()) {
            tokens.add(m.group(1));
        }
        return tokens;
    }

    /**
     * Small manual demonstration: prints the parsed keywords, the match result for a sample
     * tweet, and the outcome of splitting a sample string on {@code #}.
     */
    public static void main(String args[]) throws Exception {
        String tweet = "The quick brown fox jumped over the internet #fence #Fox @google \"yeah right!\"";
        String keywords = "hello, brown Internets, \"yeah, babby\", \"yeah right\", \"yeah right!\"";
        TrackFilter filter = new TrackFilter(keywords);
        System.out.println("Keyword List: ");
        for (String w : filter.getToTrack()) {
            System.out.println(w);
        }
        System.out.println("Match result = " + filter.test(tweet));

        String hashString = "abc#a ab ##a # #a#b";
        String[] tokens = hashString.split("#");
        for (int i = 0; i < tokens.length; i++) {
            tokens[i] = tokens[i].trim();
            System.out.println("<start>" + tokens[i] + "<end>");
        }
        Set<String> tokensList = new HashSet<String>(Arrays.asList(tokens));
        tokensList.removeAll(Collections.singleton(""));
        for (String s : tokensList) {
            System.out.println("<START>" + s + "<END>");
        }
    }

    /**
     * @return the simple class name of this filter
     */
    @Override
    public String getFilterName() {
        return this.getClass().getSimpleName();
    }
}