package qa.qcri.aidr.collector.collectors; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Helper class to split user-defined keywords to generate phrase set or unordered words to apply strict keyword match filter for incoming Tweets. * */ public class KeywordPredicate { private String patternString = "([^\"]\\S*|\".+?\")\\s*"; private Pattern pattern = null; private Set<String> phraseSet = null; private Set<String> unorderedWords = null; public KeywordPredicate() { pattern = Pattern.compile(patternString); phraseSet = new HashSet<String>(); unorderedWords = new HashSet<String>(); } public KeywordPredicate(final String keyword) { this(); if (keyword != null) { // first split on whitespace List<String> words = splitOnWhitespace(keyword); // If word startWith("\"") then split further and put in ordered list // Otherwise put in unordered list // TODO: handle boundary conditions! for (String w: words) { if (w.startsWith("\"")) { String strippedWord = null; int end = w.lastIndexOf("\""); if (end > 0) { strippedWord = w.substring(1, end); } else { // Handle the special case of malformed quotations: runaway quote in text strippedWord = w.substring(1).trim(); } StringBuffer buf = new StringBuffer(); buf.append(" ").append(strippedWord); phraseSet.add(new String(buf)); // Will match end of a string buf.append(" "); phraseSet.add(new String(buf)); // Will match middle of a string buf.delete(0, 1); // delete leading whitespace phraseSet.add(new String(buf)); // Will match beginning of a string } else { unorderedWords.add(w.trim()); } } } } public Set<String> getPhraseSet() { return this.phraseSet; } public Set<String> getUnorderedWords() { return this.unorderedWords; } /** * * @param str string to split * @return */ private List<String> splitOnWhitespace(String str) { List<String> list = new LinkedList<String>(); Matcher m = pattern.matcher(str); while (m.find()) { list.add(m.group(1)); // Add .replace("\"", "") to remove surrounding quotes. } return list; } }