package qa.qcri.aidr.predict.featureextraction;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import qa.qcri.aidr.predict.common.DocumentType;
import qa.qcri.aidr.predict.common.PipelineProcess;
import qa.qcri.aidr.predict.data.Document;
import qa.qcri.aidr.predict.data.Facebook;
import qa.qcri.aidr.predict.data.SMS;
import qa.qcri.aidr.predict.data.Tweet;

/**
 * FeatureExtractor is a {@link PipelineProcess} stage that performs feature
 * extraction on each {@link Document} handed to {@link #processItem(Document)}:
 * the document text is turned into a set of lower-cased words and word bigrams
 * which is attached to the document as a {@code WordSet}.
 * <p>
 * According to the original author's description, documents are consumed from
 * a Redis queue and pushed to another queue for further processing; that
 * plumbing is not part of this class (presumably it lives in
 * {@code PipelineProcess} -- confirm there).
 *
 * @author jrogstadius
 */
public class FeatureExtractor extends PipelineProcess {

    private static final Logger logger = Logger.getLogger(FeatureExtractor.class);

    /** Matches a single quote at the very beginning or the very end of a word. */
    private static final Pattern SINGLE_QUOTE_PATTERN_AT_EXTREME = Pattern.compile("(^')|('$)");

    /** Token separator: one or more whitespace characters. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Suffixes removed (repeatedly) by {@link #naiveStemming(String)}. */
    private static final Pattern STEM_SUFFIX = Pattern.compile("(es|ed|s|ing|ly|n)$");

    /**
     * Dispatches the document to the handler matching its doctype.
     *
     * @param doc the document to extract features for; it is cast to
     *            {@link Tweet}, {@link SMS} or {@link Facebook} according to
     *            {@code doc.getDoctype()}
     * @throws RuntimeException if the doctype is none of the three known types
     */
    @Override
    protected void processItem(Document doc) {
        if (doc.getDoctype().equals(DocumentType.TWIITER_DOC)) {
            processTweet((Tweet) doc);
        } else if (doc.getDoctype().equals(DocumentType.SMS_DOC)) {
            processSMS((SMS) doc);
        } else if (doc.getDoctype().equals(DocumentType.FACEBOOK_DOC)) {
            processFacebook((Facebook) doc);
        } else {
            logger.error("Unknown datatype: " + doc + ", doctype = " + doc.getDoctype());
            throw new RuntimeException("Unknown doctype: " + doc.getDoctype());
        }
    }

    /**
     * Attaches the word/bigram feature set of the tweet text to the tweet.
     *
     * @param tweet the tweet to process
     */
    void processTweet(Tweet tweet) {
        logger.debug("processing twitter doc");
        tweet.addFeatureSet(buildWordSet(tweet.getText()));
    }

    /**
     * Attaches the word/bigram feature set of the Facebook text to the document.
     *
     * @param facebook the Facebook document to process
     */
    void processFacebook(Facebook facebook) {
        logger.debug("processing facebook doc");
        facebook.addFeatureSet(buildWordSet(facebook.getText()));
    }

    /**
     * Attaches the word/bigram feature set of the SMS text to the SMS.
     *
     * @param sms the SMS to process
     */
    void processSMS(SMS sms) {
        // TODO: the following code is only a placeholder for now!
        logger.debug("processing SMS doc");
        sms.addFeatureSet(buildWordSet(sms.getText()));
    }

    /** Builds a WordSet holding the unstemmed words and bigrams of the given text. */
    private static WordSet buildWordSet(String text) {
        WordSet wordSet = new WordSet();
        wordSet.addAll(getWordsInStringWithBigrams(text, false));
        return wordSet;
    }

    /**
     * Extracts the distinct words of a text together with the bigrams formed by
     * consecutive non-stopword words (joined with {@code "_"}).
     * <p>
     * The text is lower-cased (locale-independently) and split on whitespace;
     * each token is optionally stemmed, then stripped of one leading and one
     * trailing single quote. Tokens that end up empty are discarded so that no
     * empty-string feature or dangling {@code "_word"} bigram is produced.
     *
     * @param inputText   the text to extract features from; {@code null} is
     *                    treated like an empty text
     * @param useStemming whether to apply {@link #naiveStemming(String)} to
     *                    every token
     * @return the distinct words and bigrams in unspecified order; an empty
     *         array if the text contains no words
     */
    public static String[] getWordsInStringWithBigrams(String inputText, boolean useStemming) {
        if (inputText == null) {
            return new String[0];
        }

        // NOTE: removal of URLs, "rt", @usernames and special characters is
        // currently disabled. The pattern that used to be applied was:
        // "(^|\\s)rt\\s|@\\S+|http\\S+|www\\.\\S+|[-.,;:_+?&=\"*~¨^´`<>\\[\\]{}()\\\\/|%€¤$£@!§½…]"
        String[] words = tokenize(inputText, useStemming);

        Set<String> features = new HashSet<String>(Arrays.asList(words));

        // Make bigrams: pair every non-stopword with the next non-stopword.
        for (int i = 0; i < words.length - 1; i++) {
            String first = words[i];
            if (isStopword(first)) {
                continue;
            }
            int next = i + 1;
            while (next < words.length && isStopword(words[next])) {
                next++;
            }
            // next == words.length means only stopwords followed: no bigram.
            if (next < words.length) {
                features.add(first + "_" + words[next]);
            }
        }

        return features.toArray(new String[features.size()]);
    }

    /** Lower-cases and splits the text, normalizes each token and drops empty ones. */
    private static String[] tokenize(String text, boolean useStemming) {
        String[] tokens = WHITESPACE.split(text.toLowerCase(Locale.ROOT));
        int kept = 0;
        for (int i = 0; i < tokens.length; i++) {
            String word = tokens[i];
            // Stemming runs before quote stripping, as in the original pipeline.
            if (useStemming) {
                word = naiveStemming(word);
            }
            word = SINGLE_QUOTE_PATTERN_AT_EXTREME.matcher(word).replaceAll("");
            if (!word.isEmpty()) {
                // kept <= i, so this never overwrites a token not yet read.
                tokens[kept++] = word;
            }
        }
        return Arrays.copyOf(tokens, kept);
    }

    /**
     * Strips the suffixes {@code es, ed, s, ing, ly, n} from the end of a word,
     * repeating until no suffix matches any more.
     * <p>
     * Words shorter than four characters and words starting with {@code "#"}
     * are returned unchanged. Because stripping is repeated, a word can be
     * shortened drastically, even down to the empty string.
     *
     * @param str the word to stem; must not be {@code null}
     * @return the stemmed word
     */
    public static String naiveStemming(String str) {
        if (str.length() < 4 || str.startsWith("#")) {
            return str;
        }

        String current = str;
        while (true) {
            String stripped = STEM_SUFFIX.matcher(current).replaceAll("");
            // Compare by value: stop once a pass removes nothing.
            if (stripped.equals(current)) {
                return stripped;
            }
            current = stripped;
        }
    }

    /**
     * Tells whether a word is a stopword. Currently no word is.
     *
     * @param word the word to check
     * @return always {@code false}
     */
    static boolean isStopword(String word) {
        return false; // TODO: Implement stopword handling
    }
}