package qa.qcri.aidr.predict.data;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;

import org.apache.commons.lang3.text.translate.UnicodeEscaper;
import org.apache.log4j.Logger;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;

import qa.qcri.aidr.predict.DataStore;
import qa.qcri.aidr.predict.classification.DocumentLabel;
import qa.qcri.aidr.predict.classification.geo.GeoLabel;
import qa.qcri.aidr.predict.classification.nominal.NominalLabelBC;
import qa.qcri.aidr.predict.common.DocumentType;
import qa.qcri.aidr.predict.common.Helpers;
import qa.qcri.aidr.predict.dbentities.ModelFamilyEC;
import qa.qcri.aidr.predict.dbentities.NominalAttributeEC;
import qa.qcri.aidr.predict.dbentities.NominalLabelEC;
import qa.qcri.aidr.predict.featureextraction.DocumentFeature;
import qa.qcri.aidr.predict.featureextraction.WordSet;

/**
 * Helper class for converting between native Document objects and their JSON
 * representation.
 *
 * <p>The class also keeps static lookup tables (crisis code to crisis ID, and
 * model families indexed by crisis ID plus attribute ID or attribute code)
 * that are refreshed from {@link DataStore} on demand. These tables are plain
 * {@link HashMap}s and this class performs no synchronization around them.
 *
 * @author jrogstadius
 */
public class DocumentJSONConverter {

    private static final Logger logger = Logger.getLogger(DocumentJSONConverter.class);
    private static final String name = "DocumentJsonConverter";

    /** Cached lookup tables are refreshed unconditionally once they are older than this. */
    private static final long MODEL_INFO_MAX_AGE_MS = 300000;

    /** When a lookup misses, a refresh is attempted only if the tables are older than this. */
    private static final long MODEL_INFO_MISS_RETRY_MS = 10000;

    /** Timestamp (ms since epoch) of the last refresh of the lookup tables; 0 = never. */
    private static long lastModelInfoUpdate = 0;

    /** crisis ID -> (attribute ID -> model family). */
    private static final HashMap<Integer, HashMap<Integer, ModelFamilyEC>> activeModelFamiliesByID = new HashMap<>();

    /** crisis ID -> (attribute code -> model family). */
    private static final HashMap<Integer, HashMap<String, ModelFamilyEC>> activeModelFamiliesByCode = new HashMap<>();

    /** crisis code -> crisis ID. Replaced wholesale on every refresh. */
    private static HashMap<String, Integer> activeCrisisIDs = new HashMap<>();

    /** Escapes every code point above 127 before JSON is handed back to callers. */
    private static final UnicodeEscaper unicodeEscaper = UnicodeEscaper.above(127);

    /**
     * Parses a JSON string into a {@link Document} of the type named by its
     * {@code aidr.doctype} field and attaches the crisis ID, crisis code and
     * the parsed input JSON to it.
     *
     * @param jsonInput JSON text containing an {@code aidr} object with at
     *                  least {@code doctype} and {@code crisis_code}
     * @return the parsed document
     * @throws JSONException    if the input is not valid JSON, or the
     *                          {@code aidr} object or its {@code doctype} is missing
     * @throws RuntimeException if the doctype is not handled, {@code crisis_code}
     *                          is missing or unknown, or the type-specific parser fails
     * @throws IOException      declared for callers; not thrown by the code in this method
     */
    public static Document parseDocument(String jsonInput) throws JSONException, IOException {
        JSONObject jsonObj = new JSONObject(jsonInput);

        if (!jsonObj.has("aidr")) {
            logger.error("Missing aidr field in input object");
            throw new JSONException("Missing aidr field in input object");
        }
        JSONObject aidr = jsonObj.getJSONObject("aidr");

        if (!aidr.has("doctype")) {
            logger.error("Missing doctype in input object");
            throw new JSONException("Missing doctype in input object");
        }
        String doctype = aidr.getString("doctype");

        Document doc;
        switch (doctype) {
        case DocumentType.TWIITER_DOC:
            doc = parseTweet(jsonObj);
            break;
        case DocumentType.SMS_DOC:
            doc = parseSMS(jsonObj);
            break;
        case DocumentType.FACEBOOK_DOC:
            doc = parseFacebook(jsonObj);
            break;
        default:
            logger.error("Exception when parsing input document: Unhandled doctype");
            throw new RuntimeException(
                    "Exception when parsing input document: Unhandled doctype");
        }

        if (!aidr.has("crisis_code")) {
            logger.error("Exception when parsing input document: Missing crisis_code");
            throw new RuntimeException(
                    "Exception when parsing input document: Missing crisis_code");
        }

        String crisisCode = aidr.getString("crisis_code");
        doc.crisisID = Long.valueOf(getCrisisID(crisisCode));
        doc.crisisCode = crisisCode;
        doc.inputJson = jsonObj;
        return doc;
    }

    /**
     * Builds an {@link SMS} from its JSON form.
     *
     * <p>Reads {@code text} from the top level and {@code crisis_code},
     * {@code crisis_name} and {@code doctype} from the {@code aidr} object.
     *
     * @param input JSON object describing the SMS
     * @return the populated SMS
     * @throws RuntimeException wrapping the {@link JSONException} if a
     *                          required field is missing or of the wrong type
     */
    public static SMS parseSMS(JSONObject input) {
        // TODO: the following code is only a placeholder!
        try {
            SMS sms = new SMS();
            sms.setText(input.getString("text"));
            sms.setDoctype(DocumentType.SMS_DOC);

            JSONObject aidrJSON = input.getJSONObject("aidr");
            AIDR aidrObj = new AIDR();
            aidrObj.setCrisis_code(aidrJSON.getString("crisis_code"));
            aidrObj.setCrisis_name(aidrJSON.getString("crisis_name"));
            aidrObj.setDoctype(aidrJSON.getString("doctype"));
            sms.setAidr(aidrObj);

            return sms;
        } catch (JSONException e) {
            logger.error("Json exception in parsing SMS: " + input, e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Builds a {@link Tweet} from its JSON form.
     *
     * <p>Example of a tweet in JSON format:
     * https://dev.twitter.com/docs/api/1/get/search
     *
     * <p>Reads {@code user.id_str}, {@code text} and {@code retweeted_status};
     * when {@code coordinates} is present with type {@code "Point"}, its
     * first two numbers are stored as longitude and latitude.
     *
     * @param input JSON object describing the tweet
     * @return the populated tweet
     * @throws RuntimeException wrapping the {@link JSONException} if a
     *                          required field is missing or of the wrong type;
     *                          a non-numeric {@code id_str} surfaces as
     *                          {@link NumberFormatException}
     */
    public static Tweet parseTweet(JSONObject input) {
        try {
            Tweet t = new Tweet();
            JSONObject user = input.getJSONObject("user");
            t.userID = Long.parseLong(user.getString("id_str"));
            t.text = input.getString("text");
            t.isRetweet = !input.isNull("retweeted_status");
            t.setDoctype(DocumentType.TWIITER_DOC);

            // isNull() is also true for an absent key, so this covers both cases.
            if (!input.isNull("coordinates")) {
                JSONObject geo = input.getJSONObject("coordinates");
                // Compare by value: a string produced by the JSON parser is
                // not the same object as the literal, so == would not match.
                if ("Point".equals(geo.getString("type"))) {
                    JSONArray coords = geo.getJSONArray("coordinates");
                    GeoLabel.LonLatPair geotag = new GeoLabel.LonLatPair();
                    geotag.setLongitude(coords.getDouble(0));
                    geotag.setLatitude(coords.getDouble(1));
                    t.setGeotag(geotag);
                }
            }
            return t;
        } catch (JSONException e) {
            logger.error("Json exception in parsing tweet: " + input, e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Builds a {@link Facebook} document from its JSON form.
     *
     * <p>Example of a facebook post:
     * https://developers.facebook.com/docs/graph-api/reference/v2.6/post
     *
     * <p>Reads {@code from.id} and {@code message}; {@code isShared} is
     * always set to {@code false}.
     *
     * @param input JSON object describing the post
     * @return the populated Facebook document
     * @throws RuntimeException wrapping the {@link JSONException} if a
     *                          required field is missing or of the wrong type;
     *                          a non-numeric {@code id} surfaces as
     *                          {@link NumberFormatException}
     */
    public static Facebook parseFacebook(JSONObject input) {
        try {
            Facebook fb = new Facebook();
            JSONObject user = input.getJSONObject("from");
            fb.userID = Long.parseLong(user.getString("id"));
            fb.text = input.getString("message");
            fb.isShared = false;
            fb.setDoctype(DocumentType.FACEBOOK_DOC);
            return fb;
        } catch (JSONException e) {
            logger.error("Json exception in parsing Facebook post: " + input, e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Serializes a document back to JSON, adding {@code aidr.features} and
     * {@code aidr.nominal_labels} to the document's input JSON when they are
     * not already present.
     *
     * <p>The additions are made on the {@link JSONObject} returned by
     * {@code doc.getInputJson()} itself, so that object is modified. A label
     * that cannot be converted is logged and skipped. If the document has no
     * nominal labels, a single placeholder label from
     * {@link #createEmptyLabelJson()} is inserted.
     *
     * @param doc the document to serialize
     * @return the JSON text with code points above 127 escaped
     * @throws RuntimeException wrapping the {@link JSONException} if the JSON
     *                          cannot be read or extended
     */
    public static String getDocumentSetJson(Document doc) {
        try {
            JSONObject input = doc.getInputJson();
            JSONObject aidr = input.getJSONObject("aidr");

            // Add features
            if (!aidr.has("features")) {
                ArrayList<DocumentFeature> features = doc.getFeatures(DocumentFeature.class);
                JSONArray featureArray = new JSONArray();
                for (DocumentFeature f : features) {
                    featureArray.put(f.toJSONObject());
                }
                aidr.put("features", featureArray);
            }

            // Add labels
            if (!aidr.has("nominal_labels")) {
                ArrayList<NominalLabelBC> labels = doc.getLabels(NominalLabelBC.class);
                JSONArray labelArray = new JSONArray();
                if (labels.isEmpty()) {
                    // No labels: insert a dummy nominal label.
                    labelArray.put(createEmptyLabelJson());
                } else {
                    for (NominalLabelBC l : labels) {
                        try {
                            labelArray.put(getLabelJson(doc.crisisID.intValue(), l));
                        } catch (RuntimeException e) {
                            // Best effort: one bad label must not fail the
                            // whole document, but keep the cause in the log.
                            logger.error("Exception while converting document to JSON:" + l, e);
                        }
                    }
                }
                aidr.put("nominal_labels", labelArray);
            }

            return unicodeEscaper.translate(input.toString());
        } catch (JSONException e) {
            logger.error("Error in creating JSON from document: " + doc, e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Serializes the features of one type in a document to JSON. Only
     * {@link WordSet} is supported; the words of all word sets are combined
     * into one array, producing {@code {"words":["w1","w2",...]}}.
     *
     * <p>Each word is escaped as a JSON string, so words containing quotes
     * or backslashes still yield valid JSON. A document without words
     * yields {@code {"words":[]}}.
     *
     * @param featureFilter the feature class to serialize
     * @param docSet        the document whose features are read
     * @return the JSON text
     * @throws RuntimeException if {@code featureFilter} is not {@code WordSet.class}
     */
    public static <T extends DocumentFeature> String getFeaturesJson(
            Class<T> featureFilter, Document docSet) {
        if (featureFilter != WordSet.class) {
            logger.warn("Not implemented: " + featureFilter);
            throw new RuntimeException("Not implemented");
        }

        ArrayList<T> items = docSet.getFeatures(featureFilter);
        ArrayList<String> allWords = new ArrayList<String>();
        for (T item : items) {
            WordSet words = (WordSet) item;
            allWords.addAll(words.getWords());
        }

        // JSONObject.quote() adds the surrounding quotes and escapes the content.
        ArrayList<String> quotedWords = new ArrayList<String>(allWords.size());
        for (String word : allWords) {
            quotedWords.add(JSONObject.quote(word));
        }
        return "{\"words\":[" + Helpers.join(quotedWords, ",") + "]}";
    }

    /**
     * Builds a {@link NominalLabelBC} from its JSON form.
     *
     * <p>Reads {@code crisis_code}, {@code attribute_code}, {@code source_id},
     * {@code label_code} and {@code confidence}; the optional
     * {@code from_human} flag is applied when present.
     *
     * @param input JSON object describing the label
     * @return the parsed label
     * @throws RuntimeException if a field is missing (wrapping the
     *                          {@link JSONException}), or the crisis code or
     *                          model family is unknown
     */
    public static NominalLabelBC parseNominalLabel(JSONObject input) {
        try {
            int crisisID = getCrisisID(input.getString("crisis_code"));
            ModelFamilyEC modelFamily = getModelFamily(crisisID,
                    input.getString("attribute_code"));
            NominalAttributeEC attr = modelFamily.getNominalAttribute();

            NominalLabelBC l = new NominalLabelBC(
                    input.getLong("source_id"),
                    attr.getNominalAttributeID(),
                    attr.getNominalLabel(input.getString("label_code")).getNominalLabelID(),
                    input.getDouble("confidence"));

            //TODO: Remove this, training samples should be "true"
            if (input.has("from_human")) {
                l.setHumanLabel(input.getBoolean("from_human"));
            }
            return l;
        } catch (JSONException e) {
            logger.error("Error in parsing nominal label for: " + input, e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Creates the placeholder label used when a document has no nominal
     * labels: {@code source_id} is 0, {@code from_human} is false and every
     * other label field is JSON null.
     *
     * @return the placeholder label object
     * @throws RuntimeException wrapping the {@link JSONException} if the
     *                          object cannot be built
     */
    public static JSONObject createEmptyLabelJson() {
        JSONObject obj = new JSONObject();
        try {
            obj.put("source_id", 0);
            obj.put("attribute_code", JSONObject.NULL);
            obj.put("attribute_name", JSONObject.NULL);
            obj.put("attribute_description", JSONObject.NULL);
            obj.put("label_code", JSONObject.NULL);
            obj.put("label_name", JSONObject.NULL);
            obj.put("label_description", JSONObject.NULL);
            obj.put("confidence", JSONObject.NULL);
            obj.put("from_human", false);
        } catch (JSONException e) {
            logger.error("Error in creating empty json object", e);
            throw new RuntimeException(e);
        }
        return obj;
    }

    /**
     * Converts a label to its JSON form, resolving attribute and label
     * metadata through the cached model family of the given crisis. Only
     * {@link NominalLabelBC} labels are supported.
     *
     * @param crisisID ID of the crisis the label belongs to
     * @param label    the label to convert
     * @return the JSON object describing the label
     * @throws RuntimeException if the label type is unsupported, the model
     *                          family or nominal label cannot be resolved, or
     *                          the JSON object cannot be built
     */
    public static JSONObject getLabelJson(int crisisID, DocumentLabel label) {
        if (!(label instanceof NominalLabelBC)) {
            logger.error("Unsupported label type: " + label.getClass().getSimpleName());
            throw new RuntimeException("Unsupported label type: "
                    + label.getClass().getSimpleName());
        }

        try {
            NominalLabelBC l = (NominalLabelBC) label;
            ModelFamilyEC family = getModelFamily(crisisID, l.getAttributeID());
            NominalAttributeEC attribute = family.getNominalAttribute();

            NominalLabelEC lEC = attribute.getNominalLabel(l.getNominalLabelID());
            if (lEC == null) {
                // Fail with a descriptive message rather than a bare NullPointerException.
                throw new RuntimeException("Nominal label " + l.getNominalLabelID()
                        + " is missing for crisis " + crisisID
                        + " and attribute " + l.getAttributeID());
            }

            JSONObject obj = new JSONObject();
            obj.put("source_id", l.getSourceID());
            obj.put("attribute_code", attribute.getCode());
            obj.put("attribute_name", attribute.getName());
            obj.put("attribute_description", attribute.getDescription());
            obj.put("label_code", lEC.getNominalLabelCode());
            obj.put("label_name", lEC.getName());
            obj.put("label_description", lEC.getDescription());
            obj.put("confidence", l.getConfidence());
            obj.put("from_human", l.isHumanLabel());
            return obj;
        } catch (JSONException e) {
            logger.error("Error in creating json object from: " + label, e);
            throw new RuntimeException(e);
        }
    }

    /**
     * Refreshes the cached lookup tables when they are older than
     * {@link #MODEL_INFO_MAX_AGE_MS}, or when the caller's lookup missed and
     * they are older than {@link #MODEL_INFO_MISS_RETRY_MS}.
     *
     * @param entryMissing whether the entry the caller needs is absent from the cache
     */
    private static void refreshModelInfoIfNeeded(boolean entryMissing) {
        long age = System.currentTimeMillis() - lastModelInfoUpdate;
        if (age > MODEL_INFO_MAX_AGE_MS
                || (entryMissing && age > MODEL_INFO_MISS_RETRY_MS)) {
            updateModelFamilyInfo();
        }
    }

    /** Looks up a model family in a two-level index; returns null when either level misses. */
    private static <K> ModelFamilyEC findModelFamily(
            HashMap<Integer, HashMap<K, ModelFamilyEC>> index, int crisisID, K key) {
        HashMap<K, ModelFamilyEC> families = index.get(crisisID);
        return families == null ? null : families.get(key);
    }

    /**
     * Resolves a crisis code to its crisis ID, refreshing the cache first
     * when it is stale or does not know the code.
     *
     * @throws RuntimeException if the code is still unknown afterwards
     */
    private static int getCrisisID(String crisisCode) {
        refreshModelInfoIfNeeded(!activeCrisisIDs.containsKey(crisisCode));

        Integer crisisID = activeCrisisIDs.get(crisisCode);
        if (crisisID == null) {
            throw new RuntimeException("Crisis code has not been defined: "
                    + crisisCode);
        }
        return crisisID;
    }

    /**
     * Returns the model family for a crisis and attribute ID, refreshing the
     * cache first when it is stale or has no such entry.
     *
     * @throws RuntimeException if no such model family exists afterwards
     */
    private static ModelFamilyEC getModelFamily(int crisisID, int attributeID) {
        Integer key = Integer.valueOf(attributeID);
        refreshModelInfoIfNeeded(
                findModelFamily(activeModelFamiliesByID, crisisID, key) == null);

        // Look up again: the refresh may have repopulated the index.
        ModelFamilyEC family = findModelFamily(activeModelFamiliesByID, crisisID, key);
        if (family == null) {
            throw new RuntimeException("ModelInfo is missing for crisis "
                    + crisisID + " and attribute " + attributeID);
        }
        return family;
    }

    /**
     * Returns the model family for a crisis and attribute code, refreshing
     * the cache first when it is stale or has no such entry.
     *
     * @throws RuntimeException if no such model family exists afterwards
     */
    private static ModelFamilyEC getModelFamily(int crisisID, String attributeCode) {
        refreshModelInfoIfNeeded(
                findModelFamily(activeModelFamiliesByCode, crisisID, attributeCode) == null);

        // Look up again: the refresh may have repopulated the index.
        ModelFamilyEC family = findModelFamily(activeModelFamiliesByCode, crisisID,
                attributeCode);
        if (family == null) {
            logger.error("ModelInfo is missing for crisis " + crisisID
                    + " and attribute " + attributeCode);
            throw new RuntimeException(
                    "ModelInfo is missing for crisis " + crisisID
                            + " and attribute " + attributeCode);
        }
        return family;
    }

    /**
     * Rebuilds all three lookup tables from {@link DataStore#getCrisisIDs()}
     * and {@link DataStore#getActiveModels()}.
     *
     * @deprecated no longer called from this class; refreshes go through
     *             {@link #updateModelFamilyInfo()}.
     */
    @Deprecated
    private static void updateModelInfo() {
        activeModelFamiliesByID.clear();
        activeModelFamiliesByCode.clear();
        activeCrisisIDs.clear();

        activeCrisisIDs = DataStore.getCrisisIDs();
        ArrayList<ModelFamilyEC> families = DataStore.getActiveModels();

        for (ModelFamilyEC family : families) {
            int crisisID = family.getCrisisID();
            int attributeID = family.getNominalAttribute().getNominalAttributeID();
            String attributeCode = family.getNominalAttribute().getCode();

            // Both indexes were cleared together above, so one check covers both.
            if (!activeModelFamiliesByID.containsKey(crisisID)) {
                activeModelFamiliesByID.put(crisisID,
                        new HashMap<Integer, ModelFamilyEC>());
                activeModelFamiliesByCode.put(crisisID,
                        new HashMap<String, ModelFamilyEC>());
            }
            activeModelFamiliesByID.get(crisisID).put(attributeID, family);
            activeModelFamiliesByCode.get(crisisID).put(attributeCode, family);
        }

        lastModelInfoUpdate = System.currentTimeMillis();
    }

    /**
     * Refreshes the crisis-code table from {@link DataStore#getCrisisIDs()},
     * passes both model-family indexes to
     * {@link DataStore#getActiveModelsDocCount} and records the refresh time.
     */
    private static void updateModelFamilyInfo() {
        activeCrisisIDs.clear();
        activeCrisisIDs = DataStore.getCrisisIDs();
        // NOTE(review): presumably this call (re)populates both indexes in
        // place -- confirm against DataStore.
        DataStore.getActiveModelsDocCount(activeModelFamiliesByID, activeModelFamiliesByCode);
        lastModelInfoUpdate = System.currentTimeMillis();
    }
}