package qa.qcri.aidr.predict.data;
import java.util.ArrayList;
import org.json.JSONObject;
import qa.qcri.aidr.predict.classification.DocumentLabel;
import qa.qcri.aidr.predict.classification.DocumentLabelFilter;
import qa.qcri.aidr.predict.common.DocumentType;
import qa.qcri.aidr.predict.common.Helpers;
import qa.qcri.aidr.predict.dbentities.TaggerDocument;
import qa.qcri.aidr.predict.featureextraction.DocumentFeature;
import qa.qcri.aidr.predict.featureextraction.FeatureExtractor;
import qa.qcri.aidr.predict.featureextraction.WordSet;
/**
* Document is an abstract representation of a work item in the processing
* pipeline. Each process in the pipeline annotates the Document with additional
* information such as features and class labels.
*
* @author jrogstadius
*/
public abstract class Document implements java.io.Serializable {

    static final long serialVersionUID = 1L;

    /**
     * Probability with which a document converted by
     * {@link #fromDocumentToTaggerDocument(Document)} is flagged as belonging
     * to the evaluation set.
     */
    private static final double EVALUATION_SET_FRACTION = 1.0 / 5.0;

    public Long crisisID;
    public String crisisCode;
    public JSONObject inputJson;
    /** Language of the document; defaults to {@code "en"}. */
    public String language = "en";
    public Long documentID;
    public String doctype;
    public ArrayList<DocumentFeature> features = new ArrayList<DocumentFeature>();
    public ArrayList<DocumentLabel> labels = new ArrayList<DocumentLabel>();
    /** Number of labels added through {@link #addLabel(DocumentLabel)} that are human labels. */
    public int humanLabelCount = 0;
    /** Value of this document as a training sample; defaults to {@code 0.5}. */
    public double valueAsTrainingSample = 0.5;

    // added by koushik: 21/12/2014
    Long userID;

    /** Creates an empty document with default field values. */
    public Document() {
    }

    /**
     * Sets the document ID from an {@code Integer}.
     *
     * @param documentID the ID to store, widened to {@code Long}; {@code null}
     *                   clears the ID (mirroring {@link #setDocumentID(Long)})
     */
    public void setDocumentID(Integer documentID) {
        // Long.valueOf replaces the deprecated Long(long) constructor; the null
        // check avoids an unboxing NullPointerException.
        this.documentID = (documentID == null) ? null : Long.valueOf(documentID.longValue());
    }

    /**
     * Sets the document ID.
     *
     * @param documentID the ID to store; may be {@code null}
     */
    public void setDocumentID(Long documentID) {
        this.documentID = documentID;
    }

    /** @return the document ID, or {@code null} if none has been set */
    public Long getDocumentID() {
        return documentID;
    }

    /** @param inputJson the JSON object to associate with this document */
    public void setInputJson(JSONObject inputJson) {
        this.inputJson = inputJson;
    }

    /** @return the JSON object associated with this document, possibly {@code null} */
    public JSONObject getInputJson() {
        return inputJson;
    }

    /** @param language the language value to store */
    public void setLanguage(String language) {
        this.language = language;
    }

    /** @return the stored language value ({@code "en"} unless changed) */
    public String getLanguage() {
        return language;
    }

    /** @param value the value of this document as a training sample */
    public void setValueAsTrainingSample(double value) {
        this.valueAsTrainingSample = value;
    }

    /** @return the value of this document as a training sample */
    public double getValueAsTrainingSample() {
        return valueAsTrainingSample;
    }

    /** @return the document type, as defined by the concrete subclass */
    public abstract String getDoctype();

    /** @param type the document type, interpreted by the concrete subclass */
    public abstract void setDoctype(String type);

    /**
     * Novelty check implemented by the concrete subclass.
     * NOTE(review): the exact meaning of "novel" is defined by subclasses and
     * is not visible from this class.
     */
    public abstract boolean isNovel();

    /** @param crisisID the crisis ID to store, widened to {@code Long} */
    public void setCrisisID(int crisisID) {
        this.crisisID = Long.valueOf(crisisID);
    }

    /** @param crisisID the crisis ID to store; may be {@code null} */
    public void setCrisisID(Long crisisID) {
        this.crisisID = crisisID;
    }

    /** @return the crisis ID, or {@code null} if none has been set */
    public Long getCrisisID() {
        return crisisID;
    }

    /** @param crisisCode the crisis code to store */
    public void setCrisisCode(String crisisCode) {
        this.crisisCode = crisisCode;
    }

    /** @return the crisis code, or {@code null} if none has been set */
    public String getCrisisCode() {
        return crisisCode;
    }

    /** @return the user ID, or {@code null} if none has been set */
    public Long getUserID() {
        return userID;
    }

    /** @param userID the user ID to store; may be {@code null} */
    public void setUserID(Long userID) {
        this.userID = userID;
    }

    /**
     * Appends a label to this document and, if it is a human label, increments
     * {@link #humanLabelCount}.
     *
     * @param label the label to add; must not be {@code null}
     * @throws NullPointerException if {@code label} is {@code null}
     */
    public void addLabel(DocumentLabel label) {
        // Validate before mutating so a null argument cannot leave a null
        // entry behind in the label list.
        java.util.Objects.requireNonNull(label, "label");
        labels.add(label);
        if (label.isHumanLabel()) {
            humanLabelCount++;
        }
    }

    /**
     * Returns the labels that are instances of the given class.
     *
     * @param classFilter the label class (or superclass) to select
     * @return a new list holding the matching labels in their original order
     */
    public <T extends DocumentLabel> ArrayList<T> getLabels(Class<T> classFilter) {
        ArrayList<T> items = new ArrayList<T>();
        for (DocumentLabel label : labels) {
            if (classFilter.isInstance(label)) {
                items.add(classFilter.cast(label));
            }
        }
        return items;
    }

    /**
     * Returns the human labels that are instances of the given class.
     *
     * @param classFilter the label class (or superclass) to select
     * @return a new list holding the matching human labels in their original order
     */
    public <T extends DocumentLabel> ArrayList<T> getHumanLabels(
            Class<T> classFilter) {
        ArrayList<T> items = new ArrayList<T>();
        for (DocumentLabel label : labels) {
            if (classFilter.isInstance(label) && label.isHumanLabel()) {
                items.add(classFilter.cast(label));
            }
        }
        return items;
    }

    /**
     * Tests whether any label of this document is matched by the filter.
     *
     * @param filter the filter to apply to each label
     * @return {@code true} if at least one label matches
     */
    public boolean hasLabel(DocumentLabelFilter filter) {
        for (DocumentLabel label : labels) {
            if (filter.match(label)) {
                return true;
            }
        }
        return false;
    }

    /** @return {@code true} if {@link #humanLabelCount} is greater than zero */
    public boolean hasHumanLabels() {
        return humanLabelCount > 0;
    }

    /**
     * Returns the features that are instances of the given class.
     *
     * @param classFilter the feature class (or superclass) to select
     * @return a new list holding the matching features in their original order
     */
    public <T extends DocumentFeature> ArrayList<T> getFeatures(
            Class<T> classFilter) {
        ArrayList<T> items = new ArrayList<T>();
        for (DocumentFeature feature : features) {
            if (classFilter.isInstance(feature)) {
                items.add(classFilter.cast(feature));
            }
        }
        return items;
    }

    /** @param set the feature (set) to append to this document's features */
    public void addFeatureSet(DocumentFeature set) {
        features.add(set);
    }

    /**
     * Converts a pipeline {@code Document} into a {@link TaggerDocument} entity.
     * <p>
     * The document ID is not copied; it has to be set separately because it is
     * auto-generated by the DB/Hibernate. The received-at timestamp is set to
     * the current time, geo features are set to {@code null}, and the
     * evaluation-set flag is drawn at random with probability
     * {@link #EVALUATION_SET_FRACTION}. Labels are not copied.
     *
     * @param doc the document to convert; may be {@code null}
     * @return the populated entity, or {@code null} if {@code doc} is {@code null}
     */
    public static TaggerDocument fromDocumentToTaggerDocument(Document doc) {
        // The entity is created before the null check, as before, in case its
        // construction has side effects.
        TaggerDocument document = new TaggerDocument();
        if (doc == null) {
            return null;
        }

        document.setHasHumanLabels(doc.hasHumanLabels());
        document.setCrisisID(doc.getCrisisID());
        document.setCrisisCode(doc.getCrisisCode());
        document.setReceivedAt(new java.sql.Timestamp(System.currentTimeMillis()));
        document.setLanguage(doc.getLanguage());
        // The stored doctype is the simple class name of the concrete subclass.
        document.setDoctype(doc.getClass().getSimpleName());

        JSONObject json = doc.getInputJson();
        document.setData(json != null ? Helpers.escapeJson(json.toString()) : null);

        if (doc.features != null) {
            document.setWordFeatures(DocumentJSONConverter.getFeaturesJson(WordSet.class, doc));
        }
        document.setGeoFeatures(null);
        document.setValueAsTrainingSample(doc.getValueAsTrainingSample());
        document.setIsEvaluationSet(Math.random() < EVALUATION_SET_FRACTION);

        // Human labels (NominalLabelBC) are intentionally not copied to the
        // entity's nominal label collection here.
        return document;
    }

    /**
     * Converts a {@link TaggerDocument} entity back into a pipeline
     * {@code Document} of the concrete type named by the entity's doctype.
     * <p>
     * Word features are rebuilt by running the entity's word-feature text
     * through {@link FeatureExtractor#getWordsInStringWithBigrams}. Individual
     * labels are not restored; {@link #humanLabelCount} is set to 1 if the
     * entity reports human labels and to 0 otherwise.
     *
     * @param doc the entity to convert; may be {@code null}
     * @return the reconstructed document, or {@code null} if {@code doc} is {@code null}
     * @throws IllegalArgumentException if the entity's doctype is {@code null}
     *         or is not one of the supported document types
     */
    public static Document fromTaggerDocumentToDocument(TaggerDocument doc) {
        if (doc == null) {
            return null;
        }

        Document document = newDocumentForType(doc.getDoctype());
        document.setDocumentID(doc.getDocumentID());
        document.setCrisisID(doc.getCrisisID());
        document.humanLabelCount = doc.hasHumanLabels() ? 1 : 0;
        document.setCrisisCode(doc.getCrisisCode());
        document.setLanguage(doc.getLanguage());

        // NOTE(review): getWordFeatures() may be null when the entity was
        // stored without word features; whether the extractor tolerates a
        // null text is not visible from this class - confirm.
        WordSet wordSet = new WordSet();
        String text = doc.getWordFeatures();
        wordSet.addAll(FeatureExtractor.getWordsInStringWithBigrams(text, false));
        document.addFeatureSet(wordSet);

        document.setValueAsTrainingSample(doc.getValueAsTrainingSample());

        // Human labels (NominalLabelBC) are intentionally not restored here.
        return document;
    }

    /**
     * Instantiates the concrete {@code Document} subclass for a doctype name
     * (compared case-insensitively against the {@link DocumentType} constants).
     *
     * @param doctype the doctype name stored on the entity
     * @return a new, empty document of the matching type
     * @throws IllegalArgumentException if {@code doctype} is {@code null} or unknown
     */
    private static Document newDocumentForType(String doctype) {
        if (doctype == null) {
            throw new IllegalArgumentException("TaggerDocument has no doctype");
        }
        if (doctype.equalsIgnoreCase(DocumentType.TWIITER_DOC)) {
            return new Tweet();
        }
        if (doctype.equalsIgnoreCase(DocumentType.SMS_DOC)) {
            return new SMS();
        }
        if (doctype.equalsIgnoreCase(DocumentType.FACEBOOK_DOC)) {
            return new Facebook();
        }
        // Previously this case fell through with a null document and failed
        // with a bare NullPointerException on the first setter call.
        throw new IllegalArgumentException("Unsupported document type: " + doctype);
    }
}