package hu.u_szeged.utils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.StopWordAnnotator.StopWordAnnotation;
import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGram.SequenceType;
import hu.u_szeged.kpe.candidates.NGramStats;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
/**
 * Holds the data kept for one candidate: an identifier, two dedicated feature values (tf-idf and
 * first occurrence), a probability, a rank, a class label and the occurrence counts of the
 * candidate's orthographic forms.
 *
 * <p>Identity ({@link #equals(Object)} / {@link #hashCode()}) is based solely on the id.
 */
public class ClassificationInstance {

  private String id;
  private double tfIdf;
  private double firstOcc;
  private double prob;
  private int rank;
  private Object label;
  private Map<NGram, Integer> orthographicForms;

  /**
   * Parses an instance from a tab-separated line. The fields are read in this order: id, tf-idf,
   * first occurrence, probability, rank, label, orthographic forms, lemmas.
   *
   * <p>The forms field is split on {@code ", "}; each piece has a leading <code>{</code> or trailing
   * <code>}</code> removed and is then split on {@code "="} into the space-separated tokens and the
   * occurrence count. The lemmas field is split on {@code ","} and the f-th piece is paired with
   * the f-th form. The label is stored as the raw string.
   *
   * <p>NOTE(review): the column layout looks like the output of {@link #toString()}, but the last
   * column written there is a lemma-to-frequency map that can have fewer entries than there are
   * forms, and its braces and {@code =freq} suffixes are not stripped here - confirm that the
   * round trip is actually intended to work.
   *
   * @param stringFormat the tab-separated line to parse
   * @throws ArrayIndexOutOfBoundsException if there are fewer than eight fields, fewer lemma pieces
   *     than forms, or a form piece without an {@code "="}
   * @throws NumberFormatException if a numeric field cannot be parsed
   */
  public ClassificationInstance(String stringFormat) {
    String[] parts = stringFormat.split("\t");
    id = parts[0];
    tfIdf = Double.parseDouble(parts[1]);
    firstOcc = Double.parseDouble(parts[2]);
    prob = Double.parseDouble(parts[3]);
    rank = Integer.parseInt(parts[4]);
    label = parts[5];
    String[] forms = parts[6].split(", ");
    String[] lemmas = parts[7].split(",");
    orthographicForms = new HashMap<NGram, Integer>();
    for (int f = 0; f < forms.length; ++f) {
      // Drop the map braces that surround the first and the last entry.
      String[] formAndOccurrence = forms[f].replaceAll("^\\{|\\}$", "").split("=");
      int occurrence = Integer.parseInt(formAndOccurrence[1]);
      NGram ng = new NGram(formAndOccurrence[0].split(" "), lemmas[f].split(" "));
      orthographicForms.put(ng, occurrence);
    }
  }

  /**
   * Builds an instance from already computed features.
   *
   * <p>The rank is initialised to {@link Integer#MAX_VALUE}; the probability is not set by this
   * constructor and therefore keeps the field default of {@code 0.0}.
   *
   * @param malletId identifier of the instance; every space is replaced by an underscore
   * @param dedicatedFeatures index 0 is stored as tf-idf, index 1 as first occurrence, and index 2
   *     yields the class label ({@code Boolean.TRUE} exactly when the value equals {@code 1.0})
   * @param ngramForms orthographic forms with their statistics; the number of recorded positions
   *     of each form is stored as its occurrence count
   */
  public ClassificationInstance(String malletId, double[] dedicatedFeatures, Map<NGram, NGramStats> ngramForms) {
    id = malletId.replaceAll(" ", "_");
    tfIdf = dedicatedFeatures[0];
    firstOcc = dedicatedFeatures[1];
    // Boolean.valueOf reuses the cached instances instead of the deprecated boxing constructor.
    label = Boolean.valueOf(dedicatedFeatures[2] == 1.0d);
    orthographicForms = new HashMap<NGram, Integer>();
    for (Entry<NGram, NGramStats> form : ngramForms.entrySet()) {
      orthographicForms.put(form.getKey(), form.getValue().getPositions().size());
    }
    rank = Integer.MAX_VALUE;
  }

  /** Returns the hash code of the id, consistent with {@link #equals(Object)}. */
  @Override
  public int hashCode() {
    return id.hashCode();
  }

  /**
   * Compares by id. Another {@code ClassificationInstance} is equal when its id equals this id.
   * For any other non-null object the result is {@code o.equals(id)}, so an instance also reports
   * equality with its own id string; this asymmetric behaviour is kept for existing callers.
   *
   * @param o the object to compare with, may be {@code null}
   * @return {@code true} if {@code o} is considered equal to this instance; {@code false} for
   *     {@code null}
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (o == null) {
      // Previously this fell through to o.equals(id) and threw a NullPointerException.
      return false;
    }
    if (o instanceof ClassificationInstance) {
      return ((ClassificationInstance) o).getId().equals(id);
    }
    return o.equals(id);
  }

  /** Returns the stored tf-idf value. */
  public double getTfIdf() {
    return tfIdf;
  }

  /** Returns the stored first-occurrence value. */
  public double getFirstOccurr() {
    return firstOcc;
  }

  /** Returns the current rank. */
  public int getRanking() {
    return rank;
  }

  /**
   * Sets the rank.
   *
   * @param ranking the new rank
   */
  public void setRanking(int ranking) {
    rank = ranking;
  }

  /** Returns the stored probability. */
  public double getProbability() {
    return prob;
  }

  /**
   * Sets the probability.
   *
   * @param probability the new probability
   */
  public void setProbability(double probability) {
    this.prob = probability;
  }

  /** Returns the id of this instance. */
  public String getId() {
    return id;
  }

  /**
   * Returns the class label: a {@code Boolean} when built from features, the raw label string when
   * parsed from a line.
   */
  public Object getClassLabel() {
    return label;
  }

  /** Returns the internal (live, mutable) map from orthographic form to occurrence count. */
  public Map<NGram, Integer> getOrthographicForms() {
    return orthographicForms;
  }

  /**
   * Renders the instance as one tab-separated line: id, tf-idf, first occurrence, probability,
   * rank, label, the orthographic-form map and a map from lemma sequence to the summed occurrence
   * count of all forms sharing that lemma sequence.
   */
  @Override
  public String toString() {
    Map<String, Integer> lemmasToFreqs = new HashMap<String, Integer>();
    for (Entry<NGram, Integer> ng : orthographicForms.entrySet()) {
      String lemmaForm = ng.getKey().getSequenceAsString(SequenceType.LEMMA);
      Integer actualFreq = ng.getValue();
      Integer prevVal = lemmasToFreqs.get(lemmaForm);
      lemmasToFreqs.put(lemmaForm, (prevVal == null ? 0 : prevVal) + actualFreq);
    }
    return id + "\t" + tfIdf + "\t" + firstOcc + "\t" + prob + "\t" + rank + "\t" + label + "\t" + orthographicForms + "\t" + lemmasToFreqs;
  }

  /**
   * Tries to determine the most likely surface form of this normalized candidate.
   *
   * <p>Each orthographic form is rendered as its lower-cased tokens joined by single spaces. Among
   * the forms in which no token is flagged by {@code StopWordAnnotation}, the one with the highest
   * occurrence count wins (the first one encountered on ties). As long as nothing has been
   * selected yet, the form currently being visited is taken as a fallback, so a form containing a
   * stopword is returned only when no stopword-free form exists. Iteration follows the map's
   * order.
   *
   * @return the selected form, or an empty string if there are no orthographic forms
   */
  public String getProbableForm() {
    StringBuilder toWriteOut = new StringBuilder();
    int max = Integer.MIN_VALUE;
    for (Entry<NGram, Integer> ngram : getOrthographicForms().entrySet()) {
      boolean hasStopword = false;
      StringBuilder temp = new StringBuilder();
      NGram ng = ngram.getKey();
      for (int i = 0; i < ng.size(); ++i) {
        CoreLabel cl = ng.get(i);
        if (i > 0) {
          temp.append(' ');
        }
        // Locale.ROOT keeps the result independent of the JVM's default locale.
        temp.append(cl.word().toLowerCase(Locale.ROOT));
        // A token without the annotation yields null; treat it as "not a stopword" instead of
        // failing on auto-unboxing.
        hasStopword = hasStopword || Boolean.TRUE.equals(cl.get(StopWordAnnotation.class));
      }
      if (!hasStopword && ngram.getValue() > max) {
        max = ngram.getValue();
        toWriteOut = temp;
      } else if (toWriteOut.length() == 0) {
        toWriteOut = temp;
      }
    }
    return toWriteOut.toString();
  }
}