package hu.u_szeged.kpe.features;
import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.features.Feature.Scale;
import hu.u_szeged.kpe.readers.DocumentData;
import hu.u_szeged.kpe.readers.DocumentSet;
import hu.u_szeged.ml.DataHandler;
import hu.u_szeged.utils.NLPUtils;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;
import edu.stanford.nlp.util.CoreMap;
/**
 * Stores the {@link Feature} objects in use and pushes the values they produce for a keyphrase
 * candidate into a {@link DataHandler}.
 *
 * <p>While processing training instances it additionally keeps, per feature name, a histogram of
 * the values that were written, which {@link #getRareFeatures(int)} uses to report features that
 * are (almost) never non-zero.
 */
public class FeatureHolder extends ArrayList<Feature> {

  private static final long serialVersionUID = 4577522680430595054L;

  /** Training instances whose id matches this pattern receive the "T_" prefix instead of "S_". */
  private static final Pattern TARGET_INSTANCE_ID = Pattern.compile("target_(\\d+_)+.*");

  /**
   * Feature name -> (feature value -> number of training instances having that value).
   *
   * <p>The field is transient, so it is {@code null} on a deserialized instance; always go through
   * {@link #distribution()} instead of touching it directly.
   */
  private transient Map<String, Map<Double, Integer>> featureValDistribution;

  /**
   * Creates an empty holder.
   *
   * @param initialCapacity initial capacity of the underlying list of features
   */
  public FeatureHolder(int initialCapacity) {
    super(initialCapacity);
    featureValDistribution = new HashMap<String, Map<Double, Integer>>();
  }

  /**
   * Computes every feature of this holder for one phrase over all given documents and writes the
   * aggregated values into {@code dataHandler} under {@code instanceId}.
   *
   * <p>Besides the features themselves, the mean number of n-gram variants of the phrase per
   * document is stored as "numOfAppearances". When {@code DocumentSet.adaptationType} is odd, each
   * value is also stored a second time under a "T_" or "S_" prefixed name: "T_" for non-training
   * instances and for training instances whose id matches {@code target_(\d+_)+.*}, "S_"
   * otherwise.
   *
   * <p>A NaN or infinite aggregated value is reported on {@code System.err} and terminates the JVM
   * with exit status 2. Features of scale NOMINAL are not supported yet and only produce a message
   * on {@code System.out}.
   *
   * @param dedicatedFeats handed unchanged to {@code Feature.aggregateVals}
   * @param phrase the candidate phrase; used as the key into each document's vocabulary
   * @param instanceId id of the instance in {@code dataHandler} the values are written to
   * @param length one entry per document, handed to the features
   * @param listOfHashs one vocabulary per document: phrase -> (n-gram variant -> its statistics)
   * @param grammars one entry per document; sentence position {@code [0]} selects the map key and
   *        position {@code [1]} the index within that key's sentence list
   * @param dataHandler destination of the feature values
   * @param train whether the instance is a training instance
   * @param docs handed unchanged to {@code Feature.value}
   */
  public void updateDataHandler(double[] dedicatedFeats, String phrase, String instanceId, List<int[]> length,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<Map<Integer, List<CoreMap>>> grammars,
      DataHandler dataHandler, boolean train, DocumentData... docs) {
    Feature.numberOfDocs = listOfHashs.size();
    double[] ngramAlternations = new double[listOfHashs.size()];
    for (int docToCheck = 0; docToCheck < listOfHashs.size(); ++docToCheck) {
      Map<Integer, List<CoreMap>> documentSentences = grammars.get(docToCheck);
      Map<NGram, NGramStats> docPhrasesStats = listOfHashs.get(docToCheck).get(phrase);
      // When keyphrasing multiple documents a candidate may be absent from some of them;
      // such documents contribute the dummy value of every feature.
      if (docPhrasesStats == null) {
        for (Feature f : this) {
          f.updateFeatureVals(f.getDummyVal(), docToCheck);
        }
        continue;
      }
      ngramAlternations[docToCheck] = docPhrasesStats.size();
      int[] docLength = length.get(docToCheck);
      for (Entry<NGram, NGramStats> ngramModificationOfDoc : docPhrasesStats.entrySet()) {
        List<int[]> sentenceLocations = ngramModificationOfDoc.getValue().getSentencePositions();
        List<CoreMap> sentences = new ArrayList<CoreMap>(sentenceLocations.size());
        for (int[] sentenceLocation : sentenceLocations) {
          sentences.add(documentSentences.get(sentenceLocation[0]).get(sentenceLocation[1]));
        }
        for (Feature f : this) {
          f.value(phrase, docLength, ngramModificationOfDoc, train, docToCheck, listOfHashs, sentences, docs);
        }
      }
    }

    // Resolved once per call; null means that no prefixed copies are written.
    String domainPrefix = domainPrefix(train, instanceId);

    double averageAlternations = NLPUtils.mean(ngramAlternations);
    dataHandler.setNumericValue(instanceId, "numOfAppearances", averageAlternations);
    if (domainPrefix != null) {
      dataHandler.setNumericValue(instanceId, domainPrefix + "numOfAppearances", averageAlternations);
    }

    // set the DataHandler object from feature to feature
    for (Feature f : this) {
      Map<String, Double> values = f.aggregateVals(train, phrase, length, dedicatedFeats);
      for (Entry<String, Double> val : values.entrySet()) {
        String featureName = val.getKey();
        double value = val.getValue();
        if (Double.isNaN(value) || Double.isInfinite(value)) {
          System.err.println(f + "\n" + phrase + "\n" + val);
          System.exit(2);
        }
        if (f.scale == Scale.BINARY) {
          boolean active = value >= 0.5;
          dataHandler.setBinaryValue(instanceId, featureName, active);
          if (domainPrefix != null) {
            dataHandler.setBinaryValue(instanceId, domainPrefix + featureName, active);
          }
          if (train) {
            // binary features are counted as either 1.0 or 0.0
            recordTrainingValue(featureName, active ? 1.0d : 0.0d);
          }
        } else if (f.scale == Scale.NOMINAL) {
          // TODO implement such cases
          System.out.println("Dealing with nominal attributes in class FeatureHolder is not implemented yet.");
        } else {
          dataHandler.setNumericValue(instanceId, featureName, value);
          if (domainPrefix != null) {
            dataHandler.setNumericValue(instanceId, domainPrefix + featureName, value);
          }
          if (train) {
            recordTrainingValue(featureName, value);
          }
        }
      }
    }
  }

  /**
   * Returns the live histogram of feature values collected from training instances.
   *
   * @return feature name -> (feature value -> number of occurrences); never {@code null}
   */
  public Map<String, Map<Double, Integer>> getFeatureValDistribution() {
    return distribution();
  }

  /**
   * Determines the features that took a strictly positive value on at most {@code threshold}
   * training instances, and dumps the whole value distribution to the file "feature.stats".
   *
   * <p>Writing the file is best effort: a failure is reported on {@code System.err} but does not
   * affect the returned set.
   *
   * @param threshold maximal number of positive-valued occurrences for a feature to count as rare
   * @return names of the rare features
   */
  public Set<String> getRareFeatures(int threshold) {
    Map<String, Map<Double, Integer>> distribution = distribution();
    Set<String> unwantedFeatureNames = new HashSet<String>();
    for (Entry<String, Map<Double, Integer>> featVals : distribution.entrySet()) {
      int nonZero = 0;
      for (Entry<Double, Integer> featValOfFeature : featVals.getValue().entrySet()) {
        if (featValOfFeature.getKey() > 0.0d) {
          nonZero += featValOfFeature.getValue();
        }
      }
      if (nonZero <= threshold) {
        unwantedFeatureNames.add(featVals.getKey());
      }
    }
    writeFeatureStats(distribution);
    return unwantedFeatureNames;
  }

  /** Returns the value histogram, creating it first if it is missing (e.g. after deserialization). */
  private Map<String, Map<Double, Integer>> distribution() {
    if (featureValDistribution == null) {
      featureValDistribution = new HashMap<String, Map<Double, Integer>>();
    }
    return featureValDistribution;
  }

  /** Returns "T_" or "S_" when the adaptation type is odd, and {@code null} otherwise. */
  private static String domainPrefix(boolean train, String instanceId) {
    if (DocumentSet.adaptationType % 2 != 1) {
      return null;
    }
    boolean target = !train || TARGET_INSTANCE_ID.matcher(instanceId).matches();
    return target ? "T_" : "S_";
  }

  /** Increments the occurrence count of {@code value} in the histogram of {@code featureName}. */
  private void recordTrainingValue(String featureName, double value) {
    Map<String, Map<Double, Integer>> distribution = distribution();
    Map<Double, Integer> valsForFeature = distribution.get(featureName);
    if (valsForFeature == null) {
      valsForFeature = new TreeMap<Double, Integer>();
      distribution.put(featureName, valsForFeature);
    }
    Integer prevVal = valsForFeature.get(value);
    valsForFeature.put(value, prevVal == null ? 1 : prevVal + 1);
  }

  /** Writes one "name TAB histogram" line per feature to the file "feature.stats". */
  private static void writeFeatureStats(Map<String, Map<Double, Integer>> distribution) {
    try (PrintWriter out = new PrintWriter("feature.stats")) {
      for (Entry<String, Map<Double, Integer>> featVals : distribution.entrySet()) {
        out.println(featVals.getKey() + "\t" + featVals.getValue());
      }
    } catch (IOException io) {
      io.printStackTrace();
    }
  }
}