package hu.u_szeged.kpe.features;
import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.readers.DocumentData;
import hu.u_szeged.utils.NLPUtils;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import edu.stanford.nlp.util.CoreMap;
/**
* Determines the number of occurrences of a candidate phrase as true keyphrase on the training set.
*/
public class KeyFreqFeature extends Feature {
private static final long serialVersionUID = 2994697250601490879L;
public KeyFreqFeature() {
scale = Scale.NUMERIC;
collectionToStoreDocVals = HashSet.class;
}
public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
if (train && docToCheck != documentToExamine)
updateFeatureVals(docs[docToCheck].getKeyphrases().containsKey(ngramForm.getKey()) ? 1.0d : 0.0d, docToCheck);
}
protected double aggregation(List<Collection<Number>> docVals, String phrase, boolean train, List<int[]> length) {
Integer keyphraseCounter = 0;
Integer[] globalPhraseStats = dict.get(phrase);
if (globalPhraseStats != null && globalPhraseStats[1] != null) {
keyphraseCounter = globalPhraseStats[1];
}
double[] vals = new double[docVals.size()];
for (int c = 0; c < docVals.size(); ++c) {
Set<Number> val = (HashSet<Number>) docVals.get(c);
vals[c] = keyphraseCounter - (val.contains(1.0d) ? 1.0d : 0.0d);
}
return NLPUtils.mean(vals);
}
}