package hu.u_szeged.kpe.features;

import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.readers.DocumentData;
import hu.u_szeged.utils.NLPUtils;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import edu.stanford.nlp.util.CoreMap;

/**
 * Calculates the tf-idf score of a candidate phrase.
 *
 * <p>Per document, {@link #value} records the raw term frequency of the candidate; at aggregation
 * time each per-document tf is normalized by document length and multiplied by a smoothed inverse
 * document frequency, and the mean over all documents is returned.
 */
public class TfIdfFeature extends Feature {

  private static final long serialVersionUID = -2391316562648124265L;

  public TfIdfFeature() {
    // Numeric-scaled feature; per-document values are accumulated in an ArrayList
    // (aggregation() below relies on this choice to fetch the last stored value).
    scale = Scale.NUMERIC;
    collectionToStoreDocVals = ArrayList.class;
  }

  /**
   * Records the raw term frequency of the candidate n-gram for the document under inspection.
   *
   * <p>The tf is the number of positions the n-gram occurs at in the document
   * ({@code getPositions().size()}); it is stored via the inherited
   * {@code updateFeatureVals(value, docIndex)} mechanism.
   */
  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train,
      int docToCheck, List<Map<String, Map<NGram, NGramStats>>> listOfHashs,
      List<CoreMap> sentences, DocumentData... docs) {
    updateFeatureVals(ngramForm.getValue().getPositions().size(), docToCheck);
  }

  /**
   * Aggregates the per-document tf values into a single tf-idf score.
   *
   * <p>The document frequency of the phrase is read from the inherited {@code dict} map
   * (presumably {@code Integer[0]}/{@code Integer[1]} hold train/test occurrence counts —
   * maintained by the {@code Feature} base class, not visible here). During training the current
   * document is excluded from the df via the {@code train ? 1 : 0} correction. Both numerator and
   * denominator of the idf term are add-one smoothed, so a zero df never yields a division by
   * zero or a log of zero.
   *
   * @param docVals per-document collections of recorded tf values; the last element of each is
   *        the tf actually used (matches how {@link #value} appends via updateFeatureVals)
   * @param phrase the surface form used as key into {@code dict}
   * @param train whether aggregation happens on the training set
   * @param length per-document lengths; {@code length.get(i)[0]} normalizes the i-th tf
   * @return the mean of the per-document tf-idf values
   */
  protected double aggregation(List<Collection<Number>> docVals, String phrase, boolean train,
      List<int[]> length) {
    double globalVal = 0.0d;
    Integer[] phraseStats = dict.get(phrase);
    if (phraseStats != null) {
      // Null array slots count as zero occurrences.
      globalVal = (phraseStats[0] == null ? 0 : phraseStats[0])
          + (phraseStats[1] == null ? 0 : phraseStats[1])
          - (train ? 1 : 0);
    }
    double[] values = new double[docVals.size()];
    for (int i = 0; i < values.length; ++i) {
      // collectionToStoreDocVals is ArrayList.class (see constructor), so the List cast is safe;
      // the last stored element is the tf for this document.
      List<Number> perDocVals = (List<Number>) docVals.get(i);
      double occurrenceInDoc = perDocVals.get(perDocVals.size() - 1).doubleValue();
      values[i] = (occurrenceInDoc / length.get(i)[0]) * Math.log((numDoc + 1) / (globalVal + 1));
    }
    return NLPUtils.mean(values);
  }
}