package hu.u_szeged.kpe.features; import hu.u_szeged.kpe.candidates.NGram; import hu.u_szeged.kpe.candidates.NGramStats; import hu.u_szeged.kpe.main.KPEFilter; import hu.u_szeged.kpe.readers.DocumentData; import hu.u_szeged.utils.NLPUtils; import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Map.Entry; import edu.stanford.nlp.util.CoreMap; /** * Calculates sf-isf (section frequency-inverted section frequency) in the form of sf(term, document)*isf(term). */ public class SectionFeature extends Feature { private static final long serialVersionUID = 7606431493967247562L; private int numSections; public SectionFeature() { scale = Scale.NUMERIC; collectionToStoreDocVals = HashSet.class; } public void setFeatureField(KPEFilter kf) { numSections = kf.getNumSections(); } public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck, List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) { for (Integer sectionPresent : ngramForm.getValue().getSectionIds()) updateFeatureVals(sectionPresent, docToCheck); } protected double aggregation(List<Collection<Number>> docVals, String phrase, boolean train, List<int[]> length) { double globalVal = 0.0d; double[] globalVals = new double[docVals.size()]; Integer[] phraseStats = dict.get(phrase); if (phraseStats != null) globalVal += phraseStats[2]; for (int doc = 0; train && doc < docVals.size(); ++doc) { globalVals[doc] = globalVal - docVals.get(doc).size(); if (globalVals[doc] < 0) { System.err.println("Apparently, we have some problem."); } } double[] values = new double[globalVals.length]; for (int i = 0; i < globalVals.length; ++i) values[i] = ((double) docVals.get(i).size() / length.get(i)[1]) * Math.log((numSections + 1) / (globalVals[i] + 1)); return NLPUtils.mean(values); } }