package hu.u_szeged.kpe.features;

import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.readers.DocumentData;

import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import edu.stanford.nlp.util.CoreMap;

/**
 * Determines the number of tokens a candidate phrase consists of, and aggregates the
 * frequency-weighted mean, minimum, maximum and standard deviation of these lengths.
 */
public class LengthFeature extends Feature {

  private static final long serialVersionUID = -6828991006669939839L;

  public LengthFeature() {
    scale = Scale.NUMERIC;
    dummyValue = -1;
    collectionToStoreDocVals = LinkedList.class;
  }

  public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
    // store a (token length, occurrence frequency) pair for this n-gram form in the given document
    updateFeatureVals(ngramForm.getKey().size(), docToCheck);
    updateFeatureVals(ngramForm.getValue().getPositions().size(), docToCheck);
  }

  public Map<String, Double> aggregateVals(boolean train, String token, List<int[]> length, double[] dedicatedFeatures) {
    Map<Integer, Integer> lengthFreqs = new HashMap<Integer, Integer>();
    int sumOccurrence = 0, sumTotalLength = 0, minLength = Integer.MAX_VALUE, maxLength = Integer.MIN_VALUE;
    for (List<Collection<Number>> entry : featureVals.values()) {
      int documents = entry.size();
      doc: for (int i = 0; i < documents; ++i) {
        // the per-document values are stored as (token length, occurrence frequency) pairs
        Iterator<Number> lengthOccurrencesIterator = entry.get(i).iterator();
        while (lengthOccurrencesIterator.hasNext()) {
          int tokenLength = lengthOccurrencesIterator.next().intValue();
          if (tokenLength == 0) {
            break doc;
          }
          if (tokenLength > maxLength) {
            maxLength = tokenLength;
          }
          if (tokenLength < minLength) {
            minLength = tokenLength;
          }
          int freq = lengthOccurrencesIterator.next().intValue();
          Integer prevVal = lengthFreqs.get(tokenLength);
          lengthFreqs.put(tokenLength, (prevVal == null ? 0 : prevVal) + freq);
          sumOccurrence += freq;
          sumTotalLength += freq * tokenLength;
        }
      }
    }
    // frequency-weighted mean and standard deviation of the token lengths
    double mean = sumTotalLength / (double) sumOccurrence;
    double stdev = 0.0;
    for (Entry<Integer, Integer> occurrences : lengthFreqs.entrySet()) {
      stdev += occurrences.getValue() * (occurrences.getKey() - mean) * (occurrences.getKey() - mean);
    }
    stdev = Math.sqrt(stdev / sumOccurrence);

    Map<String, Double> aggregatedVals = new HashMap<String, Double>();
    aggregatedVals.put("TokenLengthMean", mean);
    aggregatedVals.put("TokenLengthMinLength", (double) minLength);
    aggregatedVals.put("TokenLengthMaxLength", (double) maxLength);
    aggregatedVals.put("TokenLengthSTDev", stdev);
    // reset these so that the next time a set of documents is keyphrased the counter starts from the beginning
    documentToExamine = -1;
    featureVals = new HashMap<String, List<Collection<Number>>>();
    return aggregatedVals;
  }

  // public LengthFeature() {
  //   scale = Scale.NUMERIC;
  //   collectionToStoreDocVals = TreeSet.class;
  // }
  //
  // public void value(String phrase, int[] length, Entry<NGram, NGramStats> ngramForm, boolean train, int docToCheck,
  //     List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<CoreMap> sentences, DocumentData... docs) {
  //   updateFeatureVals(ngramForm.getKey().size(), docToCheck);
  // }
}
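
/*
 * Minimal, self-contained sketch of the frequency-weighted length statistics that
 * aggregateVals() above derives. The class name and the sample length -> frequency
 * map are illustrative only and are not part of the feature framework; the sketch
 * only reproduces the arithmetic (weighted mean, min, max, standard deviation).
 */
class LengthStatsSketch {

  public static void main(String[] args) {
    // hypothetical counts: how often the candidate occurred as a 1-, 2- and 3-token form
    Map<Integer, Integer> lengthFreqs = new HashMap<Integer, Integer>();
    lengthFreqs.put(1, 4);
    lengthFreqs.put(2, 3);
    lengthFreqs.put(3, 1);

    int sumOccurrence = 0, sumTotalLength = 0, minLength = Integer.MAX_VALUE, maxLength = Integer.MIN_VALUE;
    for (Entry<Integer, Integer> e : lengthFreqs.entrySet()) {
      int tokenLength = e.getKey();
      int freq = e.getValue();
      minLength = Math.min(minLength, tokenLength);
      maxLength = Math.max(maxLength, tokenLength);
      sumOccurrence += freq;
      sumTotalLength += freq * tokenLength;
    }

    // weighted mean and standard deviation, mirroring the computation in aggregateVals()
    double mean = sumTotalLength / (double) sumOccurrence;
    double stdev = 0.0;
    for (Entry<Integer, Integer> e : lengthFreqs.entrySet()) {
      stdev += e.getValue() * (e.getKey() - mean) * (e.getKey() - mean);
    }
    stdev = Math.sqrt(stdev / sumOccurrence);

    System.out.printf("mean=%.3f min=%d max=%d stdev=%.3f%n", mean, minLength, maxLength, stdev);
  }
}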