package hu.u_szeged.kpe.features;

import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.kpe.candidates.NGramStats;
import hu.u_szeged.kpe.features.Feature.Scale;
import hu.u_szeged.kpe.readers.DocumentData;
import hu.u_szeged.kpe.readers.DocumentSet;
import hu.u_szeged.ml.DataHandler;
import hu.u_szeged.utils.NLPUtils;

import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;

import edu.stanford.nlp.util.CoreMap;

/**
 * Class for storing the various Feature objects and the mapping to their list of double values.
 * <p>
 * Besides holding the features themselves, this class pushes the per-candidate feature values into
 * a {@link DataHandler} and (during training) keeps a per-feature histogram of the observed values
 * so that rarely-firing features can be filtered afterwards via {@link #getRareFeatures(int)}.
 */
public class FeatureHolder extends ArrayList<Feature> {

  private static final long serialVersionUID = 4577522680430595054L;

  /**
   * Instance ids of this shape are treated as belonging to the target domain when the
   * (odd-valued) domain adaptation mode is active. Compiled once instead of calling
   * {@code String.matches} repeatedly.
   */
  private static final Pattern TARGET_INSTANCE_ID = Pattern.compile("target_(\\d+_)+.*");

  // Histogram of feature values seen during training: feature name -> (value -> occurrence count).
  // NOTE(review): transient and only initialized in the constructor, so it would be null after
  // Java deserialization of a FeatureHolder — presumably instances are never serialized; verify.
  private transient Map<String, Map<Double, Integer>> featureValDistribution;

  /**
   * Creates an empty holder with the given initial feature capacity.
   *
   * @param initialCapacity initial capacity of the backing {@link ArrayList}
   */
  public FeatureHolder(int initialCapacity) {
    super(initialCapacity);
    featureValDistribution = new HashMap<String, Map<Double, Integer>>();
  }

  /**
   * Computes and stores every feature value of one keyphrase candidate into {@code dataHandler}.
   * <p>
   * First each feature is updated document-by-document with the candidate's per-document
   * statistics (or a dummy value where the candidate does not occur), then the aggregated values
   * are written to the data handler. When the odd-valued domain adaptation mode is on
   * ({@code DocumentSet.adaptationType % 2 == 1}), every value is additionally stored under a
   * {@code "T_"} (target) or {@code "S_"} (source) prefixed name. During training the value
   * distribution of each feature is recorded for later rare-feature filtering.
   *
   * @param dedicatedFeats precomputed dedicated feature values forwarded to aggregation
   * @param phrase the normalized candidate phrase
   * @param instanceId identifier of the instance in the data handler
   * @param length per-document length statistics
   * @param listOfHashs per-document candidate vocabularies (phrase -> surface form -> stats)
   * @param grammars per-document parsed sentences, indexed by section and sentence position
   * @param dataHandler destination of the computed values
   * @param train whether this call happens at training time
   * @param docs the documents the candidate was extracted from
   */
  public void updateDataHandler(double[] dedicatedFeats, String phrase, String instanceId, List<int[]> length,
      List<Map<String, Map<NGram, NGramStats>>> listOfHashs, List<Map<Integer, List<CoreMap>>> grammars,
      DataHandler dataHandler, boolean train, DocumentData... docs) {
    Feature.numberOfDocs = listOfHashs.size();
    double[] ngramAlternations = new double[listOfHashs.size()];
    for (int docToCheck = 0; docToCheck < listOfHashs.size(); ++docToCheck) {
      Map<Integer, List<CoreMap>> documentSentences = grammars.get(docToCheck);
      Map<String, Map<NGram, NGramStats>> docVocabulary = listOfHashs.get(docToCheck);
      Map<NGram, NGramStats> docPhrasesStats = docVocabulary.get(phrase);
      // When keyphrasing multiple documents it is possible that one keyphrase aspirant is not
      // present in some of the documents. Add some dummy feature values in these cases.
      if (docPhrasesStats == null) {
        for (Feature f : this) {
          f.updateFeatureVals(f.getDummyVal(), docToCheck);
        }
        continue;
      }
      // Number of distinct surface forms of the candidate in this document.
      ngramAlternations[docToCheck] = docPhrasesStats.size();
      int[] docLength = length.get(docToCheck);
      for (Entry<NGram, NGramStats> ngramModificationOfDoc : docPhrasesStats.entrySet()) {
        // Collect the parsed sentences in which this surface form occurs.
        List<CoreMap> sentences = new LinkedList<CoreMap>();
        for (int[] sentenceLocation : ngramModificationOfDoc.getValue().getSentencePositions()) {
          sentences.add(documentSentences.get(sentenceLocation[0]).get(sentenceLocation[1]));
        }
        for (Feature f : this) {
          f.value(phrase, docLength, ngramModificationOfDoc, train, docToCheck, listOfHashs, sentences, docs);
        }
      }
    }

    double averageAlternations = NLPUtils.mean(ngramAlternations);
    setNumericWithAdaptation(dataHandler, instanceId, "numOfAppearances", averageAlternations, train);

    // Aggregate the per-document values feature by feature and push them into the data handler.
    for (Feature f : this) {
      Map<String, Double> values = f.aggregateVals(train, phrase, length, dedicatedFeats);
      for (Entry<String, Double> val : values.entrySet()) {
        double value = val.getValue();
        if (Double.isNaN(value) || Double.isInfinite(value)) {
          // A non-finite value means the feature implementation is broken; abort loudly.
          System.err.println(f + "\n" + phrase + "\n" + val);
          System.exit(2);
        }
        if (f.scale == Scale.BINARY) {
          boolean binaryValue = value >= 0.5;
          dataHandler.setBinaryValue(instanceId, val.getKey(), binaryValue);
          if (adaptationEnabled()) {
            dataHandler.setBinaryValue(instanceId, adaptationPrefix(train, instanceId) + val.getKey(), binaryValue);
          }
          if (train) {
            // Binary features are recorded in the distribution as exact 0.0/1.0 values.
            recordTrainingValue(val.getKey(), binaryValue ? 1.0d : 0.0d);
          }
        } else if (f.scale == Scale.NOMINAL) {
          // TODO implement such cases
          System.out.println("Dealing with nominal attributes in class FeatureHolder is not implemented yet.");
        } else {
          setNumericWithAdaptation(dataHandler, instanceId, val.getKey(), value, train);
          if (train) {
            recordTrainingValue(val.getKey(), value);
          }
        }
      }
    }
  }

  /** Domain adaptation is active exactly for the odd adaptation types. */
  private static boolean adaptationEnabled() {
    return DocumentSet.adaptationType % 2 == 1;
  }

  /**
   * Returns the feature-name prefix used under domain adaptation: {@code "T_"} for test-time or
   * target-domain instances, {@code "S_"} for source-domain training instances.
   */
  private static String adaptationPrefix(boolean train, String instanceId) {
    return (!train || TARGET_INSTANCE_ID.matcher(instanceId).matches()) ? "T_" : "S_";
  }

  /**
   * Stores a numeric value and, when domain adaptation is on, its {@code "T_"}/{@code "S_"}
   * prefixed duplicate.
   */
  private void setNumericWithAdaptation(DataHandler dataHandler, String instanceId, String key, double value,
      boolean train) {
    dataHandler.setNumericValue(instanceId, key, value);
    if (adaptationEnabled()) {
      dataHandler.setNumericValue(instanceId, adaptationPrefix(train, instanceId) + key, value);
    }
  }

  /** Increments the training-time occurrence count of {@code value} for the given feature name. */
  private void recordTrainingValue(String featureName, double value) {
    Map<Double, Integer> valsForFeature = featureValDistribution.get(featureName);
    if (valsForFeature == null) {
      // TreeMap keeps the histogram sorted by value, matching the file dumped in getRareFeatures.
      valsForFeature = new TreeMap<Double, Integer>();
      featureValDistribution.put(featureName, valsForFeature);
    }
    Integer prevCount = valsForFeature.get(value);
    valsForFeature.put(value, prevCount == null ? 1 : prevCount + 1);
  }

  /**
   * Returns the training-time value distribution gathered so far.
   * <p>
   * NOTE(review): this exposes the internal mutable map directly; callers can modify it.
   *
   * @return feature name -> (value -> occurrence count)
   */
  public Map<String, Map<Double, Integer>> getFeatureValDistribution() {
    return featureValDistribution;
  }

  /**
   * Determines the features that fired (i.e. took a positive value) at most {@code threshold}
   * times during training, dumping the full distribution to the file {@code feature.stats} as a
   * side effect.
   *
   * @param threshold maximum number of positive occurrences for a feature to count as rare
   * @return the names of the rare features
   */
  public Set<String> getRareFeatures(int threshold) {
    Set<String> unwantedFeatureNames = new HashSet<String>();
    try (PrintWriter out = new PrintWriter("feature.stats")) {
      for (Entry<String, Map<Double, Integer>> featVals : featureValDistribution.entrySet()) {
        out.println(featVals.getKey() + "\t" + featVals.getValue());
        int nonZero = 0;
        for (Entry<Double, Integer> featValOfFeature : featVals.getValue().entrySet()) {
          if (featValOfFeature.getKey() > 0.0d) {
            nonZero += featValOfFeature.getValue();
          }
        }
        if (nonZero <= threshold) {
          unwantedFeatureNames.add(featVals.getKey());
        }
      }
    } catch (IOException io) {
      io.printStackTrace();
    }
    return unwantedFeatureNames;
  }
}