// ConcatVectorNamespace.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.loglinear.model;

import java.io.BufferedWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Created on 10/20/15.
 * @author keenon
 * <p>
 * This is a wrapper class that keeps a namespace of recognized features, so that building a set of
 * ConcatVectors for featurizing a model is easier and more intuitive. It's actually quite simple, and threadsafe.
 */
public class ConcatVectorNamespace {
  /**
   * Maps each feature name to the component index it occupies in a ConcatVector. Within this class, every access
   * happens while holding this map's monitor.
   */
  final Map<String, Integer> featureToIndex = new HashMap<>();
  /**
   * Maps feature name -> (sparse value name -> offset inside that feature's component). Within this class, the outer
   * map is accessed while holding its own monitor, and each inner map while holding that inner map's monitor.
   */
  final Map<String, Map<String, Integer>> sparseFeatureIndex = new HashMap<>();
  /**
   * Inverse of {@link #sparseFeatureIndex}: feature name -> (offset -> sparse value name). Within this class, the
   * outer map is accessed under the monitor of {@link #sparseFeatureIndex}, and each inner map under the monitor of
   * the matching forward inner map, so that both directions are always updated together.
   */
  final Map<String, Map<Integer, String>> reverseSparseFeatureIndex = new HashMap<>();

  /**
   * Creates a new vector that is appropriately sized to accommodate all the features that have been named so far.
   *
   * @return a new, empty ConcatVector
   */
  public ConcatVector newVector() {
    return new ConcatVector(featureCount());
  }

  /**
   * This constructs a fresh vector that is sized correctly to accommodate all the known sparse values for vectors
   * that are possibly sparse.
   *
   * @return a new, internally correctly sized ConcatVector that will work correctly as weights for features from
   * this namespace;
   */
  public ConcatVector newWeightsVector() {
    // Snapshot the size of every sparse feature while holding the locks, so that a concurrent
    // ensureSparseFeature() call cannot invalidate the iteration (ConcurrentModificationException).
    final Map<String, Integer> sparseSizes = new HashMap<>();
    synchronized (sparseFeatureIndex) {
      for (Map.Entry<String, Map<String, Integer>> entry : sparseFeatureIndex.entrySet()) {
        final Map<String, Integer> sparseIndex = entry.getValue();
        synchronized (sparseIndex) {
          sparseSizes.put(entry.getKey(), sparseIndex.size());
        }
      }
    }

    // The vector itself is filled in outside of the locks; only the namespace maps are shared state.
    ConcatVector vector = new ConcatVector(featureCount());
    for (Map.Entry<String, Integer> entry : sparseSizes.entrySet()) {
      vector.setDenseComponent(ensureFeature(entry.getKey()), new double[entry.getValue()]);
    }
    return vector;
  }

  /**
   * An optimization, this lets clients inform the ConcatVectorNamespace of how many features to expect, so
   * that we can avoid resizing ConcatVectors.
   *
   * @param featureName the feature to add to our index
   * @return the component index assigned to the feature; a previously seen feature keeps its existing index, a new
   * one receives the next unused index
   */
  public int ensureFeature(String featureName) {
    synchronized (featureToIndex) {
      Integer index = featureToIndex.get(featureName);
      if (index == null) {
        // Indices are handed out densely, in order of first appearance.
        index = featureToIndex.size();
        featureToIndex.put(featureName, index);
      }
      return index;
    }
  }

  /**
   * An optimization, this lets clients inform the ConcatVectorNamespace of how many sparse feature components to
   * expect, again so that we can avoid resizing ConcatVectors.
   *
   * @param featureName the feature to use in our index
   * @param index       the sparse value to ensure is available
   * @return the offset assigned to the sparse value within the feature; a previously seen value keeps its existing
   * offset, a new one receives the next unused offset
   */
  public int ensureSparseFeature(String featureName, String index) {
    ensureFeature(featureName);

    final Map<String, Integer> sparseIndex;
    final Map<Integer, String> reverseSparseIndex;
    synchronized (sparseFeatureIndex) {
      Map<String, Integer> forward = sparseFeatureIndex.get(featureName);
      if (forward == null) {
        forward = new HashMap<>();
        sparseFeatureIndex.put(featureName, forward);
        reverseSparseFeatureIndex.put(featureName, new HashMap<>());
      }
      // Both lookups stay inside the lock: reading a HashMap while another thread inserts into it is a data race.
      sparseIndex = forward;
      reverseSparseIndex = reverseSparseFeatureIndex.get(featureName);
    }

    synchronized (sparseIndex) {
      Integer offset = sparseIndex.get(index);
      if (offset == null) {
        offset = sparseIndex.size();
        reverseSparseIndex.put(offset, index);
        sparseIndex.put(index, offset);
      }
      return offset;
    }
  }

  /**
   * This adds a dense feature to a vector, setting the appropriate component of the given vector to the passed in
   * value.
   *
   * @param vector      the vector
   * @param featureName the feature whose value to set
   * @param value       the value we want to set this vector to
   */
  public void setDenseFeature(ConcatVector vector, String featureName, double[] value) {
    vector.setDenseComponent(ensureFeature(featureName), value);
  }

  /**
   * This adds a sparse feature to a vector, setting the appropriate component of the given vector to the passed in
   * value.
   *
   * @param vector      the vector
   * @param featureName the feature whose value to set
   * @param index       the index of the one-hot vector to set, as a string, which we will translate into a mapping
   * @param value       the value we want to set this one-hot index to
   */
  public void setSparseFeature(ConcatVector vector, String featureName, String index, double value) {
    vector.setSparseComponent(ensureFeature(featureName), ensureSparseFeature(featureName, index), value);
  }

  /**
   * This prints out a ConcatVector by mapping to the namespace, to make debugging learning algorithms easier.
   *
   * @param vector the vector to print
   * @param bw     the output stream to write to
   * @throws IOException if writing to {@code bw} fails
   */
  public void debugVector(ConcatVector vector, BufferedWriter bw) throws IOException {
    // Copy the feature table under the lock, then write without holding it, so that no I/O happens while the
    // namespace is locked and concurrent registrations cannot break the iteration.
    final String[] names;
    final int[] components;
    synchronized (featureToIndex) {
      names = new String[featureToIndex.size()];
      components = new int[names.length];
      int n = 0;
      for (Map.Entry<String, Integer> entry : featureToIndex.entrySet()) {
        names[n] = entry.getKey();
        components[n] = entry.getValue();
        n++;
      }
    }

    for (int n = 0; n < names.length; n++) {
      final String key = names[n];
      final int i = components[n];
      bw.write(key);
      bw.write(":\n");
      if (vector.isComponentSparse(i)) {
        debugFeatureValue(key, i, vector.getSparseIndex(i), vector, bw);
      } else {
        double[] arr = vector.getDenseComponent(i);
        for (int j = 0; j < arr.length; j++) {
          debugFeatureValue(key, i, j, vector, bw);
        }
      }
    }
  }

  /**
   * This writes a feature's individual value, using the human readable name if possible, to the given writer.
   *
   * @param feature   the name of the feature being printed
   * @param component the component index of {@code feature} in the vector
   * @param index     the offset within the component whose value is printed
   * @param vector    the vector to read the value from
   * @param bw        the writer to print to
   * @throws IOException if writing to {@code bw} fails
   */
  private void debugFeatureValue(String feature, int component, int index, ConcatVector vector, BufferedWriter bw)
      throws IOException {
    bw.write("\t");
    final String sparseName = lookupSparseName(feature, index);
    if (sparseName != null) {
      // we can map this index to an interpretable string, so we do
      bw.write(sparseName);
    } else {
      // we can't map this to a useful string, so we default to the number
      bw.write(Integer.toString(index));
    }
    bw.write(": ");
    bw.write(Double.toString(vector.getValueAt(component, index)));
    bw.write("\n");
  }

  /**
   * Returns how many features have been registered so far, read under the feature table's lock.
   */
  private int featureCount() {
    synchronized (featureToIndex) {
      return featureToIndex.size();
    }
  }

  /**
   * Looks up the human readable name registered for an offset of a sparse feature.
   *
   * @return the registered name, or {@code null} if the feature has no sparse values or the offset is unknown
   */
  private String lookupSparseName(String feature, int index) {
    final Map<String, Integer> forward;
    final Map<Integer, String> reverse;
    synchronized (sparseFeatureIndex) {
      forward = sparseFeatureIndex.get(feature);
      reverse = reverseSparseFeatureIndex.get(feature);
    }
    if (forward == null || reverse == null) {
      return null;
    }
    // The reverse map is only ever written while holding the forward map's monitor, so read it under the same one.
    // A direct lookup replaces the linear values().contains() scan: both maps always hold the same offsets.
    synchronized (forward) {
      return reverse.get(index);
    }
  }
}