package edu.stanford.nlp.loglinear.model;
import java.io.BufferedWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
/**
* Created on 10/20/15.
* @author keenon
* <p>
* This is a wrapper class that keeps a namespace of recognized features, so that building a set of
* ConcatVectors for featurizing a model is easier and more intuitive. It's actually quite simple, and is intended to
* be threadsafe.
*/
public class ConcatVectorNamespace {
  // Maps a feature name to the index of its component inside a ConcatVector.
  // Guarded by synchronized (featureToIndex) in the registration paths.
  final Map<String, Integer> featureToIndex = new HashMap<>();
  // Maps a feature name to its (sparse value name -> sparse index) table.
  // The outer map is guarded by synchronized (sparseFeatureIndex); each inner map is guarded by its own monitor.
  final Map<String, Map<String, Integer>> sparseFeatureIndex = new HashMap<>();
  // Mirror of sparseFeatureIndex: feature name -> (sparse index -> sparse value name).
  // Written only while the lock of the corresponding sparseFeatureIndex entry (outer or inner) is held.
  final Map<String, Map<Integer, String>> reverseSparseFeatureIndex = new HashMap<>();

  /**
   * Creates a new vector that is appropriately sized to accommodate all the features that have been named so far.
   *
   * @return a new, empty ConcatVector
   */
  public ConcatVector newVector() {
    return new ConcatVector(featureCount());
  }

  /**
   * This constructs a fresh vector that is sized correctly to accommodate all the known sparse values for vectors
   * that are possibly sparse. For every feature that has sparse values registered, the matching component is set
   * to a zeroed dense array with one slot per registered sparse value.
   *
   * @return a new, internally correctly sized ConcatVector that will work correctly as weights for features from
   * this namespace
   */
  public ConcatVector newWeightsVector() {
    ConcatVector vector = new ConcatVector(featureCount());
    // Hold the outer lock while iterating so a concurrent ensureSparseFeature() cannot structurally modify the
    // map mid-iteration. Lock order is always outer map -> inner map / featureToIndex, never the reverse.
    synchronized (sparseFeatureIndex) {
      for (Map.Entry<String, Map<String, Integer>> entry : sparseFeatureIndex.entrySet()) {
        final Map<String, Integer> sparseIndex = entry.getValue();
        final int size;
        // Inner maps are mutated under their own monitor, so read the size under that same monitor.
        synchronized (sparseIndex) {
          size = sparseIndex.size();
        }
        vector.setDenseComponent(ensureFeature(entry.getKey()), new double[size]);
      }
    }
    return vector;
  }

  /**
   * An optimization, this lets clients inform the ConcatVectorNamespace of how many features to expect, so
   * that we can avoid resizing ConcatVectors. Registers the feature if it has not been seen before; indices are
   * handed out in registration order, starting at 0.
   *
   * @param featureName the feature to add to our index
   * @return the component index assigned to this feature
   */
  public int ensureFeature(String featureName) {
    synchronized (featureToIndex) {
      Integer existing = featureToIndex.get(featureName);
      if (existing != null) {
        return existing;
      }
      int assigned = featureToIndex.size();
      featureToIndex.put(featureName, assigned);
      return assigned;
    }
  }

  /**
   * An optimization, this lets clients inform the ConcatVectorNamespace of how many sparse feature components to
   * expect, again so that we can avoid resizing ConcatVectors. Registers both the feature and the sparse value if
   * either is new; sparse indices are handed out per feature in registration order, starting at 0.
   *
   * @param featureName the feature to use in our index
   * @param index the sparse value to ensure is available
   * @return the sparse index assigned to this value within the feature
   */
  public int ensureSparseFeature(String featureName, String index) {
    ensureFeature(featureName);
    final Map<String, Integer> sparseIndex;
    final Map<Integer, String> reverseSparseIndex;
    // Both the creation of the per-feature tables and the lookup of them happen under the outer lock, so this
    // thread never reads the shared HashMaps while another thread is inserting into them.
    synchronized (sparseFeatureIndex) {
      Map<String, Integer> forward = sparseFeatureIndex.get(featureName);
      if (forward == null) {
        forward = new HashMap<>();
        sparseFeatureIndex.put(featureName, forward);
        reverseSparseFeatureIndex.put(featureName, new HashMap<>());
      }
      sparseIndex = forward;
      reverseSparseIndex = reverseSparseFeatureIndex.get(featureName);
    }
    // Per-feature lock: registrations for different features do not contend with each other.
    synchronized (sparseIndex) {
      Integer existing = sparseIndex.get(index);
      if (existing != null) {
        return existing;
      }
      int assigned = sparseIndex.size();
      reverseSparseIndex.put(assigned, index);
      sparseIndex.put(index, assigned);
      return assigned;
    }
  }

  /**
   * This adds a dense feature to a vector, setting the appropriate component of the given vector to the passed in
   * value. The feature is registered in this namespace if it was not already known.
   *
   * @param vector the vector
   * @param featureName the feature whose value to set
   * @param value the value we want to set this vector to
   */
  public void setDenseFeature(ConcatVector vector, String featureName, double[] value) {
    vector.setDenseComponent(ensureFeature(featureName), value);
  }

  /**
   * This adds a sparse feature to a vector, setting the appropriate component of the given vector to the passed in
   * value. Both the feature and the sparse value name are registered in this namespace if not already known.
   *
   * @param vector the vector
   * @param featureName the feature whose value to set
   * @param index the index of the one-hot vector to set, as a string, which we will translate into a mapping
   * @param value the value we want to set this one-hot index to
   */
  public void setSparseFeature(ConcatVector vector, String featureName, String index, double value) {
    vector.setSparseComponent(ensureFeature(featureName), ensureSparseFeature(featureName, index), value);
  }

  /**
   * This prints out a ConcatVector by mapping to the namespace, to make debugging learning algorithms easier.
   * Each known feature is written as a header line followed by one tab-indented line per value.
   * <p>
   * This method takes no locks (so that no lock is held during I/O); call it while no other thread is
   * registering features.
   *
   * @param vector the vector to print
   * @param bw the output stream to write to
   * @throws IOException if writing to {@code bw} fails
   */
  public void debugVector(ConcatVector vector, BufferedWriter bw) throws IOException {
    for (Map.Entry<String, Integer> entry : featureToIndex.entrySet()) {
      String key = entry.getKey();
      bw.write(key);
      bw.write(":\n");
      int component = entry.getValue();
      if (vector.isComponentSparse(component)) {
        debugFeatureValue(key, vector.getSparseIndex(component), vector, bw);
      } else {
        double[] arr = vector.getDenseComponent(component);
        for (int j = 0; j < arr.length; j++) {
          debugFeatureValue(key, j, vector, bw);
        }
      }
    }
  }

  /**
   * This writes a feature's individual value, using the human readable name if possible, to the given writer.
   *
   * @param feature the name of the feature the value belongs to
   * @param index the position within the feature's component
   * @param vector the vector to read the value from
   * @param bw the writer to emit the line to
   * @throws IOException if writing to {@code bw} fails
   */
  private void debugFeatureValue(String feature, int index, ConcatVector vector, BufferedWriter bw) throws IOException {
    bw.write("\t");
    // A single lookup in the reverse table replaces a linear scan over the forward table's values.
    Map<Integer, String> reverseSparseIndex = reverseSparseFeatureIndex.get(feature);
    String readableName = reverseSparseIndex == null ? null : reverseSparseIndex.get(index);
    if (readableName != null) {
      // we can map this index to an interpretable string, so we do
      bw.write(readableName);
    } else {
      // we can't map this to a useful string, so we default to the number
      bw.write(Integer.toString(index));
    }
    bw.write(": ");
    bw.write(Double.toString(vector.getValueAt(featureToIndex.get(feature), index)));
    bw.write("\n");
  }

  /**
   * Reads the number of registered features under the same lock that guards registration.
   */
  private int featureCount() {
    synchronized (featureToIndex) {
      return featureToIndex.size();
    }
  }
}