/** * */ package edu.berkeley.nlp.discPCFG; import edu.berkeley.nlp.util.Counter; /** * EncodedDatums are sparse representations of (labeled) feature count vectors for a given data point. Use * getNumActiveFeatures() to see how many features have non-zero count in a datum. Then, use getFeatureIndex() and * getFeatureCount() to retreive the number and count of each non-zero feature. Use getLabelIndex() to get the * label's number. */ public class EncodedDatum { public static <F,L> EncodedDatum encodeDatum(Encoding<F, L> encoding, Counter<F> features) { return encodeLabeledDatum(encoding, features, null, null); } public static <F,L> EncodedDatum encodeLabeledDatum(Encoding<F, L> encoding, Counter<F> features, L label, double[] weights) { Counter<F> knownFeatures = new Counter<F>(); for (F feature : features.keySet()) { if (encoding.getFeatureIndex(feature) < 0) continue; knownFeatures.incrementCount(feature, features.getCount(feature)); } int numActiveFeatures = knownFeatures.keySet().size(); int[] featureIndexes = new int[numActiveFeatures]; double[] featureCounts = new double[knownFeatures.keySet().size()]; int i = 0; for (F feature : knownFeatures.keySet()) { int index = encoding.getFeatureIndex(feature); double count = knownFeatures.getCount(feature); featureIndexes[i] = index; featureCounts[i] = count; i++; } int labelIndex = encoding.getLabelIndex(label); EncodedDatum encodedDatum = new EncodedDatum(labelIndex, featureIndexes, featureCounts, weights); return encodedDatum; } int labelIndex; int[] featureIndexes; double[] featureCounts; double[] weights; // the probability of each substate of the label (allows partial labeling) public int getLabelIndex() { return labelIndex; } public double[] getWeights() { return weights; } public int getNumActiveFeatures() { return featureCounts.length; } public int getFeatureIndex(int num) { return featureIndexes[num]; } public double getFeatureCount(int num) { return featureCounts[num]; } public EncodedDatum(int labelIndex, int[] featureIndexes, double[] featureCounts, double[] weights) { this.labelIndex = labelIndex; this.featureIndexes = featureIndexes; this.featureCounts = featureCounts; this.weights = weights; } }