package LBJ2.classify; import java.io.Serializable; import java.util.Iterator; import java.util.Map; import java.util.NoSuchElementException; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; import LBJ2.learn.Lexicon; import LBJ2.util.ExceptionlessInputStream; import LBJ2.util.ExceptionlessOutputStream; import LBJ2.util.FVector; /** * Objects of this class are returned by classifiers that have been applied * to an object. * * @author Nick Rizzolo **/ public class FeatureVector implements Cloneable, Serializable { /** Stores non-label features. */ protected FVector features; /** Stores labels. */ protected FVector labels; /** With this variable, the user can weight the entire vector. */ protected double weight; /** Caches the result of the {@link #makeReal()} method. */ protected FeatureVector realCache; /** Simply instantiates the member variables. */ public FeatureVector() { features = new FVector(); labels = new FVector(); weight = 1; } /** * Creates the vector and adds the given feature to it. * * @param f A feature to start this vector off with. **/ public FeatureVector(Feature f) { this(); addFeature(f); } /** * Creates the vector and adds the given features to it. * * @param features A feature array to start this vector off with. **/ public FeatureVector(Feature[] features) { this(); for (int f = 0; f < features.length; f++) addFeature(features[f]); } /** * Instantiates a feature vector from example arrays and lexicons. * * @param ex The example array. * @param lex The feature lexicon. * @param llex The label lexicon. **/ public FeatureVector(Object[] ex, Lexicon lex, Lexicon llex) { this(); int[] fs = (int[]) ex[0]; double[] vs = (double[]) ex[1]; for (int i = 0; i < fs.length; ++i) { Feature f = lex.lookupKey(fs[i]); Feature ff = f.withStrength(vs[i]); addFeature(ff == null ? f : ff); } if (ex.length > 2) { int[] ls = (int[]) ex[2]; double[] lvs = (double[]) ex[3]; for (int i = 0; i < ls.length; ++i) { Feature f = llex.lookupKey(ls[i]); if (!f.isDiscrete()) f = f.withStrength(lvs[i]); addLabel(f); } } } /** * The size of this vector is defined as the size of {@link #features} plus * the size of {@link #labels}. * * @return The size of this vector. **/ public int size() { return features.size() + labels.size(); } /** Returns the size of just the {@link #features} list. */ public int featuresSize() { return features.size(); } /** Returns the size of just the {@link #labels} list. */ public int labelsSize() { return labels.size(); } /** * Returns the feature at the specified index. * * @param index The index of the requested feature. * @return The feature. **/ public Feature getFeature(int index) { return features.get(index); } /** * Returns the label at the specified index. * * @param index The index of the requested label. * @return The label. **/ public Feature getLabel(int index) { return labels.get(index); } /** Returns the value of {@link #weight}. */ public double getWeight() { return weight; } /** Removes all elements from both {@link #features} and {@link #labels}. */ public void clear() { features = new FVector(); labels = new FVector(); realCache = null; } /** Removes all elements from just the {@link #labels} list. */ public void clearLabels() { labels = new FVector(); } /** Sorts both of the feature lists. */ public void sort() { features.sort(); labels.sort(); } /** * Adds a feature to the vector. * * @param f The features to be added. **/ public void addFeature(Feature f) { features.add(f); realCache = null; } /** * Adds all the features in another vector to this vector. * * @param v The vector whose features are to be added. **/ public void addFeatures(FeatureVector v) { features.addAll(v.features); realCache = null; } /** * Adds a label to the vector. * * @param l The label to be added. **/ public void addLabel(Feature l) { labels.add(l); } /** * Adds all the features in another vector (but not the labels in that * vector) to the labels of this vector. * * @param v The vector whose features will become this vector's labels. **/ public void addLabels(FeatureVector v) { labels.addAll(v.features); } /** * Determines whether this vector has any labels. * * @return <code>true</code> iff this vector has at least one label. **/ public boolean isLabeled() { return labels.size() > 0; } /** * Converts all of the features in the {@link #features} list to * {@link RealFeature}s with appropriate strengths. Otherwise, the * returned feature vector is the same as this one. In particular, the * {@link #labels} list of the returned vector is a shallow clone of this * vector's {@link #labels} list. * * @return A new feature vector which is the same as this one, except all * features have been converted to {@link RealFeature}s. **/ public FeatureVector makeReal() { if (realCache == null) { realCache = (FeatureVector) clone(); int N = realCache.labels.size(); for (int i = 0; i < N; ++i) realCache.labels.set(i, realCache.labels.get(i).makeReal()); N = realCache.features.size(); for (int i = 0; i < N; ++i) realCache.features.set(i, realCache.features.get(i).makeReal()); } return realCache; } /** * Returns all the values of the features in this vector (not labels) * arranged in a <code>String</code> array. * * @return An array of <code>String</code>s with all the feature values * from this vector, or <code>null</code> if there are any * {@link RealFeature}s in this vector. **/ public String[] discreteValueArray() { String[] result = new String[features.size()]; for (int i = 0; i < result.length; ++i) result[i] = features.get(i).getStringValue(); return result; } /** * Returns all the values of the features in this vector (not labels) * arranged in a <code>double</code> array. * * @return An array of <code>double</code>s with all the feature values * from this vector, or <code>null</code> if there are any * {@link DiscreteFeature}s in this vector. **/ public double[] realValueArray() { double[] result = new double[features.size()]; for (int i = 0; i < result.length; ++i) result[i] = features.get(i).getStrength(); return result; } /** * Returns the first feature in {@link #features}. * * @return The first feature, or <code>null</code> if there aren't any. **/ public Feature firstFeature() { return features.get(0); } /** Removes and returns the first feature in {@link #features}. * / public Feature removeFirstFeature() { realCache = null; return (Feature) features.removeFirst(); } */ /** * Returns the first feature in {@link #labels}. * * @return The first label, or <code>null</code> if there aren't any. **/ public Feature firstLabel() { return labels.get(0); } /** * Returns the square of the magnitude of the feature vector. * * @return The square of the magnitude of the feature vector. **/ public double L2NormSquared() { double sum = 0; int N = features.size(); for (int i = 0; i < N; ++i) { double val = features.get(i).getStrength(); sum += val * val; } return sum; } /** * Returns the square of the magnitude of the given vector. * * @param exampleValues A vector. * @return The square of the magnitude of the given vector. **/ public static double L2NormSquared(double[] exampleValues) { double sum = 0; for (int i = 0; i < exampleValues.length; i++) sum += exampleValues[i] * exampleValues[i]; return sum; } /** * The hash code for a <code>FeatureVector</code> is simply the sum of the * hash codes of the features and the labels. * * @return The hash code of this vector. **/ public int hashCode() { int result = 0; int N = features.size(); for (int i = 0; i < N; ++i) result = 17 * result + features.get(i).hashCode(); N = labels.size(); for (int i = 0; i < N; ++i) result = 31 * result + labels.get(i).hashCode(); return result; } /** * Two <code>FeatureVector</code>s are equivalent if they contain the same * features and labels, as defined by {@link Feature} equivalence. * * @param o The object to compare with this <code>FeatureVector</code> for * equality. * @return True iff <code>o</code> is a <code>FeatureVector</code> * equivalent with this vector as defined above. **/ public boolean equals(Object o) { if (!(o instanceof FeatureVector)) return false; FeatureVector v = (FeatureVector) o; return features.equals(v.features) && labels.equals(v.labels); } /** * Returns a sorted map where the key is the feature index and the value is * the feature value. If there are multiple occurrences of the same * feature, then the corresponding values are summed up. * * @param features The feature indices. * @param values The feature values. * @return The sorted map. **/ public static SortedMap getSortedMap(int[] features, double[] values) { SortedMap map = new TreeMap(); for (int i = 0; i < features.length; i++) { Integer key = Integer.valueOf(features[i]); Object value = map.get(key); if (value == null) map.put(key, Double.valueOf(values[i])); else map.put(key, Double.valueOf(((Double) value).doubleValue() + values[i])); } return map; } /** * Computes the dot product of the 2 argument vectors. * * @param firstFeatures The first feature vector's indices. * @param firstValues The first feature vector's values. * @param secondFeatures The second feature vector's indices. * @param secondValues The second feature vector's values. * @return The dot product. **/ public static double dot(int[] firstFeatures, double[] firstValues, int[] secondFeatures, double[] secondValues) { Set firstFeatureValueSet = getSortedMap(firstFeatures, firstValues).entrySet(); Set secondFeatureValueSet = getSortedMap(secondFeatures, secondValues).entrySet(); double result = 0.0; try { Iterator firstIterator = firstFeatureValueSet.iterator(); Iterator secondIterator = secondFeatureValueSet.iterator(); Map.Entry firstEntry = (Map.Entry) firstIterator.next(); Map.Entry secondEntry = (Map.Entry) secondIterator.next(); while(true) { int firstEntryKey = ((Integer) firstEntry.getKey()).intValue(); int secondEntryKey = ((Integer) secondEntry.getKey()).intValue(); if (firstEntryKey == secondEntryKey) { result += ((Double) firstEntry.getValue()).doubleValue() * ((Double) secondEntry.getValue()).doubleValue(); firstEntry = (Map.Entry) firstIterator.next(); secondEntry = (Map.Entry) secondIterator.next(); } else if (firstEntryKey < secondEntryKey) firstEntry = (Map.Entry) firstIterator.next(); else secondEntry = (Map.Entry) secondIterator.next(); } } catch (NoSuchElementException nsee) { // Program reaches here when one of the iterator.next() in the above // try catch block leads to this exception, and so we are done // computing the dot product. } return result; } /** * Take the dot product of two feature vectors. * * @param vector The feature vector to take the dot product with. * @return The dot product of this feature vector and <code>vector</code>. **/ public double dot(FeatureVector vector) { if (features.size() == 0 || vector.features.size() == 0) return 0; FVector v1 = (FVector) features.clone(); FVector v2 = (FVector) vector.features.clone(); v1.sort(); v2.sort(); double res = 0; int i = 0, j = 0; Feature f1 = v1.get(0); Feature f2 = v2.get(0); while (f1 != null && f2 != null) { if (f1.equals(f2)) { res += f1.getStrength() * f2.getStrength(); f1 = v1.get(++i); f2 = v2.get(++j); } else if (f1.compareTo(f2) < 0) f1 = v1.get(++i); else f2 = v2.get(++j); } return res; } /** * Two <code>FeatureVector</code>s have equal value if they contain the * same number of {@link Feature}s and if the values of those * {@link Feature}s are pair-wise equivalent according to the * {@link Feature#valueEquals(String)} method. * * @param vector The vector with which to test equivalence. * @return <code>true</code> iff the two vectors are "value equivalent" as * defined above. **/ public boolean valueEquals(FeatureVector vector) { if (features.size() != vector.features.size() || labels.size() != vector.labels.size()) return false; int N = features.size(); for (int i = 0; i < N; ++i) if (!features.get(i) .valueEquals(vector.features.get(i).getStringValue())) return false; N = labels.size(); for (int i = 0; i < N; ++i) if (!labels.get(i).valueEquals(vector.labels.get(i).getStringValue())) return false; return true; } /** * Creates a string representation of this <code>FeatureVector</code>. A * comma separated list of labels appears first, surrounded by square * brackets. Then follows a comma separated list of features. * * @param buffer The buffer in which to create the representation. **/ public void write(StringBuffer buffer) { write(buffer, true); } /** * Creates a string representation of this <code>FeatureVector</code>. A * comma separated list of labels appears first, surrounded by square * brackets. Then follows a comma separated list of features. * * @param buffer The buffer in which to create the representation. * @param packages Whether or not to print package names. **/ public void write(StringBuffer buffer, boolean packages) { buffer.append("["); int N = labels.size(); if (N > 0) { if (packages) labels.get(0).write(buffer); else labels.get(0).writeNoPackage(buffer); for (int i = 1; i < N; ++i) { buffer.append(", "); if (packages) labels.get(i).write(buffer); else labels.get(i).writeNoPackage(buffer); } } buffer.append("]"); N = features.size(); if (N > 0) { buffer.append(" "); if (packages) features.get(0).write(buffer); else features.get(0).writeNoPackage(buffer); for (int i = 1; i < N; ++i) { buffer.append(", "); if (packages) features.get(i).write(buffer); else features.get(i).writeNoPackage(buffer); } } } /** * Returns the string representation of this <code>FeatureVector</code> as * created by {@link #write(StringBuffer)}. **/ public String toString() { StringBuffer result = new StringBuffer(); write(result); return result.toString(); } /** * Returns the string representation of this <code>FeatureVector</code> * like {@link #toString()} except without package names. **/ public String toStringNoPackage() { StringBuffer result = new StringBuffer(); write(result, false); return result.toString(); } /** * Writes a binary representation of the feature vector. * * @param out The output stream. **/ public void write(ExceptionlessOutputStream out) { out.writeDouble(weight); features.write(out); labels.write(out); } /** * Reads the binary representation of a feature vector from the specified * stream, overwriting the contents of this vector. * * @param in The input stream. **/ public void read(ExceptionlessInputStream in) { realCache = null; weight = in.readDouble(); features = new FVector(); features.read(in); labels = new FVector(); labels.read(in); } /** * Returns a shallow clone of this vector; the vectors are cloned, but * their elements aren't. **/ public Object clone() { FeatureVector clone = new FeatureVector(); clone.features = (FVector) features.clone(); clone.labels = (FVector) labels.clone(); clone.weight = weight; return clone; } }