package LBJ2.learn;

import java.io.PrintStream;
import java.util.Collection;
import java.util.Iterator;

import LBJ2.classify.Classifier;
import LBJ2.classify.DiscretePrimitiveStringFeature;
import LBJ2.classify.Feature;
import LBJ2.classify.FeatureVector;
import LBJ2.classify.ScoreSet;
import LBJ2.util.ExceptionlessInputStream;
import LBJ2.util.ExceptionlessOutputStream;
import LBJ2.util.OVector;


/**
  * An implementation of the Margin Infused Relaxed Algorithm of Crammer and
  * Singer.  This is a multi-class, online learner that maintains a separate
  * weight vector for every prediction class, just as
  * {@link SparseNetworkLearner} does.  However, updates to these weight
  * vectors given an example vector <i>x</i> with label <i>y</i> are dependent
  * on each other as follows.  For each weight vector <i>w<sub>v</sub></i>
  * corresponding to a prediction value <i>v</i>, a multiplier
  * <i>t<sub>v</sub></i> is selected and used to update <i>w<sub>v</sub></i>
  * as <i>w<sub>v</sub></i> += <i>t<sub>v</sub> x</i>.  <i>t<sub>v</sub></i>
  * must be less than or equal to zero for all <i>v</i> != <i>y</i>.
  * <i>t<sub>y</sub></i> must be less than or equal to one.  MIRA selects
  * these multipliers so that they sum to 0 and so that the vector norm of all
  * updated weight vectors concatenated is as small as possible.
  *
  * <p> In this sparse implementation of the algorithm, weight vectors
  * corresponding to labels and weights for features within those vectors are
  * added as they are observed in the data.  Whenever a feature is observed
  * for the first time, its corresponding weight in any given weight vector is
  * set to a random number, which is necessary to make this algorithm work.
  * It must never be the case that all weight vectors are equal to each other,
  * or updates will stop happening.  To ensure that results are reproducible,
  * the random number generator is seeded with the same seed every time.
  *
  * <p> In addition to the observed features, each weight vector also contains
  * a bias.  For this reason, we also hallucinate an extra dimension on every
  * example vector containing a feature whose strength is <i>1</i>.
  *
  * <p> It is assumed that a single discrete label feature will be produced in
  * association with each example object.  A feature taking one of the values
  * observed in that label feature will be produced by the learned classifier.
  *
  * <p> This algorithm's user-configurable parameters are stored in member
  * fields of this class.  They may be set via either a constructor that names
  * each parameter explicitly or a constructor that takes an instance of
  * {@link LBJ2.learn.SparseMIRA.Parameters Parameters} as input.
  * The documentation in each member field in this class indicates the default
  * value of the associated parameter when using the former type of
  * constructor.  The documentation of the associated member field in the
  * {@link LBJ2.learn.SparseMIRA.Parameters Parameters} class
  * indicates the default value of the parameter when using the latter type of
  * constructor.
  *
  * @author Nick Rizzolo
 **/
public class SparseMIRA extends Learner
{
  /**
    * Used to decide if two values are nearly equal to each other.
    * @see #nearlyEqualTo(double,double)
   **/
  public static final double TOLERANCE = 1e-9;

  /** A map from labels to the weight vector corresponding to that label. */
  protected OVector network;
  /** Whether or not this learner's labeler produces conjunctive features. */
  protected boolean conjunctiveLabels;


  /** This algorithm has no parameters to set! */
  public SparseMIRA() { this(""); }

  /**
    * Initializing constructor.  This constructor appears here for
    * completeness; the algorithm takes no parameters.
    *
    * @param p  The settings of all parameters.
   **/
  public SparseMIRA(Parameters p) { this("", p); }

  /**
    * This algorithm has no parameters to set!
    *
    * @param n  The name of the classifier.
   **/
  public SparseMIRA(String n) {
    super(n);
    network = new OVector();
  }

  /**
    * Initializing constructor.  This constructor appears here for
    * completeness; the algorithm takes no parameters.
    *
    * @param n  The name of the classifier.
    * @param p  The settings of all parameters.
   **/
  public SparseMIRA(String n, Parameters p) { this(n); }


  /**
    * Retrieves the parameters that are set in this learner.
    *
    * @return An object containing all the values of the parameters that
    *         control the behavior of this learning algorithm.
   **/
  public Learner.Parameters getParameters() { return new Parameters(); }


  /**
    * Sets the labeler.  Issues a warning if the given labeler does not
    * produce discrete classifications, since this learner only supports
    * discrete labels.
    *
    * @param l  A labeling classifier.
   **/
  public void setLabeler(Classifier l) {
    // The warning is suppressed in subclasses whose names don't mention
    // SparseMIRA, since they may legitimately override this behavior.
    if (getClass().getName().indexOf("SparseMIRA") != -1
        && !l.getOutputType().equals("discrete")) {
      System.err.println(
          "LBJ WARNING: SparseMIRA will only work with a label classifier "
          + "that returns discrete.");
      System.err.println(
          "             The given label classifier, " + l.getClass().getName()
          + ", returns " + l.getOutputType() + ".");
    }

    super.setLabeler(l);
  }


  /**
    * Finds the optimal multiplier settings before updating the weight
    * vectors.
    *
    * @param exampleFeatures  The example's array of feature indices.
    * @param exampleValues    The example's array of feature values.
    * @param exampleLabels    The example's label(s).
    * @param labelValues      The labels' values.
   **/
  public void learn(int[] exampleFeatures, double[] exampleValues,
                    int[] exampleLabels, double[] labelValues) {
    int label = exampleLabels[0];
    int N = network.size();

    if (label >= N) {
      conjunctiveLabels |= labelLexicon.lookupKey(label).isConjunctive();
      while (N++ <= label) network.add(new BiasedRandomWeightVector());
      // BUG FIX: the post-incremented loop counter overshoots; N must be
      // refreshed from the network or the loops below index out of bounds.
      N = network.size();
    }

    // With a single label there is nothing to discriminate between.
    if (N == 1) return;

    // The "+ 1" accounts for the hallucinated bias feature of strength 1.
    double norm2 = FeatureVector.L2NormSquared(exampleValues) + 1;
    double[] scores = new double[N];
    boolean[] isLabel = new boolean[scores.length];
    BiasedRandomWeightVector[] w =
      new BiasedRandomWeightVector[scores.length];
    double min = Double.MAX_VALUE, max = -Double.MAX_VALUE;

    for (int i = 0; i < N; ++i) {
      isLabel[i] = i == label;
      w[i] = (BiasedRandomWeightVector) network.get(i);
      scores[i] = w[i].dot(exampleFeatures, exampleValues) / norm2;
      min = Math.min(min, scores[i]);
      max = Math.max(max, scores[i]);
    }

    // Binary search for the value of theta at which the multipliers sum to
    // zero; sumMultipliers is non-decreasing in theta, and widening the
    // bracket by one on each side guarantees the root lies within it.
    min--;
    max++;
    while (!nearlyEqualTo(min, max)) {
      double mid = (max + min) / 2;
      if (sumMultipliers(mid, scores, isLabel) <= 0) min = mid;
      else max = mid;
    }

    for (int i = 0; i < N; ++i) {
      double t = getMultiplier(min, scores[i], isLabel[i]);
      if (!nearlyEqualTo(t, 0))
        w[i].scaledAdd(exampleFeatures, exampleValues, t);
    }
  }


  /**
    * Returns the multiplier for a given weight vector update.  See Section
    * 5.1 of Crammer and Singer (2003) for a description of where this
    * computation comes from.
    *
    * @param theta    See Crammer and Singer (2003).
    * @param score    The dot product of the weight vector with the example
    *                 vector, divided by the norm of the example vector
    *                 squared.
    * @param isLabel  <code>true</code> iff this weight vector corresponds to
    *                 the example's label.
    * @return The multiplier for this weight vector's update.
   **/
  private static double getMultiplier(double theta, double score,
                                      boolean isLabel) {
    return Math.min(theta - score, isLabel ? 1 : 0);
  }


  /**
    * Finds the sum of the multipliers for a given value of theta.  See
    * Section 5.1 of Crammer and Singer (2003) for an explanation of what
    * theta is.
    *
    * @param theta    There should exist a value for this parameter that
    *                 causes this method to return zero.
    * @param scores   The dot products of the various weight vectors with the
    *                 example vector, divided by the norm of the example
    *                 vector squared.
    * @param isLabel  <code>true</code> at element <code>i</code> iff
    *                 <code>scores[i]</code> is the dot product involving the
    *                 weight vector corresponding to the example's label.
    * @return The sum of the multipliers assuming the given value of
    *         <code>theta</code>.
   **/
  private static double sumMultipliers(double theta, double[] scores,
                                       boolean[] isLabel) {
    double result = 0;
    for (int i = 0; i < scores.length; ++i)
      result += getMultiplier(theta, scores[i], isLabel[i]);
    return result;
  }


  /**
    * Determines if <code>a</code> is nearly equal to <code>b</code> based on
    * the value of the {@link #TOLERANCE} member variable.
    *
    * @param a  The first value.
    * @param b  The second value.
    * @return True if they are nearly equal, false otherwise.
   **/
  private static boolean nearlyEqualTo(double a, double b) {
    return -TOLERANCE < a - b && a - b < TOLERANCE;
  }


  /** Clears the network. */
  public void forget() {
    super.forget();
    network = new OVector();
  }


  /**
    * Produces a set of scores indicating the degree to which each possible
    * discrete classification value is associated with the given example
    * object.  These scores are just the dot product of each weight vector
    * with the example vector.
    *
    * @param exampleFeatures  The example's array of feature indices.
    * @param exampleValues    The example's array of feature values.
   **/
  public ScoreSet scores(int[] exampleFeatures, double[] exampleValues) {
    ScoreSet result = new ScoreSet();
    int N = network.size();

    for (int l = 0; l < N; l++) {
      double score =
        ((BiasedRandomWeightVector) network.get(l))
          .dot(exampleFeatures, exampleValues);
      result.put(labelLexicon.lookupKey(l).getStringValue(), score);
    }

    return result;
  }


  /**
    * Returns the classification of the given example as a single feature
    * instead of a {@link FeatureVector}.
    *
    * @param f  The features array.
    * @param v  The values array.
    * @return The classification of the example as a feature, or
    *         <code>null</code> if the network is empty.
   **/
  public Feature featureValue(int[] f, double[] v) {
    double bestScore = Double.NEGATIVE_INFINITY;
    int bestLabel = -1;
    int N = network.size();

    for (int l = 0; l < N; l++) {
      double score = ((BiasedRandomWeightVector) network.get(l)).dot(f, v);
      if (score > bestScore) {
        bestLabel = l;
        bestScore = score;
      }
    }

    if (bestLabel == -1) return null;
    return predictions.get(bestLabel);
  }


  /**
    * This implementation uses a winner-take-all comparison of the individual
    * weight vectors' dot products.
    *
    * @param exampleFeatures  The example's array of feature indices.
    * @param exampleValues    The example's array of feature values.
    * @return The discrete value of the best prediction.
   **/
  public String discreteValue(int[] exampleFeatures, double[] exampleValues) {
    return featureValue(exampleFeatures, exampleValues).getStringValue();
  }


  /**
    * This implementation uses a winner-take-all comparison of the individual
    * weight vectors' dot products.
    *
    * @param exampleFeatures  The example's array of feature indices.
    * @param exampleValues    The example's array of feature values.
    * @return A single feature with the winning weight vector's associated
    *         value.
   **/
  public FeatureVector classify(int[] exampleFeatures,
                                double[] exampleValues) {
    return new FeatureVector(featureValue(exampleFeatures, exampleValues));
  }


  /**
    * Using this method, the winner-take-all competition is narrowed to
    * involve only those labels contained in the specified list.  The list
    * must contain only <code>String</code>s.
    *
    * @param example     The example object.
    * @param candidates  A list of the only labels the example may take.
    * @return The prediction as a feature or <code>null</code> if the network
    *         did not contain any of the specified labels.
   **/
  public Feature valueOf(Object example, Collection candidates) {
    Object[] array = getExampleArray(example, false);
    return valueOf((int[]) array[0], (double[]) array[1], candidates);
  }


  /**
    * Using this method, the winner-take-all competition is narrowed to
    * involve only those labels contained in the specified list.  The list
    * must contain only <code>String</code>s.
    *
    * @param exampleFeatures  The example's array of feature indices.
    * @param exampleValues    The example's array of feature values.
    * @param candidates       A list of the only labels the example may take.
    * @return The prediction as a feature or <code>null</code> if the network
    *         did not contain any of the specified labels.
   **/
  public Feature valueOf(int[] exampleFeatures, double[] exampleValues,
                         Collection candidates) {
    double bestScore = Double.NEGATIVE_INFINITY;
    int bestValue = -1;
    Iterator I = candidates.iterator();

    if (I.hasNext()) {
      // Conjunctive label features can't be reconstructed from a plain
      // string, so they require a scan of the label lexicon instead.
      if (conjunctiveLabels)
        return conjunctiveValueOf(exampleFeatures, exampleValues, I);

      while (I.hasNext()) {
        double score = Double.NEGATIVE_INFINITY;
        String label = (String) I.next();
        Feature f =
          new DiscretePrimitiveStringFeature(
              labeler.containingPackage, labeler.name, "", label,
              labeler.valueIndexOf(label),
              (short) labeler.allowableValues().length);
        int key = -1;

        if (labelLexicon.contains(f)) {
          key = labelLexicon.lookup(f);
          score =
            ((BiasedRandomWeightVector) network.get(key))
              .dot(exampleFeatures, exampleValues);
        }

        if (score > bestScore) {
          bestValue = key;
          bestScore = score;
        }
      }
    }
    else {
      // An empty candidate list means all labels compete.
      int N = network.size();
      for (int l = 0; l < N; l++) {
        double score =
          ((BiasedRandomWeightVector) network.get(l))
            .dot(exampleFeatures, exampleValues);
        if (score > bestScore) {
          bestValue = l;
          bestScore = score;
        }
      }
    }

    return bestValue == -1 ? null : predictions.get(bestValue);
  }


  /**
    * This method is a surrogate for
    * {@link #valueOf(int[],double[],Collection)} when the labeler is known to
    * produce conjunctive features.  It is necessary because when given a
    * string label from the collection, we will not know how to construct the
    * appropriate conjunctive feature key for lookup in the label lexicon.
    * So, we must go through each feature in the label lexicon and use
    * {@link LBJ2.classify.Feature#valueEquals(String)}.
    *
    * @param exampleFeatures  The example's array of feature indices.
    * @param exampleValues    The example's array of feature values.
    * @param I                An iterator over the set of labels to choose
    *                         from.
    * @return The prediction as a feature or <code>null</code> if the network
    *         did not contain any of the specified labels.
   **/
  protected Feature conjunctiveValueOf(
      int[] exampleFeatures, double[] exampleValues, Iterator I) {
    double bestScore = Double.NEGATIVE_INFINITY;
    int bestValue = -1;
    int N = network.size();

    while (I.hasNext()) {
      String label = (String) I.next();

      for (int i = 0; i < N; ++i) {
        // BUG FIX: the network holds BiasedRandomWeightVectors, not
        // LinearThresholdUnits; the old cast threw ClassCastException.
        BiasedRandomWeightVector vector =
          (BiasedRandomWeightVector) network.get(i);
        if (vector == null || !predictions.get(i).valueEquals(label))
          continue;
        double score = vector.dot(exampleFeatures, exampleValues);

        if (score > bestScore) {
          bestScore = score;
          bestValue = i;
        }

        break;
      }
    }

    return bestValue == -1 ? null : predictions.get(bestValue);
  }


  /**
    * Returns scores for only those labels in the given collection.  If the
    * given collection is empty, scores for all labels will be returned.  If
    * there is no {@link BiasedRandomWeightVector} associated with a given
    * label from the collection, that label's score in the returned
    * {@link ScoreSet} will be set to <code>Double.NEGATIVE_INFINITY</code>.
    *
    * <p> The elements of <code>candidates</code> must all be
    * <code>String</code>s.
    *
    * @param example     The example object.
    * @param candidates  A list of the only labels the example may take.
    * @return Scores for only those labels in <code>candidates</code>.
   **/
  public ScoreSet scores(Object example, Collection candidates) {
    Object[] array = getExampleArray(example, false);
    return scores((int[]) array[0], (double[]) array[1], candidates);
  }


  /**
    * Returns scores for only those labels in the given collection.  If the
    * given collection is empty, scores for all labels will be returned.  If
    * there is no {@link BiasedRandomWeightVector} associated with a given
    * label from the collection, that label's score in the returned
    * {@link ScoreSet} will be set to <code>Double.NEGATIVE_INFINITY</code>.
    *
    * <p> The elements of <code>candidates</code> must all be
    * <code>String</code>s.
    *
    * @param exampleFeatures  The example's array of feature indices.
    * @param exampleValues    The example's array of feature values.
    * @param candidates       A list of the only labels the example may take.
    * @return Scores for only those labels in <code>candidates</code>.
   **/
  public ScoreSet scores(int[] exampleFeatures, double[] exampleValues,
                         Collection candidates) {
    ScoreSet result = new ScoreSet();
    Iterator I = candidates.iterator();

    if (I.hasNext()) {
      if (conjunctiveLabels)
        return conjunctiveScores(exampleFeatures, exampleValues, I);

      while (I.hasNext()) {
        double score = Double.NEGATIVE_INFINITY;
        String label = (String) I.next();
        Feature f =
          new DiscretePrimitiveStringFeature(
              labeler.containingPackage, labeler.name, "", label,
              labeler.valueIndexOf(label),
              (short) labeler.allowableValues().length);

        if (labelLexicon.contains(f)) {
          int key = labelLexicon.lookup(f);
          score =
            ((BiasedRandomWeightVector) network.get(key))
              .dot(exampleFeatures, exampleValues);
        }

        // BUG FIX: the put belongs outside the lookup branch; otherwise
        // absent labels are omitted instead of receiving the documented
        // Double.NEGATIVE_INFINITY score.
        result.put(label, score);
      }
    }
    else {
      int N = network.size();
      for (int l = 0; l < N; l++) {
        double score =
          ((BiasedRandomWeightVector) network.get(l))
            .dot(exampleFeatures, exampleValues);
        result.put(labelLexicon.lookupKey(l).getStringValue(), score);
      }
    }

    return result;
  }


  /**
    * This method is a surrogate for
    * {@link #scores(int[],double[],Collection)} when the labeler is known to
    * produce conjunctive features.  It is necessary because when given a
    * string label from the collection, we will not know how to construct the
    * appropriate conjunctive feature key for lookup in the label lexicon.
    * So, we must go through each feature in the label lexicon and use
    * {@link LBJ2.classify.Feature#valueEquals(String)}.
    *
    * @param exampleFeatures  The example's array of feature indices.
    * @param exampleValues    The example's array of feature values.
    * @param I                An iterator over the set of labels to choose
    *                         from.
    * @return The label chosen by this classifier or <code>null</code> if the
    *         network did not contain any of the specified labels.
   **/
  protected ScoreSet conjunctiveScores(int[] exampleFeatures,
                                       double[] exampleValues, Iterator I) {
    ScoreSet result = new ScoreSet();
    int N = network.size();

    while (I.hasNext()) {
      String label = (String) I.next();

      for (int i = 0; i < N; ++i) {
        // BUG FIX: the network holds BiasedRandomWeightVectors, not
        // LinearThresholdUnits; the old cast threw ClassCastException.
        BiasedRandomWeightVector vector =
          (BiasedRandomWeightVector) network.get(i);
        if (vector == null || !labelLexicon.lookupKey(i).valueEquals(label))
          continue;
        double score = vector.dot(exampleFeatures, exampleValues);
        result.put(label, score);
        break;
      }
    }

    return result;
  }


  /**
    * Writes the algorithm's internal representation as text.
    *
    * @param out  The output stream.
   **/
  public void write(PrintStream out) {
    int N = network.size();

    for (int i = 0; i < N; ++i) {
      out.println("label: " + predictions.get(i).getStringValue());
      ((BiasedRandomWeightVector) network.get(i)).write(out, lexicon);
    }

    out.println("End of SparseMIRA");
  }


  /**
    * Writes the learned function's internal representation in binary form.
    *
    * @param out  The output stream.
   **/
  public void write(ExceptionlessOutputStream out) {
    super.write(out);
    int N = network.size();
    out.writeInt(N);
    for (int i = 0; i < N; ++i)
      ((BiasedRandomWeightVector) network.get(i)).write(out);
  }


  /**
    * Reads the binary representation of a learner with this object's run-time
    * type, overwriting any and all learned or manually specified parameters
    * as well as the label lexicon but without modifying the feature lexicon.
    *
    * @param in  The input stream.
   **/
  public void read(ExceptionlessInputStream in) {
    super.read(in);
    int N = in.readInt();
    network = new OVector(N);
    for (int i = 0; i < N; ++i)
      network.add(SparseWeightVector.readWeightVector(in));
  }


  /** Returns a deep clone of this learning algorithm. */
  public Object clone() {
    SparseMIRA clone = null;

    try { clone = (SparseMIRA) super.clone(); }
    catch (Exception e) {
      System.err.println("Error cloning SparseMIRA: " + e);
      e.printStackTrace();
      System.exit(1);
    }

    int N = network.size();
    clone.network = new OVector(N);
    for (int i = 0; i < N; ++i)
      clone.network.add(((BiasedRandomWeightVector) network.get(i)).clone());

    return clone;
  }


  /**
    * Simply a container for all of {@link SparseMIRA}'s
    * configurable parameters.  This class appears here for completeness; the
    * algorithm has no parameters to set.
    *
    * @author Nick Rizzolo
   **/
  public static class Parameters extends Learner.Parameters
  {
    /** Sets all the default values. */
    public Parameters() { }

    /**
      * Sets the parameters from the parent's parameters object, giving
      * defaults to all parameters declared in this object.
     **/
    public Parameters(Learner.Parameters p) { super(p); }

    /** Copy constructor. */
    public Parameters(Parameters p) { super(p); }

    /**
      * Calls the appropriate <code>Learner.setParameters(Parameters)</code>
      * method for this <code>Parameters</code> object.
      *
      * @param l  The learner whose parameters will be set.
     **/
    public void setParameters(Learner l) { }
  }
}