package LBJ2.learn; import java.io.PrintStream; import java.util.Arrays; import java.util.Comparator; import java.util.Map; import LBJ2.classify.Classifier; import LBJ2.classify.Feature; import LBJ2.util.DVector; import LBJ2.util.ExceptionlessInputStream; import LBJ2.util.ExceptionlessOutputStream; /** * An approximation to voted Perceptron, in which a weighted average of the * weight vectors arrived at during training becomes the weight vector used * to make predictions after training. * * <p> During training, after each example <i>e<sub>i</sub></i> is processed, * the weight vector <i>w<sub>i</sub></i> becomes the active weight vector * used to make predictions on future training examples. If a mistake was * made on <i>e<sub>i</sub></i>, <i>w<sub>i</sub></i> will be different than * <i>w<sub>i - 1</sub></i>. Otherwise, it will remain unchanged. * * <p> After training, each distinct weight vector arrived at during training * is associated with an integer weight equal to the number of examples whose * training made that weight vector active. A new weight vector * <i>w<sup>*</sup></i> is computed by taking the average of all these weight * vectors weighted as described. <i>w<sup>*</sup></i> is used to make all * predictions returned to the user through methods such as * {@link Classifier#classify(Object)} or * {@link Classifier#discreteValue(Object)}. * * <p> The above description is a useful way to think about the operation of * this {@link Learner}. However, the user should note that this * implementation never explicitly stores <i>w<sup>*</sup></i>. Instead, it * is computed efficiently on demand. Thus, interspersed online training and * evaluation is efficient and operates as expected. * * <p> It is assumed that {@link Learner#labeler} is a single discrete * classifier that produces the same feature for every example object and * that the values that feature may take are available through the * {@link Classifier#allowableValues()} method. 
The second value returned
 * from {@link Classifier#allowableValues()} is treated as "positive", and it
 * is assumed there are exactly 2 allowable values.  Assertions will produce
 * error messages if these assumptions do not hold.
 *
 * @author Nick Rizzolo
**/
public class SparseAveragedPerceptron extends SparsePerceptron
{
  /** Default for {@link LinearThresholdUnit#weightVector}. */
  public static final AveragedWeightVector defaultWeightVector =
    new AveragedWeightVector();

  /**
   * Holds the same reference as {@link LinearThresholdUnit#weightVector}
   * casted to {@link SparseAveragedPerceptron.AveragedWeightVector}, so the
   * averaging-specific operations can be invoked without repeated casts.
   **/
  protected AveragedWeightVector awv;
  /**
   * Keeps the extra information necessary to compute the averaged bias.
   * Each promotion/demotion adds <code>examples * rate</code> here,
   * mirroring the per-weight bookkeeping done inside
   * {@link SparseAveragedPerceptron.AveragedWeightVector}.
   **/
  protected double averagedBias;


  /**
   * The learning rate and threshold take default values, while the name of
   * the classifier gets the empty string.
   **/
  public SparseAveragedPerceptron() { this(""); }

  /**
   * Sets the learning rate to the specified value, and the threshold takes
   * the default, while the name of the classifier gets the empty string.
   *
   * @param r  The desired learning rate value.
   **/
  public SparseAveragedPerceptron(double r) { this("", r); }

  /**
   * Sets the learning rate and threshold to the specified values, while the
   * name of the classifier gets the empty string.
   *
   * @param r  The desired learning rate value.
   * @param t  The desired threshold value.
   **/
  public SparseAveragedPerceptron(double r, double t) { this("", r, t); }

  /**
   * Use this constructor to fit a thick separator, where both the positive
   * and negative sides of the hyperplane will be given the specified
   * thickness, while the name of the classifier gets the empty string.
   *
   * @param r   The desired learning rate value.
   * @param t   The desired threshold value.
   * @param pt  The desired thickness.
   **/
  public SparseAveragedPerceptron(double r, double t, double pt) {
    this("", r, t, pt);
  }

  /**
   * Use this constructor to fit a thick separator, where the positive and
   * negative sides of the hyperplane will be given the specified separate
   * thicknesses, while the name of the classifier gets the empty string.
   *
   * @param r   The desired learning rate value.
   * @param t   The desired threshold value.
   * @param pt  The desired positive thickness.
   * @param nt  The desired negative thickness.
   **/
  public SparseAveragedPerceptron(double r, double t, double pt, double nt) {
    this("", r, t, pt, nt);
  }

  /**
   * Initializing constructor.  Sets all member variables to their associated
   * settings in the {@link SparseAveragedPerceptron.Parameters} object.
   *
   * @param p  The settings of all parameters.
   **/
  public SparseAveragedPerceptron(SparseAveragedPerceptron.Parameters p) {
    this("", p);
  }

  /**
   * The learning rate and threshold take default values.
   *
   * @param n  The name of the classifier.
   **/
  public SparseAveragedPerceptron(String n) { this(n, defaultLearningRate); }

  /**
   * Sets the learning rate to the specified value, and the threshold takes
   * the default.
   *
   * @param n  The name of the classifier.
   * @param r  The desired learning rate value.
   **/
  public SparseAveragedPerceptron(String n, double r) {
    this(n, r, defaultThreshold);
  }

  /**
   * Sets the learning rate and threshold to the specified values.
   *
   * @param n  The name of the classifier.
   * @param r  The desired learning rate value.
   * @param t  The desired threshold value.
   **/
  public SparseAveragedPerceptron(String n, double r, double t) {
    this(n, r, t, defaultThickness);
  }

  /**
   * Use this constructor to fit a thick separator, where both the positive
   * and negative sides of the hyperplane will be given the specified
   * thickness.
   *
   * @param n   The name of the classifier.
   * @param r   The desired learning rate value.
   * @param t   The desired threshold value.
   * @param pt  The desired thickness.
   **/
  public SparseAveragedPerceptron(String n, double r, double t, double pt) {
    this(n, r, t, pt, pt);
  }

  /**
   * Use this constructor to fit a thick separator, where the positive and
   * negative sides of the hyperplane will be given the specified separate
   * thicknesses.
   *
   * @param n   The name of the classifier.
   * @param r   The desired learning rate value.
   * @param t   The desired threshold value.
   * @param pt  The desired positive thickness.
   * @param nt  The desired negative thickness.
   **/
  public SparseAveragedPerceptron(String n, double r, double t, double pt,
                                  double nt) {
    super(n);
    Parameters p = new Parameters();
    p.learningRate = r;
    p.threshold = t;
    p.positiveThickness = pt;
    p.negativeThickness = nt;
    setParameters(p);
  }

  /**
   * Initializing constructor.  Sets all member variables to their associated
   * settings in the {@link SparseAveragedPerceptron.Parameters} object.
   *
   * @param n  The name of the classifier.
   * @param p  The settings of all parameters.
   **/
  public SparseAveragedPerceptron(String n,
                                  SparseAveragedPerceptron.Parameters p) {
    super(n);
    setParameters(p);
  }


  /**
   * Retrieves the parameters that are set in this learner.
   *
   * @return An object containing all the values of the parameters that
   *         control the behavior of this learning algorithm.
   **/
  public Learner.Parameters getParameters() {
    Parameters p =
      new Parameters((SparsePerceptron.Parameters) super.getParameters());
    return p;
  }


  /**
   * Sets the values of parameters that control the behavior of this learning
   * algorithm.  Also refreshes {@link #awv} so it aliases the (possibly
   * replaced) {@link LinearThresholdUnit#weightVector}.
   *
   * @param p  The parameters.
   **/
  public void setParameters(Parameters p) {
    super.setParameters(p);
    awv = (AveragedWeightVector) weightVector;
  }


  /**
   * The score of the specified object is equal to <code>w * x + bias</code>
   * where <code>*</code> is dot product, <code>w</code> is the weight
   * vector, and <code>x</code> is the feature vector produced by the
   * extractor.
   *
   * @param exampleFeatures  The example's array of feature indices.
   * @param exampleValues    The example's array of feature values.
* @return The result of the dot product plus the bias.
   **/
  public double score(int[] exampleFeatures, double[] exampleValues) {
    // Dot product against the *averaged* weights, not the raw ones.
    double result = awv.dot(exampleFeatures, exampleValues, initialWeight);
    int examples = awv.getExamples();
    // Averaged bias = bias - averagedBias / examples, computed here as a
    // single fused expression; skipped entirely before any example is seen.
    if (examples > 0)
      result += (examples * bias - averagedBias) / (double) examples;
    return result;
  }


  /**
   * Scales the feature vector produced by the extractor by the learning rate
   * and adds it to the weight vector.
   *
   * @param exampleFeatures  The example's array of feature indices.
   * @param exampleValues    The example's array of feature values.
   **/
  public void promote(int[] exampleFeatures, double[] exampleValues,
                      double rate) {
    bias += rate;
    // NB: examples must be read *before* scaledAdd increments it; the
    // averaged bias accumulates examples * rate just like the per-weight
    // bookkeeping in AveragedWeightVector.scaledAdd.
    int examples = awv.getExamples();
    averagedBias += examples * rate;
    awv.scaledAdd(exampleFeatures, exampleValues, rate, initialWeight);
  }


  /**
   * Scales the feature vector produced by the extractor by the learning rate
   * and subtracts it from the weight vector.
   *
   * @param exampleFeatures  The example's array of feature indices.
   * @param exampleValues    The example's array of feature values.
   **/
  public void demote(int[] exampleFeatures, double[] exampleValues,
                     double rate) {
    bias -= rate;
    // Symmetric to promote(): read examples before scaledAdd updates it.
    int examples = awv.getExamples();
    averagedBias -= examples * rate;
    awv.scaledAdd(exampleFeatures, exampleValues, -rate, initialWeight);
  }


  /**
   * This method works just like
   * {@link LinearThresholdUnit#learn(int[],double[],int[],double[])}, except
   * it notifies its weight vector when it got an example correct in addition
   * to updating it when it makes a mistake.
*
   * @param exampleFeatures  The example's array of feature indices
   * @param exampleValues    The example's array of feature values
   * @param exampleLabels    The example's label(s)
   * @param labelValues      The labels' values
   **/
  public void learn(int[] exampleFeatures, double[] exampleValues,
                    int[] exampleLabels, double[] labelValues) {
    assert exampleLabels.length == 1
      : "Example must have a single label.";
    assert exampleLabels[0] == 0 || exampleLabels[0] == 1
      : "Example has unallowed label value.";

    boolean label = (exampleLabels[0] == 1);
    // Predictions during training use the *raw* (non-averaged) weights.
    double s =
      awv.simpleDot(exampleFeatures, exampleValues, initialWeight) + bias;

    // Thick separator: update on any example falling inside the margin,
    // not just on outright mistakes; otherwise tell the weight vector the
    // example was correct so the averaging weights stay in sync.
    if (label && s < threshold + positiveThickness)
      promote(exampleFeatures, exampleValues, getLearningRate());
    else if (!label && s >= threshold - negativeThickness)
      demote(exampleFeatures, exampleValues, getLearningRate());
    else awv.correctExample();
  }


  /**
   * Initializes the weight vector array to the size of
   * the supplied number of features, with each cell taking
   * the default value of {@link #initialWeight}.
   *
   * @param numExamples  The number of examples
   * @param numFeatures  The number of features
   **/
  public void initialize(int numExamples, int numFeatures) {
    double[] weights = new double[numFeatures];
    Arrays.fill(weights, initialWeight);
    // Keep both references (superclass field and the casted alias) in sync.
    weightVector = awv = new AveragedWeightVector(weights);
  }


  /** Resets the weight vector to all zeros. */
  public void forget() {
    super.forget();
    // super.forget() may have replaced weightVector; re-establish the alias
    // and clear the averaged-bias bookkeeping.
    awv = (AveragedWeightVector) weightVector;
    averagedBias = 0;
  }


  /**
   * Writes the algorithm's internal representation as text.  In the first
   * line of output, the name of the classifier is printed, followed by
   * {@link SparsePerceptron#learningRate},
   * {@link LinearThresholdUnit#initialWeight},
   * {@link LinearThresholdUnit#threshold},
   * {@link LinearThresholdUnit#positiveThickness},
   * {@link LinearThresholdUnit#negativeThickness},
   * {@link LinearThresholdUnit#bias}, and finally {@link #averagedBias}.
   *
   * @param out  The output stream.
**/
  public void write(PrintStream out) {
    out.println(name + ": " + learningRate + ", " + initialWeight + ", "
                + threshold + ", " + positiveThickness + ", "
                + negativeThickness + ", " + bias + ", " + averagedBias);
    // Without a lexicon, weights are printed by index; with one, by feature.
    if (lexicon == null || lexicon.size() == 0) awv.write(out);
    else awv.write(out, lexicon);
  }


  /**
   * Writes the learned function's internal representation in binary form.
   *
   * @param out  The output stream.
   **/
  public void write(ExceptionlessOutputStream out) {
    super.write(out);
    out.writeDouble(averagedBias);
  }


  /**
   * Reads the binary representation of a learner with this object's run-time
   * type, overwriting any and all learned or manually specified parameters
   * as well as the label lexicon but without modifying the feature lexicon.
   *
   * @param in  The input stream.
   **/
  public void read(ExceptionlessInputStream in) {
    super.read(in);
    // super.read(in) replaces weightVector; refresh the casted alias.
    awv = (AveragedWeightVector) weightVector;
    averagedBias = in.readDouble();
  }


  /**
   * Simply a container for all of {@link SparseAveragedPerceptron}'s
   * configurable parameters.  Using instances of this class should make code
   * more readable and constructors less complicated.  Note that if the
   * object referenced by {@link LinearThresholdUnit.Parameters#weightVector}
   * is replaced via an instance of this class, it must be replaced with an
   * {@link SparseAveragedPerceptron.AveragedWeightVector}.
   *
   * @author Nick Rizzolo
   **/
  public static class Parameters extends SparsePerceptron.Parameters
  {
    /** Sets all the default values. */
    public Parameters() {
      weightVector = (AveragedWeightVector) defaultWeightVector.clone();
    }

    /**
     * Sets the parameters from the parent's parameters object, giving
     * defaults to all parameters declared in this object.
     **/
    public Parameters(SparsePerceptron.Parameters p) { super(p); }

    /** Copy constructor. */
    public Parameters(Parameters p) { super(p); }


    /**
     * Calls the appropriate <code>Learner.setParameters(Parameters)</code>
     * method for this <code>Parameters</code> object.
*
     * @param l  The learner whose parameters will be set.
     **/
    public void setParameters(Learner l) {
      ((SparseAveragedPerceptron) l).setParameters(this);
    }
  }


  /**
   * This implementation of a sparse weight vector associates two
   * <code>double</code>s with each {@link Feature}.  The first plays the
   * role of the usual weight vector, and the second accumulates multiples of
   * examples on which mistakes were made to help implement the weighted
   * average.
   *
   * @author Nick Rizzolo
   **/
  public static class AveragedWeightVector extends SparseWeightVector
  {
    /**
     * Together with {@link SparseWeightVector#weights}, this vector provides
     * enough information to reconstruct the average of all weight vectors
     * arrived at during the course of learning.
     **/
    public DVector averagedWeights;
    /** Counts the total number of training examples this vector has seen. */
    protected int examples;


    /** Simply instantiates the weight vectors. */
    public AveragedWeightVector() { this(new DVector(defaultCapacity)); }

    /**
     * Simply initializes the weight vectors.
     *
     * @param w  An array of weights.
     **/
    public AveragedWeightVector(double[] w) { this(new DVector(w)); }

    /**
     * Simply initializes the weight vectors.
     *
     * @param w  A vector of weights.
     **/
    public AveragedWeightVector(DVector w) {
      // The raw weights get a private copy; the averaging accumulator keeps
      // the caller's vector.
      super((DVector) w.clone());
      averagedWeights = w;
    }


    /** Increments the {@link #examples} variable. */
    public void correctExample() { ++examples; }

    /** Returns the {@link #examples} variable. */
    public int getExamples() { return examples; }


    /**
     * Returns the averaged weight of the given feature.
     *
     * @param featureIndex  The feature index.
     * @param defaultW      The default weight.
     * @return The weight of the feature.
**/ public double getAveragedWeight(int featureIndex, double defaultW) { if (examples == 0) return 0; double aw = averagedWeights.get(featureIndex, defaultW); double w = getWeight(featureIndex, defaultW); return (examples*w - aw) / (double) examples; } /** * Takes the dot product of this <code>AveragedWeightVector</code> with * the argument vector, using the hard coded default weight. * * @param exampleFeatures The example's array of feature indices. * @param exampleValues The example's array of feature values. * @return The computed dot product. **/ public double dot(int[] exampleFeatures, double[] exampleValues) { return dot(exampleFeatures, exampleValues, defaultWeight); } /** * Takes the dot product of this <code>AveragedWeightVector</code> with * the argument vector, using the specified default weight when one is * not yet present in this vector. * * @param exampleFeatures The example's array of feature indices. * @param exampleValues The example's array of feature values. * @param defaultW The default weight. * @return The computed dot product. **/ public double dot(int[] exampleFeatures, double[] exampleValues, double defaultW) { double sum = 0; for (int i = 0; i < exampleFeatures.length; i++) { double w = getAveragedWeight(exampleFeatures[i], defaultW); sum += w * exampleValues[i]; } return sum; } /** * Takes the dot product of the regular, non-averaged, Perceptron weight * vector with the given vector, using the hard coded default weight. * * @param exampleFeatures The example's array of feature indices. * @param exampleValues The example's array of feature values. * @return The computed dot product. **/ public double simpleDot(int[] exampleFeatures, double[] exampleValues) { return super.dot(exampleFeatures, exampleValues, defaultWeight); } /** * Takes the dot product of the regular, non-averaged, Perceptron weight * vector with the given vector, using the specified default weight when * a feature is not yet present in this vector. 
*
     * @param exampleFeatures  The example's array of feature indices.
     * @param exampleValues    The example's array of feature values.
     * @param defaultW         An initial weight for new features.
     * @return The computed dot product.
     **/
    public double simpleDot(int[] exampleFeatures, double[] exampleValues,
                            double defaultW) {
      // Delegates to the superclass, which sees only the raw weights.
      return super.dot(exampleFeatures, exampleValues, defaultW);
    }


    /**
     * Performs pairwise addition of the feature values in the given vector
     * scaled by the given factor, modifying this weight vector, using the
     * specified default weight when a feature from the given vector is not
     * yet present in this vector.  The default weight is used to initialize
     * new feature weights.
     *
     * @param exampleFeatures  The example's array of feature indices.
     * @param exampleValues    The example's array of feature values.
     * @param factor           The scaling factor.
     **/
    public void scaledAdd(int[] exampleFeatures, double[] exampleValues,
                          double factor) {
      scaledAdd(exampleFeatures, exampleValues, factor, defaultWeight);
    }


    /**
     * Performs pairwise addition of the feature values in the given vector
     * scaled by the given factor, modifying this weight vector, using the
     * specified default weight when a feature from the given vector is not
     * yet present in this vector.
     *
     * @param exampleFeatures  The example's array of feature indices.
     * @param exampleValues    The example's array of feature values.
     * @param factor           The scaling factor.
     * @param defaultW         An initial weight for new features.
**/
    public void scaledAdd(int[] exampleFeatures, double[] exampleValues,
                          double factor, double defaultW) {
      for (int i = 0; i < exampleFeatures.length; i++) {
        int featureIndex = exampleFeatures[i];
        double currentWeight = getWeight(featureIndex, defaultW);
        double w = currentWeight + factor*exampleValues[i];
        double difference = w - currentWeight;
        // Core averaging invariant: each weight change is scaled by the
        // number of examples seen *before* this update, so the final
        // average can be recovered as raw - accumulated / examples.
        updateAveragedWeight(featureIndex, examples*difference);
        setWeight(featureIndex, w);
      }

      // Incremented after the updates above on purpose; see comment above.
      ++examples;
    }


    /**
     * Adds a new value to the current averaged weight indexed
     * by the supplied feature index.
     *
     * @param featureIndex  The feature index.
     * @param w             The value to add to the current weight.
     **/
    protected void updateAveragedWeight(int featureIndex, double w) {
      double newWeight = averagedWeights.get(featureIndex, defaultWeight) + w;
      averagedWeights.set(featureIndex, newWeight, defaultWeight);
    }


    /**
     * Outputs the contents of this <code>SparseWeightVector</code> into the
     * specified <code>PrintStream</code>.  The string representation starts
     * with a <code>"Begin"</code> annotation, ends with an
     * <code>"End"</code> annotation, and without a <code>Lexicon</code>
     * passed as a parameter, the weights are simply printed in the order of
     * their integer indices.
     *
     * @param out  The stream to write to.
     **/
    public void write(PrintStream out) {
      out.println("Begin AveragedWeightVector");
      // Averaged (not raw) weights are what this vector exposes.
      for (int i = 0; i < averagedWeights.size(); ++i)
        out.println(getAveragedWeight(i, 0));
      out.println("End AveragedWeightVector");
    }


    /**
     * Outputs the contents of this <code>SparseWeightVector</code> into the
     * specified <code>PrintStream</code>.  The string representation starts
     * with a <code>"Begin"</code> annotation, ends with an
     * <code>"End"</code> annotation, and lists each feature with its
     * corresponding weight on the same, separate line in between.
     *
     * @param out  The stream to write to.
     * @param lex  The feature lexicon.
**/ public void write(PrintStream out, Lexicon lex) { out.println("Begin AveragedWeightVector"); Map map = lex.getMap(); Map.Entry[] entries = (Map.Entry[]) map.entrySet().toArray(new Map.Entry[map.size()]); Arrays.sort(entries, new Comparator() { public int compare(Object o1, Object o2) { Map.Entry e1 = (Map.Entry) o1; Map.Entry e2 = (Map.Entry) o2; int i1 = ((Integer) e1.getValue()).intValue(); int i2 = ((Integer) e2.getValue()).intValue(); if ((i1 < weights.size()) != (i2 < weights.size())) return i1 - i2; return ((Feature) e1.getKey()).compareTo(e2.getKey()); } }); int i, biggest = 0; for (i = 0; i < entries.length; ++i) { String key = entries[i].getKey().toString() + (((Integer) entries[i].getValue()).intValue() < weights.size() ? "" : " (pruned)"); biggest = Math.max(biggest, key.length()); } if (biggest % 2 == 0) biggest += 2; else ++biggest; for (i = 0; i < entries.length; ++i) { String key = entries[i].getKey().toString() + (((Integer) entries[i].getValue()).intValue() < weights.size() ? "" : " (pruned)"); out.print(key); for (int j = 0; key.length() + j < biggest; ++j) out.print(" "); int index = ((Integer) entries[i].getValue()).intValue(); double weight = getAveragedWeight(index, 0); out.println(weight); } out.println("End AveragedWeightVector"); } /** * Writes the weight vector's internal representation in binary form. * * @param out The output stream. **/ public void write(ExceptionlessOutputStream out) { super.write(out); out.writeInt(examples); averagedWeights.write(out); } /** * Reads the representation of a weight vector with this object's * run-time type from the given stream, overwriting the data in this * object. * * <p> This method is appropriate for reading weight vectors as written * by {@link #write(ExceptionlessOutputStream)}. * * @param in The input stream. **/ public void read(ExceptionlessInputStream in) { super.read(in); examples = in.readInt(); averagedWeights.read(in); } /** * Returns a copy of this <code>AveragedWeightVector</code>. 
*
     * @return A copy of this <code>AveragedWeightVector</code>.
     **/
    public Object clone() {
      AveragedWeightVector copy = (AveragedWeightVector) super.clone();
      // Deep-copy the accumulator so the copy's averaging state is
      // independent of this vector's.
      copy.averagedWeights = (DVector) averagedWeights.clone();
      return copy;
    }


    /**
     * Returns a new, empty weight vector with the same parameter settings as
     * this one.
     *
     * @return An empty weight vector.
     **/
    public SparseWeightVector emptyClone() {
      return new AveragedWeightVector();
    }
  }
}