package LBJ2.learn;
import java.io.PrintStream;
import java.util.Collection;
import java.util.Iterator;
import LBJ2.classify.Classifier;
import LBJ2.classify.DiscretePrimitiveStringFeature;
import LBJ2.classify.Feature;
import LBJ2.classify.FeatureVector;
import LBJ2.classify.ScoreSet;
import LBJ2.util.ExceptionlessInputStream;
import LBJ2.util.ExceptionlessOutputStream;
import LBJ2.util.OVector;
/**
* An implementation of the Margin Infused Relaxed Algorithm of Crammer and
* Singer. This is a multi-class, online learner that maintains a separate
* weight vector for every prediction class, just as
* {@link SparseNetworkLearner} does. However, updates to these weight
* vectors given an example vector <i>x</i> with label <i>y</i> are dependent
* on each other as follows. For each weight vector <i>w<sub>v</sub></i>
* corresponding to a prediction value <i>v</i>, a multiplier
* <i>t<sub>v</sub></i> is selected and used to update <i>w<sub>v</sub></i>
* as <i>w<sub>v</sub></i> += <i>t<sub>v</sub> x</i>. <i>t<sub>v</sub></i>
* must be less than or equal to zero for all <i>v</i> != <i>y</i>.
* <i>t<sub>y</sub></i> must be less than or equal to one. MIRA selects
* these multipliers so that they sum to 0 and so that the vector norm of all
* updated weight vectors concatenated is as small as possible.
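* For instance, with prediction values <i>{A, B, C}</i> and label <i>y</i>
* = <i>A</i>, the multipliers <i>t<sub>A</sub></i> = 0.4,
* <i>t<sub>B</sub></i> = -0.1, and <i>t<sub>C</sub></i> = -0.3 satisfy
* these constraints: <i>t<sub>A</sub></i> &lt;= 1, the other two are
* non-positive, and all three sum to 0.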
*
* <p> In this sparse implementation of the algorithm, weight vectors
* corresponding to labels and weights for features within those vectors are
* added as they are observed in the data. Whenever a feature is observed
* for the first time, its corresponding weight in each weight vector is
* initialized to a random number. This randomization is necessary for the
* algorithm to work: if all weight vectors were ever equal to each other,
* updates would stop happening. To ensure that results are reproducible,
* the random number generator is seeded with the same seed every time.
*
* <p> In addition to the observed features, each weight vector contains a
* bias term. For this reason, we hallucinate an extra dimension on every
* example vector containing a feature whose strength is <i>1</i>.
*
* <p> It is assumed that a single discrete label feature will be produced in
* association with each example object. A feature taking one of the values
* observed in that label feature will be produced by the learned classifier.
*
* <p> This algorithm's user-configurable parameters are stored in member
* fields of this class. They may be set via either a constructor that names
* each parameter explicitly or a constructor that takes an instance of
* {@link LBJ2.learn.SparseMIRA.Parameters Parameters} as input.
* The documentation in each member field in this class indicates the default
* value of the associated parameter when using the former type of
* constructor. The documentation of the associated member field in the
* {@link LBJ2.learn.SparseMIRA.Parameters Parameters} class
* indicates the default value of the parameter when using the latter type of
* constructor.
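*
* <p> A minimal, hypothetical usage sketch; <code>TopicLabel</code> and
* <code>BagOfWords</code> stand in for whatever labeler and feature
* extractor the application defines, and in typical use the LBJ compiler
* generates this plumbing:
*
* <pre>
* SparseMIRA mira = new SparseMIRA("newsTopic");
* mira.setLabeler(new TopicLabel()); // hypothetical discrete labeler
* mira.setExtractor(new BagOfWords()); // hypothetical feature extractor
* for (java.util.Iterator I = trainingSet.iterator(); I.hasNext(); )
* mira.learn(I.next());
* String prediction = mira.discreteValue(testExample);
* </pre>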
*
* @author Nick Rizzolo
**/
public class SparseMIRA extends Learner
{
/**
* Used to decide if two values are nearly equal to each other.
* @see #nearlyEqualTo(double,double)
**/
public static final double TOLERANCE = 1e-9;
/** A map from labels to the weight vector corresponding to that label. */
protected OVector network;
/** Whether or not this learner's labeler produces conjunctive features. */
protected boolean conjunctiveLabels;
/** This algorithm has no parameters to set! */
public SparseMIRA() { this(""); }
/**
* Initializing constructor. This constructor appears here for
* completeness; the algorithm takes no parameters.
*
* @param p The settings of all parameters.
**/
public SparseMIRA(Parameters p) { this("", p); }
/**
* This algorithm has no parameters to set!
*
* @param n The name of the classifier.
**/
public SparseMIRA(String n) {
super(n);
network = new OVector();
}
/**
* Initializing constructor. This constructor appears here for
* completeness; the algorithm takes no parameters.
*
* @param n The name of the classifier.
* @param p The settings of all parameters.
**/
public SparseMIRA(String n, Parameters p) { this(n); }
/**
* Retrieves the parameters that are set in this learner.
*
* @return An object containing all the values of the parameters that
* control the behavior of this learning algorithm.
**/
public Learner.Parameters getParameters() { return new Parameters(); }
/**
* Sets the labeler.
*
* @param l A labeling classifier.
**/
public void setLabeler(Classifier l) {
if (getClass().getName().indexOf("SparseMIRA") != -1
&& !l.getOutputType().equals("discrete")) {
System.err.println(
"LBJ WARNING: SparseMIRA will only work with a label classifier "
+ "that returns discrete.");
System.err.println(
" The given label classifier, " + l.getClass().getName()
+ ", returns " + l.getOutputType() + ".");
}
super.setLabeler(l);
}
/**
* Finds the optimal multiplier settings before updating the weight
* vectors.
*
* @param exampleFeatures The example's array of feature indices.
* @param exampleValues The example's array of feature values.
* @param exampleLabels The example's label(s).
* @param labelValues The labels' values.
**/
public void learn(int[] exampleFeatures, double[] exampleValues,
int[] exampleLabels, double[] labelValues) {
int label = exampleLabels[0];
int N = network.size();
if (label >= N) {
conjunctiveLabels |= labelLexicon.lookupKey(label).isConjunctive();
while (N++ <= label)
network.add(new BiasedRandomWeightVector());
// Re-synchronize N with the network's size; the post-increment in the
// loop condition leaves N one past the last label index.
N = network.size();
}
if (N == 1) return;
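// Squared L2 norm of the example, plus 1 for the hallucinated bias feature.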
double norm2 = FeatureVector.L2NormSquared(exampleValues) + 1;
double[] scores = new double[N];
boolean[] isLabel = new boolean[scores.length];
BiasedRandomWeightVector[] w =
new BiasedRandomWeightVector[scores.length];
double min = Double.MAX_VALUE, max = -Double.MAX_VALUE;
for (int i = 0; i < N; ++i) {
isLabel[i] = i == label;
w[i] = (BiasedRandomWeightVector) network.get(i);
scores[i] = w[i].dot(exampleFeatures, exampleValues) / norm2;
min = Math.min(min, scores[i]);
max = Math.max(max, scores[i]);
}
min--;
max++;
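// Bisection search for the theta at which the multipliers sum to zero:
// sumMultipliers(theta) is non-decreasing in theta, negative at the
// initial min and positive at the initial max, so the interval always
// brackets the root.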
while (!nearlyEqualTo(min, max)) {
double mid = (max + min) / 2;
if (sumMultipliers(mid, scores, isLabel) <= 0) min = mid;
else max = mid;
}
for (int i = 0; i < N; ++i) {
double t = getMultiplier(min, scores[i], isLabel[i]);
if (!nearlyEqualTo(t, 0))
w[i].scaledAdd(exampleFeatures, exampleValues, t);
}
}
/**
* Returns the multiplier for a given weight vector update. See Section
* 5.1 of Crammer and Singer (2003) for a description of where this
* computation comes from.
*
* @param theta See Crammer and Singer (2003).
* @param score The dot product of the weight vector with the example
* vector, divided by the norm of the example vector
* squared.
* @param isLabel <code>true</code> iff this weight vector corresponds to
* the example's label.
* @return The multiplier for this weight vector's update.
**/
private static double getMultiplier(double theta, double score,
boolean isLabel) {
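// The multiplier is capped at 1 for the label's weight vector and at 0
// for every other weight vector.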
return Math.min(theta - score, isLabel ? 1 : 0);
}
/**
* Finds the sum of the multipliers for a given value of theta. See
* Section 5.1 of Crammer and Singer (2003) for an explanation of what
* theta is.
*
* @param theta A candidate value of theta; this learner searches by
* bisection for the value at which this method returns zero.
* @param scores The dot products of the various weight vectors with the
* example vector, divided by the norm of the example
* vector squared.
* @param isLabel <code>true</code> at element <code>i</code> iff
* <code>scores[i]</code> is the dot product involving the
* weight vector corresponding to the example's label.
* @return The sum of the multipliers assuming the given value of
* <code>theta</code>.
**/
private static double sumMultipliers(double theta, double[] scores,
boolean[] isLabel) {
double result = 0;
for (int i = 0; i < scores.length; ++i)
result += getMultiplier(theta, scores[i], isLabel[i]);
return result;
}
/**
* Determines if <code>a</code> is nearly equal to <code>b</code> based on
* the value of the {@link #TOLERANCE} member variable.
*
* @param a The first value.
* @param b The second value.
* @return True if they are nearly equal, false otherwise.
**/
private static boolean nearlyEqualTo(double a, double b) {
return -TOLERANCE < a - b && a - b < TOLERANCE;
}
/** Clears the network. */
public void forget() {
super.forget();
network = new OVector();
}
/**
* Produces a set of scores indicating the degree to which each possible
* discrete classification value is associated with the given example
* object. These scores are just the dot product of each weight vector
* with the example vector.
*
* @param exampleFeatures The example's array of feature indices.
* @param exampleValues The example's array of feature values.
**/
public ScoreSet scores(int[] exampleFeatures, double[] exampleValues) {
ScoreSet result = new ScoreSet();
int N = network.size();
for (int l = 0; l < N; l++) {
double score =
((BiasedRandomWeightVector) network.get(l))
.dot(exampleFeatures, exampleValues);
result.put(labelLexicon.lookupKey(l).getStringValue(), score);
}
return result;
}
/**
* Returns the classification of the given example as a single feature
* instead of a {@link FeatureVector}.
*
* @param f The features array.
* @param v The values array.
* @return The classification of the example as a feature.
**/
public Feature featureValue(int[] f, double[] v) {
double bestScore = Double.NEGATIVE_INFINITY;
int bestLabel = -1;
int N = network.size();
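// Winner-take-all: return the prediction whose weight vector scores highest.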
for (int l = 0; l < N; l++) {
double score = ((BiasedRandomWeightVector) network.get(l)).dot(f, v);
if (score > bestScore) {
bestLabel = l;
bestScore = score;
}
}
if (bestLabel == -1) return null;
return predictions.get(bestLabel);
}
/**
* This implementation uses a winner-take-all comparison of the individual
* weight vectors' dot products.
*
* @param exampleFeatures The example's array of feature indices.
* @param exampleValues The example's array of feature values.
* @return The discrete value of the best prediction.
**/
public String discreteValue(int[] exampleFeatures, double[] exampleValues) {
return featureValue(exampleFeatures, exampleValues).getStringValue();
}
/**
* This implementation uses a winner-take-all comparison of the individual
* weight vectors' dot products.
*
* @param exampleFeatures The example's array of feature indices.
* @param exampleValues The example's array of feature values.
* @return A single feature with the winning weight vector's associated
* value.
**/
public FeatureVector classify(int[] exampleFeatures, double[] exampleValues)
{
return new FeatureVector(featureValue(exampleFeatures, exampleValues));
}
/**
* Using this method, the winner-take-all competition is narrowed to
* involve only those labels contained in the specified list. The list
* must contain only <code>String</code>s.
*
* @param example The example object.
* @param candidates A list of the only labels the example may take.
* @return The prediction as a feature or <code>null</code> if the network
* did not contain any of the specified labels.
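*
* <p> A hypothetical call, assuming a trained learner <code>mira</code>
* and an example object <code>ex</code>:
*
* <pre>
* Collection candidates =
* java.util.Arrays.asList(new String[]{ "sports", "politics" });
* Feature prediction = mira.valueOf(ex, candidates);
* </pre>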
**/
public Feature valueOf(Object example, Collection candidates) {
Object[] array = getExampleArray(example, false);
return valueOf((int[]) array[0], (double[]) array[1], candidates);
}
/**
* Using this method, the winner-take-all competition is narrowed to
* involve only those labels contained in the specified list. The list
* must contain only <code>String</code>s.
*
* @param exampleFeatures The example's array of feature indices.
* @param exampleValues The example's array of feature values.
* @param candidates A list of the only labels the example may take.
* @return The prediction as a feature or <code>null</code> if the network
* did not contain any of the specified labels.
**/
public Feature valueOf(int[] exampleFeatures, double[] exampleValues,
Collection candidates) {
double bestScore = Double.NEGATIVE_INFINITY;
int bestValue = -1;
Iterator I = candidates.iterator();
if (I.hasNext()) {
if (conjunctiveLabels)
return conjunctiveValueOf(exampleFeatures, exampleValues, I);
while (I.hasNext()) {
double score = Double.NEGATIVE_INFINITY;
String label = (String) I.next();
Feature f =
new DiscretePrimitiveStringFeature(
labeler.containingPackage, labeler.name, "", label,
labeler.valueIndexOf(label),
(short) labeler.allowableValues().length);
int key = -1;
if (labelLexicon.contains(f)) {
key = labelLexicon.lookup(f);
score = ((BiasedRandomWeightVector) network.get(key))
.dot(exampleFeatures, exampleValues);
}
if (score > bestScore) {
bestValue = key;
bestScore = score;
}
}
}
else {
int N = network.size();
for (int l = 0; l < N; l++) {
double score =
((BiasedRandomWeightVector) network.get(l))
.dot(exampleFeatures, exampleValues);
if (score > bestScore) {
bestValue = l;
bestScore = score;
}
}
}
return bestValue == -1 ? null : predictions.get(bestValue);
}
/**
* This method is a surrogate for
* {@link #valueOf(int[],double[],Collection)} when the labeler is known to
* produce conjunctive features. It is necessary because when given a
* string label from the collection, we will not know how to construct the
* appropriate conjunctive feature key for lookup in the label lexicon.
* So, we must go through each feature in the label lexicon and use
* {@link LBJ2.classify.Feature#valueEquals(String)}.
*
* @param exampleFeatures The example's array of feature indices.
* @param exampleValues The example's array of feature values.
* @param I An iterator over the set of labels to choose
* from.
* @return The prediction as a feature or <code>null</code> if the network
* did not contain any of the specified labels.
**/
protected Feature conjunctiveValueOf(
int[] exampleFeatures, double[] exampleValues, Iterator I) {
double bestScore = Double.NEGATIVE_INFINITY;
int bestValue = -1;
int N = network.size();
while (I.hasNext()) {
String label = (String) I.next();
for (int i = 0; i < N; ++i) {
// The network holds BiasedRandomWeightVectors, not LinearThresholdUnits.
BiasedRandomWeightVector w = (BiasedRandomWeightVector) network.get(i);
if (w == null || !predictions.get(i).valueEquals(label))
continue;
double score = w.dot(exampleFeatures, exampleValues);
if (score > bestScore) {
bestScore = score;
bestValue = i;
}
break;
}
}
return bestValue == -1 ? null : predictions.get(bestValue);
}
/**
* Returns scores for only those labels in the given collection. If the
* given collection is empty, scores for all labels will be returned. If
* there is no {@link BiasedRandomWeightVector} associated with a given
* label from the collection, that label's score in the returned
* {@link ScoreSet} will be set to <code>Double.NEGATIVE_INFINITY</code>.
*
* <p> The elements of <code>candidates</code> must all be
* <code>String</code>s.
*
* @param example The example object.
* @param candidates A list of the only labels the example may take.
* @return Scores for only those labels in <code>candidates</code>.
**/
public ScoreSet scores(Object example, Collection candidates) {
Object[] array = getExampleArray(example, false);
return scores((int[]) array[0], (double[]) array[1], candidates);
}
/**
* Returns scores for only those labels in the given collection. If the
* given collection is empty, scores for all labels will be returned. If
* there is no {@link BiasedRandomWeightVector} associated with a given
* label from the collection, that label's score in the returned
* {@link ScoreSet} will be set to <code>Double.NEGATIVE_INFINITY</code>.
*
* <p> The elements of <code>candidates</code> must all be
* <code>String</code>s.
*
* @param exampleFeatures The example's array of feature indices.
* @param exampleValues The example's array of feature values.
* @param candidates A list of the only labels the example may take.
* @return Scores for only those labels in <code>candidates</code>.
**/
public ScoreSet scores(int[] exampleFeatures, double[] exampleValues,
Collection candidates) {
ScoreSet result = new ScoreSet();
Iterator I = candidates.iterator();
if (I.hasNext()) {
if (conjunctiveLabels)
return conjunctiveScores(exampleFeatures, exampleValues, I);
while (I.hasNext()) {
double score = Double.NEGATIVE_INFINITY;
String label = (String) I.next();
Feature f =
new DiscretePrimitiveStringFeature(
labeler.containingPackage, labeler.name, "", label,
labeler.valueIndexOf(label),
(short) labeler.allowableValues().length);
if (labelLexicon.contains(f)) {
int key = labelLexicon.lookup(f);
score = ((BiasedRandomWeightVector) network.get(key))
.dot(exampleFeatures, exampleValues);
result.put(label, score);
}
}
}
else {
int N = network.size();
for (int l = 0; l < N; l++) {
double score =
((BiasedRandomWeightVector) network.get(l))
.dot(exampleFeatures, exampleValues);
result.put(labelLexicon.lookupKey(l).getStringValue(), score);
}
}
return result;
}
/**
* This method is a surrogate for
* {@link #scores(int[],double[],Collection)} when the labeler is known to
* produce conjunctive features. It is necessary because when given a
* string label from the collection, we will not know how to construct the
* appropriate conjunctive feature key for lookup in the label lexicon.
* So, we must go through each feature in the label lexicon and use
* {@link LBJ2.classify.Feature#valueEquals(String)}.
*
* @param exampleFeatures The example's array of feature indices.
* @param exampleValues The example's array of feature values.
* @param I An iterator over the set of labels to choose
* from.
* @return Scores for only those labels selected by the iterator.
**/
protected ScoreSet conjunctiveScores(int[] exampleFeatures,
double[] exampleValues, Iterator I) {
ScoreSet result = new ScoreSet();
int N = network.size();
while (I.hasNext()) {
String label = (String) I.next();
for (int i = 0; i < N; ++i) {
// The network holds BiasedRandomWeightVectors, not LinearThresholdUnits.
BiasedRandomWeightVector w = (BiasedRandomWeightVector) network.get(i);
if (w == null || !labelLexicon.lookupKey(i).valueEquals(label))
continue;
double score = w.dot(exampleFeatures, exampleValues);
result.put(label, score);
break;
}
}
return result;
}
/**
* Writes the algorithm's internal representation as text.
*
* @param out The output stream.
**/
public void write(PrintStream out) {
int N = network.size();
for (int i = 0; i < N; ++i) {
out.println("label: " + predictions.get(i).getStringValue());
((BiasedRandomWeightVector) network.get(i)).write(out, lexicon);
}
out.println("End of SparseMIRA");
}
/**
* Writes the learned function's internal representation in binary form.
*
* @param out The output stream.
**/
public void write(ExceptionlessOutputStream out) {
super.write(out);
int N = network.size();
out.writeInt(N);
for (int i = 0; i < N; ++i)
((BiasedRandomWeightVector) network.get(i)).write(out);
}
/**
* Reads the binary representation of a learner with this object's run-time
* type, overwriting any and all learned or manually specified parameters
* as well as the label lexicon but without modifying the feature lexicon.
*
* @param in The input stream.
**/
public void read(ExceptionlessInputStream in) {
super.read(in);
int N = in.readInt();
network = new OVector(N);
for (int i = 0; i < N; ++i)
network.add(SparseWeightVector.readWeightVector(in));
}
/** Returns a deep clone of this learning algorithm. */
public Object clone() {
SparseMIRA clone = null;
try { clone = (SparseMIRA) super.clone(); }
catch (Exception e) {
System.err.println("Error cloning SparseMIRA: " + e);
e.printStackTrace();
System.exit(1);
}
int N = network.size();
clone.network = new OVector(N);
for (int i = 0; i < N; ++i)
clone.network.add(((BiasedRandomWeightVector) network.get(i)).clone());
return clone;
}
/**
* Simply a container for all of {@link SparseMIRA}'s
* configurable parameters. This class appears here for completeness; the
* algorithm has no parameters to set.
*
* @author Nick Rizzolo
**/
public static class Parameters extends Learner.Parameters
{
/** Sets all the default values. */
public Parameters() { }
/**
* Sets the parameters from the parent's parameters object, giving
* defaults to all parameters declared in this object.
**/
public Parameters(Learner.Parameters p) { super(p); }
/** Copy constructor. */
public Parameters(Parameters p) { super(p); }
/**
* Does nothing, since {@link SparseMIRA} has no parameters to set.
*
* @param l The learner whose parameters will be set.
**/
public void setParameters(Learner l) { }
}
}