AbstractVectorClassifier.java example

Explorer
mahout-rbmClassifier-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.mahout.classifier;

import org.apache.mahout.math.DenseMatrix;
import org.apache.mahout.math.DenseVector;
import org.apache.mahout.math.Matrix;
import org.apache.mahout.math.Vector;

import com.google.common.base.Preconditions;

/**
 * Defines the interface for classifiers that take input as a vector.  This is implemented
 * as an abstract class so that it can implement a number of handy convenience methods
 * related to classification of vectors.
 */
public abstract class AbstractVectorClassifier {

  /**
   * Lower bound applied to every value returned by {@link #logLikelihood(int, Vector)}.  Bounding
   * the value keeps running averages finite even when the model assigns a probability of zero
   * to the correct answer.
   */
  public static final double MIN_LOG_LIKELIHOOD = -100.0;

  // ------ These are all that are necessary to define a vector classifier.

  /**
   * Returns the number of categories for the target variable.  A vector classifier
   * will encode its output using a zero-based 1 of numCategories encoding.
   * @return The number of categories.
   */
  public abstract int numCategories();

  /**
   * Classify a vector returning a vector of numCategories-1 scores.  It is assumed that
   * the score for the missing category is one minus the sum of the scores that are returned.
   *
   * Note that the missing score is the 0-th score.
   * @param instance  A feature vector to be classified.
   * @return  A vector of probabilities in 1 of n-1 encoding.
   */
  public abstract Vector classify(Vector instance);

  /**
   * Classify a vector, but don't apply the inverse link function.  For logistic regression
   * and other generalized linear models, this is just the linear part of the classification.
   * This default implementation always throws; subclasses that can produce unlinked scores
   * should override it.
   * @param features  A feature vector to be classified.
   * @return  A vector of scores.  If transformed by the link function, these will become probabilities.
   * @throws UnsupportedOperationException if the classifier does not override this method.
   */
  public Vector classifyNoLink(Vector features) {
    throw new UnsupportedOperationException(
        this.getClass().getName() + " doesn't support classification without a link");
  }

  /**
   * Classifies a vector in the special case of a binary classifier where
   * {@link #classify(Vector)} would return a vector with only one element.  As such,
   * using this method avoids the allocation of a vector.
   * @param instance   The feature vector to be classified.
   * @return The score for category 1.
   *
   * @see #classify(Vector)
   */
  public abstract double classifyScalar(Vector instance);

  // ------- From here on, we have convenience methods that provide an easier API to use.

  /**
   * Returns n probabilities, one for each category.  If you can use an n-1 coding, and are touchy
   * about allocation performance, then the classify method is probably better to use.  The 0-th
   * element of the score vector returned by this method is the missing score as computed by the
   * classify method.
   *
   * @see #classify(Vector)
   * @see #classifyFull(Vector r, Vector instance)
   *
   * @param instance A vector of features to be classified.
   * @return A vector of probabilities, one for each category.
   */
  public Vector classifyFull(Vector instance) {
    return classifyFull(new DenseVector(numCategories()), instance);
  }

  /**
   * Returns n probabilities, one for each category into a pre-allocated vector.  One
   * vector allocation is still done in the process of multiplying by the coefficient
   * matrix, but that is hard to avoid.  The cost of such an ephemeral allocation is
   * very small in any case compared to the multiplication itself.
   *
   * Elements 1 to numCategories()-1 of {@code r} are overwritten with the scores returned by
   * {@link #classify(Vector)} and element 0 is set to one minus their sum.  The previous
   * contents of {@code r} do not influence the result, so the same vector can safely be reused
   * across calls.
   *
   * @param r        Where to put the results.  Must have room for numCategories() elements.
   * @param instance A vector of features to be classified.
   * @return The vector {@code r}, now holding one probability for each category.
   */
  public Vector classifyFull(Vector r, Vector instance) {
    Vector scores = r.viewPart(1, numCategories() - 1);
    scores.assign(classify(instance));
    // Sum only the freshly written scores.  Summing all of r would also pick up whatever value
    // was left in slot 0 by the caller, which corrupts the result when r is a reused buffer.
    r.setQuick(0, 1.0 - scores.zSum());
    return r;
  }


  /**
   * Returns n-1 probabilities, one for each category but the 0-th, for each row of a matrix. The
   * probability of the missing 0-th category is 1 - rowSum(this result).
   *
   * @param data The matrix whose rows are vectors to classify
   * @return A matrix of scores, one row per row of the input matrix, one column for each category
   *         except the 0-th.
   */
  public Matrix classify(Matrix data) {
    Matrix r = new DenseMatrix(data.numRows(), numCategories() - 1);
    for (int row = 0; row < data.numRows(); row++) {
      r.assignRow(row, classify(data.viewRow(row)));
    }
    return r;
  }

  /**
   * Returns n probabilities, one for each category, for each row of a matrix.
   *
   * @param data The matrix whose rows are vectors to classify
   * @return A matrix of scores, one row per row of the input matrix, one column for each
   *         category.
   */
  public Matrix classifyFull(Matrix data) {
    Matrix r = new DenseMatrix(data.numRows(), numCategories());
    for (int row = 0; row < data.numRows(); row++) {
      // The row view is used as the result buffer so the scores are written directly into r.
      classifyFull(r.viewRow(row), data.viewRow(row));
    }
    return r;
  }

  /**
   * Returns a vector of scores for category 1, one for each row of a matrix. This
   * only makes sense if there are exactly two categories, but calling this method in that case can
   * save a number of vector allocations.
   *
   * @param data The matrix whose rows are vectors to classify
   * @return A vector of scores, with one value per row of the input matrix.
   * @throws IllegalArgumentException if the classifier does not have exactly two categories.
   */
  public Vector classifyScalar(Matrix data) {
    Preconditions.checkArgument(numCategories() == 2, "Can only call classifyScalar with two categories");

    Vector r = new DenseVector(data.numRows());
    for (int row = 0; row < data.numRows(); row++) {
      r.set(row, classifyScalar(data.viewRow(row)));
    }
    return r;
  }

  /**
   * Returns a measure of how good the classification for a particular example actually is.
   *
   * @param actual The correct category for the example.  Any value that is not greater than
   *               zero is treated as category 0, whose probability is not returned directly by
   *               the classify methods and is therefore derived as one minus the other scores.
   * @param data   The vector to be classified.
   * @return The log likelihood of the correct answer as estimated by the current model.  This will
   *         always be <= 0 and larger (closer to 0) indicates better accuracy.  In order to simplify
   *         code that maintains running averages, we bound this value at -100.
   */
  public double logLikelihood(int actual, Vector data) {
    if (numCategories() == 2) {
      // Binary case: use the scalar score for category 1 and avoid allocating a vector.
      double p = classifyScalar(data);
      return actual > 0 ? boundedLog(Math.log(p)) : boundedLog(Math.log1p(-p));
    }
    Vector p = classify(data);
    if (actual > 0) {
      // classify() omits category 0, so category k is stored at index k - 1.
      return boundedLog(Math.log(p.get(actual - 1)));
    }
    // log1p(-x) computes log(1 - x), i.e. the log of the implied category 0 probability.
    return boundedLog(Math.log1p(-p.zSum()));
  }

  /** Clamps a log likelihood from below at {@link #MIN_LOG_LIKELIHOOD}. */
  private static double boundedLog(double logValue) {
    return Math.max(MIN_LOG_LIKELIHOOD, logValue);
  }
}