// NBLinearClassifierFactory.java example
//
// Explorer
// CoreNLP-master
package edu.stanford.nlp.classify;

import edu.stanford.nlp.ling.BasicDatum;
import edu.stanford.nlp.optimization.GoldenSectionLineSearch;

import java.util.function.Function;


import edu.stanford.nlp.util.logging.Redwood;

/**
 * Provides a medium-weight implementation of Bernoulli (or binary)
 * Naive Bayes via a linear classifier.  It's medium weight in that
 * it uses dense arrays for counts and calculation (but, hey, NB is
 * efficient to estimate).  Each feature is treated as an independent
 * binary variable.
 * <p>
 * CDM Jun 2003: I added a dirty trick so that if there is a feature
 * that is always on in input examples, then its weight is turned into
 * a prior feature!  (This will work well iff it is also always on at
 * test time.)  In fact, this is done for each such feature, so by
 * having several such features, one can even get an integral prior
 * boost out of this.
 *
 * @author Dan Klein
 * @author Sarah Spikes (sdspikes@cs.stanford.edu) (Templatization)
 *
 * @param <L> The type of the labels in the Classifier
 * @param <F> The type of the features in the Classifier
 */
public class NBLinearClassifierFactory<L, F> extends AbstractLinearClassifierFactory<L, F>  {

  private static final long serialVersionUID = 1;

  /** A logger for this class. */
  final static Redwood.RedwoodChannels logger = Redwood.channels(NBLinearClassifierFactory.class);

  /** Compile-time switch: when true, the training data and probability ratios are logged. */
  private static final boolean VERBOSE = false;

  /** Fudge added to the class-prior estimate to keep it nonzero. */
  private static final double epsilon = 1e-30;

  /** Amount of add-k smoothing of evidence; overwritten by cross-validation when tuning is on. */
  private double sigma;
  private final boolean interpretAlwaysOnFeatureAsPrior;
  /** Whether sigma is tuned by cross-validation before the weights are trained. */
  private boolean tuneSigma = false;
  /** Number of cross-validation folds used when tuning sigma. */
  private int folds;


  /**
   * Create a ClassifierFactory with add-one smoothing (sigma = 1.0).
   */
  public NBLinearClassifierFactory() {
    this(1.0);
  }

  /**
   * Create a ClassifierFactory that treats always-on features as ordinary evidence.
   *
   * @param sigma The amount of add-sigma smoothing of evidence
   */
  public NBLinearClassifierFactory(double sigma) {
    this(sigma, false);
  }

  /**
   * Create a ClassifierFactory.
   *
   * @param sigma The amount of add-sigma smoothing of evidence
   * @param interpretAlwaysOnFeatureAsPrior If true, a feature that is in every
   *              data item is interpreted as an indication to include a prior
   *              factor over classes.  (If there are multiple such features, an
   *              integral "prior boost" will occur.)  If false, an always on
   *              feature is interpreted as an evidence feature (and, following
   *              the standard math) will have no effect on the model.
   */
  public NBLinearClassifierFactory(double sigma, boolean interpretAlwaysOnFeatureAsPrior) {
    this.sigma = sigma;
    this.interpretAlwaysOnFeatureAsPrior = interpretAlwaysOnFeatureAsPrior;
  }

  /**
   * setTuneSigmaCV sets the <code>tuneSigma</code> flag: when turned on,
   * the sigma is tuned by cross-validation.
   * If there is less data than the number of folds, leave-one-out is used.
   * The default for tuneSigma is false.
   *
   * @param folds Number of folds for cross validation; must be at least 1
   * @throws IllegalArgumentException if <code>folds</code> is less than 1
   *         (zero would divide by zero when the fold size is computed, and a
   *         negative value would make the cross-validation loop run zero times)
   */
  public void setTuneSigmaCV(int folds) {
    if (folds < 1) {
      throw new IllegalArgumentException("Number of cross-validation folds must be positive: " + folds);
    }
    tuneSigma = true;
    this.folds = folds;
  }

  /**
   * Train weights from a dataset by handing its data and label arrays to
   * {@link #trainWeights(int[][], int[])}.
   */
  @Override
  protected double[][] trainWeights(GeneralDataset<L, F> data) {
    return trainWeights(data.getDataArray(), data.getLabelsArray());
  }

  /**
   * Train weights.
   * If tuneSigma is true, the optimal sigma value is first found using
   * cross-validation: the number of folds is determined by the
   * <code>folds</code> variable; if there are fewer training examples than
   * folds, leave-one-out is used.
   *
   * @param data For each data item, the indices of its active features
   * @param labels For each data item, the index of its class
   * @return Weights indexed as <code>[feature][class]</code>
   */
  double[][] trainWeights(int[][] data, int[] labels) {
    if (tuneSigma) {
      tuneSigma(data, labels);
    }
    if (VERBOSE) {
      logger.info("NB CF: " + data.length + " data items ");
      for (int i = 0; i < data.length; i++) {
        logger.info("Datum " + i + ": " + labels[i] + ":");
        for (int j = 0; j < data[i].length; j++) {
          logger.info(" " + data[i][j]);
        }
        logger.info("");
      }
    }
    // The empty held-out range [0, 0) means every item is used for training.
    return estimateWeights(data, labels, 0, 0, sigma, VERBOSE);
  }

  /**
   * Estimate weights on all data items except those with index in
   * <code>[testMin, testMax)</code>, using <code>trialSigma</code> as the
   * smoothing amount.  Used for the training part of one cross-validation fold.
   *
   * @param data For each data item, the indices of its active features
   * @param labels For each data item, the index of its class
   * @param testMin Index of the first held-out data item (inclusive)
   * @param testMax Index after the last held-out data item (exclusive)
   * @param trialSigma The amount of add-sigma smoothing to use
   * @param foldSize Size of the held-out fold.  Kept for signature
   *              compatibility; the number of training items is now counted
   *              directly rather than derived from this value.
   * @return Weights indexed as <code>[feature][class]</code>
   */
  double[][] weights(int[][] data, int[] labels, int testMin, int testMax, double trialSigma, int foldSize) {
    return estimateWeights(data, labels, testMin, testMax, trialSigma, false);
  }

  /**
   * Shared estimator behind {@link #trainWeights(int[][], int[])} and
   * {@link #weights(int[][], int[], int, int, double, int)}: counts feature and
   * class occurrences over the training items and turns them into log ratios.
   *
   * @param data For each data item, the indices of its active features
   * @param labels For each data item, the index of its class
   * @param heldOutMin Index of the first item excluded from training (inclusive)
   * @param heldOutMax Index after the last item excluded from training (exclusive);
   *              an empty range excludes nothing
   * @param smoothing The amount of add-k smoothing of evidence
   * @param verbose Whether to log each probability ratio
   * @return Weights indexed as <code>[feature][class]</code>
   */
  private double[][] estimateWeights(int[][] data, int[] labels, int heldOutMin, int heldOutMax,
                                     double smoothing, boolean verbose) {
    int numFeatures = numFeatures();
    int numClasses = numClasses();
    // find P(C|F)/P(C)
    int num = 0;                                  // num training items
    double[] numc = new double[numClasses];       // num training items per class
    double n = 0;   // num active features in whole training set
    double[] n_c = new double[numClasses];  // num active features in class c items
    double[] n_f = new double[numFeatures]; // num data items for which feature is active
    double[][] n_fc = new double[numFeatures][numClasses];  // num times feature active in class c
    for (int d = 0; d < data.length; d++) {
      // Only whole data items are held out; every feature of a training item is counted.
      if (d >= heldOutMin && d < heldOutMax) {
        continue;
      }
      int label = labels[d];
      num++;
      numc[label]++;
      for (int feature : data[d]) {
        n++;
        n_c[label]++;
        n_f[feature]++;
        n_fc[feature][label]++;
      }
    }

    double[][] weights = new double[numFeatures][numClasses];
    for (int c = 0; c < numClasses; c++) {
      // p_c does not depend on the feature, so compute it once per class.
      double p_c = (n_c[c] + epsilon) / (n + numClasses * epsilon);
      for (int f = 0; f < numFeatures; f++) {
        if (interpretAlwaysOnFeatureAsPrior && n_f[f] == num) {
          // interpret always on feature as prior!
          weights[f][c] = Math.log(numc[c] / num);
        } else {
          // p_c_f = (N(f,c)+k)/(N(f)+|C|k) = Paddk(c|f)
          // set lambda = log (P()/P())
          double p_c_f = (n_fc[f][c] + smoothing) / (n_f[f] + smoothing * numClasses);
          if (verbose) {
            logger.info("Prob ratio(f=" + f + ",c=" + c + ") = " + p_c_f / p_c + " (nc=" + n_c[c] + ", nf=" + n_f[f] + ", nfc=" + n_fc[f][c] + ")");
          }
          weights[f][c] = Math.log(p_c_f / p_c);
        }
      }
    }
    return weights;
  }

  /**
   * Set <code>sigma</code> to the value that minimizes the summed negative
   * log probability of the held-out items over all cross-validation folds.
   * Folds are contiguous blocks of <code>data.length / folds</code> items, so
   * when the data size is not a multiple of the fold count the trailing
   * remainder is always trained on and never tested.
   *
   * @param data For each data item, the indices of its active features
   * @param labels For each data item, the index of its class
   */
  private void tuneSigma(final int[][] data, final int[] labels) {

    Function<Double, Double> cvSigmaToPerplexity = trialSigma -> {
      double sumScore = 0.0;
      int foldSize, nbCV;
      logger.info("Trying sigma = " + trialSigma);
      //test if enough training data
      if (data.length >= folds) {
        foldSize = data.length / folds;
        nbCV = folds;
      } else { //leave-one-out
        foldSize = 1;
        nbCV = data.length;
      }

      for (int fold = 0; fold < nbCV; fold++) {
        int testMin = fold * foldSize;
        int testMax = testMin + foldSize;

        LinearClassifier<L, F> c = new LinearClassifier<>(weights(data, labels, testMin, testMax, trialSigma, foldSize), featureIndex, labelIndex);
        // Start from zero for each fold so that every fold contributes exactly once to the total.
        double foldScore = 0.0;
        for (int i = testMin; i < testMax; i++) {
          foldScore -= c.logProbabilityOf(new BasicDatum<>(featureIndex.objects(data[i]))).getCount(labelIndex.get(labels[i]));
        }
        sumScore += foldScore;
      }
      System.err.printf(": %8g%n", sumScore);
      return sumScore;
    };

    // NOTE(review): the constructor flag and the three numeric arguments are passed through
    // unchanged; presumably a tolerance followed by the lower and upper search bounds -- confirm
    // against GoldenSectionLineSearch.
    GoldenSectionLineSearch gsls = new GoldenSectionLineSearch(true);
    sigma = gsls.minimize(cvSigmaToPerplexity, 0.01, 0.0001, 2.0);
    System.out.println("Sigma used: " + sigma);
  }

}