package edu.stanford.nlp.classify;

import java.lang.reflect.Array;
import java.util.Arrays;
import java.util.Collection;
import java.util.concurrent.CountDownLatch;

import edu.stanford.nlp.ling.Datum;
import edu.stanford.nlp.math.ADMath;
import edu.stanford.nlp.math.ArrayMath;
import edu.stanford.nlp.math.DoubleAD;
import edu.stanford.nlp.optimization.AbstractStochasticCachingDiffUpdateFunction;
import edu.stanford.nlp.optimization.StochasticCalculateMethods;
import edu.stanford.nlp.util.ArgumentParser;
import edu.stanford.nlp.util.Index;
import edu.stanford.nlp.util.RuntimeInterruptedException;
import edu.stanford.nlp.util.logging.Redwood;

/**
 * Maximizes the conditional likelihood with a given prior.
 *
 * @author Dan Klein
 * @author Galen Andrew
 * @author Chris Cox (merged w/ SumConditionalObjectiveFunction, 2/16/05)
 * @author Sarah Spikes (templatization, allowing an {@code Iterable<Datum<L, F>>} to be passed in instead of a {@code GeneralDataset<L, F>})
 * @author Angel Chang (support in-place SGD - extend AbstractStochasticCachingDiffUpdateFunction)
 * @author Christopher Manning (cleaned out the cruft and sped it up in 2014)
 * @author Keenon Werling (added some multithreading to the batch evaluations)
 */
public class LogConditionalObjectiveFunction<L, F> extends AbstractStochasticCachingDiffUpdateFunction {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(LogConditionalObjectiveFunction.class);

  protected final LogPrior prior;
  protected final int numFeatures;
  protected final int numClasses;

  /** Normally, this contains the data. The first index is the datum number,
   *  and then there is an array of feature indices for each datum.
   */
  protected final int[][] data;
  /** Alternatively, the data may be available from an Iterable in not yet
   *  indexed form. (In 2014, it's not clear any code actually uses this option.)
   *  And then you need an index for both.
   */
  protected final Iterable<Datum<L, F>> dataIterable;
  protected final Index<L> labelIndex;
  protected final Index<F> featureIndex;

  /** Same size as data if the features have values; null if the features are binary. */
  protected final double[][] values;
  /** The label of each data index. */
  protected final int[] labels;

  protected final float[] dataWeights;

  protected final boolean useSummedConditionalLikelihood; // whether to use sumConditional or logConditional

  /** This is used to cache the numerator in batch methods. */
  protected double[] derivativeNumerator = null;

  /** The only reason this is around is because the prior functions don't handle stochastic calculations yet. */
  protected double[] priorDerivative = null;

  /** The flag to tell the gradient computations to multithread over the data. */
  protected boolean parallelGradientCalculation = true;

  /** Multithreading gradient calculations is a bit cheaper if you reuse the threads. */
  protected int threads = ArgumentParser.threads;

  @Override
  public int domainDimension() {
    return numFeatures * numClasses;
  }

  @Override
  public int dataDimension() {
    return data.length;
  }

  private int classOf(int index) {
    return index % numClasses;
  }

  private int featureOf(int index) {
    return index / numClasses;
  }

  /** Converts a Phi feature number and class index into an f(x,y) feature index. */
  // [cdm2014: Tried inlining this; no big gains.]
  protected int indexOf(int f, int c) {
    return f * numClasses + c;
  }

  public double[][] to2D(double[] x) {
    double[][] x2 = new double[numFeatures][numClasses];
    for (int i = 0; i < numFeatures; i++) {
      for (int j = 0; j < numClasses; j++) {
        x2[i][j] = x[indexOf(i, j)];
      }
    }
    return x2;
  }
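  /*
   * Illustrative sketch (not part of the original API): the flat weight layout
   * interleaves classes within each feature, so with numClasses = 3 the weights
   * for feature f occupy positions [3f, 3f+1, 3f+2]:
   *
   *   int i = indexOf(2, 1);    // = 2 * 3 + 1 = 7
   *   double[][] w = to2D(x);   // w[2][1] == x[7]
   *
   * and classOf(7) == 1, featureOf(7) == 2 recover the (feature, class) pair.
   */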
  /**
   * Calculate the conditional likelihood.
   * If {@code useSummedConditionalLikelihood} is {@code false} (the default),
   * this calculates standard (product) CL; otherwise, it calculates summed CL.
   * What's the difference? See Klein and Manning's 2002 EMNLP paper.
   */
  @Override
  protected void calculate(double[] x) {
    if (useSummedConditionalLikelihood) {
      calculateSCL(x);
    } else {
      calculateCL(x);
    }
  }

  /**
   * This function is used to come up with an estimate of the value / gradient based on only a
   * small portion of the data (referred to as the batch size, for lack of a better term; here
   * "batch" does not mean all of the data, but should be thought of as "a small batch of the data").
   */
  @Override
  public void calculateStochastic(double[] x, double[] v, int[] batch) {
    if (method.calculatesHessianVectorProduct() && v != null) {
      // This is used for stochastic methods that involve second-order information (SMD, for example)
      if (method.equals(StochasticCalculateMethods.AlgorithmicDifferentiation)) {
        calculateStochasticAlgorithmicDifferentiation(x, v, batch);
      } else if (method.equals(StochasticCalculateMethods.IncorporatedFiniteDifference)) {
        calculateStochasticFiniteDifference(x, v, finiteDifferenceStepSize, batch);
      }
    } else {
      // This is used for stochastic methods that don't need anything but the gradient (SGD)
      calculateStochasticGradientLocal(x, batch);
    }
  }

  /**
   * Calculate the summed conditional likelihood of this data by summing
   * conditional estimates.
   */
  private void calculateSCL(double[] x) {
    value = 0.0;
    Arrays.fill(derivative, 0.0);
    double[] sums = new double[numClasses];
    double[] probs = new double[numClasses];
    for (int d = 0; d < data.length; d++) {
      int[] features = data[d];
      // activation
      Arrays.fill(sums, 0.0);
      for (int c = 0; c < numClasses; c++) {
        for (int feature : features) {
          int i = indexOf(feature, c);
          sums[c] += x[i];
        }
      }
      // expectation (the slower logAdd loop was replaced by the fast logSum)
      double total = ArrayMath.logSum(sums);
      int ld = labels[d];
      for (int c = 0; c < numClasses; c++) {
        probs[c] = Math.exp(sums[c] - total);
        for (int feature : features) {
          int i = indexOf(feature, c);
          derivative[i] += probs[ld] * probs[c];
        }
      }
      // observed
      for (int feature : features) {
        int i = indexOf(feature, labels[d]);
        derivative[i] -= probs[ld];
      }
      value -= probs[ld];
    }
    // priors: a fixed quadratic prior with k = 1.0, applied directly rather than through this.prior
    for (int i = 0; i < x.length; i++) {
      double k = 1.0;
      double w = x[i];
      value += k * w * w / 2.0;
      derivative[i] += k * w;
    }
  }
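  /*
   * A note on the normalization used throughout (a sketch, restating what the
   * code above does): class scores are kept in log space and normalized with
   * ArrayMath.logSum rather than by summing raw exponentials, which avoids
   * overflow when scores are large:
   *
   *   double total = ArrayMath.logSum(sums);     // log sum_c exp(sums[c])
   *   double probC = Math.exp(sums[c] - total);  // p(c|x), safely in [0,1]
   *
   * The conditional log-likelihood contribution of a datum with label y is then
   * sums[y] - total, which is what the calculate* methods accumulate into value.
   */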
  /**
   * Calculate the conditional likelihood of this data by multiplying
   * conditional estimates. Full dataset batch estimation.
   */
  private void calculateCL(double[] x) {
    if (values != null) {
      rvfcalculate(x);
    } else if (dataIterable != null) {
      calculateCLiterable(x);
    } else {
      calculateCLbatch(x);
    }
  }

  private class CLBatchDerivativeCalculation implements Runnable {
    int numThreads;
    int threadIdx;
    double localValue = 0.0;
    double[] x;
    int[] batch;
    double[] localDerivative;
    CountDownLatch latch;

    public CLBatchDerivativeCalculation(int numThreads, int threadIdx, int[] batch, double[] x, int derivativeSize, CountDownLatch latch) {
      this.numThreads = numThreads;
      this.threadIdx = threadIdx;
      this.x = x;
      this.batch = batch;
      this.localDerivative = new double[derivativeSize];
      this.latch = latch;
    }

    @Override
    public void run() {
      double[] sums = new double[numClasses];
      double[] probs = new double[numClasses];

      // TODO: could probably get slightly better speedup if threads took linear subsequences, for caching
      int batchSize = batch == null ? data.length : batch.length;
      for (int m = threadIdx; m < batchSize; m += numThreads) {
        int d = batch == null ? m : batch[m];

        // activation
        Arrays.fill(sums, 0.0);
        int[] featuresArr = data[d];
        for (int c = 0; c < numClasses; c++) {
          for (int feature : featuresArr) {
            int i = indexOf(feature, c);
            sums[c] += x[i];
          }
        }

        // expectation (the slower logAdd loop was replaced by the fast logSum)
        double total = ArrayMath.logSum(sums);
        for (int c = 0; c < numClasses; c++) {
          probs[c] = Math.exp(sums[c] - total);
          if (dataWeights != null) {
            probs[c] *= dataWeights[d];
          }
        }
        for (int c = 0; c < numClasses; c++) {
          for (int feature : featuresArr) {
            int i = indexOf(feature, c);
            localDerivative[i] += probs[c];
          }
        }

        int labelindex = labels[d];
        double dV = sums[labelindex] - total;
        if (dataWeights != null) {
          dV *= dataWeights[d];
        }
        localValue -= dV;
      }
      latch.countDown();
    }
  }
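  /*
   * How the worker above is driven (a sketch mirroring the calling code below):
   * each of the `threads` runnables takes a strided slice of the data (indices
   * threadIdx, threadIdx + numThreads, ...), accumulates into its private
   * localDerivative, and the caller reduces the per-thread results after the
   * latch releases. With 4 threads, thread 0 handles data 0, 4, 8, ...
   *
   *   CountDownLatch latch = new CountDownLatch(threads);
   *   for (int i = 0; i < threads; i++) {
   *     new Thread(new CLBatchDerivativeCalculation(threads, i, null, x, derivative.length, latch)).start();
   *   }
   *   latch.await();  // then sum localValue / localDerivative into the shared state
   */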
  private void calculateCLbatch(double[] x) {
    value = 0.0;
    // [cdm Mar 2014] Resetting derivative here is unnecessary: it is allocated by ensure() in
    // AbstractCachingDiffFunction before calculate() is called, and derivativeNumerator is copied into it below.
    if (derivativeNumerator == null) {
      derivativeNumerator = new double[x.length];
      for (int d = 0; d < data.length; d++) {
        int[] features = data[d];
        for (int feature : features) {
          int i = indexOf(feature, labels[d]);
          if (dataWeights == null) {
            derivativeNumerator[i] -= 1;
          } else {
            derivativeNumerator[i] -= dataWeights[d];
          }
        }
      }
    }
    copy(derivative, derivativeNumerator);

    if (parallelGradientCalculation && threads > 1) {
      // Launch several threads to handle the computation
      @SuppressWarnings("unchecked")
      CLBatchDerivativeCalculation[] runnables = (CLBatchDerivativeCalculation[]) Array.newInstance(CLBatchDerivativeCalculation.class, threads);
      CountDownLatch latch = new CountDownLatch(threads);

      for (int i = 0; i < threads; i++) {
        runnables[i] = new CLBatchDerivativeCalculation(threads, i, null, x, derivative.length, latch);
        new Thread(runnables[i]).start();
      }
      try {
        latch.await();
      } catch (InterruptedException e) {
        throw new RuntimeInterruptedException(e);
      }

      for (int i = 0; i < threads; i++) {
        value += runnables[i].localValue;
        for (int j = 0; j < derivative.length; j++) {
          derivative[j] += runnables[i].localDerivative[j];
        }
      }
    } else {
      double[] sums = new double[numClasses];
      double[] probs = new double[numClasses];

      for (int d = 0; d < data.length; d++) {
        // activation
        Arrays.fill(sums, 0.0);
        int[] featuresArr = data[d];

        for (int feature : featuresArr) {
          for (int c = 0; c < numClasses; c++) {
            int i = indexOf(feature, c);
            sums[c] += x[i];
          }
        }

        // expectation (the slower logAdd loop was replaced by the fast logSum)
        double total = ArrayMath.logSum(sums);
        for (int c = 0; c < numClasses; c++) {
          probs[c] = Math.exp(sums[c] - total);
          if (dataWeights != null) {
            probs[c] *= dataWeights[d];
          }
        }
        for (int feature : featuresArr) {
          for (int c = 0; c < numClasses; c++) {
            int i = indexOf(feature, c);
            derivative[i] += probs[c];
          }
        }

        int labelindex = labels[d];
        double dV = sums[labelindex] - total;
        if (dataWeights != null) {
          dV *= dataWeights[d];
        }
        value -= dV;
      }
    }

    value += prior.compute(x, derivative);
  }
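  /*
   * Gradient bookkeeping used by the batch methods (a sketch of the math, not
   * new behavior): for the negative conditional log-likelihood, the partial
   * derivative for weight (f, c) over binary features decomposes as
   *
   *   d(-log L)/dw_{f,c} = sum over data d containing f of [ p(c | x_d) - I(y_d == c) ]
   *
   * The empirical term -I(y_d == c) depends only on the data, so it is computed
   * once, cached in derivativeNumerator, and copied into derivative at the
   * start of every call; only the model expectation p(c | x_d) is recomputed.
   */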
  private void calculateCLiterable(double[] x) {
    value = 0.0;
    // [cdm Mar 2014] As above, derivative is allocated by ensure() in AbstractCachingDiffFunction
    // before calculate() is called, and derivativeNumerator is copied into it below.
    if (derivativeNumerator == null) {
      // TODO: Make sure this works as expected!
      derivativeNumerator = new double[x.length];
      for (Datum<L, F> datum : dataIterable) {
        Collection<F> features = datum.asFeatures();
        for (F feature : features) {
          int i = indexOf(featureIndex.indexOf(feature), labelIndex.indexOf(datum.label()));
          if (dataWeights == null) {
            derivativeNumerator[i] -= 1;
          }
          // dataWeights are not supported on the Iterable path
        }
      }
    }
    copy(derivative, derivativeNumerator);

    double[] sums = new double[numClasses];
    double[] probs = new double[numClasses];

    for (Datum<L, F> datum : dataIterable) {
      // activation
      Arrays.fill(sums, 0.0);
      Collection<F> features = datum.asFeatures();
      for (F feature : features) {
        for (int c = 0; c < numClasses; c++) {
          int i = indexOf(featureIndex.indexOf(feature), c);
          sums[c] += x[i];
        }
      }

      // expectation (the slower logAdd loop was replaced by the fast logSum)
      double total = ArrayMath.logSum(sums);
      for (int c = 0; c < numClasses; c++) {
        probs[c] = Math.exp(sums[c] - total);
      }
      for (F feature : features) {
        for (int c = 0; c < numClasses; c++) {
          int i = indexOf(featureIndex.indexOf(feature), c);
          derivative[i] += probs[c];
        }
      }

      int label = this.labelIndex.indexOf(datum.label());
      double dV = sums[label] - total;
      value -= dV;
    }

    value += prior.compute(x, derivative);
  }
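  /*
   * The next method estimates the Hessian-vector product by finite differences.
   * A sketch of the identity it relies on: for a small step h and gradient g,
   *
   *   H v  ~=  ( g(x + h*v) - g(x) ) / h
   *
   * The implementation computes both gradients in one pass by accumulating
   * sums (for x) and sumsV (for x + h*v) per datum, then differencing the
   * resulting probabilities into HdotV.
   */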
  public void calculateStochasticFiniteDifference(double[] x, double[] v, double h, int[] batch) {
    // THOUGHTS:
    // Does applying the renormalization (g(x+hv) - g(x)) / h at each step along the way
    // introduce too much error to make this method numerically accurate?
    // akleeman Feb 23 2007

    // Answer to my own question: Feb 25th
    // Doesn't look like it! With h = 1e-4 it seems like the finite difference makes almost
    // exactly the same step as the exact Hessian-vector product calculated through AD.
    // That said, it's probably (in the case of the log conditional objective function) logical
    // to only use finite difference. Unless of course the function is somehow nearly singular,
    // in which case finite difference could turn what is a convex problem into a singular problem... NOT GOOD.

    if (values != null) {
      rvfcalculate(x);
      return;
    }

    value = 0.0;
    if (priorDerivative == null) {
      priorDerivative = new double[x.length];
    }

    double priorFactor = batch.length / (data.length * prior.getSigma() * prior.getSigma());

    derivative = ArrayMath.multiply(x, priorFactor);
    HdotV = ArrayMath.multiply(v, priorFactor);

    double[] sums = new double[numClasses];
    double[] sumsV = new double[numClasses];
    double[] probs = new double[numClasses];
    double[] probsV = new double[numClasses];

    for (int m : batch) {
      // Sets the index based on the current batch
      int[] features = data[m];
      // activation
      Arrays.fill(sums, 0.0);
      Arrays.fill(sumsV, 0.0);

      for (int c = 0; c < numClasses; c++) {
        for (int feature : features) {
          int i = indexOf(feature, c);
          sums[c] += x[i];
          sumsV[c] += x[i] + h * v[i];
        }
      }

      double total = ArrayMath.logSum(sums);
      double totalV = ArrayMath.logSum(sumsV);

      for (int c = 0; c < numClasses; c++) {
        probs[c] = Math.exp(sums[c] - total);
        probsV[c] = Math.exp(sumsV[c] - totalV);
        if (dataWeights != null) {
          probs[c] *= dataWeights[m];
          probsV[c] *= dataWeights[m];
        }
        for (int feature : features) {
          int i = indexOf(feature, c);
          derivative[i] += probs[c];
          HdotV[i] += (probsV[c] - probs[c]) / h;
          if (c == labels[m]) {
            derivative[i] -= 1;
          }
        }
      }

      double dV = sums[labels[m]] - total;
      if (dataWeights != null) {
        dV *= dataWeights[m];
      }
      value -= dV;
    }

    // Why was this being copied? -akleeman
    // double[] tmpDeriv = new double[derivative.length];
    // System.arraycopy(derivative, 0, tmpDeriv, 0, derivative.length);
    value += ((double) batch.length) / ((double) data.length) * prior.compute(x, priorDerivative);
  }

  public void calculateStochasticGradientLocal(double[] x, int[] batch) {
    if (values != null) {
      rvfcalculate(x);
      return;
    }

    value = 0.0;
    int batchSize = batch.length;
    if (priorDerivative == null) {
      priorDerivative = new double[x.length];
    }

    double priorFactor = batchSize / (data.length * prior.getSigma() * prior.getSigma());

    derivative = ArrayMath.multiply(x, priorFactor);

    double[] sums = new double[numClasses];
    double[] probs = new double[numClasses];

    for (int m : batch) {
      // Sets the index based on the current batch
      int[] features = data[m];
      // activation
      Arrays.fill(sums, 0.0);

      for (int c = 0; c < numClasses; c++) {
        for (int feature : features) {
          int i = indexOf(feature, c);
          sums[c] += x[i];
        }
      }

      double total = ArrayMath.logSum(sums);
      for (int c = 0; c < numClasses; c++) {
        probs[c] = Math.exp(sums[c] - total);
        if (dataWeights != null) {
          probs[c] *= dataWeights[m];
        }
        for (int feature : features) {
          int i = indexOf(feature, c);
          derivative[i] += probs[c];
          if (c == labels[m]) {
            derivative[i] -= 1;
          }
        }
      }

      double dV = sums[labels[m]] - total;
      if (dataWeights != null) {
        dV *= dataWeights[m];
      }
      value -= dV;
    }

    value += ((double) batchSize) / ((double) data.length) * prior.compute(x, priorDerivative);
  }
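  /*
   * Both stochastic methods above scale the prior term by batch.length / data.length
   * so that, summed over the mini-batches of an epoch, the regularization applied
   * roughly matches what a single full-batch pass would apply once. For example,
   * with batches of 50 out of 5000 data, each call applies 1% of the quadratic penalty.
   */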
  @Override
  public double valueAt(double[] x, double xscale, int[] batch) {
    value = 0.0;
    double[] sums = new double[numClasses];

    for (int m : batch) {
      // Sets the index based on the current batch
      int[] features = data[m];
      Arrays.fill(sums, 0.0);

      for (int c = 0; c < numClasses; c++) {
        for (int f = 0; f < features.length; f++) {
          int i = indexOf(features[f], c);
          if (values != null) {
            sums[c] += x[i] * xscale * values[m][f];
          } else {
            sums[c] += x[i] * xscale;
          }
        }
      }

      double total = ArrayMath.logSum(sums);
      double dV = sums[labels[m]] - total;
      if (dataWeights != null) {
        dV *= dataWeights[m];
      }
      value -= dV;
    }
    return value;
  }
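  /*
   * Note on xscale in valueAt above: the effective weights are x * xscale, but
   * the scaled array is never materialized; a caller that has shrunk all
   * weights by a constant (e.g. via a decayed SGD step) can pass the raw array
   * plus the accumulated scale factor instead of rescaling x in place.
   */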
  @Override
  public double calculateStochasticUpdate(double[] x, double xscale, int[] batch, double gain) {
    value = 0.0;

    // Double check that we don't have a mismatch between parallel and batch size settings
    if (parallelGradientCalculation && threads > 1) {
      int examplesPerProcessor = 50;
      if (batch.length <= Runtime.getRuntime().availableProcessors() * examplesPerProcessor) {
        log.info("\n\n***************");
        log.info("CONFIGURATION ERROR: YOUR BATCH SIZE DOESN'T MEET PARALLEL MINIMUM SIZE FOR PERFORMANCE");
        log.info("Batch size: " + batch.length);
        log.info("CPUs: " + Runtime.getRuntime().availableProcessors());
        log.info("Minimum batch size per CPU: " + examplesPerProcessor);
        log.info("MINIMUM BATCH SIZE ON THIS MACHINE: " + (Runtime.getRuntime().availableProcessors() * examplesPerProcessor));
        log.info("TURNING OFF PARALLEL GRADIENT COMPUTATION");
        log.info("***************\n");
        parallelGradientCalculation = false;
      }
    }

    if (parallelGradientCalculation && threads > 1) {
      // Launch several threads to handle the computation
      @SuppressWarnings("unchecked")
      CLBatchDerivativeCalculation[] runnables = (CLBatchDerivativeCalculation[]) Array.newInstance(CLBatchDerivativeCalculation.class, threads);
      CountDownLatch latch = new CountDownLatch(threads);

      for (int i = 0; i < threads; i++) {
        runnables[i] = new CLBatchDerivativeCalculation(threads, i, batch, x, x.length, latch);
        new Thread(runnables[i]).start();
      }
      try {
        latch.await();
      } catch (InterruptedException e) {
        throw new RuntimeInterruptedException(e);
      }

      for (int i = 0; i < threads; i++) {
        value += runnables[i].localValue;
        for (int j = 0; j < x.length; j++) {
          x[j] += runnables[i].localDerivative[j] * xscale * gain;
        }
      }
    } else {
      double[] sums = new double[numClasses];
      double[] probs = new double[numClasses];

      for (int m : batch) {
        // Sets the index based on the current batch
        int[] features = data[m];
        // activation
        Arrays.fill(sums, 0.0);

        for (int c = 0; c < numClasses; c++) {
          for (int f = 0; f < features.length; f++) {
            int i = indexOf(features[f], c);
            if (values != null) {
              sums[c] += x[i] * xscale * values[m][f];
            } else {
              sums[c] += x[i] * xscale;
            }
          }
        }

        for (int f = 0; f < features.length; f++) {
          int i = indexOf(features[f], labels[m]);
          double v = (values != null) ? values[m][f] : 1;
          double delta = (dataWeights != null) ? dataWeights[m] * v : v;
          x[i] += delta * gain;
        }

        double total = ArrayMath.logSum(sums);
        for (int c = 0; c < numClasses; c++) {
          probs[c] = Math.exp(sums[c] - total);
          if (dataWeights != null) {
            probs[c] *= dataWeights[m];
          }
          for (int f = 0; f < features.length; f++) {
            int i = indexOf(features[f], c);
            double v = (values != null) ? values[m][f] : 1;
            double delta = probs[c] * v;
            x[i] -= delta * gain;
          }
        }

        double dV = sums[labels[m]] - total;
        if (dataWeights != null) {
          dV *= dataWeights[m];
        }
        value -= dV;
      }
    }
    return value;
  }

  @Override
  public void calculateStochasticGradient(double[] x, int[] batch) {
    if (derivative == null) {
      derivative = new double[domainDimension()];
    }
    Arrays.fill(derivative, 0.0);
    double[] sums = new double[numClasses];
    double[] probs = new double[numClasses];
    for (int d : batch) {
      // Sets the index based on the current batch
      int[] features = data[d];
      // activation
      Arrays.fill(sums, 0.0);

      for (int c = 0; c < numClasses; c++) {
        for (int feature : features) {
          int i = indexOf(feature, c);
          sums[c] += x[i];
        }
      }

      // expectation (the slower logAdd loop was replaced by the fast logSum)
      double total = ArrayMath.logSum(sums);

      int ld = labels[d];
      for (int c = 0; c < numClasses; c++) {
        probs[c] = Math.exp(sums[c] - total);
        for (int feature : features) {
          int i = indexOf(feature, c);
          derivative[i] += probs[ld] * probs[c];
        }
      }
      // observed
      for (int feature : features) {
        int i = indexOf(feature, labels[d]);
        derivative[i] -= probs[ld];
      }
    }
  }
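  /*
   * Unlike the calculate* methods, calculateStochasticUpdate never builds a
   * gradient array: it folds the step directly into x, adding the gain-scaled
   * empirical term and subtracting the gain-scaled expected term per feature,
   * which is what makes in-place SGD cheap. A caller's loop looks roughly like
   * this (a sketch; sampleBatch is a hypothetical helper, not part of this class):
   *
   *   for (int iter = 0; iter < numIters; iter++) {
   *     int[] batch = sampleBatch(batchSize);
   *     objective.calculateStochasticUpdate(x, 1.0, batch, learningRate);
   *   }
   */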
  protected void calculateStochasticAlgorithmicDifferentiation(double[] x, double[] v, int[] batch) {
    log.info("*");

    // Initialize
    value = 0.0;

    DoubleAD[] derivativeAD = new DoubleAD[x.length];
    for (int i = 0; i < x.length; i++) {
      derivativeAD[i] = new DoubleAD(0.0, 0.0);
    }

    DoubleAD[] xAD = new DoubleAD[x.length];
    for (int i = 0; i < x.length; i++) {
      xAD[i] = new DoubleAD(x[i], v[i]);
    }

    // Initialize the sums
    DoubleAD[] sums = new DoubleAD[numClasses];
    for (int c = 0; c < numClasses; c++) {
      sums[c] = new DoubleAD(0, 0);
    }

    DoubleAD[] probs = new DoubleAD[numClasses];
    for (int c = 0; c < numClasses; c++) {
      probs[c] = new DoubleAD(0, 0);
    }

    // Copy the derivative numerator, and set up the vector v to be used for Hess*v
    for (int i = 0; i < x.length; i++) {
      xAD[i].set(x[i], v[i]);
      derivativeAD[i].set(0.0, 0.0);
    }

    for (int d = 0; d < batch.length; d++) {
      // Sets the index based on the current batch
      int m = (curElement + d) % data.length;
      int[] features = data[m];

      for (int c = 0; c < numClasses; c++) {
        sums[c].set(0.0, 0.0);
      }

      for (int c = 0; c < numClasses; c++) {
        for (int feature : features) {
          int i = indexOf(feature, c);
          sums[c] = ADMath.plus(sums[c], xAD[i]);
        }
      }

      DoubleAD total = ADMath.logSum(sums);

      for (int c = 0; c < numClasses; c++) {
        probs[c] = ADMath.exp(ADMath.minus(sums[c], total));
        if (dataWeights != null) {
          // [fixed: was dataWeights[d], which indexed by batch position rather than by datum]
          probs[c] = ADMath.multConst(probs[c], dataWeights[m]);
        }
        for (int feature : features) {
          int i = indexOf(feature, c);
          if (c == labels[m]) {
            derivativeAD[i].plusEqualsConst(-1.0);
          }
          derivativeAD[i].plusEquals(probs[c]);
        }
      }

      double dV = sums[labels[m]].getval() - total.getval();
      if (dataWeights != null) {
        dV *= dataWeights[m];  // [fixed: was dataWeights[d]]
      }
      value -= dV;
    }

    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    // DANGEROUS!!!!!!! Divide by zero possible!!!!!!!!!!
    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    // Need to modify the prior class to handle AD -akleeman
    double[] tmp = new double[x.length];
    for (int i = 0; i < x.length; i++) {
      tmp[i] = derivativeAD[i].getval();
      derivativeAD[i].plusEquals(ADMath.multConst(xAD[i], batch.length / (data.length * prior.getSigma() * prior.getSigma())));
      derivative[i] = derivativeAD[i].getval();
      HdotV[i] = derivativeAD[i].getdot();
    }
    value += ((double) batch.length) / ((double) data.length) * prior.compute(x, tmp);
  }

  private class RVFDerivativeCalculation implements Runnable {
    int numThreads;
    int threadIdx;
    double localValue = 0.0;
    double[] x;
    double[] localDerivative;
    CountDownLatch latch;

    public RVFDerivativeCalculation(int numThreads, int threadIdx, double[] x, int derivativeSize, CountDownLatch latch) {
      this.numThreads = numThreads;
      this.threadIdx = threadIdx;
      this.x = x;
      this.localDerivative = new double[derivativeSize];
      this.latch = latch;
    }

    @Override
    public void run() {
      double[] sums = new double[numClasses];
      double[] probs = new double[numClasses];

      for (int d = threadIdx; d < data.length; d += numThreads) {
        final int[] features = data[d];
        final double[] vals = values[d];

        // activation
        Arrays.fill(sums, 0.0);

        for (int c = 0; c < numClasses; c++) {
          for (int f = 0; f < features.length; f++) {
            final int feature = features[f];
            final double val = vals[f];
            int i = indexOf(feature, c);
            sums[c] += x[i] * val;
          }
        }

        // expectation (the slower logAdd loop was replaced by the fast logSum)
        // It is faster to split these two loops. More striding.
        double total = ArrayMath.logSum(sums);
        for (int c = 0; c < numClasses; c++) {
          probs[c] = Math.exp(sums[c] - total);
          if (dataWeights != null) {
            probs[c] *= dataWeights[d];
          }
        }
        for (int c = 0; c < numClasses; c++) {
          for (int f = 0; f < features.length; f++) {
            final int feature = features[f];
            final double val = vals[f];
            int i = indexOf(feature, c);
            localDerivative[i] += probs[c] * val;
          }
        }

        double dV = sums[labels[d]] - total;
        if (dataWeights != null) {
          dV *= dataWeights[d];
        }
        localValue -= dV;
      }
      latch.countDown();
    }
  }
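  /*
   * The RVF worker above differs from CLBatchDerivativeCalculation only in that
   * every feature contribution is weighted by its real value: sums[c] += x[i] * val
   * and localDerivative[i] += probs[c] * val, rather than the implicit val = 1.0
   * of the binary-feature path.
   */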
  /**
   * Calculate conditional likelihood for datasets with real-valued features.
   * Currently this can calculate CL only (no support for SCL).
   * TODO: sum-conditional obj. fun. with RVFs.
   */
  protected void rvfcalculate(double[] x) {
    value = 0.0;
    // This is only calculated once per training run, not worth the effort to multi-thread properly
    if (derivativeNumerator == null) {
      derivativeNumerator = new double[x.length];
      for (int d = 0; d < data.length; d++) {
        final int[] features = data[d];
        final double[] vals = values[d];
        for (int f = 0; f < features.length; f++) {
          int i = indexOf(features[f], labels[d]);
          if (dataWeights == null) {
            derivativeNumerator[i] -= vals[f];
          } else {
            derivativeNumerator[i] -= dataWeights[d] * vals[f];
          }
        }
      }
    }
    copy(derivative, derivativeNumerator);

    if (parallelGradientCalculation && threads > 1) {
      // Launch several threads to handle the computation
      @SuppressWarnings("unchecked")
      RVFDerivativeCalculation[] runnables = (RVFDerivativeCalculation[]) Array.newInstance(RVFDerivativeCalculation.class, threads);
      CountDownLatch latch = new CountDownLatch(threads);

      for (int i = 0; i < threads; i++) {
        runnables[i] = new RVFDerivativeCalculation(threads, i, x, derivative.length, latch);
        new Thread(runnables[i]).start();
      }
      try {
        latch.await();
      } catch (InterruptedException e) {
        throw new RuntimeInterruptedException(e);
      }

      for (int i = 0; i < threads; i++) {
        value += runnables[i].localValue;
        for (int j = 0; j < derivative.length; j++) {
          derivative[j] += runnables[i].localDerivative[j];
        }
      }
    } else {
      // Do the calculation locally on this thread
      double[] sums = new double[numClasses];
      double[] probs = new double[numClasses];

      for (int d = 0; d < data.length; d++) {
        final int[] features = data[d];
        final double[] vals = values[d];

        // activation
        Arrays.fill(sums, 0.0);

        for (int f = 0; f < features.length; f++) {
          final int feature = features[f];
          final double val = vals[f];
          for (int c = 0; c < numClasses; c++) {
            int i = indexOf(feature, c);
            sums[c] += x[i] * val;
          }
        }

        // expectation (the slower logAdd loop was replaced by the fast logSum)
        // It is faster to split these two loops. More striding.
        double total = ArrayMath.logSum(sums);
        for (int c = 0; c < numClasses; c++) {
          probs[c] = Math.exp(sums[c] - total);
          if (dataWeights != null) {
            probs[c] *= dataWeights[d];
          }
        }
        for (int f = 0; f < features.length; f++) {
          final int feature = features[f];
          final double val = vals[f];
          for (int c = 0; c < numClasses; c++) {
            int i = indexOf(feature, c);
            derivative[i] += probs[c] * val;
          }
        }

        double dV = sums[labels[d]] - total;
        if (dataWeights != null) {
          dV *= dataWeights[d];
        }
        value -= dV;
      }
    }

    value += prior.compute(x, derivative);
  }
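  /*
   * A reminder, not new behavior: in all of the full-batch paths above,
   * prior.compute(x, derivative) does double duty. It returns the prior's
   * contribution to the objective value, and it adds the prior's gradient
   * into the passed derivative array in place.
   */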
  public LogConditionalObjectiveFunction(GeneralDataset<L, F> dataset) {
    this(dataset, new LogPrior(LogPrior.LogPriorType.QUADRATIC));
  }

  public LogConditionalObjectiveFunction(GeneralDataset<L, F> dataset, LogPrior prior) {
    this(dataset, prior, false);
  }

  public LogConditionalObjectiveFunction(GeneralDataset<L, F> dataset, float[] dataWeights, LogPrior prior) {
    this(dataset, prior, false, dataWeights);
  }

  public LogConditionalObjectiveFunction(GeneralDataset<L, F> dataset, LogPrior prior, boolean useSumCondObjFun) {
    this(dataset, prior, useSumCondObjFun, null);
  }

  /** Version passing in a GeneralDataset, which may have binary or real-valued features. */
  public LogConditionalObjectiveFunction(GeneralDataset<L, F> dataset, LogPrior prior, boolean useSumCondObjFun, float[] dataWeights) {
    this.prior = prior;
    this.useSummedConditionalLikelihood = useSumCondObjFun;
    this.numFeatures = dataset.numFeatures();
    this.numClasses = dataset.numClasses();
    this.data = dataset.getDataArray();
    this.labels = dataset.getLabelsArray();
    this.values = dataset.getValuesArray();
    if (dataWeights != null) {
      this.dataWeights = dataWeights;
    } else if (dataset instanceof WeightedDataset<?, ?>) {
      this.dataWeights = ((WeightedDataset<L, F>) dataset).getWeights();
    } else if (dataset instanceof WeightedRVFDataset<?, ?>) {
      this.dataWeights = ((WeightedRVFDataset<L, F>) dataset).getWeights();
    } else {
      this.dataWeights = null;
    }
    this.labelIndex = null;
    this.featureIndex = null;
    this.dataIterable = null;
  }

  // TODO: test this [none of our code actually even uses it].
  /** Version where an Iterable is passed in for the data. Doesn't support dataWeights. */
  public LogConditionalObjectiveFunction(Iterable<Datum<L, F>> dataIterable, LogPrior logPrior, Index<F> featureIndex, Index<L> labelIndex) {
    this.prior = logPrior;
    this.useSummedConditionalLikelihood = false;
    this.numFeatures = featureIndex.size();
    this.numClasses = labelIndex.size();
    this.data = null;
    this.dataIterable = dataIterable;
    this.labelIndex = labelIndex;
    this.featureIndex = featureIndex;
    this.labels = null;
    this.values = null;
    this.dataWeights = null;
  }

  public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, boolean useSumCondObjFun) {
    this(numFeatures, numClasses, data, labels, null, new LogPrior(LogPrior.LogPriorType.QUADRATIC), useSumCondObjFun);
  }

  public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels) {
    this(numFeatures, numClasses, data, labels, new LogPrior(LogPrior.LogPriorType.QUADRATIC));
  }

  public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, LogPrior prior) {
    this(numFeatures, numClasses, data, labels, null, prior);
  }

  public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, float[] dataWeights) {
    this(numFeatures, numClasses, data, labels, dataWeights, new LogPrior(LogPrior.LogPriorType.QUADRATIC));
  }

  public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, float[] dataWeights, LogPrior prior) {
    this(numFeatures, numClasses, data, labels, dataWeights, prior, false);
  }

  /** For binary features. Supports dataWeights. */
  public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, float[] dataWeights, LogPrior prior, boolean useSummedConditionalLikelihood) {
    this.numFeatures = numFeatures;
    this.numClasses = numClasses;
    this.data = data;
    this.values = null;
    this.labels = labels;
    this.prior = prior;
    this.dataWeights = dataWeights;
    this.labelIndex = null;
    this.featureIndex = null;
    this.dataIterable = null;
    this.useSummedConditionalLikelihood = useSummedConditionalLikelihood;
  }

  public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, int[] labels, int intPrior, double sigma, double epsilon) {
    this(numFeatures, numClasses, data, null, labels, intPrior, sigma, epsilon);
  }
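  /*
   * Which code path an instance uses is decided by the constructors: the
   * binary-feature constructors leave values == null, while the RVF
   * constructor below (and GeneralDatasets whose getValuesArray() is non-null)
   * set it, which routes calculate() through rvfcalculate() above.
   */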
  /** For real-valued features. Passing in a processed data set. */
  public LogConditionalObjectiveFunction(int numFeatures, int numClasses, int[][] data, double[][] values, int[] labels, int intPrior, double sigma, double epsilon) {
    this.numFeatures = numFeatures;
    this.numClasses = numClasses;
    this.data = data;
    this.values = values;
    this.labels = labels;
    this.prior = new LogPrior(intPrior, sigma, epsilon);
    this.labelIndex = null;
    this.featureIndex = null;
    this.dataIterable = null;
    this.useSummedConditionalLikelihood = false;
    this.dataWeights = null;
  }

}
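/*
 * Typical batch use, as a hedged sketch (QNMinimizer is the usual CoreNLP
 * companion for this objective; dataset construction is elided):
 *
 *   GeneralDataset<String, String> dataset = ...;
 *   LogConditionalObjectiveFunction<String, String> objective =
 *       new LogConditionalObjectiveFunction<>(dataset, new LogPrior(LogPrior.LogPriorType.QUADRATIC));
 *   double[] initial = new double[objective.domainDimension()];
 *   double[] weights = new QNMinimizer(15).minimize(objective, 1e-4, initial);
 *   double[][] perClassWeights = objective.to2D(weights);
 */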