/*
 * Apache License
 * Version 2.0, January 2004
 * http://www.apache.org/licenses/
 *
 * Copyright 2013 Aurelian Tutuianu
 * Copyright 2014 Aurelian Tutuianu
 * Copyright 2015 Aurelian Tutuianu
 * Copyright 2016 Aurelian Tutuianu
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */

package rapaio.experiment.math.optimization;

import rapaio.math.MathTools;
import rapaio.math.linear.RV;
import rapaio.math.linear.dense.SolidRV;
import rapaio.util.Pair;

/**
 * Created by <a href="mailto:padreati@yahoo.com">Aurelian Tutuianu</a> on 11/24/15.
 */
@Deprecated
public interface Gradient {

    /**
     * Compute the gradient and loss given the features of a single data point.
     *
     * @param data    features for one data point
     * @param label   label for this data point
     * @param weights weights/coefficients corresponding to features
     * @return Pair(Vector gradient, Double loss)
     */
    default Pair<RV, Double> compute(RV data, double label, RV weights) {
        RV gradient = SolidRV.empty(weights.count());
        Double loss = compute(data, label, weights, gradient);
        return Pair.from(gradient, loss);
    }

    /**
     * Compute the gradient and loss given the features of a single data point,
     * add the gradient to a provided vector to avoid creating new objects, and return the loss.
     *
     * @param data        features for one data point
     * @param label       label for this data point
     * @param weights     weights/coefficients corresponding to features
     * @param cumGradient the computed gradient will be added to this vector
     * @return loss
     */
    Double compute(RV data, double label, RV weights, RV cumGradient);
}

/**
 * Compute gradient and loss for a least-squares loss function, as used in linear regression.
 * This is correct for the averaged least squares loss function (mean squared error)
 * L = 1/2n ||A weights - y||^2.
 * See also the documentation for the precise formulation.
 * <p>
 * Created by <a href="mailto:padreati@yahoo.com">Aurelian Tutuianu</a> on 11/24/15.
 */
@Deprecated
class LeastSquareGradient implements Gradient {

    @Override
    public Pair<RV, Double> compute(RV data, double label, RV weights) {
        // residual of the current prediction for this data point
        double diff = data.dotProd(weights) - label;
        double loss = diff * diff / 2.0;
        // gradient of 1/2 (w.x - y)^2 with respect to w is (w.x - y) * x
        RV gradient = data.solidCopy();
        gradient.dot(diff);
        return Pair.from(gradient, loss);
    }

    @Override
    public Double compute(RV data, double label, RV weights, RV cumGradient) {
        double diff = data.dotProd(weights) - label;
        cumGradient.plus(data.solidCopy().dot(diff));
        return diff * diff / 2.0;
    }
}
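/*
 * A minimal usage sketch (illustrative only, not part of the original sources): accumulating a
 * least-squares gradient for a single point with the in-place variant of compute. The values
 * below are made up; SolidRV.empty and RV.set are the only factory/mutator calls assumed here,
 * and both are already used elsewhere in this file.
 *
 *     Gradient lsq = new LeastSquareGradient();
 *     RV weights = SolidRV.empty(2);                 // start from w = (0, 0)
 *     RV cumGradient = SolidRV.empty(2);             // gradients are accumulated here
 *     RV x = SolidRV.empty(2);
 *     x.set(0, 1.0);
 *     x.set(1, 2.0);
 *     double loss = lsq.compute(x, 3.0, weights, cumGradient);
 *     // loss == (w.x - y)^2 / 2 == 4.5, cumGradient == (w.x - y) * x == (-3, -6)
 */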
/**
 * Compute gradient and loss for a multinomial logistic loss function, as used
 * in multi-class classification (it is also used in binary logistic regression).
 * <p>
 * In `The Elements of Statistical Learning: Data Mining, Inference, and Prediction, 2nd Edition`
 * by Trevor Hastie, Robert Tibshirani, and Jerome Friedman, which can be downloaded from
 * http://statweb.stanford.edu/~tibs/ElemStatLearn/ , Eq. (4.17) on page 119 gives the formula of
 * the multinomial logistic regression model. A simple calculation shows that
 * <p>
 * {{{
 * P(y=0|x, w)   = 1 / (1 + \sum_i^{K-1} \exp(x w_i))
 * P(y=1|x, w)   = \exp(x w_1) / (1 + \sum_i^{K-1} \exp(x w_i))
 *   ...
 * P(y=K-1|x, w) = \exp(x w_{K-1}) / (1 + \sum_i^{K-1} \exp(x w_i))
 * }}}
 * <p>
 * for a K-class classification problem.
 * <p>
 * The model weights w = (w_1, w_2, ..., w_{K-1})^T become a matrix of dimension
 * (K-1) * (N+1) if the intercepts are added. If the intercepts are not added, the dimension
 * will be (K-1) * N.
 * <p>
 * As a result, the loss of the objective function for a single instance of data can be written as
 * {{{
 * l(w, x) = -log P(y|x, w) = -\alpha(y) log P(y=0|x, w) - (1-\alpha(y)) log P(y|x, w)
 *         = log(1 + \sum_i^{K-1} \exp(x w_i)) - (1-\alpha(y)) x w_{y-1}
 *         = log(1 + \sum_i^{K-1} \exp(margins_i)) - (1-\alpha(y)) margins_{y-1}
 * }}}
 * <p>
 * where \alpha(i) = 1 if i == 0,
 *       \alpha(i) = 0 if i != 0, and
 *       margins_i = x w_i.
 * <p>
 * For optimization, we have to calculate the first derivative of the loss function, and
 * a simple calculation shows that
 * <p>
 * {{{
 * \frac{\partial l(w, x)}{\partial w_{ij}}
 *   = (\exp(x w_i) / (1 + \sum_k^{K-1} \exp(x w_k)) - (1-\alpha(y)) \delta_{y, i+1}) * x_j
 *   = multiplier_i * x_j
 * }}}
 * <p>
 * where \delta_{i, j} = 1 if i == j,
 *       \delta_{i, j} = 0 if i != j, and
 *       multiplier_i =
 *         \exp(margins_i) / (1 + \sum_k^{K-1} \exp(margins_k)) - (1-\alpha(y)) \delta_{y, i+1}
 * <p>
 * If any margin is larger than 709.78, the numerical computation of the multiplier and the loss
 * function will suffer from arithmetic overflow. This issue occurs when there are outliers in
 * the data which are far away from the hyperplane, and it causes training to fail once
 * infinity / infinity is introduced. Note that this is only a concern when max(margins) > 0.
 * <p>
 * Fortunately, when max(margins) = maxMargin > 0, the loss function and the multiplier can be
 * easily rewritten into the following equivalent, numerically stable formula.
 * <p>
 * {{{
 * l(w, x) = log(1 + \sum_i^{K-1} \exp(margins_i)) - (1-\alpha(y)) margins_{y-1}
 *         = log(\exp(-maxMargin) + \sum_i^{K-1} \exp(margins_i - maxMargin)) + maxMargin
 *           - (1-\alpha(y)) margins_{y-1}
 *         = log(1 + sum) + maxMargin - (1-\alpha(y)) margins_{y-1}
 * }}}
 * <p>
 * where sum = \exp(-maxMargin) + \sum_i^{K-1} \exp(margins_i - maxMargin) - 1.
 * <p>
 * Note that each term, (margins_i - maxMargin), in the \exp is smaller than zero; as a result,
 * overflow will not happen with this formula.
 * <p>
 * A similar trick can be applied to the multiplier,
 * <p>
 * {{{
 * multiplier_i = \exp(margins_i) / (1 + \sum_k^{K-1} \exp(margins_k)) - (1-\alpha(y)) \delta_{y, i+1}
 *              = \exp(margins_i - maxMargin) / (1 + sum) - (1-\alpha(y)) \delta_{y, i+1}
 * }}}
 * <p>
 * where each term in the \exp is also smaller than zero, so overflow is not a concern.
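 * <p>
 * As a small worked illustration of the rewriting above (the numbers are chosen for this note and
 * are not part of the original text): take K = 3, margins = (800, 3), and let y be the class whose
 * margin is 800. The naive formula needs \exp(800), which overflows a double (the limit is about
 * 709.78). With maxMargin = 800 the stable form only evaluates terms that underflow harmlessly:
 * {{{
 * sum     = \exp(-800) + (\exp(800 - 800) + \exp(3 - 800)) - 1 ≈ 0
 * l(w, x) = log(1 + sum) + maxMargin - (1-\alpha(y)) margins_{y-1} ≈ 0 + 800 - 800 = 0
 * }}}
 * which is the expected loss for a confidently correct prediction.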
 * <p>
 * For the detailed mathematical derivation, see the reference at
 * http://www.slideshare.net/dbtsai/2014-0620-mlor-36132297
 * <p>
 * numClasses is the number of possible outcomes of a k-class classification problem in
 * multinomial logistic regression. By default it is binary logistic regression, so numClasses
 * is set to 2.
 * <p>
 * Created by <a href="mailto:padreati@yahoo.com">Aurelian Tutuianu</a> on 11/24/15.
 */
@Deprecated
class LogisticGradient implements Gradient {

    private final int numClasses;

    public LogisticGradient() {
        this(2);
    }

    public LogisticGradient(int numClasses) {
        this.numClasses = numClasses;
    }

    @Override
    public Pair<RV, Double> compute(RV data, double label, RV weights) {
        RV gradient = SolidRV.empty(weights.count());
        double loss = compute(data, label, weights, gradient);
        return Pair.from(gradient, loss);
    }

    @Override
    public Double compute(RV data, double label, RV weights, RV cumGradient) {
        int dataSize = data.count();

        // weights must hold (numClasses - 1) blocks of dataSize coefficients,
        // so (weights.count() / dataSize + 1) is the number of classes
        if (weights.count() % dataSize != 0 || numClasses != weights.count() / dataSize + 1) {
            throw new IllegalArgumentException("weights size does not match numClasses and data size");
        }

        switch (numClasses) {
            case 2:
                /*
                 * For binary logistic regression.
                 *
                 * Although the loss and gradient calculation of the multinomial case is more
                 * general and could also be used in the binary case, we still implement a
                 * specialized binary version for performance reasons.
                 */
                double margin2 = -1.0 * data.dotProd(weights);
                double multiplier2 = (1.0 / (1.0 + Math.exp(margin2))) - label;
                cumGradient.plus(data.solidCopy().dot(multiplier2));
                if (label > 0) {
                    // The following is equivalent to log(1 + exp(margin)) but more numerically stable.
                    return MathTools.log1pExp(margin2);
                } else {
                    return MathTools.log1pExp(margin2) - margin2;
                }
            default:
                /*
                 * For multinomial logistic regression.
                 */
                // marginY is margins(label - 1) in the formula.
                double marginY = 0.0;
                double maxMargin = Double.NEGATIVE_INFINITY;
                int maxMarginIndex = 0;

                RV margins = SolidRV.empty(numClasses - 1);
                for (int i = 0; i < margins.count(); i++) {
                    double margin = 0.0;
                    for (int j = 0; j < data.count(); j++) {
                        double value = data.get(j);
                        if (value != 0.0)
                            margin += value * weights.get((i * dataSize) + j);
                    }
                    if (i == (int) label - 1)
                        marginY = margin;
                    if (margin > maxMargin) {
                        maxMargin = margin;
                        maxMarginIndex = i;
                    }
                    margins.set(i, margin);
                }

                /*
                 * When maxMargin > 0, the original formula will cause overflow as discussed in
                 * the comment above. We address this by subtracting maxMargin from all the
                 * margins, so it is guaranteed that all of the new margins are smaller than
                 * zero, which prevents arithmetic overflow.
                 */
                double sum = 0.0;
                if (maxMargin > 0) {
                    for (int i = 0; i < numClasses - 1; i++) {
                        margins.increment(i, -maxMargin);
                        if (i == maxMarginIndex) {
                            sum += Math.exp(-maxMargin);
                        } else {
                            sum += Math.exp(margins.get(i));
                        }
                    }
                } else {
                    for (int i = 0; i < numClasses - 1; i++) {
                        sum += Math.exp(margins.get(i));
                    }
                }

                for (int i = 0; i < numClasses - 1; i++) {
                    double multiplier = Math.exp(margins.get(i)) / (sum + 1.0)
                            - ((label != 0.0 && label == i + 1) ? 1.0 : 0.0);
                    for (int j = 0; j < data.count(); j++) {
                        double value = data.get(j);
                        if (value != 0.0)
                            cumGradient.increment(i * dataSize + j, multiplier * value);
                    }
                }

                double loss = (label > 0.0) ? Math.log1p(sum) - marginY : Math.log1p(sum);
                if (maxMargin > 0) {
                    return loss + maxMargin;
                } else {
                    return loss;
                }
        }
    }
}
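/*
 * A small layout sketch (illustrative only, not part of the original sources): for the multinomial
 * case the weights vector is expected to hold (numClasses - 1) contiguous blocks of dataSize
 * coefficients, so that weights.get(i * dataSize + j) is the coefficient of feature j for class
 * i + 1. For instance, with 3 classes and 2 features:
 *
 *     Gradient logistic = new LogisticGradient(3);
 *     RV weights = SolidRV.empty(2 * 2);             // (numClasses - 1) * dataSize = 4 values
 *     RV cumGradient = SolidRV.empty(2 * 2);
 *     RV x = SolidRV.empty(2);
 *     x.set(0, 1.0);
 *     x.set(1, -1.0);
 *     double loss = logistic.compute(x, 2.0, weights, cumGradient);
 *     // with all-zero weights every margin is 0, so loss == log(1 + sum) == log(3)
 */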
/**
 * Created by <a href="mailto:padreati@yahoo.com">Aurelian Tutuianu</a> on 11/24/15.
 */
@Deprecated
class HingeGradient implements Gradient {

    @Override
    public Pair<RV, Double> compute(RV data, double label, RV weights) {
        double dotProduct = data.dotProd(weights);
        // Our loss function with {0, 1} labels is max(0, 1 - (2y - 1) (f_w(x)))
        // Therefore the gradient is -(2y - 1)*x
        double labelScaled = 2 * label - 1.0;
        if (1.0 > labelScaled * dotProduct) {
            RV gradient = data.solidCopy();
            gradient.dot(-labelScaled);
            return Pair.from(gradient, 1.0 - labelScaled * dotProduct);
        } else {
            return Pair.from(SolidRV.empty(weights.count()), 0.0);
        }
    }

    @Override
    public Double compute(RV data, double label, RV weights, RV cumGradient) {
        double dotProduct = data.dotProd(weights);
        // Our loss function with {0, 1} labels is max(0, 1 - (2y - 1) (f_w(x)))
        // Therefore the gradient is -(2y - 1)*x
        double labelScaled = 2 * label - 1.0;
        if (1.0 > labelScaled * dotProduct) {
            cumGradient.plus(data.solidCopy().dot(-labelScaled));
            return 1.0 - labelScaled * dotProduct;
        }
        return 0.0;
    }
}
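/*
 * A minimal usage sketch for the hinge loss (illustrative values only, not part of the original
 * sources): labels are expected in {0, 1} and are rescaled internally to {-1, +1} as 2 * label - 1.
 *
 *     Gradient hinge = new HingeGradient();
 *     RV weights = SolidRV.empty(2);
 *     weights.set(0, 1.0);
 *     RV x = SolidRV.empty(2);
 *     x.set(0, 0.5);
 *     Pair<RV, Double> gl = hinge.compute(x, 1.0, weights);
 *     // dot product is 0.5 and the label is rescaled to +1, so loss == 1 - 0.5 == 0.5
 *     // and the returned gradient is -1 * x == (-0.5, 0)
 */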