/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.learner.igss.utility; import com.rapidminer.operator.learner.igss.hypothesis.Hypothesis; /** * Abstract superclass for all utility functions. * * @author Dirk Dach */ public abstract class AbstractUtility implements Utility { /** The prior probability of the two classes of the label. */ protected double[] priors; /** The number of covered examples before normal approximation is used. */ protected int large; /** Constructor for all utilities. */ public AbstractUtility(double[] priors, int large) { this.priors = new double[priors.length]; System.arraycopy(priors, 0, this.priors, 0, 2); this.large = large; } /** Calculates the M-value needed for the GSS algorithm. */ @Override public double calculateM(double delta, double epsilon) { double i = 1; // perfomance: start with step=10000 while (conf(i, delta) > epsilon / 2.0d) { i = i + 10000; } if (i > 1) { // i=i+10000 has been executed at least once. i = i - 10000; } while (conf(i, delta) > (epsilon / 2.0d)) { i++; } return Math.ceil(i); } /** * Calculates the the unspecific confidence intervall. Uses Chernoff bounds if the number of * random experiments is too small and normal approximatione otherwise. Considers the number of * examples as the number of random experiments. problematic for g*(p-p0)) hypothesis, that only * cover a small amount of examples. No normal approximation should be used in this case. */ @Override public double confidenceIntervall(double totalWeight, double delta) { if (totalWeight < large) { return confSmallM(totalWeight, delta); } else { return conf(totalWeight, delta); } } /** * Calculates the the confidence intervall for a specific hypothesis. Uses Chernoff bounds if * the number of random experiments is too small and normal approximation otherwise. This method * is adapted for g*(p-p0) utility types. Every example for that the rule is applicable is one * random experiment. Should be overwritten by subclasses if they make a different random * experiment. */ @Override public double confidenceIntervall(double totalWeight, double totalPositiveWeight, Hypothesis hypo, double delta) { if (hypo.getCoveredWeight() < large) { return confSmallM(totalWeight, delta); } else { return conf(totalWeight, totalPositiveWeight, hypo, delta); } } /** Calculates the confidence intervall for small numbers of examples. */ public abstract double confSmallM(double totalWeight, double delta); /** Calculates the normal approximation of the confidence intervall. */ public abstract double conf(double totalWeight, double delta); /** Calculates the normal approximation of the confidence intervall for a specific hypothesis. */ public abstract double conf(double totalWeight, double totalPositiveWeight, Hypothesis hypo, double delta); /** Calculates the inverse of the normal distribution, e.g.inverseNormal(0.95)==1.64. */ public double inverseNormal(double p) { // Coefficients in rational approximations double[] a = { -3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02, 1.383577518672690e+02, -3.066479806614716e+01, 2.506628277459239e+00 }; double[] b = { -5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02, 6.680131188771972e+01, -1.328068155288572e+01 }; double[] c = { -7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00, -2.549732539343734e+00, 4.374664141464968e+00, 2.938163982698783e+00 }; double[] d = { 7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00, 3.754408661907416e+00 }; // Define break-points. double plow = 0.02425; double phigh = 1 - plow; // Rational approximation for lower region: if (p < plow) { double q = Math.sqrt(-2 * Math.log(p)); return (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1); } // Rational approximation for upper region: if (phigh < p) { double q = Math.sqrt(-2 * Math.log(1 - p)); return -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / ((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1); } // Rational approximation for central region: double q = p - 0.5; double r = q * q; return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1); } }