/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.learner.igss.utility; import com.rapidminer.operator.learner.igss.hypothesis.Hypothesis; /** Abstract superclass for all utility functions. * * @author Dirk Dach * @version $Id: AbstractUtility.java,v 1.3 2008/05/09 19:23:24 ingomierswa Exp $ */ public abstract class AbstractUtility implements Utility{ /** The prior probability of the two classes of the label. */ protected double[] priors; /** The number of covered examples before normal approximation is used. */ protected int large; /** Constructor for all utilities. */ public AbstractUtility (double[] priors, int large) { this.priors=new double[priors.length]; System.arraycopy(priors,0,this.priors,0,2); this.large=large; } /** Calculates the M-value needed for the GSS algorithm. */ public double calculateM (double delta, double epsilon) { double i=1; // perfomance: start with step=10000 while (conf(i,delta) > epsilon/2.0d) { i=i+10000; } if (i>1) { //i=i+10000 has been executed at least once. i=i-10000; } while (conf(i,delta) > (epsilon/2.0d)) { i++; } return Math.ceil(i); } /** Calculates the the unspecific confidence intervall. * Uses Chernoff bounds if the number of random experiments is too small and normal approximatione otherwise. * Considers the number of examples as the number of random experiments. problematic for g*(p-p0)) hypothesis, * that only cover a small amount of examples. No normal approximation should be used in this case. */ public double confidenceIntervall (double totalWeight, double delta) { if (totalWeight<large) { return confSmallM(totalWeight,delta); } else { return conf(totalWeight,delta); } } /** Calculates the the confidence intervall for a specific hypothesis. * Uses Chernoff bounds if the number of random experiments is too small and normal approximation otherwise. * This method is adapted for g*(p-p0) utility types. Every example for that the rule is applicable is one random experiment. * Should be overwritten by subclasses if they make a different random experiment.*/ public double confidenceIntervall (double totalWeight, double totalPositiveWeight, Hypothesis hypo, double delta) { if (hypo.getCoveredWeight()<large) { return confSmallM(totalWeight,delta); } else { return conf(totalWeight,totalPositiveWeight,hypo,delta); } } /** Calculates the confidence intervall for small numbers of examples.*/ public abstract double confSmallM (double totalWeight, double delta); /** Calculates the normal approximation of the confidence intervall. */ public abstract double conf(double totalWeight, double delta); /** Calculates the normal approximation of the confidence intervall for a specific hypothesis.*/ public abstract double conf(double totalWeight, double totalPositiveWeight, Hypothesis hypo, double delta); /** Calculates the inverse of the normal distribution, e.g.inverseNormal(0.95)==1.64. */ public double inverseNormal(double p) { // Coefficients in rational approximations double[] a = {-3.969683028665376e+01, 2.209460984245205e+02, -2.759285104469687e+02, 1.383577518672690e+02, -3.066479806614716e+01, 2.506628277459239e+00}; double[] b = {-5.447609879822406e+01, 1.615858368580409e+02, -1.556989798598866e+02, 6.680131188771972e+01, -1.328068155288572e+01 }; double[] c = {-7.784894002430293e-03, -3.223964580411365e-01, -2.400758277161838e+00, -2.549732539343734e+00, 4.374664141464968e+00, 2.938163982698783e+00}; double[] d = {7.784695709041462e-03, 3.224671290700398e-01, 2.445134137142996e+00, 3.754408661907416e+00}; // Define break-points. double plow = 0.02425; double phigh = 1 - plow; // Rational approximation for lower region: if ( p < plow ) { double q = Math.sqrt(-2*Math.log(p)); return (((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / ((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1); } // Rational approximation for upper region: if ( phigh < p ) { double q = Math.sqrt(-2*Math.log(1-p)); return -(((((c[0]*q+c[1])*q+c[2])*q+c[3])*q+c[4])*q+c[5]) / ((((d[0]*q+d[1])*q+d[2])*q+d[3])*q+1); } // Rational approximation for central region: double q = p - 0.5; double r = q*q; return (((((a[0]*r+a[1])*r+a[2])*r+a[3])*r+a[4])*r+a[5])*q / (((((b[0]*r+b[1])*r+b[2])*r+b[3])*r+b[4])*r+1); } }