/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2008 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.learner.meta;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.Vector;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.SplittedExampleSet;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Model;
import com.rapidminer.operator.OperatorChain;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ValueDouble;
import com.rapidminer.operator.condition.InnerOperatorCondition;
import com.rapidminer.operator.condition.LastInnerOperatorCondition;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.Tools;

/**
 * Subgroup discovery learner.
 *
 * <p>
 * The operator repeatedly applies its single inner learner to the (weighted)
 * example set, derives coverage statistics of the learned rule via
 * {@link SDReweightMeasures}, reweights the training examples and finally
 * collects the accepted models in an {@link SDEnsemble}. Optionally only
 * rules lying on the convex hull in ROC space are kept.
 * </p>
 *
 * @author Martin Scholz
 * @version $Id: SDRulesetInduction.java,v 1.20 2006/04/05 08:57:26 ingomierswa
 *          Exp $
 */
public class SDRulesetInduction extends OperatorChain {

	/**
	 * Name of the variable specifying the maximal number of iterations of the
	 * learner.
	 */
	public static final String PARAMETER_ITERATIONS = "iterations";

	/**
	 * Name of the parameter giving the fraction of examples used for training
	 * in each iteration (internal bootstrapping). Bootstrapping is only active
	 * for values strictly between 0 and 1; the remaining examples are then
	 * used to estimate the statistics of the learned rule.
	 */
	public static final String PARAMETER_RATIO_INTERNAL_BOOTSTRAP = "ratio_internal_bootstrap";

	/**
	 * A parameter whether to discard all rules not lying on the convex hull in
	 * ROC space.
	 */
	public static final String PARAMETER_ROC_CONVEX_HULL_FILTER = "ROC_convex_hull_filter";

	/**
	 * Boolean parameter: true for additive reweighting, false for
	 * multiplicative.
	 */
	public static final String PARAMETER_ADDITIVE_REWEIGHT = "additive_reweight";

	/**
	 * Name of the double parameter holding the factor handed to
	 * {@link SDReweightMeasures#setGamma(double)} for multiplicative
	 * reweighting. It is not read when additive reweighting is enabled.
	 */
	public static final String PARAMETER_GAMMA = "gamma";

	/**
	 * Name of special attribute counting the times an example has been covered
	 * by a rule. This attribute is created for additive reweighting, only.
	 */
	public static final String TIMES_COVERED = "TIMES_COVERED_SPECIAL_ATTRIB";

	/**
	 * Discard models with an advantage of less than the specified value. Note:
	 * this constant is not referenced anywhere inside this class.
	 */
	public static final double MIN_ADVANTAGE = 0.001;

	/**
	 * A performance measure to be visualized. Not yet implemented: the field
	 * is exposed as a value but never updated by this class.
	 */
	private double performance = 0;

	/** Zero-based index of the iteration currently executed by the training loop. */
	private int currentIteration;

	/**
	 * Constructor. Registers the values "performance" and "iteration" so that
	 * they can be queried from outside.
	 */
	public SDRulesetInduction(OperatorDescription description) {
		super(description);
		addValue(new ValueDouble("performance", "The performance.") {
			public double getDoubleValue() {
				return performance;
			}
		});
		addValue(new ValueDouble("iteration", "The current iteration.") {
			public double getDoubleValue() {
				return currentIteration;
			}
		});
	}

	/**
	 * The inner operator has to accept an {@link ExampleSet} and to deliver a
	 * {@link Model}.
	 */
	public InnerOperatorCondition getInnerOperatorCondition() {
		// The raw Class[] arrays are kept on purpose: the exact parameter type
		// of the LastInnerOperatorCondition constructor is declared elsewhere.
		return new LastInnerOperatorCondition(new Class[] { ExampleSet.class }, new Class[] { Model.class });
	}

	/**
	 * @see OperatorChain#getMaxNumberOfInnerOperators()
	 */
	public int getMaxNumberOfInnerOperators() {
		return 1;
	}

	/**
	 * @see OperatorChain#getMinNumberOfInnerOperators()
	 */
	public int getMinNumberOfInnerOperators() {
		return 1;
	}

	/**
	 * @see com.rapidminer.operator.Operator#getInputClasses()
	 */
	public Class<?>[] getInputClasses() {
		return new Class<?>[] { ExampleSet.class };
	}

	/**
	 * @see com.rapidminer.operator.Operator#getOutputClasses()
	 */
	public Class<?>[] getOutputClasses() {
		return new Class<?>[] { Model.class };
	}

	/**
	 * Returns the positive index of the nominal mapping of the given label.
	 *
	 * @param label
	 *            the label attribute
	 * @return the index the label's mapping reports as positive
	 */
	public static int getPosIndex(Attribute label) {
		return label.getMapping().getPositiveIndex();
	}

	/**
	 * Obtains the weight attribute via
	 * <code>Tools.createWeightAttribute</code> and fills it with an initial
	 * value so that positive and negative examples are equally probable. For
	 * additive reweighting the {@link #TIMES_COVERED} attribute is looked up
	 * (and created if missing) and reset to zero for every example.
	 *
	 * <p>
	 * The method assumes a binary label whose class indices are 0 and 1: the
	 * negative index is computed as <code>1 - positiveIndex</code>.
	 * </p>
	 *
	 * @param exampleSet
	 *            the example set to be prepared
	 * @return the class priors, indexed by class index
	 */
	private double[] prepareWeights(ExampleSet exampleSet) throws OperatorException {
		Attribute weightAttr = com.rapidminer.example.Tools.createWeightAttribute(exampleSet);

		final boolean additive = this.getParameterAsBoolean(PARAMETER_ADDITIVE_REWEIGHT);
		Attribute timesCoveredAttrib = null;
		if (additive) {
			timesCoveredAttrib = exampleSet.getAttributes().get(TIMES_COVERED);
			if (timesCoveredAttrib == null) {
				timesCoveredAttrib = com.rapidminer.example.Tools.createSpecialAttribute(exampleSet, TIMES_COVERED, Ontology.INTEGER);
				exampleSet.getExampleTable().addAttribute(timesCoveredAttrib);
			}
		}

		final int positiveClass = getPosIndex(exampleSet.getAttributes().getLabel());
		final int negativeClass = 1 - positiveClass;

		// First pass: count the positive examples to estimate the priors.
		int numPos = 0;
		Iterator<Example> exRead = exampleSet.iterator();
		while (exRead.hasNext()) {
			if (exRead.next().getLabel() == positiveClass) {
				numPos++;
			}
		}

		final double[] classPriors = new double[2];
		classPriors[positiveClass] = ((double) numPos) / exampleSet.size();
		classPriors[negativeClass] = 1.0d - classPriors[positiveClass];

		// Each class receives a total weight share of 0.5.
		final double posWeight = 0.5 / classPriors[positiveClass];
		final double negWeight = 0.5 / classPriors[negativeClass];

		// Second pass: write the initial weights (and reset the cover counter).
		exRead = exampleSet.iterator();
		while (exRead.hasNext()) {
			Example example = exRead.next();
			double w = (example.getLabel() == positiveClass) ? posWeight : negWeight;
			example.setValue(weightAttr, w);
			if (additive) {
				example.setValue(timesCoveredAttrib, 0);
			}
		}
		return classPriors;
	}

	/**
	 * Runs the "embedded" learner on the example set and returns a model.
	 *
	 * @param exampleSet
	 *            an <code>ExampleSet</code> to train a model for
	 * @return a <code>Model</code>
	 */
	private Model trainModel(ExampleSet exampleSet) throws OperatorException {
		IOContainer result = getOperator(0).apply(new IOContainer(new IOObject[] { exampleSet }));
		return result.remove(Model.class);
	}

	/**
	 * Constructs a <code>Model</code> by repeatedly running the inner learner,
	 * reweighting the training example set accordingly, and combining the
	 * resulting hypotheses. The input example set needs a label; otherwise a
	 * {@link UserError} (code 105) is thrown.
	 *
	 * @return an array containing the trained model as its only element
	 */
	public IOObject[] apply() throws OperatorException {
		// Reads the input example set and initializes its weights.
		ExampleSet exampleSet = this.getInput(ExampleSet.class);

		// Check if label is present and fits the learning task
		if (exampleSet.getAttributes().getLabel() == null) {
			throw new UserError(this, 105);
		}

		double[] classPriors = this.prepareWeights(exampleSet);
		Model model = this.trainRuleset(exampleSet, classPriors);
		return new IOObject[] { model };
	}

	/**
	 * Main method for training the ensemble classifier.
	 *
	 * @param trainingSet
	 *            the example set whose weights have been initialized by
	 *            {@link #prepareWeights(ExampleSet)}
	 * @param classPriors
	 *            the class priors handed on to the {@link SDEnsemble}
	 * @return the ensemble of all accepted models
	 */
	private SDEnsemble trainRuleset(ExampleSet trainingSet, final double[] classPriors) throws OperatorException {
		// for models and their probability estimates
		Vector<Object[]> modelInfo = new Vector<Object[]>();

		// check whether to use the complete training set for training
		final double splitRatio = this.getParameterAsDouble(PARAMETER_RATIO_INTERNAL_BOOTSTRAP);
		final boolean bootstrap = (splitRatio > 0) && (splitRatio < 1.0);
		log(bootstrap ? "Bootstrapping enabled." : "Bootstrapping disabled.");

		// maximum number of iterations
		final int iterations = this.getParameterAsInt(PARAMETER_ITERATIONS);

		// The ROC curve is stored as a list of { TPr, FPr } pairs and starts
		// with the trivial points (0,0) and (1,1).
		final boolean rocFilter = this.getParameterAsBoolean(PARAMETER_ROC_CONVEX_HULL_FILTER);
		List<double[]> rocCurve = null;
		if (rocFilter) {
			rocCurve = new LinkedList<double[]>();
			rocCurve.add(new double[] { 0, 0 });
			rocCurve.add(new double[] { 1, 1 });
		}

		for (int i = 0; i < iterations; i++) {
			this.currentIteration = i;

			// train one model per iteration
			Model model;
			ExampleSet resultSet;
			if (bootstrap) {
				SplittedExampleSet splittedSet = new SplittedExampleSet(trainingSet, splitRatio, SplittedExampleSet.SHUFFLED_SAMPLING, -1);
				splittedSet.selectSingleSubset(0); // switch to training set
				model = this.trainModel(splittedSet);
				splittedSet.selectSingleSubset(1); // switch to out-of-bag set
				resultSet = model.apply(splittedSet); // apply model to the out-of-bag examples only
			} else {
				model = this.trainModel(trainingSet);
				resultSet = model.apply(trainingSet); // apply model to all examples
			}

			// get the weighted performance value of the example set with
			// respect to the model
			SDReweightMeasures wp = new SDReweightMeasures(resultSet);
			final boolean additive = this.getParameterAsBoolean(PARAMETER_ADDITIVE_REWEIGHT);
			wp.setAdditive(additive);
			if (!additive) {
				wp.setGamma(this.getParameterAsDouble(PARAMETER_GAMMA));
			}

			// Calculate the unweighted distributions and the true/false
			// positive rate:
			double[][] modelWeightMatrix = new double[2][2];
			double tpr = 0;
			double fpr = 0;

			// assuming indexes "0" and "1" for predictions:
			int[][] predClasses = new int[2][];
			predClasses[0] = wp.getCoveredExamplesNumForPred(0);
			predClasses[1] = wp.getCoveredExamplesNumForPred(1);

			int[] rowTotals = new int[2];
			rowTotals[0] = predClasses[0][0] + predClasses[0][1];
			rowTotals[1] = predClasses[1][0] + predClasses[1][1];
			int total = rowTotals[0] + rowTotals[1];

			// Just the distribution for the covered subset is stored.
			// It is not visible which label is explicitly predicted
			// (syntactically) in the rule, so we assume the label that
			// results in higher WRAcc.
			double cov0 = ((double) rowTotals[0]) / total;
			double cov1 = ((double) rowTotals[1]) / total;

			double prior0 = ((double) predClasses[0][0] + predClasses[1][0]) / total;
			double prior1 = ((double) predClasses[0][1] + predClasses[1][1]) / total; // used later

			// A bias is NaN if the corresponding prediction covers no example.
			double bias0 = Math.abs(((double) predClasses[0][0] / rowTotals[0]) - prior0);
			double bias1 = Math.abs(((double) predClasses[1][0] / rowTotals[1]) - prior0);

			// WRAcc is coverage * bias
			int subset = (Double.isNaN(bias1) || cov0 * bias0 >= cov1 * bias1) ? 0 : 1;

			// The subset not covered by the rule is marked with zero
			// estimates.
			modelWeightMatrix[subset][0] = ((double) predClasses[subset][0]) / rowTotals[subset];
			modelWeightMatrix[subset][1] = ((double) predClasses[subset][1]) / rowTotals[subset];

			// Fraction of each class that falls into the selected subset.
			double ratio0 = (((double) predClasses[subset][0]) / total) / prior0;
			double ratio1 = (((double) predClasses[subset][1]) / total) / prior1;

			// Reweight the example set with respect to the weighted
			// performance values.
			// The second argument is the positive class. It is selected so
			// that TPr is higher.
			wp.reweightExamples(trainingSet, ((ratio0 > ratio1) ? 0 : 1), subset);

			// As "positive" and "negative" depend on the explicitly
			// predicted class (which is not visible) we sometimes need to
			// translate tnr into tpr.
			if (rocFilter) {
				tpr = Math.max(ratio0, ratio1);
				fpr = Math.min(ratio0, ratio1);
			}

			// A rule covering everything or nothing is a default rule.
			boolean defaultRule = (cov0 == 0) || (cov1 == 0);

			// If activated just keep rules lying on the convex hull in ROC
			// space:
			if (!defaultRule && (!rocFilter || this.isOnConvexHull(rocCurve, tpr, fpr))) {
				// Add the new model and its weights to the collection of
				// models:
				modelInfo.add(new Object[] { model, modelWeightMatrix });
			}

			inApplyLoop();
		}

		if (rocFilter) {
			StringBuilder message = new StringBuilder("The convex hull in ROC space contains the following points (TPr/FPr):" + Tools.getLineSeparator());
			for (double[] tpfp : rocCurve) {
				message.append("(").append(tpfp[0]).append(", ").append(tpfp[1]).append(") ");
			}
			log(message.toString());
		}

		// Build a Model object.
		short combinationMethod = this.getParameterAsBoolean(PARAMETER_ADDITIVE_REWEIGHT) ? SDEnsemble.RULE_COMBINE_ADDITIVE : SDEnsemble.RULE_COMBINE_MULTIPLY;
		return new SDEnsemble(trainingSet, modelInfo, classPriors, combinationMethod);
	}

	/*
	private void debugMessage(SDReweightMeasures wp) {
		String message = Tools.getLineSeparator()
				+ "Model learned - training performance of rule:" + Tools.getLineSeparator()
				+ "TPR: " + wp.getProbability(0, 0) + " FPR: " + wp.getProbability(1, 0)
				+ " | Positively predicted: " + (wp.getProbability(1, 0) + wp.getProbability(0, 0)) + Tools.getLineSeparator()
				+ "FNR: " + wp.getProbability(0, 1) + " TNR: " + wp.getProbability(1, 1)
				+ " | Negatively predicted: " + (wp.getProbability(0, 1) + wp.getProbability(1, 1)) + Tools.getLineSeparator()
				+ "Positively labeled: " + (wp.getProbability(0, 0) + wp.getProbability(0, 1)) + Tools.getLineSeparator()
				+ "Negatively labeled: " + (wp.getProbability(1, 0) + wp.getProbability(1, 1));
		LogService.getGlobal().log(message, LogService.STATUS);
	}
	*/

	/**
	 * Tests whether the candidate point extends the stored ROC curve and, if
	 * so, inserts it and removes the points it makes obsolete.
	 *
	 * <p>
	 * The first pass walks the curve from its start, drops points whose slope
	 * to the candidate is not smaller than the last kept slope, and inserts
	 * (or replaces, for an equal FPr) the candidate at its position. The
	 * second pass walks backwards from the end and drops points with an FPr
	 * below 1 whose slope from the candidate does not exceed the slope seen
	 * so far. The list may already have been modified by the first pass when
	 * the method returns <code>false</code>.
	 * </p>
	 *
	 * @param rocCurve
	 *            the current curve as a list of { TPr, FPr } pairs; the code
	 *            relies on it being ordered by FPr and ending with (1,1)
	 * @param tpr
	 *            true positive rate of the candidate
	 * @param fpr
	 *            false positive rate of the candidate
	 * @return <code>true</code> if the candidate has been added to the curve
	 */
	private boolean isOnConvexHull(List<double[]> rocCurve, double tpr, double fpr) {
		if ((tpr <= 0) || (tpr > 1) || (fpr < 0) || (fpr >= 1)) {
			return false;
		}

		ListIterator<double[]> iter = rocCurve.listIterator();
		double slope = Double.POSITIVE_INFINITY;
		boolean fprGreater = true;
		while (fprGreater) {
			double[] current = iter.next();
			fprGreater = (fpr > current[1]);
			if (fprGreater) {
				// slope connecting the current point to the candidate
				double newSlope = (tpr - current[0]) / (fpr - current[1]);
				if (newSlope >= slope) {
					iter.remove();
				} else {
					slope = newSlope;
					// slope connecting the current point to (1,1)
					double finalSlope = (1 - current[0]) / (1 - current[1]);
					if (slope <= finalSlope) {
						// slope needs to be greater than connection to
						// (1,1), otherwise the candidate lies below
						return false;
					}
				}
			} else if (fpr == current[1]) {
				// no slope defined: keep the point with the higher TPr
				if (tpr > current[0]) {
					iter.set(new double[] { tpr, fpr });
				} else {
					return false;
				}
			} else {
				// The last slope is still available. It must be higher
				// than the next one!
				double nextSlope = (current[0] - tpr) / (current[1] - fpr);
				if (slope > nextSlope) {
					// step back so that the candidate is inserted directly
					// in front of the current point
					iter.previous();
					iter.add(new double[] { tpr, fpr });
				} else {
					return false;
				}
			}
		}

		// slope of connecting line between candidate and (1,1)
		slope = (1 - tpr) / (1 - fpr);
		iter = rocCurve.listIterator(rocCurve.size());
		while (iter.hasPrevious()) {
			double[] current = iter.previous();
			if (current[1] <= fpr) {
				// found the candidate from end of list: done.
				return true;
			}
			// slope connecting the candidate to the current point
			double newSlope = (current[0] - tpr) / (current[1] - fpr);
			if ((current[1] < 1) && (newSlope <= slope)) {
				// needs to be greater than last slope
				iter.remove();
			} else {
				slope = newSlope;
			}
		}
		return true;
	}

	/**
	 * Adds the parameters for the internal bootstrap ratio, the number of
	 * iterations, the ROC convex hull filter, the reweighting mode and gamma
	 * to the parameters of the super class.
	 */
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = super.getParameterTypes();
		types.add(new ParameterTypeDouble(PARAMETER_RATIO_INTERNAL_BOOTSTRAP, "Fraction of examples used for training (internal bootstrapping). If activated (value < 1) only the rest is used to estimate the biases.", 0, 1, 0.7));
		types.add(new ParameterTypeInt(PARAMETER_ITERATIONS, "The maximum number of iterations.", 1, Integer.MAX_VALUE, 10));
		types.add(new ParameterTypeBoolean(PARAMETER_ROC_CONVEX_HULL_FILTER, "A parameter whether to discard all rules not lying on the convex hull in ROC space.", true));
		types.add(new ParameterTypeBoolean(PARAMETER_ADDITIVE_REWEIGHT, "If enabled then resampling is done by additive reweighting, otherwise by multiplicative reweighting.", true));
		types.add(new ParameterTypeDouble(PARAMETER_GAMMA, "Factor used for multiplicative reweighting. Has no effect in case of additive reweighting.", 0, 1, 0.9));
		return types;
	}
}