/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2008 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.learner.meta;

import java.util.Iterator;
import java.util.List;
import java.util.Vector;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.SplittedExampleSet;
import com.rapidminer.operator.MissingIOObjectException;
import com.rapidminer.operator.Model;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ValueDouble;
import com.rapidminer.operator.learner.LearnerCapability;
import com.rapidminer.operator.learner.PredictionModel;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.Tools;

/**
 * <p>This operator trains an ensemble of classifiers for boolean target
 * attributes. In each iteration the training set is reweighted, so that
 * previously discovered patterns and other kinds of prior knowledge are
 * "sampled out" {@rapidminer.cite Scholz/2005b}. An inner classifier,
 * typically a rule or decision tree induction algorithm, is sequentially
 * applied several times, and the models are combined to a single global model.
 * The maximal number of models to be trained is specified by the parameter
 * <code>iterations</code>.</p>
 *
 * <p>If the parameter <code>rescale_label_priors</code> is set, then the example
 * set is reweighted, so that all classes are equally probable (or frequent).
 * For two-class problems this turns the problem of fitting models to maximize
 * weighted relative accuracy into the more common task of classifier induction
 * {@rapidminer.cite Scholz/2005a}. Applying a rule induction algorithm as an inner
 * learner allows to do subgroup discovery. This option is also recommended for
 * data sets with class skew, if a "very weak learner" like a decision
 * stump is used. If <code>rescale_label_priors</code> is not set, then the
 * operator performs boosting based on probability estimates.</p>
 *
 * <p>The estimates used by this operator may either be computed using the same set
 * as for training, or in each iteration the training set may be split randomly,
 * so that a model is fitted based on the first subset, and the probabilities
 * are estimated based on the second. The first solution may be advantageous in
 * situations where data is rare. Set the parameter
 * <code>use_subset_for_training</code> to 1 to use the same set for training
 * as for estimation. Set this parameter to a value lower than 1 to use the
 * specified fraction of the data for training, and the remaining examples for
 * probability estimation.</p>
 *
 * <p>If the parameter <code>allow_marginal_skews</code> is <em>not</em> set,
 * then the support of each subset defined in terms of common base model
 * predictions does not change from one iteration to the next. Analogously the
 * class priors do not change. This is the procedure originally described in
 * {@rapidminer.cite Scholz/2005b} in the context of subgroup discovery.</p>
 *
 * <p>Setting the <code>allow_marginal_skews</code> option to <code>true</code>
 * leads to a procedure that changes the marginal weights/probabilities of
 * subsets, if this is beneficial in a boosting context, and stratifies the two
 * classes to be equally likely. As for AdaBoost, the total weight upper-bounds
 * the training error in this case. This bound is reduced more quickly by the
 * BayesianBoosting operator, however.</p>
 *
 * <p>In sum, to reproduce the sequential sampling, or knowledge-based sampling,
 * from {@rapidminer.cite Scholz/2005b} for subgroup discovery, two of the
 * default parameter settings of this operator have to be changed:
 * <code>rescale_label_priors</code> must
 * be set to <code>true</code>, and <code>allow_marginal_skews</code> must
 * be set to <code>false</code>. In addition, a boolean (binomial) label
 * has to be used.</p>
 *
 * <p>The operator requires an example set as its input. To sample out prior
 * knowledge of a different form it is possible to provide another model as an
 * optional additional input. The predictions of this model are used to
 * produce an initial weighting of the training set. The output of the operator
 * is a classification model applicable for estimating conditional class
 * probabilities or for plain crisp classification. It contains up to the
 * specified number of inner base models. In the case of an optional initial
 * model, this model will also be stored in the output model, in order to
 * produce the same initial weighting during model application.</p>
 *
 * @author Martin Scholz
 * @version $Id: BayesianBoosting.java,v 1.56 2006/04/14 15:14:32 ingomierswa
 *          Exp $
 */
public class BayesianBoosting extends AbstractMetaLearner {

    /**
     * Name of the parameter specifying the maximal number of iterations of the
     * learner.
     */
    public static final String PARAMETER_ITERATIONS = "iterations";

    /**
     * Name of the parameter giving the fraction of examples used for training
     * (internal bootstrapping); the remaining examples are used for estimation.
     */
    public static final String PARAMETER_USE_SUBSET_FOR_TRAINING = "use_subset_for_training";

    /**
     * Boolean parameter to specify whether the label priors should be equally
     * likely after first iteration.
     */
    public static final String PARAMETER_RESCALE_LABEL_PRIORS = "rescale_label_priors";

    /**
     * Boolean parameter that switches between KBS (if set to false) and a
     * boosting-like reweighting.
     */
    public static final String PARAMETER_ALLOW_MARGINAL_SKEWS = "allow_marginal_skews";

    /**
     * Discard models with an advantage of less than the specified value.
     * NOTE(review): not referenced inside this class; {@link #isModelUseful}
     * currently accepts every model.
     */
    public static final double MIN_ADVANTAGE = 0.001;

    /** A model to initialise the example weights; <code>null</code> if none was provided. */
    private Model startModel;

    /** Field for visualizing performance: index of the boosting iteration currently running. */
    protected int currentIteration;

    /**
     * A performance measure to be visualized: the total example weight as
     * computed by the weight preparation / the last reweighting step.
     */
    private double performance = 0;

    /**
     * A backup of the original weights of the training set to restore them
     * after learning; <code>null</code> if the training set had no weight
     * attribute and a temporary one was created.
     */
    private double[] oldWeights;

    /** Constructor. Registers the loggable values "performance" and "iteration". */
    public BayesianBoosting(OperatorDescription description) {
        super(description);
        addValue(new ValueDouble("performance", "The performance.") {
            public double getDoubleValue() {
                return performance;
            }
        });
        addValue(new ValueDouble("iteration", "The current iteration.") {
            public double getDoubleValue() {
                return currentIteration;
            }
        });
    }

    /**
     * Overrides the method of the super class. Returns <code>false</code> for
     * numerical and polynominal classes, <code>true</code> for weighted
     * examples, and delegates to the super class for every other capability.
     */
    @Override
    public boolean supportsCapability(LearnerCapability lc) {
        if (lc == LearnerCapability.NUMERICAL_CLASS || lc == LearnerCapability.POLYNOMINAL_CLASS) {
            return false;
        }
        if (lc == LearnerCapability.WEIGHTED_EXAMPLES) {
            return true;
        }
        return super.supportsCapability(lc);
    }

    /**
     * Constructs a <code>Model</code> repeatedly running a weak learner,
     * reweighting the training example set accordingly, and combining the
     * hypothesis using the available weighted performance values. If the input
     * contains a model, then this model is used as a starting point for
     * weighting the examples.
     *
     * <p>The example weights are modified during training. They are restored
     * (or the temporary weight attribute is removed again) before this method
     * returns, also if training fails with an exception.</p>
     *
     * @param exampleSet
     *            the training set
     * @return the ensemble model; an empty ensemble if at most one class
     *         carries weight
     * @throws OperatorException
     *             if training the ensemble fails
     */
    public Model learn(ExampleSet exampleSet) throws OperatorException {
        // Read start model if present.
        this.readOptionalParameters();

        // Deliberately outside the try block: if preparing the weights fails
        // half-way, the backup is incomplete and must not be written back.
        double[] classPriors = this.prepareWeights(exampleSet);

        try {
            // check whether only one or no class is present
            double maxPrior = Double.NEGATIVE_INFINITY;
            double sumPriors = 0;
            for (int i = 0; i < classPriors.length; i++) {
                if (classPriors[i] > maxPrior) {
                    maxPrior = classPriors[i];
                }
                sumPriors += classPriors[i];
            }

            if (Tools.isEqual(sumPriors, maxPrior)) {
                // nothing to do, return an empty ensemble model
                return new BayBoostModel(exampleSet, new Vector<BayBoostBaseModelInfo>(), classPriors);
            }
            // only in this case boosting makes sense
            return this.trainBoostingModel(exampleSet, classPriors);
        } finally {
            // Never leave the caller's example set with boosting weights,
            // not even when the inner learner throws.
            this.restoreWeights(exampleSet);
        }
    }

    /**
     * Undoes the changes of {@link #prepareWeights(ExampleSet)}: writes the
     * backed up weights to the examples, or removes the weight attribute if no
     * backup exists (i.e. the attribute was created for training only).
     */
    private void restoreWeights(ExampleSet exampleSet) {
        if (this.oldWeights != null) {
            // need to reset weights
            Iterator<Example> reader = exampleSet.iterator();
            int i = 0;
            while (reader.hasNext() && i < this.oldWeights.length) {
                reader.next().setWeight(this.oldWeights[i++]);
            }
        } else {
            // need to delete the weights attribute
            Attribute weight = exampleSet.getAttributes().getWeight();
            if (weight != null) {
                exampleSet.getAttributes().remove(weight);
                exampleSet.getExampleTable().removeAttribute(weight);
            }
        }
    }

    /**
     * Creates a weight attribute if not yet done. It either backs up the old
     * weights for restoring them later, or it fills the newly created attribute
     * with initial values. If no weight attribute existed and rescaling to
     * equal class priors is activated, then the weights are set accordingly.
     * As a side effect the total weight is stored as the current performance
     * value.
     *
     * @param exampleSet
     *            the example set to be prepared
     * @return a <code>double[]</code> array containing the class priors.
     */
    protected double[] prepareWeights(ExampleSet exampleSet) {
        Attribute weightAttr = exampleSet.getAttributes().getWeight();
        if (weightAttr == null) {
            this.oldWeights = null;
            // example weights are initialized so that the total weight
            // is equal to the number of examples:
            this.performance = exampleSet.size();
            return this.createNewWeightAttribute(exampleSet);
        }

        // Back up old weights and compute priors:
        this.oldWeights = new double[exampleSet.size()];
        double[] priors = new double[exampleSet.getAttributes().getLabel().getMapping().size()];
        double totalWeight = 0;
        Iterator<Example> reader = exampleSet.iterator();
        for (int i = 0; reader.hasNext() && i < this.oldWeights.length; i++) {
            Example example = reader.next();
            if (example != null) {
                double weight = example.getWeight();
                this.oldWeights[i] = weight;
                int label = (int) example.getLabel();
                if (0 <= label && label < priors.length) {
                    priors[label] += weight;
                    totalWeight += weight;
                } else {
                    example.setWeight(0); // Unrecognized label, try to ignore it!
                }
            }
        }
        this.performance = totalWeight;

        // Normalize. A total weight of exactly 0 would turn every prior into
        // NaN (0/0); the priors are left at 0 in that case so that the caller
        // recognizes that there is nothing to learn from.
        if (totalWeight != 0) {
            for (int i = 0; i < priors.length; i++) {
                priors[i] /= totalWeight;
            }
        }
        return priors;
    }

    /**
     * Creates a new weight attribute for the example set and computes the
     * relative class frequencies. Without rescaling every example gets weight
     * 1, otherwise the weights are chosen by
     * {@link #rescaleToEqualPriors(ExampleSet, double[])}.
     *
     * @return the relative frequency of each class, indexed by label value
     */
    private double[] createNewWeightAttribute(ExampleSet exampleSet) {
        com.rapidminer.example.Tools.createWeightAttribute(exampleSet);

        Iterator<Example> exRead = exampleSet.iterator();
        int numClasses = exampleSet.getAttributes().getLabel().getMapping().getValues().size();
        double[] classPriors = new double[numClasses];

        int total = exampleSet.size();
        double invTotal = 1.0d / total;

        if (!this.getParameterAsBoolean(PARAMETER_RESCALE_LABEL_PRIORS)) {
            while (exRead.hasNext()) {
                Example example = exRead.next();
                example.setWeight(1);
                classPriors[(int) (example.getLabel())] += invTotal;
            }
        } else {
            // first count the class frequencies
            while (exRead.hasNext()) {
                classPriors[(int) (exRead.next().getLabel())] += invTotal;
            }
            this.rescaleToEqualPriors(exampleSet, classPriors);
        }
        return classPriors;
    }

    /**
     * Sets the weight of each example depending on its class only, so that
     * every class present in the data receives the same total weight.
     *
     * @param exampleSet
     *            the example set to reweight
     * @param currentPriors
     *            the relative class frequencies, indexed by label value
     */
    private void rescaleToEqualPriors(ExampleSet exampleSet, double[] currentPriors) {
        // The weights of class i are calculated as
        // (1 / #classes) / (#rel_freq_class_i)
        // A class with frequency 0 yields an infinite factor, which is never
        // read because no example carries that label.
        double[] weights = new double[currentPriors.length];
        for (int i = 0; i < weights.length; i++) {
            weights[i] = 1.0d / (weights.length * (currentPriors[i]));
        }

        Iterator<Example> exRead = exampleSet.iterator();
        while (exRead.hasNext()) {
            Example example = exRead.next();
            example.setWeight(weights[(int) (example.getLabel())]);
        }
    }

    /**
     * Runs the "embedded" learner on the example set and returns a
     * model.
     *
     * @param exampleSet
     *            an <code>ExampleSet</code> to train a model for
     * @return a <code>Model</code>
     */
    protected Model trainBaseModel(ExampleSet exampleSet) throws OperatorException {
        return applyInnerLearner(exampleSet);
    }

    /**
     * Helper method reading a start model from the input if present. The
     * start model is reset first, so that a model read during an earlier
     * execution is not reused when the current input contains none.
     */
    private void readOptionalParameters() {
        this.startModel = null;
        try {
            this.startModel = getInput(Model.class);
        } catch (MissingIOObjectException e) {
            // The start model is optional, so its absence is only logged.
            log(getName() + ": No model found in input.");
        }
    }

    /**
     * Helper method applying the start model and adding it to the modelInfo
     * collection. Does nothing if there is no start model.
     */
    private void applyPriorModel(ExampleSet trainingSet, List<BayBoostBaseModelInfo> modelInfo) throws OperatorException {
        // If the input contains a model already, initialise the example weights.
        if (this.startModel != null) {
            ExampleSet resultSet = this.startModel.apply((ExampleSet) trainingSet.clone());

            // Initial values and the input model are stored in the output model.
            WeightedPerformanceMeasures wp = new WeightedPerformanceMeasures(resultSet);
            this.reweightExamples(wp, resultSet);
            modelInfo.add(new BayBoostBaseModelInfo(this.startModel, wp.getContingencyMatrix()));
            PredictionModel.removePredictedLabel(resultSet);
        }
    }

    /**
     * Main method for training the ensemble classifier. Trains up to
     * <code>iterations</code> base models, reweighting the examples after
     * each of them, and stops early if fewer than two classes are left, if a
     * model is not considered useful, or if the remaining total weight is 0.
     *
     * @param trainingSet
     *            the (already weighted) training set
     * @param classPriors
     *            the class priors as computed by the weight preparation
     * @return the ensemble model
     */
    private BayBoostModel trainBoostingModel(ExampleSet trainingSet, final double[] classPriors) throws OperatorException {
        // for models and their probability estimates
        Vector<BayBoostBaseModelInfo> modelInfo = new Vector<BayBoostBaseModelInfo>();

        // if present apply the start model first
        this.applyPriorModel(trainingSet, modelInfo);

        // check whether to use the complete training set for training
        final double splitRatio = this.getParameterAsDouble(PARAMETER_USE_SUBSET_FOR_TRAINING);
        final boolean bootstrap = (splitRatio > 0) && (splitRatio < 1.0);
        log(bootstrap ? "Bootstrapping enabled." : "Bootstrapping disabled.");

        final boolean allowSkew = this.getParameterAsBoolean(PARAMETER_ALLOW_MARGINAL_SKEWS);

        SplittedExampleSet splittedSet = null;
        if (bootstrap) {
            splittedSet = new SplittedExampleSet(trainingSet, splitRatio, SplittedExampleSet.SHUFFLED_SAMPLING, -1);
        }

        // maximum number of iterations
        final int iterations = this.getParameterAsInt(PARAMETER_ITERATIONS);
        for (int i = 0; i < iterations; i++) {
            this.currentIteration = i;

            Model model;
            WeightedPerformanceMeasures wp;
            ExampleSet iterationSet = (ExampleSet) trainingSet.clone();
            if (bootstrap) {
                splittedSet.selectSingleSubset(0); // switch to learning subset
                model = this.trainBaseModel(splittedSet);

                // apply model to all examples
                iterationSet = model.apply(iterationSet);

                // reweight learning subset
                wp = new WeightedPerformanceMeasures(splittedSet);
                WeightedPerformanceMeasures.reweightExamples(splittedSet, wp.getContingencyMatrix(), allowSkew);

                // handle test set: reweight it separately, use its estimates
                // for future predictions
                splittedSet.selectSingleSubset(1);
                wp = new WeightedPerformanceMeasures(splittedSet);
                // performance should be estimated based on the hold-out set
                this.performance =
                        WeightedPerformanceMeasures.reweightExamples(splittedSet, wp.getContingencyMatrix(), allowSkew);
            } else {
                // train one model per iteration
                model = this.trainBaseModel(iterationSet);
                iterationSet = model.apply(iterationSet);

                // get the weighted performance value of the example set with
                // respect to the model
                wp = new WeightedPerformanceMeasures(iterationSet);

                // Reweight the example set with respect to the weighted
                // performance values:
                this.performance = this.reweightExamples(wp, iterationSet);
            }
            PredictionModel.removePredictedLabel(iterationSet);

            // Stop if only one class is present/left.
            if (wp.getNumberOfNonEmptyClasses() < 2) {
                // Using the model here is just necessary to avoid a
                // NullPointerException if this is the first iteration.
                // One could use an empty model instead:
                modelInfo.add(new BayBoostBaseModelInfo(model, wp.getContingencyMatrix()));
                break; // No more iterations!
            }

            final ContingencyMatrix cm = wp.getContingencyMatrix();
            if (!this.isModelUseful(cm)) {
                // If the model is not considered to be useful (low advantage)
                // then discard it and stop.
                log("Discard model because of low advantage on training data.");
                break;
            }

            // Add the new model and its weights to the collection of models:
            modelInfo.add(new BayBoostBaseModelInfo(model, cm));

            // Stop if weight is null, because all examples have been explained
            // "deterministically".
            if (this.performance == 0) {
                break;
            }
            inApplyLoop();
        }

        // Build a Model object.
        return new BayBoostModel(trainingSet, modelInfo, classPriors);
    }

    /**
     * This method reweights the example set with respect to the
     * <code>WeightedPerformanceMeasures</code> object. Please note that the
     * weights will not be reset at any time, because they continuously change
     * from one iteration to the next. This method does not change the priors of
     * the classes.
     *
     * @param wp
     *            the WeightedPerformanceMeasures to use
     * @param exampleSet
     *            <code>ExampleSet</code> to be reweighted
     * @return the total weight of examples as an error estimate
     */
    protected double reweightExamples(WeightedPerformanceMeasures wp, ExampleSet exampleSet) throws OperatorException {
        boolean allowMarginalSkews = this.getParameterAsBoolean(PARAMETER_ALLOW_MARGINAL_SKEWS);
        return WeightedPerformanceMeasures.reweightExamples(exampleSet, wp.getContingencyMatrix(), allowMarginalSkews);
    }

    /**
     * Helper method to decide whether a model improves the training error
     * enough to be considered. Returns always true.
     *
     * @param cm
     *            the lift ratio matrix as returned by the getter of the
     *            WeightedPerformance class
     * @return <code>true</code> iff the advantage is high enough to consider
     *         the model to be useful
     */
    private boolean isModelUseful(ContingencyMatrix cm) {
        // should rather be decided offline by properly setting
        // the number of iterations
        return true;
    }

    /**
     * Adds the parameters of this operator to those of the super class:
     * <code>rescale_label_priors</code>, <code>use_subset_for_training</code>,
     * <code>iterations</code> and <code>allow_marginal_skews</code>.
     */
    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeBoolean(PARAMETER_RESCALE_LABEL_PRIORS, "Specifies whether the proportion of labels should be equal by construction after first iteration .", false));
        types.add(new ParameterTypeDouble(PARAMETER_USE_SUBSET_FOR_TRAINING, "Fraction of examples used for training, remaining ones are used to estimate the confusion matrix. Set to 1 to turn off test set.", 0, 1, 1));
        types.add(new ParameterTypeInt(PARAMETER_ITERATIONS, "The maximum number of iterations.", 1, Integer.MAX_VALUE, 10));
        types.add(new ParameterTypeBoolean(PARAMETER_ALLOW_MARGINAL_SKEWS, "Allow to skew the marginal distribution (P(x)) during learning.", true));
        return types;
    }
}