/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2008 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *      http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.learner.igss;

import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.Tools;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Model;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.learner.AbstractLearner;
import com.rapidminer.operator.learner.CapabilityCheck;
import com.rapidminer.operator.learner.LearnerCapability;
import com.rapidminer.operator.learner.PredictionModel;
import com.rapidminer.operator.learner.igss.hypothesis.GSSModel;
import com.rapidminer.operator.learner.igss.hypothesis.Hypothesis;
import com.rapidminer.operator.learner.igss.hypothesis.Rule;
import com.rapidminer.operator.learner.igss.utility.Accuracy;
import com.rapidminer.operator.learner.igss.utility.Binomial;
import com.rapidminer.operator.learner.igss.utility.Linear;
import com.rapidminer.operator.learner.igss.utility.Squared;
import com.rapidminer.operator.learner.igss.utility.Utility;
import com.rapidminer.operator.learner.igss.utility.WRAcc;
import com.rapidminer.operator.learner.meta.BayBoostBaseModelInfo;
import com.rapidminer.operator.learner.meta.BayBoostModel;
import com.rapidminer.operator.learner.meta.ContingencyMatrix;
import com.rapidminer.operator.learner.meta.WeightedPerformanceMeasures;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.tools.RandomGenerator;

/**
 * This operator implements the IteratingGSS algorithm presented in the diploma thesis
 * 'Effiziente Entdeckung unabhaengiger Subgruppen in grossen Datenbanken' (Efficient Discovery of
 * Independent Subgroups in Large Databases) at the Department of Computer Science,
 * University of Dortmund.
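 * <p>
 * As implemented below, each iteration runs a Generic Sequential Sampling (GSS) pass that draws
 * random examples until the utility of the best rule is known up to epsilon with confidence
 * 1-delta. The best rule becomes a model and, if knowledge-based sampling (KBS) is enabled, the
 * examples are reweighted so that later iterations are steered towards subgroups not yet covered.
 * All models are finally combined into a {@link BayBoostModel}.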
 *
 * @author Dirk Dach
 * @version $Id: IteratingGSS.java,v 1.10 2008/07/07 07:06:46 ingomierswa Exp $
 */
public class IteratingGSS extends AbstractLearner {

    /** The parameter name for "approximation parameter" */
    public static final String PARAMETER_EPSILON = "epsilon";

    /** The parameter name for "desired confidence" */
    public static final String PARAMETER_DELTA = "delta";

    /** The parameter name for "minimum utility used for pruning" */
    public static final String PARAMETER_MIN_UTILITY_PRUNING = "min_utility_pruning";

    /** The parameter name for "minimum utility for the usefulness of a rule" */
    public static final String PARAMETER_MIN_UTILITY_USEFUL = "min_utility_useful";

    /** The parameter name for "the number of examples drawn before the next hypothesis update" */
    public static final String PARAMETER_STEPSIZE = "stepsize";

    /** The parameter name for "the number of examples a hypothesis must cover before normal approximation is used" */
    public static final String PARAMETER_LARGE = "large";

    /** The parameter name for "the maximum complexity of hypotheses" */
    public static final String PARAMETER_MAX_COMPLEXITY = "max_complexity";

    /** The parameter name for "the minimum complexity of hypotheses" */
    public static final String PARAMETER_MIN_COMPLEXITY = "min_complexity";

    /** The parameter name for "the number of iterations" */
    public static final String PARAMETER_ITERATIONS = "iterations";

    /** The parameter name for "switch to the binomial utility function before increasing complexity" */
    public static final String PARAMETER_USE_BINOMIAL = "use_binomial";

    /** The parameter name for "the utility function to be used" */
    public static final String PARAMETER_UTILITY_FUNCTION = "utility_function";

    /** The parameter name for "use KBS to reweight examples after each iteration" */
    public static final String PARAMETER_USE_KBS = "use_kbs";

    /** The parameter name for "use rejection sampling instead of weighted examples" */
    public static final String PARAMETER_REJECTION_SAMPLING = "rejection_sampling";

    /** The parameter name for "criterion to decide if the complexity is increased" */
    public static final String PARAMETER_USEFUL_CRITERION = "useful_criterion";

    /** The parameter name for "used by the example criterion to determine the usefulness of a hypothesis" */
    public static final String PARAMETER_EXAMPLE_FACTOR = "example_factor";

    /** The parameter name for "make all iterations even if the termination criterion is met" */
    public static final String PARAMETER_FORCE_ITERATIONS = "force_iterations";

    /** The parameter name for "generate h->Y+/Y- or h->Y+ only" */
    public static final String PARAMETER_GENERATE_ALL_HYPOTHESIS = "generate_all_hypothesis";

    /** The parameter name for "set weights back to 1 when the complexity is increased" */
    public static final String PARAMETER_RESET_WEIGHTS = "reset_weights";

    public static final String[] CRITERION_TYPES = { "worst_utility", "utility", "best_utility", "example" };

    public static final int FIRST_TYPE_INDEX = 0;

    public static final int TYPE_WORST_UTILITY = 0;

    public static final int TYPE_UTILITY = 1;

    public static final int TYPE_BEST_UTILITY = 2;

    public static final int TYPE_EXAMPLE = 3;

    public static final int LAST_TYPE_INDEX = 3;

    /** Stores all results. */
    private IGSSResult gssResult;

    /** The regular attributes. */
    private Attribute[] regularAttributes;

    /** The label attribute. */
    private Attribute label;

    /** The utility function. */
    private Utility theUtility;

    /** The global random generator. */
    private RandomGenerator random;

    /** First hypothesis used to create all others.
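     * Initialized in apply() as a {@link Rule} over all regular attributes; learn() calls
     * seed.init(minComplexity) to span the initial hypothesis space.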
     */
    private Hypothesis seed;

    /** Total weight used by GSS. */
    private double totalWeight;

    /** Total positive weight used by GSS. */
    private double totalPositiveWeight;

    /** Stores the k best hypotheses. */
    private LinkedList<Hypothesis> bestList;

    /** Worst of the k best hypotheses. */
    private Result minBest;

    /** Best of the hypotheses not among the k best. */
    private Result maxRest;

    /** Parameter k of the GSS algorithm. */
    private int numberOfSolutions;

    /** Remaining delta. */
    private double currentDelta;

    /** Parameter epsilon of the GSS algorithm. */
    private double epsilon;

    /** Parameter stepsize of the IGSS algorithm. */
    private int stepsize;

    /** Maximum hypothesis complexity. */
    private int maxComplexity;

    /** Minimum hypothesis complexity. */
    private int minComplexity;

    /** Minimum utility used for pruning. */
    private double min_utility_pruning;

    /** Minimum utility needed for a rule to be considered useful. */
    private double min_utility_useful;

    /** Indicates if KBS should be used. */
    private boolean useKBS;

    /** Indicates if the binomial utility function should be used before increasing the complexity. */
    private boolean useBinomial;

    /** The useful criterion for the IGSS algorithm. */
    private int useful_criterion;

    /** Always make all iterations? */
    private boolean forceIterations;

    /** Reset weights after a complexity increase? */
    private boolean resetWeights;

    /** Factor needed by the example criterion. */
    private double exampleFactor;

    /** Minimal model number for the example criterion. */
    public int MIN_MODEL_NUMBER = 2;

    /** Use rejection sampling or weights directly. */
    private boolean rejectionSampling;

    /** Number of random experiments before a normal approximation is used. */
    private int large;

    /** The number of iterations for the IGSS algorithm. */
    private int iterations;

    /** Must pass the given object to the superclass. */
    public IteratingGSS(OperatorDescription description) {
        super(description);
    }

    /** Updates bestList, minBest and maxRest. */
    private void updateLists(LinkedList<Hypothesis> hypothesisList, int n, double totalExampleWeight, double totalPositiveWeight, double delta_h_m) {
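        // What follows partitions all hypotheses covering positive weight into the n best (by
        // empirical utility, kept sorted in a local list) and the rest, and then records
        //   minBest: the member of the best list with the smallest lower bound (utility - confidence),
        //   maxRest: the member of the rest with the largest upper bound (utility + confidence).
        // gss() compares candidates against these two bounds to decide which rules to output or delete.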
        this.bestList = new LinkedList<Hypothesis>();
        LinkedList<Result> bestList = new LinkedList<Result>(); // note: this local variable shadows the field of the same name!
        LinkedList<Result> restList = new LinkedList<Result>();
        this.minBest = null;
        this.maxRest = null;

        // Find the n rules with the best empirical utility and partition them into bestList and restList.
        Iterator it = hypothesisList.iterator();
        while (it.hasNext()) {
            Hypothesis hypo = (Hypothesis) it.next();
            if (hypo.getCoveredWeight() > 0.0d) {
                if (bestList.size() < n) {
                    if (bestList.isEmpty()) {
                        double util = theUtility.utility(totalExampleWeight, totalPositiveWeight, hypo);
                        double conf = theUtility.confidenceIntervall(totalExampleWeight, totalPositiveWeight, hypo, delta_h_m);
                        bestList.addLast(new Result(hypo, totalExampleWeight, totalPositiveWeight, util, conf));
                    } else {
                        double util = theUtility.utility(totalExampleWeight, totalPositiveWeight, hypo);
                        double conf = theUtility.confidenceIntervall(totalExampleWeight, totalPositiveWeight, hypo, delta_h_m);
                        // Insert at the position that keeps bestList sorted by descending utility.
                        ListIterator<Result> listIterator = bestList.listIterator(0);
                        while (listIterator.hasNext()) {
                            Result current = listIterator.next();
                            if (util > current.getUtility()) {
                                listIterator.previous();
                                break;
                            }
                        }
                        listIterator.add(new Result(hypo, totalExampleWeight, totalPositiveWeight, util, conf));
                    }
                } else {
                    double util = theUtility.utility(totalExampleWeight, totalPositiveWeight, hypo);
                    double conf = theUtility.confidenceIntervall(totalExampleWeight, totalPositiveWeight, hypo, delta_h_m);
                    if (util > bestList.getLast().getUtility()) {
                        ListIterator<Result> listIterator = bestList.listIterator(0);
                        while (listIterator.hasNext()) {
                            Result current = listIterator.next();
                            if (util > current.getUtility()) {
                                listIterator.previous();
                                break;
                            }
                        }
                        listIterator.add(new Result(hypo, totalExampleWeight, totalPositiveWeight, util, conf));
                        restList.addLast(bestList.removeLast());
                    } else {
                        restList.addLast(new Result(hypo, totalExampleWeight, totalPositiveWeight, util, conf));
                    }
                }
            } else {
                double util = theUtility.utility(totalExampleWeight, totalPositiveWeight, hypo);
                double conf = theUtility.confidenceIntervall(totalExampleWeight, totalPositiveWeight, hypo, delta_h_m);
                restList.addLast(new Result(hypo, totalExampleWeight, totalPositiveWeight, util, conf));
            }
        }

        // Find min(bestList).
        Result r = bestList.getLast();
        double minimum = r.getUtility() - r.getConfidence();
        this.minBest = r;
        it = bestList.iterator();
        while (it.hasNext()) {
            r = (Result) it.next();
            double current = r.getUtility() - r.getConfidence();
            if (current < minimum) {
                minimum = current;
                this.minBest = r;
            }
        }

        // Find max(restList).
        r = restList.getLast();
        double maximum = r.getUtility() + r.getConfidence();
        this.maxRest = r;
        it = restList.iterator();
        while (it.hasNext()) {
            r = (Result) it.next();
            double current = r.getUtility() + r.getConfidence();
            if (current > maximum) {
                maximum = current;
                this.maxRest = r;
            }
        }

        it = bestList.iterator();
        while (it.hasNext()) {
            this.bestList.addLast(((Result) it.next()).getHypothesis()); // Add hypotheses from the local variable to the field.
        }
    }

    /** Returns the n best hypotheses with maximum error epsilon, with confidence 1-delta. */
    public LinkedList<Result> gss(ExampleSet exampleSet, LinkedList<Hypothesis> hypothesisList, double delta, double epsilon) throws OperatorException {
        // Initialization.
        LinkedList<Hypothesis> delete = new LinkedList<Hypothesis>(); // Stores deleted hypotheses.
        LinkedList<Hypothesis> output = new LinkedList<Hypothesis>(); // Stores hypotheses that became output.
        LinkedList<Result> results = new LinkedList<Result>(); // Stores the results.
        this.bestList = new LinkedList<Hypothesis>();
        int n = this.numberOfSolutions;
        this.totalWeight = 0.0d;
        this.totalPositiveWeight = 0.0d;
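        // Confidence budget: delta_h halves delta and splits it over all hypotheses, and m
        // (calculateM) is the worst-case number of examples the utility function may need for
        // accuracy epsilon at confidence delta_h. Since the lists are only updated once per
        // 'stepsize' examples, delta_h_m additionally divides the budget by the number of such
        // updates, ceil(m/stepsize); a union-bound style split over hypotheses and update steps.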
        // Calculate m and the current delta values.
        double delta_h = delta / (2.0d * hypothesisList.size());
        double m = theUtility.calculateM(delta_h, epsilon);
        double delta_h_m = delta / (2.0d * hypothesisList.size() * Math.ceil(m / stepsize));
        double r = 0.0d;
        double weightToAdd = 1.0d;
        int nextUpdateValue = stepsize;

        // Draw random examples and apply all rules to each example.
        do {
            // Query a random example.
            int rand = random.nextInt(exampleSet.size());
            Example e = exampleSet.getExample(rand);
            // Get a random value from [0,1] if rejection sampling is used, or
            // the correct weight to add if normal weights are used.
            if (this.rejectionSampling) {
                r = random.nextDouble();
            } else {
                weightToAdd = e.getWeight();
            }
            // Without rejection sampling r stays 0, so every drawn example is used with its weight.
            if (r <= e.getWeight()) {
                // Apply the given example to all rules in the hypothesisList.
                Iterator it = hypothesisList.iterator();
                while (it.hasNext()) {
                    ((Hypothesis) it.next()).apply(e);
                }
                totalWeight += weightToAdd;
                if ((int) e.getLabel() == Hypothesis.POSITIVE_CLASS) {
                    totalPositiveWeight += weightToAdd;
                }
                // Update rules that already became output. They are needed for pruning.
                if (!output.isEmpty()) {
                    Iterator iter = output.iterator();
                    while (iter.hasNext()) {
                        Hypothesis hypo = (Hypothesis) iter.next();
                        hypo.apply(e);
                    }
                }
                // Update already removed rules. They are needed for pruning.
                if (!delete.isEmpty()) {
                    Iterator iter = delete.iterator();
                    while (iter.hasNext()) {
                        Hypothesis hypo = (Hypothesis) iter.next();
                        hypo.apply(e);
                    }
                }
                /*
                 * Update the utility of all rules, determine the best rule and output/delete rules
                 * that are good/bad enough. Only once per stepsize!
                 */
                if (((int) totalWeight) >= nextUpdateValue) {
                    nextUpdateValue += stepsize;
                    updateLists(hypothesisList, n, totalWeight, totalPositiveWeight, delta_h_m);
                    // Look for hypotheses to delete/output.
                    ListIterator iter = hypothesisList.listIterator();
                    while (iter.hasNext() && n > 0 && hypothesisList.size() != n) {
                        Hypothesis hypo = (Hypothesis) iter.next();
                        double util = theUtility.utility(totalWeight, totalPositiveWeight, hypo);
                        double conf = theUtility.confidenceIntervall(totalWeight, totalPositiveWeight, hypo, delta_h_m);
                        if ((util >= conf + maxRest.getUtility() + maxRest.getConfidence() - epsilon) && bestList.contains(hypo)) {
                            results.addLast(new Result(hypo.clone(), totalWeight, totalPositiveWeight, util, conf));
                            output.addLast(hypo);
                            iter.remove();
                            n--;
                            // Adapt the values to the new hypothesis list size.
                            delta_h = delta / (2.0d * hypothesisList.size());
                            delta_h_m = delta / (2.0d * hypothesisList.size() * Math.ceil(m / stepsize));
                            if (n != 0) {
                                this.updateLists(hypothesisList, n, totalWeight, totalPositiveWeight, delta_h_m);
                            }
                        } else if (util <= minBest.getUtility() - minBest.getConfidence() - conf) {
                            delete.addLast(hypo);
                            iter.remove();
                            // Adapt the values to the new hypothesis list size.
                            delta_h = delta / (2.0d * hypothesisList.size());
                            delta_h_m = delta / (2.0d * hypothesisList.size() * Math.ceil(m / stepsize));
                            if (hypo.equals(maxRest.getHypothesis()) && hypothesisList.size() > n) {
                                this.updateLists(hypothesisList, n, totalWeight, totalPositiveWeight, delta_h_m);
                            }
                        }
                    }
                }
            }
        } while (!(n == 0 || hypothesisList.size() == n || theUtility.confidenceIntervall(totalWeight, delta_h) <= epsilon / 2.0));
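        // The loop exits when n rules have been output (n == 0), when only the n best candidates
        // remain (hypothesisList.size() == n), or when the data-independent confidence interval
        // has shrunk to epsilon/2, so every remaining best-list member is epsilon-close to the
        // true optimum with the required confidence.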
        if (n > 0) {
            // Add the hypotheses that are still in the best list after the loop has exited to the
            // result. Their confidence interval is guaranteed to be at most epsilon/2.
            if (bestList.isEmpty()) { // Happens for large epsilon; then m < stepsize.
                updateLists(hypothesisList, n, totalWeight, totalPositiveWeight, delta_h_m);
            }
            while (!bestList.isEmpty()) {
                Hypothesis hypo = bestList.removeFirst();
                double util = theUtility.utility(totalWeight, totalPositiveWeight, hypo);
                double conf = theUtility.confidenceIntervall(totalWeight, totalPositiveWeight, hypo, delta_h_m); // confidence calculated for bestList
                if (conf > epsilon / 2.0) {
                    conf = epsilon / 2.0;
                }
                results.addLast(new Result(hypo.clone(), totalWeight, totalPositiveWeight, util, conf));
            }
        } else {
            this.currentDelta = this.currentDelta + delta / 2.0d;
        }
        hypothesisList.addAll(delete); // Add deleted rules again, because all rules are needed for pruning.
        hypothesisList.addAll(output); // Add output rules again, because all rules are needed for pruning.
        return results;
    }

    /**
     * Reweights the examples according to knowledge-based sampling. Normalizes the weights to
     * [0,1] if the parameter normalize is set to true.
     */
    public ContingencyMatrix reweight(ExampleSet exampleSet, Model model, boolean normalize) throws OperatorException {
        exampleSet = model.apply(exampleSet); // apply and create predicted label
        WeightedPerformanceMeasures wpm = new WeightedPerformanceMeasures(exampleSet);
        WeightedPerformanceMeasures.reweightExamples(exampleSet, wpm.getContingencyMatrix(), false);
        if (normalize) {
            double maxWeight = Double.NEGATIVE_INFINITY;
            Iterator<Example> reader = exampleSet.iterator();
            while (reader.hasNext()) {
                Example e = reader.next();
                if (e.getWeight() > maxWeight) {
                    maxWeight = e.getWeight();
                }
            }
            // Normalize with the maximum weight.
            reader = exampleSet.iterator();
            while (reader.hasNext()) {
                Example e = reader.next();
                e.setValue(e.getAttributes().getWeight(), e.getWeight() / maxWeight);
            }
        }
        PredictionModel.removePredictedLabel(exampleSet);
        return wpm.getContingencyMatrix();
    }

    // TODO: move stuff into learn method and remove method
    public IOObject[] apply() throws OperatorException {
        ExampleSet eSet = getInput(ExampleSet.class);

        // some checks
        if (eSet.getAttributes().getLabel() == null) {
            throw new UserError(this, 105, new Object[0]);
        }
        if (eSet.getAttributes().size() == 0) {
            throw new UserError(this, 106, new Object[0]);
        }

        // check the capabilities and produce errors if they are not fulfilled
        CapabilityCheck check = new CapabilityCheck(this, com.rapidminer.tools.Tools.booleanValue(System.getProperty(AbstractLearner.PROPERTY_RAPIDMINER_GENERAL_CAPABILITIES_WARN), true));
        check.checkLearnerCapabilities(this, eSet);
        for (Attribute attribute : eSet.getAttributes()) {
            if (!attribute.isNominal()) {
                throw new UserError(this, 103, getName(), attribute.getName());
            }
        }

        // Initialization
        this.random = RandomGenerator.getGlobalRandomGenerator();
        this.epsilon = getParameterAsDouble(PARAMETER_EPSILON);
        this.currentDelta = getParameterAsDouble(PARAMETER_DELTA);
        this.stepsize = getParameterAsInt(PARAMETER_STEPSIZE);
        this.large = getParameterAsInt(PARAMETER_LARGE);
        this.useKBS = getParameterAsBoolean(PARAMETER_USE_KBS);
        this.rejectionSampling = getParameterAsBoolean(PARAMETER_REJECTION_SAMPLING);
        this.numberOfSolutions = 1;
        this.iterations = getParameterAsInt(PARAMETER_ITERATIONS);
        this.useful_criterion = getParameterAsInt(PARAMETER_USEFUL_CRITERION);
        this.min_utility_pruning = getParameterAsDouble(PARAMETER_MIN_UTILITY_PRUNING);
        this.min_utility_useful = getParameterAsDouble(PARAMETER_MIN_UTILITY_USEFUL);
        this.useBinomial = getParameterAsBoolean(PARAMETER_USE_BINOMIAL);
        this.maxComplexity = getParameterAsInt(PARAMETER_MAX_COMPLEXITY);
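        // Complexity bounds: learn() starts with rules of minComplexity literals
        // (seed.init(minComplexity)) and refines the pruned hypotheses step by step
        // (see generate()) until maxComplexity is reached.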
        this.minComplexity = getParameterAsInt(PARAMETER_MIN_COMPLEXITY);
        this.forceIterations = getParameterAsBoolean(PARAMETER_FORCE_ITERATIONS);
        this.resetWeights = getParameterAsBoolean(PARAMETER_RESET_WEIGHTS);
        this.exampleFactor = getParameterAsDouble(PARAMETER_EXAMPLE_FACTOR);
        if (this.minComplexity > this.maxComplexity) {
            throw new UserError(this, 116, "max_complexity", this.maxComplexity);
        }

        // Initialize label and weight.
        this.label = eSet.getAttributes().getLabel();
        Tools.createWeightAttribute(eSet);

        // Initialize the result.
        this.gssResult = new IGSSResult(eSet);

        // Initialize regularAttributes.
        regularAttributes = new Attribute[eSet.getAttributes().size()];
        int counter = 0;
        for (Attribute attribute : eSet.getAttributes()) {
            regularAttributes[counter++] = attribute;
        }

        // Initialize the hypothesis space.
        this.seed = new Rule(regularAttributes, label, this.rejectionSampling, getParameterAsBoolean(PARAMETER_GENERATE_ALL_HYPOTHESIS));

        // Initialize the utility function.
        int utility_type = getParameterAsInt(PARAMETER_UTILITY_FUNCTION);
        switch (utility_type) {
            case Utility.TYPE_ACCURACY:
                theUtility = new Accuracy(gssResult.getPriors(), large);
                break;
            case Utility.TYPE_LINEAR:
                theUtility = new Linear(gssResult.getPriors(), large);
                break;
            case Utility.TYPE_SQUARED:
                theUtility = new Squared(gssResult.getPriors(), large);
                break;
            case Utility.TYPE_BINOMIAL:
                theUtility = new Binomial(gssResult.getPriors(), large);
                break;
            case Utility.TYPE_WRACC:
                theUtility = new WRAcc(gssResult.getPriors(), large);
                break;
            default:
                break;
        }

        // Learn.
        List<IOObject> results = new LinkedList<IOObject>();
        Model model = learn(eSet);
        results.add(model);
        // results.add(this.gssResult);

        // Return the result.
        IOObject[] resultArray = new IOObject[results.size()];
        results.toArray(resultArray);
        return resultArray;
    }

    public Model learn(ExampleSet exampleSet) throws OperatorException {
        LinkedList<Hypothesis> hypothesisList = seed.init(this.minComplexity);
        LinkedList<Model> allModels = new LinkedList<Model>();
        int currentComplexity = this.minComplexity;
        boolean binomialTestPerformed = false;
        boolean switchedInThisIteration = false;
        Utility utilityStorage = this.theUtility; // needed if use_binomial=true
        LinkedList<BayBoostBaseModelInfo> modelInfo = new LinkedList<BayBoostBaseModelInfo>();
        LinkedList<Result> allResultsOfCurrentComplexity = new LinkedList<Result>();
        LinkedList<Hypothesis> deletedHypothesis = new LinkedList<Hypothesis>(); // stores deleted hypotheses if no KBS is used

        for (int i = 0; i < this.iterations; i++) {
            // Reset the hypothesis space.
            Iterator iter = hypothesisList.iterator();
            while (iter.hasNext()) {
                Hypothesis hypo = (Hypothesis) iter.next();
                hypo.reset();
            }

            // Calculate the current delta values: two thirds of the per-iteration budget go to
            // GSS, one third to pruning, spread evenly over the remaining iterations.
            double deltaForGSS = (2.0d * this.currentDelta) / (3.0d * (this.iterations - i));
            double deltaForPruning = this.currentDelta / (3.0d * (this.iterations - i));
            this.currentDelta = this.currentDelta - deltaForGSS - deltaForPruning;

            LinkedList<Result> currentResults = new LinkedList<Result>();
            currentResults.addAll(gss(exampleSet, hypothesisList, deltaForGSS, epsilon));
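            // With numberOfSolutions fixed to 1 (see apply()), currentResults holds the single
            // best rule of this iteration. Its per-class precisions are estimated below from the
            // weight it accumulated during sampling.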
            // Create the model.
            Hypothesis h = currentResults.getFirst().getHypothesis();
            double[] precisions = new double[2];
            // The complement is taken with the symbolic class index; the original hard-coded
            // index 1 here, which filled the wrong entry in one of the two branches.
            if (h.getPrediction() == Hypothesis.POSITIVE_CLASS) {
                precisions[Hypothesis.POSITIVE_CLASS] = h.getPositiveWeight() / h.getCoveredWeight();
                precisions[Hypothesis.NEGATIVE_CLASS] = 1.0d - precisions[Hypothesis.POSITIVE_CLASS];
            } else {
                precisions[Hypothesis.NEGATIVE_CLASS] = h.getPositiveWeight() / h.getCoveredWeight();
                precisions[Hypothesis.POSITIVE_CLASS] = 1.0d - precisions[Hypothesis.NEGATIVE_CLASS];
            }
            GSSModel model = new GSSModel(exampleSet, h, precisions);

            boolean increaseComplexity = false;
            // Test if the model is useful according to the criteria in the method 'isUseful' or has already been found.
            if (!isUseful(currentResults.getFirst(), allResultsOfCurrentComplexity, this.useful_criterion, exampleSet, this.MIN_MODEL_NUMBER) || allModels.contains(model)) {
                if ((!binomialTestPerformed && this.useBinomial)) {
                    // Switch to the binomial utility function before increasing the complexity.
                    this.theUtility = new Binomial(gssResult.getPriors(), large);
                    binomialTestPerformed = true;
                    switchedInThisIteration = true;
                } else {
                    if (currentComplexity < this.maxComplexity) {
                        // Increase the complexity after the binomial test if possible.
                        increaseComplexity = true;
                        currentComplexity++;
                        this.theUtility = utilityStorage;
                        binomialTestPerformed = false; // Reset for the next complexity level.
                    } else {
                        if (!this.forceIterations) {
                            break; // Break for (int i = 0; i < this.iterations; i++)
                        }
                    }
                }
            }

            if (increaseComplexity) {
                // Do not add the result. Prune and increase the complexity. Reset the weights if resetWeights=true.
                if (!this.useKBS) {
                    hypothesisList.addAll(deletedHypothesis); // put back all deleted hypotheses
                    deletedHypothesis = new LinkedList<Hypothesis>(); // re-initialize deletedHypothesis
                }
                LinkedList<Hypothesis> prunedList = this.prune(hypothesisList, min_utility_pruning, this.totalWeight, this.totalPositiveWeight, deltaForPruning);
                hypothesisList = new LinkedList<Hypothesis>();
                hypothesisList.addAll(generate(prunedList));
                allResultsOfCurrentComplexity = new LinkedList<Result>();
                if (this.resetWeights) {
                    Tools.createWeightAttribute(exampleSet);
                }
            } else {
                // Add the result (not directly after a switch to Binomial). No pruning. Create the model. Reweight.
                if (!switchedInThisIteration) {
                    allModels.addLast(model);
                    this.gssResult.addResult(currentResults.getFirst());
                    currentDelta = currentDelta + deltaForPruning; // deltaForPruning not needed
                    ContingencyMatrix contingencyMatrix = null;
                    if (this.useKBS) { // KBS used: reweight!
                        contingencyMatrix = reweight(exampleSet, model, this.rejectionSampling);
                    } else { // no KBS: don't reweight, remove the found hypothesis so that it can't be found again
                        WeightedPerformanceMeasures wpm = new WeightedPerformanceMeasures(exampleSet);
                        contingencyMatrix = wpm.getContingencyMatrix();
                        int hypoIndex = hypothesisList.indexOf(currentResults.getFirst().getHypothesis());
                        deletedHypothesis.addLast(hypothesisList.remove(hypoIndex));
                    }
                    modelInfo.addLast(new BayBoostBaseModelInfo(model, contingencyMatrix));
                    allResultsOfCurrentComplexity.addLast(currentResults.getFirst());
                } else {
                    switchedInThisIteration = false;
                }
            }
        }

        // Create the BayBoostModel.
        double[] priors = new double[2];
        priors[Hypothesis.POSITIVE_CLASS] = gssResult.getPriors()[Hypothesis.POSITIVE_CLASS];
        priors[Hypothesis.NEGATIVE_CLASS] = gssResult.getPriors()[Hypothesis.NEGATIVE_CLASS];
        return new BayBoostModel(exampleSet, modelInfo, priors);
    }

    /** Tests if the model is useful according to the given criterion.
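     * The criteria correspond to CRITERION_TYPES: 'worst_utility', 'utility' and 'best_utility'
     * compare the lower confidence bound, the estimate and the upper confidence bound of the
     * rule's utility against min_utility_useful; 'example' accepts a rule while fewer than
     * min_model_number results exist, and afterwards as long as the rule needed less total weight
     * than exampleFactor times the average of the previous results.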
     */
    public boolean isUseful(Result current, LinkedList<Result> otherResults, int criterion, ExampleSet exampleSet, int min_model_number) {
        boolean result = true;
        switch (criterion) {
            case IteratingGSS.TYPE_WORST_UTILITY:
                double worstUtility = current.getUtility() - current.getConfidence();
                result = worstUtility >= this.min_utility_useful;
                break;
            case IteratingGSS.TYPE_UTILITY:
                double utility = current.getUtility();
                result = utility >= this.min_utility_useful;
                break;
            case IteratingGSS.TYPE_BEST_UTILITY:
                double bestUtility = current.getUtility() + current.getConfidence();
                result = bestUtility >= this.min_utility_useful;
                break;
            case IteratingGSS.TYPE_EXAMPLE:
                if (otherResults.size() == 0 || otherResults.size() < min_model_number) {
                    return true;
                }
                // Calculate the average number of examples needed by the previous results.
                double sum = 0.0d;
                Iterator it = otherResults.iterator();
                while (it.hasNext()) {
                    Result r = (Result) it.next();
                    sum = sum + r.getTotalWeight();
                }
                double average = sum / otherResults.size();
                result = current.getTotalWeight() < (this.exampleFactor * average);
                break;
        }
        return result;
    }

    /**
     * Prunes the given list of hypotheses. Every hypothesis whose upper utility bound is below the
     * parameter minUtility is removed.
     */
    public LinkedList<Hypothesis> prune(LinkedList<Hypothesis> hypoList, double minUtility, double totalWeight, double totalPositiveWeight, double delta_p) {
        double delta_hp = delta_p / hypoList.size();
        ListIterator it = hypoList.listIterator();
        while (it.hasNext()) {
            Hypothesis hypo = (Hypothesis) it.next();
            double upperBound = theUtility.getUpperBound(totalWeight, totalPositiveWeight, hypo, delta_hp);
            if (upperBound < minUtility) {
                it.remove();
            }
        }
        return hypoList;
    }

    /** Generates all successors of the hypotheses in the given list.
     */
    public LinkedList<Hypothesis> generate(LinkedList<Hypothesis> oldHypothesis) {
        LinkedList<Hypothesis> newHypothesis = new LinkedList<Hypothesis>();
        while (!oldHypothesis.isEmpty()) {
            Hypothesis hypo = oldHypothesis.removeFirst();
            if (hypo.canBeRefined()) {
                newHypothesis.addAll(hypo.refine());
            }
        }
        return newHypothesis;
    }

    /** Returns the logarithm to base 2. */
    public static double log2(double arg) {
        return Math.log(arg) / Math.log(2);
    }

    public boolean supportsCapability(LearnerCapability lc) {
        if (lc == com.rapidminer.operator.learner.LearnerCapability.POLYNOMINAL_ATTRIBUTES)
            return true;
        if (lc == com.rapidminer.operator.learner.LearnerCapability.BINOMINAL_ATTRIBUTES)
            return true;
        if (lc == com.rapidminer.operator.learner.LearnerCapability.BINOMINAL_CLASS)
            return true;
        return false;
    }

    public Class<?>[] getOutputClasses() {
        return new Class[] { Model.class, IGSSResult.class };
    }

    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeDouble(PARAMETER_EPSILON, "approximation parameter", 0.01, 1.0, 0.04));
        types.add(new ParameterTypeDouble(PARAMETER_DELTA, "desired confidence", 0.01, 1.0, 0.1));
        types.add(new ParameterTypeDouble(PARAMETER_MIN_UTILITY_PRUNING, "minimum utility used for pruning", -1.0d, 1.0d, 0.0d));
        types.add(new ParameterTypeDouble(PARAMETER_MIN_UTILITY_USEFUL, "minimum utility for the usefulness of a rule", -1.0, 1.0, 0.0d));
        types.add(new ParameterTypeInt(PARAMETER_STEPSIZE, "the number of examples drawn before the next hypothesis update", 1, 10000, 100));
        types.add(new ParameterTypeInt(PARAMETER_LARGE, "the number of examples a hypothesis must cover before normal approximation is used", 1, 10000, 100));
        types.add(new ParameterTypeInt(PARAMETER_MAX_COMPLEXITY, "the maximum complexity of hypotheses", 1, 10, 1));
        types.add(new ParameterTypeInt(PARAMETER_MIN_COMPLEXITY, "the minimum complexity of hypotheses", 1, 10, 1));
        types.add(new ParameterTypeInt(PARAMETER_ITERATIONS, "the number of iterations", 1, 50, 10));
        types.add(new ParameterTypeBoolean(PARAMETER_USE_BINOMIAL, "switch to the binomial utility function before increasing complexity", false));
        types.add(new ParameterTypeCategory(PARAMETER_UTILITY_FUNCTION, "the utility function to be used", Utility.UTILITY_TYPES, 4));
        types.add(new ParameterTypeBoolean(PARAMETER_USE_KBS, "use KBS to reweight examples after each iteration", true));
        types.add(new ParameterTypeBoolean(PARAMETER_REJECTION_SAMPLING, "use rejection sampling instead of weighted examples", true));
        types.add(new ParameterTypeCategory(PARAMETER_USEFUL_CRITERION, "criterion to decide if the complexity is increased", IteratingGSS.CRITERION_TYPES, 1));
        types.add(new ParameterTypeDouble(PARAMETER_EXAMPLE_FACTOR, "used by the example criterion to determine the usefulness of a hypothesis", 1.0, 5.0, 1.5));
        types.add(new ParameterTypeBoolean(PARAMETER_FORCE_ITERATIONS, "make all iterations even if the termination criterion is met", false));
        types.add(new ParameterTypeBoolean(PARAMETER_GENERATE_ALL_HYPOTHESIS, "generate h->Y+/Y- or h->Y+ only", false));
        types.add(new ParameterTypeBoolean(PARAMETER_RESET_WEIGHTS, "set weights back to 1 when the complexity is increased", false));
        return types;
    }
}