// SubgroupDiscovery.java example

/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2011 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.learner.subgroups;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.Model;
import com.rapidminer.operator.OperatorCapability;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.learner.AbstractLearner;
import com.rapidminer.operator.learner.PredictionModel;
import com.rapidminer.operator.learner.subgroups.hypothesis.Hypothesis;
import com.rapidminer.operator.learner.subgroups.hypothesis.Rule;
import com.rapidminer.operator.learner.subgroups.utility.UtilityFunction;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.parameter.ParameterTypeInt;


/**
 * This operator discovers subgroups (or induces a rule set, respectively) 
 * by generating hypotheses exhaustively. Generation is done by stepwise refining
 * the empty hypothesis (which contains no literals). The loop for this task hence
 * iterates over the depth of the search space, i.e. the number of literals of the
 * generated hypotheses. The maximum depth of the search can be specified.
 * Furthermore the search space can be pruned by specifying a minimum coverage
 * of the hypothesis or by using only a given amount of hypotheses which have
 * the highest coverage.
 * 
 * From the hypotheses, rules are derived according to the user's preference. The
 * operator allows the derivation of positive rules (Y+) and negative rules (Y-)
 * separately or the combination by deriving both rules or only the one which is 
 * the most probable due to the examples covered by the hypothesis (hence: the 
 * actual prediction for that subset).
 * 
 * All generated rules are evaluated on the example set by a user specified 
 * utility function and stored in the final rule set if they (1) exceed a minimum
 * utility threshold or (2) are among the k best rules. The desired behavior
 * can be specified as well.
 * 
 * @author Tobias Malbrecht
 */
public class SubgroupDiscovery extends AbstractLearner {

	/**
	 * Orders rules by descending utility, using the utility value stored on each
	 * rule for the given utility function class. After sorting, the rule with the
	 * highest utility comes first.
	 */
	private static class RuleComparator implements Comparator<Rule> {
		private final Class<? extends UtilityFunction> functionClass;

		public RuleComparator(Class<? extends UtilityFunction> functionClass) {
			this.functionClass = functionClass;
		}

		public int compare(Rule firstRule, Rule secondRule) {
			// arguments are swapped on purpose to obtain a descending order
			return Double.compare(secondRule.getUtility(functionClass), firstRule.getUtility(functionClass));
		}
	}

	/**
	 * Orders hypotheses by descending covered weight, so that the hypotheses
	 * with the lowest coverage end up at the tail of a sorted list.
	 */
	private static class HypothesisComparator implements Comparator<Hypothesis> {
		public int compare(Hypothesis firstHypothesis, Hypothesis secondHypothesis) {
			// arguments are swapped on purpose to obtain a descending order
			return Double.compare(secondHypothesis.getCoveredWeight(), firstHypothesis.getCoveredWeight());
		}
	}

	/** Parameter key: selects how rules are admitted to the result (see {@link #DISCOVERY_MODES}). */
	public static final String PARAMETER_DISCOVERY_MODE = "mode";

	/** Display names of the discovery modes, indexed by the DISCOVERY_MODE_* constants. */
	public static final String[] DISCOVERY_MODES = { "above minimum utility" , "k best rules" };

	/** Discovery mode: keep every rule whose utility is at least the minimum utility. */
	public static final int DISCOVERY_MODE_ABOVE_MINIMUM_UTILITY = 0;

	/** Discovery mode: keep only the k rules with the highest utility. */
	public static final int DISCOVERY_MODE_K_BEST_RULES = 1;

	/** Parameter key: index of the utility function used to rank and select rules. */
	public static final String PARAMETER_UTILITY_FUNCTION = "utility_function";

	/** Parameter key: determines which rules are generated from a hypothesis. */
	public static final String PARAMETER_RULE_GENERATION = "rule_generation";

	/** Display names of the rule generation modes (taken from {@link Hypothesis}). */
	public static final String[] RULE_GENERATION_MODES = Hypothesis.RULE_GENERATION_MODES;

	/** Parameter key: maximum number of refinement steps (literals per hypothesis). */
	public static final String PARAMETER_MAX_DEPTH = "max_depth";

	/** Parameter key: minimum utility; also the initial bound used for pruning. */
	public static final String PARAMETER_MIN_UTILITY = "min_utility";

	/** Parameter key: number of rules reported in the k-best-rules mode. */
	public static final String PARAMETER_K_BEST_RULES = "k_best_rules";

	/** Parameter key: coverage fraction a hypothesis has to exceed to be kept. */
	public static final String PARAMETER_MIN_COVERAGE = "min_coverage";

	/** Parameter key: maximum number of hypotheses kept per depth level; -1 disables the bound. */
	public static final String PARAMETER_MAX_CACHE = "max_cache";


	public SubgroupDiscovery(OperatorDescription description) {
		super(description);
	}

	/**
	 * Discovers subgroups by level-wise refinement of hypotheses and returns the
	 * selected rules as a {@link RuleSet}.
	 *
	 * For each depth level the current hypotheses are applied to all examples,
	 * hypotheses not exceeding the minimum coverage are removed, optionally only
	 * the <code>max_cache</code> hypotheses with the highest covered weight are
	 * kept, rules are generated and scored with all utility functions, and
	 * hypotheses whose optimistic estimate reaches the current pruning bound are
	 * refined for the next level.
	 *
	 * @param exampleSet the examples to learn from; its label attribute is passed
	 *                   to the rule generation and its weight attribute, if
	 *                   present, supplies the example weights
	 * @return the rule set, sorted by descending utility of the selected utility function
	 * @throws OperatorException declared by the signature; presumably raised by
	 *                           the parameter lookups (not visible in this file)
	 */
	public Model learn(ExampleSet exampleSet) throws OperatorException {
		int mode = getParameterAsInt(PARAMETER_DISCOVERY_MODE);
		int maxDepth = getParameterAsInt(PARAMETER_MAX_DEPTH);
		double minUtility = getParameterAsDouble(PARAMETER_MIN_UTILITY);
		int kBestRules = getParameterAsInt(PARAMETER_K_BEST_RULES);
		int ruleGenerationMode = getParameterAsInt(PARAMETER_RULE_GENERATION);
		double coverageThreshold = getParameterAsDouble(PARAMETER_MIN_COVERAGE);
		int maxCache = getParameterAsInt(PARAMETER_MAX_CACHE);

		// determine a priori statistics
		int numberOfAttributes = exampleSet.getAttributes().size();
		// examples count with weight 1 unless the example set has a weight attribute
		boolean weighted = exampleSet.getAttributes().getWeight() != null;
		double totalWeight = 0.0d;
		double totalPositiveWeight = 0.0d;
		for (Example example : exampleSet) {
			double weight = weighted ? example.getWeight() : 1.0d;
			totalWeight += weight;
			if (example.getLabel() == example.getAttributes().getLabel().getMapping().getPositiveIndex()) {
				totalPositiveWeight += weight;
			}
		}

		// initialise utility functions
		UtilityFunction[] utilityFunctions = UtilityFunction.getUtilityFunctions(totalWeight, totalPositiveWeight);
		UtilityFunction mainUtilityFunction = utilityFunctions[getParameterAsInt(PARAMETER_UTILITY_FUNCTION)];
		Class<? extends UtilityFunction> mainUtilityClass = mainUtilityFunction.getClass();
		RuleComparator ruleComparator = new RuleComparator(mainUtilityClass);

		List<Rule> acceptedRules = new ArrayList<Rule>();
		// deliberately not presized with kBestRules: the parameter may be as large
		// as Integer.MAX_VALUE, which would exhaust the heap before any rule exists
		List<Rule> bestRules = new ArrayList<Rule>();

		// create initial hypotheses
		LinkedList<Hypothesis> hypotheses = new LinkedList<Hypothesis>();
		Hypothesis emptyHypothesis = new Hypothesis();
		hypotheses.addAll(emptyHypothesis.restrictedRefine(exampleSet.getAttributes()));

		// the search never goes deeper than the number of attributes
		int searchDepth = Math.min(maxDepth, numberOfAttributes);
		for (int depth = 0; depth < searchDepth; depth++) {

			if (hypotheses.isEmpty()) {
				break;
			}

			// evaluate hypotheses on data set
			log("evaluating " + hypotheses.size() + " hypotheses with " + (depth + 1) + " literals");
			for (Example example : exampleSet) {
				for (Hypothesis hypothesis : hypotheses) {
					hypothesis.apply(example);
				}
			}

			// discard hypotheses which cover too few examples
			int discarded = 0;
			for (Iterator<Hypothesis> iterator = hypotheses.iterator(); iterator.hasNext(); ) {
				Hypothesis hypothesis = iterator.next();
				if ((hypothesis.getCoveredWeight() / totalWeight) <= coverageThreshold) {
					iterator.remove();
					discarded++;
				}
			}
			if (discarded > 0) {
				log("removed " + discarded + " hypotheses not exceeding min coverage");
			}

			// optionally keep only the maxCache hypotheses with the highest coverage
			if (maxCache != -1) {
				Collections.sort(hypotheses, new HypothesisComparator());
				int deleteHypotheses = hypotheses.size() - maxCache;
				for (int j = 0; j < deleteHypotheses; j++) {
					hypotheses.removeLast();
				}
				if (deleteHypotheses > 0) {
					log("removed " + deleteHypotheses + " hypotheses with the lowest coverage");
				}
			}

			log("generating rules from " + hypotheses.size() + " hypotheses");
			LinkedList<Hypothesis> nextHypotheses = new LinkedList<Hypothesis>();
			for (Hypothesis hypothesis : hypotheses) {

				LinkedList<Rule> rules = hypothesis.generateRules(ruleGenerationMode, exampleSet.getAttributes().getLabel());
				for (Rule rule : rules) {

					// utility evaluation: store the value of every utility function on the rule
					for (int j = 0; j < utilityFunctions.length; j++) {
						rule.setUtility(utilityFunctions[j], utilityFunctions[j].utility(rule));
					}
					double utility = mainUtilityFunction.utility(rule);

					// add rule to result rule set ...
					switch (mode) {

					// ... if it reaches the utility threshold ...
					case DISCOVERY_MODE_ABOVE_MINIMUM_UTILITY:
						if (utility >= minUtility) {
							acceptedRules.add(rule);
							log("scored: " + rule);
						}
						break;

					// ... or if it is among the (current) k best rules
					case DISCOVERY_MODE_K_BEST_RULES:
						if (bestRules.size() < kBestRules) {
							bestRules.add(rule);
							log("scored: " + rule + " [q(h)=" + utility + "]");
							Collections.sort(bestRules, ruleComparator);
						} else if (utility > bestRules.get(kBestRules - 1).getUtility(mainUtilityClass)) {
							// replace the currently worst of the k best rules
							bestRules.set(kBestRules - 1, rule);
							log("scored: " + rule + " [q(h)=" + utility + "]");
							Collections.sort(bestRules, ruleComparator);
							// The pruning bound must be the utility of the k-th best rule
							// after re-sorting, not the utility of the rule just inserted:
							// the new rule may be the overall best, and using its utility
							// would prune hypotheses whose refinements could still enter
							// the k best rules.
							minUtility = bestRules.get(kBestRules - 1).getUtility(mainUtilityClass);
						}
						break;
					}

				}

				// prune (do not consider hypothesis further) or add refinements to new hypothesis list
				double optimisticEstimate = mainUtilityFunction.optimisticEstimate(hypothesis);
				if (optimisticEstimate >= minUtility) {
					for (Hypothesis nextHypothesis : hypothesis.restrictedRefine()) {
						nextHypotheses.add(nextHypothesis);
					}
				}
			}
			hypotheses = nextHypotheses;
		}

		// create model: rules are added in order of descending utility
		RuleSet model = new RuleSet(exampleSet);
		switch (mode) {
		case DISCOVERY_MODE_ABOVE_MINIMUM_UTILITY:
			Collections.sort(acceptedRules, ruleComparator);
			for (Rule rule : acceptedRules) {
				model.addRule(rule);
			}
			break;
		case DISCOVERY_MODE_K_BEST_RULES:
			Collections.sort(bestRules, ruleComparator);
			for (Rule rule : bestRules) {
				model.addRule(rule);
			}
			break;
		}
		return model;
	}
	
	/** Returns the class of the model produced by {@link #learn(ExampleSet)}. */
	@Override
	public Class<? extends PredictionModel> getModelClass() {
		return RuleSet.class;
	}

	/**
	 * Reports the capabilities of this learner: polynominal and binominal
	 * attributes, a binominal label and weighted examples. Every other
	 * capability is reported as unsupported.
	 */
	public boolean supportsCapability(OperatorCapability lc) {
		return lc == OperatorCapability.POLYNOMINAL_ATTRIBUTES
				|| lc == OperatorCapability.BINOMINAL_ATTRIBUTES
				|| lc == OperatorCapability.BINOMINAL_LABEL
				|| lc == OperatorCapability.WEIGHTED_EXAMPLES;
	}

	/**
	 * Appends the parameters of this operator to those of the superclass:
	 * discovery mode, utility function, minimum utility, k, rule generation mode,
	 * maximum depth, minimum coverage and maximum cache size.
	 */
	@Override
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = super.getParameterTypes();
		types.add(new ParameterTypeCategory(PARAMETER_DISCOVERY_MODE, "Discovery mode.", DISCOVERY_MODES, 1, false));
		types.add(new ParameterTypeCategory(PARAMETER_UTILITY_FUNCTION, "Utility function.", UtilityFunction.FUNCTIONS, UtilityFunction.WRACC));
		types.add(new ParameterTypeDouble(PARAMETER_MIN_UTILITY, "Minimum quality which has to be reached.", Double.NEGATIVE_INFINITY, Double.POSITIVE_INFINITY, 0.0));
		types.add(new ParameterTypeInt(PARAMETER_K_BEST_RULES, "Report the k best rules.", 1, Integer.MAX_VALUE, 10, false));
		types.add(new ParameterTypeCategory(PARAMETER_RULE_GENERATION, "Determines which rules are generated.", RULE_GENERATION_MODES, Hypothesis.POSITIVE_AND_NEGATIVE_RULES));
		types.add(new ParameterTypeInt(PARAMETER_MAX_DEPTH, "Maximum depth of BFS.", 0, Integer.MAX_VALUE, 5));
		types.add(new ParameterTypeDouble(PARAMETER_MIN_COVERAGE, "Only consider rules which exceed the given coverage threshold.", 0, 1, 0));
		// -1 is the sentinel for "no bound" (see the maxCache check in learn)
		types.add(new ParameterTypeInt(PARAMETER_MAX_CACHE, "Bounds the number of rules which are evaluated (only the most supported rules are used).", -1, Integer.MAX_VALUE, -1));
		return types;
	}
}