// GroupedANOVAOperator.java example
//
// Explorer
// rapidminer-vega-master
/*
 *  RapidMiner
 *
 *  Copyright (C) 2001-2011 by Rapid-I and the contributors
 *
 *  Complete list of developers available at our web site:
 *
 *       http://rapid-i.com
 *
 *  This program is free software: you can redistribute it and/or modify
 *  it under the terms of the GNU Affero General Public License as published by
 *  the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Affero General Public License for more details.
 *
 *  You should have received a copy of the GNU Affero General Public License
 *  along with this program.  If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.preprocessing.transformation;

import java.util.Collection;
import java.util.LinkedList;
import java.util.List;
import java.util.TreeSet;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.set.SplittedExampleSet;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.annotation.ResourceConsumptionEstimator;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.operator.ports.metadata.AttributeSetPrecondition;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.GenerateNewMDRule;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeAttribute;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeDouble;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorResourceConsumptionHandler;
import com.rapidminer.tools.math.AnovaCalculator;
import com.rapidminer.tools.math.SignificanceCalculationException;
import com.rapidminer.tools.math.SignificanceTestResult;
import com.rapidminer.tools.math.function.aggregation.AggregationFunction;
import com.rapidminer.tools.math.function.aggregation.AverageFunction;
import com.rapidminer.tools.math.function.aggregation.VarianceFunction;

/**
 * <p>
 * This operator creates groups of the input example set based on the defined grouping attribute. For each of the groups
 * the mean and variance of another attribute (the anova attribute) is calculated and an ANalysis Of VAriance (ANOVA) is
 * performed. The result will be a significance test result for the specified significance level indicating if the
 * values for the attribute are significantly different between the groups defined by the grouping attribute.
 * </p>
 * <p>
 * The input example set is passed through unchanged on the second output port once the test has been calculated
 * successfully.
 * </p>
 * 
 * @author Ingo Mierswa
 */
public class GroupedANOVAOperator extends Operator {

	private final InputPort exampleSetInput = getInputPorts().createPort("example set", new ExampleSetMetaData());
	private final OutputPort significanceOutput = getOutputPorts().createPort("significance");
	private final OutputPort exampleSetOutput = getOutputPorts().createPort("example set");

	/** Parameter key: name of the attribute whose values are compared between the groups. */
	public static final String PARAMETER_ANOVA_ATTRIBUTE = "anova_attribute";

	/** Parameter key: name of the attribute whose values define the groups. */
	public static final String PARAMETER_GROUP_BY_ATTRIBUTE = "group_by_attribute";

	/** Parameter key: significance level (alpha) handed to the ANOVA calculator. */
	public static final String PARAMETER_SIGNIFICANCE_LEVEL = "significance_level";

	/** Parameter key: if set, each group contributes only its distinct values. */
	public static final String PARAMETER_ONLY_DISTINCT = "only_distinct";

	/**
	 * Creates the operator and registers its meta data rules: the significance port generates a
	 * {@link SignificanceTestResult}, the example set is passed through, and the two attribute
	 * parameters are checked against the input meta data.
	 * 
	 * @param desc the operator description handed to the {@link Operator} super class
	 */
	public GroupedANOVAOperator(OperatorDescription desc) {
		super(desc);
		getTransformer().addRule(new GenerateNewMDRule(significanceOutput, SignificanceTestResult.class));
		getTransformer().addPassThroughRule(exampleSetInput, exampleSetOutput);
		exampleSetInput.addPrecondition(new AttributeSetPrecondition(exampleSetInput, AttributeSetPrecondition.getAttributesByParameter(this, PARAMETER_ANOVA_ATTRIBUTE), Ontology.NUMERICAL));
		exampleSetInput.addPrecondition(new AttributeSetPrecondition(exampleSetInput, AttributeSetPrecondition.getAttributesByParameter(this, PARAMETER_GROUP_BY_ATTRIBUTE), Ontology.NOMINAL));
	}

	/**
	 * Reads the example set from the input port and delivers the ANOVA result on the significance
	 * port. The example set itself is delivered by {@link #apply(ExampleSet)}.
	 * 
	 * @throws OperatorException if {@link #apply(ExampleSet)} fails
	 */
	@Override
	public void doWork() throws OperatorException {
		ExampleSet exampleSet = exampleSetInput.getData();
		significanceOutput.deliver(apply(exampleSet));
	}

	/**
	 * Performs the grouped ANOVA on the given example set and, on success, delivers the example
	 * set on the example set output port.
	 * 
	 * @param exampleSet the data to analyze
	 * @return the result of the significance test
	 * @throws OperatorException if one of the configured attributes does not exist, the anova
	 *             attribute is nominal, the grouping attribute is not nominal, or the
	 *             significance calculation fails
	 */
	public SignificanceTestResult apply(ExampleSet exampleSet) throws OperatorException {
		// init and checks
		String attributeName = getParameterAsString(PARAMETER_ANOVA_ATTRIBUTE);
		String groupByAttributeName = getParameterAsString(PARAMETER_GROUP_BY_ATTRIBUTE);
		boolean onlyDistinct = getParameterAsBoolean(PARAMETER_ONLY_DISTINCT);

		Attribute anovaAttribute = exampleSet.getAttributes().get(attributeName);
		if (anovaAttribute == null) {
			throw new UserError(this, 111, attributeName);
		}
		if (anovaAttribute.isNominal()) {
			throw new UserError(this, 104, new Object[] { "anova calculation", attributeName });
		}

		Attribute groupByAttribute = exampleSet.getAttributes().get(groupByAttributeName);
		if (groupByAttribute == null) {
			throw new UserError(this, 111, groupByAttributeName);
		}
		if (!groupByAttribute.isNominal()) {
			throw new UserError(this, 103, new Object[] { "the parameter grouping by", groupByAttributeName });
		}

		// create anova calculator
		AnovaCalculator anovaCalculator = new AnovaCalculator();
		double alpha = getParameterAsDouble(PARAMETER_SIGNIFICANCE_LEVEL);
		anovaCalculator.setAlpha(alpha);

		// add groups
		SplittedExampleSet grouped = SplittedExampleSet.splitByAttribute(exampleSet, groupByAttribute);
		AggregationFunction meanFunction = new AverageFunction();
		AggregationFunction varianceFunction = new VarianceFunction();
		int numberOfGroups = grouped.getNumberOfSubsets();
		for (int i = 0; i < numberOfGroups; i++) {
			grouped.selectSingleSubset(i);
			double[] values = getValues(grouped, anovaAttribute, onlyDistinct);
			double mean = meanFunction.calculate(values);
			double variance = varianceFunction.calculate(values);
			// The group size must describe the same sample the mean and variance were computed
			// from. With only_distinct enabled this is the number of distinct values, not the
			// number of rows in the group; without it both counts coincide.
			anovaCalculator.addGroup(values.length, mean, variance);
		}

		// calculate and return result
		final SignificanceTestResult result;
		try {
			result = anovaCalculator.performSignificanceTest();
		} catch (SignificanceCalculationException e) {
			// keep the original exception as cause so the stack trace is not lost
			throw new UserError(this, e, 920, e.getMessage());
		}

		// only reached if the test succeeded
		exampleSetOutput.deliver(exampleSet);

		return result;
	}

	/**
	 * Collects the values of the given attribute from all examples of the given example set.
	 * 
	 * @param exampleSet the examples to read from
	 * @param attribute the attribute whose values are collected
	 * @param onlyDistinct if <code>true</code>, every value is returned only once (in ascending
	 *            order); otherwise one value per example is returned in iteration order
	 * @return the collected values
	 */
	private double[] getValues(ExampleSet exampleSet, Attribute attribute, boolean onlyDistinct) {
		Collection<Double> valueCollection;
		if (onlyDistinct) {
			// a sorted set drops duplicate values
			valueCollection = new TreeSet<Double>();
		} else {
			valueCollection = new LinkedList<Double>();
		}

		for (Example e : exampleSet) {
			valueCollection.add(e.getValue(attribute));
		}

		double[] result = new double[valueCollection.size()];
		int counter = 0;
		for (double d : valueCollection) {
			result[counter++] = d;
		}
		return result;
	}

	/**
	 * Decides whether the given output port should be connected automatically. For the example
	 * set output this depends on the boolean parameter "keep_example_set"; all other ports are
	 * delegated to the super class.
	 * 
	 * @param port the output port in question
	 * @return <code>true</code> if the port should be auto-connected
	 */
	@Override
	public boolean shouldAutoConnect(OutputPort port) {
		if (port == exampleSetOutput) {
			// NOTE(review): "keep_example_set" is not declared in getParameterTypes() of this
			// class - confirm that a super class defines it or that an unknown key is tolerated.
			return getParameterAsBoolean("keep_example_set");
		} else {
			return super.shouldAutoConnect(port);
		}
	}

	/**
	 * Extends the parameters of the super class by the anova attribute, the grouping attribute,
	 * the significance level and the only-distinct flag.
	 * 
	 * @return the list of parameter types of this operator
	 */
	@Override
	public List<ParameterType> getParameterTypes() {
		List<ParameterType> types = super.getParameterTypes();
		types.add(new ParameterTypeAttribute(PARAMETER_ANOVA_ATTRIBUTE, "Calculate the ANOVA for this attribute based on the groups defines by " + PARAMETER_GROUP_BY_ATTRIBUTE + ".", exampleSetInput, false));
		types.add(new ParameterTypeAttribute(PARAMETER_GROUP_BY_ATTRIBUTE, "Performs a grouping by the values of the attribute with this name.", exampleSetInput, false));
		types.add(new ParameterTypeDouble(PARAMETER_SIGNIFICANCE_LEVEL, "The significance level for the ANOVA calculation.", 0.0d, 1.0d, 0.05d, false));
		types.add(new ParameterTypeBoolean(PARAMETER_ONLY_DISTINCT, "Indicates if only rows with distinct values for the aggregation attribute should be used for the calculation of the aggregation function.", false));
		return types;
	}
	
	/**
	 * Returns the resource consumption estimator looked up for this operator class, based on the
	 * first input port.
	 * 
	 * @return the estimator provided by {@link OperatorResourceConsumptionHandler}
	 */
	@Override
	public ResourceConsumptionEstimator getResourceConsumptionEstimator() {
		return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPorts().getPortByIndex(0), GroupedANOVAOperator.class, null);
	}
}