/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.olap; import java.util.Collection; import java.util.LinkedList; import java.util.List; import java.util.TreeSet; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.set.SplittedExampleSet; import com.rapidminer.operator.IOObject; import com.rapidminer.operator.InputDescription; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeString; import com.rapidminer.tools.math.AnovaCalculator; import com.rapidminer.tools.math.SignificanceCalculationException; import com.rapidminer.tools.math.SignificanceTestResult; import com.rapidminer.tools.math.function.AggregationFunction; import com.rapidminer.tools.math.function.AverageFunction; import com.rapidminer.tools.math.function.VarianceFunction; /** * <p>This operator creates groups of the input example set based on * the defined grouping attribute. For each of the groups the mean and * variance of another attribute (the anova attribute) is calculated * and an ANalysis Of VAriance (ANOVA) is performed. The result will * be a significance test result for the specified significance level * indicating if the values for the attribute are significantly different * between the groups defined by the grouping attribute.</p> * * @author Ingo Mierswa * @version $Id: GroupedANOVAOperator.java,v 1.6 2008/07/07 07:06:46 ingomierswa Exp $ */ public class GroupedANOVAOperator extends Operator { public static final String PARAMETER_ANOVA_ATTRIBUTE = "anova_attribute"; public static final String PARAMETER_GROUP_BY_ATTRIBUTE = "group_by_attribute"; public static final String PARAMETER_SIGNIFICANCE_LEVEL = "significance_level"; public static final String PARAMETER_ONLY_DISTINCT = "only_distinct"; public GroupedANOVAOperator(OperatorDescription desc) { super(desc); } public IOObject[] apply() throws OperatorException { ExampleSet exampleSet = getInput(ExampleSet.class); // init and checks String attributeName = getParameterAsString(PARAMETER_ANOVA_ATTRIBUTE); String groupByAttributeName = getParameterAsString(PARAMETER_GROUP_BY_ATTRIBUTE); boolean onlyDistinct = getParameterAsBoolean(PARAMETER_ONLY_DISTINCT); Attribute anovaAttribute = exampleSet.getAttributes().get(attributeName); if (anovaAttribute == null) { throw new UserError(this, 111, this.getParameterAsString(PARAMETER_ANOVA_ATTRIBUTE)); } if (anovaAttribute.isNominal()) { throw new UserError(this, 104, new Object[] { this.getParameterAsString(PARAMETER_ANOVA_ATTRIBUTE), "anova calculation" }); } Attribute groupByAttribute = exampleSet.getAttributes().get(groupByAttributeName); if (groupByAttribute == null) { throw new UserError(this, 111, this.getParameterAsString(PARAMETER_GROUP_BY_ATTRIBUTE)); } if (!groupByAttribute.isNominal()) { throw new UserError(this, 103, new Object[] { this.getParameterAsString(PARAMETER_GROUP_BY_ATTRIBUTE), "grouping by attribute." }); } // create anova calculator AnovaCalculator anovaCalculator = new AnovaCalculator(); double alpha = getParameterAsDouble(PARAMETER_SIGNIFICANCE_LEVEL); anovaCalculator.setAlpha(alpha); // add groups SplittedExampleSet grouped = SplittedExampleSet.splitByAttribute(exampleSet, groupByAttribute); AggregationFunction meanFunction = new AverageFunction(); AggregationFunction varianceFunction = new VarianceFunction(); for (int i = 0; i < grouped.getNumberOfSubsets(); i++) { grouped.selectSingleSubset(i); double[] values = getValues(grouped, anovaAttribute, onlyDistinct); double mean = meanFunction.calculate(values); double variance = varianceFunction.calculate(values); anovaCalculator.addGroup(grouped.size(), mean, variance); } // calculate and return result SignificanceTestResult result = null; try { result = anovaCalculator.performSignificanceTest(); } catch (SignificanceCalculationException e) { throw new UserError(this, 920, e.getMessage()); } return new IOObject[] { result }; } private double[] getValues(ExampleSet exampleSet, Attribute attribute, boolean onlyDistinct) { Collection<Double> valueCollection = new LinkedList<Double>(); if (onlyDistinct) valueCollection = new TreeSet<Double>(); for (Example e : exampleSet) { valueCollection.add(e.getValue(attribute)); } double[] result = new double[valueCollection.size()]; int counter = 0; for (double d : valueCollection) result[counter++] = d; return result; } /** Indicates that the consumption of example sets can be user defined (default: no consumption). */ public InputDescription getInputDescription(Class cls) { if (ExampleSet.class.isAssignableFrom(cls)) { return new InputDescription(cls, false, true); } else { return super.getInputDescription(cls); } } public Class<?>[] getInputClasses() { return new Class[] { ExampleSet.class }; } public Class<?>[] getOutputClasses() { return new Class[] { SignificanceTestResult.class }; } public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); types.add(new ParameterTypeString(PARAMETER_ANOVA_ATTRIBUTE, "Calculate the ANOVA for this attribute based on the groups defines by " + PARAMETER_GROUP_BY_ATTRIBUTE + ".", false)); types.add(new ParameterTypeString(PARAMETER_GROUP_BY_ATTRIBUTE, "Performs a grouping by the values of the attribute with this name.", false)); types.add(new ParameterTypeDouble(PARAMETER_SIGNIFICANCE_LEVEL, "The significance level for the ANOVA calculation.", 0.0d, 1.0d, 0.05d)); types.add(new ParameterTypeBoolean(PARAMETER_ONLY_DISTINCT, "Indicates if only rows with distinct values for the aggregation attribute should be used for the calculation of the aggregation function.", false)); return types; } }