/*
* RapidMiner
*
* Copyright (C) 2001-2011 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing.transformation.aggregation;
import java.lang.reflect.Constructor;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.DoubleArrayDataRow;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ProcessSetupError.Severity;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.SimpleMetaDataError;
import com.rapidminer.tools.Ontology;
/**
* This is an abstract class for all {@link AggregationFunction}s, that can be selected to
* aggregate values of a certain group.
* Each {@link AggregationFunction} must be able to provide a certain {@link Aggregator}, that
* will count the examples of one single group and compute the aggregated value. So for example the {@link MeanAggregationFunction}
* provides an {@link MeanAggregator}, that will calculate the mean on all examples delivered to him.
*
* The list of the names of all available functions can be queried from the static method {@link #getAvailableAggregationFunctionNames()}.
* With a name one can call the static method {@link #createAggregationFunction(String, Attribute)} to
* create a certain aggregator for the actual counting.
*
* Additional functions can be registered by calling {@link #registerNewAggregationFunction(String, Class)} from
* extensions, preferable during their initialization. Please notice that there will be no warning prior process execution
* if the extension is missing but the usage of it's function is still configured.
*
* @author Sebastian Land
*/
public abstract class AggregationFunction {
public static final String FUNCTION_SEPARATOR_OPEN = "(";
public static final String FUNCTION_SEPARATOR_CLOSE = ")";
public static final Map<String, Class<? extends AggregationFunction>> AGGREATION_FUNCTIONS = new TreeMap<String, Class<? extends AggregationFunction>>();
static {
AGGREATION_FUNCTIONS.put("sum", SumAggregationFunction.class);
AGGREATION_FUNCTIONS.put("median", MedianAggregationFunction.class);
AGGREATION_FUNCTIONS.put("average", MeanAggregationFunction.class);
AGGREATION_FUNCTIONS.put("variance", VarianceAggregationFunction.class);
AGGREATION_FUNCTIONS.put("standard_deviation", StandardDeviationAggregationFunction.class);
AGGREATION_FUNCTIONS.put("count (ignoring missings)", CountIgnoringMissingsAggregationFunction.class);
AGGREATION_FUNCTIONS.put("count (including missings)", CountIncludingMissingsAggregationFunction.class);
AGGREATION_FUNCTIONS.put("count", CountAggregationFunction.class);
AGGREATION_FUNCTIONS.put("minimum", MinAggregationFunction.class);
AGGREATION_FUNCTIONS.put("maximum", MaxAggregationFunction.class);
AGGREATION_FUNCTIONS.put("log product", LogProductAggregationFunction.class);
AGGREATION_FUNCTIONS.put("product", ProductAggregationFunction.class);
// Nominal Aggregations
AGGREATION_FUNCTIONS.put("mode", ModeAggregationFunction.class);
AGGREATION_FUNCTIONS.put("least", LeastAggregationFunction.class);
AGGREATION_FUNCTIONS.put("least (only occurring)", LeastOccurringAggregationFunction.class);
}
public static final Map<String, AggregationFunctionMetaDataProvider> AGGREGATION_FUNCTIONS_META_DATA_PROVIDER = new HashMap<String, AggregationFunctionMetaDataProvider>();
static {
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("sum", new DefaultAggregationFunctionMetaDataProvider("sum", SumAggregationFunction.FUNCTION_SUM, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL }));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("median", new DefaultAggregationFunctionMetaDataProvider("median", MedianAggregationFunction.FUNCTION_MEDIAN, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL, Ontology.DATE_TIME }));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("average", new DefaultAggregationFunctionMetaDataProvider("average", MeanAggregationFunction.FUNCTION_AVERAGE, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL, Ontology.DATE_TIME }, Ontology.REAL));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("variance", new DefaultAggregationFunctionMetaDataProvider("variance", VarianceAggregationFunction.FUNCTION_VARIANCE, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL }, Ontology.REAL));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("standard_deviation", new DefaultAggregationFunctionMetaDataProvider("standard_deviation", StandardDeviationAggregationFunction.FUNCTION_STANDARD_DEVIATION, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL }, Ontology.REAL));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("count (ignoring missings)", new DefaultAggregationFunctionMetaDataProvider("count (ignoring missings)", CountIgnoringMissingsAggregationFunction.FUNCTION_COUNT, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.ATTRIBUTE_VALUE }, Ontology.INTEGER));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("count (including missings)", new DefaultAggregationFunctionMetaDataProvider("count (including missings)", CountIncludingMissingsAggregationFunction.FUNCTION_COUNT, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.ATTRIBUTE_VALUE }, Ontology.INTEGER));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("count", new DefaultAggregationFunctionMetaDataProvider("count", CountAggregationFunction.FUNCTION_COUNT, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.ATTRIBUTE_VALUE }, Ontology.INTEGER));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("minimum", new DefaultAggregationFunctionMetaDataProvider("minimum", MinAggregationFunction.FUNCTION_MIN, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL, Ontology.DATE_TIME }, Ontology.REAL));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("maximum", new DefaultAggregationFunctionMetaDataProvider("maximum", MaxAggregationFunction.FUNCTION_MAX, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL, Ontology.DATE_TIME }, Ontology.REAL));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("log product", new DefaultAggregationFunctionMetaDataProvider("log product", LogProductAggregationFunction.FUNCTION_LOG_PRODUCT, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL }, Ontology.REAL));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("product", new DefaultAggregationFunctionMetaDataProvider("product", ProductAggregationFunction.FUNCTION_PRODUCT, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL }, Ontology.REAL));
// Nominal Aggregations
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("mode", new DefaultAggregationFunctionMetaDataProvider("mode", ModeAggregationFunction.FUNCTION_MODE, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.ATTRIBUTE_VALUE }));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("least", new DefaultAggregationFunctionMetaDataProvider("least", LeastAggregationFunction.FUNCTION_LEAST, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NOMINAL }, Ontology.POLYNOMINAL));
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put("least (only occurring)", new DefaultAggregationFunctionMetaDataProvider("least (only occurring)", LeastOccurringAggregationFunction.FUNCTION_LEAST_OCCURRING, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NOMINAL }, Ontology.POLYNOMINAL));
}
private Attribute sourceAttribute;
private boolean isIgnoringMissings;
private boolean isCountingOnlyDistinct;
public AggregationFunction(Attribute sourceAttribute, boolean ignoreMissings, boolean countOnlyDistinct) {
this.sourceAttribute = sourceAttribute;
this.isIgnoringMissings = ignoreMissings;
this.isCountingOnlyDistinct = countOnlyDistinct;
}
/**
* This returns the attribute this aggregation function will derive the data from.
*/
public Attribute getSourceAttribute() {
return sourceAttribute;
}
/**
* This returns the attribute that will be created in the resulting {@link ExampleSet} to
* get the aggregated values for each group.
*/
public abstract Attribute getTargetAttribute();
/**
* This will return the {@link Aggregator} object that computes the value of this
* particular {@link AggregationFunction} for a specific group.
*/
public abstract Aggregator createAggregator();
/**
* This determines, if any missing values will be just ignored or counted with the
* respective aggregation function. Some functions might cope with that, others will
* just turn to be NaN.
*/
public boolean isIgnoringMissings() {
return isIgnoringMissings;
}
/**
* This determines, if values are counted only once, if occurring more than once. Please note
* that will increase the memory load drastically on numerical attributes.
*/
public boolean isCountingOnlyDistinct() {
return isCountingOnlyDistinct;
}
/**
* This will return whether this {@link AggregationFunction} is compatible with the given
* sourceAttribute.
*/
public abstract boolean isCompatible();
/**
* This method will fill in the default value of this aggregation function. It has to
* maintain the mapping, if the function is nominal.
* The default value will be a NaN. Every subclass that wants to change this, has to override
* this method.
*/
public void setDefault(Attribute attribute, DoubleArrayDataRow row) {
row.set(attribute, Double.NaN);
}
/**
* This will create the {@link AggregationFunction} with the given name for the given
* source Attribute. This method might return
*/
public static final AggregationFunction createAggregationFunction(String name, Attribute sourceAttribute, boolean ignoreMissings, boolean countOnlyDistinct) throws OperatorException {
Class<? extends AggregationFunction> aggregationFunctionClass = AGGREATION_FUNCTIONS.get(name);
if (aggregationFunctionClass == null)
throw new UserError(null, "aggregation.illegal_function_name", name);
try {
Constructor<? extends AggregationFunction> constructor = aggregationFunctionClass.getConstructor(Attribute.class, boolean.class, boolean.class);
return constructor.newInstance(sourceAttribute, ignoreMissings, countOnlyDistinct);
} catch (Exception e) {
throw new RuntimeException("All implementations of AggregationFunction need to have a constructor accepting an Attribute and boolean. Other reasons for this error may be class loader problems.", e);
}
}
/**
* This method can be called in order to get the target attribute meta data after the
* aggregation functions have been applied.
* This method can register errors on the given InputPort, if there's an illegal state. If
* the state makes applying an {@link AggregationFunction} impossible, this method will return null!
*/
public static final AttributeMetaData getAttributeMetaData(String aggregationFunctionName, AttributeMetaData sourceAttributeMetaData, InputPort inputPort) {
AggregationFunctionMetaDataProvider metaDataProvider = AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.get(aggregationFunctionName);
if (metaDataProvider != null) {
return metaDataProvider.getTargetAttributeMetaData(sourceAttributeMetaData, inputPort);
} else {
// register error about unknown aggregation function
inputPort.addError(new SimpleMetaDataError(Severity.ERROR, inputPort, "aggregation.unknown_aggregation_function", aggregationFunctionName));
return null;
}
}
/**
* This method will return the array containing the names of all available
* aggregation functions. The names are sorted according to natural ordering.
*/
public static String[] getAvailableAggregationFunctionNames() {
String[] names = new String[AGGREATION_FUNCTIONS.size()];
int i = 0;
for (String name: AGGREATION_FUNCTIONS.keySet()) {
names[i] = name;
i++;
}
return names;
}
/**
* With this method extensions might register additional aggregation functions if needed.
*/
public static void registerNewAggregationFunction(String name, Class<? extends AggregationFunction> clazz, AggregationFunctionMetaDataProvider metaDataProvider) {
AGGREATION_FUNCTIONS.put(name, clazz);
AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(name, metaDataProvider);
}
}