/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.transformation.aggregation; import java.lang.reflect.Constructor; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.TreeMap; import com.rapidminer.example.Attribute; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.example.table.DoubleArrayDataRow; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.OperatorVersion; import com.rapidminer.operator.ProcessSetupError.Severity; import com.rapidminer.operator.UserError; import com.rapidminer.operator.ports.InputPort; import com.rapidminer.operator.ports.metadata.AttributeMetaData; import com.rapidminer.operator.ports.metadata.SimpleMetaDataError; import com.rapidminer.tools.Ontology; /** * This is an abstract class for all {@link AggregationFunction}s, that can be selected to aggregate * values of a certain group. Each {@link AggregationFunction} must be able to provide a certain * {@link Aggregator}, that will count the examples of one single group and compute the aggregated * value. So for example the {@link MeanAggregationFunction} provides an {@link MeanAggregator}, * that will calculate the mean on all examples delivered to him. * * The list of the names of all available functions can be queried from the static method * {@link #getAvailableAggregationFunctionNames()}. With a name one can call the static method * {@link #createAggregationFunction(String, Attribute)} to create a certain aggregator for the * actual counting. * * Additional functions can be registered by calling * {@link #registerNewAggregationFunction(String, Class)} from extensions, preferable during their * initialization. Please notice that there will be no warning prior process execution if the * extension is missing but the usage of it's function is still configured. * * @author Sebastian Land, Marius Helf */ public abstract class AggregationFunction { public static final String FUNCTION_SEPARATOR_OPEN = "("; public static final String FUNCTION_SEPARATOR_CLOSE = ")"; // available functions public static final String FUNCTION_NAME_SUM = "sum"; public static final String FUNCTION_NAME_SUM_FRACTIONAL = "sum (fractional)"; public static final String FUNCTION_NAME_MEDIAN = "median"; public static final String FUNCTION_NAME_AVERAGE = "average"; public static final String FUNCTION_NAME_VARIANCE = "variance"; public static final String FUNCTION_NAME_STANDARD_DEVIATION = "standard_deviation"; public static final String FUNCTION_NAME_COUNT_IGNORE_MISSINGS = "count (ignoring missings)"; public static final String FUNCTION_NAME_COUNT_INCLUDE_MISSINGS = "count (including missings)"; public static final String FUNCTION_NAME_COUNT = "count"; public static final String FUNCTION_NAME_COUNT_FRACTIONAL = "count (fractional)"; public static final String FUNCTION_NAME_COUNT_PERCENTAGE = "count (percentage)"; public static final String FUNCTION_NAME_MINIMUM = "minimum"; public static final String FUNCTION_NAME_MAXIMUM = "maximum"; public static final String FUNCTION_NAME_LOG_PRODUCT = "log product"; public static final String FUNCTION_NAME_PRODOCT = "product"; public static final String FUNCTION_NAME_MODE = "mode"; public static final String FUNCTION_NAME_LEAST = "least"; public static final String FUNCTION_NAME_LEAST_ONLY_OCCURRING = "least (only occurring)"; public static final String FUNCTION_NAME_CONCATENATION = "concatenation"; public static final Map<String, Class<? extends AggregationFunction>> AGGREATION_FUNCTIONS = new TreeMap<>(); static { // numerical/date AGGREATION_FUNCTIONS.put(FUNCTION_NAME_SUM, SumAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_SUM_FRACTIONAL, SumFractionalAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_MEDIAN, MedianAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_AVERAGE, MeanAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_VARIANCE, VarianceAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_STANDARD_DEVIATION, StandardDeviationAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_MINIMUM, MinAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_MAXIMUM, MaxAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_LOG_PRODUCT, LogProductAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_PRODOCT, ProductAggregationFunction.class); // numerical/date/nominal AGGREATION_FUNCTIONS.put(FUNCTION_NAME_COUNT_IGNORE_MISSINGS, CountIgnoringMissingsAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_COUNT_INCLUDE_MISSINGS, CountIncludingMissingsAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_COUNT, CountAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_COUNT_FRACTIONAL, CountFractionalAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_COUNT_PERCENTAGE, CountPercentageAggregationFunction.class); // Nominal Aggregations AGGREATION_FUNCTIONS.put(FUNCTION_NAME_MODE, ModeAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_LEAST, LeastAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_LEAST_ONLY_OCCURRING, LeastOccurringAggregationFunction.class); AGGREATION_FUNCTIONS.put(FUNCTION_NAME_CONCATENATION, ConcatAggregationFunction.class); } /** * This map contains legacy aggregation function names and the class, which contains the legacy * functionality. Each of the map elements has to be represented in the * LEGACY_AGGREATION_FUNCTIONS_VERSIONS map, too. */ private static final Map<String, Class<? extends AggregationFunction>> LEGACY_AGGREATION_FUNCTIONS = new TreeMap<>(); static { // median has been replaced after version 7.4.1 LEGACY_AGGREATION_FUNCTIONS.put(FUNCTION_NAME_MEDIAN, MedianAggregationFunctionLegacy.class); } /** * This map contains legacy aggregation function names and the {@link OperatorVersion} until the * legacy function should be used. Each of the map elements has to be represented in the * LEGACY_AGGREATION_FUNCTIONS map, too. */ private static final Map<String, OperatorVersion> LEGACY_AGGREATION_FUNCTIONS_VERSIONS = new TreeMap<>(); static { LEGACY_AGGREATION_FUNCTIONS_VERSIONS.put(FUNCTION_NAME_MEDIAN, AggregationOperator.VERSION_7_4_0); } public static final Map<String, AggregationFunctionMetaDataProvider> AGGREGATION_FUNCTIONS_META_DATA_PROVIDER = new HashMap<>(); static { HashMap<Integer, Integer> transformationRules = new HashMap<Integer, Integer>() { private static final long serialVersionUID = 8941596913239332241L; { put(Ontology.DATE_TIME, Ontology.DATE_TIME); put(Ontology.NUMERICAL, Ontology.REAL); } }; AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_SUM, new DefaultAggregationFunctionMetaDataProvider(FUNCTION_NAME_SUM, SumAggregationFunction.FUNCTION_SUM, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL })); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_SUM_FRACTIONAL, new DefaultAggregationFunctionMetaDataProvider("fractionalSum", SumFractionalAggregationFunction.FUNCTION_SUM_FRACTIONAL, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL })); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_MEDIAN, new MappingAggregationFunctionMetaDataProvider(FUNCTION_NAME_MEDIAN, MedianAggregationFunction.FUNCTION_MEDIAN, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, transformationRules)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_AVERAGE, new MappingAggregationFunctionMetaDataProvider(FUNCTION_NAME_AVERAGE, MeanAggregationFunction.FUNCTION_AVERAGE, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, transformationRules)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_VARIANCE, new DefaultAggregationFunctionMetaDataProvider(FUNCTION_NAME_VARIANCE, VarianceAggregationFunction.FUNCTION_VARIANCE, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL }, Ontology.REAL)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_STANDARD_DEVIATION, new DefaultAggregationFunctionMetaDataProvider(FUNCTION_NAME_STANDARD_DEVIATION, StandardDeviationAggregationFunction.FUNCTION_STANDARD_DEVIATION, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL }, Ontology.REAL)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_COUNT_IGNORE_MISSINGS, new DefaultAggregationFunctionMetaDataProvider(FUNCTION_NAME_COUNT_IGNORE_MISSINGS, CountIgnoringMissingsAggregationFunction.FUNCTION_COUNT, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.ATTRIBUTE_VALUE }, Ontology.INTEGER)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_COUNT_INCLUDE_MISSINGS, new DefaultAggregationFunctionMetaDataProvider(FUNCTION_NAME_COUNT_INCLUDE_MISSINGS, CountIncludingMissingsAggregationFunction.FUNCTION_COUNT, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.ATTRIBUTE_VALUE }, Ontology.INTEGER)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_COUNT, new DefaultAggregationFunctionMetaDataProvider(FUNCTION_NAME_COUNT, CountAggregationFunction.FUNCTION_COUNT, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.ATTRIBUTE_VALUE }, Ontology.INTEGER)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_COUNT_FRACTIONAL, new DefaultAggregationFunctionMetaDataProvider("fractionalCount", CountFractionalAggregationFunction.FUNCTION_COUNT_FRACTIONAL, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.ATTRIBUTE_VALUE }, Ontology.REAL)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_COUNT_PERCENTAGE, new DefaultAggregationFunctionMetaDataProvider("percentageCount", CountPercentageAggregationFunction.FUNCTION_COUNT_PERCENTAGE, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.ATTRIBUTE_VALUE }, Ontology.REAL)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_MINIMUM, new MappingAggregationFunctionMetaDataProvider(FUNCTION_NAME_MINIMUM, MinAggregationFunction.FUNCTION_MIN, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, transformationRules)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_MAXIMUM, new MappingAggregationFunctionMetaDataProvider(FUNCTION_NAME_MAXIMUM, MaxAggregationFunction.FUNCTION_MAX, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, transformationRules)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_LOG_PRODUCT, new DefaultAggregationFunctionMetaDataProvider("log product", LogProductAggregationFunction.FUNCTION_LOG_PRODUCT, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL }, Ontology.REAL)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_PRODOCT, new DefaultAggregationFunctionMetaDataProvider(FUNCTION_NAME_PRODOCT, ProductAggregationFunction.FUNCTION_PRODUCT, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NUMERICAL }, Ontology.REAL)); // Nominal Aggregations AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_MODE, new DefaultAggregationFunctionMetaDataProvider(FUNCTION_NAME_MODE, ModeAggregationFunction.FUNCTION_MODE, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.ATTRIBUTE_VALUE })); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_LEAST, new DefaultAggregationFunctionMetaDataProvider(FUNCTION_NAME_LEAST, LeastAggregationFunction.FUNCTION_LEAST, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NOMINAL }, Ontology.POLYNOMINAL)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_LEAST_ONLY_OCCURRING, new DefaultAggregationFunctionMetaDataProvider(FUNCTION_NAME_LEAST_ONLY_OCCURRING, LeastOccurringAggregationFunction.FUNCTION_LEAST_OCCURRING, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NOMINAL }, Ontology.POLYNOMINAL)); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(FUNCTION_NAME_CONCATENATION, new DefaultAggregationFunctionMetaDataProvider(FUNCTION_NAME_CONCATENATION, ConcatAggregationFunction.FUNCTION_CONCAT, FUNCTION_SEPARATOR_OPEN, FUNCTION_SEPARATOR_CLOSE, new int[] { Ontology.NOMINAL }, Ontology.POLYNOMINAL)); } private Attribute sourceAttribute; private boolean isIgnoringMissings; private boolean isCountingOnlyDistinct; public AggregationFunction(Attribute sourceAttribute, boolean ignoreMissings, boolean countOnlyDistinct) { this.sourceAttribute = sourceAttribute; this.isIgnoringMissings = ignoreMissings; this.isCountingOnlyDistinct = countOnlyDistinct; } /** * This returns the attribute this aggregation function will derive the data from. */ public Attribute getSourceAttribute() { return sourceAttribute; } /** * This returns the attribute that will be created in the resulting {@link ExampleSet} to get * the aggregated values for each group. */ public abstract Attribute getTargetAttribute(); /** * This will return the {@link Aggregator} object that computes the value of this particular * {@link AggregationFunction} for a specific group. */ public abstract Aggregator createAggregator(); /** * This determines, if any missing values will be just ignored or counted with the respective * aggregation function. Some functions might cope with that, others will just turn to be NaN. */ public boolean isIgnoringMissings() { return isIgnoringMissings; } /** * This determines, if values are counted only once, if occurring more than once. Please note * that will increase the memory load drastically on numerical attributes. */ public boolean isCountingOnlyDistinct() { return isCountingOnlyDistinct; } /** * This will return whether this {@link AggregationFunction} is compatible with the given * sourceAttribute. */ public abstract boolean isCompatible(); /** * This method will fill in the default value of this aggregation function. It has to maintain * the mapping, if the function is nominal. The default value will be a NaN. Every subclass that * wants to change this, has to override this method. */ public void setDefault(Attribute attribute, DoubleArrayDataRow row) { row.set(attribute, Double.NaN); } /** * This will create the {@link AggregationFunction} with the given name for the given source * Attribute. * * @param name * please use one of the FUNCTION_NAME_* constants to prevent unnecessary errors */ public static final AggregationFunction createAggregationFunction(String name, Attribute sourceAttribute, boolean ignoreMissings, boolean countOnlyDistinct) throws OperatorException { Class<? extends AggregationFunction> aggregationFunctionClass = AGGREATION_FUNCTIONS.get(name); if (aggregationFunctionClass == null) { throw new UserError(null, "aggregation.illegal_function_name", name); } try { Constructor<? extends AggregationFunction> constructor = aggregationFunctionClass.getConstructor(Attribute.class, boolean.class, boolean.class); return constructor.newInstance(sourceAttribute, ignoreMissings, countOnlyDistinct); } catch (Exception e) { throw new RuntimeException( "All implementations of AggregationFunction need to have a constructor accepting an Attribute and boolean. Other reasons for this error may be class loader problems.", e); } } /** * This will create the {@link AggregationFunction} with the given name for the given source * Attribute with a fallback to a legacy {@link AggregationFunction} if necessary. * * @param name * please use one of the FUNCTION_NAME_* constants to prevent unnecessary errors * @param version * The {@link OperatorVersion} of the executing operator to ensure that a legacy * function will be used for old versions */ public static final AggregationFunction createAggregationFunction(String name, Attribute sourceAttribute, boolean ignoreMissings, boolean countOnlyDistinct, OperatorVersion version) throws OperatorException { Class<? extends AggregationFunction> aggregationFunctionClass = null; // check if the legacy version should be used Iterator<String> iterator = LEGACY_AGGREATION_FUNCTIONS.keySet().iterator(); while (iterator.hasNext()) { String current = iterator.next(); if (name.equals(current) && version.isAtMost(LEGACY_AGGREATION_FUNCTIONS_VERSIONS.get(current))) { aggregationFunctionClass = LEGACY_AGGREATION_FUNCTIONS.get(current); break; } } if (aggregationFunctionClass == null) { aggregationFunctionClass = AGGREATION_FUNCTIONS.get(name); } if (aggregationFunctionClass == null) { throw new UserError(null, "aggregation.illegal_function_name", name); } try { Constructor<? extends AggregationFunction> constructor = aggregationFunctionClass.getConstructor(Attribute.class, boolean.class, boolean.class); return constructor.newInstance(sourceAttribute, ignoreMissings, countOnlyDistinct); } catch (Exception e) { throw new RuntimeException( "All implementations of AggregationFunction need to have a constructor accepting an Attribute and boolean. Other reasons for this error may be class loader problems.", e); } } /** * This method can be called in order to get the target attribute meta data after the * aggregation functions have been applied. This method can register errors on the given * InputPort (if not null), if there's an illegal state. If the state makes applying an * {@link AggregationFunction} impossible, this method will return null! * * @param aggregationFunctionName * please use one of the FUNCTION_NAME_* constants to prevent unnecessary errors */ public static final AttributeMetaData getAttributeMetaData(String aggregationFunctionName, AttributeMetaData sourceAttributeMetaData, InputPort inputPort) { AggregationFunctionMetaDataProvider metaDataProvider = AGGREGATION_FUNCTIONS_META_DATA_PROVIDER .get(aggregationFunctionName); if (metaDataProvider != null) { return metaDataProvider.getTargetAttributeMetaData(sourceAttributeMetaData, inputPort); } else { // register error about unknown aggregation function if (inputPort != null) { inputPort.addError(new SimpleMetaDataError(Severity.ERROR, inputPort, "aggregation.unknown_aggregation_function", aggregationFunctionName)); } return null; } } /** * This method will return the array containing the names of all available aggregation * functions. The names are sorted according to natural ordering. */ public static String[] getAvailableAggregationFunctionNames() { String[] names = new String[AGGREATION_FUNCTIONS.size()]; int i = 0; for (String name : AGGREATION_FUNCTIONS.keySet()) { names[i] = name; i++; } return names; } /** * This method will return a list of aggregate functions that are compatible with the provided * valueType. * * @param valueType * a valueType found in {@link Ontology}. */ public static List<String> getCompatibleAggregationFunctionNames(int valueType) { List<String> compatibleAggregationFunctions = new LinkedList<>(); Attribute sampleAttribute = AttributeFactory.createAttribute(valueType); for (String name : getAvailableAggregationFunctionNames()) { try { if (createAggregationFunction(name, sampleAttribute, true, true).isCompatible()) { compatibleAggregationFunctions.add(name); } } catch (OperatorException e) { // do nothing } } return compatibleAggregationFunctions; } /** * With this method extensions might register additional aggregation functions if needed. */ public static void registerNewAggregationFunction(String name, Class<? extends AggregationFunction> clazz, AggregationFunctionMetaDataProvider metaDataProvider) { AGGREATION_FUNCTIONS.put(name, clazz); AGGREGATION_FUNCTIONS_META_DATA_PROVIDER.put(name, metaDataProvider); } /** * This function is called once during the aggregation process, when all {@link Aggregator}s are * known. In this step post-processing like normalization etc. can be done. * * The default implementation does nothing. */ public void postProcessing(List<Aggregator> allAggregators) { // do nothing } }