/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.sampling; import java.util.Collections; import java.util.List; import com.rapidminer.example.Attribute; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.set.MappedExampleSet; import com.rapidminer.example.set.SplittedExampleSet; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.ProcessSetupError.Severity; import com.rapidminer.operator.UserError; import com.rapidminer.operator.annotation.ResourceConsumptionEstimator; import com.rapidminer.operator.ports.metadata.ExampleSetMetaData; import com.rapidminer.operator.ports.metadata.MDInteger; import com.rapidminer.operator.ports.metadata.MetaDataInfo; import com.rapidminer.operator.ports.metadata.SimpleMetaDataError; import com.rapidminer.operator.ports.quickfix.ParameterSettingQuickFix; import com.rapidminer.operator.preprocessing.sampling.sequences.AbsoluteSamplingSequenceGenerator; import com.rapidminer.operator.preprocessing.sampling.sequences.ProbabilitySamplingSequenceGenerator; import com.rapidminer.operator.preprocessing.sampling.sequences.RelativeSamplingSequenceGenerator; import com.rapidminer.operator.preprocessing.sampling.sequences.SamplingSequenceGenerator; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeCategory; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.ParameterTypeList; import com.rapidminer.parameter.ParameterTypeString; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.parameter.conditions.BooleanParameterCondition; import com.rapidminer.parameter.conditions.EqualTypeCondition; import com.rapidminer.tools.OperatorResourceConsumptionHandler; import com.rapidminer.tools.RandomGenerator; /** * This operator will sample the given example set without replacement. Three modes are available: Absolute returning a * determined number, relative returning a determined fraction of the input set and probability, that will return each * example with the same probability. * * The operator offers the possibility to specify sampling parameter per class to re-balance the data. * * @author Sebastian Land, Ingo Mierswa, Tobias Malbrecht */ public class SamplingOperator extends AbstractSamplingOperator { public static final String PARAMETER_SAMPLE = "sample"; public static String[] SAMPLE_MODES = { "absolute", "relative", "probability" }; public static final int SAMPLE_ABSOLUTE = 0; public static final int SAMPLE_RELATIVE = 1; public static final int SAMPLE_PROBABILITY = 2; /** The parameter name for "The number of examples which should be sampled" */ public static final String PARAMETER_SAMPLE_SIZE = "sample_size"; /** The parameter name for "The fraction of examples which should be sampled" */ public static final String PARAMETER_SAMPLE_RATIO = "sample_ratio"; public static final String PARAMETER_SAMPLE_PROBABILITY = "sample_probability"; public static final String PARAMETER_BALANCE_DATA = "balance_data"; public static final String PARAMETER_SAMPLE_SIZE_LIST = "sample_size_per_class"; public static final String PARAMETER_SAMPLE_RATIO_LIST = "sample_ratio_per_class"; public static final String PARAMETER_SAMPLE_PROBABILITY_LIST = "sample_probability_per_class"; public SamplingOperator(OperatorDescription description) { super(description); } @Override protected MDInteger getSampledSize(ExampleSetMetaData emd) throws UndefinedParameterError { boolean balanceData = getParameterAsBoolean(PARAMETER_BALANCE_DATA); int absoluteNumber = 0; switch (getParameterAsInt(PARAMETER_SAMPLE)) { case SAMPLE_ABSOLUTE: if (balanceData) { List<String[]> parameterList = getParameterList(PARAMETER_SAMPLE_SIZE_LIST); for (String[] pair: parameterList) { absoluteNumber += Integer.valueOf(pair[1]); } } else { absoluteNumber = getParameterAsInt(PARAMETER_SAMPLE_SIZE); } if (emd.getNumberOfExamples().isAtLeast(absoluteNumber) == MetaDataInfo.NO) { getExampleSetInputPort().addError(new SimpleMetaDataError(Severity.ERROR, getExampleSetInputPort(), Collections.singletonList(new ParameterSettingQuickFix(this, PARAMETER_SAMPLE_SIZE, emd.getNumberOfExamples().getValue().toString())), "exampleset.need_more_examples", absoluteNumber + "")); } return new MDInteger(absoluteNumber); case SAMPLE_RELATIVE: if (balanceData) { MDInteger numberOfExamples = emd.getNumberOfExamples(); numberOfExamples.reduceByUnknownAmount(); return numberOfExamples; } if (emd.getNumberOfExamples().isKnown()) { return new MDInteger((int) ((getParameterAsDouble(PARAMETER_SAMPLE_RATIO) * emd.getNumberOfExamples().getValue()))); } return new MDInteger(); case SAMPLE_PROBABILITY: if (balanceData) { MDInteger numberOfExamples = emd.getNumberOfExamples(); numberOfExamples.reduceByUnknownAmount(); return numberOfExamples; } if (emd.getNumberOfExamples().isKnown()) { return new MDInteger((int) ((getParameterAsDouble(PARAMETER_SAMPLE_PROBABILITY) * emd.getNumberOfExamples().getValue()))); } return new MDInteger(); default: return new MDInteger(); } } @Override public ExampleSet apply(ExampleSet originalSet) throws OperatorException { int resultSize = 0; int[] usedIndices = new int[originalSet.size()]; SplittedExampleSet perLabelSets = null; int numberOfIterations = 1; // if sampling not per class, just one iteration boolean balanceData = getParameterAsBoolean(PARAMETER_BALANCE_DATA); Attribute label = null; List<String[]> pairs = null; ExampleSet exampleSet = null; if (balanceData) { label = originalSet.getAttributes().getLabel(); if (label != null) { if (label.isNominal()) { perLabelSets = SplittedExampleSet.splitByAttribute(originalSet, label); exampleSet = perLabelSets; } else { throw new UserError(this, 105); } } else { throw new UserError(this, resultSize); } switch (getParameterAsInt(PARAMETER_SAMPLE)) { case SAMPLE_RELATIVE: pairs = getParameterList(PARAMETER_SAMPLE_RATIO_LIST); break; case SAMPLE_ABSOLUTE: pairs = getParameterList(PARAMETER_SAMPLE_SIZE_LIST); break; case SAMPLE_PROBABILITY: default: pairs = getParameterList(PARAMETER_SAMPLE_PROBABILITY_LIST); } numberOfIterations = pairs.size(); } else { exampleSet = originalSet; } // now iterate over all subsets for (int i = 0; i < numberOfIterations; i++) { SamplingSequenceGenerator sampleSequence = null; if (balanceData) { perLabelSets.clearSelection(); perLabelSets.selectAdditionalSubset(i); String parameter = "0"; // finding parameter list of the selected sample method if (exampleSet.size() > 0) { // getting parameter value for current class Example next = exampleSet.iterator().next(); String labelValue = next.getValueAsString(label); for (String[] pair : pairs) { if (labelValue.equals(pair[0])) { parameter = pair[1]; break; } } } // now generate sampling sequence for this class's parameter value switch (getParameterAsInt(PARAMETER_SAMPLE)) { case SAMPLE_RELATIVE: sampleSequence = new RelativeSamplingSequenceGenerator(exampleSet.size(), Double.valueOf(parameter), RandomGenerator.getRandomGenerator(this)); break; case SAMPLE_ABSOLUTE: sampleSequence = new AbsoluteSamplingSequenceGenerator(exampleSet.size(), Integer.valueOf(parameter), RandomGenerator.getRandomGenerator(this)); break; case SAMPLE_PROBABILITY: default: sampleSequence = new ProbabilitySamplingSequenceGenerator(Double.valueOf(parameter), RandomGenerator.getRandomGenerator(this)); break; } } else { // just retrieve the standard parameters switch (getParameterAsInt(PARAMETER_SAMPLE)) { case SAMPLE_RELATIVE: sampleSequence = new RelativeSamplingSequenceGenerator(exampleSet.size(), getParameterAsDouble(PARAMETER_SAMPLE_RATIO), RandomGenerator.getRandomGenerator(this)); break; case SAMPLE_ABSOLUTE: int size = getParameterAsInt(PARAMETER_SAMPLE_SIZE); if (size > exampleSet.size()) { throw new UserError(this, 110, size); } sampleSequence = new AbsoluteSamplingSequenceGenerator(exampleSet.size(), size, RandomGenerator.getRandomGenerator(this)); break; case SAMPLE_PROBABILITY: default: sampleSequence = new ProbabilitySamplingSequenceGenerator(getParameterAsDouble(PARAMETER_SAMPLE_PROBABILITY), RandomGenerator.getRandomGenerator(this)); break; } } // add indices which are used for (int j = 0; j < exampleSet.size(); j++) { if (sampleSequence.useNext()) { if (balanceData) { usedIndices[resultSize] = perLabelSets.getActualParentIndex(j); } else { usedIndices[resultSize] = j; } resultSize++; } } } // create new filtered example set int[] resultIndices = new int[resultSize]; System.arraycopy(usedIndices, 0, resultIndices, 0, resultSize); return new MappedExampleSet(originalSet, resultIndices, true, true); } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); ParameterType type = new ParameterTypeCategory(PARAMETER_SAMPLE, "Determines how the amount of data is specified.", SAMPLE_MODES, SAMPLE_ABSOLUTE); type.setExpert(false); types.add(type); type = new ParameterTypeBoolean(PARAMETER_BALANCE_DATA, "If you need to sample differently for examples of a certain class, you might check this.", false, true); types.add(type); type = new ParameterTypeInt(PARAMETER_SAMPLE_SIZE, "The number of examples which should be sampled", 1, Integer.MAX_VALUE, 100); type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SAMPLE, SAMPLE_MODES, true, SAMPLE_ABSOLUTE)); type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_BALANCE_DATA, true, false)); type.setExpert(false); types.add(type); type = new ParameterTypeDouble(PARAMETER_SAMPLE_RATIO, "The fraction of examples which should be sampled", 0.0d, 1.0d, 0.1d); type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SAMPLE, SAMPLE_MODES, true, SAMPLE_RELATIVE)); type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_BALANCE_DATA, true, false)); type.setExpert(false); types.add(type); type = new ParameterTypeDouble(PARAMETER_SAMPLE_PROBABILITY, "The sample probability for each example.", 0.0d, 1.0d, 0.1d); type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SAMPLE, SAMPLE_MODES, true, SAMPLE_PROBABILITY)); type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_BALANCE_DATA, true, false)); type.setExpert(false); types.add(type); type = new ParameterTypeList(PARAMETER_SAMPLE_SIZE_LIST, "The absolut sample size per class.", new ParameterTypeString("class", "The class name this sample size applies to."), new ParameterTypeInt("size", "The number of sampled examples of this class.", 0, Integer.MAX_VALUE)); type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SAMPLE, SAMPLE_MODES, true, SAMPLE_ABSOLUTE)); type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_BALANCE_DATA, true, true)); type.setExpert(false); types.add(type); type = new ParameterTypeList(PARAMETER_SAMPLE_RATIO_LIST, "The fraction per class.", new ParameterTypeString("class", "The class name this sample size applies to."), new ParameterTypeDouble("ratio", "The fractions of examples of this class.", 0, 1)); type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SAMPLE, SAMPLE_MODES, true, SAMPLE_RELATIVE)); type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_BALANCE_DATA, true, true)); type.setExpert(false); types.add(type); type = new ParameterTypeList(PARAMETER_SAMPLE_PROBABILITY_LIST, "The fraction per class.", new ParameterTypeString("class", "The class name this sample size applies to."), new ParameterTypeDouble("probability", "The probability of examples of this class to belong to the sample.", 0, 1)); type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SAMPLE, SAMPLE_MODES, true, SAMPLE_PROBABILITY)); type.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_BALANCE_DATA, true, true)); type.setExpert(false); types.add(type); types.addAll(RandomGenerator.getRandomGeneratorParameters(this)); return types; } @Override public ResourceConsumptionEstimator getResourceConsumptionEstimator() { return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(), SamplingOperator.class, null); } }