/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.sampling; import com.rapidminer.example.Attribute; import com.rapidminer.example.Attributes; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.Statistics; import com.rapidminer.example.set.Partition; import com.rapidminer.example.set.SplittedExampleSet; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.ProcessSetupError.Severity; import com.rapidminer.operator.annotation.ResourceConsumptionEstimator; import com.rapidminer.operator.ports.metadata.ExampleSetMetaData; import com.rapidminer.operator.ports.metadata.MDInteger; import com.rapidminer.operator.ports.metadata.MetaDataInfo; import com.rapidminer.operator.ports.metadata.SimpleMetaDataError; import com.rapidminer.operator.ports.quickfix.ParameterSettingQuickFix; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeCategory; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.parameter.conditions.EqualTypeCondition; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.OperatorResourceConsumptionHandler; import com.rapidminer.tools.math.similarity.DistanceMeasure; import com.rapidminer.tools.math.similarity.numerical.EuclideanDistance; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.TreeSet; /** * This operator performs a Kennard-Stone Sampling. This sampling Algorithm works as follows: First * find the two points most separated in the training set. For each candidate point, find the * smallest distance to any object already selected. Select that point for the training set which * has the largest of these smallest distances As described above, this algorithm always gives the * same result, due to the two starting points which are always the same. This implementation * reduces number of iterations by holding a list with candidates of the largest smallest distances. * The parameters controll the number of examples in the sample * * @author Sebastian Land */ public class KennardStoneSampling extends AbstractSamplingOperator { public static final String PARAMETER_SAMPLE = "sample"; public static final String[] SAMPLE_MODES = { "absolute", "relative" }; public static final int SAMPLE_ABSOLUTE = 0; public static final int SAMPLE_RELATIVE = 1; /** The parameter name for "The fraction of examples which should be sampled" */ public static final String PARAMETER_SAMPLE_SIZE = "sample_size"; /** The parameter name for "This ratio determines the size of the new example set." */ public static final String PARAMETER_SAMPLE_RATIO = "sample_ratio"; private static class Candidate implements Comparable<Candidate> { private double[] attributeValues; private double distance; private int exampleIndex; public Candidate(double[] exampleValues, double distance, int exampleIndex) { attributeValues = exampleValues; this.distance = distance; this.exampleIndex = exampleIndex; } public double getDistance() { return distance; } public double[] getValues() { return attributeValues; } public int getExampleIndex() { return exampleIndex; } @Override public int compareTo(Candidate o) { return Double.compare(this.distance, o.getDistance()); } } public KennardStoneSampling(OperatorDescription description) { super(description); } @Override protected MDInteger getSampledSize(ExampleSetMetaData emd) throws UndefinedParameterError { switch (getParameterAsInt(PARAMETER_SAMPLE)) { case SAMPLE_ABSOLUTE: int absoluteNumber = getParameterAsInt(PARAMETER_SAMPLE_SIZE); if (emd.getNumberOfExamples().isAtLeast(absoluteNumber) == MetaDataInfo.NO) { getExampleSetInputPort().addError( new SimpleMetaDataError(Severity.ERROR, getExampleSetInputPort(), Collections .singletonList(new ParameterSettingQuickFix(this, PARAMETER_SAMPLE_SIZE, emd .getNumberOfExamples().getValue().toString())), "need_more_examples", absoluteNumber + "")); } return new MDInteger(absoluteNumber); case SAMPLE_RELATIVE: MDInteger number = emd.getNumberOfExamples(); number.multiply(getParameterAsDouble(PARAMETER_SAMPLE_RATIO)); return number; default: return new MDInteger(); } } @Override public ExampleSet apply(ExampleSet exampleSet) throws OperatorException { // creating kernel and settings from Parameters int k = Math.min(100, exampleSet.getAttributes().size() * 2); int size = exampleSet.size(); switch (getParameterAsInt(PARAMETER_SAMPLE)) { case SAMPLE_ABSOLUTE: size = getParameterAsInt(PARAMETER_SAMPLE_SIZE); break; case SAMPLE_RELATIVE: size = (int) Math.round(exampleSet.size() * getParameterAsDouble(PARAMETER_SAMPLE_RATIO)); break; } DistanceMeasure distanceMeasure = new EuclideanDistance(); distanceMeasure.init(exampleSet); // finding farthest and nearest example to mean Vector double[] meanVector = getMeanVector(exampleSet); Candidate min = new Candidate(meanVector, Double.POSITIVE_INFINITY, 0); Candidate max = new Candidate(meanVector, Double.NEGATIVE_INFINITY, 0); int i = 0; for (Example example : exampleSet) { this.checkForStop(); double[] exampleValues = getExampleValues(example); Candidate current = new Candidate(exampleValues, Math.abs(distanceMeasure.calculateDistance(meanVector, exampleValues)), i); if (current.compareTo(min) < 0) { min = current; } if (current.compareTo(max) > 0) { max = current; } i++; } this.checkForStop(); ArrayList<Candidate> recentlySelected = new ArrayList<>(10); int[] partition = new int[exampleSet.size()]; int numberOfSelectedExamples = 2; recentlySelected.add(min); recentlySelected.add(max); partition[min.getExampleIndex()] = 1; partition[max.getExampleIndex()] = 1; double[] minimalDistances = new double[exampleSet.size()]; Arrays.fill(minimalDistances, Double.POSITIVE_INFINITY); // running now through examples, checking for smallest distance to one of the candidates while (numberOfSelectedExamples < size) { TreeSet<Candidate> candidates = new TreeSet<>(); this.checkForStop(); i = 0; // check distance only for candidates recently selected. for (Example example : exampleSet) { // if example not has been selected allready if (partition[i] == 0) { double[] exampleValues = getExampleValues(example); for (Candidate candidate : recentlySelected) { minimalDistances[i] = Math.min(minimalDistances[i], Math.abs(distanceMeasure.calculateDistance(exampleValues, candidate.getValues()))); } Candidate newCandidate = new Candidate(exampleValues, minimalDistances[i], i); candidates.add(newCandidate); if (candidates.size() > k) { Iterator<Candidate> iterator = candidates.iterator(); iterator.next(); iterator.remove(); } } i++; this.checkForStop(); } // clearing recently selected since now new ones will be selected recentlySelected.clear(); // now running in descending order through candidates and adding to selected // IM: descendingIterator() is not available in Java versions less than 6 !!! Iterator<Candidate> descendingIterator = candidates.descendingIterator(); while (descendingIterator.hasNext() && numberOfSelectedExamples < size) { Candidate candidate = descendingIterator.next(); this.checkForStop(); boolean existSmallerDistance = false; Iterator<Candidate> addedIterator = recentlySelected.iterator(); // test if a distance to recently selected is smaller than previously calculated // minimal distance // if one exists: This is not selected while (addedIterator.hasNext()) { double distance = Math.abs(distanceMeasure.calculateDistance(addedIterator.next().getValues(), candidate.getValues())); existSmallerDistance = existSmallerDistance || distance < candidate.getDistance(); } if (!existSmallerDistance) { recentlySelected.add(candidate); partition[candidate.getExampleIndex()] = 1; numberOfSelectedExamples++; } else { break; } } } // building new exampleSet containing only Examples with indices in selectedExamples SplittedExampleSet sample = new SplittedExampleSet(exampleSet, new Partition(partition, 2)); sample.selectSingleSubset(1); return sample; } private double[] getMeanVector(ExampleSet exampleSet) { exampleSet.recalculateAllAttributeStatistics(); Attributes attributes = exampleSet.getAttributes(); double[] meanVector = new double[attributes.size()]; int i = 0; for (Attribute attribute : attributes) { if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.DATE_TIME)) { meanVector[i] = exampleSet.getStatistics(attribute, Statistics.MINIMUM); } else if (attribute.isNominal()) { meanVector[i] = exampleSet.getStatistics(attribute, Statistics.MODE); } else { meanVector[i] = exampleSet.getStatistics(attribute, Statistics.AVERAGE); } i++; } return meanVector; } private double[] getExampleValues(Example example) { Attributes attributes = example.getAttributes(); double[] attributeValues = new double[attributes.size()]; int i = 0; for (Attribute attribute : attributes) { attributeValues[i] = example.getValue(attribute); i++; } return attributeValues; } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = new LinkedList<>(); ParameterType type = new ParameterTypeCategory(PARAMETER_SAMPLE, "Determines how the amount of data is specified.", SAMPLE_MODES, SAMPLE_ABSOLUTE); type.setExpert(false); types.add(type); type = new ParameterTypeInt(PARAMETER_SAMPLE_SIZE, "The number of examples which should be sampled", 1, Integer.MAX_VALUE, 100); type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SAMPLE, SAMPLE_MODES, true, SAMPLE_ABSOLUTE)); type.setExpert(false); types.add(type); type = new ParameterTypeDouble(PARAMETER_SAMPLE_RATIO, "The fraction of examples which should be sampled", 0.0d, 1.0d, 0.1d); type.registerDependencyCondition(new EqualTypeCondition(this, PARAMETER_SAMPLE, SAMPLE_MODES, true, SAMPLE_RELATIVE)); type.setExpert(false); types.add(type); types.addAll(super.getParameterTypes()); return types; } @Override public ResourceConsumptionEstimator getResourceConsumptionEstimator() { return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPort(), KennardStoneSampling.class, null); } }