/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.sampling; import java.util.ArrayList; import java.util.Arrays; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.ListIterator; import java.util.TreeSet; import com.rapidminer.example.Attribute; import com.rapidminer.example.Attributes; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.Statistics; import com.rapidminer.example.set.Partition; import com.rapidminer.example.set.SplittedExampleSet; import com.rapidminer.operator.IOObject; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.similarity.attributebased.AbstractValueBasedSimilarity; import com.rapidminer.operator.similarity.attributebased.EuclideanDistance; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeDouble; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.tools.Ontology; /** * This operator performs a Kennard-Stone Sampling. This sampling Algorithm works as follows: * First find the two points most separated in the training set. * For each candidate point, find the smallest distance to any object already selected. * Select that point for the training set which has the largest of these smallest distances * As described above, this algorithm always gives the same result, due to the two starting * points which are always the same. * This implementation reduces number of iterations by holding a list with candidates of the largest * smallest distances. * The parameters controll the number of examples in the sample * * @author Sebastian Land * @version $Id: KennardStoneSampling.java,v 1.5 2008/07/07 07:06:47 ingomierswa Exp $ */ public class KennardStoneSampling extends Operator { /** The parameter name for "The fraction of examples which should be sampled" */ public static final String PARAMETER_SAMPLE_RATIO = "sample_ratio"; public static final String PARAMETER_ABSOLUTE_SAMPLE = "absolute_sample"; public static final String PARAMETER_SAMPLE_SIZE = "sample_size"; private class Candidate implements Comparable<Candidate>{ private double[] attributeValues; private double distance; private int exampleIndex; public Candidate(double[] exampleValues, double distance, int exampleIndex) { attributeValues = exampleValues; this.distance = distance; this.exampleIndex = exampleIndex; } public double getDistance() { return distance; } public double[] getValues() { return attributeValues; } public int getExampleIndex() { return exampleIndex; } public int compareTo(Candidate o) { return Double.compare(this.distance, o.getDistance()); } } public KennardStoneSampling(OperatorDescription description) { super(description); } public IOObject[] apply() throws OperatorException { ExampleSet exampleSet = getInput(ExampleSet.class); // creating kernel and settings from Parameters int k = Math.min(100, exampleSet.getAttributes().size() * 2); int desiredNumber = (int) ((double)exampleSet.size() * getParameterAsDouble(PARAMETER_SAMPLE_RATIO)); if (getParameterAsBoolean(PARAMETER_ABSOLUTE_SAMPLE)) desiredNumber = getParameterAsInt(PARAMETER_SAMPLE_SIZE); AbstractValueBasedSimilarity distanceMeasure = new EuclideanDistance(); // finding farthest and nearest example to mean Vector double[] meanVector = getMeanVector(exampleSet); Candidate min = new Candidate(meanVector, Double.POSITIVE_INFINITY, 0); Candidate max = new Candidate(meanVector, Double.NEGATIVE_INFINITY, 0); int i = 0; for (Example example: exampleSet) { double[] exampleValues = getExampleValues(example); Candidate current = new Candidate(exampleValues, Math.abs(distanceMeasure.similarity(meanVector, exampleValues)), i); if (current.compareTo(min) < 0) { min = current; } if (current.compareTo(max) > 0) { max = current; } i++; } ArrayList<Candidate> recentlySelected = new ArrayList<Candidate>(10); int[] partition = new int[exampleSet.size()]; int numberOfSelectedExamples = 2; recentlySelected.add(min); recentlySelected.add(max); partition[min.getExampleIndex()] = 1; partition[max.getExampleIndex()] = 1; double[] minimalDistances = new double[exampleSet.size()]; Arrays.fill(minimalDistances, Double.POSITIVE_INFINITY); // running now through examples, checking for smallest distance to one of the candidates while (numberOfSelectedExamples < desiredNumber) { TreeSet<Candidate> candidates = new TreeSet<Candidate>(); i = 0; // check distance only for candidates recently selected. for (Example example: exampleSet) { // if example not has been selected allready if (partition[i] == 0) { double[] exampleValues = getExampleValues(example); for (Candidate candidate: recentlySelected) { minimalDistances[i] = Math.min(minimalDistances[i], Math.abs(distanceMeasure.similarity(exampleValues, candidate.getValues()))); } Candidate newCandidate = new Candidate(exampleValues, minimalDistances[i], i); candidates.add(newCandidate); if (candidates.size() > k) { Iterator<Candidate> iterator = candidates.iterator(); iterator.next(); iterator.remove(); } } i++; } // clearing recently selected since now new ones will be selected recentlySelected.clear(); // now running in descending order through candidates and adding to selected // IM: descendingIterator() is not available in Java versions less than 6 !!! // IM: Bad workaround for now by adding all candidates into a list and using a listIterator() and hasPrevious... /* Iterator<Candidate> descendingIterator = candidates.descendingIterator(); while (descendingIterator.hasNext() && numberOfSelectedExamples < desiredNumber) { Candidate candidate = descendingIterator.next(); */ List<Candidate> reverseCandidateList = new LinkedList<Candidate>(); Iterator<Candidate> it = candidates.iterator(); while (it.hasNext()) { reverseCandidateList.add(it.next()); } ListIterator<Candidate> lit = reverseCandidateList.listIterator(reverseCandidateList.size() - 1); while (lit.hasPrevious()) { Candidate candidate = lit.previous(); // IM: end of workaround boolean existSmallerDistance = false; Iterator<Candidate> addedIterator = recentlySelected.iterator(); // test if a distance to recently selected is smaller than previously calculated minimal distance // if one exists: This is not selected while (addedIterator.hasNext()) { double distance = Math.abs(distanceMeasure.similarity(addedIterator.next().getValues(), candidate.getValues())); existSmallerDistance = existSmallerDistance || distance < candidate.getDistance(); } if (!existSmallerDistance) { recentlySelected.add(candidate); partition[candidate.getExampleIndex()] = 1; numberOfSelectedExamples++; } else break; } } // building new exampleSet containing only Examples with indices in selectedExamples SplittedExampleSet sample = new SplittedExampleSet(exampleSet, new Partition(partition, 2)); sample.selectSingleSubset(1); return new IOObject[] {sample}; } private double[] getMeanVector(ExampleSet exampleSet) { exampleSet.recalculateAllAttributeStatistics(); Attributes attributes = exampleSet.getAttributes(); double[] meanVector = new double[attributes.size()]; int i = 0; for (Attribute attribute: attributes) { if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(attribute.getValueType(), Ontology.DATE_TIME)) { meanVector[i] = exampleSet.getStatistics(attribute, Statistics.MINIMUM); } else if (attribute.isNominal()) meanVector[i] = exampleSet.getStatistics(attribute, Statistics.MODE); else meanVector[i] = exampleSet.getStatistics(attribute, Statistics.AVERAGE); i++; } return meanVector; } private double[] getExampleValues(Example example) { Attributes attributes = example.getAttributes(); double[] attributeValues = new double[attributes.size()]; int i = 0; for (Attribute attribute: attributes) { attributeValues[i] = example.getValue(attribute); i++; } return attributeValues; } public Class<?>[] getInputClasses() { return new Class[] {ExampleSet.class}; } public Class<?>[] getOutputClasses() { return new Class[] {ExampleSet.class}; } public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); ParameterType type = new ParameterTypeDouble(PARAMETER_SAMPLE_RATIO, "The fraction of examples which should be sampled", 0.0d, 1.0d, 0.1d); type.setExpert(false); types.add(type); types.add(new ParameterTypeBoolean(PARAMETER_ABSOLUTE_SAMPLE, "If checked, the absolute number of examples will be used. Otherwise the ratio.", false)); types.add(new ParameterTypeInt(PARAMETER_SAMPLE_SIZE, "The number of examples which should be sampled", 1, Integer.MAX_VALUE, 1000)); return types; } }