/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.example.set; import java.util.HashMap; import java.util.Iterator; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; import com.rapidminer.example.Attribute; import com.rapidminer.example.Attributes; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.ExampleTable; /** * An example set that can be split into subsets by using a {@link Partition}. * * @author Simon Fischer, Ingo Mierswa, Felix Jungermann * @version $Id: SplittedExampleSet.java,v 2.26 2006/03/23 17:48:24 ingomierswa * Exp $ */ public class SplittedExampleSet extends AbstractExampleSet { private static final long serialVersionUID = 4573262969007377183L; /** Indicates a non-shuffled sampling for partition building. */ public static final String[] SAMPLING_NAMES = { "linear sampling", "shuffled sampling", "stratified sampling" }; /** Indicates a non-shuffled sampling for partition building. */ public static final int LINEAR_SAMPLING = 0; /** Indicates a shuffled sampling for partition building. */ public static final int SHUFFLED_SAMPLING = 1; /** Indicates a stratified shuffled sampling for partition building. */ public static final int STRATIFIED_SAMPLING = 2; /** The partition. */ private Partition partition; /** The parent example set. */ private ExampleSet parent; /** Constructs a SplittedExampleSet with the given partition. */ public SplittedExampleSet(ExampleSet exampleSet, Partition partition) { this.parent = (ExampleSet)exampleSet.clone(); this.partition = partition; } /** * Creates an example set that is splitted into two subsets using the given * sampling type. */ public SplittedExampleSet(ExampleSet exampleSet, double splitRatio, int samplingType, int seed) { this(exampleSet, new double[] { splitRatio, 1 - splitRatio }, samplingType, seed); } /** * Creates an example set that is splitted into n subsets with the given * sampling type. */ public SplittedExampleSet(ExampleSet exampleSet, double[] splitRatios, int samplingType, int seed) { this(exampleSet, new Partition(splitRatios, exampleSet.size(), createPartitionBuilder(exampleSet, samplingType, seed))); } /** * Creates an example set that is splitted into <i>numberOfSubsets</i> * parts with the given sampling type. */ public SplittedExampleSet(ExampleSet exampleSet, int numberOfSubsets, int samplingType, int seed) { this(exampleSet, new Partition(numberOfSubsets, exampleSet.size(), createPartitionBuilder(exampleSet, samplingType, seed))); } /** Clone constructor. */ public SplittedExampleSet(SplittedExampleSet exampleSet) { this.parent = (ExampleSet)exampleSet.parent.clone(); this.partition = (Partition) exampleSet.partition.clone(); } public boolean equals(Object o) { if (!super.equals(o)) return false; if (!(o instanceof SplittedExampleSet)) return false; return this.partition.equals(((SplittedExampleSet)o).partition); } public int hashCode() { return super.hashCode() ^ partition.hashCode(); } /** Creates the partition builder for the given sampling type. */ private static PartitionBuilder createPartitionBuilder(ExampleSet exampleSet, int samplingType, int seed) { PartitionBuilder builder = null; switch (samplingType) { case LINEAR_SAMPLING: builder = new SimplePartitionBuilder(); break; case SHUFFLED_SAMPLING: builder = new ShuffledPartitionBuilder(seed); break; case STRATIFIED_SAMPLING: default: Attribute label = exampleSet.getAttributes().getLabel(); if ((label != null) && (label.isNominal())) builder = new StratifiedPartitionBuilder(exampleSet, seed); else { exampleSet.getLog().logNote("Example set has no nominal label: using shuffled partition instead of stratified partition!"); builder = new ShuffledPartitionBuilder(seed); } break; } return builder; } /** Adds the given subset. */ public void selectAdditionalSubset(int index) { partition.selectSubset(index); } /** Selects exactly one subset. */ public void selectSingleSubset(int index) { partition.clearSelection(); partition.selectSubset(index); } /** Selects all but one subset. */ public void selectAllSubsetsBut(int index) { partition.clearSelection(); for (int i = 0; i < partition.getNumberOfSubsets(); i++) { if (i != index) partition.selectSubset(i); } } /** Selects all subsets. */ public void selectAllSubsets() { partition.clearSelection(); for (int i = 0; i < partition.getNumberOfSubsets(); i++) { partition.selectSubset(i); } } /** Inverts the current selection */ public void invertSelection() { partition.invertSelection(); } /** Clears the current selection */ public void clearSelection() { partition.clearSelection(); } /** Returns the number of subsets. */ public int getNumberOfSubsets() { return partition.getNumberOfSubsets(); } /** Returns an example reader that splits all examples that are not selected. */ public Iterator<Example> iterator() { return new SplittedExampleSetReader(this.parent.iterator(), (Partition) partition.clone()); } public int size() { return partition.getSelectionSize(); } /** * Searches i-th example in the currently selected partition. This is done * in constant time. */ public Example getExample(int index) { int actualIndex = partition.mapIndex(index); return this.parent.getExample(actualIndex); } /** Returns the index of the example in the parent example set. */ public int getActualParentIndex(int index) { return partition.mapIndex(index); } public ExampleTable getExampleTable() { return parent.getExampleTable(); } public Attributes getAttributes() { return this.parent.getAttributes(); } // -------------------- Factory methods -------------------- /** * Works only for nominal and integer attributes. If <i>k</i> is the number * of different values, this method splits the example set into <i>k</i> * subsets according to the value of the given attribute. */ public static SplittedExampleSet splitByAttribute(ExampleSet exampleSet, Attribute attribute) { int[] elements = new int[exampleSet.size()]; int i = 0; Map<Integer, Integer> indexMap = new HashMap<Integer, Integer>(); AtomicInteger currentIndex = new AtomicInteger(0); for (Example example : exampleSet) { int value = (int) example.getValue(attribute); Integer indexObject = indexMap.get(value); if (indexObject == null) { indexMap.put(value, currentIndex.getAndIncrement()); } int intValue = indexMap.get(value).intValue(); elements[i++] = intValue; } int maxNumber = indexMap.size(); indexMap.clear(); Partition partition = new Partition(elements, maxNumber); return new SplittedExampleSet(exampleSet, partition); } /** * Works only for real-value attributes. Returns an example set splitted into * two parts containing all examples providing a greater (smaller) value * for the given attribute than the given value. The first partition contains * all examples providing a smaller or the same value than the given one. */ public static SplittedExampleSet splitByAttribute(ExampleSet exampleSet, Attribute attribute, double value) { int[] elements = new int[exampleSet.size()]; Iterator<Example> reader = exampleSet.iterator(); int i = 0; while (reader.hasNext()) { Example example = reader.next(); double currentValue = example.getValue(attribute); if (currentValue <= value) elements[i++] = 0; else elements[i++] = 1; } Partition partition = new Partition(elements, 2); return new SplittedExampleSet(exampleSet, partition); } }