/* * RapidMiner * * Copyright (C) 2001-2008 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.example; import java.util.ArrayList; import java.util.List; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.example.table.DataRow; import com.rapidminer.example.table.DoubleArrayDataRow; import com.rapidminer.example.table.ExampleTable; import com.rapidminer.example.table.MemoryExampleTable; import com.rapidminer.tools.Ontology; /** * <p>This class can be used to easily create @link ExampleSet}s and the underlying * {@link ExampleTable} with simple method calls. Please note that it is often better * to explicitly fill the data table yourself, or, if possible, to extend {@link ExampleTable} * or {@link DataRow} to provide the necessary data to RapidMiner. For memory usage reasons, * it is also often not recommended to create the data matrix from your existing data * in an extra step and then use one of the factory methods. In these cases, it is better * to directly fill an {@link ExampleTable} from your data source.</p> * * <p>However, in some cases it might be more convenient to use this class in order to * create example sets from data matrices in a fast and simple way. The resulting example * set will be backed up by a {@link MemoryExampleTable}. If the data set * at hand is completely numerical, one can simply use one of the double matrix methods * provided by this class. This will lead to an {@link ExampleSet} containing only numerical * attributes. Otherwise, one have to use the Object matrix methods. Please * note that only String objects and Number objects (Double, Integer) are allowed in this * case. Otherwise an Exception will be thrown. In case of the Object matrix methods the * method tries to identify the type itself and initialized the example set with the * correct attribute types (nominal or numerical).</p> * * <p>Please note that the internal representation of the nominal attribute values * depend on the order they appear in the data set. If this is not allowed (e.g. * for the label attribute of different training and testing sets, where the * internal representation should be the same in order to prevent label flips) * one should definitely use the usual ExampleTable - ExampleSet construction where * the nominal attribute value mapping can and should be performed beforehand. In these * cases the usage of this class is definitely not recommended.</p> * * @author Ingo Mierswa * @version $Id: ExampleSetFactory.java,v 1.3 2008/05/09 19:22:42 ingomierswa Exp $ */ public class ExampleSetFactory { /** Create a numerical example set from the given data matrix. The resulting example * set will not contain a label and consists of numerical attributes only. */ public static ExampleSet createExampleSet(double[][] data) { return createExampleSet(data, null); } /** Create a numerical example set from the given data matrix. The label of the * resulting example set be build from the column with the given index. The example * set consists of numerical attributes only. */ public static ExampleSet createExampleSet(double[][] data, int classColumn) { if (data.length == 0) { throw new RuntimeException("ExampleSetFactory.createExampleSet(double[][], int): data matrix is not allowed to be empty."); } double[][] dataWithoutLabel = new double[data.length][data[0].length - 1]; double[] labels = new double[data.length]; for (int e = 0; e < data.length; e++) { int counter = 0; for (int a = 0; a < data[e].length; a++) { if (a == classColumn) { labels[e] = data[e][a]; } else { dataWithoutLabel[e][counter++] = data[e][a]; } } } return createExampleSet(dataWithoutLabel, labels); } /** Create a numerical example set from the given data matrix. The label of the * resulting example set be build from the given double array. The example * set consists of numerical attributes only. */ public static ExampleSet createExampleSet(double[][] data, double[] labels) { if (data.length == 0) { throw new RuntimeException("ExampleSetFactory.createExampleSet(double[][], double[]): data matrix is not allowed to be empty."); } // create attributes int numberOfAttributes = data[0].length; List<Attribute> attributeList = new ArrayList<Attribute>(numberOfAttributes + (labels != null ? 1 : 0)); for (int a = 0; a < numberOfAttributes; a++) { attributeList.add(AttributeFactory.createAttribute("att" + (a+1), Ontology.NUMERICAL)); } Attribute labelAttribute = null; if (labels != null) { labelAttribute = AttributeFactory.createAttribute("label", Ontology.NUMERICAL); attributeList.add(labelAttribute); } // create table MemoryExampleTable table = new MemoryExampleTable(attributeList); for (int e = 0; e < data.length; e++) { double[] dataRow = data[e]; if (labelAttribute != null) { dataRow = new double[numberOfAttributes + 1]; System.arraycopy(data[e], 0, dataRow, 0, data[e].length); dataRow[dataRow.length - 1] = labels[e]; } table.addDataRow(new DoubleArrayDataRow(dataRow)); } return table.createExampleSet(labelAttribute); } /** Create a mixed-type example set from the given data matrix. The resulting example * set will not contain a label and might consist of numerical and nominal attributes. */ public static ExampleSet createExampleSet(Object[][] data) { return createExampleSet(data, null); } /** Create a numerical example set from the given data matrix. The label of the * resulting example set be build from the column with the given index. The example * set might consist of numerical and nominal attributes. */ public static ExampleSet createExampleSet(Object[][] data, int classColumn) { if (data.length == 0) { throw new RuntimeException("ExampleSetFactory.createExampleSet(Object[][], int): data matrix is not allowed to be empty."); } Object[][] dataWithoutLabel = new Object[data.length][data[0].length - 1]; Object[] labels = new Object[data.length]; for (int e = 0; e < data.length; e++) { int counter = 0; for (int a = 0; a < data[e].length; a++) { if (a == classColumn) { labels[e] = data[e][a]; } else { dataWithoutLabel[e][counter++] = data[e][a]; } } } return createExampleSet(dataWithoutLabel, labels); } /** Create a numerical example set from the given data matrix. The label of the * resulting example set be build from the given double array. The example * set might consist of numerical and nominal attributes. */ public static ExampleSet createExampleSet(Object[][] data, Object[] labels) { if (data.length == 0) { throw new RuntimeException("ExampleSetFactory.createExampleSet(Object[][], Object[]): data matrix is not allowed to be empty."); } // create attributes int numberOfAttributes = data[0].length; int totalNumber = numberOfAttributes + (labels != null ? 1 : 0); boolean[] nominal = new boolean[totalNumber]; List<Attribute> attributeList = new ArrayList<Attribute>(totalNumber); for (int a = 0; a < numberOfAttributes; a++) { Object current = data[0][a]; if (current instanceof Number) { attributeList.add(AttributeFactory.createAttribute("att" + (a+1), Ontology.NUMERICAL)); nominal[a] = false; } else if (current instanceof String) { attributeList.add(AttributeFactory.createAttribute("att" + (a+1), Ontology.NOMINAL)); nominal[a] = true; } else { throw new RuntimeException("ExampleSetFactory.createExampleSet(Object[][], Object[]): only objects of type String or Number (Double, Integer) are allowed for the object data matrix."); } } Attribute labelAttribute = null; if (labels != null) { Object current = labels[0]; if (current instanceof Number) { labelAttribute = AttributeFactory.createAttribute("label", Ontology.NUMERICAL); nominal[nominal.length - 1] = false; } else if (current instanceof String) { labelAttribute = AttributeFactory.createAttribute("label", Ontology.NOMINAL); nominal[nominal.length - 1] = true; } else { throw new RuntimeException("ExampleSetFactory.createExampleSet(Object[][], Object[]): only objects of type String or Number (Double, Integer) are allowed for the object data matrix."); } attributeList.add(labelAttribute); } // create table MemoryExampleTable table = new MemoryExampleTable(attributeList); for (int e = 0; e < data.length; e++) { double[] dataRow = new double[totalNumber]; for (int a = 0; a < numberOfAttributes; a++) { Object current = data[e][a]; if (current instanceof Number) { if (nominal[a]) throw new RuntimeException("ExampleSetFactory.createExampleSet(Object[][], Object[]): type of objects did change in column. Only the same type of objects is allowed for complete columns."); dataRow[a] = ((Number)current).doubleValue(); } else if (current instanceof String) { if (!nominal[a]) throw new RuntimeException("ExampleSetFactory.createExampleSet(Object[][], Object[]): type of objects did change in column. Only the same type of objects is allowed for complete columns."); dataRow[a] = attributeList.get(a).getMapping().mapString((String)current); } else { throw new RuntimeException("ExampleSetFactory.createExampleSet(Object[][], Object[]): only objects of type String or Number (Double, Integer) are allowed for the object data matrix."); } } if (labelAttribute != null) { Object current = labels[e]; if (current instanceof Number) { if (nominal[nominal.length - 1]) throw new RuntimeException("ExampleSetFactory.createExampleSet(Object[][], Object[]): type of objects did change in column. Only the same type of objects is allowed for complete columns."); dataRow[dataRow.length - 1] = ((Number)current).doubleValue(); } else if (current instanceof String) { if (!nominal[nominal.length - 1]) throw new RuntimeException("ExampleSetFactory.createExampleSet(Object[][], Object[]): type of objects did change in column. Only the same type of objects is allowed for complete columns."); dataRow[dataRow.length - 1] = attributeList.get(attributeList.size() - 1).getMapping().mapString((String)current); } else { throw new RuntimeException("ExampleSetFactory.createExampleSet(Object[][], Object[]): only objects of type String or Number (Double, Integer) are allowed for the object data matrix."); } } table.addDataRow(new DoubleArrayDataRow(dataRow)); } return table.createExampleSet(labelAttribute); } }