/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.example; import java.util.ArrayList; import java.util.Date; import java.util.List; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.example.table.ExampleTable; import com.rapidminer.example.utils.ExampleSetBuilder; import com.rapidminer.example.utils.ExampleSets; import com.rapidminer.tools.Ontology; /** * <p> * This class can be used to easily create @link {ExampleSet}s and the underlying * {@link ExampleTable} with simple method calls. Please note that it is often better to explicitly * fill the data table yourself using an {@link ExampleSetBuilder} provided by {@link ExampleSets}. * For memory usage reasons, it is also often not recommended to create the data matrix from your * existing data in an extra step and then use one of the factory methods. In these cases, it is * better to directly fill an {@link ExampleSetBuilder} from your data source. * </p> * * <p> * However, in some cases it might be more convenient to use this class in order to create example * sets from data matrices in a fast and simple way. The resulting example set will be backed up by * a {@link ExampleTable} created by a {@link ExampleSetBuilder}. If the data set at hand is * completely numerical, one can simply use one of the double matrix methods provided by this class. * This will lead to an {@link ExampleSet} containing only numerical attributes. Otherwise, one have * to use the Object matrix methods. Please note that only String objects and Number objects * (Double, Integer) are allowed in this case. Otherwise an Exception will be thrown. In case of the * Object matrix methods the method tries to identify the type itself and initialized the example * set with the correct attribute types (nominal or numerical). * </p> * * <p> * Please note that the internal representation of the nominal attribute values depend on the order * they appear in the data set. If this is not allowed (e.g. for the label attribute of different * training and testing sets, where the internal representation should be the same in order to * prevent label flips) one should definitely use the usual ExampleTable - ExampleSet construction * where the nominal attribute value mapping can and should be performed beforehand. In these cases * the usage of this class is definitely not recommended. * </p> * * @author Ingo Mierswa */ public class ExampleSetFactory { /** * Create a numerical example set from the given data matrix. The resulting example set will not * contain a label and consists of numerical attributes only. */ public static ExampleSet createExampleSet(double[][] data) { return createExampleSet(data, null); } /** * Create a numerical example set from the given data matrix. The label of the resulting example * set be build from the column with the given index. The example set consists of numerical * attributes only. */ public static ExampleSet createExampleSet(double[][] data, int classColumn) { if (data.length == 0) { throw new RuntimeException( "ExampleSetFactory.createExampleSet(double[][], int): data matrix is not allowed to be empty."); } double[][] dataWithoutLabel = new double[data.length][data[0].length - 1]; double[] labels = new double[data.length]; for (int e = 0; e < data.length; e++) { int counter = 0; for (int a = 0; a < data[e].length; a++) { if (a == classColumn) { labels[e] = data[e][a]; } else { dataWithoutLabel[e][counter++] = data[e][a]; } } } return createExampleSet(dataWithoutLabel, labels); } /** * Create a numerical example set from the given data matrix. The label of the resulting example * set be build from the given double array. The example set consists of numerical attributes * only. */ public static ExampleSet createExampleSet(double[][] data, double[] labels) { if (data.length == 0) { throw new RuntimeException( "ExampleSetFactory.createExampleSet(double[][], double[]): data matrix is not allowed to be empty."); } // create attributes int numberOfAttributes = data[0].length; List<Attribute> attributeList = new ArrayList<Attribute>(numberOfAttributes + (labels != null ? 1 : 0)); for (int a = 0; a < numberOfAttributes; a++) { attributeList.add(AttributeFactory.createAttribute("att" + (a + 1), Ontology.NUMERICAL)); } Attribute labelAttribute = null; if (labels != null) { labelAttribute = AttributeFactory.createAttribute("label", Ontology.NUMERICAL); attributeList.add(labelAttribute); } // create example set ExampleSetBuilder builder = ExampleSets.from(attributeList).withExpectedSize(data.length); for (int e = 0; e < data.length; e++) { double[] dataRow = data[e]; if (labelAttribute != null) { dataRow = new double[numberOfAttributes + 1]; System.arraycopy(data[e], 0, dataRow, 0, data[e].length); dataRow[dataRow.length - 1] = labels[e]; } builder.addRow(dataRow); } if (labelAttribute != null) { builder.withRole(labelAttribute, Attributes.LABEL_NAME); } return builder.build(); } /** * Create a mixed-type example set from the given data matrix. The resulting example set will * not contain a label and might consist of numerical, nominal or date attributes. */ public static ExampleSet createExampleSet(Object[][] data) { return createExampleSet(data, null); } /** * Create a numerical example set from the given data matrix. The label of the resulting example * set be build from the column with the given index. The example set might consist of * numerical, nominal or date attributes. */ public static ExampleSet createExampleSet(Object[][] data, int classColumn) { if (data.length == 0) { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], int): data matrix is not allowed to be empty."); } Object[][] dataWithoutLabel = new Object[data.length][data[0].length - 1]; Object[] labels = new Object[data.length]; for (int e = 0; e < data.length; e++) { int counter = 0; for (int a = 0; a < data[e].length; a++) { if (a == classColumn) { labels[e] = data[e][a]; } else { dataWithoutLabel[e][counter++] = data[e][a]; } } } return createExampleSet(dataWithoutLabel, labels); } /** * Create a numerical example set from the given data matrix. The label of the resulting example * set be build from the given double array. The example set might consist of numerical, nominal * or date attributes. */ public static ExampleSet createExampleSet(Object[][] data, Object[] labels) { if (data.length == 0) { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], Object[]): data matrix is not allowed to be empty."); } // create attributes int numberOfAttributes = data[0].length; int totalNumber = numberOfAttributes + (labels != null ? 1 : 0); boolean[] nominal = new boolean[totalNumber]; List<Attribute> attributeList = new ArrayList<Attribute>(totalNumber); for (int a = 0; a < numberOfAttributes; a++) { Object current = getFirstNonNull(data, a); if (current instanceof Number) { attributeList.add(AttributeFactory.createAttribute("att" + (a + 1), Ontology.NUMERICAL)); nominal[a] = false; } else if (current instanceof String) { attributeList.add(AttributeFactory.createAttribute("att" + (a + 1), Ontology.NOMINAL)); nominal[a] = true; } else if (current instanceof Date) { attributeList.add(AttributeFactory.createAttribute("att" + (a + 1), Ontology.DATE_TIME)); nominal[a] = false; } else { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], Object[]): only objects of type String or Number (Double, Integer) are allowed for the object data matrix."); } } Attribute labelAttribute = null; if (labels != null) { Object current = labels[0]; if (current instanceof Number) { labelAttribute = AttributeFactory.createAttribute("label", Ontology.NUMERICAL); nominal[nominal.length - 1] = false; } else if (current instanceof String) { labelAttribute = AttributeFactory.createAttribute("label", Ontology.NOMINAL); nominal[nominal.length - 1] = true; } else if (current instanceof Date) { labelAttribute = AttributeFactory.createAttribute("label", Ontology.DATE_TIME); nominal[nominal.length - 1] = false; } else { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], Object[]): only objects of type String or Number (Double, Integer) are allowed for the object data matrix."); } attributeList.add(labelAttribute); } // create example set ExampleSetBuilder builder = ExampleSets.from(attributeList).withExpectedSize(data.length); for (int e = 0; e < data.length; e++) { double[] dataRow = new double[totalNumber]; for (int a = 0; a < numberOfAttributes; a++) { Object current = data[e][a]; if (current == null) { dataRow[a] = Double.NaN; } else if (current instanceof Number) { if (nominal[a]) { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], Object[]): type of objects did change in column. Only the same type of objects is allowed for complete columns."); } dataRow[a] = ((Number) current).doubleValue(); } else if (current instanceof String) { if (!nominal[a]) { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], Object[]): type of objects did change in column. Only the same type of objects is allowed for complete columns."); } dataRow[a] = attributeList.get(a).getMapping().mapString((String) current); } else if (current instanceof Date) { if (nominal[a]) { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], Object[]): type of objects did change in column. Only the same type of objects is allowed for complete columns."); } dataRow[a] = ((Date) current).getTime(); } else { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], Object[]): only objects of type String or Number (Double, Integer) are allowed for the object data matrix."); } } if (labelAttribute != null) { Object current = labels[e]; if (current == null) { dataRow[dataRow.length - 1] = Double.NaN; } else if (current instanceof Number) { if (nominal[nominal.length - 1]) { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], Object[]): type of objects did change in column. Only the same type of objects is allowed for complete columns."); } dataRow[dataRow.length - 1] = ((Number) current).doubleValue(); } else if (current instanceof String) { if (!nominal[nominal.length - 1]) { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], Object[]): type of objects did change in column. Only the same type of objects is allowed for complete columns."); } dataRow[dataRow.length - 1] = attributeList.get(attributeList.size() - 1).getMapping() .mapString((String) current); } else if (current instanceof Date) { if (nominal[nominal.length - 1]) { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], Object[]): type of objects did change in column. Only the same type of objects is allowed for complete columns."); } dataRow[dataRow.length - 1] = ((Date) current).getTime(); } else { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], Object[]): only objects of type String or Number (Double, Integer) are allowed for the object data matrix."); } } builder.addRow(dataRow); } if (labelAttribute != null) { builder.withRole(labelAttribute, Attributes.LABEL_NAME); } return builder.build(); } /** * @param data * the data array object * @param a * the current attribute index * @return the first non-null object */ private static Object getFirstNonNull(Object[][] data, int a) { int tryCounter = 0; Object current = data[tryCounter][a]; while (current == null) { tryCounter++; if (tryCounter < data.length) { current = data[tryCounter][a]; } else { throw new RuntimeException( "ExampleSetFactory.createExampleSet(Object[][], Object[]): provided attribute at column " + a + " does only contain null values."); } } return current; } }