/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.studio.concurrency.internal.util; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import com.rapidminer.example.Attribute; import com.rapidminer.example.AttributeRole; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.example.table.DataRow; import com.rapidminer.example.table.DataRowFactory; import com.rapidminer.example.utils.ExampleSetBuilder; import com.rapidminer.example.utils.ExampleSets; import com.rapidminer.operator.MissingIOObjectException; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.tools.Ontology; /** * Provides utility methods to merge multiple {@link ExampleSet}s into a single one. * * * @author Sebastian Land * @since 7.4 * */ public class ExampleSetAppender { /** * Merge the given {@link ExampleSet}s into one ExampleSet. Checks for validity of the arguments * and throws in case the provided sets cannot be merged. * * @param caller * the calling operator which is checked for stop; can be {@code null} * @param allExampleSets * the {@link ExampleSet}s to merge together * @return the merged {@link ExampleSet} * @throws OperatorException * if the provided ExampleSets are incompatible with each other */ public static ExampleSet merge(Operator caller, ExampleSet... allExampleSets) throws OperatorException { if (allExampleSets == null) { throw new IllegalArgumentException("allExampleSets must not be null!"); } return merge(caller, Arrays.asList(allExampleSets)); } /** * Merge the given {@link ExampleSet}s into one ExampleSet. Checks for validity of the arguments * and throws in case the provided sets cannot be merged. * * @param caller * the calling operator which is checked for stop; can be {@code null} * @param allExampleSets * the {@link ExampleSet}s to merge together * @return the merged {@link ExampleSet} * @throws OperatorException * if the provided ExampleSets are incompatible with each other */ public static ExampleSet merge(Operator caller, List<ExampleSet> allExampleSets) throws OperatorException { // input sanity checks if (allExampleSets == null) { throw new IllegalArgumentException("allExampleSets must not be null!"); } if (allExampleSets.isEmpty()) { throw new MissingIOObjectException(ExampleSet.class); } // checks if all example sets have the same signature checkForCompatibility(allExampleSets, caller); // create new example table List<ExampleSet> remainingExampleSets = allExampleSets.subList(1, allExampleSets.size()); ExampleSet firstSet = allExampleSets.get(0); int numberOfAtts = firstSet.getAttributes().allSize(); List<Attribute> newAttributeList = new ArrayList<>(numberOfAtts); HashMap<String, Attribute> newAttributeNameMap = new HashMap<>(numberOfAtts, 1.0f); Map<Attribute, String> specialAttributesMap = new HashMap<>(numberOfAtts, 1.0f); Iterator<AttributeRole> a = firstSet.getAttributes().allAttributeRoles(); while (a.hasNext()) { AttributeRole role = a.next(); Attribute oldAttribute = role.getAttribute(); int newType = oldAttribute.getValueType(); if (oldAttribute.isNominal()) { // collect values to see if we have at least two Set<String> values = new HashSet<>(); values.addAll(oldAttribute.getMapping().getValues()); boolean hasPolynominal = false; for (ExampleSet otherExampleSet : remainingExampleSets) { Attribute otherAttribute = otherExampleSet.getAttributes().get(oldAttribute.getName()); // At least one non-nominal -> throw if (!otherAttribute.isNominal()) { throwIncompatible(oldAttribute, otherAttribute, caller); } values.addAll(otherAttribute.getMapping().getValues()); hasPolynominal |= Ontology.ATTRIBUTE_VALUE_TYPE.isA(otherAttribute.getValueType(), Ontology.POLYNOMINAL); } // binominals with more than 2 values cannot keep their value type, else try to // preserve value type is all have the same if (hasPolynominal || values.size() > 2) { newType = Ontology.POLYNOMINAL; } } else if (oldAttribute.isNumerical()) { boolean hasReal = false; for (ExampleSet otherExampleSet : remainingExampleSets) { Attribute otherAttribute = otherExampleSet.getAttributes().get(oldAttribute.getName()); // At least one non-numerical -> throw if (!otherAttribute.isNumerical()) { throwIncompatible(oldAttribute, otherAttribute, caller); } hasReal |= Ontology.ATTRIBUTE_VALUE_TYPE.isA(otherAttribute.getValueType(), Ontology.REAL); } if (hasReal) { newType = Ontology.REAL; } } else if (oldAttribute.isDateTime()) { for (ExampleSet otherExampleSet : remainingExampleSets) { Attribute otherAttribute = otherExampleSet.getAttributes().get(oldAttribute.getName()); // this case covers the date, time, date_time valueType // if all attribute valueTypes are the same keep it, otherwise switch to // date_time as the parent valueType if (otherAttribute.getValueType() != newType) { if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(otherAttribute.getValueType(), Ontology.DATE) || Ontology.ATTRIBUTE_VALUE_TYPE.isA(otherAttribute.getValueType(), Ontology.TIME) || Ontology.ATTRIBUTE_VALUE_TYPE.isA(otherAttribute.getValueType(), Ontology.DATE_TIME)) { newType = Ontology.DATE_TIME; } else { // totally different valueType, cannot merge -> throw throwIncompatible(oldAttribute, otherAttribute, caller); } } } } else { // cannot happen throw new IllegalStateException( "Cannot merge example sets! One attribute was of unsupported type!" + oldAttribute); } if (newType == Ontology.NUMERICAL) { // we always kill numerical by converting to real newType = Ontology.REAL; } else if (newType == Ontology.NOMINAL) { // we always kill nominal by converting to polynominal newType = Ontology.POLYNOMINAL; } Attribute newAttribute = AttributeFactory.createAttribute(oldAttribute.getName(), newType, oldAttribute.getBlockType()); newAttributeNameMap.put(newAttribute.getName(), newAttribute); newAttributeList.add(newAttribute); if (role.isSpecial()) { specialAttributesMap.put(newAttribute, role.getSpecialName()); } } int finalSize = 0; for (ExampleSet otherExampleSet : allExampleSets) { finalSize += otherExampleSet.size(); } ExampleSetBuilder builder = ExampleSets.from(newAttributeList).withExpectedSize(finalSize); // now fill table with rows, copied from source example sets DataRowFactory factory = new DataRowFactory(DataRowFactory.TYPE_DOUBLE_ARRAY, '.'); int numberOfAttributes = newAttributeList.size(); for (ExampleSet exampleSet : allExampleSets) { Attribute[] allAttributes = new Attribute[numberOfAttributes]; int i = 0; for (Iterator<Attribute> iterator = exampleSet.getAttributes().allAttributes(); iterator.hasNext();) { allAttributes[i++] = iterator.next(); } for (Example example : exampleSet) { DataRow dataRow = factory.create(numberOfAttributes); for (Attribute oldAttribute : allAttributes) { Attribute newAttribute = newAttributeNameMap.get(oldAttribute.getName()); double oldValue = example.getValue(oldAttribute); if (Double.isNaN(oldValue)) { dataRow.set(newAttribute, oldValue); } else { if (oldAttribute.isNominal()) { dataRow.set(newAttribute, newAttribute.getMapping().mapString(oldAttribute.getMapping().mapIndex((int) oldValue))); } else { dataRow.set(newAttribute, oldValue); } } } // adding new row to table builder.addDataRow(dataRow); } if (caller != null) { caller.checkForStop(); } } // create result example set ExampleSet resultSet = builder.withRoles(specialAttributesMap).build(); resultSet.getAnnotations().addAll(firstSet.getAnnotations()); return resultSet; } private static void throwIncompatible(Attribute oldAttribute, Attribute otherAttribute, Operator caller) throws UserError { throw new UserError(caller, 925, "Attribute '" + oldAttribute.getName() + "' has incompatible types (" + Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(oldAttribute.getValueType()) + " and " + Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(otherAttribute.getValueType()) + ") in two input sets."); } /** * Checks whether all attributes in set 1 occur in the others as well. Types are (deliberately) * not checked. Type check happens in {@link #merge(List)} itself. * * @throws OperatorException * on failed check */ private static void checkForCompatibility(List<ExampleSet> allExampleSets, Operator caller) throws OperatorException { Iterator<ExampleSet> i = allExampleSets.iterator(); ExampleSet first = i.next(); while (i.hasNext()) { checkForCompatibility(first, i.next(), caller); } } private static void checkForCompatibility(ExampleSet first, ExampleSet second, Operator caller) throws OperatorException { if (first.getAttributes().allSize() != second.getAttributes().allSize()) { throw new UserError(caller, 925, "numbers of attributes are different"); } Iterator<Attribute> firstIterator = first.getAttributes().allAttributes(); while (firstIterator.hasNext()) { Attribute firstAttribute = firstIterator.next(); Attribute secondAttribute = second.getAttributes().get(firstAttribute.getName()); if (secondAttribute == null) { throw new UserError(caller, 925, "Attribute with name '" + firstAttribute.getName() + "' is not part of second example set."); } } } }