/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.join; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.function.IntToDoubleFunction; import com.rapidminer.RapidMiner; import com.rapidminer.example.Attribute; import com.rapidminer.example.AttributeRole; import com.rapidminer.example.Example; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.table.AttributeFactory; import com.rapidminer.example.table.DataRow; import com.rapidminer.example.table.DataRowFactory; import com.rapidminer.example.utils.ExampleSetBuilder; import com.rapidminer.example.utils.ExampleSets; import com.rapidminer.operator.MissingIOObjectException; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.ProcessSetupError.Severity; import com.rapidminer.operator.ProcessStoppedException; import com.rapidminer.operator.SimpleProcessSetupError; import com.rapidminer.operator.UserError; import com.rapidminer.operator.annotation.ResourceConsumptionEstimator; import com.rapidminer.operator.generator.ExampleSetGenerator; import com.rapidminer.operator.ports.InputPort; import com.rapidminer.operator.ports.InputPortExtender; import com.rapidminer.operator.ports.OutputPort; import com.rapidminer.operator.ports.metadata.AttributeMetaData; import com.rapidminer.operator.ports.metadata.ExampleSetMetaData; import com.rapidminer.operator.ports.metadata.ExampleSetPrecondition; import com.rapidminer.operator.ports.metadata.MDTransformationRule; import com.rapidminer.operator.ports.metadata.MetaData; import com.rapidminer.operator.ports.metadata.MetaDataInfo; import com.rapidminer.operator.ports.metadata.Precondition; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeCategory; import com.rapidminer.parameter.UndefinedParameterError; import com.rapidminer.studio.internal.ProcessStoppedRuntimeException; import com.rapidminer.tools.Ontology; import com.rapidminer.tools.OperatorResourceConsumptionHandler; import com.rapidminer.tools.ParameterService; import com.rapidminer.tools.parameter.internal.DataManagementParameterHelper; /** * <p> * This operator merges two or more given example sets by adding all examples in one example table * containing all data rows. Please note that the new example table is built in memory and this * operator might therefore not be applicable for merging huge data set tables from a database. In * that case other preprocessing tools should be used which aggregates, joins, and merges tables * into one table which is then used by RapidMiner. * </p> * * <p> * All input example sets must provide the same attribute signature. That means that all examples * sets must have the same number of (special) attributes and attribute names. If this is true this * operator simply merges all example sets by adding all examples of all table into a new set which * is then returned. * </p> * * @author Ingo Mierswa */ public class ExampleSetMerge extends Operator { private final InputPortExtender inputExtender = new InputPortExtender("example set", getInputPorts()) { @Override protected Precondition makePrecondition(InputPort port) { return new ExampleSetPrecondition(port) { { setOptional(true); } @Override public void makeAdditionalChecks(ExampleSetMetaData emd) throws UndefinedParameterError { for (MetaData metaData : inputExtender.getMetaData(true)) { if (metaData instanceof ExampleSetMetaData) { MetaDataInfo result = emd.equalHeader((ExampleSetMetaData) metaData); if (result == MetaDataInfo.NO) { addError(new SimpleProcessSetupError(Severity.ERROR, getPortOwner(), "exampleset.sets_incompatible")); break; } if (result == MetaDataInfo.UNKNOWN) { addError(new SimpleProcessSetupError(Severity.WARNING, getPortOwner(), "exampleset.sets_incompatible")); break; } } } } }; } }; private final OutputPort mergedOutput = getOutputPorts().createPort("merged set"); /** The parameter name for "Determines, how the data is represented internally." */ public static final String PARAMETER_DATAMANAGEMENT = ExampleSetGenerator.PARAMETER_DATAMANAGEMENT; public ExampleSetMerge(OperatorDescription description) { super(description); inputExtender.start(); getTransformer().addRule(inputExtender.makeFlatteningPassThroughRule(mergedOutput)); getTransformer().addRule(new MDTransformationRule() { @Override public void transformMD() { List<MetaData> metaDatas = inputExtender.getMetaData(true); List<ExampleSetMetaData> emds = new ArrayList<ExampleSetMetaData>(metaDatas.size()); for (MetaData metaData : metaDatas) { if (metaData instanceof ExampleSetMetaData) { emds.add((ExampleSetMetaData) metaData); } } // now unify all single attributes meta data if (emds.size() > 0) { ExampleSetMetaData resultEMD = emds.get(0); for (int i = 1; i < emds.size(); i++) { ExampleSetMetaData mergerEMD = emds.get(i); resultEMD.getNumberOfExamples().add(mergerEMD.getNumberOfExamples()); // now iterating over all single attributes in order to merge their meta // data for (AttributeMetaData amd : resultEMD.getAllAttributes()) { String name = amd.getName(); AttributeMetaData mergingAMD = mergerEMD.getAttributeByName(name); if (mergingAMD != null) { // values if (amd.isNominal()) { amd.getValueSet().addAll(mergingAMD.getValueSet()); } else { amd.getValueRange().union(mergingAMD.getValueRange()); } amd.getValueSetRelation().merge(mergingAMD.getValueSetRelation()); // missing values amd.getNumberOfMissingValues().add(mergingAMD.getNumberOfMissingValues()); } } } mergedOutput.deliverMD(resultEMD); } } }); } @Override public void doWork() throws OperatorException { List<ExampleSet> allExampleSets = inputExtender.getData(ExampleSet.class, true); mergedOutput.deliver(merge(allExampleSets)); } public ExampleSet merge(List<ExampleSet> allExampleSets) throws OperatorException { // throw error if no example sets were available if (allExampleSets.size() == 0) { throw new MissingIOObjectException(ExampleSet.class); } // checks if all example sets have the same signature checkForCompatibility(allExampleSets); // create new example table ExampleSet firstSet = allExampleSets.get(0); List<Attribute> newAttributeList = new ArrayList<Attribute>(); HashMap<String, Attribute> newAttributeNameMap = new HashMap<String, Attribute>(); Map<Attribute, String> specialAttributesMap = new LinkedHashMap<Attribute, String>(); Iterator<AttributeRole> a = firstSet.getAttributes().allAttributeRoles(); while (a.hasNext()) { AttributeRole role = a.next(); Attribute oldAttribute = role.getAttribute(); int newType; if (oldAttribute.isNominal()) { // collect values to see if we have at least two Set<String> values = new HashSet<String>(); values.addAll(oldAttribute.getMapping().getValues()); boolean hasNominal = false; boolean hasPolynominal = false; boolean hasSameValueType = true; for (ExampleSet otherExampleSet : allExampleSets) { Attribute otherAttribute = otherExampleSet.getAttributes().get(oldAttribute.getName()); // At least one non-nominal -> throw if (!otherAttribute.isNominal()) { throwIncompatible(oldAttribute, otherAttribute); } values.addAll(otherAttribute.getMapping().getValues()); hasSameValueType &= (otherAttribute.getValueType() == oldAttribute.getValueType()); hasNominal |= (otherAttribute.getValueType() == Ontology.NOMINAL); hasPolynominal |= Ontology.ATTRIBUTE_VALUE_TYPE.isA(otherAttribute.getValueType(), Ontology.POLYNOMINAL); } // binominals with more than 2 values cannot keep their value type, else try to // preserve value type is all have the same if (hasSameValueType && (!Ontology.ATTRIBUTE_VALUE_TYPE.isA(oldAttribute.getValueType(), Ontology.BINOMINAL) || values .size() <= 2)) { newType = oldAttribute.getValueType(); } else if (hasNominal) { newType = Ontology.NOMINAL; } else if (hasPolynominal || values.size() > 2) { newType = Ontology.POLYNOMINAL; } else { newType = oldAttribute.getValueType(); } } else if (oldAttribute.isNumerical()) { boolean hasReal = false; boolean hasNumerical = false; boolean hasSameValueType = true; for (ExampleSet otherExampleSet : allExampleSets) { Attribute otherAttribute = otherExampleSet.getAttributes().get(oldAttribute.getName()); // At least one non-numerical -> throw if (!otherAttribute.isNumerical()) { throwIncompatible(oldAttribute, otherAttribute); } hasSameValueType &= (otherAttribute.getValueType() == oldAttribute.getValueType()); hasNumerical |= (otherAttribute.getValueType() == Ontology.NUMERICAL); hasReal |= Ontology.ATTRIBUTE_VALUE_TYPE.isA(otherAttribute.getValueType(), Ontology.REAL); } if (hasSameValueType) { newType = oldAttribute.getValueType(); } else if (hasNumerical) { newType = Ontology.NUMERICAL; } else if (hasReal) { newType = Ontology.REAL; } else { newType = oldAttribute.getValueType(); } } else if (Ontology.ATTRIBUTE_VALUE_TYPE.isA(oldAttribute.getValueType(), Ontology.DATE) || (Ontology.ATTRIBUTE_VALUE_TYPE.isA(oldAttribute.getValueType(), Ontology.TIME) || (Ontology.ATTRIBUTE_VALUE_TYPE .isA(oldAttribute.getValueType(), Ontology.DATE_TIME)))) { // this case covers the date, time, date_time valueType // if all attribute valueTypes are the same keep it, otherwise switch to date_time // as the parent valueType newType = oldAttribute.getValueType(); for (ExampleSet otherExampleSet : allExampleSets) { Attribute otherAttribute = otherExampleSet.getAttributes().get(oldAttribute.getName()); // not the same type but all if (otherAttribute.getValueType() != newType) { if (((Ontology.ATTRIBUTE_VALUE_TYPE.isA(oldAttribute.getValueType(), Ontology.DATE) || (Ontology.ATTRIBUTE_VALUE_TYPE .isA(oldAttribute.getValueType(), Ontology.TIME) || (Ontology.ATTRIBUTE_VALUE_TYPE.isA( oldAttribute.getValueType(), Ontology.DATE_TIME)))))) { newType = Ontology.DATE_TIME; } else { // totally different valueType, cannot merge -> throw throwIncompatible(oldAttribute, otherAttribute); } } } } else { for (ExampleSet otherExampleSet : allExampleSets) { Attribute otherAttribute = otherExampleSet.getAttributes().get(oldAttribute.getName()); // At least one non-numerical -> throw if (otherAttribute.getValueType() != oldAttribute.getValueType()) { throwIncompatible(oldAttribute, otherAttribute); } } newType = oldAttribute.getValueType(); } Attribute newAttribute = AttributeFactory.createAttribute(oldAttribute.getName(), newType, // oldAttribute.getValueType(), oldAttribute.getBlockType()); newAttributeNameMap.put(newAttribute.getName(), newAttribute); newAttributeList.add(newAttribute); if (role.isSpecial()) { specialAttributesMap.put(newAttribute, role.getSpecialName()); } } int totalSize = 0; for (ExampleSet set : allExampleSets) { totalSize += set.size(); } ExampleSetBuilder builder = ExampleSets.from(newAttributeList); if (Boolean.parseBoolean( ParameterService.getParameterValue(RapidMiner.PROPERTY_RAPIDMINER_SYSTEM_LEGACY_DATA_MGMT))) { // to preserve the (legacy) data management, we must use the data row factory builder.withExpectedSize(totalSize); int datamanagement = getParameterAsInt(PARAMETER_DATAMANAGEMENT); int numberOfAttributes = newAttributeList.size(); DataRowFactory factory = new DataRowFactory(datamanagement, '.'); for (ExampleSet exampleSet : allExampleSets) { for (Example example : exampleSet) { DataRow dataRow = factory.create(numberOfAttributes); Iterator<Attribute> iterator = exampleSet.getAttributes().allAttributes(); while (iterator.hasNext()) { Attribute oldAttribute = iterator.next(); Attribute newAttribute = newAttributeNameMap.get(oldAttribute.getName()); double oldValue = example.getValue(oldAttribute); if (Double.isNaN(oldValue)) { dataRow.set(newAttribute, oldValue); } else { if (oldAttribute.isNominal()) { dataRow.set(newAttribute, newAttribute.getMapping() .mapString(oldAttribute.getMapping().mapIndex((int) oldValue))); } else { dataRow.set(newAttribute, oldValue); } } } // adding new row to builder builder.addDataRow(dataRow); } checkForStop(); } } else { builder.withBlankSize(totalSize); builder.withOptimizationHint(DataManagementParameterHelper.getSelectedDataManagement(this)); int[] sizes = new int[allExampleSets.size()]; int i = 0; for (ExampleSet set : allExampleSets) { sizes[i] = set.size(); i++; } int[] sizesSums = new int[sizes.length]; sizesSums[0] = sizes[0]; for (int j = 1; j < sizes.length; j++) { sizesSums[j] = sizesSums[j - 1] + sizes[j]; } for (Attribute newAttribute : newAttributeList) { builder.withColumnFiller(newAttribute, new IntToDoubleFunction() { private final String attributeName = newAttribute.getName(); private ExampleSet oldSet = null; private Attribute oldAttribute = null; private int start = 0; private int end = 0; private int oldExampleSetIndex = -1; @Override public synchronized double applyAsDouble(int i) { if (i < start || i >= end) { try { ExampleSetMerge.this.checkForStop(); } catch (ProcessStoppedException e) { throw new ProcessStoppedRuntimeException(); } int startIndex = 0; end = 0; if (oldExampleSetIndex > -1 && i >= sizesSums[oldExampleSetIndex]) { startIndex = oldExampleSetIndex + 1; end = sizesSums[oldExampleSetIndex]; } for (int j = startIndex; j < sizesSums.length; j++) { start = end; end = sizesSums[j]; if (end > i) { oldExampleSetIndex = j; oldSet = allExampleSets.get(j); oldAttribute = oldSet.getAttributes().get(attributeName); break; } } } double oldValue = oldSet.getExample(i - start).getValue(oldAttribute); if (Double.isNaN(oldValue)) { return Double.NaN; } else { if (oldAttribute.isNominal()) { return newAttribute.getMapping() .mapString(oldAttribute.getMapping().mapIndex((int) oldValue)); } else { return oldValue; } } } }); } } // create result example set ExampleSet resultSet = builder.withRoles(specialAttributesMap).build(); resultSet.getAnnotations().addAll(firstSet.getAnnotations()); return resultSet; } private void throwIncompatible(Attribute oldAttribute, Attribute otherAttribute) throws UserError { throw new UserError(this, 925, "Attribute '" + oldAttribute.getName() + "' has incompatible types (" + Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(oldAttribute.getValueType()) + " and " + Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(otherAttribute.getValueType()) + ") in two input sets."); } /** * Checks whether all attributes in set 1 occur in the others as well. Types are (deliberately) * not checked. Type check happens in {@link #merge(List)} itself. * * @throws on * failed check */ private void checkForCompatibility(List<ExampleSet> allExampleSets) throws OperatorException { ExampleSet first = allExampleSets.get(0); Iterator<ExampleSet> i = allExampleSets.iterator(); while (i.hasNext()) { checkForCompatibility(first, i.next()); } } private void checkForCompatibility(ExampleSet first, ExampleSet second) throws OperatorException { if (first.getAttributes().allSize() != second.getAttributes().allSize()) { throw new UserError(this, 925, "numbers of attributes are different"); } Iterator<Attribute> firstIterator = first.getAttributes().allAttributes(); while (firstIterator.hasNext()) { Attribute firstAttribute = firstIterator.next(); Attribute secondAttribute = second.getAttributes().get(firstAttribute.getName()); if (secondAttribute == null) { throw new UserError(this, 925, "Attribute with name '" + firstAttribute.getName() + "' is not part of second example set."); // No type check necessary. Type check is done in mrege() itself. // //if (firstAttribute.getValueType() != secondAttribute.getValueType()) { // // ATTENTION: Breaks compatibility for previously running // processes // // maybe even better: check for subtypes in both directions and use super-type // above // if (!Ontology.ATTRIBUTE_VALUE_TYPE.isA(secondAttribute.getValueType(), // firstAttribute.getValueType())) { // throw new UserError(this, 925, "Attribute '" + firstAttribute.getName() + // "' has incompatible types (" + // Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(firstAttribute.getValueType()) + " and " + // Ontology.ATTRIBUTE_VALUE_TYPE.mapIndex(secondAttribute.getValueType()) + // ") in two input sets."); // } } } } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = super.getParameterTypes(); DataManagementParameterHelper.addParameterTypes(types, this); // deprecated parameter ParameterType type = new ParameterTypeCategory("merge_type", "Indicates if all input example sets or only the first two example sets should be merged.", new String[] { "all", "first_two" }, 0); type.setDeprecated(); types.add(type); return types; } @Override public ResourceConsumptionEstimator getResourceConsumptionEstimator() { return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPorts().getPortByIndex(0), ExampleSetMerge.class, null); } }