/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.preprocessing.join; import java.util.Iterator; import java.util.List; import com.rapidminer.example.Attribute; import com.rapidminer.example.ExampleSet; import com.rapidminer.example.utils.ExampleSetBuilder; import com.rapidminer.example.utils.ExampleSets; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.OperatorProgress; import com.rapidminer.operator.ProcessStoppedException; import com.rapidminer.operator.UserError; import com.rapidminer.operator.annotation.ResourceConsumptionEstimator; import com.rapidminer.operator.ports.metadata.ExampleSetPrecondition; import com.rapidminer.studio.internal.ProcessStoppedRuntimeException; import com.rapidminer.tools.OperatorResourceConsumptionHandler; /** * <p> * Build the cartesian product of two example sets. In contrast to the {@link ExampleSetJoin} * operator, this operator does not depend on Id attributes. The result example set will consist of * the union set or the union list (depending on parameter setting double attributes will be removed * or renamed) of both feature sets. In case of removing double attribute the attribute values must * be the same for the examples of both example set, otherwise an exception will be thrown. * </p> * * <p> * Please note that this check for double attributes will only be applied for regular attributes. * Special attributes of the second input example set which do not exist in the first example set * will simply be added. If they already exist they are simply skipped. * </p> * * @author Peter B. Volk */ public class ExampleSetCartesian extends AbstractExampleSetJoin { public ExampleSetCartesian(OperatorDescription description) { super(description); getLeftInput().addPrecondition(new ExampleSetPrecondition(getLeftInput())); getRightInput().addPrecondition(new ExampleSetPrecondition(getRightInput())); } /** * Joins the data WITHOUT a WHERE criteria. * * @param es1 * @param es2 * @param originalAttributeSources * @param unionAttributeList * @return the table with the joined data * @throws OperatorException */ @Override protected ExampleSetBuilder joinData(ExampleSet es1, ExampleSet es2, List<AttributeSource> originalAttributeSources, List<Attribute> unionAttributeList) throws OperatorException { ExampleSetBuilder builder = ExampleSets.from(unionAttributeList); long total = (long) es1.size() * es2.size(); if (total > Integer.MAX_VALUE) { throw new UserError(this, "cartesian_product_too_big"); } int intTotal = (int) total; builder.withExpectedSize(intTotal); builder.withBlankSize(intTotal); Iterator<Attribute> unionIterator = unionAttributeList.iterator(); final int unionSize = unionAttributeList.size(); OperatorProgress progress = getProgress(); // report progress 10 times per attribute or if that is too much once per attribute long steps = 10L * unionSize; int batchSize; if (steps > Integer.MAX_VALUE) { progress.setTotal(unionSize); batchSize = intTotal; } else { progress.setTotal((int) steps); batchSize = Math.max(1, intTotal / 10); } for (AttributeSource source : originalAttributeSources) { if (source.getSource() == AttributeSource.FIRST_SOURCE) { builder.withColumnFiller(unionIterator.next(), i -> { if ((i + 1) % batchSize == 0) { try { progress.step(); } catch (ProcessStoppedException e) { throw new ProcessStoppedRuntimeException(); } } // every value of the first source attribute is repeated es2.size() times return es1.getExample(i / es2.size()).getValue(source.getAttribute()); }); } else if (source.getSource() == AttributeSource.SECOND_SOURCE) { builder.withColumnFiller(unionIterator.next(), i -> { if ((i + 1) % batchSize == 0) { try { progress.step(); } catch (ProcessStoppedException e) { throw new ProcessStoppedRuntimeException(); } } // the values of the second source attributes are repeated in es2.size() large // blocks return es2.getExample(i % es2.size()).getValue(source.getAttribute()); }); } } return builder; } @Override protected boolean isIdNeeded() { return false; } @Override public ResourceConsumptionEstimator getResourceConsumptionEstimator() { return OperatorResourceConsumptionHandler.getResourceConsumptionEstimator(getInputPorts().getPortByIndex(0), ExampleSetCartesian.class, null); } }