/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 *
 * Complete list of developers available at our web site:
 *
 * http://rapidminer.com
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.preprocessing.join;

import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeRole;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.utils.ExampleSetBuilder;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.OperatorVersion;
import com.rapidminer.operator.ProcessSetupError.Severity;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.OutputPort;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetUnionRule;
import com.rapidminer.operator.ports.metadata.SimpleMetaDataError;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.container.Pair;


/**
 * <p>
 * Builds the join of two example sets.
 * </p>
 * <p>
 * Please note that the check for duplicate attributes (see
 * {@link #PARAMETER_REMOVE_DOUBLE_ATTRIBUTES}) is only applied to regular attributes. Special
 * attributes of the second input example set whose role does not exist in the first example set
 * are simply added. If the role already exists, they are skipped.
 * </p>
 *
 * @author Ingo Mierswa
 */
public abstract class AbstractExampleSetJoin extends Operator {

    /** Up to and including this version the two input ports were read in swapped order. */
    public static final OperatorVersion VERSION_SWAPPED_INPUT_PORTS = new OperatorVersion(5, 1, 8);

    protected static final String LEFT_EXAMPLE_SET_INPUT = "left";
    protected static final String RIGHT_EXAMPLE_SET_INPUT = "right";

    /**
     * Suffix appended to the name of an attribute of the second example set whose name is already
     * taken in the union.
     */
    private static final String DOUBLE_ATTRIBUTE_SUFFIX = "_from_ES2";

    private final InputPort leftInput = getInputPorts().createPort(LEFT_EXAMPLE_SET_INPUT);
    private final InputPort rightInput = getInputPorts().createPort(RIGHT_EXAMPLE_SET_INPUT);
    private final OutputPort joinOutput = getOutputPorts().createPort("join");

    /**
     * The parameter name for &quot;Indicates if double attributes should be removed or
     * renamed&quot;
     */
    public static final String PARAMETER_REMOVE_DOUBLE_ATTRIBUTES = "remove_double_attributes";

    /** Helper class to find the correct data for all union attributes. */
    protected static class AttributeSource {

        protected static final int FIRST_SOURCE = 1;
        protected static final int SECOND_SOURCE = 2;

        protected int source;
        protected Attribute attribute;

        /**
         * @param source
         *            either {@link #FIRST_SOURCE} or {@link #SECOND_SOURCE}
         * @param attribute
         *            the original (not cloned) attribute of that source
         */
        public AttributeSource(int source, Attribute attribute) {
            this.source = source;
            this.attribute = attribute;
        }

        protected int getSource() {
            return source;
        }

        protected Attribute getAttribute() {
            return attribute;
        }
    }

    /**
     * Creates the operator and registers the meta data rule which computes the joined meta data
     * via {@link #getUnionAttributesMetaData(ExampleSetMetaData, ExampleSetMetaData)}.
     */
    public AbstractExampleSetJoin(OperatorDescription description) {
        super(description);
        getTransformer().addRule(new ExampleSetUnionRule(leftInput, rightInput, joinOutput, DOUBLE_ATTRIBUTE_SUFFIX) {

            @Override
            protected String getPrefix() {
                return getParameterAsBoolean(PARAMETER_REMOVE_DOUBLE_ATTRIBUTES) ? null : DOUBLE_ATTRIBUTE_SUFFIX;
            }

            @Override
            protected ExampleSetMetaData modifyMetaData(ExampleSetMetaData leftEMD, ExampleSetMetaData rightEMD) {
                List<AttributeMetaData> joinedAttributesMetaData = getUnionAttributesMetaData(leftEMD, rightEMD);
                ExampleSetMetaData joinedEMD = new ExampleSetMetaData();
                joinedEMD.addAllAttributes(joinedAttributesMetaData);
                return joinedEMD;
            }
        });
    }

    public InputPort getLeftInput() {
        return leftInput;
    }

    public InputPort getRightInput() {
        return rightInput;
    }

    public OutputPort getJoinOutput() {
        return joinOutput;
    }

    /**
     * Performs the actual join of the data.
     *
     * @param es1
     *            the first example set
     * @param es2
     *            the second example set
     * @param originalAttributeSources
     *            for each union attribute the original attribute and the example set it stems
     *            from; same order as {@code unionAttributeList}
     * @param unionAttributeList
     *            the (cloned and possibly renamed) attributes of the result
     * @return a builder for the joined example set
     * @throws OperatorException
     *             if the join fails
     */
    protected abstract ExampleSetBuilder joinData(ExampleSet es1, ExampleSet es2,
            List<AttributeSource> originalAttributeSources, List<Attribute> unionAttributeList) throws OperatorException;

    /** Returns {@code true} if both inputs must provide an ID attribute of compatible value type. */
    protected abstract boolean isIdNeeded();

    /**
     * Reads both example sets, computes the list of union attributes (regular attributes first,
     * then special attributes), delegates the data join to
     * {@link #joinData(ExampleSet, ExampleSet, List, List)} and delivers the result together with
     * the annotations of the first example set.
     *
     * @throws UserError
     *             (129) if an ID is needed but missing in one of the inputs, (120) if the value
     *             types of the two IDs are not compatible
     */
    @Override
    public void doWork() throws OperatorException {
        ExampleSet es1;
        ExampleSet es2;
        if (getCompatibilityLevel().isAtMost(VERSION_SWAPPED_INPUT_PORTS)) {
            /*
             * Please note the order of calls: the swapped order is an error which was introduced
             * by the transformation from process tree to process flow. It is only kept for
             * processes with an old compatibility level; an incompatible version change was
             * introduced to overcome it.
             */
            es2 = leftInput.getData(ExampleSet.class);
            es1 = rightInput.getData(ExampleSet.class);
        } else {
            /*
             * This is the correct order, used by all operators with a more current version than
             * VERSION_SWAPPED_INPUT_PORTS.
             */
            es1 = leftInput.getData(ExampleSet.class);
            es2 = rightInput.getData(ExampleSet.class);
        }

        if (this.isIdNeeded()) {
            Attribute id1 = es1.getAttributes().getId();
            Attribute id2 = es2.getAttributes().getId();

            // sanity checks
            if (id1 == null || id2 == null) {
                throw new UserError(this, 129);
            }
            if (!areValueTypesCompatible(id1.getValueType(), id2.getValueType())) {
                throw new UserError(this, 120, new Object[] { id2.getName(),
                        Ontology.VALUE_TYPE_NAMES[id2.getValueType()], Ontology.VALUE_TYPE_NAMES[id1.getValueType()] });
            }
        }

        Set<Pair<Integer, Attribute>> excludedAttributes = getExcludedAttributes(es1, es2);

        // regular attributes
        List<AttributeSource> originalAttributeSources = new LinkedList<>();
        List<Attribute> unionAttributeList = new LinkedList<>();
        for (Attribute attribute : es1.getAttributes()) {
            if (!excludedAttributes.contains(new Pair<>(AttributeSource.FIRST_SOURCE, attribute))) {
                originalAttributeSources.add(new AttributeSource(AttributeSource.FIRST_SOURCE, attribute));
                unionAttributeList.add((Attribute) attribute.clone());
            }
        }

        boolean removeDoubleAttributes = getParameterAsBoolean(PARAMETER_REMOVE_DOUBLE_ATTRIBUTES);
        for (Attribute attribute : es2.getAttributes()) {
            if (excludedAttributes.contains(new Pair<>(AttributeSource.SECOND_SOURCE, attribute))) {
                continue;
            }
            boolean nameAlreadyUsed = containsAttribute(unionAttributeList, attribute);
            if (nameAlreadyUsed && removeDoubleAttributes) {
                // in list and should be removed --> do nothing
                continue;
            }
            Attribute cloneAttribute = (Attribute) attribute.clone();
            if (nameAlreadyUsed) {
                // in list but should not be removed --> rename. The suffix is appended until the
                // name is unique so that the union list never holds two attributes of equal name.
                do {
                    cloneAttribute.setName(cloneAttribute.getName() + DOUBLE_ATTRIBUTE_SUFFIX);
                } while (containsAttribute(unionAttributeList, cloneAttribute));
            }
            originalAttributeSources.add(new AttributeSource(AttributeSource.SECOND_SOURCE, attribute));
            unionAttributeList.add(cloneAttribute);
        }

        // special attributes
        Map<Attribute, String> unionSpecialAttributes = new LinkedHashMap<>();
        Set<String> usedSpecialAttributes = new HashSet<>();

        // first example set's special attributes: they replace every attribute of the same name
        // which has been collected so far
        Iterator<AttributeRole> s = es1.getAttributes().specialAttributes();
        while (s.hasNext()) {
            AttributeRole role = s.next();
            Attribute specialAttribute = role.getAttribute();
            Attribute specialAttributeClone = (Attribute) specialAttribute.clone();

            removeAttributesNamed(unionAttributeList, specialAttribute.getName());
            removeSourcesNamed(originalAttributeSources, specialAttribute.getName());

            unionAttributeList.add(specialAttributeClone);
            originalAttributeSources.add(new AttributeSource(AttributeSource.FIRST_SOURCE, specialAttribute));
            unionSpecialAttributes.put(specialAttributeClone, role.getSpecialName());
            usedSpecialAttributes.add(role.getSpecialName());
        }

        // second example set's special attributes: only added if neither the role nor the name
        // is taken already
        s = es2.getAttributes().specialAttributes();
        while (s.hasNext()) {
            AttributeRole role = s.next();
            String specialName = role.getSpecialName();
            Attribute specialAttribute = role.getAttribute();
            boolean roleIsFree = !usedSpecialAttributes.contains(specialName);
            if (roleIsFree
                    && !excludedAttributes.contains(new Pair<>(AttributeSource.SECOND_SOURCE, specialAttribute))) {
                if (!hasAttributeNamed(unionAttributeList, specialAttribute.getName())) {
                    Attribute specialAttributeClone = (Attribute) specialAttribute.clone();
                    originalAttributeSources.add(new AttributeSource(AttributeSource.SECOND_SOURCE, specialAttribute));
                    unionAttributeList.add(specialAttributeClone);
                    unionSpecialAttributes.put(specialAttributeClone, specialName);
                    usedSpecialAttributes.add(specialName);
                }
            } else if (!isKeyAttribute(role)) {
                logWarning("Special attribute '" + specialName + "' already exist, skipping!");
            }
        }

        // join data
        ExampleSetBuilder unionBuilder = joinData(es1, es2, originalAttributeSources, unionAttributeList);

        // create new example set
        ExampleSet result = unionBuilder.withRoles(unionSpecialAttributes).build();
        result.getAnnotations().addAll(es1.getAnnotations());
        joinOutput.deliver(result);
    }

    /**
     * The method isKeyAttribute can be overwritten by subclasses which are using key attributes, in
     * order to determine if a specific attribute is used as a join key. By default, there is no use
     * of key attributes, so the method returns false.
     */
    protected boolean isKeyAttribute(AttributeRole attributeRole) throws OperatorException {
        return false;
    }

    /**
     * Returns a list of AttributeMetaData which contains the correctly joined MetaData arising from
     * both input ports.
     * <p>
     * The list is empty if one of the input ports is not connected or, in case an ID is needed
     * (see {@link #isIdNeeded()}), if an ID is missing or the value types of both IDs are not
     * compatible. No meta data error is registered in these cases.
     * </p>
     *
     * @param emd1
     *            meta data of the first example set
     * @param emd2
     *            meta data of the second example set
     * @return a new list holding clones of the joined attribute meta data
     */
    protected List<AttributeMetaData> getUnionAttributesMetaData(ExampleSetMetaData emd1, ExampleSetMetaData emd2) {
        if (!leftInput.isConnected() || !rightInput.isConnected()) {
            return new LinkedList<>();
        }
        if (this.isIdNeeded()) {
            AttributeMetaData id1 = emd1.getSpecial(Attributes.ID_NAME);
            AttributeMetaData id2 = emd2.getSpecial(Attributes.ID_NAME);

            // sanity checks
            if (id1 == null || id2 == null) {
                return new LinkedList<>();
            }
            if (!areValueTypesCompatible(id1.getValueType(), id2.getValueType())) {
                return new LinkedList<>();
            }
        }

        Set<Pair<Integer, AttributeMetaData>> excludedAttributes;
        try {
            excludedAttributes = getExcludedAttributesMD(emd1, emd2);
        } catch (OperatorException e) {
            // the meta data transformation is best effort: fall back to excluding nothing
            excludedAttributes = Collections.emptySet();
        }

        // adding attributes
        List<AttributeMetaData> unionAttributeList = new LinkedList<>();
        List<String> unionSpecialRoleList = new LinkedList<>();
        for (AttributeMetaData attributeMD : emd1.getAllAttributes()) {
            if (!excludedAttributes.contains(new Pair<>(AttributeSource.FIRST_SOURCE, attributeMD))) {
                unionAttributeList.add(attributeMD.clone());
                if (attributeMD.isSpecial()) {
                    unionSpecialRoleList.add(attributeMD.getRole());
                }
            }
        }

        boolean removeDoubleAttributes = getParameterAsBoolean(PARAMETER_REMOVE_DOUBLE_ATTRIBUTES);
        for (AttributeMetaData attributeMD : emd2.getAllAttributes()) {
            if (excludedAttributes.contains(new Pair<>(AttributeSource.SECOND_SOURCE, attributeMD))) {
                continue;
            }
            boolean nameAlreadyUsed = containsAttributeMD(unionAttributeList, attributeMD);
            if (nameAlreadyUsed && removeDoubleAttributes) {
                // in list and should be removed --> do nothing
                continue;
            }
            if (warnIfRoleAlreadyUsed(attributeMD, unionSpecialRoleList)) {
                // this special attribute's role already exists
                continue;
            }
            AttributeMetaData cloneAttribute = attributeMD.clone();
            if (nameAlreadyUsed) {
                // in list but should not be removed --> rename; same scheme as in doWork()
                do {
                    cloneAttribute.setName(cloneAttribute.getName() + DOUBLE_ATTRIBUTE_SUFFIX);
                } while (containsAttributeMD(unionAttributeList, cloneAttribute));
            }
            unionAttributeList.add(cloneAttribute);
        }

        return unionAttributeList;
    }

    /**
     * Returns a set of original attributes which will not be copied to the output example set. The
     * default implementation returns an empty set.
     */
    protected Set<Pair<Integer, Attribute>> getExcludedAttributes(ExampleSet es1, ExampleSet es2)
            throws OperatorException {
        return new HashSet<>();
    }

    /**
     * Returns a set of original attributes which will not be copied to the output example set. The
     * default implementation returns an empty set.
     */
    protected Set<Pair<Integer, AttributeMetaData>> getExcludedAttributesMD(ExampleSetMetaData esm1,
            ExampleSetMetaData esm2) throws OperatorException {
        return new HashSet<>();
    }

    /**
     * Returns true if the list already contains an attribute with the given name. The method
     * contains from List cannot be used since the equals method of Attribute also checks for the
     * same table index which is not applicable here.
     */
    public boolean containsAttribute(List<Attribute> attributeList, Attribute attribute) {
        return hasAttributeNamed(attributeList, attribute.getName());
    }

    /**
     * Returns true if the list already contains attribute meta data with the given name. Only
     * the names are compared.
     */
    public boolean containsAttributeMD(List<AttributeMetaData> attributeMDList, AttributeMetaData attributeMD) {
        for (AttributeMetaData candidate : attributeMDList) {
            if (candidate.getName().equals(attributeMD.getName())) {
                return true;
            }
        }
        return false;
    }

    @Override
    public OperatorVersion[] getIncompatibleVersionChanges() {
        return new OperatorVersion[] { VERSION_SWAPPED_INPUT_PORTS };
    }

    @Override
    public List<ParameterType> getParameterTypes() {
        List<ParameterType> types = super.getParameterTypes();
        types.add(new ParameterTypeBoolean(PARAMETER_REMOVE_DOUBLE_ATTRIBUTES,
                "Indicates if double attributes should be removed or renamed", true));
        return types;
    }

    /** Returns true if one of the two value types is a sub type of (or equal to) the other. */
    private static boolean areValueTypesCompatible(int valueType1, int valueType2) {
        return Ontology.ATTRIBUTE_VALUE_TYPE.isA(valueType1, valueType2)
                || Ontology.ATTRIBUTE_VALUE_TYPE.isA(valueType2, valueType1);
    }

    /** Returns true if the list holds an attribute with exactly the given name. */
    private static boolean hasAttributeNamed(List<Attribute> attributes, String name) {
        for (Attribute candidate : attributes) {
            if (candidate.getName().equals(name)) {
                return true;
            }
        }
        return false;
    }

    /** Removes all attributes with the given name from the list. */
    private static void removeAttributesNamed(List<Attribute> attributes, String name) {
        Iterator<Attribute> i = attributes.iterator();
        while (i.hasNext()) {
            if (i.next().getName().equals(name)) {
                i.remove();
            }
        }
    }

    /** Removes all attribute sources whose attribute has the given name from the list. */
    private static void removeSourcesNamed(List<AttributeSource> sources, String name) {
        Iterator<AttributeSource> i = sources.iterator();
        while (i.hasNext()) {
            if (i.next().getAttribute().getName().equals(name)) {
                i.remove();
            }
        }
    }

    /**
     * Registers a warning at the right input port and returns true if the given meta data is
     * special and its role is contained in the list of used roles. Returns false otherwise.
     */
    private boolean warnIfRoleAlreadyUsed(AttributeMetaData attributeMD, List<String> usedRoles) {
        if (attributeMD.isSpecial() && usedRoles.contains(attributeMD.getRole())) {
            rightInput.addError(new SimpleMetaDataError(Severity.WARNING, rightInput, "already_contains_role",
                    attributeMD.getRole()));
            return true;
        }
        return false;
    }
}