/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.preprocessing.join;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.AttributeRole;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.utils.ExampleSetBuilder;
import com.rapidminer.example.utils.ExampleSets;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ProcessSetupError.Severity;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.operator.SimpleProcessSetupError;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.annotation.ResourceConsumptionEstimator;
import com.rapidminer.operator.ports.metadata.AttributeMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.operator.ports.metadata.ExampleSetPrecondition;
import com.rapidminer.operator.ports.metadata.ParameterConditionedPrecondition;
import com.rapidminer.operator.ports.metadata.SimpleMetaDataError;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeAttribute;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeList;
import com.rapidminer.parameter.conditions.BooleanParameterCondition;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.OperatorResourceConsumptionHandler;
import com.rapidminer.tools.container.Pair;
/**
* <p>
* Build the join of two example sets using the id attributes of the sets, i.e. both example sets
* must have an id attribute where the same id indicates the same example. If examples are missing
* an exception will be thrown. The result example set will consist of the same number of examples
* but the union set or the union list (depending on parameter setting, double attributes will be
* removed or renamed) of both feature sets. In case of removing double attributes the attribute
* values must be the same for the examples of both example sets, otherwise an exception will be
* thrown.
* </p>
* <p>
* Please note that this check for double attributes will only be applied for regular attributes.
* Special attributes of the second input example set which do not exist in the first example set
* will simply be added. If they already exist they are simply skipped.
* </p>
*
* @author Ingo Mierswa, Tobias Malbrecht, Marius Helf
*/
public class ExampleSetJoin extends AbstractExampleSetJoin {
/**
 * Wraps a {@code double[]} so that it can be used as a key in hash based collections: equality
 * and hash code are derived from the array contents (via {@link Arrays#equals(double[], double[])}
 * and {@link Arrays#hashCode(double[])}) instead of the array identity.
 */
public static class DoubleArrayWrapper {

    /** The wrapped values; the array is stored as passed in, no copy is made. */
    private final double[] data;

    public DoubleArrayWrapper(double[] data) {
        this.data = data;
    }

    /** Returns the wrapped array itself, not a copy. */
    public double[] getData() {
        return data;
    }

    @Override
    public boolean equals(Object other) {
        // instanceof is false for null, so no separate null check is needed
        return other instanceof DoubleArrayWrapper && Arrays.equals(data, ((DoubleArrayWrapper) other).data);
    }

    @Override
    public int hashCode() {
        return Arrays.hashCode(data);
    }
}
/** Key of the category parameter that selects the join type (choices: {@link #JOIN_TYPES}). */
public static final String PARAMETER_JOIN_TYPE = "join_type";
/** Key of the left-hand column of the {@link #PARAMETER_JOIN_ATTRIBUTES} list parameter. */
public static final String PARAMETER_LEFT_ATTRIBUTE_FOR_JOIN = "left_key_attributes";
/** Key of the right-hand column of the {@link #PARAMETER_JOIN_ATTRIBUTES} list parameter. */
public static final String PARAMETER_RIGHT_ATTRIBUTE_FOR_JOIN = "right_key_attributes";
/** Key of the list parameter holding the (left name, right name) pairs of key attributes. */
public static final String PARAMETER_JOIN_ATTRIBUTES = "key_attributes";
/** Key of the boolean parameter that switches between joining on the id attribute and on key pairs. */
public static final String PARAMETER_USE_ID = "use_id_attribute_as_key";
/** Key of the boolean parameter that keeps the right key attributes instead of excluding them. */
public static final String PARAMETER_KEEP_BOTH_JOIN_ATTRIBUTES = "keep_both_join_attributes";
// NOTE(review): empty key that is not referenced anywhere in this class - confirm whether it is
// still needed by other code before relying on it.
public static final String PARAMETER_FILL_LEFT_ID = "";
/** Display names of the join types; the array index equals the matching JOIN_TYPE_* constant. */
public static final String[] JOIN_TYPES = { "inner", "left", "right", "outer" };
/** Index of "inner" in {@link #JOIN_TYPES}. */
public static final int JOIN_TYPE_INNER = 0;
/** Index of "left" in {@link #JOIN_TYPES}. */
public static final int JOIN_TYPE_LEFT = 1;
/** Index of "right" in {@link #JOIN_TYPES}. */
public static final int JOIN_TYPE_RIGHT = 2;
/** Index of "outer" in {@link #JOIN_TYPES}. */
public static final int JOIN_TYPE_OUTER = 3;
public ExampleSetJoin(OperatorDescription description) {
super(description);
getLeftInput().addPrecondition(new ParameterConditionedPrecondition(getLeftInput(),
new ExampleSetPrecondition(getLeftInput(), Ontology.ATTRIBUTE_VALUE, Attributes.ID_NAME), this,
PARAMETER_USE_ID, "true"));
getLeftInput().addPrecondition(new ParameterConditionedPrecondition(getLeftInput(),
new ExampleSetPrecondition(getLeftInput()), this, PARAMETER_USE_ID, "false"));
getRightInput().addPrecondition(new ParameterConditionedPrecondition(getRightInput(),
new ExampleSetPrecondition(getRightInput(), Ontology.ATTRIBUTE_VALUE, Attributes.ID_NAME), this,
PARAMETER_USE_ID, "true"));
getRightInput().addPrecondition(new ParameterConditionedPrecondition(getRightInput(),
new ExampleSetPrecondition(getRightInput()), this, PARAMETER_USE_ID, "false"));
}
/**
 * Meta data counterpart of {@link #getKeyAttributes(ExampleSet, ExampleSet)}: resolves the key
 * attributes against the meta data of both inputs.
 * <p>
 * If the id attribute is used for the join, the id meta data of both sides is taken. Otherwise
 * every configured name pair is looked up; a missing attribute or an incompatible value type is
 * reported as an error and additionally raised as a {@link UserError}. This validation runs even
 * if both join attributes are kept.
 * </p>
 *
 * @param leftEMD
 *            meta data of the left example set
 * @param rightEMD
 *            meta data of the right example set
 * @return the key meta data of the left and the right side as parallel arrays, or {@code null}
 *         if both join attributes are kept
 * @throws OperatorException
 *             if a configured key attribute is missing or the types of a pair do not match
 */
private Pair<AttributeMetaData[], AttributeMetaData[]> getKeyAttributesMD(ExampleSetMetaData leftEMD,
        ExampleSetMetaData rightEMD) throws OperatorException {
    final boolean joinOnId = getParameterAsBoolean(PARAMETER_USE_ID);
    final boolean keepBoth = getParameterAsBoolean(PARAMETER_KEEP_BOTH_JOIN_ATTRIBUTES);

    final Pair<AttributeMetaData[], AttributeMetaData[]> keyMetaData;
    if (joinOnId) {
        keyMetaData = new Pair<>(new AttributeMetaData[] { leftEMD.getSpecial(Attributes.ID_NAME) },
                new AttributeMetaData[] { rightEMD.getSpecial(Attributes.ID_NAME) });
    } else {
        final List<String[]> configuredPairs = getParameterList(PARAMETER_JOIN_ATTRIBUTES);
        final int pairCount = configuredPairs.size();
        keyMetaData = new Pair<>(new AttributeMetaData[pairCount], new AttributeMetaData[pairCount]);

        int slot = 0;
        for (String[] namePair : configuredPairs) {
            // resolve the configured names against the meta data
            final AttributeMetaData leftMD = leftEMD.getAttributeByName(namePair[0]);
            final AttributeMetaData rightMD = rightEMD.getAttributeByName(namePair[1]);

            // the left side is reported first if both are missing
            if (leftMD == null) {
                getLeftInput().addError(
                        new SimpleMetaDataError(Severity.ERROR, getLeftInput(), "missing_attribute", namePair[0]));
                throw new UserError(this, "join.illegal_key_attribute", namePair[0], "left", namePair[1], "right");
            }
            if (rightMD == null) {
                getRightInput().addError(
                        new SimpleMetaDataError(Severity.ERROR, getRightInput(), "missing_attribute", namePair[1]));
                throw new UserError(this, "join.illegal_key_attribute", namePair[1], "right", namePair[0], "left");
            }

            // the types are compatible if one of them is a sub type of the other
            final boolean compatibleTypes = Ontology.ATTRIBUTE_VALUE_TYPE.isA(leftMD.getValueType(),
                    rightMD.getValueType())
                    || Ontology.ATTRIBUTE_VALUE_TYPE.isA(rightMD.getValueType(), leftMD.getValueType());
            if (!compatibleTypes) {
                this.addError(new SimpleProcessSetupError(Severity.ERROR, getPortOwner(), "attributes_type_mismatch",
                        namePair[0], "left", namePair[1], "right"));
                throw new UserError(this, "join.illegal_key_attribute", namePair[1], "right", namePair[0], "left");
            }

            // the pair is only recorded if the result is actually returned
            if (keepBoth) {
                continue;
            }
            keyMetaData.getFirst()[slot] = leftMD;
            keyMetaData.getSecond()[slot] = rightMD;
            slot++;
        }
    }
    return keepBoth ? null : keyMetaData;
}
/**
 * Remaps the ids of both example sets, resolves the key attributes and delegates to the join
 * implementation selected by {@link #PARAMETER_JOIN_TYPE}. The progress total is set to the
 * number of examples the selected join iterates over.
 */
@Override
protected ExampleSetBuilder joinData(ExampleSet leftExampleSet, ExampleSet rightExampleSet,
        List<AttributeSource> originalAttributeSources, List<Attribute> unionAttributeList) throws OperatorException {
    final int selectedJoin = getParameterAsInt(PARAMETER_JOIN_TYPE);

    leftExampleSet.remapIds();
    rightExampleSet.remapIds();

    // key attributes of the left (first) and the right (second) table
    final Pair<Attribute[], Attribute[]> keys = getKeyAttributes(leftExampleSet, rightExampleSet);

    if (selectedJoin == JOIN_TYPE_INNER) {
        getProgress().setTotal(leftExampleSet.size());
        return performInnerJoin(leftExampleSet, rightExampleSet, originalAttributeSources, unionAttributeList, keys);
    }
    if (selectedJoin == JOIN_TYPE_LEFT) {
        getProgress().setTotal(leftExampleSet.size());
        return performLeftJoin(leftExampleSet, rightExampleSet, originalAttributeSources, unionAttributeList, keys,
                null);
    }
    if (selectedJoin == JOIN_TYPE_RIGHT) {
        getProgress().setTotal(rightExampleSet.size());
        return performRightJoin(leftExampleSet, rightExampleSet, originalAttributeSources, unionAttributeList, keys);
    }
    if (selectedJoin == JOIN_TYPE_OUTER) {
        getProgress().setTotal(leftExampleSet.size() + rightExampleSet.size());
        return performOuterJoin(leftExampleSet, rightExampleSet, originalAttributeSources, unionAttributeList, keys);
    }

    assert false; // illegal join type
    return null;
}
/**
 * Returns a pair of two attribute arrays of equal length. Attributes at the same index form a
 * key pair: their values must be equal for two examples to match, and two examples only match
 * if all key pairs match. The first array belongs to the left example set, the second one to
 * the right example set.
 * <p>
 * If the id attribute is used for the join, both arrays contain just the id attribute of the
 * respective example set. Otherwise the configured name pairs are resolved.
 * </p>
 *
 * @throws OperatorException
 *             if a configured attribute does not exist or the value types of a pair are not
 *             compatible
 */
private Pair<Attribute[], Attribute[]> getKeyAttributes(ExampleSet leftExampleSet, ExampleSet rightExampleSet)
        throws OperatorException {
    if (getParameterAsBoolean(PARAMETER_USE_ID)) {
        return new Pair<>(new Attribute[] { leftExampleSet.getAttributes().getId() },
                new Attribute[] { rightExampleSet.getAttributes().getId() });
    }

    final List<String[]> configuredPairs = getParameterList(PARAMETER_JOIN_ATTRIBUTES);
    final int pairCount = configuredPairs.size();
    final Pair<Attribute[], Attribute[]> keys = new Pair<>(new Attribute[pairCount], new Attribute[pairCount]);

    int slot = 0;
    for (String[] namePair : configuredPairs) {
        // resolve the configured names to attribute objects
        final Attribute leftKey = leftExampleSet.getAttributes().get(namePair[0]);
        final Attribute rightKey = rightExampleSet.getAttributes().get(namePair[1]);

        // the left side is reported first if both are missing
        if (leftKey == null) {
            throw new UserError(this, "join.illegal_key_attribute", namePair[0], "left", namePair[1], "right");
        }
        if (rightKey == null) {
            throw new UserError(this, "join.illegal_key_attribute", namePair[1], "right", namePair[0], "left");
        }

        // the types are compatible if one of them is a sub type of the other
        final boolean compatibleTypes = Ontology.ATTRIBUTE_VALUE_TYPE.isA(leftKey.getValueType(),
                rightKey.getValueType())
                || Ontology.ATTRIBUTE_VALUE_TYPE.isA(rightKey.getValueType(), leftKey.getValueType());
        if (!compatibleTypes) {
            throw new UserError(this, "join.illegal_key_attribute", namePair[1], "right", namePair[0], "left");
        }

        keys.getFirst()[slot] = leftKey;
        keys.getSecond()[slot] = rightKey;
        slot++;
    }
    return keys;
}
/**
 * Checks whether the given attribute role belongs to a key attribute.
 * <p>
 * If the id attribute is used for the join, the role is a key if its special name equals
 * {@link Attributes#ID_NAME}. Otherwise the attribute name is compared with the second (right)
 * entry of every configured key attribute pair.
 * </p>
 *
 * @param attributeRole
 *            the role to check
 * @return {@code true} if the role belongs to a key attribute
 * @throws OperatorException
 *             if the key attribute list cannot be read
 */
@Override
protected boolean isKeyAttribute(AttributeRole attributeRole) throws OperatorException {
    if (getParameterAsBoolean(PARAMETER_USE_ID)) {
        // Constant-first comparison: the special name may be null (presumably for regular
        // attributes - confirm against AttributeRole), which must yield false, not an NPE.
        return Attributes.ID_NAME.equals(attributeRole.getSpecialName());
    }

    String attributeName = attributeRole.getAttribute().getName();
    for (String[] keyAttributePair : getParameterList(PARAMETER_JOIN_ATTRIBUTES)) {
        if (attributeName.equals(keyAttributePair[1])) {
            return true;
        }
    }
    return false;
}
/**
 * Performs an inner join: the result contains one row for every combination of a left and a
 * right example whose key attributes match. Left examples without a match are dropped.
 */
private ExampleSetBuilder performInnerJoin(ExampleSet leftExampleSet, ExampleSet rightExampleSet,
        List<AttributeSource> originalAttributeSources, List<Attribute> unionAttributeList,
        Pair<Attribute[], Attribute[]> keyAttributes) throws ProcessStoppedException {
    final ExampleSetBuilder resultBuilder = ExampleSets.from(unionAttributeList);
    final boolean joinOnId = getParameterAsBoolean(PARAMETER_USE_ID);

    // the key lookup table is only needed if the join does not use the id attribute
    Attribute[] leftKeys = null;
    Map<DoubleArrayWrapper, List<Example>> rightRowsByKey = null;
    if (!joinOnId) {
        leftKeys = keyAttributes.getFirst();
        rightRowsByKey = createKeyMapping(rightExampleSet, keyAttributes.getSecond(), leftKeys);
    }

    int sinceLastStep = 0;
    for (Example leftExample : leftExampleSet) {
        final List<Example> partners = getMatchingExamples(leftExampleSet, rightExampleSet, leftKeys,
                rightRowsByKey, joinOnId, leftExample);
        if (partners != null) {
            for (Example rightExample : partners) {
                addCombinedOccurence(originalAttributeSources, unionAttributeList, resultBuilder, leftExample,
                        rightExample);
            }
        }

        // report progress in chunks of 100 examples
        if (++sinceLastStep == 100) {
            getProgress().step(100);
            sinceLastStep = 0;
        }
    }
    return resultBuilder;
}
/**
 * Performs a left join: every left example is combined with all matching right examples. A
 * left example for which no match list is found is added once, with missing values for the
 * attributes taken from the right table.
 *
 * @param matchedExamplesInRightTable
 *            if not {@code null}, the key values of every matched right example are added to
 *            this set
 */
private ExampleSetBuilder performLeftJoin(ExampleSet leftExampleSet, ExampleSet rightExampleSet,
        List<AttributeSource> originalAttributeSources, List<Attribute> unionAttributeList,
        Pair<Attribute[], Attribute[]> keyAttributes, Set<DoubleArrayWrapper> matchedExamplesInRightTable)
        throws ProcessStoppedException {
    final ExampleSetBuilder resultBuilder = ExampleSets.from(unionAttributeList);
    final boolean joinOnId = getParameterAsBoolean(PARAMETER_USE_ID);
    final Attribute[] leftKeys = keyAttributes.getFirst();
    final Attribute[] rightKeys = keyAttributes.getSecond();

    // the key lookup table is only needed if the join does not use the id attribute
    Map<DoubleArrayWrapper, List<Example>> rightRowsByKey = null;
    if (!joinOnId) {
        rightRowsByKey = createKeyMapping(rightExampleSet, rightKeys, leftKeys);
    }

    int sinceLastStep = 0;
    for (Example leftExample : leftExampleSet) {
        final List<Example> partners = getMatchingExamples(leftExampleSet, rightExampleSet, leftKeys,
                rightRowsByKey, joinOnId, leftExample);
        if (partners == null) {
            // no rows with this key in the right table: keep the left values only
            addLeftOnlyOccurence(originalAttributeSources, unionAttributeList, resultBuilder, leftExample);
        } else {
            for (Example rightExample : partners) {
                addCombinedOccurence(originalAttributeSources, unionAttributeList, resultBuilder, leftExample,
                        rightExample);
                if (matchedExamplesInRightTable != null) {
                    // remember the key so that the caller can tell matched right examples apart
                    matchedExamplesInRightTable.add(new DoubleArrayWrapper(getKeyValues(rightExample, rightKeys)));
                }
            }
        }

        // report progress in chunks of 100 examples
        if (++sinceLastStep == 100) {
            getProgress().step(100);
            sinceLastStep = 0;
        }
    }
    return resultBuilder;
}
/**
 * Performs a right join: every right example is combined with all matching left examples. A
 * right example for which no match list is found is added once via
 * {@link #addRightOnlyOccurence}.
 */
private ExampleSetBuilder performRightJoin(ExampleSet leftExampleSet, ExampleSet rightExampleSet,
        List<AttributeSource> originalAttributeSources, List<Attribute> unionAttributeList,
        Pair<Attribute[], Attribute[]> keyAttributes) throws ProcessStoppedException {
    final ExampleSetBuilder resultBuilder = ExampleSets.from(unionAttributeList);
    final boolean joinOnId = getParameterAsBoolean(PARAMETER_USE_ID);

    final Attribute[] leftKeys;
    final Attribute[] rightKeys;
    Map<DoubleArrayWrapper, List<Example>> leftRowsByKey = null;
    if (joinOnId) {
        // the id attributes act as keys when a right-only example is added
        leftKeys = new Attribute[] { leftExampleSet.getAttributes().getId() };
        rightKeys = new Attribute[] { rightExampleSet.getAttributes().getId() };
    } else {
        // build the key lookup table for the left example set
        leftKeys = keyAttributes.getFirst();
        rightKeys = keyAttributes.getSecond();
        leftRowsByKey = createKeyMapping(leftExampleSet, leftKeys, rightKeys);
    }

    final boolean keepBothKeys = getParameterAsBoolean(PARAMETER_KEEP_BOTH_JOIN_ATTRIBUTES);
    final boolean dropDoubles = getParameterAsBoolean(PARAMETER_REMOVE_DOUBLE_ATTRIBUTES);

    int sinceLastStep = 0;
    // iterate over the right table and look up the matching examples in the left table
    for (Example rightExample : rightExampleSet) {
        final List<Example> partners = getMatchingExamples(rightExampleSet, leftExampleSet, rightKeys,
                leftRowsByKey, joinOnId, rightExample);
        if (partners == null) {
            addRightOnlyOccurence(originalAttributeSources, unionAttributeList, resultBuilder, rightExample, leftKeys,
                    rightKeys, keepBothKeys, dropDoubles);
        } else {
            for (Example leftExample : partners) {
                addCombinedOccurence(originalAttributeSources, unionAttributeList, resultBuilder, leftExample,
                        rightExample);
            }
        }

        // report progress in chunks of 100 examples
        if (++sinceLastStep == 100) {
            getProgress().step(100);
            sinceLastStep = 0;
        }
    }
    return resultBuilder;
}
/**
 * Performs an outer join (not to be confused with a full outer join). The result consists of
 * the rows of a left join, followed by every right example whose key values were not recorded
 * as matched during that left join.
 */
private ExampleSetBuilder performOuterJoin(ExampleSet leftExampleSet, ExampleSet rightExampleSet,
        List<AttributeSource> originalAttributeSources, List<Attribute> unionAttributeList,
        Pair<Attribute[], Attribute[]> keyAttributes) throws ProcessStoppedException {
    final Attribute[] leftKeys = keyAttributes.getFirst();
    final Attribute[] rightKeys = keyAttributes.getSecond();

    // step 1: left join, which also collects the keys of all matched right examples
    final Set<DoubleArrayWrapper> matchedRightKeys = new HashSet<>();
    final ExampleSetBuilder resultBuilder = performLeftJoin(leftExampleSet, rightExampleSet,
            originalAttributeSources, unionAttributeList, keyAttributes, matchedRightKeys);

    final boolean keepBothKeys = getParameterAsBoolean(PARAMETER_KEEP_BOTH_JOIN_ATTRIBUTES);
    final boolean dropDoubles = getParameterAsBoolean(PARAMETER_REMOVE_DOUBLE_ATTRIBUTES);

    // step 2: append the right examples that did not take part in the left join
    int sinceLastStep = 0;
    for (Example rightExample : rightExampleSet) {
        final DoubleArrayWrapper rightKey = new DoubleArrayWrapper(getKeyValues(rightExample, rightKeys));
        final boolean alreadyJoined = matchedRightKeys.contains(rightKey);
        if (!alreadyJoined) {
            addRightOnlyOccurence(originalAttributeSources, unionAttributeList, resultBuilder, rightExample, leftKeys,
                    rightKeys, keepBothKeys, dropDoubles);
        }

        // report progress in chunks of 100 examples
        if (++sinceLastStep == 100) {
            getProgress().step(100);
            sinceLastStep = 0;
        }
    }
    return resultBuilder;
}
/**
 * Builds one result row from a matching pair of examples and adds it to the builder. For every
 * entry of originalAttributeSources the value is read from leftExample (first source) or from
 * rightExample (second source); the row has one cell per attribute in unionAttributeList.
 */
private void addCombinedOccurence(List<AttributeSource> originalAttributeSources, List<Attribute> unionAttributeList,
        ExampleSetBuilder builder, Example leftExample, Example rightExample) {
    final double[] row = new double[unionAttributeList.size()];
    int column = 0;
    for (AttributeSource origin : originalAttributeSources) {
        final boolean fromLeft = origin.getSource() == AttributeSource.FIRST_SOURCE;
        if (fromLeft) {
            row[column] = leftExample.getValue(origin.getAttribute());
        } else if (origin.getSource() == AttributeSource.SECOND_SOURCE) {
            row[column] = rightExample.getValue(origin.getAttribute());
        }
        // any other source leaves the cell at its initial value of 0
        ++column;
    }
    builder.addRow(row);
}
/**
 * Builds one result row for a left example without a partner and adds it to the builder. Cells
 * whose source is the left example set are filled from leftExample, cells whose source is the
 * right example set are set to NaN (missing).
 */
private void addLeftOnlyOccurence(List<AttributeSource> originalAttributeSources, List<Attribute> unionAttributeList,
        ExampleSetBuilder builder, Example leftExample) {
    final double[] row = new double[unionAttributeList.size()];
    int column = 0;
    for (AttributeSource origin : originalAttributeSources) {
        final boolean fromLeft = origin.getSource() == AttributeSource.FIRST_SOURCE;
        if (fromLeft) {
            row[column] = leftExample.getValue(origin.getAttribute());
        } else if (origin.getSource() == AttributeSource.SECOND_SOURCE) {
            row[column] = Double.NaN;
        }
        // any other source leaves the cell at its initial value of 0
        ++column;
    }
    builder.addRow(row);
}
/**
 * Builds one result row for a right example without a partner and adds it to the builder.
 * <p>
 * Cells whose source is the right example set are filled from rightExample. Cells whose source
 * is the left example set are set to NaN (missing), with one exception: if the cell belongs to a
 * left key attribute and only one column per key pair is kept, the value of the corresponding
 * right key attribute is used instead. For nominal keys the right value is translated into the
 * mapping of the left attribute; a missing right key value stays missing.
 * </p>
 *
 * @param leftKeyAttributes
 *            key attributes of the left example set
 * @param rightKeyAttributes
 *            key attributes of the right example set, parallel to leftKeyAttributes
 * @param keepBoth
 *            whether both attributes of a key pair are kept
 * @param removeDoubleAttributes
 *            whether attributes with the same name are removed
 */
private void addRightOnlyOccurence(List<AttributeSource> originalAttributeSources, List<Attribute> unionAttributeList,
        ExampleSetBuilder builder, Example rightExample, Attribute[] leftKeyAttributes, Attribute[] rightKeyAttributes,
        boolean keepBoth, boolean removeDoubleAttributes) {
    double[] unionDataRow = new double[unionAttributeList.size()];
    int attributeIndex = 0;
    for (AttributeSource attributeSource : originalAttributeSources) {
        if (attributeSource.getSource() == AttributeSource.FIRST_SOURCE) {
            // key attributes are always taken from the left example set, so for a right-only
            // row the key value has to be fetched from the right example explicitly

            // find the position of this attribute among the left key attributes
            int keyIndex = -1;
            for (int i = 0; i < leftKeyAttributes.length; ++i) {
                if (attributeSource.getAttribute() == leftKeyAttributes[i]) {
                    keyIndex = i;
                    break;
                }
            }

            if (keyIndex < 0) {
                // regular left attribute: no value available
                unionDataRow[attributeIndex] = Double.NaN;
            } else {
                Attribute leftAttribute = leftKeyAttributes[keyIndex];
                Attribute rightAttribute = rightKeyAttributes[keyIndex];
                boolean sameName = leftAttribute.getName().equals(rightAttribute.getName());
                if (keepBoth && !(removeDoubleAttributes && sameName)) {
                    // the right key column is part of the result itself
                    unionDataRow[attributeIndex] = Double.NaN;
                } else {
                    double rightValue = rightExample.getValue(rightAttribute);
                    if (Double.isNaN(rightValue) || !leftAttribute.isNominal()) {
                        // Missing values must stay missing: casting NaN to int would yield 0 and
                        // silently turn a missing key into the first nominal value.
                        unionDataRow[attributeIndex] = rightValue;
                    } else {
                        // consider different mappings in left and right attribute
                        String valueAsString = rightAttribute.getMapping().mapIndex((int) rightValue);
                        unionDataRow[attributeIndex] = leftAttribute.getMapping().mapString(valueAsString);
                    }
                }
            }
        } else if (attributeSource.getSource() == AttributeSource.SECOND_SOURCE) {
            unionDataRow[attributeIndex] = rightExample.getValue(attributeSource.getAttribute());
        }
        attributeIndex++;
    }
    builder.addRow(unionDataRow);
}
/**
 * Maps all values of the keyAttributes which occur in exampleSet to a list of matching
 * examples.
 *
 * @param exampleSet
 *            the example set for whose key attributes the mapping is created
 * @param keyAttributes
 *            the attributes which form the key
 * @param matchKeyAttributes
 *            if not null, the values of nominal keyAttributes are translated into the nominal
 *            mapping of these attributes before they are added to the map, and examples with a
 *            missing key value are left out of the map
 * @return a map from key values to the examples of exampleSet having these key values, in
 *         iteration order
 */
private Map<DoubleArrayWrapper, List<Example>> createKeyMapping(ExampleSet exampleSet, Attribute[] keyAttributes,
        Attribute[] matchKeyAttributes) {
    Map<DoubleArrayWrapper, List<Example>> keyMapping = new HashMap<>();

    // create mapping from nominal values of keyAttributes to matchKeyAttributes
    Map<Attribute, Map<Double, Double>> valueMapping = null;
    if (matchKeyAttributes != null) {
        // checked inside the guard so that a null matchKeyAttributes stays a legal argument
        assert keyAttributes.length == matchKeyAttributes.length;
        valueMapping = new HashMap<>();
        for (int attributeNumber = 0; attributeNumber < keyAttributes.length; ++attributeNumber) {
            if (keyAttributes[attributeNumber].isNominal()) {
                Map<Double, Double> valueMap = new HashMap<>();
                // TODO: iterate over getMapping().values() rather than relying on the
                // assumption that values appear in increasing order
                for (int valueNumber = 0; valueNumber < keyAttributes[attributeNumber].getMapping()
                        .size(); ++valueNumber) {
                    String valueString = keyAttributes[attributeNumber].getMapping().mapIndex(valueNumber);
                    valueMap.put((double) valueNumber,
                            (double) matchKeyAttributes[attributeNumber].getMapping().mapString(valueString));
                }
                valueMapping.put(keyAttributes[attributeNumber], valueMap);
            }
        }
    }

    for (Example example : exampleSet) {
        // fetch key values from example
        double[] keyValues = getKeyValues(example, keyAttributes);
        if (valueMapping != null) {
            // remap keyValues to match values of other attributes
            boolean hasMissingKey = false;
            for (int i = 0; i < keyValues.length; ++i) {
                if (Double.isNaN(keyValues[i])) {
                    hasMissingKey = true;
                    break;
                }
                if (keyAttributes[i].isNominal()) {
                    keyValues[i] = valueMapping.get(keyAttributes[i]).get(keyValues[i]);
                }
            }
            if (hasMissingKey) {
                // examples with a missing key value never take part in a match
                continue;
            }
        }

        // append the example to the list of its key, creating the list on first use
        DoubleArrayWrapper key = new DoubleArrayWrapper(keyValues);
        List<Example> keyExamples = keyMapping.get(key);
        if (keyExamples == null) {
            keyExamples = new LinkedList<>();
            keyMapping.put(key, keyExamples);
        }
        keyExamples.add(example);
    }
    return keyMapping;
}
/**
 * Returns the examples of secondExampleSet which match the key of referenceExample, an example
 * of firstExampleSet.
 * <p>
 * If useId is {@code true} (see {@link #PARAMETER_USE_ID}), the id lookup of secondExampleSet
 * is used; for a nominal id the value is first translated into the mapping of the second id
 * attribute. Otherwise the examples are looked up in secondKeyMapping (see
 * {@link #createKeyMapping}).
 * </p>
 *
 * @return the matching examples, or {@code null} if the id value of referenceExample is
 *         missing or the respective lookup returns {@code null}
 */
private List<Example> getMatchingExamples(ExampleSet firstExampleSet, ExampleSet secondExampleSet,
        Attribute[] firstKeyAttributes, Map<DoubleArrayWrapper, List<Example>> secondKeyMapping, boolean useId,
        Example referenceExample) {
    if (!useId) {
        // use the previously created key mapping
        final double[] referenceKey = getKeyValues(referenceExample, firstKeyAttributes);
        return secondKeyMapping.get(new DoubleArrayWrapper(referenceKey));
    }

    // use the existing id mapping of the second example set
    final Attribute firstIdAttribute = firstExampleSet.getAttributes().getId();
    final Attribute secondIdAttribute = secondExampleSet.getAttributes().getId();
    final double referenceId = referenceExample.getValue(firstIdAttribute);
    if (Double.isNaN(referenceId)) {
        // a missing id never matches
        return null;
    }

    final int[] hits;
    if (firstIdAttribute.isNominal()) {
        hits = secondExampleSet.getExampleIndicesFromId(
                secondIdAttribute.getMapping().getIndex(firstIdAttribute.getMapping().mapIndex((int) referenceId)));
    } else {
        hits = secondExampleSet.getExampleIndicesFromId(referenceId);
    }
    if (hits == null) {
        return null;
    }

    final List<Example> matches = new LinkedList<>();
    for (int hit : hits) {
        matches.add(secondExampleSet.getExample(hit));
    }
    return matches;
}
/**
 * Reads the values of the given key attributes from the example.
 *
 * @return a new array holding one value per key attribute, in the order of keyAttributes
 */
private double[] getKeyValues(Example example, Attribute[] keyAttributes) {
    final double[] values = new double[keyAttributes.length];
    int slot = 0;
    for (Attribute keyAttribute : keyAttributes) {
        values[slot++] = example.getValue(keyAttribute);
    }
    return values;
}
/**
 * Returns the attributes which are left out of the result: the key attributes of the right
 * example set, each paired with {@code AttributeSource.SECOND_SOURCE}.
 *
 * As the values of the key attributes of left and right example set are always the same, only
 * one set of key attributes is necessary. This is taken from the left example set. If both join
 * attributes are kept, nothing is excluded.
 */
@Override
protected Set<Pair<Integer, Attribute>> getExcludedAttributes(ExampleSet leftExampleSet, ExampleSet rightExampleSet)
        throws OperatorException {
    final boolean keepBothKeys = getParameterAsBoolean(PARAMETER_KEEP_BOTH_JOIN_ATTRIBUTES);
    if (keepBothKeys) {
        return Collections.emptySet();
    }

    final Attribute[] rightKeys = getKeyAttributes(leftExampleSet, rightExampleSet).getSecond();
    final Set<Pair<Integer, Attribute>> excluded = new HashSet<>();
    for (Attribute rightKey : rightKeys) {
        excluded.add(new Pair<>(AttributeSource.SECOND_SOURCE, rightKey));
    }
    return excluded;
}
/**
 * Meta data counterpart of {@link #getExcludedAttributes}: returns the meta data of the key
 * attributes of the right example set, each paired with {@code AttributeSource.SECOND_SOURCE}.
 *
 * As the values of the key attributes of left and right example set are always the same, only
 * one set of key attributes is necessary. This is taken from the left example set meta data.
 * If {@link #getKeyAttributesMD} returns {@code null}, nothing is excluded.
 */
@Override
protected Set<Pair<Integer, AttributeMetaData>> getExcludedAttributesMD(ExampleSetMetaData leftExampleSetMD,
        ExampleSetMetaData rightExampleSetMD) throws OperatorException {
    final Pair<AttributeMetaData[], AttributeMetaData[]> keyMetaData = getKeyAttributesMD(leftExampleSetMD,
            rightExampleSetMD);
    if (keyMetaData == null) {
        return Collections.emptySet();
    }

    final Set<Pair<Integer, AttributeMetaData>> excluded = new HashSet<>();
    for (AttributeMetaData rightKey : keyMetaData.getSecond()) {
        excluded.add(new Pair<>(AttributeSource.SECOND_SOURCE, rightKey));
    }
    return excluded;
}
/** Returns the current value of the {@link #PARAMETER_USE_ID} parameter. */
@Override
protected boolean isIdNeeded() {
    final boolean joinOnId = getParameterAsBoolean(PARAMETER_USE_ID);
    return joinOnId;
}
/**
 * Extends the parameters of the super class by the join type, the switch between id based and
 * key attribute based join, the list of key attribute pairs (registered with a dependency on
 * {@link #PARAMETER_USE_ID}) and the option to keep both attributes of a key pair.
 */
@Override
public List<ParameterType> getParameterTypes() {
    List<ParameterType> types = super.getParameterTypes();

    types.add(new ParameterTypeCategory(PARAMETER_JOIN_TYPE, "Specifies which join should be executed.", JOIN_TYPES,
            JOIN_TYPE_INNER, false));
    types.add(
            new ParameterTypeBoolean(PARAMETER_USE_ID, "Indicates if the id attribute is used for join.", true, false));

    ParameterType leftKey = new ParameterTypeAttribute(PARAMETER_LEFT_ATTRIBUTE_FOR_JOIN,
            "The attribute in the left example set to be used for the join.",
            getInputPorts().getPortByName(LEFT_EXAMPLE_SET_INPUT), true);
    // this column is bound to the right input port, so its description must name the right
    // example set as well
    ParameterType rightKey = new ParameterTypeAttribute(PARAMETER_RIGHT_ATTRIBUTE_FOR_JOIN,
            "The attribute in the right example set to be used for the join.",
            getInputPorts().getPortByName(RIGHT_EXAMPLE_SET_INPUT), true);
    ParameterType joinAttributes = new ParameterTypeList(PARAMETER_JOIN_ATTRIBUTES,
            "The attributes which shall be used for join. Attributes which shall be matched must be of the same type.",
            leftKey, rightKey, false);
    joinAttributes.registerDependencyCondition(new BooleanParameterCondition(this, PARAMETER_USE_ID, true, false));
    types.add(joinAttributes);

    ParameterType keepBoth = new ParameterTypeBoolean(PARAMETER_KEEP_BOTH_JOIN_ATTRIBUTES,
            "If checked, both columns of a join pair will be kept. Usually this is unneccessary since both attributes are identical.",
            false, true);
    types.add(keepBoth);

    return types;
}
/**
 * Returns the resource consumption estimator obtained from
 * {@link OperatorResourceConsumptionHandler} for the first input port and this operator class.
 */
@Override
public ResourceConsumptionEstimator getResourceConsumptionEstimator() {
    final ResourceConsumptionEstimator estimator = OperatorResourceConsumptionHandler
            .getResourceConsumptionEstimator(getInputPorts().getPortByIndex(0), ExampleSetJoin.class, null);
    return estimator;
}
}