/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.runtime.operators.hash; import org.apache.flink.api.common.functions.FlatJoinFunction; import org.apache.flink.api.common.typeutils.TypeComparator; import org.apache.flink.api.common.typeutils.TypePairComparator; import org.apache.flink.api.common.typeutils.TypeSerializer; import org.apache.flink.runtime.io.disk.iomanager.IOManager; import org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync; import org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable; import org.apache.flink.runtime.memory.MemoryManager; import org.apache.flink.runtime.operators.testutils.DiscardingOutputCollector; import org.apache.flink.runtime.operators.testutils.DummyInvokable; import org.apache.flink.runtime.operators.testutils.TestData; import org.apache.flink.runtime.operators.testutils.TestData.TupleGenerator; import org.apache.flink.runtime.operators.testutils.TestData.TupleGenerator.KeyMode; import org.apache.flink.runtime.operators.testutils.TestData.TupleGenerator.ValueMode; import org.apache.flink.runtime.operators.testutils.UniformIntPairGenerator; import org.apache.flink.runtime.operators.testutils.UnionIterator; import org.apache.flink.runtime.operators.testutils.types.IntPair; import org.apache.flink.runtime.operators.testutils.types.IntPairSerializer; import org.apache.flink.types.NullKeyFieldException; import org.apache.flink.util.Collector; import org.apache.flink.util.MutableObjectIterator; import org.apache.flink.util.TestLogger; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.apache.flink.api.common.typeutils.GenericPairComparator; import org.apache.flink.api.java.tuple.Tuple2; @SuppressWarnings({"serial", "EqualsWhichDoesntCheckParameterClass", "StatementWithEmptyBody", "KeySetIterationMayUseEntrySet"}) public class NonReusingHashJoinIteratorITCase extends TestLogger { private static final int MEMORY_SIZE = 16000000; // total memory private static final int INPUT_1_SIZE = 20000; private static final int INPUT_2_SIZE = 1000; private static final long SEED1 = 561349061987311L; private static final long SEED2 = 231434613412342L; private final AbstractInvokable parentTask = new DummyInvokable(); private IOManager ioManager; private MemoryManager memoryManager; private TypeSerializer<Tuple2<Integer, String>> recordSerializer; private TypeComparator<Tuple2<Integer, String>> record1Comparator; private TypeComparator<Tuple2<Integer, String>> record2Comparator; private TypePairComparator<Tuple2<Integer, String>, Tuple2<Integer, String>> recordPairComparator; private TypeSerializer<IntPair> pairSerializer; private TypeComparator<IntPair> pairComparator; private TypePairComparator<IntPair, Tuple2<Integer, String>> pairRecordPairComparator; private TypePairComparator<Tuple2<Integer, String>, IntPair> recordPairPairComparator; @SuppressWarnings("unchecked") @Before public void beforeTest() { this.recordSerializer = TestData.getIntStringTupleSerializer(); this.record1Comparator = TestData.getIntStringTupleComparator(); this.record2Comparator = TestData.getIntStringTupleComparator(); this.recordPairComparator = new GenericPairComparator(record1Comparator, record2Comparator); this.pairSerializer = new IntPairSerializer(); this.pairComparator = new TestData.IntPairComparator(); this.pairRecordPairComparator = new IntPairTuplePairComparator(); this.recordPairPairComparator = new TupleIntPairPairComparator(); this.memoryManager = new MemoryManager(MEMORY_SIZE, 1); this.ioManager = new IOManagerAsync(); } @After public void afterTest() { if (this.ioManager != null) { this.ioManager.shutdown(); if (!this.ioManager.isProperlyShutDown()) { Assert.fail("I/O manager failed to properly shut down."); } this.ioManager = null; } if (this.memoryManager != null) { Assert.assertTrue("Memory Leak: Not all memory has been returned to the memory manager.", this.memoryManager.verifyEmpty()); this.memoryManager.shutdown(); this.memoryManager = null; } } @Test public void testBuildFirst() { try { TupleGenerator generator1 = new TupleGenerator(SEED1, 500, 4096, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); TupleGenerator generator2 = new TupleGenerator(SEED2, 500, 2048, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); final TestData.TupleGeneratorIterator input1 = new TestData.TupleGeneratorIterator(generator1, INPUT_1_SIZE); final TestData.TupleGeneratorIterator input2 = new TestData.TupleGeneratorIterator(generator2, INPUT_2_SIZE); // collect expected data final Map<Integer, Collection<TupleMatch>> expectedMatchesMap = joinTuples( collectTupleData(input1), collectTupleData(input2)); final TupleMatchRemovingJoin matcher = new TupleMatchRemovingJoin(expectedMatchesMap); final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<Tuple2<Integer, String>>(); // reset the generators generator1.reset(); generator2.reset(); input1.reset(); input2.reset(); // compare with iterator values NonReusingBuildFirstHashJoinIterator<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = new NonReusingBuildFirstHashJoinIterator<>( input1, input2, this.recordSerializer, this.record1Comparator, this.recordSerializer, this.record2Comparator, this.recordPairComparator, this.memoryManager, ioManager, this.parentTask, 1.0, false, false, true); iterator.open(); //noinspection StatementWithEmptyBody while (iterator.callWithNextKey(matcher, collector)); iterator.close(); // assert that each expected match was seen for (Entry<Integer, Collection<TupleMatch>> entry : expectedMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } @Test public void testBuildFirstWithHighNumberOfCommonKeys() { // the size of the left and right inputs final int INPUT_1_SIZE = 200; final int INPUT_2_SIZE = 100; final int INPUT_1_DUPLICATES = 10; final int INPUT_2_DUPLICATES = 2000; final int DUPLICATE_KEY = 13; try { TupleGenerator generator1 = new TupleGenerator(SEED1, 500, 4096, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); TupleGenerator generator2 = new TupleGenerator(SEED2, 500, 2048, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); final TestData.TupleGeneratorIterator gen1Iter = new TestData.TupleGeneratorIterator(generator1, INPUT_1_SIZE); final TestData.TupleGeneratorIterator gen2Iter = new TestData.TupleGeneratorIterator(generator2, INPUT_2_SIZE); final TestData.TupleConstantValueIterator const1Iter = new TestData.TupleConstantValueIterator(DUPLICATE_KEY, "LEFT String for Duplicate Keys", INPUT_1_DUPLICATES); final TestData.TupleConstantValueIterator const2Iter = new TestData.TupleConstantValueIterator(DUPLICATE_KEY, "RIGHT String for Duplicate Keys", INPUT_2_DUPLICATES); final List<MutableObjectIterator<Tuple2<Integer, String>>> inList1 = new ArrayList<>(); inList1.add(gen1Iter); inList1.add(const1Iter); final List<MutableObjectIterator<Tuple2<Integer, String>>> inList2 = new ArrayList<>(); inList2.add(gen2Iter); inList2.add(const2Iter); MutableObjectIterator<Tuple2<Integer, String>> input1 = new UnionIterator<>(inList1); MutableObjectIterator<Tuple2<Integer, String>> input2 = new UnionIterator<>(inList2); // collect expected data final Map<Integer, Collection<TupleMatch>> expectedMatchesMap = joinTuples( collectTupleData(input1), collectTupleData(input2)); // re-create the whole thing for actual processing // reset the generators and iterators generator1.reset(); generator2.reset(); const1Iter.reset(); const2Iter.reset(); gen1Iter.reset(); gen2Iter.reset(); inList1.clear(); inList1.add(gen1Iter); inList1.add(const1Iter); inList2.clear(); inList2.add(gen2Iter); inList2.add(const2Iter); input1 = new UnionIterator<>(inList1); input2 = new UnionIterator<>(inList2); final TupleMatchRemovingJoin matcher = new TupleMatchRemovingJoin(expectedMatchesMap); final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<>(); NonReusingBuildFirstHashJoinIterator<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = new NonReusingBuildFirstHashJoinIterator<>( input1, input2, this.recordSerializer, this.record1Comparator, this.recordSerializer, this.record2Comparator, this.recordPairComparator, this.memoryManager, ioManager, this.parentTask, 1.0, false, false, true); iterator.open(); while (iterator.callWithNextKey(matcher, collector)); iterator.close(); // assert that each expected match was seen for (Entry<Integer, Collection<TupleMatch>> entry : expectedMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } @Test public void testBuildSecond() { try { TupleGenerator generator1 = new TupleGenerator(SEED1, 500, 4096, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); TupleGenerator generator2 = new TupleGenerator(SEED2, 500, 2048, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); final TestData.TupleGeneratorIterator input1 = new TestData.TupleGeneratorIterator(generator1, INPUT_1_SIZE); final TestData.TupleGeneratorIterator input2 = new TestData.TupleGeneratorIterator(generator2, INPUT_2_SIZE); // collect expected data final Map<Integer, Collection<TupleMatch>> expectedMatchesMap = joinTuples( collectTupleData(input1), collectTupleData(input2)); final TupleMatchRemovingJoin matcher = new TupleMatchRemovingJoin(expectedMatchesMap); final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<>(); // reset the generators generator1.reset(); generator2.reset(); input1.reset(); input2.reset(); // compare with iterator values NonReusingBuildSecondHashJoinIterator<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = new NonReusingBuildSecondHashJoinIterator<>( input1, input2, this.recordSerializer, this.record1Comparator, this.recordSerializer, this.record2Comparator, this.recordPairComparator, this.memoryManager, ioManager, this.parentTask, 1.0, false, false, true); iterator.open(); while (iterator.callWithNextKey(matcher, collector)); iterator.close(); // assert that each expected match was seen for (Entry<Integer, Collection<TupleMatch>> entry : expectedMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } @Test public void testBuildSecondWithHighNumberOfCommonKeys() { // the size of the left and right inputs final int INPUT_1_SIZE = 200; final int INPUT_2_SIZE = 100; final int INPUT_1_DUPLICATES = 10; final int INPUT_2_DUPLICATES = 2000; final int DUPLICATE_KEY = 13; try { TupleGenerator generator1 = new TupleGenerator(SEED1, 500, 4096, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); TupleGenerator generator2 = new TupleGenerator(SEED2, 500, 2048, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); final TestData.TupleGeneratorIterator gen1Iter = new TestData.TupleGeneratorIterator(generator1, INPUT_1_SIZE); final TestData.TupleGeneratorIterator gen2Iter = new TestData.TupleGeneratorIterator(generator2, INPUT_2_SIZE); final TestData.TupleConstantValueIterator const1Iter = new TestData.TupleConstantValueIterator(DUPLICATE_KEY, "LEFT String for Duplicate Keys", INPUT_1_DUPLICATES); final TestData.TupleConstantValueIterator const2Iter = new TestData.TupleConstantValueIterator(DUPLICATE_KEY, "RIGHT String for Duplicate Keys", INPUT_2_DUPLICATES); final List<MutableObjectIterator<Tuple2<Integer, String>>> inList1 = new ArrayList<>(); inList1.add(gen1Iter); inList1.add(const1Iter); final List<MutableObjectIterator<Tuple2<Integer, String>>> inList2 = new ArrayList<>(); inList2.add(gen2Iter); inList2.add(const2Iter); MutableObjectIterator<Tuple2<Integer, String>> input1 = new UnionIterator<>(inList1); MutableObjectIterator<Tuple2<Integer, String>> input2 = new UnionIterator<>(inList2); // collect expected data final Map<Integer, Collection<TupleMatch>> expectedMatchesMap = joinTuples( collectTupleData(input1), collectTupleData(input2)); // re-create the whole thing for actual processing // reset the generators and iterators generator1.reset(); generator2.reset(); const1Iter.reset(); const2Iter.reset(); gen1Iter.reset(); gen2Iter.reset(); inList1.clear(); inList1.add(gen1Iter); inList1.add(const1Iter); inList2.clear(); inList2.add(gen2Iter); inList2.add(const2Iter); input1 = new UnionIterator<>(inList1); input2 = new UnionIterator<>(inList2); final TupleMatchRemovingJoin matcher = new TupleMatchRemovingJoin(expectedMatchesMap); final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<>(); NonReusingBuildSecondHashJoinIterator<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = new NonReusingBuildSecondHashJoinIterator<>( input1, input2, this.recordSerializer, this.record1Comparator, this.recordSerializer, this.record2Comparator, this.recordPairComparator, this.memoryManager, ioManager, this.parentTask, 1.0, false, false, true); iterator.open(); while (iterator.callWithNextKey(matcher, collector)); iterator.close(); // assert that each expected match was seen for (Entry<Integer, Collection<TupleMatch>> entry : expectedMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } @Test public void testBuildFirstWithMixedDataTypes() { try { MutableObjectIterator<IntPair> input1 = new UniformIntPairGenerator(500, 40, false); final TupleGenerator generator2 = new TupleGenerator(SEED2, 500, 2048, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); final TestData.TupleGeneratorIterator input2 = new TestData.TupleGeneratorIterator(generator2, INPUT_2_SIZE); // collect expected data final Map<Integer, Collection<TupleIntPairMatch>> expectedMatchesMap = joinIntPairs( collectIntPairData(input1), collectTupleData(input2)); final FlatJoinFunction<IntPair, Tuple2<Integer, String>, Tuple2<Integer, String>> matcher = new TupleIntPairMatchRemovingMatcher(expectedMatchesMap); final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<>(); // reset the generators input1 = new UniformIntPairGenerator(500, 40, false); generator2.reset(); input2.reset(); // compare with iterator values NonReusingBuildSecondHashJoinIterator<IntPair, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = new NonReusingBuildSecondHashJoinIterator<>( input1, input2, this.pairSerializer, this.pairComparator, this.recordSerializer, this.record2Comparator, this.pairRecordPairComparator, this.memoryManager, this.ioManager, this.parentTask, 1.0, false, false, true); iterator.open(); while (iterator.callWithNextKey(matcher, collector)); iterator.close(); // assert that each expected match was seen for (Entry<Integer, Collection<TupleIntPairMatch>> entry : expectedMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } @Test public void testBuildSecondWithMixedDataTypes() { try { MutableObjectIterator<IntPair> input1 = new UniformIntPairGenerator(500, 40, false); final TupleGenerator generator2 = new TupleGenerator(SEED2, 500, 2048, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); final TestData.TupleGeneratorIterator input2 = new TestData.TupleGeneratorIterator(generator2, INPUT_2_SIZE); // collect expected data final Map<Integer, Collection<TupleIntPairMatch>> expectedMatchesMap = joinIntPairs( collectIntPairData(input1), collectTupleData(input2)); final FlatJoinFunction<IntPair, Tuple2<Integer, String>, Tuple2<Integer, String>> matcher = new TupleIntPairMatchRemovingMatcher(expectedMatchesMap); final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<>(); // reset the generators input1 = new UniformIntPairGenerator(500, 40, false); generator2.reset(); input2.reset(); // compare with iterator values NonReusingBuildFirstHashJoinIterator<IntPair, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = new NonReusingBuildFirstHashJoinIterator<>( input1, input2, this.pairSerializer, this.pairComparator, this.recordSerializer, this.record2Comparator, this.recordPairPairComparator, this.memoryManager, this.ioManager, this.parentTask, 1.0, false, false, true); iterator.open(); while (iterator.callWithNextKey(matcher, collector)); iterator.close(); // assert that each expected match was seen for (Entry<Integer, Collection<TupleIntPairMatch>> entry : expectedMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } @Test public void testBuildFirstAndProbeSideOuterJoin() { try { TupleGenerator generator1 = new TupleGenerator(SEED1, 500, 4096, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); TupleGenerator generator2 = new TupleGenerator(SEED2, 1000, 2048, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); final TestData.TupleGeneratorIterator input1 = new TestData.TupleGeneratorIterator(generator1, INPUT_1_SIZE); final TestData.TupleGeneratorIterator input2 = new TestData.TupleGeneratorIterator(generator2, INPUT_2_SIZE); // collect expected data final Map<Integer, Collection<TupleMatch>> expectedMatchesMap = rightOuterJoinTuples( collectTupleData(input1), collectTupleData(input2)); final TupleMatchRemovingJoin matcher = new TupleMatchRemovingJoin(expectedMatchesMap); final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<>(); // reset the generators generator1.reset(); generator2.reset(); input1.reset(); input2.reset(); // compare with iterator values NonReusingBuildFirstHashJoinIterator<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = new NonReusingBuildFirstHashJoinIterator<>( input1, input2, this.recordSerializer, this.record1Comparator, this.recordSerializer, this.record2Comparator, this.recordPairComparator, this.memoryManager, ioManager, this.parentTask, 1.0, true, false, false); iterator.open(); while (iterator.callWithNextKey(matcher, collector)); iterator.close(); // assert that each expected match was seen for (Entry<Integer, Collection<TupleMatch>> entry : expectedMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } @Test public void testBuildFirstAndBuildSideOuterJoin() { try { TupleGenerator generator1 = new TupleGenerator(SEED1, 500, 4096, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); TupleGenerator generator2 = new TupleGenerator(SEED2, 1000, 2048, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); final TestData.TupleGeneratorIterator input1 = new TestData.TupleGeneratorIterator(generator1, INPUT_1_SIZE); final TestData.TupleGeneratorIterator input2 = new TestData.TupleGeneratorIterator(generator2, INPUT_2_SIZE); // collect expected data final Map<Integer, Collection<TupleMatch>> expectedMatchesMap = leftOuterJoinTuples( collectTupleData(input1), collectTupleData(input2)); final TupleMatchRemovingJoin matcher = new TupleMatchRemovingJoin(expectedMatchesMap); final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<>(); // reset the generators generator1.reset(); generator2.reset(); input1.reset(); input2.reset(); // compare with iterator values NonReusingBuildFirstHashJoinIterator<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = new NonReusingBuildFirstHashJoinIterator<>( input1, input2, this.recordSerializer, this.record1Comparator, this.recordSerializer, this.record2Comparator, this.recordPairComparator, this.memoryManager, ioManager, this.parentTask, 1.0, false, true, false); iterator.open(); while (iterator.callWithNextKey(matcher, collector)); iterator.close(); // assert that each expected match was seen for (Entry<Integer, Collection<TupleMatch>> entry : expectedMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } @Test public void testBuildFirstAndFullOuterJoin() { try { TupleGenerator generator1 = new TupleGenerator(SEED1, 500, 4096, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); TupleGenerator generator2 = new TupleGenerator(SEED2, 1000, 2048, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); final TestData.TupleGeneratorIterator input1 = new TestData.TupleGeneratorIterator(generator1, INPUT_1_SIZE); final TestData.TupleGeneratorIterator input2 = new TestData.TupleGeneratorIterator(generator2, INPUT_2_SIZE); // collect expected data final Map<Integer, Collection<TupleMatch>> expectedMatchesMap = fullOuterJoinTuples( collectTupleData(input1), collectTupleData(input2)); final TupleMatchRemovingJoin matcher = new TupleMatchRemovingJoin(expectedMatchesMap); final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<>(); // reset the generators generator1.reset(); generator2.reset(); input1.reset(); input2.reset(); // compare with iterator values NonReusingBuildFirstHashJoinIterator<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = new NonReusingBuildFirstHashJoinIterator<>( input1, input2, this.recordSerializer, this.record1Comparator, this.recordSerializer, this.record2Comparator, this.recordPairComparator, this.memoryManager, ioManager, this.parentTask, 1.0, true, true, false); iterator.open(); while (iterator.callWithNextKey(matcher, collector)); iterator.close(); // assert that each expected match was seen for (Entry<Integer, Collection<TupleMatch>> entry : expectedMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } @Test public void testBuildSecondAndProbeSideOuterJoin() { try { TupleGenerator generator1 = new TupleGenerator(SEED1, 1000, 4096, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); TupleGenerator generator2 = new TupleGenerator(SEED2, 500, 2048, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); final TestData.TupleGeneratorIterator input1 = new TestData.TupleGeneratorIterator(generator1, INPUT_1_SIZE); final TestData.TupleGeneratorIterator input2 = new TestData.TupleGeneratorIterator(generator2, INPUT_2_SIZE); // collect expected data final Map<Integer, Collection<TupleMatch>> expectedMatchesMap = leftOuterJoinTuples( collectTupleData(input1), collectTupleData(input2)); final TupleMatchRemovingJoin matcher = new TupleMatchRemovingJoin(expectedMatchesMap); final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<>(); // reset the generators generator1.reset(); generator2.reset(); input1.reset(); input2.reset(); // compare with iterator values NonReusingBuildSecondHashJoinIterator<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = new NonReusingBuildSecondHashJoinIterator<>( input1, input2, this.recordSerializer, this.record1Comparator, this.recordSerializer, this.record2Comparator, this.recordPairComparator, this.memoryManager, ioManager, this.parentTask, 1.0, true, false, false); iterator.open(); while (iterator.callWithNextKey(matcher, collector)); iterator.close(); // assert that each expected match was seen for (Entry<Integer, Collection<TupleMatch>> entry : expectedMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } @Test public void testBuildSecondAndBuildSideOuterJoin() { try { TupleGenerator generator1 = new TupleGenerator(SEED1, 1000, 4096, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); TupleGenerator generator2 = new TupleGenerator(SEED2, 500, 2048, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); final TestData.TupleGeneratorIterator input1 = new TestData.TupleGeneratorIterator(generator1, INPUT_1_SIZE); final TestData.TupleGeneratorIterator input2 = new TestData.TupleGeneratorIterator(generator2, INPUT_2_SIZE); // collect expected data final Map<Integer, Collection<TupleMatch>> expectedMatchesMap = rightOuterJoinTuples( collectTupleData(input1), collectTupleData(input2)); final TupleMatchRemovingJoin matcher = new TupleMatchRemovingJoin(expectedMatchesMap); final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<>(); // reset the generators generator1.reset(); generator2.reset(); input1.reset(); input2.reset(); // compare with iterator values NonReusingBuildSecondHashJoinIterator<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = new NonReusingBuildSecondHashJoinIterator<>( input1, input2, this.recordSerializer, this.record1Comparator, this.recordSerializer, this.record2Comparator, this.recordPairComparator, this.memoryManager, ioManager, this.parentTask, 1.0, false, true, false); iterator.open(); while (iterator.callWithNextKey(matcher, collector)); iterator.close(); // assert that each expected match was seen for (Entry<Integer, Collection<TupleMatch>> entry : expectedMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } @Test public void testBuildSecondAndFullOuterJoin() { try { TupleGenerator generator1 = new TupleGenerator(SEED1, 1000, 4096, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); TupleGenerator generator2 = new TupleGenerator(SEED2, 500, 2048, KeyMode.RANDOM, ValueMode.RANDOM_LENGTH); final TestData.TupleGeneratorIterator input1 = new TestData.TupleGeneratorIterator(generator1, INPUT_1_SIZE); final TestData.TupleGeneratorIterator input2 = new TestData.TupleGeneratorIterator(generator2, INPUT_2_SIZE); // collect expected data final Map<Integer, Collection<TupleMatch>> expectedMatchesMap = fullOuterJoinTuples( collectTupleData(input1), collectTupleData(input2)); final TupleMatchRemovingJoin matcher = new TupleMatchRemovingJoin(expectedMatchesMap); final Collector<Tuple2<Integer, String>> collector = new DiscardingOutputCollector<>(); // reset the generators generator1.reset(); generator2.reset(); input1.reset(); input2.reset(); // compare with iterator values NonReusingBuildSecondHashJoinIterator<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> iterator = new NonReusingBuildSecondHashJoinIterator<>( input1, input2, this.recordSerializer, this.record1Comparator, this.recordSerializer, this.record2Comparator, this.recordPairComparator, this.memoryManager, ioManager, this.parentTask, 1.0, true, true, false); iterator.open(); while (iterator.callWithNextKey(matcher, collector)); iterator.close(); // assert that each expected match was seen for (Entry<Integer, Collection<TupleMatch>> entry : expectedMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } // -------------------------------------------------------------------------------------------- // Utilities // -------------------------------------------------------------------------------------------- public static Map<Integer, Collection<TupleMatch>> joinTuples( Map<Integer, Collection<String>> leftMap, Map<Integer, Collection<String>> rightMap) { Map<Integer, Collection<TupleMatch>> map = new HashMap<>(); for (Integer key : leftMap.keySet()) { Collection<String> leftValues = leftMap.get(key); Collection<String> rightValues = rightMap.get(key); if (rightValues == null) { continue; } if (!map.containsKey(key)) { map.put(key, new ArrayList<TupleMatch>()); } Collection<TupleMatch> matchedValues = map.get(key); for (String leftValue : leftValues) { for (String rightValue : rightValues) { matchedValues.add(new TupleMatch(leftValue, rightValue)); } } } return map; } public static Map<Integer, Collection<TupleMatch>> leftOuterJoinTuples( Map<Integer, Collection<String>> leftMap, Map<Integer, Collection<String>> rightMap) { Map<Integer, Collection<TupleMatch>> map = new HashMap<>(); for (Integer key : leftMap.keySet()) { Collection<String> leftValues = leftMap.get(key); Collection<String> rightValues = rightMap.get(key); if (!map.containsKey(key)) { map.put(key, new ArrayList<TupleMatch>()); } Collection<TupleMatch> matchedValues = map.get(key); for (String leftValue : leftValues) { if(rightValues != null) { for (String rightValue : rightValues) { matchedValues.add(new TupleMatch(leftValue, rightValue)); } } else { matchedValues.add(new TupleMatch(leftValue, null)); } } } return map; } public static Map<Integer, Collection<TupleMatch>> rightOuterJoinTuples( Map<Integer, Collection<String>> leftMap, Map<Integer, Collection<String>> rightMap) { Map<Integer, Collection<TupleMatch>> map = new HashMap<>(); for (Integer key : rightMap.keySet()) { Collection<String> leftValues = leftMap.get(key); Collection<String> rightValues = rightMap.get(key); if (!map.containsKey(key)) { map.put(key, new ArrayList<TupleMatch>()); } Collection<TupleMatch> matchedValues = map.get(key); for (String rightValue : rightValues) { if(leftValues != null) { for (String leftValue : leftValues) { matchedValues.add(new TupleMatch(leftValue, rightValue)); } } else { matchedValues.add(new TupleMatch(null, rightValue)); } } } return map; } public static Map<Integer, Collection<TupleMatch>> fullOuterJoinTuples( Map<Integer, Collection<String>> leftMap, Map<Integer, Collection<String>> rightMap) { Map<Integer, Collection<TupleMatch>> map = new HashMap<>(); for (Integer key : rightMap.keySet()) { Collection<String> leftValues = leftMap.get(key); Collection<String> rightValues = rightMap.get(key); if (!map.containsKey(key)) { map.put(key, new ArrayList<TupleMatch>()); } Collection<TupleMatch> matchedValues = map.get(key); for (String rightValue : rightValues) { if(leftValues != null) { for (String leftValue : leftValues) { matchedValues.add(new TupleMatch(leftValue, rightValue)); } } else { matchedValues.add(new TupleMatch(null, rightValue)); } } } for (Integer key : leftMap.keySet()) { Collection<String> leftValues = leftMap.get(key); Collection<String> rightValues = rightMap.get(key); if (rightValues == null) { if (!map.containsKey(key)) { map.put(key, new ArrayList<TupleMatch>()); } Collection<TupleMatch> matchedValues = map.get(key); for (String leftValue : leftValues) { matchedValues.add(new TupleMatch(leftValue, null)); } } } return map; } public static Map<Integer, Collection<TupleIntPairMatch>> joinIntPairs( Map<Integer, Collection<Integer>> leftMap, Map<Integer, Collection<String>> rightMap) { final Map<Integer, Collection<TupleIntPairMatch>> map = new HashMap<>(); for (Integer i : leftMap.keySet()) { final Collection<Integer> leftValues = leftMap.get(i); final Collection<String> rightValues = rightMap.get(i); if (rightValues == null) { continue; } if (!map.containsKey(i)) { map.put(i, new ArrayList<TupleIntPairMatch>()); } final Collection<TupleIntPairMatch> matchedValues = map.get(i); for (Integer v : leftValues) { for (String val : rightValues) { matchedValues.add(new TupleIntPairMatch(v, val)); } } } return map; } public static Map<Integer, Collection<String>> collectTupleData(MutableObjectIterator<Tuple2<Integer, String>> iter) throws Exception { Map<Integer, Collection<String>> map = new HashMap<>(); Tuple2<Integer, String> pair = new Tuple2<>(); while ((pair = iter.next(pair)) != null) { Integer key = pair.f0; if (!map.containsKey(key)) { map.put(key, new ArrayList<String>()); } Collection<String> values = map.get(key); values.add(pair.f1); } return map; } public static Map<Integer, Collection<Integer>> collectIntPairData(MutableObjectIterator<IntPair> iter) throws Exception { Map<Integer, Collection<Integer>> map = new HashMap<>(); IntPair pair = new IntPair(); while ((pair = iter.next(pair)) != null) { final int key = pair.getKey(); final int value = pair.getValue(); if (!map.containsKey(key)) { map.put(key, new ArrayList<Integer>()); } Collection<Integer> values = map.get(key); values.add(value); } return map; } /** * Class used for storage of the expected matches in a hash-map. */ public static class TupleMatch { private final String left; private final String right; public TupleMatch(String left, String right) { this.left = left; this.right = right; } @Override public boolean equals(Object obj) { TupleMatch that = (TupleMatch) obj; return (this.right == null ? that.right == null : (that.right != null && this.right.equals(that.right))) && (this.left == null ? that.left == null : (that.left != null && this.left.equals(that.left))); } @Override public int hashCode() { int hc = this.left != null ? this.left.hashCode() : 23; hc = hc ^ (this.right != null ? this.right.hashCode() : 41); return hc; } @Override public String toString() { String s = left == null ? "<null>" : left; s += ", " + (right == null ? "<null>" : right); return s; } } /** * Private class used for storage of the expected matches in a hash-map. */ public static class TupleIntPairMatch { private final int left; private final String right; public TupleIntPairMatch(int left, String right) { this.left = left; this.right = right; } @Override public boolean equals(Object obj) { TupleIntPairMatch o = (TupleIntPairMatch) obj; return this.left == o.left && this.right.equals(o.right); } @Override public int hashCode() { return this.left ^ this.right.hashCode(); } @Override public String toString() { return left + ", " + right; } } static final class TupleMatchRemovingJoin implements FlatJoinFunction<Tuple2<Integer, String>, Tuple2<Integer, String>, Tuple2<Integer, String>> { private final Map<Integer, Collection<TupleMatch>> toRemoveFrom; protected TupleMatchRemovingJoin(Map<Integer, Collection<TupleMatch>> map) { this.toRemoveFrom = map; } @Override public void join(Tuple2<Integer, String> rec1, Tuple2<Integer, String> rec2, Collector<Tuple2<Integer, String>> out) throws Exception { int key = rec1 != null ? rec1.f0 : rec2.f0; String value1 = rec1 != null ? rec1.f1 : null; String value2 = rec2 != null ? rec2.f1 : null; //System.err.println("rec1 key = "+key+" rec2 key= "+rec2.f0); Collection<TupleMatch> matches = this.toRemoveFrom.get(key); if (matches == null) { Assert.fail("Match " + key + " - " + value1 + ":" + value2 + " is unexpected."); } Assert.assertTrue("Produced match was not contained: " + key + " - " + value1 + ":" + value2, matches.remove(new TupleMatch(value1, value2))); if (matches.isEmpty()) { this.toRemoveFrom.remove(key); } } } static final class TupleIntPairMatchRemovingMatcher implements FlatJoinFunction<IntPair, Tuple2<Integer, String>, Tuple2<Integer, String>> { private final Map<Integer, Collection<TupleIntPairMatch>> toRemoveFrom; protected TupleIntPairMatchRemovingMatcher(Map<Integer, Collection<TupleIntPairMatch>> map) { this.toRemoveFrom = map; } @Override public void join(IntPair rec1, Tuple2<Integer, String> rec2, Collector<Tuple2<Integer, String>> out) throws Exception { final int k = rec1.getKey(); final int v = rec1.getValue(); final Integer key = rec2.f0; final String value = rec2.f1; Assert.assertTrue("Key does not match for matching IntPair Tuple combination.", k == key); Collection<TupleIntPairMatch> matches = this.toRemoveFrom.get(key); if (matches == null) { Assert.fail("Match " + key + " - " + v + ":" + value + " is unexpected."); } Assert.assertTrue("Produced match was not contained: " + key + " - " + v + ":" + value, matches.remove(new TupleIntPairMatch(v, value))); if (matches.isEmpty()) { this.toRemoveFrom.remove(key); } } } static final class IntPairTuplePairComparator extends TypePairComparator<IntPair, Tuple2<Integer, String>> { private int reference; @Override public void setReference(IntPair reference) { this.reference = reference.getKey(); } @Override public boolean equalToReference(Tuple2<Integer, String> candidate) { try { return candidate.f0 == this.reference; } catch (NullPointerException npex) { throw new NullKeyFieldException(); } } @Override public int compareToReference(Tuple2<Integer, String> candidate) { try { return candidate.f0 - this.reference; } catch (NullPointerException npex) { throw new NullKeyFieldException(); } } } static final class TupleIntPairPairComparator extends TypePairComparator<Tuple2<Integer, String>, IntPair> { private int reference; @Override public void setReference(Tuple2<Integer, String> reference) { this.reference = reference.f0; } @Override public boolean equalToReference(IntPair candidate) { return this.reference == candidate.getKey(); } @Override public int compareToReference(IntPair candidate) { return candidate.getKey() - this.reference; } } }