/*********************************************************************************************************************** * Copyright (C) 2010-2013 by the Stratosphere project (http://stratosphere.eu) * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the * specific language governing permissions and limitations under the License. **********************************************************************************************************************/ package eu.stratosphere.pact.runtime.hash; import static org.junit.Assert.fail; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; import eu.stratosphere.api.common.typeutils.TypeComparator; import eu.stratosphere.api.common.typeutils.TypePairComparator; import eu.stratosphere.api.common.typeutils.TypeSerializer; import eu.stratosphere.api.java.record.functions.JoinFunction; import eu.stratosphere.core.memory.MemorySegment; import eu.stratosphere.nephele.services.iomanager.IOManager; import eu.stratosphere.nephele.services.memorymanager.MemoryAllocationException; import eu.stratosphere.nephele.services.memorymanager.MemoryManager; import eu.stratosphere.nephele.services.memorymanager.spi.DefaultMemoryManager; import eu.stratosphere.nephele.template.AbstractInvokable; import eu.stratosphere.nephele.template.AbstractTask; import eu.stratosphere.pact.runtime.hash.HashMatchIteratorITCase.RecordMatch; import eu.stratosphere.pact.runtime.hash.HashMatchIteratorITCase.RecordMatchRemovingJoin; import eu.stratosphere.pact.runtime.hash.HashTableITCase.ConstantsKeyValuePairsIterator; import eu.stratosphere.pact.runtime.hash.MutableHashTable.HashBucketIterator; import eu.stratosphere.api.java.typeutils.runtime.record.RecordComparator; import eu.stratosphere.api.java.typeutils.runtime.record.RecordPairComparator; import eu.stratosphere.api.java.typeutils.runtime.record.RecordSerializer; import eu.stratosphere.pact.runtime.test.util.DiscardingOutputCollector; import eu.stratosphere.pact.runtime.test.util.DummyInvokable; import eu.stratosphere.pact.runtime.test.util.TestData; import eu.stratosphere.pact.runtime.test.util.TestData.Generator; import eu.stratosphere.pact.runtime.test.util.TestData.Generator.KeyMode; import eu.stratosphere.pact.runtime.test.util.TestData.Generator.ValueMode; import eu.stratosphere.pact.runtime.test.util.TestData.Key; import eu.stratosphere.pact.runtime.test.util.UniformRecordGenerator; import eu.stratosphere.pact.runtime.test.util.UnionIterator; import eu.stratosphere.types.IntValue; import eu.stratosphere.types.Record; import eu.stratosphere.util.Collector; import eu.stratosphere.util.MutableObjectIterator; /** * Test specialized hash join that keeps the build side data (in memory and on hard disk) * This is used for iterative tasks. */ public class ReOpenableHashTableITCase { private static final int PAGE_SIZE = 8 * 1024; private static final long MEMORY_SIZE = PAGE_SIZE * 1000; // 100 Pages. private static final long SEED1 = 561349061987311L; private static final long SEED2 = 231434613412342L; private static final int NUM_PROBES = 3; // number of reopenings of hash join private final AbstractTask parentTask = new DummyInvokable(); private IOManager ioManager; private MemoryManager memoryManager; private TypeSerializer<Record> recordSerializer; private TypeComparator<Record> record1Comparator; private TypeComparator<Record> record2Comparator; private TypePairComparator<Record, Record> recordPairComparator; private static final AbstractInvokable MEM_OWNER = new DummyInvokable(); private TypeSerializer<Record> recordBuildSideAccesssor; private TypeSerializer<Record> recordProbeSideAccesssor; private TypeComparator<Record> recordBuildSideComparator; private TypeComparator<Record> recordProbeSideComparator; private TypePairComparator<Record, Record> pactRecordComparator; @SuppressWarnings("unchecked") @Before public void beforeTest() { this.recordSerializer = RecordSerializer.get(); this.record1Comparator = new RecordComparator(new int[] {0}, new Class[] {TestData.Key.class}); this.record2Comparator = new RecordComparator(new int[] {0}, new Class[] {TestData.Key.class}); this.recordPairComparator = new RecordPairComparator(new int[] {0}, new int[] {0}, new Class[] {TestData.Key.class}); final int[] keyPos = new int[] {0}; final Class<? extends Key>[] keyType = (Class<? extends Key>[]) new Class[] { IntValue.class }; this.recordBuildSideAccesssor = RecordSerializer.get(); this.recordProbeSideAccesssor = RecordSerializer.get(); this.recordBuildSideComparator = new RecordComparator(keyPos, keyType); this.recordProbeSideComparator = new RecordComparator(keyPos, keyType); this.pactRecordComparator = new HashTableITCase.RecordPairComparatorFirstInt(); this.memoryManager = new DefaultMemoryManager(MEMORY_SIZE, PAGE_SIZE); this.ioManager = new IOManager(); } @After public void afterTest() { if (this.ioManager != null) { this.ioManager.shutdown(); if (!this.ioManager.isProperlyShutDown()) { Assert.fail("I/O manager failed to properly shut down."); } this.ioManager = null; } if (this.memoryManager != null) { Assert.assertTrue("Memory Leak: Not all memory has been returned to the memory manager.", this.memoryManager.verifyEmpty()); this.memoryManager.shutdown(); this.memoryManager = null; } } /** * Test behavior with overflow buckets (Overflow buckets must be initialized correctly * if the input is reopened again) */ @Test public void testOverflow() { int buildSize = 1000; int probeSize = 1000; try { Generator bgen = new Generator(SEED1, 200, 1024, KeyMode.RANDOM, ValueMode.FIX_LENGTH); Generator pgen = new Generator(SEED2, 0, 1024, KeyMode.SORTED, ValueMode.FIX_LENGTH); final TestData.GeneratorIterator buildInput = new TestData.GeneratorIterator(bgen, buildSize); final TestData.GeneratorIterator probeInput = new TestData.GeneratorIterator(pgen, probeSize); doTest(buildInput,probeInput, bgen, pgen); } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } /** * Verify proper operation if the build side is spilled to disk. */ @Test public void testDoubleProbeSpilling() { int buildSize = 1000; int probeSize = 1000; try { Generator bgen = new Generator(SEED1, 0, 1024, KeyMode.SORTED, ValueMode.FIX_LENGTH); Generator pgen = new Generator(SEED2, 0, 1024, KeyMode.SORTED, ValueMode.FIX_LENGTH); final TestData.GeneratorIterator buildInput = new TestData.GeneratorIterator(bgen, buildSize); final TestData.GeneratorIterator probeInput = new TestData.GeneratorIterator(pgen, probeSize); doTest(buildInput,probeInput, bgen, pgen); } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } /** * This test case verifies that hybrid hash join is able to handle multiple probe phases * when the build side fits completely into memory. */ @Test public void testDoubleProbeInMemory() { int buildSize = 1000; int probeSize = 1000; try { Generator bgen = new Generator(SEED1, 0, 28, KeyMode.SORTED, ValueMode.FIX_LENGTH); Generator pgen = new Generator(SEED2, 0, 28, KeyMode.SORTED, ValueMode.FIX_LENGTH); final TestData.GeneratorIterator buildInput = new TestData.GeneratorIterator(bgen, buildSize); final TestData.GeneratorIterator probeInput = new TestData.GeneratorIterator(pgen, probeSize); doTest(buildInput,probeInput, bgen, pgen); } catch (Exception e) { e.printStackTrace(); Assert.fail("An exception occurred during the test: " + e.getMessage()); } } private void doTest(TestData.GeneratorIterator buildInput, TestData.GeneratorIterator probeInput, Generator bgen, Generator pgen) throws Exception { // collect expected data final Map<TestData.Key, Collection<RecordMatch>> expectedFirstMatchesMap = HashMatchIteratorITCase.matchRecordValues( HashMatchIteratorITCase.collectRecordData(buildInput), HashMatchIteratorITCase.collectRecordData(probeInput)); final List<Map<TestData.Key, Collection<RecordMatch>>> expectedNMatchesMapList = new ArrayList<Map<Key,Collection<RecordMatch>>>(NUM_PROBES); final JoinFunction[] nMatcher = new RecordMatchRemovingJoin[NUM_PROBES]; for(int i = 0; i < NUM_PROBES; i++) { Map<TestData.Key, Collection<RecordMatch>> tmp; expectedNMatchesMapList.add(tmp = deepCopy(expectedFirstMatchesMap)); nMatcher[i] = new RecordMatchRemovingJoin(tmp); } final JoinFunction firstMatcher = new RecordMatchRemovingJoin(expectedFirstMatchesMap); final Collector<Record> collector = new DiscardingOutputCollector<Record>(); // reset the generators bgen.reset(); pgen.reset(); buildInput.reset(); probeInput.reset(); // compare with iterator values BuildFirstReOpenableHashMatchIterator<Record, Record, Record> iterator = new BuildFirstReOpenableHashMatchIterator<Record, Record, Record>( buildInput, probeInput, this.recordSerializer, this.record1Comparator, this.recordSerializer, this.record2Comparator, this.recordPairComparator, this.memoryManager, ioManager, this.parentTask, MEMORY_SIZE); iterator.open(); // do first join with both inputs while (iterator.callWithNextKey(firstMatcher, collector)); // assert that each expected match was seen for the first input for (Entry<TestData.Key, Collection<RecordMatch>> entry : expectedFirstMatchesMap.entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } for(int i = 0; i < NUM_PROBES; i++) { pgen.reset(); probeInput.reset(); // prepare .. iterator.reopenProbe(probeInput); // .. and do second join while (iterator.callWithNextKey(nMatcher[i], collector)); // assert that each expected match was seen for the second input for (Entry<TestData.Key, Collection<RecordMatch>> entry : expectedNMatchesMapList.get(i).entrySet()) { if (!entry.getValue().isEmpty()) { Assert.fail("Collection for key " + entry.getKey() + " is not empty"); } } } iterator.close(); } // // // Tests taken from HahTableITCase! // // private final MutableObjectIterator<Record> getProbeInput(final int numKeys, final int probeValsPerKey, final int repeatedValue1, final int repeatedValue2) { MutableObjectIterator<Record> probe1 = new UniformRecordGenerator(numKeys, probeValsPerKey, true); MutableObjectIterator<Record> probe2 = new ConstantsKeyValuePairsIterator(repeatedValue1, 17, 5); MutableObjectIterator<Record> probe3 = new ConstantsKeyValuePairsIterator(repeatedValue2, 23, 5); List<MutableObjectIterator<Record>> probes = new ArrayList<MutableObjectIterator<Record>>(); probes.add(probe1); probes.add(probe2); probes.add(probe3); return new UnionIterator<Record>(probes); } @Test public void testSpillingHashJoinWithMassiveCollisions() throws IOException { // the following two values are known to have a hash-code collision on the initial level. // we use them to make sure one partition grows over-proportionally large final int REPEATED_VALUE_1 = 40559; final int REPEATED_VALUE_2 = 92882; final int REPEATED_VALUE_COUNT_BUILD = 200000; final int REPEATED_VALUE_COUNT_PROBE = 5; final int NUM_KEYS = 1000000; final int BUILD_VALS_PER_KEY = 3; final int PROBE_VALS_PER_KEY = 10; // create a build input that gives 3 million pairs with 3 values sharing the same key, plus 400k pairs with two colliding keys MutableObjectIterator<Record> build1 = new UniformRecordGenerator(NUM_KEYS, BUILD_VALS_PER_KEY, false); MutableObjectIterator<Record> build2 = new ConstantsKeyValuePairsIterator(REPEATED_VALUE_1, 17, REPEATED_VALUE_COUNT_BUILD); MutableObjectIterator<Record> build3 = new ConstantsKeyValuePairsIterator(REPEATED_VALUE_2, 23, REPEATED_VALUE_COUNT_BUILD); List<MutableObjectIterator<Record>> builds = new ArrayList<MutableObjectIterator<Record>>(); builds.add(build1); builds.add(build2); builds.add(build3); MutableObjectIterator<Record> buildInput = new UnionIterator<Record>(builds); // allocate the memory for the HashTable List<MemorySegment> memSegments; try { memSegments = this.memoryManager.allocatePages(MEM_OWNER, 896); } catch (MemoryAllocationException maex) { fail("Memory for the Join could not be provided."); return; } // create the map for validating the results HashMap<Integer, Long> map = new HashMap<Integer, Long>(NUM_KEYS); // ---------------------------------------------------------------------------------------- final ReOpenableMutableHashTable<Record, Record> join = new ReOpenableMutableHashTable<Record, Record>( this.recordBuildSideAccesssor, this.recordProbeSideAccesssor, this.recordBuildSideComparator, this.recordProbeSideComparator, this.pactRecordComparator, memSegments, ioManager); for(int probe = 0; probe < NUM_PROBES; probe++) { // create a probe input that gives 10 million pairs with 10 values sharing a key MutableObjectIterator<Record> probeInput = getProbeInput(NUM_KEYS, PROBE_VALS_PER_KEY, REPEATED_VALUE_1, REPEATED_VALUE_2); if(probe == 0) { join.open(buildInput, probeInput); } else { join.reopenProbe(probeInput); } Record record; final Record recordReuse = new Record(); while (join.nextRecord()) { int numBuildValues = 0; final Record probeRec = join.getCurrentProbeRecord(); int key = probeRec.getField(0, IntValue.class).getValue(); HashBucketIterator<Record, Record> buildSide = join.getBuildSideIterator(); if ((record = buildSide.next(recordReuse)) != null) { numBuildValues = 1; Assert.assertEquals("Probe-side key was different than build-side key.", key, record.getField(0, IntValue.class).getValue()); } else { fail("No build side values found for a probe key."); } while ((record = buildSide.next(record)) != null) { numBuildValues++; Assert.assertEquals("Probe-side key was different than build-side key.", key, record.getField(0, IntValue.class).getValue()); } Long contained = map.get(key); if (contained == null) { contained = new Long(numBuildValues); } else { contained = new Long(contained.longValue() + numBuildValues); } map.put(key, contained); } } join.close(); Assert.assertEquals("Wrong number of keys", NUM_KEYS, map.size()); for (Map.Entry<Integer, Long> entry : map.entrySet()) { long val = entry.getValue(); int key = entry.getKey(); if( key == REPEATED_VALUE_1 || key == REPEATED_VALUE_2) { Assert.assertEquals("Wrong number of values in per-key cross product for key " + key, (PROBE_VALS_PER_KEY + REPEATED_VALUE_COUNT_PROBE) * (BUILD_VALS_PER_KEY + REPEATED_VALUE_COUNT_BUILD) * NUM_PROBES, val); } else { Assert.assertEquals("Wrong number of values in per-key cross product for key " + key, PROBE_VALS_PER_KEY * BUILD_VALS_PER_KEY * NUM_PROBES, val); } } // ---------------------------------------------------------------------------------------- this.memoryManager.release(join.getFreedMemory()); } /* * This test is basically identical to the "testSpillingHashJoinWithMassiveCollisions" test, only that the number * of repeated values (causing bucket collisions) are large enough to make sure that their target partition no longer * fits into memory by itself and needs to be repartitioned in the recursion again. */ @Test public void testSpillingHashJoinWithTwoRecursions() throws IOException { // the following two values are known to have a hash-code collision on the first recursion level. // we use them to make sure one partition grows over-proportionally large final int REPEATED_VALUE_1 = 40559; final int REPEATED_VALUE_2 = 92882; final int REPEATED_VALUE_COUNT_BUILD = 200000; final int REPEATED_VALUE_COUNT_PROBE = 5; final int NUM_KEYS = 1000000; final int BUILD_VALS_PER_KEY = 3; final int PROBE_VALS_PER_KEY = 10; // create a build input that gives 3 million pairs with 3 values sharing the same key, plus 400k pairs with two colliding keys MutableObjectIterator<Record> build1 = new UniformRecordGenerator(NUM_KEYS, BUILD_VALS_PER_KEY, false); MutableObjectIterator<Record> build2 = new ConstantsKeyValuePairsIterator(REPEATED_VALUE_1, 17, REPEATED_VALUE_COUNT_BUILD); MutableObjectIterator<Record> build3 = new ConstantsKeyValuePairsIterator(REPEATED_VALUE_2, 23, REPEATED_VALUE_COUNT_BUILD); List<MutableObjectIterator<Record>> builds = new ArrayList<MutableObjectIterator<Record>>(); builds.add(build1); builds.add(build2); builds.add(build3); MutableObjectIterator<Record> buildInput = new UnionIterator<Record>(builds); // allocate the memory for the HashTable List<MemorySegment> memSegments; try { memSegments = this.memoryManager.allocatePages(MEM_OWNER, 896); } catch (MemoryAllocationException maex) { fail("Memory for the Join could not be provided."); return; } // create the map for validating the results HashMap<Integer, Long> map = new HashMap<Integer, Long>(NUM_KEYS); // ---------------------------------------------------------------------------------------- final ReOpenableMutableHashTable<Record, Record> join = new ReOpenableMutableHashTable<Record, Record>( this.recordBuildSideAccesssor, this.recordProbeSideAccesssor, this.recordBuildSideComparator, this.recordProbeSideComparator, this.pactRecordComparator, memSegments, ioManager); for(int probe = 0; probe < NUM_PROBES; probe++) { // create a probe input that gives 10 million pairs with 10 values sharing a key MutableObjectIterator<Record> probeInput = getProbeInput(NUM_KEYS, PROBE_VALS_PER_KEY, REPEATED_VALUE_1, REPEATED_VALUE_2); if(probe == 0) { join.open(buildInput, probeInput); } else { join.reopenProbe(probeInput); } Record record; final Record recordReuse = new Record(); while (join.nextRecord()) { int numBuildValues = 0; final Record probeRec = join.getCurrentProbeRecord(); int key = probeRec.getField(0, IntValue.class).getValue(); HashBucketIterator<Record, Record> buildSide = join.getBuildSideIterator(); if ((record = buildSide.next(recordReuse)) != null) { numBuildValues = 1; Assert.assertEquals("Probe-side key was different than build-side key.", key, record.getField(0, IntValue.class).getValue()); } else { fail("No build side values found for a probe key."); } while ((record = buildSide.next(recordReuse)) != null) { numBuildValues++; Assert.assertEquals("Probe-side key was different than build-side key.", key, record.getField(0, IntValue.class).getValue()); } Long contained = map.get(key); if (contained == null) { contained = new Long(numBuildValues); } else { contained = new Long(contained.longValue() + numBuildValues); } map.put(key, contained); } } join.close(); Assert.assertEquals("Wrong number of keys", NUM_KEYS, map.size()); for (Map.Entry<Integer, Long> entry : map.entrySet()) { long val = entry.getValue(); int key = entry.getKey(); if( key == REPEATED_VALUE_1 || key == REPEATED_VALUE_2) { Assert.assertEquals("Wrong number of values in per-key cross product for key " + key, (PROBE_VALS_PER_KEY + REPEATED_VALUE_COUNT_PROBE) * (BUILD_VALS_PER_KEY + REPEATED_VALUE_COUNT_BUILD) * NUM_PROBES, val); } else { Assert.assertEquals("Wrong number of values in per-key cross product for key " + key, PROBE_VALS_PER_KEY * BUILD_VALS_PER_KEY * NUM_PROBES, val); } } // ---------------------------------------------------------------------------------------- this.memoryManager.release(join.getFreedMemory()); } static Map<Key, Collection<RecordMatch>> deepCopy(Map<Key, Collection<RecordMatch>> expectedSecondMatchesMap) { Map<Key, Collection<RecordMatch>> copy = new HashMap<Key, Collection<RecordMatch>>(expectedSecondMatchesMap.size()); for(Map.Entry<Key, Collection<RecordMatch>> entry : expectedSecondMatchesMap.entrySet()) { List<RecordMatch> matches = new ArrayList<RecordMatch>(entry.getValue().size()); for(RecordMatch m : entry.getValue()) { matches.add(m); } copy.put(entry.getKey(), matches); } return copy; } }