package edu.washington.escience.myria.operator;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.junit.Test;

import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Multiset;

import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.util.TestUtils;
import edu.washington.escience.myria.util.TestEnvVars;

/**
 * Tests for the in-memory operators UnionAll, Merge, OrderedDupElim, KeepAndSortOnMinValue (through
 * StreamingStateWrapper), MergeJoin and InMemoryOrderBy. Every test feeds the operator under test from
 * {@link BatchTupleSource} children and drains it with the {@code eos()} / {@code nextReady()} loop.
 */
public class OperatorTest {

  /** Orders (id, name) entries by id first and by name second, both ascending. */
  public class EntryComparator implements Comparator<Entry<Long, String>> {
    @Override
    public int compare(final Entry<Long, String> o1, final Entry<Long, String> o2) {
      final int byKey = o1.getKey().compareTo(o2.getKey());
      if (byKey != 0) {
        return byKey;
      }
      return o1.getValue().compareTo(o2.getValue());
    }
  }

  /**
   * Builds a buffer of random (id, name) tuples.
   *
   * @param numTuples how many tuples in output
   * @param sampleSize how many different values should be created at random (around numTuples/sampleSize duplicates)
   * @param sorted Generate sorted tuples, sorted by id and then by name (see {@link EntryComparator})
   * @return a buffer with schema (id: LONG, name: STRING) holding the {@code numTuples} generated tuples
   */
  public TupleBatchBuffer generateRandomTuples(
      final int numTuples, final int sampleSize, final boolean sorted) {
    final long[] ids = TestUtils.randomLong(0, sampleSize, numTuples);
    final String[] names = TestUtils.randomFixedLengthNumericString(0, sampleSize, numTuples, 20);

    final ArrayList<Entry<Long, String>> entries = new ArrayList<Entry<Long, String>>();
    for (int i = 0; i < numTuples; i++) {
      entries.add(new SimpleEntry<Long, String>(ids[i], names[i]));
    }
    if (sorted) {
      Collections.sort(entries, new EntryComparator());
    }

    final Schema schema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE), ImmutableList.of("id", "name"));
    final TupleBatchBuffer tbb = new TupleBatchBuffer(schema);
    for (Entry<Long, String> entry : entries) {
      tbb.putLong(0, entry.getKey());
      tbb.putString(1, entry.getValue());
    }
    return tbb;
  }

  /** A UnionAll built with a null child array must accept its children later via setChildren. */
  @Test
  public void testUnionAllConstructorWithNull() throws DbException {
    BatchTupleSource[] children = new BatchTupleSource[1];
    children[0] = new BatchTupleSource(generateRandomTuples(10, 1000, false));
    UnionAll union = new UnionAll(null);
    union.setChildren(children);
  }

  /** UnionAll must emit exactly as many tuples as its children hold in total. */
  @Test
  public void testUnionAllCount() throws DbException {
    BatchTupleSource[] children = new BatchTupleSource[3];
    children[0] = new BatchTupleSource(generateRandomTuples(12300, 5000, false));
    children[1] = new BatchTupleSource(generateRandomTuples(4200, 2000, false));
    children[2] = new BatchTupleSource(generateRandomTuples(19900, 5000, false));
    UnionAll union = new UnionAll(children);

    union.open(TestEnvVars.get());
    int count = 0;
    while (!union.eos()) {
      TupleBatch tb = union.nextReady();
      if (tb != null) {
        count += tb.numTuples();
      }
    }
    union.close();

    assertEquals(12300 + 4200 + 19900, count);
  }

  /** UnionAll must emit every input id with exactly its input multiplicity, and nothing else. */
  @Test
  public void testUnionAllCorrectTuples() throws DbException {
    TupleBatchBuffer[] randomTuples = new TupleBatchBuffer[2];
    randomTuples[0] = generateRandomTuples(12300, 5000, false);
    randomTuples[1] = generateRandomTuples(4200, 2000, false);

    BatchTupleSource[] children = new BatchTupleSource[2];
    children[0] = new BatchTupleSource(randomTuples[0]);
    children[1] = new BatchTupleSource(randomTuples[1]);
    UnionAll union = new UnionAll(children);

    union.open(TestEnvVars.get());
    Multiset<Long> actualCounts = HashMultiset.create();
    while (!union.eos()) {
      TupleBatch tb = union.nextReady();
      if (tb != null) {
        for (int i = 0; i < tb.numTuples(); i++) {
          actualCounts.add(tb.getLong(0, i));
        }
      }
    }
    union.close();

    Multiset<Long> expectedCounts = HashMultiset.create();
    for (TupleBatchBuffer randomTuple : randomTuples) {
      for (TupleBatch tuples : randomTuple.getAll()) {
        for (int j = 0; j < tuples.numTuples(); j++) {
          expectedCounts.add(tuples.getLong(0, j));
        }
      }
    }

    /*
     * Equal totals plus matching per-id counts for every expected id rule out spurious extra output, which the
     * per-id loop alone would not detect.
     */
    assertEquals(expectedCounts.size(), actualCounts.size());
    for (Multiset.Entry<Long> expectedEntry : expectedCounts.entrySet()) {
      assertEquals(expectedEntry.getCount(), actualCounts.count(expectedEntry.getElement()));
    }
  }

  /** A Merge built with all-null arguments must accept children and sort columns through its setters. */
  @Test
  public void testMergeConstructorWithNull() throws DbException {
    BatchTupleSource[] children = new BatchTupleSource[1];
    children[0] = new BatchTupleSource(generateRandomTuples(10, 10, false));
    Merge merge = new Merge(null, null, null);
    merge.setChildren(children);
    merge.setSortedColumns(new int[] {0}, new boolean[] {true});
  }

  /** Merge must emit exactly as many tuples as its sorted children hold in total. */
  @Test
  public void testMergeCount() throws DbException {
    BatchTupleSource[] children = new BatchTupleSource[3];
    children[0] = new BatchTupleSource(generateRandomTuples(12300, 5000, true));
    children[1] = new BatchTupleSource(generateRandomTuples(4200, 2000, true));
    children[2] = new BatchTupleSource(generateRandomTuples(9900, 5000, true));
    NAryOperator merge = new Merge(children, new int[] {0}, new boolean[] {true});

    merge.open(TestEnvVars.get());
    int count = 0;
    while (!merge.eos()) {
      TupleBatch tb = merge.nextReady();
      if (tb != null) {
        count += tb.numTuples();
      }
    }
    merge.close();

    assertEquals(12300 + 4200 + 9900, count);
  }

  /** Merging three sorted inputs on (id, name) must produce all tuples in (id, name) order. */
  @Test
  public void testMergeTuplesSorted() throws DbException {
    TupleBatchBuffer[] randomTuples = new TupleBatchBuffer[3];
    randomTuples[0] = generateRandomTuples(52300, 5000, true);
    randomTuples[1] = generateRandomTuples(14200, 5000, true);
    randomTuples[2] = generateRandomTuples(29900, 5000, true);

    BatchTupleSource[] children = new BatchTupleSource[3];
    children[0] = new BatchTupleSource(randomTuples[0]);
    children[1] = new BatchTupleSource(randomTuples[1]);
    children[2] = new BatchTupleSource(randomTuples[2]);
    NAryOperator merge = new Merge(children, new int[] {0, 1}, new boolean[] {true, true});

    merge.open(TestEnvVars.get());
    final List<Entry<Long, String>> entries = new ArrayList<Entry<Long, String>>();
    while (!merge.eos()) {
      TupleBatch tb = merge.nextReady();
      if (tb != null) {
        for (int i = 0; i < tb.numTuples(); i++) {
          entries.add(new SimpleEntry<Long, String>(tb.getLong(0, i), tb.getString(1, i)));
        }
      }
    }
    merge.close();

    assertEquals(52300 + 14200 + 29900, entries.size());

    Comparator<Entry<Long, String>> comparator = new EntryComparator();
    Entry<Long, String> previous = null;
    for (Entry<Long, String> entry : entries) {
      if (previous != null) {
        assertTrue(comparator.compare(previous, entry) <= 0);
      }
      previous = entry;
    }
  }

  /** OrderedDupElim over sorted input must emit one tuple per distinct (id, name) pair. */
  @Test
  public void testOrderedDupElim() throws DbException {
    TupleBatchBuffer randomTuples = generateRandomTuples(52300, 5000, true);
    BatchTupleSource child = new BatchTupleSource(randomTuples);
    OrderedDupElim dupElim = new OrderedDupElim(child);

    /* Count the dupelim */
    int count = 0;
    dupElim.open(TestEnvVars.get());
    while (!dupElim.eos()) {
      TupleBatch tb = dupElim.nextReady();
      if (tb != null) {
        count += tb.numTuples();
      }
    }
    dupElim.close();

    /* Count the real answer */
    Map<Long, Set<String>> namesById = new HashMap<Long, Set<String>>();
    for (TupleBatch tuples : randomTuples.getAll()) {
      for (int i = 0; i < tuples.numTuples(); ++i) {
        final long id = tuples.getLong(0, i);
        Set<String> names = namesById.get(id);
        if (names == null) {
          names = new HashSet<String>();
          namesById.put(id, names);
        }
        names.add(tuples.getString(1, i));
      }
    }
    int realCount = 0;
    for (Set<String> names : namesById.values()) {
      realCount += names.size();
    }

    /* JUnit's contract is (expected, actual); the reference count is the expected value. */
    assertEquals(realCount, count);
  }

  /**
   * Feeds three random values per id into KeepAndSortOnMinValue and checks that the exported state is always in
   * non-decreasing order of the value column and finally holds exactly one tuple per id carrying that id's minimum.
   */
  @Test
  public void testKeepAndSortedOnMinValue() throws DbException {
    final int numIds = 52345;
    final int maxValue = 10000;
    final Schema schema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE), ImmutableList.of("id", "value"));
    TupleBatchBuffer input = new TupleBatchBuffer(schema);

    long[] values1 = TestUtils.randomLong(0, maxValue, numIds);
    long[] values2 = TestUtils.randomLong(0, maxValue, numIds);
    long[] values3 = TestUtils.randomLong(0, maxValue, numIds);
    long[] minValue = new long[numIds];
    for (int id = 0; id < numIds; ++id) {
      input.putLong(0, id);
      input.putLong(1, values1[id]);
      input.putLong(0, id);
      input.putLong(1, values2[id]);
      input.putLong(0, id);
      input.putLong(1, values3[id]);
      minValue[id] = Math.min(values1[id], Math.min(values2[id], values3[id]));
    }

    BatchTupleSource scan = new BatchTupleSource(input);
    StreamingStateWrapper keepmin =
        new StreamingStateWrapper(scan, new KeepAndSortOnMinValue(new int[] {0}, new int[] {1}));
    keepmin.open(TestEnvVars.get());

    /* The state must stay sorted on the value column after every processing step. */
    while (!keepmin.eos()) {
      keepmin.nextReady();
      long lastValue = -1;
      List<TupleBatch> intermediate = keepmin.getStreamingState().exportState();
      for (TupleBatch tb : intermediate) {
        for (int i = 0; i < tb.numTuples(); i++) {
          long value = tb.getLong(1, i);
          assertTrue(lastValue <= value);
          lastValue = value;
        }
      }
    }

    /* The final state must be sorted and hold the per-id minimum for every id. */
    long lastValue = -1;
    int total = 0;
    List<TupleBatch> result = keepmin.getStreamingState().exportState();
    for (TupleBatch tb : result) {
      for (int i = 0; i < tb.numTuples(); i++) {
        long value = tb.getLong(1, i);
        assertTrue(lastValue <= value);
        lastValue = value;
        assertEquals(minValue[(int) tb.getLong(0, i)], value);
      }
      total += tb.numTuples();
    }
    keepmin.close();

    assertEquals(numIds, total);
  }

  /** MergeJoin on a single ascending key over small hand-written, already sorted inputs. */
  @Test
  public void testMergeJoin() throws DbException {
    final Schema leftSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE), ImmutableList.of("id", "name"));
    TupleBatchBuffer leftTbb = new TupleBatchBuffer(leftSchema);
    {
      long[] ids = new long[] {0, 2, 2, 2, 3, 5, 6, 8, 8, 10};
      String[] names = new String[] {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"};
      for (int i = 0; i < ids.length; i++) {
        leftTbb.putLong(0, ids[i]);
        leftTbb.putString(1, names[i]);
      }
    }

    final Schema rightSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE), ImmutableList.of("id2", "name2"));
    TupleBatchBuffer rightTbb = new TupleBatchBuffer(rightSchema);
    {
      long[] ids = new long[] {1, 2, 2, 4, 8, 8, 10};
      String[] names = new String[] {"a", "b", "c", "d", "e", "f", "g"};
      for (int i = 0; i < ids.length; i++) {
        rightTbb.putLong(0, ids[i]);
        rightTbb.putString(1, names[i]);
      }
    }

    BatchTupleSource[] children = new BatchTupleSource[2];
    children[0] = new BatchTupleSource(leftTbb);
    children[1] = new BatchTupleSource(rightTbb);
    BinaryOperator join =
        new MergeJoin(
            children[0], children[1], new int[] {0}, new int[] {0}, new boolean[] {true});

    join.open(TestEnvVars.get());
    final List<TupleBatch> batches = new ArrayList<TupleBatch>();
    while (!join.eos()) {
      TupleBatch tb = join.nextReady();
      if (tb != null) {
        batches.add(tb);
      }
    }
    join.close();

    assertEquals(1, batches.size());
    /* id 2: 3 x 2 matches, id 8: 2 x 2 matches, id 10: 1 x 1 match. */
    assertEquals(11, batches.get(0).numTuples());
  }

  /** MergeJoin over large sorted random inputs must emit its output in join-key order. */
  @Test
  public void testMergeJoinLarge() throws DbException {
    TupleBatchBuffer[] randomTuples = new TupleBatchBuffer[2];
    randomTuples[0] = generateRandomTuples(12200, 12000, true);
    randomTuples[1] = generateRandomTuples(13200, 13000, true);

    // we need to rename the columns from the second tuples
    ImmutableList.Builder<String> sb = ImmutableList.builder();
    sb.add("id").add("name");
    sb.add("id2").add("name2");

    BatchTupleSource[] children = new BatchTupleSource[2];
    children[0] = new BatchTupleSource(randomTuples[0]);
    children[1] = new BatchTupleSource(randomTuples[1]);
    BinaryOperator join =
        new MergeJoin(
            sb.build(),
            children[0],
            children[1],
            new int[] {0},
            new int[] {0},
            new boolean[] {true});

    join.open(TestEnvVars.get());
    final List<Entry<Long, String>> entries = new ArrayList<Entry<Long, String>>();
    while (!join.eos()) {
      TupleBatch tb = join.nextReady();
      if (tb != null) {
        for (int i = 0; i < tb.numTuples(); i++) {
          entries.add(new SimpleEntry<Long, String>(tb.getLong(0, i), tb.getString(1, i)));
        }
      }
    }
    /* Close before asserting so that a failed assertion does not leave the operator open. */
    join.close();

    // output should be sorted by join keys
    Entry<Long, String> previous = null;
    for (Entry<Long, String> entry : entries) {
      if (previous != null) {
        assertTrue(previous.getKey() <= entry.getKey());
      }
      previous = entry;
    }
  }

  /** When all tuples on both sides share one key, MergeJoin must produce the full 5 x 5 cross product. */
  @Test
  public void testMergeJoinCross() throws DbException {
    final Schema schema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE), ImmutableList.of("id", "value"));

    TupleBatchBuffer[] randomTuples = new TupleBatchBuffer[2];
    randomTuples[0] = new TupleBatchBuffer(schema);
    randomTuples[1] = new TupleBatchBuffer(schema);
    for (int i = 0; i < 5; i++) {
      randomTuples[0].putLong(0, 42);
      randomTuples[0].putLong(1, i);
      randomTuples[1].putLong(0, 42);
      randomTuples[1].putLong(1, 100 + i);
    }

    // we need to rename the columns from the second tuples
    ImmutableList.Builder<String> sb = ImmutableList.builder();
    sb.addAll(schema.getColumnNames());
    sb.add("id2");
    sb.add("value2");

    BatchTupleSource[] children = new BatchTupleSource[2];
    children[0] = new BatchTupleSource(randomTuples[0]);
    children[1] = new BatchTupleSource(randomTuples[1]);
    BinaryOperator join =
        new MergeJoin(
            sb.build(),
            children[0],
            children[1],
            new int[] {0},
            new int[] {0},
            new boolean[] {true});

    join.open(TestEnvVars.get());
    int count = 0;
    Multiset<Long> left = HashMultiset.create();
    Multiset<Long> right = HashMultiset.create();
    while (!join.eos()) {
      TupleBatch tb = join.nextReady();
      if (tb != null) {
        count += tb.numTuples();
        for (int i = 0; i < tb.numTuples(); i++) {
          left.add(tb.getLong(1, i));
          right.add(tb.getLong(3, i));
        }
      }
    }
    /* Close before asserting so that a failed assertion does not leave the operator open. */
    join.close();

    /* Every left value and every right value must take part in exactly five output tuples. */
    for (long i = 0; i < 5; i++) {
      assertEquals(5, left.count(i));
      assertEquals(5, right.count(i + 100));
    }
    assertEquals(25, count);
  }

  /** InMemoryOrderBy on (id, name) ascending must return all tuples in (id, name) order. */
  @Test
  public void testInMemoryOrderBy() throws DbException {
    TupleBatchBuffer randomTuples = generateRandomTuples(52300, 5000, false);
    BatchTupleSource child = new BatchTupleSource(randomTuples);
    InMemoryOrderBy order =
        new InMemoryOrderBy(child, new int[] {0, 1}, new boolean[] {true, true});

    order.open(TestEnvVars.get());
    final List<Entry<Long, String>> entries = new ArrayList<Entry<Long, String>>();
    while (!order.eos()) {
      TupleBatch tb = order.nextReady();
      if (tb != null) {
        for (int i = 0; i < tb.numTuples(); i++) {
          entries.add(new SimpleEntry<Long, String>(tb.getLong(0, i), tb.getString(1, i)));
        }
      }
    }
    order.close();

    assertEquals(52300, entries.size());

    Comparator<Entry<Long, String>> comparator = new EntryComparator();
    Entry<Long, String> previous = null;
    for (Entry<Long, String> entry : entries) {
      if (previous != null) {
        assertTrue(comparator.compare(previous, entry) <= 0);
      }
      previous = entry;
    }
  }

  /** InMemoryOrderBy on the name column only, descending, must return all tuples in descending name order. */
  @Test
  public void testInMemoryOrderBy2() throws DbException {
    // we had a bug where ordering by certain subsets of the columns caused index out of bound exceptions. in other
    // cases only the results were wrong.
    TupleBatchBuffer randomTuples = generateRandomTuples(52300, 5000, false);
    BatchTupleSource child = new BatchTupleSource(randomTuples);
    InMemoryOrderBy order = new InMemoryOrderBy(child, new int[] {1}, new boolean[] {false});

    order.open(TestEnvVars.get());
    final List<String> entries = new ArrayList<String>();
    while (!order.eos()) {
      TupleBatch tb = order.nextReady();
      if (tb != null) {
        for (int i = 0; i < tb.numTuples(); i++) {
          entries.add(tb.getString(1, i));
        }
      }
    }
    order.close();

    assertEquals(52300, entries.size());

    String previous = null;
    for (String entry : entries) {
      if (previous != null) {
        assertTrue(previous.compareTo(entry) >= 0);
      }
      previous = entry;
    }
  }

  /** Sorts two unsorted inputs descending on id with InMemoryOrderBy and merge-joins the results on id. */
  @Test
  public void testOrderByAndMergeJoin() throws DbException {
    final Schema leftSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE), ImmutableList.of("id", "name"));
    TupleBatchBuffer leftTbb = new TupleBatchBuffer(leftSchema);
    {
      long[] ids = new long[] {2, 3, 5, 6, 8, 8, 10, 0, 2, 2};
      String[] names = new String[] {"d", "e", "f", "g", "h", "i", "j", "a", "b", "c"};
      for (int i = 0; i < ids.length; i++) {
        leftTbb.putLong(0, ids[i]);
        leftTbb.putString(1, names[i]);
      }
    }

    final Schema rightSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE), ImmutableList.of("id2", "name2"));
    TupleBatchBuffer rightTbb = new TupleBatchBuffer(rightSchema);
    {
      long[] ids = new long[] {1, 2, 2, 4, 8, 8, 10};
      String[] names = new String[] {"a", "b", "c", "d", "e", "f", "g"};
      for (int i = 0; i < ids.length; i++) {
        rightTbb.putLong(0, ids[i]);
        rightTbb.putString(1, names[i]);
      }
    }

    BatchTupleSource[] children = new BatchTupleSource[2];
    children[0] = new BatchTupleSource(leftTbb);
    children[1] = new BatchTupleSource(rightTbb);

    InMemoryOrderBy sort0 =
        new InMemoryOrderBy(children[0], new int[] {0}, new boolean[] {false});
    InMemoryOrderBy sort1 =
        new InMemoryOrderBy(children[1], new int[] {0}, new boolean[] {false});
    BinaryOperator join =
        new MergeJoin(sort0, sort1, new int[] {0}, new int[] {0}, new boolean[] {false});

    join.open(TestEnvVars.get());
    final List<TupleBatch> batches = new ArrayList<TupleBatch>();
    while (!join.eos()) {
      TupleBatch tb = join.nextReady();
      if (tb != null) {
        batches.add(tb);
      }
    }
    join.close();

    assertEquals(1, batches.size());
    /* id 2: 3 x 2 matches, id 8: 2 x 2 matches, id 10: 1 x 1 match. */
    assertEquals(11, batches.get(0).numTuples());
  }

  /** MergeJoin on the compound key (id ascending, name descending). */
  @Test
  public void testMergeJoinOnMultipleKeys() throws DbException {
    final Schema leftSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE), ImmutableList.of("id", "name"));
    TupleBatchBuffer leftTbb = new TupleBatchBuffer(leftSchema);
    {
      long[] ids = new long[] {0, 2, 2, 2, 3, 5, 6, 8, 8, 10};
      String[] names = new String[] {"c", "c", "c", "b", "b", "b", "b", "a", "a", "a"};
      for (int i = 0; i < ids.length; i++) {
        leftTbb.putLong(0, ids[i]);
        leftTbb.putString(1, names[i]);
      }
    }

    final Schema rightSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE), ImmutableList.of("id2", "name2"));
    TupleBatchBuffer rightTbb = new TupleBatchBuffer(rightSchema);
    {
      long[] ids = new long[] {1, 2, 2, 4, 8, 8, 10, 11};
      String[] names = new String[] {"d", "d", "c", "c", "a", "a", "a", "a"};
      for (int i = 0; i < ids.length; i++) {
        rightTbb.putLong(0, ids[i]);
        rightTbb.putString(1, names[i]);
      }
    }

    BatchTupleSource[] children = new BatchTupleSource[2];
    children[0] = new BatchTupleSource(leftTbb);
    children[1] = new BatchTupleSource(rightTbb);
    BinaryOperator join =
        new MergeJoin(
            children[0],
            children[1],
            new int[] {0, 1},
            new int[] {0, 1},
            new boolean[] {true, false});

    join.open(TestEnvVars.get());
    final List<TupleBatch> batches = new ArrayList<TupleBatch>();
    while (!join.eos()) {
      TupleBatch tb = join.nextReady();
      if (tb != null) {
        batches.add(tb);
      }
    }
    join.close();

    assertEquals(1, batches.size());
    /* (2, "c"): 2 x 1 matches, (8, "a"): 2 x 2 matches, (10, "a"): 1 x 1 match. */
    assertEquals(7, batches.get(0).numTuples());
  }
}