package edu.washington.escience.myria.operator;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.Map.Entry;
import org.junit.Test;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Multiset;
import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.util.TestEnvVars;
import edu.washington.escience.myria.util.TestUtils;
/**
 * Unit tests for {@link MergeJoin}.
 *
 * <p>Each test feeds two in-memory inputs into a merge join, drains the operator until it reports
 * end-of-stream, and checks the produced tuples. The operator is always closed in a
 * {@code finally} block so that a failing assertion or an exception while draining does not leave
 * it open.
 */
public class MergeJoinTest {

  /**
   * Builds a two-column (LONG, STRING) buffer holding one tuple per index of the given arrays.
   *
   * @param idColumn name of the LONG column (column 0)
   * @param nameColumn name of the STRING column (column 1)
   * @param ids values for column 0, in insertion order
   * @param names values for column 1; must have at least as many entries as {@code ids}
   * @return a buffer containing {@code ids.length} tuples
   */
  private static TupleBatchBuffer longStringBuffer(
      final String idColumn, final String nameColumn, final long[] ids, final String[] names) {
    final Schema schema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE),
            ImmutableList.of(idColumn, nameColumn));
    final TupleBatchBuffer buffer = new TupleBatchBuffer(schema);
    for (int i = 0; i < ids.length; i++) {
      buffer.putLong(0, ids[i]);
      buffer.putString(1, names[i]);
    }
    return buffer;
  }

  /**
   * Opens the join, collects every non-null batch it produces until end-of-stream, and closes it.
   *
   * @param join the operator to run; it is closed before this method returns, even on failure
   * @return the batches in the order they were produced
   * @throws DbException if opening, draining or closing the operator fails
   */
  private static ArrayList<TupleBatch> openAndDrain(final BinaryOperator join)
      throws DbException {
    final ArrayList<TupleBatch> batches = new ArrayList<TupleBatch>();
    join.open(TestEnvVars.get());
    try {
      while (!join.eos()) {
        final TupleBatch batch = join.nextReady();
        // nextReady() may return null while the operator is not yet at end-of-stream.
        if (batch != null) {
          batches.add(batch);
        }
      }
    } finally {
      join.close();
    }
    return batches;
  }

  /**
   * Joins two id-ordered inputs on their first column.
   *
   * <p>Matching ids are 2 (3 left x 2 right), 8 (2 x 2) and 10 (1 x 1), i.e. 11 result tuples.
   */
  @Test
  public void testMergeJoin() throws DbException {
    final TupleBatchBuffer leftTbb =
        longStringBuffer(
            "id",
            "name",
            new long[] {0, 2, 2, 2, 3, 5, 6, 8, 8, 10},
            new String[] {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"});
    final TupleBatchBuffer rightTbb =
        longStringBuffer(
            "id2",
            "name2",
            new long[] {1, 2, 2, 4, 8, 8, 10},
            new String[] {"a", "b", "c", "d", "e", "f", "g"});

    final BinaryOperator join =
        new MergeJoin(
            new BatchTupleSource(leftTbb),
            new BatchTupleSource(rightTbb),
            new int[] {0},
            new int[] {0},
            new boolean[] {true});
    final ArrayList<TupleBatch> batches = openAndDrain(join);

    assertEquals(1, batches.size());
    assertEquals(11, batches.get(0).numTuples());
  }

  /**
   * Joins two inputs whose tuples all share the same key, so the result is the full cross product.
   *
   * <p>With 5 tuples per side, 25 tuples are expected and every left/right value must appear
   * exactly 5 times in the output.
   */
  @Test
  public void testMergeJoinCross() throws DbException {
    final Schema schema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE), ImmutableList.of("id", "value"));
    final TupleBatchBuffer leftTbb = new TupleBatchBuffer(schema);
    final TupleBatchBuffer rightTbb = new TupleBatchBuffer(schema);
    for (int i = 0; i < 5; i++) {
      leftTbb.putLong(0, 42);
      leftTbb.putLong(1, i);
      rightTbb.putLong(0, 42);
      rightTbb.putLong(1, 100 + i);
    }

    // Both inputs use the same schema, so the output columns coming from the second input are
    // given distinct names.
    final ImmutableList.Builder<String> outputNames = ImmutableList.builder();
    outputNames.addAll(schema.getColumnNames());
    outputNames.add("id2");
    outputNames.add("value2");

    final BinaryOperator join =
        new MergeJoin(
            outputNames.build(),
            new BatchTupleSource(leftTbb),
            new BatchTupleSource(rightTbb),
            new int[] {0},
            new int[] {0},
            new boolean[] {true});

    int count = 0;
    final Multiset<Long> left = HashMultiset.create();
    final Multiset<Long> right = HashMultiset.create();
    join.open(TestEnvVars.get());
    try {
      while (!join.eos()) {
        final TupleBatch tb = join.nextReady();
        if (tb == null) {
          continue;
        }
        count += tb.numTuples();
        for (int row = 0; row < tb.numTuples(); row++) {
          // Column 1 is the left "value", column 3 the right "value2".
          left.add(tb.getLong(1, row));
          right.add(tb.getLong(3, row));
        }
      }
    } finally {
      join.close();
    }

    for (long i = 0; i < 5; i++) {
      assertEquals(5, left.count(i));
      assertEquals(5, right.count(i + 100));
    }
    assertEquals(25, count);
  }

  /**
   * Joins two larger generated inputs and checks that the output is non-decreasing in the join
   * key.
   */
  @Test
  public void testMergeJoinLarge() throws DbException {
    // NOTE(review): the arguments are presumably (tuple count, max value, sorted) -- confirm
    // against TestUtils.generateRandomTuples.
    final TupleBatchBuffer leftTbb = TestUtils.generateRandomTuples(12200, 12000, true);
    final TupleBatchBuffer rightTbb = TestUtils.generateRandomTuples(13200, 13000, true);

    // Give the output columns coming from the second input distinct names.
    final ImmutableList.Builder<String> outputNames = ImmutableList.builder();
    outputNames.add("id");
    outputNames.add("name");
    outputNames.add("id2");
    outputNames.add("name2");

    final BinaryOperator join =
        new MergeJoin(
            outputNames.build(),
            new BatchTupleSource(leftTbb),
            new BatchTupleSource(rightTbb),
            new int[] {0},
            new int[] {0},
            new boolean[] {true});

    final ArrayList<Entry<Long, String>> entries = new ArrayList<Entry<Long, String>>();
    join.open(TestEnvVars.get());
    try {
      while (!join.eos()) {
        final TupleBatch tb = join.nextReady();
        if (tb == null) {
          continue;
        }
        for (int row = 0; row < tb.numTuples(); row++) {
          entries.add(new SimpleEntry<Long, String>(tb.getLong(0, row), tb.getString(1, row)));
        }
      }
    } finally {
      join.close();
    }

    // Output should be sorted by join key: each key is at least as large as the one before it.
    Entry<Long, String> previous = null;
    for (final Entry<Long, String> entry : entries) {
      if (previous != null) {
        assertTrue(previous.getKey() <= entry.getKey());
      }
      previous = entry;
    }
  }

  /**
   * Joins on both columns, with the first key flagged {@code true} and the second {@code false}.
   *
   * <p>Pairs equal on both (id, name) are (2, "c") (2 left x 1 right), (8, "a") (2 x 2) and
   * (10, "a") (1 x 1), i.e. 7 result tuples.
   */
  @Test
  public void testMergeJoinOnMultipleKeys() throws DbException {
    final TupleBatchBuffer leftTbb =
        longStringBuffer(
            "id",
            "name",
            new long[] {0, 2, 2, 2, 3, 5, 6, 8, 8, 10},
            new String[] {"c", "c", "c", "b", "b", "b", "b", "a", "a", "a"});
    final TupleBatchBuffer rightTbb =
        longStringBuffer(
            "id2",
            "name2",
            new long[] {1, 2, 2, 4, 8, 8, 10, 11},
            new String[] {"d", "d", "c", "c", "a", "a", "a", "a"});

    final BinaryOperator join =
        new MergeJoin(
            new BatchTupleSource(leftTbb),
            new BatchTupleSource(rightTbb),
            new int[] {0, 1},
            new int[] {0, 1},
            new boolean[] {true, false});
    final ArrayList<TupleBatch> batches = openAndDrain(join);

    assertEquals(1, batches.size());
    assertEquals(7, batches.get(0).numTuples());
  }

  /**
   * Feeds an unsorted left input through {@link InMemoryOrderBy} before joining.
   *
   * <p>Both inputs are ordered on column 0 with the flag {@code false}, and the join is given the
   * same flag. The data matches {@link #testMergeJoin()} up to ordering, so 11 result tuples are
   * expected.
   */
  @Test
  public void testOrderByAndMergeJoin() throws DbException {
    final TupleBatchBuffer leftTbb =
        longStringBuffer(
            "id",
            "name",
            new long[] {2, 3, 5, 6, 8, 8, 10, 0, 2, 2},
            new String[] {"d", "e", "f", "g", "h", "i", "j", "a", "b", "c"});
    final TupleBatchBuffer rightTbb =
        longStringBuffer(
            "id2",
            "name2",
            new long[] {1, 2, 2, 4, 8, 8, 10},
            new String[] {"a", "b", "c", "d", "e", "f", "g"});

    final InMemoryOrderBy sortedLeft =
        new InMemoryOrderBy(new BatchTupleSource(leftTbb), new int[] {0}, new boolean[] {false});
    final InMemoryOrderBy sortedRight =
        new InMemoryOrderBy(new BatchTupleSource(rightTbb), new int[] {0}, new boolean[] {false});

    final BinaryOperator join =
        new MergeJoin(
            sortedLeft, sortedRight, new int[] {0}, new int[] {0}, new boolean[] {false});
    final ArrayList<TupleBatch> batches = openAndDrain(join);

    assertEquals(1, batches.size());
    assertEquals(11, batches.get(0).numTuples());
  }
}