// LeapFrogJoinTest.java -- example from the myria-master source tree.
package edu.washington.escience.myria.operator;

import static org.junit.Assert.assertEquals;

import java.nio.file.Paths;

import org.junit.Test;

import com.google.common.collect.ImmutableList;

import edu.washington.escience.myria.CsvTupleReader;
import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.io.FileSource;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.TestEnvVars;

/**
 * Tests for {@link LeapFrogJoin}.
 *
 * <p>Every test builds the join's children, constructs the join, drains it, and checks the number
 * of output tuples. The conventions below are read off the schemas and the Datalog-style query
 * comments in the individual tests:
 *
 * <ul>
 *   <li>{@code fieldMap} holds one entry per join variable; each entry lists the {@code {child
 *       index, column index}} pairs that are equated for that variable.
 *   <li>{@code outputMap} lists the {@code {child index, column index}} pairs that form the output
 *       columns, in order.
 *   <li>The trailing {@code boolean[]} (or {@code null}) is handed to the {@link LeapFrogJoin}
 *       constructor unchanged; its meaning is defined by that class.
 * </ul>
 */
public class LeapFrogJoinTest {
  /** Root directory, relative to the working directory, that holds the CSV fixtures. */
  private static final String TEST_DATA_DIR = "testdata";

  /**
   * Builds a schema made of two {@code LONG_TYPE} columns.
   *
   * @param firstName name of column 0
   * @param secondName name of column 1
   * @return the two-column schema
   */
  private static Schema twoLongColumns(final String firstName, final String secondName) {
    return new Schema(
        ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE), ImmutableList.of(firstName, secondName));
  }

  /**
   * Creates a source that reads a CSV fixture located under {@link #TEST_DATA_DIR}.
   *
   * @param schema schema handed to the {@link CsvTupleReader}
   * @param relativePath path components below {@link #TEST_DATA_DIR}, e.g. {@code "multiwayjoin",
   *     "R.csv"}
   * @return a tuple source over the file
   * @throws DbException if constructing the source fails
   */
  private static TupleSource csvSource(final Schema schema, final String... relativePath)
      throws DbException {
    final String filename = Paths.get(TEST_DATA_DIR, relativePath).toString();
    return new TupleSource(new CsvTupleReader(schema), new FileSource(filename));
  }

  /**
   * Opens the join, pulls batches until it reports end of stream, closes it, and returns everything
   * that was produced.
   *
   * @param join the join to execute; it must not have been opened yet
   * @param outputSchema schema of the buffer that collects the join output
   * @return a buffer holding every tuple the join emitted
   * @throws DbException if opening, reading from, or closing the join fails
   */
  private static TupleBatchBuffer drain(final LeapFrogJoin join, final Schema outputSchema)
      throws DbException {
    join.open(TestEnvVars.get());
    final TupleBatchBuffer collected = new TupleBatchBuffer(outputSchema);
    while (!join.eos()) {
      final TupleBatch ready = join.nextReady();
      // nextReady() may hand back null while the join is not yet at end of stream; just poll again.
      if (ready != null) {
        collected.appendTB(ready);
      }
    }
    join.close();
    return collected;
  }

  @Test
  public void testLeapFrogJoinOnMultipleTBInBuffer() throws DbException {
    /* Both inputs share one schema and every tuple carries the same key, so the join on column 0
     * yields the full cross product. */
    final Schema schema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE), ImmutableList.of("id2", "name2"));
    final int leftTupleCount = 2;
    /* One tuple more than the reported batch size; per the test name the intent is for the buffer
     * to hold multiple tuple batches.
     * NOTE(review): the size is queried for DOUBLE_TYPE although the schema is LONG/STRING --
     * confirm this is intentional. */
    final int rightTupleCount = TupleUtils.getBatchSize(Type.DOUBLE_TYPE) + 1;

    final TupleBatchBuffer leftTbb = new TupleBatchBuffer(schema);
    for (int i = 0; i < leftTupleCount; ++i) {
      leftTbb.putLong(0, 0);
      leftTbb.putString(1, "hello world");
    }
    final TupleBatchBuffer rightTbb = new TupleBatchBuffer(schema);
    for (int i = 0; i < rightTupleCount; ++i) {
      rightTbb.putLong(0, 0);
      rightTbb.putString(1, "hello world");
    }

    final BatchTupleSource[] children =
        new BatchTupleSource[] {new BatchTupleSource(leftTbb), new BatchTupleSource(rightTbb)};

    final ImmutableList<String> outputColumnNames =
        ImmutableList.of("id1", "name1", "id2", "name2");
    final Schema outputSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.STRING_TYPE, Type.LONG_TYPE, Type.STRING_TYPE),
            outputColumnNames);

    /* Single join variable: column 0 of the left child equals column 0 of the right child. */
    final int[][][] fieldMap = new int[][][] {{{0, 0}, {1, 0}}};
    final int[][] outputMap = new int[][] {{0, 0}, {0, 1}, {1, 0}, {1, 1}};
    final LeapFrogJoin join =
        new LeapFrogJoin(children, fieldMap, outputMap, outputColumnNames, null);

    assertEquals(leftTupleCount * rightTupleCount, drain(join, outputSchema).numTuples());
  }

  @Test
  public void testLeapFrogUsingTwitterTriangularJoin() throws DbException {
    /* Query: Triangle(x,y,z) :- R(x,y),S(y,z),T(z,x); all three relations read the same file. */
    final Schema rSchema = twoLongColumns("r_x", "r_y");
    final Schema sSchema = twoLongColumns("s_y", "s_z");
    final Schema tSchema = twoLongColumns("t_z", "t_x");
    /* read data from files. */
    final TupleSource dataInputR = csvSource(rSchema, "twitter", "TwitterK.csv");
    final TupleSource dataInputS = csvSource(sSchema, "twitter", "TwitterK.csv");
    final TupleSource dataInputT = csvSource(tSchema, "twitter", "TwitterK.csv");
    /* order the tables; T is ordered by column 1 (t_x) first, then column 0 (t_z). */
    final InMemoryOrderBy orderR =
        new InMemoryOrderBy(dataInputR, new int[] {0, 1}, new boolean[] {true, true});
    final InMemoryOrderBy orderS =
        new InMemoryOrderBy(dataInputS, new int[] {0, 1}, new boolean[] {true, true});
    final InMemoryOrderBy orderT =
        new InMemoryOrderBy(dataInputT, new int[] {1, 0}, new boolean[] {true, true});
    /* leapfrog join. */
    final int[][][] fieldMap =
        new int[][][] {{{0, 0}, {2, 1}}, {{0, 1}, {1, 0}}, {{1, 1}, {2, 0}}};
    final int[][] outputMap = new int[][] {{0, 0}, {0, 1}, {1, 1}};
    final ImmutableList<String> outputColumnNames = ImmutableList.of("x", "y", "z");
    final Schema outputSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE), outputColumnNames);
    final LeapFrogJoin join =
        new LeapFrogJoin(
            new Operator[] {orderR, orderS, orderT},
            fieldMap,
            outputMap,
            outputColumnNames,
            new boolean[] {true, true, true});

    assertEquals(16826, drain(join, outputSchema).numTuples());
  }

  @Test
  public void binaryJoinWithMoreColumns() throws DbException {
    /* Query: Result(x,y) :- R(x,y),T(x,z,y). */
    final Schema rSchema = twoLongColumns("r_x", "r_y");
    final Schema tSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE),
            ImmutableList.of("t_x", "t_z", "t_y"));
    /* read data from files. */
    final TupleSource dataInputR = csvSource(rSchema, "multiwayjoin", "R.csv");
    final TupleSource dataInputT = csvSource(tSchema, "multiwayjoin", "T.csv");
    /* order the tables.
     * NOTE(review): T is ordered by columns {0, 1} although the join uses columns 0 and 2 --
     * confirm this is intentional. */
    final InMemoryOrderBy orderR =
        new InMemoryOrderBy(dataInputR, new int[] {0, 1}, new boolean[] {true, true});
    final InMemoryOrderBy orderT =
        new InMemoryOrderBy(dataInputT, new int[] {0, 1}, new boolean[] {true, true});
    /* leapfrog join. */
    final int[][][] fieldMap = new int[][][] {{{0, 0}, {1, 0}}, {{0, 1}, {1, 2}}};
    final int[][] outputMap = new int[][] {{0, 0}, {0, 1}};
    final ImmutableList<String> outputColumnNames = ImmutableList.of("x", "y");
    final Schema outputSchema =
        new Schema(ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE), outputColumnNames);
    final LeapFrogJoin join =
        new LeapFrogJoin(
            new Operator[] {orderR, orderT},
            fieldMap,
            outputMap,
            outputColumnNames,
            new boolean[] {false, false});

    assertEquals(0, drain(join, outputSchema).numTuples());
  }

  @Test
  public void strangeTriangle() throws DbException {
    /* Query: Result(x,y,z) :- R(x,y),S(y,z),T(x,y,z). */
    final Schema rSchema = twoLongColumns("r_x", "r_y");
    final Schema sSchema = twoLongColumns("s_y", "s_z");
    final Schema tSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE),
            ImmutableList.of("t_x", "t_y", "t_z"));
    /* read data from files. */
    final TupleSource dataInputR = csvSource(rSchema, "multiwayjoin", "R.csv");
    final TupleSource dataInputS = csvSource(sSchema, "multiwayjoin", "S.csv");
    final TupleSource dataInputT = csvSource(tSchema, "multiwayjoin", "T.csv");
    /* order the tables. */
    final InMemoryOrderBy orderR =
        new InMemoryOrderBy(dataInputR, new int[] {0, 1}, new boolean[] {true, true});
    final InMemoryOrderBy orderS =
        new InMemoryOrderBy(dataInputS, new int[] {0, 1}, new boolean[] {true, true});
    final InMemoryOrderBy orderT =
        new InMemoryOrderBy(dataInputT, new int[] {0, 1, 2}, new boolean[] {true, true, true});
    /* leapfrog join. */
    final int[][][] fieldMap =
        new int[][][] {{{0, 0}, {2, 0}}, {{0, 1}, {1, 0}, {2, 1}}, {{1, 1}, {2, 2}}};
    final int[][] outputMap = new int[][] {{0, 0}, {0, 1}, {1, 1}};
    final ImmutableList<String> outputColumnNames = ImmutableList.of("x", "y", "z");
    final Schema outputSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE), outputColumnNames);
    final LeapFrogJoin join =
        new LeapFrogJoin(
            new Operator[] {orderR, orderS, orderT},
            fieldMap,
            outputMap,
            outputColumnNames,
            new boolean[] {true, true, false});

    assertEquals(8, drain(join, outputSchema).numTuples());
  }

  @Test
  public void strangeRectangle() throws DbException {
    /* Rectangle(x,y,z,p) :- R(x,y),S(y,z),T(z,p),K(p,x),M(x,y,z). */
    final Schema rSchema = twoLongColumns("r_x", "r_y");
    final Schema sSchema = twoLongColumns("s_y", "s_z");
    final Schema tSchema = twoLongColumns("t_z", "t_p");
    final Schema kSchema = twoLongColumns("k_p", "k_x");
    final Schema mSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE),
            ImmutableList.of("m_x", "m_y", "m_z"));
    /* read data from files; R, S, T and K all read rectangles.csv, M reads rec_2_hop.csv. */
    final TupleSource dataInputR = csvSource(rSchema, "multiwayjoin", "rectangles.csv");
    final TupleSource dataInputS = csvSource(sSchema, "multiwayjoin", "rectangles.csv");
    final TupleSource dataInputT = csvSource(tSchema, "multiwayjoin", "rectangles.csv");
    final TupleSource dataInputK = csvSource(kSchema, "multiwayjoin", "rectangles.csv");
    final TupleSource dataInputM = csvSource(mSchema, "multiwayjoin", "rec_2_hop.csv");
    /* order the tables; K is ordered by column 1 (k_x) first, then column 0 (k_p). */
    final InMemoryOrderBy orderR =
        new InMemoryOrderBy(dataInputR, new int[] {0, 1}, new boolean[] {true, true});
    final InMemoryOrderBy orderS =
        new InMemoryOrderBy(dataInputS, new int[] {0, 1}, new boolean[] {true, true});
    final InMemoryOrderBy orderT =
        new InMemoryOrderBy(dataInputT, new int[] {0, 1}, new boolean[] {true, true});
    final InMemoryOrderBy orderK =
        new InMemoryOrderBy(dataInputK, new int[] {1, 0}, new boolean[] {true, true});
    final InMemoryOrderBy orderM =
        new InMemoryOrderBy(dataInputM, new int[] {0, 1, 2}, new boolean[] {true, true, true});

    /* leapfrog join; one fieldMap row per variable, in the order x, y, z, p. */
    final int[][][] fieldMap =
        new int[][][] {
          {{0, 0}, {3, 1}, {4, 0}},
          {{0, 1}, {1, 0}, {4, 1}},
          {{1, 1}, {2, 0}, {4, 2}},
          {{2, 1}, {3, 0}}
        };
    final int[][] outputMap = new int[][] {{0, 0}, {0, 1}, {1, 1}, {2, 1}};
    final ImmutableList<String> outputColumnNames = ImmutableList.of("x", "y", "z", "p");
    final Schema outputSchema =
        new Schema(
            ImmutableList.of(Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE, Type.LONG_TYPE),
            outputColumnNames);
    final LeapFrogJoin join =
        new LeapFrogJoin(
            new Operator[] {orderR, orderS, orderT, orderK, orderM},
            fieldMap,
            outputMap,
            outputColumnNames,
            new boolean[] {false, false, false, false, true});

    assertEquals(4, drain(join, outputSchema).numTuples());
  }

  @Test
  public void tridentQuery() throws DbException {
    /* Query: Result(x,y) :- R(x),S(x),T(y,x). */
    final Schema rSchema =
        new Schema(ImmutableList.of(Type.INT_TYPE), ImmutableList.of("film_id_a"));
    final Schema sSchema =
        new Schema(ImmutableList.of(Type.INT_TYPE), ImmutableList.of("film_id_b"));
    final Schema tSchema =
        new Schema(
            ImmutableList.of(Type.INT_TYPE, Type.INT_TYPE),
            ImmutableList.of("film_id", "perform_id"));
    /* read data from files. */
    final TupleSource dataInputR = csvSource(rSchema, "multiwayjoin", "fa.csv");
    final TupleSource dataInputS = csvSource(sSchema, "multiwayjoin", "fb.csv");
    final TupleSource dataInputT = csvSource(tSchema, "multiwayjoin", "pf.csv");
    /* order the tables; T is ordered on column 1, the column that takes part in the join. */
    final InMemoryOrderBy orderR =
        new InMemoryOrderBy(dataInputR, new int[] {0}, new boolean[] {true});
    final InMemoryOrderBy orderS =
        new InMemoryOrderBy(dataInputS, new int[] {0}, new boolean[] {true});
    final InMemoryOrderBy orderT =
        new InMemoryOrderBy(dataInputT, new int[] {1}, new boolean[] {true});
    /* leapfrog join. */
    final int[][][] fieldMap = new int[][][] {{{0, 0}, {1, 0}, {2, 1}}};
    final int[][] outputMap = new int[][] {{0, 0}, {2, 0}};
    final ImmutableList<String> outputColumnNames = ImmutableList.of("x", "y");
    final Schema outputSchema =
        new Schema(ImmutableList.of(Type.INT_TYPE, Type.INT_TYPE), outputColumnNames);
    final LeapFrogJoin join =
        new LeapFrogJoin(
            new Operator[] {orderR, orderS, orderT},
            fieldMap,
            outputMap,
            outputColumnNames,
            new boolean[] {false, false, false});

    assertEquals(2, drain(join, outputSchema).numTuples());
  }

  @Test
  public void intersect3tables() throws DbException {
    /* I(x) :- A(x),B(x),C(x). */
    final Schema aSchema = new Schema(ImmutableList.of(Type.INT_TYPE), ImmutableList.of("a_x"));
    final Schema bSchema = new Schema(ImmutableList.of(Type.INT_TYPE), ImmutableList.of("b_x"));
    final Schema cSchema = new Schema(ImmutableList.of(Type.INT_TYPE), ImmutableList.of("c_x"));
    /* read data from files. */
    final TupleSource dataInputA = csvSource(aSchema, "multiwayjoin", "a.csv");
    final TupleSource dataInputB = csvSource(bSchema, "multiwayjoin", "b.csv");
    final TupleSource dataInputC = csvSource(cSchema, "multiwayjoin", "c.csv");
    /* order the tables. */
    final InMemoryOrderBy orderA =
        new InMemoryOrderBy(dataInputA, new int[] {0}, new boolean[] {true});
    final InMemoryOrderBy orderB =
        new InMemoryOrderBy(dataInputB, new int[] {0}, new boolean[] {true});
    final InMemoryOrderBy orderC =
        new InMemoryOrderBy(dataInputC, new int[] {0}, new boolean[] {true});
    /* leapfrog join. */
    final int[][][] fieldMap = new int[][][] {{{0, 0}, {1, 0}, {2, 0}}};
    final int[][] outputMap = new int[][] {{0, 0}};
    final ImmutableList<String> outputColumnNames = ImmutableList.of("x");
    final Schema outputSchema = new Schema(ImmutableList.of(Type.INT_TYPE), outputColumnNames);
    final LeapFrogJoin join =
        new LeapFrogJoin(
            new Operator[] {orderA, orderB, orderC},
            fieldMap,
            outputMap,
            outputColumnNames,
            new boolean[] {false, false, true});

    assertEquals(2, drain(join, outputSchema).numTuples());
  }

  @Test
  public void outputFreeVariable() throws DbException {
    /* Result(k) :- o(k,x), p(x,y), q(y,z); the only output column, k, is not a join variable. */
    final Schema oSchema = twoLongColumns("o_k", "o_x");
    final Schema pSchema = twoLongColumns("p_x", "p_y");
    final Schema qSchema = twoLongColumns("q_y", "q_z");
    /* read data from files. */
    final TupleSource dataInputO = csvSource(oSchema, "multiwayjoin", "o.csv");
    final TupleSource dataInputP = csvSource(pSchema, "multiwayjoin", "p.csv");
    final TupleSource dataInputQ = csvSource(qSchema, "multiwayjoin", "q.csv");
    /* order the tables; o is ordered on column 1 (o_x), the column that takes part in the join. */
    final InMemoryOrderBy orderO =
        new InMemoryOrderBy(dataInputO, new int[] {1}, new boolean[] {true});
    final InMemoryOrderBy orderP =
        new InMemoryOrderBy(dataInputP, new int[] {0, 1}, new boolean[] {true, true});
    final InMemoryOrderBy orderQ =
        new InMemoryOrderBy(dataInputQ, new int[] {0}, new boolean[] {true});
    /* leapfrog join. */
    final int[][][] fieldMap = new int[][][] {{{0, 1}, {1, 0}}, {{1, 1}, {2, 0}}};
    final int[][] outputMap = new int[][] {{0, 0}};
    final ImmutableList<String> outputColumnNames = ImmutableList.of("k");
    final Schema outputSchema = new Schema(ImmutableList.of(Type.LONG_TYPE), outputColumnNames);
    final LeapFrogJoin join =
        new LeapFrogJoin(
            new Operator[] {orderO, orderP, orderQ},
            fieldMap,
            outputMap,
            outputColumnNames,
            new boolean[] {false, false, false});

    assertEquals(9, drain(join, outputSchema).numTuples());
  }
}