package org.apache.pig.spark;
import static org.apache.pig.builtin.mock.Storage.bag;
import static org.apache.pig.builtin.mock.Storage.tuple;
import static org.junit.Assert.assertEquals;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.log4j.Level;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.backend.executionengine.ExecJob;
import org.apache.pig.builtin.mock.Storage;
import org.apache.pig.builtin.mock.Storage.Data;
import org.apache.pig.data.Tuple;
import org.junit.Assert;
import org.junit.Test;
/**
 * End-to-end tests that run small Pig Latin scripts with {@link ExecType#SPARK} against the
 * in-memory {@code mock.Storage} loader/storer and compare the stored output with the expected
 * tuples.
 *
 * <p>Every test shuts its {@link PigServer} down in a {@code finally} block so that a failing
 * assertion does not leave a server running for the tests that follow.
 */
public class TestSpark {

    /** Execution engine every test in this class runs against. */
    private static final ExecType MODE = ExecType.SPARK;

    private static final Log LOG = LogFactory.getLog(TestSpark.class);

    static {
        // Raise the log level of the Spark backend package to DEBUG for the whole test class.
        org.apache.log4j.Logger.getLogger("org.apache.pig.backend.hadoop.executionengine.spark").setLevel(Level.DEBUG);
    }

    /**
     * Creates a {@link PigServer} in {@link #MODE} with a small sort buffer.
     *
     * @return a new server; the caller is responsible for calling {@link PigServer#shutdown()}
     * @throws ExecException if the server cannot be created
     */
    private PigServer newPigServer() throws ExecException {
        Properties properties = new Properties();
        // to avoid pig running out of memory in LOCAL mode
        properties.setProperty("io.sort.mb", "1");
        return new PigServer(MODE, properties);
    }

    /**
     * Returns a copy of {@code out} sorted by the field at position {@code i}.
     *
     * <p>The input list is left untouched. {@link Collections#sort} is stable, so tuples whose
     * field {@code i} compares equal keep their relative order; several tests rely on that.
     *
     * @param out tuples to sort; field {@code i} of every tuple must be mutually comparable
     * @param i   zero-based index of the field to sort on
     * @return a new, sorted list
     */
    private static List<Tuple> sortByIndex(List<Tuple> out, final int i) {
        List<Tuple> result = new ArrayList<Tuple>(out);
        Collections.sort(result, new Comparator<Tuple>() {
            @Override
            // The field type is only known at runtime; the tests only store mutually comparable values.
            @SuppressWarnings("unchecked")
            public int compare(Tuple o1, Tuple o2) {
                try {
                    Comparable<Object> c1 = (Comparable<Object>) o1.get(i);
                    Object c2 = o2.get(i);
                    return c1.compareTo(c2);
                } catch (ExecException e) {
                    throw new RuntimeException(e);
                }
            }
        });
        return result;
    }

    /**
     * Returns a copy of {@code tuples} sorted by the tuples' natural ordering, leaving the list
     * owned by the mock storage unmodified.
     *
     * @param tuples tuples to sort
     * @return a new, sorted list
     */
    private static List<Tuple> sortedCopy(List<Tuple> tuples) {
        List<Tuple> copy = new ArrayList<Tuple>(tuples);
        Collections.sort(copy);
        return copy;
    }

    @Test
    public void testLoadStore() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("test1"),
                    tuple("test2"));

            pigServer.setBatchOn();
            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("STORE A INTO 'output' using mock.Storage;");
            // TODO: good stats -- once available, assert that executeBatch() returns exactly
            // one ExecJob and that it has completed.
            pigServer.executeBatch();

            assertEquals(
                    Arrays.asList(tuple("test1"), tuple("test2")),
                    data.get("output"));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testDump() throws Exception {
        PigServer pigServer = new PigServer(MODE);
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("test1"),
                    tuple("test2"));

            pigServer.setBatchOn();
            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            Iterator<Tuple> result = pigServer.openIterator("A");

            List<Tuple> resultList = new ArrayList<Tuple>();
            while (result.hasNext()) {
                resultList.add(result.next());
            }
            assertEquals(Arrays.asList(tuple("test1"), tuple("test2")), resultList);
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testGroupBy() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("foo", "key1", "test1"),
                    tuple("bar", "key1", "test2"),
                    tuple("baz", "key2", "test3"));

            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = GROUP A BY $1;");
            pigServer.registerQuery("STORE B INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(
                            tuple("key1", bag(tuple("foo", "key1", "test1"), tuple("bar", "key1", "test2"))),
                            tuple("key2", bag(tuple("baz", "key2", "test3")))),
                    sortByIndex(data.get("output"), 0));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testMultiJobGroupBy() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("foo", "key1", "test1"),
                    tuple("bar", "key1", "test2"),
                    tuple("baz", "key2", "test3"));

            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = GROUP A BY $1;");
            pigServer.registerQuery("C = FOREACH B GENERATE group, FLATTEN(A.$0), COUNT(A);");
            pigServer.registerQuery("D = GROUP C BY StringSize($1);");
            pigServer.registerQuery("STORE D INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(
                            tuple(3L, bag(tuple("key1", "foo", 2L), tuple("key1", "bar", 2L), tuple("key2", "baz", 1L)))),
                    sortByIndex(data.get("output"), 0));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testGroupByFlatten() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("test1"),
                    tuple("test1"),
                    tuple("test2"));

            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = GROUP A BY $0;");
            pigServer.registerQuery("C = FOREACH B GENERATE FLATTEN(A);");
            pigServer.registerQuery("STORE C INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(tuple("test1"), tuple("test1"), tuple("test2")),
                    sortedCopy(data.get("output")));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testCount() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("test1"),
                    tuple("test1"),
                    tuple("test2"));

            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = GROUP A BY $0;");
            pigServer.registerQuery("C = FOREACH B GENERATE COUNT(A);");
            pigServer.registerQuery("STORE C INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(tuple(1L), tuple(2L)),
                    sortedCopy(data.get("output")));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testCountWithNoData() throws Exception {
        PigServer pigServer = new PigServer(MODE);
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input");

            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = GROUP A BY $0;");
            pigServer.registerQuery("C = FOREACH B GENERATE COUNT(A);");
            pigServer.registerQuery("STORE C INTO 'output' using mock.Storage;");

            assertEquals(
                    Collections.<Tuple>emptyList(),
                    data.get("output"));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testForEach() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("1"),
                    tuple("12"),
                    tuple("123"));

            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = FOREACH A GENERATE StringSize($0);");
            pigServer.registerQuery("STORE B INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(tuple(1L), tuple(2L), tuple(3L)),
                    data.get("output"));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testForEachFlatten() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple(bag(tuple("1"), tuple("2"), tuple("3"))),
                    tuple(bag(tuple("4"), tuple("5"), tuple("6"))));

            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = FOREACH A GENERATE FLATTEN($0);");
            pigServer.registerQuery("STORE B INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(tuple("1"), tuple("2"), tuple("3"), tuple("4"), tuple("5"), tuple("6")),
                    data.get("output"));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testSimpleUDF() throws Exception {
        PigServer pigServer = new PigServer(MODE);
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("Foo"),
                    tuple("BAR"),
                    tuple("baT"));

            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = FOREACH A GENERATE org.apache.pig.spark.LowercaseUDF($0);");
            pigServer.registerQuery("STORE B INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(tuple("foo"), tuple("bar"), tuple("bat")),
                    data.get("output"));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testFilter() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("1"),
                    tuple("2"),
                    tuple("3"),
                    tuple("1"));

            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = FILTER A BY $0 == '1';");
            pigServer.registerQuery("STORE B INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(tuple("1"), tuple("1")),
                    data.get("output"));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testDistinct() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("1"),
                    tuple("2"),
                    tuple("3"),
                    tuple("1"));

            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = DISTINCT A;");
            pigServer.registerQuery("STORE B INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(tuple("1"), tuple("2"), tuple("3")),
                    sortByIndex(data.get("output"), 0));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testExplicitSPLIT() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("1", 2, "foo"),
                    tuple("2", 3, "bar"),
                    tuple("2", 1, "bar"),
                    tuple("1", 4, "foo"));

            pigServer.setBatchOn();
            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("SPLIT A INTO B IF $0 == '1', C IF $0 == '2';");
            pigServer.registerQuery("STORE B INTO 'output1' using mock.Storage;");
            pigServer.registerQuery("STORE C INTO 'output2' using mock.Storage;");
            pigServer.executeBatch();

            // Field 0 is identical within each output, so the stable sort keeps the stored order.
            assertEquals(
                    Arrays.asList(
                            tuple("1", 2, "foo"),
                            tuple("1", 4, "foo")
                    ),
                    sortByIndex(data.get("output1"), 0));
            assertEquals(
                    Arrays.asList(
                            tuple("2", 3, "bar"),
                            tuple("2", 1, "bar")
                    ),
                    sortByIndex(data.get("output2"), 0));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testExplicitSPLITWithDistinct() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("1", 2, "foo"),
                    tuple("2", 3, "bar"),
                    tuple("2", 1, "bar"),
                    tuple("1", 4, "foo"));

            pigServer.setBatchOn();
            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = DISTINCT A;");
            pigServer.registerQuery("SPLIT B INTO C IF $0 == '1', D IF $0 == '2';");
            pigServer.registerQuery("E = DISTINCT C;");
            pigServer.registerQuery("F = DISTINCT D;");
            pigServer.registerQuery("STORE E INTO 'output1' using mock.Storage;");
            pigServer.registerQuery("STORE F INTO 'output2' using mock.Storage;");
            pigServer.executeBatch();

            assertEquals(
                    Arrays.asList(
                            tuple("1", 2, "foo"),
                            tuple("1", 4, "foo")
                    ),
                    sortByIndex(data.get("output1"), 1));
            assertEquals(
                    Arrays.asList(
                            tuple("2", 1, "bar"),
                            tuple("2", 3, "bar")
                    ),
                    sortByIndex(data.get("output2"), 1));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testImplicitSPLIT() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("1", 2, "foo"),
                    tuple("2", 3, "bar"),
                    tuple("2", 1, "bar"),
                    tuple("1", 4, "foo"));

            pigServer.setBatchOn();
            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = FILTER A BY $0 == '1';");
            pigServer.registerQuery("C = FILTER A BY $0 == '2';");
            pigServer.registerQuery("STORE B INTO 'output1' using mock.Storage;");
            pigServer.registerQuery("STORE C INTO 'output2' using mock.Storage;");
            pigServer.executeBatch();

            // Field 0 is identical within each output, so the stable sort keeps the stored order.
            assertEquals(
                    Arrays.asList(
                            tuple("1", 2, "foo"),
                            tuple("1", 4, "foo")
                    ),
                    sortByIndex(data.get("output1"), 0));
            assertEquals(
                    Arrays.asList(
                            tuple("2", 3, "bar"),
                            tuple("2", 1, "bar")
                    ),
                    sortByIndex(data.get("output2"), 0));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testOrderBy() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("1", 2, "foo"),
                    tuple("2", 3, "foo"),
                    tuple("3", 1, "foo"),
                    tuple("1", 4, "foo"));

            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = ORDER A BY $1;");
            pigServer.registerQuery("STORE B INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(
                            tuple("3", 1, "foo"),
                            tuple("1", 2, "foo"),
                            tuple("2", 3, "foo"),
                            tuple("1", 4, "foo")
                    ),
                    data.get("output"));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testLimit() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("1"),
                    tuple("2"),
                    tuple("3"),
                    tuple("4"));

            pigServer.registerQuery("A = LOAD 'input' using mock.Storage;");
            pigServer.registerQuery("B = LIMIT A 2;");
            pigServer.registerQuery("STORE B INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(tuple("1"), tuple("2")),
                    data.get("output"));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testUnion() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input1",
                    tuple("1"),
                    tuple("2"),
                    tuple("3"),
                    tuple("4"));
            data.set("input2",
                    tuple("5"),
                    tuple("6"),
                    tuple("7"),
                    tuple("8"));

            pigServer.registerQuery("A = LOAD 'input1' using mock.Storage;");
            pigServer.registerQuery("B = LOAD 'input2' using mock.Storage;");
            pigServer.registerQuery("C = UNION A, B;");
            pigServer.registerQuery("STORE C INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(
                            tuple("1"),
                            tuple("2"),
                            tuple("3"),
                            tuple("4"),
                            tuple("5"),
                            tuple("6"),
                            tuple("7"),
                            tuple("8")),
                    data.get("output"));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testDistinctUnion() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input1",
                    tuple("1"),
                    tuple("2"),
                    tuple("3"),
                    tuple("4"),
                    tuple("2"),
                    tuple("3"));
            data.set("input2",
                    tuple("5"),
                    tuple("6"),
                    tuple("7"),
                    tuple("8"));

            pigServer.registerQuery("A = LOAD 'input1' using mock.Storage;");
            pigServer.registerQuery("B = LOAD 'input2' using mock.Storage;");
            pigServer.registerQuery("C = DISTINCT A;");
            pigServer.registerQuery("D = UNION C, B;");
            pigServer.registerQuery("STORE D INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(
                            tuple("1"),
                            tuple("2"),
                            tuple("3"),
                            tuple("4"),
                            tuple("5"),
                            tuple("6"),
                            tuple("7"),
                            tuple("8")),
                    sortByIndex(data.get("output"), 0));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testCoGroup() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input1",
                    tuple("foo", 1, "a"),
                    tuple("foo", 2, "b"),
                    tuple("foo", 3, "c"),
                    tuple("foo", 1, "d"));
            data.set("input2",
                    tuple("bar", 1, "e"),
                    tuple("bar", 2, "f"),
                    tuple("bar", 1, "g"));

            pigServer.registerQuery("A = LOAD 'input1' using mock.Storage;");
            pigServer.registerQuery("B = LOAD 'input2' using mock.Storage;");
            pigServer.registerQuery("C = COGROUP A BY $1, B BY $1;");
            pigServer.registerQuery("STORE C INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(
                            tuple(1, bag(tuple("foo", 1, "a"), tuple("foo", 1, "d")), bag(tuple("bar", 1, "e"), tuple("bar", 1, "g"))),
                            tuple(2, bag(tuple("foo", 2, "b")), bag(tuple("bar", 2, "f"))),
                            tuple(3, bag(tuple("foo", 3, "c")), bag())
                    ),
                    sortByIndex(data.get("output"), 0));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testCoGroup2() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input1",
                    tuple("foo", 1, "a"),
                    tuple("foo", 2, "b"),
                    tuple("foo", 3, "c"),
                    tuple("foo", 1, "d"));
            data.set("input2",
                    tuple("bar", 1, "e"),
                    tuple("bar", 2, "f"),
                    tuple("bar", 1, "g"));
            data.set("input3",
                    tuple("baz", 3, "h"));
            data.set("input4",
                    tuple("boz", 4, "i"));

            pigServer.registerQuery("A = LOAD 'input1' using mock.Storage;");
            pigServer.registerQuery("B = LOAD 'input2' using mock.Storage;");
            pigServer.registerQuery("C = LOAD 'input3' using mock.Storage;");
            pigServer.registerQuery("D = LOAD 'input4' using mock.Storage;");
            pigServer.registerQuery("E = COGROUP A BY $1, B BY $1, C BY $1, D BY $1;");
            pigServer.registerQuery("STORE E INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(
                            tuple(1, bag(tuple("foo", 1, "a"), tuple("foo", 1, "d")), bag(tuple("bar", 1, "e"), tuple("bar", 1, "g")), bag(), bag()),
                            tuple(2, bag(tuple("foo", 2, "b")), bag(tuple("bar", 2, "f")), bag(), bag()),
                            tuple(3, bag(tuple("foo", 3, "c")), bag(), bag(tuple("baz", 3, "h")), bag()),
                            tuple(4, bag(), bag(), bag(), bag(tuple("boz", 4, "i")))
                    ),
                    sortByIndex(data.get("output"), 0));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testJoin() throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input1",
                    tuple(1, "a"),
                    tuple(2, "b"),
                    tuple(3, "c"),
                    tuple(1, "d"));
            data.set("input2",
                    tuple(1, "e"),
                    tuple(2, "f"),
                    tuple(1, "g"));

            pigServer.registerQuery("A = LOAD 'input1' using mock.Storage;");
            pigServer.registerQuery("B = LOAD 'input2' using mock.Storage;");
            pigServer.registerQuery("C = JOIN A BY $0, B BY $0;");
            pigServer.registerQuery("STORE C INTO 'output' using mock.Storage;");

            assertEquals(
                    Arrays.asList(
                            tuple(1, "a", 1, "e"),
                            tuple(1, "a", 1, "g"),
                            tuple(1, "d", 1, "e"),
                            tuple(1, "d", 1, "g"),
                            tuple(2, "b", 2, "f")
                    ),
                    sortedCopy(data.get("output")));
        } finally {
            pigServer.shutdown();
        }
    }

    @Test
    public void testCachingLoad() throws Exception {
        testCaching("A = LOAD 'input' using mock.Storage;" +
                "CACHE A;" +
                "STORE A INTO 'output' using mock.Storage;");
    }

    @Test
    public void testCachingLoadCast() throws Exception {
        testCaching("A = LOAD 'input' using mock.Storage as (foo:chararray);" +
                "CACHE A;" +
                "STORE A INTO 'output' using mock.Storage;");
    }

    @Test
    public void testCachingWithFilter() throws Exception {
        testCaching("A = LOAD 'input' using mock.Storage; " +
                "B = FILTER A by $0 == $0;" + // useless filter
                "A = FOREACH B GENERATE (chararray) $0;" +
                "CACHE A;" +
                "STORE A INTO 'output' using mock.Storage;");
    }

    @Test
    public void testCachingJoin() throws Exception {
        testCaching("A = LOAD 'input' using mock.Storage; " +
                "B = LOAD 'input' using mock.Storage; " +
                "A = JOIN A by $0, B by LOWER($0); " +
                "CACHE A; " +
                "STORE A INTO 'output' using mock.Storage;");
    }

    @Test
    public void testCachingGroup() throws Exception {
        testCaching("A = LOAD 'input' using mock.Storage; " +
                "A = GROUP A by LOWER($0); " +
                "CACHE A; " +
                "STORE A INTO 'output' using mock.Storage;");
    }

    @Test
    public void testIgnoreWrongUDFCache() throws Exception {
        testIgnoreCache(
                "A = LOAD 'input' using mock.Storage; " +
                "B = LOAD 'input' using mock.Storage; " +
                "A = JOIN A by $0, B by LOWER($0); " +
                "CACHE A; " +
                "STORE A INTO 'output' using mock.Storage;",
                "A = LOAD 'input' using mock.Storage; " +
                "B = LOAD 'input' using mock.Storage; " +
                "A = JOIN A by $0, B by UPPER($0); " +
                "CACHE A; " +
                "STORE A INTO 'output' using mock.Storage;");
    }

    @Test
    public void testIgnoreDiffFilterCache() throws Exception {
        testIgnoreCache("A = LOAD 'input' using mock.Storage;" +
                "A = FILTER A by $0 == 'test1';" +
                "CACHE A;" +
                "STORE A INTO 'output' using mock.Storage;",
                "A = LOAD 'input' using mock.Storage;" +
                "A = FILTER A by $0 == 'test2';" +
                "CACHE A;" +
                "STORE A INTO 'output' using mock.Storage;");
    }

    /**
     * Helper (not itself a JUnit test) that checks a cached result is not reused for a different
     * query.
     *
     * <p>Runs {@code query1} over the input {@code test1, test2}, resets the mock storage to
     * {@code test3, test4}, runs {@code query2} on the same server and asserts that the two
     * stored outputs differ.
     *
     * @param query1 script executed first; must store its result into {@code 'output'}
     * @param query2 script executed second; must store its result into {@code 'output'}
     * @throws Exception if either script fails to run
     */
    public void testIgnoreCache(String query1, String query2) throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("test1"),
                    tuple("test2"));

            pigServer.setBatchOn();
            pigServer.registerQuery(query1);
            pigServer.executeBatch();

            List<Tuple> originalOutput = data.get("output");
            LOG.debug("After first query: " + originalOutput);

            data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("test3"),
                    tuple("test4"));
            pigServer.registerQuery(query2);
            pigServer.executeBatch();

            List<Tuple> secondOutput = data.get("output");
            LOG.debug("After second query: " + secondOutput);

            Assert.assertFalse(originalOutput.equals(secondOutput));
        } finally {
            pigServer.shutdown();
        }
    }

    /**
     * Kind of a hack: to test whether caching is happening, the contents of the mock storage
     * {@code 'input'} are replaced after the first run, and alias {@code A} is stored again on
     * the same server. The second output is expected to equal the first one, i.e. to still
     * reflect the original input.
     *
     * @param query script that defines and caches alias {@code A} and stores it into
     *              {@code 'output'}
     * @throws Exception if the script fails to run
     */
    private void testCaching(String query) throws Exception {
        PigServer pigServer = newPigServer();
        try {
            Data data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("test1"),
                    tuple("test2"));

            pigServer.setBatchOn();
            pigServer.registerQuery(query);
            pigServer.executeBatch();

            List<Tuple> originalOutput = data.get("output");
            LOG.debug("After first query: " + originalOutput);

            data = Storage.resetData(pigServer);
            data.set("input",
                    tuple("test3"),
                    tuple("test4"));
            pigServer.registerQuery("STORE A INTO 'output' using mock.Storage;");
            pigServer.executeBatch();

            List<Tuple> secondOutput = data.get("output");
            LOG.debug("After second query: " + secondOutput);

            assertEquals(originalOutput, secondOutput);
        } finally {
            pigServer.shutdown();
        }
    }
}