TestCombiner.java example

Explorer
spork-streaming-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pig.test;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;

import org.apache.pig.EvalFunc;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.builtin.PigStorage;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DefaultDataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileLocalizer;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.Test;

public class TestCombiner {

    static MiniCluster cluster = MiniCluster.buildCluster();

    @AfterClass
    public static void oneTimeTearDown() throws Exception {
        cluster.shutDown();
    }

    @Test
    public void testSuccessiveUserFuncs1() throws Exception {
        String query = "a = load 'students.txt' as (c1,c2,c3,c4); " +
                "c = group a by c2; " +
                "f = foreach c generate COUNT(org.apache.pig.builtin.Distinct($1.$2)); " +
                "store f into 'out';";
        PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        PigContext pc = pigServer.getPigContext();
        assertTrue((Util.buildMRPlan(Util.buildPp(pigServer, query), pc).getRoots().get(0).combinePlan
                .isEmpty()));
    }

    @Test
    public void testSuccessiveUserFuncs2() throws Exception {
        String dummyUDF = JiraPig1030.class.getName();
        String query = "a = load 'students.txt' as (c1,c2,c3,c4); " +
                "c = group a by c2; " +
                "f = foreach c generate COUNT(" + dummyUDF + "" +
                "(org.apache.pig.builtin.Distinct($1.$2)," + dummyUDF + "())); " +
                "store f into 'out';";
        PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        PigContext pc = pigServer.getPigContext();
        assertTrue((Util.buildMRPlan(Util.buildPp(pigServer, query), pc).getRoots().get(0).combinePlan
                .isEmpty()));
    }

    @Test
    public void testOnCluster() throws Exception {
        // run the test on cluster
        String inputFileName = runTest(new PigServer(
                ExecType.MAPREDUCE, cluster.getProperties()));
        Util.deleteFile(cluster, inputFileName);

    }

    /*
     * (non-Javadoc)
     * @see junit.framework.TestCase#setUp()
     */
    @Before
    public void setUp() throws Exception {
        // cause a re initialization of FileLocalizer's
        // internal state before each test run
        // A previous test might have been in a different
        // mode than the test which is about to run. To
        // ensure each test runs correctly in it's exectype
        // mode, let's re initialize.
        FileLocalizer.setInitialized(false);
    }

    @Test
    public void testLocal() throws Exception {
        // run the test locally
        FileLocalizer.deleteTempFiles();
        runTest(new PigServer(ExecType.LOCAL, new Properties()));
        FileLocalizer.deleteTempFiles();
    }

    private String runTest(PigServer pig) throws IOException {
        List<String> inputLines = new ArrayList<String>();
        inputLines.add("a,b,1");
        inputLines.add("a,b,1");
        inputLines.add("a,c,1");
        String inputFileName = loadWithTestLoadFunc("A", pig, inputLines);

        pig.registerQuery("B = group A by ($0, $1);");
        pig.registerQuery("C = foreach B generate flatten(group), COUNT($1);");
        Iterator<Tuple> resultIterator = pig.openIterator("C");
        Tuple tuple = resultIterator.next();
        assertEquals("(a,b,2)", tuple.toString());
        tuple = resultIterator.next();
        assertEquals("(a,c,1)", tuple.toString());

        return inputFileName;
    }

    private String loadWithTestLoadFunc(String loadAlias, PigServer pig,
            List<String> inputLines) throws IOException {
        File inputFile = File.createTempFile("test", "txt");
        inputFile.deleteOnExit();
        String inputFileName = inputFile.getAbsolutePath();
        if (pig.getPigContext().getExecType() == ExecType.LOCAL) {
            PrintStream ps = new PrintStream(new FileOutputStream(inputFile));
            for (String line : inputLines) {
                ps.println(line);
            }
            ps.close();
        } else {
            inputFileName = Util.removeColon(inputFileName);
            Util.createInputFile(cluster, inputFileName, inputLines.toArray(new String[] {}));
        }
        pig.registerQuery(loadAlias + " = load '"
                + Util.encodeEscape(inputFileName) + "' using "
                + PigStorage.class.getName() + "(',');");
        return inputFileName;
    }

    @Test
    public void testNoCombinerUse() {
        // To simulate this, we will have two input files
        // with exactly one input record - this should result
        // in two map tasks and each would process only one record
        // hence the combiner would not get called.
    }

    @Test
    public void testMultiCombinerUse() throws Exception {
        // test the scenario where the combiner is called multiple
        // times - this can happen when the output of the map > io.sort.mb
        // let's set the io.sort.mb to 1MB and > 1 MB map data.
        String[] input = new String[500 * 1024];
        for (int i = 0; i < input.length; i++) {
            if (i % 2 == 0) {
                input[i] = Integer.toString(1);
            } else {
                input[i] = Integer.toString(0);
            }
        }
        Util.createInputFile(cluster, "MultiCombinerUseInput.txt", input);
        Properties props = cluster.getProperties();
        props.setProperty("io.sort.mb", "1");
        PigServer pigServer = new PigServer(ExecType.MAPREDUCE, props);
        pigServer.registerQuery("a = load 'MultiCombinerUseInput.txt' as (x:int);");
        pigServer.registerQuery("b = group a all;");
        pigServer.registerQuery("c = foreach b generate COUNT(a), SUM(a.$0), " +
                "MIN(a.$0), MAX(a.$0), AVG(a.$0), ((double)SUM(a.$0))/COUNT(a.$0)," +
                " COUNT(a.$0) + SUM(a.$0) +  MAX(a.$0);");

        // make sure there is a combine plan in the explain output
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        PrintStream ps = new PrintStream(baos);
        pigServer.explain("c", ps);
        assertTrue(baos.toString().matches("(?si).*combine plan.*"));

        Iterator<Tuple> it = pigServer.openIterator("c");
        Tuple t = it.next();
        assertEquals(512000L, t.get(0));
        assertEquals(256000L, t.get(1));
        assertEquals(0, t.get(2));
        assertEquals(1, t.get(3));
        assertEquals(0.5, t.get(4));
        assertEquals(0.5, t.get(5));
        assertEquals(512000L + 256000L + 1, t.get(6));

        assertFalse(it.hasNext());
        Util.deleteFile(cluster, "MultiCombinerUseInput.txt");
    }

    @Test
    public void testDistinctAggs1() throws Exception {
        // test the use of combiner for distinct aggs:
        String input[] = {
                        "pig1\t18\t2.1",
                        "pig2\t24\t3.3",
                        "pig5\t45\t2.4",
                        "pig1\t18\t2.1",
                        "pig1\t19\t2.1",
                        "pig2\t24\t4.5",
                        "pig1\t20\t3.1" };

        Util.createInputFile(cluster, "distinctAggs1Input.txt", input);
        PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        pigServer.registerQuery("a = load 'distinctAggs1Input.txt' as (name:chararray, age:int, gpa:double);");
        pigServer.registerQuery("b = group a by name;");
        pigServer.registerQuery("c = foreach b  {" +
                "        x = distinct a.age;" +
                "        y = distinct a.gpa;" +
                "        z = distinct a;" +
                "        generate group, COUNT(x), SUM(x.age), SUM(y.gpa), SUM(a.age), " +
                "                       SUM(a.gpa), COUNT(z.age), COUNT(z), SUM(z.age);};");

        // make sure there is a combine plan in the explain output
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        PrintStream ps = new PrintStream(baos);
        pigServer.explain("c", ps);
        assertTrue(baos.toString().matches("(?si).*combine plan.*"));

        HashMap<String, Object[]> results = new HashMap<String, Object[]>();
        results.put("pig1", new Object[] { "pig1", 3L, 57L, 5.2, 75L, 9.4, 3L, 3L, 57L });
        results.put("pig2", new Object[] { "pig2", 1L, 24L, 7.8, 48L, 7.8, 2L, 2L, 48L });
        results.put("pig5", new Object[] { "pig5", 1L, 45L, 2.4, 45L, 2.4, 1L, 1L, 45L });
        Iterator<Tuple> it = pigServer.openIterator("c");
        while (it.hasNext()) {
            Tuple t = it.next();
            List<Object> fields = t.getAll();
            Object[] expected = results.get((String)fields.get(0));
            int i = 0;
            for (Object field : fields) {
                assertEquals(expected[i++], field);
            }
        }
        Util.deleteFile(cluster, "distinctAggs1Input.txt");

    }

    @Test
    public void testGroupElements() throws Exception {
        // test use of combiner when group elements are accessed in the foreach
        String input[] = {
                        "ABC\t1\ta\t1",
                        "ABC\t1\tb\t2",
                        "ABC\t1\ta\t3",
                        "ABC\t2\tb\t4",
                        "DEF\t1\td\t1",
                        "XYZ\t1\tx\t2"
        };

        Util.createInputFile(cluster, "testGroupElements.txt", input);
        PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        pigServer.registerQuery("a = load 'testGroupElements.txt' as (str:chararray, num1:int, alph : chararray, num2 : int);");
        pigServer.registerQuery("b = group a by (str, num1);");

        // check if combiner is present or not for various forms of foreach
        pigServer.registerQuery("c = foreach b  generate flatten(group), COUNT(a.alph), SUM(a.num2); ");
        checkCombinerUsed(pigServer, "c", true);

        pigServer.registerQuery("c = foreach b  generate group, COUNT(a.alph), SUM(a.num2); ");
        checkCombinerUsed(pigServer, "c", true);

        // projecting bag - combiner should not be used
        pigServer.registerQuery("c = foreach b  generate group, a,  COUNT(a.alph), SUM(a.num2); ");
        checkCombinerUsed(pigServer, "c", false);

        // projecting bag - combiner should not be used
        pigServer.registerQuery("c = foreach b  generate group, a.num2,  COUNT(a.alph), SUM(a.num2); ");
        checkCombinerUsed(pigServer, "c", false);

        pigServer.registerQuery("c = foreach b  generate group.$0, group.$1, COUNT(a.alph), SUM(a.num2); ");
        checkCombinerUsed(pigServer, "c", true);

        pigServer.registerQuery("c = foreach b  generate group.$0, group.$1 + COUNT(a.alph), SUM(a.num2); ");
        checkCombinerUsed(pigServer, "c", true);

        pigServer.registerQuery("c = foreach b  generate group.str, group.$1, COUNT(a.alph), SUM(a.num2); ");
        checkCombinerUsed(pigServer, "c", true);

        pigServer.registerQuery("c = foreach b  generate group.str, group.$1, COUNT(a.alph), SUM(a.num2), "
                        + " (group.num1 == 1 ? (COUNT(a.num2) + 1)  : (SUM(a.num2) + 10)) ; ");
        checkCombinerUsed(pigServer, "c", true);

        List<Tuple> expectedRes =
                Util.getTuplesFromConstantTupleStrings(
                        new String[] {
                                        "('ABC',1,3L,6L,4L)",
                                        "('ABC',2,1L,4L,14L)",
                                        "('DEF',1,1L,1L,2L)",
                                        "('XYZ',1,1L,2L,2L)",
                        });

        Iterator<Tuple> it = pigServer.openIterator("c");
        Util.checkQueryOutputsAfterSort(it, expectedRes);

        Util.deleteFile(cluster, "distinctAggs1Input.txt");

    }

    @Test
    public void testGroupByLimit() throws Exception {
        // test use of combiner when group elements are accessed in the foreach
        String input[] = {
                        "ABC 1",
                        "ABC 2",
                        "DEF 1",
                        "XYZ 1",
                        "XYZ 2",
                        "XYZ 3",
        };

        Util.createInputFile(cluster, "testGroupLimit.txt", input);
        PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        pigServer.registerQuery("a = load 'testGroupLimit.txt'  using PigStorage(' ') " +
                "as (str:chararray, num1:int) ;");
        pigServer.registerQuery("b = group a by str;");

        pigServer.registerQuery("c = foreach b  generate group, COUNT(a.num1) ; ");

        // check if combiner is present
        pigServer.registerQuery("d = limit c 2 ; ");
        checkCombinerUsed(pigServer, "d", true);

        List<Tuple> expectedRes =
                Util.getTuplesFromConstantTupleStrings(
                        new String[] {
                                        "('ABC',2L)",
                                        "('DEF',1L)",
                        });

        Iterator<Tuple> it = pigServer.openIterator("d");
        Util.checkQueryOutputsAfterSort(it, expectedRes);

    }

    private void checkCombinerUsed(PigServer pigServer, String string, boolean combineExpected)
            throws IOException {
        // make sure there is a combine plan in the explain output
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        PrintStream ps = new PrintStream(baos);
        pigServer.explain("c", ps);
        boolean combinerFound = baos.toString().matches("(?si).*combine plan.*");
        System.out.println(baos.toString());
        assertEquals("is combiner present as expected", combineExpected, combinerFound);
    }

    @Test
    public void testDistinctNoCombiner() throws Exception {
        // test that combiner is NOT invoked when
        // one of the elements in the foreach generate
        // is a distinct() as the leaf
        String input[] = {
                        "pig1\t18\t2.1",
                        "pig2\t24\t3.3",
                        "pig5\t45\t2.4",
                        "pig1\t18\t2.1",
                        "pig1\t19\t2.1",
                        "pig2\t24\t4.5",
                        "pig1\t20\t3.1" };

        Util.createInputFile(cluster, "distinctNoCombinerInput.txt", input);
        PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        pigServer.registerQuery("a = load 'distinctNoCombinerInput.txt' as (name:chararray, age:int, gpa:double);");
        pigServer.registerQuery("b = group a by name;");
        pigServer.registerQuery("c = foreach b  {" +
                "        z = distinct a;" +
                "        generate group, z, SUM(a.age), SUM(a.gpa);};");

        // make sure there is a combine plan in the explain output
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        PrintStream ps = new PrintStream(baos);
        pigServer.explain("c", ps);
        assertFalse(baos.toString().matches("(?si).*combine plan.*"));

        HashMap<String, Object[]> results = new HashMap<String, Object[]>();
        results.put("pig1", new Object[] { "pig1", "bag-place-holder", 75L, 9.4 });
        results.put("pig2", new Object[] { "pig2", "bag-place-holder", 48L, 7.8 });
        results.put("pig5", new Object[] { "pig5", "bag-place-holder", 45L, 2.4 });
        Iterator<Tuple> it = pigServer.openIterator("c");
        while (it.hasNext()) {
            Tuple t = it.next();
            List<Object> fields = t.getAll();
            Object[] expected = results.get((String)fields.get(0));
            int i = 0;
            for (Object field : fields) {
                if (i == 1) {
                    // ignore the second field which is a bag
                    // for comparison here
                    continue;
                }
                assertEquals(expected[i++], field);
            }
        }
        Util.deleteFile(cluster, "distinctNoCombinerInput.txt");

    }

    @Test
    public void testForEachNoCombiner() throws Exception {
        // test that combiner is NOT invoked when
        // one of the elements in the foreach generate
        // has a foreach in the plan without a distinct agg
        String input[] = {
                        "pig1\t18\t2.1",
                        "pig2\t24\t3.3",
                        "pig5\t45\t2.4",
                        "pig1\t18\t2.1",
                        "pig1\t19\t2.1",
                        "pig2\t24\t4.5",
                        "pig1\t20\t3.1" };

        Util.createInputFile(cluster, "forEachNoCombinerInput.txt", input);
        PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
        pigServer.registerQuery("a = load 'forEachNoCombinerInput.txt' as (name:chararray, age:int, gpa:double);");
        pigServer.registerQuery("b = group a by name;");
        pigServer.registerQuery("c = foreach b  {" +
                "        z = a.age;" +
                "        generate group, z, SUM(a.age), SUM(a.gpa);};");

        // make sure there is a combine plan in the explain output
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        PrintStream ps = new PrintStream(baos);
        pigServer.explain("c", ps);
        assertFalse(baos.toString().matches("(?si).*combine plan.*"));

        HashMap<String, Object[]> results = new HashMap<String, Object[]>();
        results.put("pig1", new Object[] { "pig1", "bag-place-holder", 75L, 9.4 });
        results.put("pig2", new Object[] { "pig2", "bag-place-holder", 48L, 7.8 });
        results.put("pig5", new Object[] { "pig5", "bag-place-holder", 45L, 2.4 });
        Iterator<Tuple> it = pigServer.openIterator("c");
        while (it.hasNext()) {
            Tuple t = it.next();
            List<Object> fields = t.getAll();
            Object[] expected = results.get((String)fields.get(0));
            int i = 0;
            for (Object field : fields) {
                if (i == 1) {
                    // ignore the second field which is a bag
                    // for comparison here
                    continue;
                }
                assertEquals(expected[i++], field);
            }
        }
        Util.deleteFile(cluster, "forEachNoCombinerInput.txt");

    }

    @Test
    public void testJiraPig746() throws Exception {
        // test that combiner is NOT invoked when
        // one of the elements in the foreach generate
        // has a foreach in the plan without a distinct agg
        String input[] = {
                        "pig1\t18\t2.1",
                        "pig2\t24\t3.3",
                        "pig5\t45\t2.4",
                        "pig1\t18\t2.1",
                        "pig1\t19\t2.1",
                        "pig2\t24\t4.5",
                        "pig1\t20\t3.1" };

        String expected[] = {
                        "(pig1,75,{(pig1,18,2.1),(pig1,18,2.1),(pig1,19,2.1),(pig1,20,3.1)})",
                        "(pig2,48,{(pig2,24,3.3),(pig2,24,4.5)})",
                        "(pig5,45,{(pig5,45,2.4)})"
        };

        try {
            Util.createInputFile(cluster, "forEachNoCombinerInput.txt", input);

            PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
            pigServer.registerQuery("a = load 'forEachNoCombinerInput.txt' as (name:chararray, age:int, gpa:double);");
            pigServer.registerQuery("b = group a by name;");
            pigServer.registerQuery("c = foreach b generate group, SUM(a.age), a;");

            // make sure there isn't a combine plan in the explain output
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            PrintStream ps = new PrintStream(baos);
            pigServer.explain("c", ps);
            assertFalse(baos.toString().matches("(?si).*combine plan.*"));

            Iterator<Tuple> it = pigServer.openIterator("c");
            Util.checkQueryOutputsAfterSortRecursive(it, expected,
                    "group:chararray,age:long,b:{t:(name:chararray,age:int,gpa:double)}");
        } finally {
            Util.deleteFile(cluster, "forEachNoCombinerInput.txt");
        }
    }

    public static class JiraPig1030 extends EvalFunc<DataBag> {

        public DataBag exec(Tuple input) throws IOException {
            return new DefaultDataBag();
        }
    }

    @Test
    public void testJiraPig1030() throws Exception {
        // test that combiner is NOT invoked when
        // one of the elements in the foreach generate
        // has a non-algebraic UDF that have multiple inputs
        // (one of them is distinct).

        String input[] = {
                        "pig1\t18\t2.1",
                        "pig2\t24\t3.3",
                        "pig5\t45\t2.4",
                        "pig1\t18\t2.1",
                        "pig1\t19\t2.1",
                        "pig2\t24\t4.5",
                        "pig1\t20\t3.1" };

        try {
            Util.createInputFile(cluster, "forEachNoCombinerInput.txt", input);
            PigServer pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
            pigServer.registerQuery("a = load 'forEachNoCombinerInput.txt' as (name:chararray, age:int, gpa:double);");
            pigServer.registerQuery("b = group a all;");
            pigServer.registerQuery("c = foreach b  {" +
                    "        d = distinct a.age;" +
                    "        generate group, " + JiraPig1030.class.getName() + "(d, 0);};");

            // make sure there isn't a combine plan in the explain output
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            PrintStream ps = new PrintStream(baos);
            pigServer.explain("c", ps);
            assertFalse(baos.toString().matches("(?si).*combine plan.*"));
        } finally {
            Util.deleteFile(cluster, "forEachNoCombinerInput.txt");
        }
    }
}