/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.test; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; import java.lang.reflect.Method; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Random; import java.util.Set; import org.apache.pig.Algebraic; import org.apache.pig.EvalFunc; import org.apache.pig.ExecType; import org.apache.pig.LoadFunc; import org.apache.pig.PigServer; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil; import org.apache.pig.builtin.ARITY; import org.apache.pig.builtin.BagSize; import org.apache.pig.builtin.CONCAT; import org.apache.pig.builtin.COR; import org.apache.pig.builtin.COUNT; import org.apache.pig.builtin.COUNT_STAR; import org.apache.pig.builtin.COV; import org.apache.pig.builtin.DIFF; import org.apache.pig.builtin.Distinct; import org.apache.pig.builtin.INDEXOF; import org.apache.pig.builtin.LAST_INDEX_OF; import org.apache.pig.builtin.LCFIRST; import org.apache.pig.builtin.LOWER; import org.apache.pig.builtin.MapSize; import 
org.apache.pig.builtin.PigStorage; import org.apache.pig.builtin.REGEX_EXTRACT; import org.apache.pig.builtin.REGEX_EXTRACT_ALL; import org.apache.pig.builtin.REPLACE; import org.apache.pig.builtin.SIZE; import org.apache.pig.builtin.STRSPLIT; import org.apache.pig.builtin.SUBSTRING; import org.apache.pig.builtin.StringConcat; import org.apache.pig.builtin.StringSize; import org.apache.pig.builtin.TOBAG; import org.apache.pig.builtin.TOKENIZE; import org.apache.pig.builtin.TOP; import org.apache.pig.builtin.TOTUPLE; import org.apache.pig.builtin.TRIM; import org.apache.pig.builtin.TextLoader; import org.apache.pig.builtin.TupleSize; import org.apache.pig.builtin.UCFIRST; import org.apache.pig.builtin.UPPER; import org.apache.pig.data.BagFactory; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.DataType; import org.apache.pig.data.DefaultBagFactory; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.PigContext; import org.apache.pig.impl.io.ReadToEndLoader; import org.apache.pig.impl.logicalLayer.schema.Schema; import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; import org.junit.AfterClass; import org.junit.Before; import org.junit.Test; public class TestBuiltin { private String initString = "mapreduce"; //private String initString = "local"; static MiniCluster cluster = MiniCluster.buildCluster(); PigServer pigServer; TupleFactory tupleFactory = TupleFactory.getInstance(); BagFactory bagFactory = DefaultBagFactory.getInstance(); // some inputs private static Integer[] intInput = { 3, 1, 2, 4, 5, 7, null, 6, 8, 9, 10 }; private static Long[] intAsLong = { 3L, 1L, 2L, 4L, 5L, 7L, null, 6L, 8L, 9L, 10L }; private static Long[] longInput = { 145769183483345L, null, 4345639849L, 3435543121L, 2L, 5L, 9L, 7L, 8L, 6L, 10L }; private static Float[] floatInput = { 10.4f, 2.35f, 3.099f, null, 4.08495f, 5.350f, 6.78f, 7.0f, 8.0f, 9.0f, 0.09f }; private 
static Double[] floatAsDouble = { 10.4, 2.35, 3.099, null, 4.08495, 5.350, 6.78, 7.0, 8.0, 9.0, 0.09 }; private static Double[] doubleInput = { 5.5673910, 121.0, 3.0, 0.000000834593, 1.0, 6.0, 7.0, 8.0, 9.0, 10.0, null }; private static String[] ba = { "7", "2", "3", null, "4", "5", "6", "1", "8", "9", "10"}; private static Double[] baAsDouble = { 7.0, 2.0, 3.0, null, 4.0, 5.0, 6.0, 1.0, 8.0, 9.0, 10.0}; private static String[] stringInput = {"unit", "test", null, "input", "string"}; private static DataByteArray[] ByteArrayInput = Util.toDataByteArrays(ba); // The HashMaps below are used to set up the appropriate EvalFunc, // the allowed input and expected output for the different aggregate functions // which have different implementations for different input types // This way rather than quickly exploding the test cases (one per input type // per aggregate), all cases for a given aggregate stage are handled // in one test case in a loop // A mapping between name of Aggregate function to its corresponding EvalFunc object private static HashMap<String, EvalFunc<?>> evalFuncMap = new HashMap<String, EvalFunc<?>>(); // A mapping between a type name (example: "Integer") and a tuple containing // a bag of inputs of that type private static HashMap<String, Tuple> inputMap = new HashMap<String, Tuple>(); // A mapping between name of Aggregate function and the input type of its // argument private static HashMap<String, String> allowedInput = new HashMap<String, String>(); // A mapping between name of Aggregate function and the output value (based on the // inputs above) private static HashMap<String, Object> expectedMap = new HashMap<String, Object>(); String[] stages = {"Initial", "Intermediate", "Final"}; String[][] aggs = { {"SUM", "IntSum", "LongSum", "FloatSum", "DoubleSum"}, {"AVG", "IntAvg", "LongAvg", "FloatAvg", "DoubleAvg"}, {"MIN", "IntMin", "LongMin", "FloatMin", "DoubleMin", "StringMin"}, {"MAX", "IntMax", "LongMax", "FloatMax", "DoubleMax", "StringMax"}, 
{"COUNT"}, }; String[] inputTypeAsString = {"ByteArray", "Integer", "Long", "Float", "Double", "String" }; @Before public void setUp() throws Exception { pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties()); // First set up data structs for "base" SUM, MIN and MAX and AVG. // The allowed input and expected output data structs for // the "Intermediate" and "Final" stages can be based on the // "base" case - the allowed inputs for Initial stage can be based // on the "base" case. In the test cases, the // output of Initial is sent to Intermediate, so we don't // explicitly test the output of Initial and hence do not // need to set up expectedMap. // first set up EvalFuncMap and expectedMap setupEvalFuncMap(); expectedMap.put("SUM", new Double(55)); expectedMap.put("DoubleSum", new Double(170.567391834593)); expectedMap.put("IntSum", new Long(55)); expectedMap.put("LongSum", new Long(145776964666362L)); expectedMap.put("FloatSum", new Double(56.15395)); expectedMap.put("AVG", new Double(5.5)); expectedMap.put("DoubleAvg", new Double(17.0567391834593)); expectedMap.put("LongAvg", new Double(14577696466636.2)); expectedMap.put("IntAvg", new Double(5.5)); expectedMap.put("FloatAvg", new Double(5.615394958853722)); expectedMap.put("MIN", new Double(1)); expectedMap.put("IntMin", new Integer(1)); expectedMap.put("LongMin", new Long(2)); expectedMap.put("FloatMin", new Float(0.09f)); expectedMap.put("DoubleMin", new Double(0.000000834593)); expectedMap.put("StringMin", "input"); expectedMap.put("MAX", new Double(10)); expectedMap.put("IntMax", new Integer(10)); expectedMap.put("LongMax", new Long(145769183483345L)); expectedMap.put("FloatMax", new Float(10.4f)); expectedMap.put("DoubleMax", new Double(121.0)); expectedMap.put("StringMax", "unit"); expectedMap.put("COUNT", new Long(10)); // set up allowedInput for (String[] aggGroups : aggs) { int i = 0; for(String agg: aggGroups) { allowedInput.put(agg, inputTypeAsString[i++]); } } // The idea here is that 
we can reuse the same input // and expected output of the algebraic functions // for their Intermediate and Final Stages. For the // Initial stage we can reuse the input of the algebraic // function. for (String[] aggGroups : aggs) { for(String agg: aggGroups) { for (String stage : stages) { if(stage.equals("Initial")) { // For the Initial function, the input should match the input // for the aggregate function itself. In the test cases, the // output of Initial is sent to Intermediate, so we don't // explicitly test the output of Initial and hence do not // need to set up expectedMap. allowedInput.put(agg + stage, allowedInput.get(agg)); } else { // For IntSumIntermediate and IntSumFinal and // FloatSumIntermediate and FloatSumFinal, the input is expected // be of types Long and Double respectively (Initial version // of these functions is supposed to convert the Int to Long // and Float to Double respectively) - Likewise for SUMIntermediate // and SumFinal the input is expected to be Double - The Initial // version is supposed to convert byteArrays to Double if((agg).equals("IntSum") || (agg).equals("IntAvg")) { allowedInput.put(agg + stage, "IntegerAsLong"); } else if ((agg).equals("FloatSum") || agg.equals("FloatAvg")) { allowedInput.put(agg + stage, "FloatAsDouble"); }else if ((agg).equals("MIN") || agg.equals("MAX") || (agg.equals("SUM")) || agg.equals("AVG")) { // For MIN and MAX the Intermediate and Final functions // expect input to be Doubles (Initial is supposed to // convert the ByteArray to Double) allowedInput.put(agg + stage, "ByteArrayAsDouble"); } else { // In all other cases, the input and expected output // for "Intermediate" and "Final" stages should match the input // and expected output for the aggregate function itself allowedInput.put(agg + stage, allowedInput.get(agg)); } // For Average, we set up expectedMap only for the "Final" stage // For other aggs, set up expected Map for both "Intermediate" // and "Final" if(! 
agg.matches("(?i)avg") || stage.equals("Final")) { expectedMap.put(agg + stage, expectedMap.get(agg)); } } } } } // For Avg, the expected output (for the sum part) for Intermediate are the // same as SUM - so handled a little differently accordingly expectedMap.put("AVGIntermediate", expectedMap.get("SUM")); expectedMap.put("DoubleAvgIntermediate", expectedMap.get("DoubleSum")); expectedMap.put("LongAvgIntermediate", expectedMap.get("LongSum")); expectedMap.put("IntAvgIntermediate", expectedMap.get("IntSum")); expectedMap.put("FloatAvgIntermediate", expectedMap.get("FloatSum")); // set up input hash try{ inputMap.put("Integer", Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), intInput)); inputMap.put("IntegerAsLong", Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), intAsLong)); inputMap.put("Long", Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), longInput)); inputMap.put("Float", Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), floatInput)); inputMap.put("FloatAsDouble", Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), floatAsDouble)); inputMap.put("Double", Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), doubleInput)); inputMap.put("ByteArray", Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), ByteArrayInput)); inputMap.put("ByteArrayAsDouble", Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), baAsDouble)); inputMap.put("String", Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), stringInput)); }catch(ExecException e) { e.printStackTrace(); } } /** * */ private void setupEvalFuncMap() { for (String[] aggGroup : aggs) { for (String agg : aggGroup) { // doing this as a two step process because PigContext.instantiateFuncFromSpec("SUM.Intermediate") // fails with class resolution error. 
EvalFunc<?> func = (EvalFunc<?>)PigContext.instantiateFuncFromSpec(agg); evalFuncMap.put(agg, func); evalFuncMap.put(agg + "Initial", (EvalFunc<?>)PigContext.instantiateFuncFromSpec(((Algebraic)func).getInitial())); evalFuncMap.put(agg + "Intermediate", (EvalFunc<?>)PigContext.instantiateFuncFromSpec(((Algebraic)func).getIntermed())); evalFuncMap.put(agg + "Final", (EvalFunc<?>)PigContext.instantiateFuncFromSpec(((Algebraic)func).getFinal())); } } } @AfterClass public static void oneTimeTearDown() throws Exception { cluster.shutDown(); } /** * Test the case where the combiner is not called - so initial is called * and then final is called * @throws Exception */ @Test public void testAggNoCombine() throws Exception { for (String[] aggGroup : aggs) { String[] aggFinalTypes = null; // will contains AVGFinal, DoubleAvgFinal etc String[] aggInitialTypes = null; // will contains AVGInitial, DoubleAvgInitial etc for (String stage: stages) { String[] aggTypesArray = null; if(stage.equals("Initial")) { aggInitialTypes = new String[aggGroup.length]; aggTypesArray = aggInitialTypes; } else if(stage.equals("Final")) { aggFinalTypes = new String[aggGroup.length]; aggTypesArray = aggFinalTypes; } else { // Intermediate continue; } for (int i = 0; i < aggTypesArray.length; i++) { aggTypesArray[i] = aggGroup[i] + stage; } } for(int k = 0; k < aggFinalTypes.length; k++) { EvalFunc<?> avgInitial = evalFuncMap.get(aggInitialTypes[k]); Tuple tup = inputMap.get(getInputType(aggInitialTypes[k])); // To test this case, first AVGInitial is called for each input // value and output of it is put into a bag. The bag containing // all AVGInitial output is provided as input to AVGFinal // The tuple we got above has a bag with input // values. 
Lets call AVGInitial with each value: DataBag bg = (DataBag) tup.get(0); DataBag finalInputBg = bagFactory.newDefaultBag(); for (Tuple tuple : bg) { DataBag initialInputBg = bagFactory.newDefaultBag(); initialInputBg.add(tuple); Tuple initialInputTuple = tupleFactory.newTuple(initialInputBg); finalInputBg.add((Tuple)avgInitial.exec(initialInputTuple)); } Tuple finalInputTuple = tupleFactory.newTuple(finalInputBg); EvalFunc<?> aggFinal = evalFuncMap.get(aggFinalTypes[k]); String msg = "[Testing " + aggGroup[k] + " on input type: " + getInputType(aggFinalTypes[k]); System.err.println(msg + " for no combiner case]"); Object output = aggFinal.exec(finalInputTuple); msg += " ( (output) " + output + " == " + getExpected(aggFinalTypes[k]) + " (expected) )]"; // for doubles, precisions can be a problem - so check // if the type is double for expected result and check // within some precision if(getExpected(aggFinalTypes[k]) instanceof Double) { assertEquals(msg, (Double)getExpected(aggFinalTypes[k]), (Double)output, 0.00001); } else { assertEquals(msg, getExpected(aggFinalTypes[k]), output); } } } } /** * Test the case where the combiner is called once - so initial is called * and then Intermediate and then final is called * @throws Exception */ @Test public void testAggSingleCombine() throws Exception { for (String[] aggGroup : aggs) { String[] aggFinalTypes = null; // will contains AVGFinal, DoubleAvgFinal etc String[] aggInitialTypes = null; // will contains AVGInitial, DoubleAvgInitial etc String[] aggIntermediateTypes = null; // will contains AVGIntermediate, DoubleAvgIntermediate etc for (String stage: stages) { String[] aggTypesArray = null; if(stage.equals("Initial")) { aggInitialTypes = new String[aggGroup.length]; aggTypesArray = aggInitialTypes; } else if (stage.equals("Intermediate")) { aggIntermediateTypes = new String[aggGroup.length]; aggTypesArray = aggIntermediateTypes; } else {// final aggFinalTypes = new String[aggGroup.length]; aggTypesArray = 
aggFinalTypes; } for (int i = 0; i < aggTypesArray.length; i++) { aggTypesArray[i] = aggGroup[i] + stage; } } for(int k = 0; k < aggFinalTypes.length; k++) { EvalFunc<?> aggInitial = evalFuncMap.get(aggInitialTypes[k]); Tuple tup = inputMap.get(getInputType(aggInitialTypes[k])); // To test this case, first <Agg>Initial is called for each input // value. The output from <Agg>Initial for the first half of inputs is // put into one bag and the next half into another. Then these two // bags are provided as inputs to two separate calls of <Agg>Intermediate. // The outputs from the two calls to <Agg>Intermediate are put into a bag // and sent as input to <Agg>Final // The tuple we got above has a bag with input // values. Lets call <Agg>Initial with each value: DataBag bg = (DataBag) tup.get(0); DataBag intermediateInputBg1 = bagFactory.newDefaultBag(); DataBag intermediateInputBg2 = bagFactory.newDefaultBag(); int i = 0; for (Tuple tuple : bg) { DataBag initialInputBg = bagFactory.newDefaultBag(); initialInputBg.add(tuple); Tuple initialInputTuple = tupleFactory.newTuple(initialInputBg); if(i < bg.size()/2) { intermediateInputBg1.add((Tuple)aggInitial.exec(initialInputTuple)); } else { intermediateInputBg2.add((Tuple)aggInitial.exec(initialInputTuple)); } i++; } EvalFunc<?> avgIntermediate = evalFuncMap.get(aggIntermediateTypes[k]); DataBag finalInputBg = bagFactory.newDefaultBag(); Tuple intermediateInputTuple = tupleFactory.newTuple(intermediateInputBg1); finalInputBg.add((Tuple)avgIntermediate.exec(intermediateInputTuple)); intermediateInputTuple = tupleFactory.newTuple(intermediateInputBg2); finalInputBg.add((Tuple)avgIntermediate.exec(intermediateInputTuple)); Tuple finalInputTuple = tupleFactory.newTuple(finalInputBg); EvalFunc<?> aggFinal = evalFuncMap.get(aggFinalTypes[k]); String msg = "[Testing " + aggGroup[k] + " on input type: " + getInputType(aggFinalTypes[k]); System.err.println(msg + " for single combiner case]"); Object output = 
aggFinal.exec(finalInputTuple); msg += " ( (output) " + output + " == " + getExpected(aggFinalTypes[k]) + " (expected) )]"; // for doubles, precisions can be a problem - so check // if the type is double for expected result and check // within some precision if(getExpected(aggFinalTypes[k]) instanceof Double) { assertEquals(msg, (Double)getExpected(aggFinalTypes[k]), (Double)output, 0.00001); } else { assertEquals(msg, getExpected(aggFinalTypes[k]), output); } } } } /** * Test the case where the combiner is called more than once - so initial is called * and then Intermediate called couple of times and then final is called * @throws Exception */ @Test public void testAggMultipleCombine() throws Exception { for (String[] aggGroup : aggs) { String[] aggFinalTypes = null; // will contains AVGFinal, DoubleAvgFinal etc String[] aggInitialTypes = null; // will contains AVGInitial, DoubleAvgInitial etc String[] aggIntermediateTypes = null; // will contains AVGIntermediate, DoubleAvgIntermediate etc for (String stage: stages) { String[] aggTypesArray = null; if(stage.equals("Initial")) { aggInitialTypes = new String[aggGroup.length]; aggTypesArray = aggInitialTypes; } else if (stage.equals("Intermediate")) { aggIntermediateTypes = new String[aggGroup.length]; aggTypesArray = aggIntermediateTypes; } else {// final aggFinalTypes = new String[aggGroup.length]; aggTypesArray = aggFinalTypes; } for (int i = 0; i < aggTypesArray.length; i++) { aggTypesArray[i] = aggGroup[i] + stage; } } for(int k = 0; k < aggFinalTypes.length; k++) { EvalFunc<?> aggInitial = evalFuncMap.get(aggInitialTypes[k]); Tuple tup = inputMap.get(getInputType(aggInitialTypes[k])); // To test this case, first <Agg>Initial is called for each input // value. The output from <Agg>Initial for quarter of values from // the inputs is put into one bag. Then 4 calls are made to Intermediate // with each bag going to one call. This simulates the call in the map-combine // boundary. 
The outputs from the first two calls to Intermediate above are // put into a bag and the output from the next two calls put into another bag. // These two bags are provided as inputs to two separate calls of <Agg>Intermediate. // This simulates the call in the combine-reduce boundary. // The outputs from the two calls to <Agg>Intermediate are put into a bag // and sent as input to <Agg>Final // The tuple we got above has a bag with input // values. Lets call <Agg>Initial with each value: DataBag bg = (DataBag) tup.get(0); DataBag[] mapIntermediateInputBgs = new DataBag[4]; for (int i = 0; i < mapIntermediateInputBgs.length; i++) { mapIntermediateInputBgs[i] = bagFactory.newDefaultBag(); } Iterator<Tuple> it = bg.iterator(); for(int i = 0; i < 4; i++) { for(int j = 0; j < bg.size()/4; j++) { DataBag initialInputBg = bagFactory.newDefaultBag(); initialInputBg.add(it.next()); Tuple initialInputTuple = tupleFactory.newTuple(initialInputBg); mapIntermediateInputBgs[i].add((Tuple)aggInitial.exec(initialInputTuple)); } if(i == 3) { // if the last quarter has more elements process them while(it.hasNext()) { DataBag initialInputBg = bagFactory.newDefaultBag(); initialInputBg.add(it.next()); Tuple initialInputTuple = tupleFactory.newTuple(initialInputBg); mapIntermediateInputBgs[i].add((Tuple)aggInitial.exec(initialInputTuple)); } } } EvalFunc<?> aggIntermediate = evalFuncMap.get(aggIntermediateTypes[k]); DataBag[] reduceIntermediateInputBgs = new DataBag[2]; for (int i = 0; i < reduceIntermediateInputBgs.length; i++) { reduceIntermediateInputBgs[i] = bagFactory.newDefaultBag(); } // simulate call to combine after map for(int i = 0; i < 4; i++) { Tuple intermediateInputTuple = tupleFactory.newTuple(mapIntermediateInputBgs[i]); if(i < 2) { reduceIntermediateInputBgs[0].add((Tuple)aggIntermediate.exec(intermediateInputTuple)); } else { reduceIntermediateInputBgs[1].add((Tuple)aggIntermediate.exec(intermediateInputTuple)); } } DataBag finalInputBag = bagFactory.newDefaultBag(); 
// simulate call to combine before reduce for(int i = 0; i < 2; i++) { Tuple intermediateInputTuple = tupleFactory.newTuple(reduceIntermediateInputBgs[i]); finalInputBag.add((Tuple)aggIntermediate.exec(intermediateInputTuple)); } // simulate call to final (in reduce) Tuple finalInputTuple = tupleFactory.newTuple(finalInputBag); EvalFunc<?> aggFinal = evalFuncMap.get(aggFinalTypes[k]); String msg = "[Testing " + aggGroup[k] + " on input type: " + getInputType(aggFinalTypes[k]); System.err.println(msg + " for multiple combiner case]"); Object output = aggFinal.exec(finalInputTuple); msg += " ( (output) " + output + " == " + getExpected(aggFinalTypes[k]) + " (expected) )]"; // for doubles, precisions can be a problem - so check // if the type is double for expected result and check // within some precision if(getExpected(aggFinalTypes[k]) instanceof Double) { assertEquals(msg, (Double)getExpected(aggFinalTypes[k]), (Double)output, 0.00001); } else { assertEquals(msg, getExpected(aggFinalTypes[k]), output); } } } } /** * Test the case where an empty bag is given as input to * the Initial function and the output is fed to Intermediate * function whose output is fed to the Final function * @throws Exception */ @Test public void testAggEmptyBagWithCombiner() throws Exception { for (String[] aggGroup : aggs) { String[] aggFinalTypes = null; // will contains AVGFinal, DoubleAvgFinal etc String[] aggInitialTypes = null; // will contains AVGInitial, DoubleAvgInitial etc String[] aggIntermediateTypes = null; // will contains AVGIntermediate, DoubleAvgIntermediate etc for (String stage: stages) { String[] aggTypesArray = null; if(stage.equals("Initial")) { aggInitialTypes = new String[aggGroup.length]; aggTypesArray = aggInitialTypes; } else if (stage.equals("Intermediate")) { aggIntermediateTypes = new String[aggGroup.length]; aggTypesArray = aggIntermediateTypes; } else {// final aggFinalTypes = new String[aggGroup.length]; aggTypesArray = aggFinalTypes; } for (int i = 0; i < 
aggTypesArray.length; i++) { aggTypesArray[i] = aggGroup[i] + stage; } } for(int k = 0; k < aggFinalTypes.length; k++) { EvalFunc<?> aggInitial = evalFuncMap.get(aggInitialTypes[k]); // To test this case, first <Agg>Initial is called with an empty bag // as input. This is done in two ierations of 5 calls. // The output from <Agg>Initial for the first half of inputs is // put into one bag and the next half into another. Then these two // bags are provided as inputs to two separate calls of <Agg>Intermediate. // The outputs from the two calls to <Agg>Intermediate are put into a bag // and sent as input to <Agg>Final DataBag intermediateInputBg1 = bagFactory.newDefaultBag(); DataBag intermediateInputBg2 = bagFactory.newDefaultBag(); Tuple outputTuple = null; for(int i = 0; i < 10; i++) { // create empty bag input to be provided as input // argument to the "Initial" function DataBag initialInputBg = bagFactory.newDefaultBag(); Tuple initialInputTuple = tupleFactory.newTuple(initialInputBg); if(i < 5) { outputTuple = (Tuple)aggInitial.exec(initialInputTuple); // check that output is null for all aggs except COUNT // COUNT will give an output of 0 for empty bag input checkZeroOrNull(aggInitial, outputTuple.get(0)); intermediateInputBg1.add(outputTuple); } else { outputTuple = (Tuple)aggInitial.exec(initialInputTuple); // check that output is null for all aggs except COUNT // COUNT will give an output of 0 for empty bag input checkZeroOrNull(aggInitial, outputTuple.get(0)); intermediateInputBg2.add(outputTuple); } } EvalFunc<?> aggIntermediate = evalFuncMap.get(aggIntermediateTypes[k]); DataBag finalInputBg = bagFactory.newDefaultBag(); Tuple intermediateInputTuple = tupleFactory.newTuple(intermediateInputBg1); outputTuple = (Tuple)aggIntermediate.exec(intermediateInputTuple); // check that output is null for all aggs except COUNT // COUNT will give an output of 0 for empty bag input checkZeroOrNull(aggIntermediate, outputTuple.get(0)); finalInputBg.add(outputTuple); 
intermediateInputTuple = tupleFactory.newTuple(intermediateInputBg2); outputTuple = (Tuple)aggIntermediate.exec(intermediateInputTuple); // check that output is null for all aggs except COUNT // COUNT will give an output of 0 for empty bag input checkZeroOrNull(aggIntermediate, outputTuple.get(0)); finalInputBg.add(outputTuple); Tuple finalInputTuple = tupleFactory.newTuple(finalInputBg); EvalFunc<?> aggFinal = evalFuncMap.get(aggFinalTypes[k]); Object output = aggFinal.exec(finalInputTuple); // check that output is null for all aggs except COUNT // COUNT will give an output of 0 for empty bag input checkZeroOrNull(aggFinal, output); } } } /** * Test the case where an empty bag is given as input to the non * combiner version of aggregate functions * @throws Exception if there are issues executing the aggregate function */ @Test public void testAggEmptyBag() throws Exception { for (String[] aggGroup : aggs) { for(int k = 0; k < aggGroup.length; k++) { EvalFunc<?> agg = evalFuncMap.get(aggGroup[k]); // call agg with empty bag as input DataBag inputBag = bagFactory.newDefaultBag(); Tuple inputTuple = tupleFactory.newTuple(inputBag); Object output = agg.exec(inputTuple); // check that output is null for all aggs except COUNT // COUNT will give an output of 0 for empty bag input checkZeroOrNull(agg, output); } } } private void checkZeroOrNull(EvalFunc<?> func, Object output) { if(func.getClass().getName().contains("COUNT")) { assertEquals(new Long(0), output); } else { assertEquals(null, output); } } // Builtin MATH Functions // ======================= @Test public void testAVG() throws Exception { String[] avgTypes = {"AVG", "DoubleAvg", "LongAvg", "IntAvg", "FloatAvg"}; for(int k = 0; k < avgTypes.length; k++) { EvalFunc<?> avg = evalFuncMap.get(avgTypes[k]); Tuple tup = inputMap.get(getInputType(avgTypes[k])); Object output = avg.exec(tup); String msg = "[Testing " + avgTypes[k] + " on input type: " + getInputType(avgTypes[k]) + " ( (output) " + output + " == " + 
getExpected(avgTypes[k]) + " (expected) )]"; assertEquals(msg, (Double)output, (Double)getExpected(avgTypes[k]), 0.00001); } } @Test public void testAVGIntermediate() throws Exception { String[] avgTypes = {"AVGIntermediate", "DoubleAvgIntermediate", "LongAvgIntermediate", "IntAvgIntermediate", "FloatAvgIntermediate"}; for(int k = 0; k < avgTypes.length; k++) { EvalFunc<?> avg = evalFuncMap.get(avgTypes[k]); String inputType = getInputType(avgTypes[k]); Tuple tup = inputMap.get(inputType); // The tuple we got above has a bag with input // values. Input to the Intermediate.exec() however comes // from the map which would put each value and a count of // 1 in a tuple and send it down. So lets create a bag with // tuples that have two fields - the value and a count 1. DataBag bag = (DataBag) tup.get(0); DataBag bg = bagFactory.newDefaultBag(); for (Tuple t: bag) { Tuple newTuple = tupleFactory.newTuple(2); newTuple.set(0, t.get(0)); newTuple.set(1, new Long(1)); bg.add(newTuple); } Tuple intermediateInput = tupleFactory.newTuple(); intermediateInput.append(bg); Object output = avg.exec(intermediateInput); if(inputType == "Long" || inputType == "Integer" || inputType == "IntegerAsLong") { Long l = (Long)((Tuple)output).get(0); String msg = "[Testing " + avgTypes[k] + " on input type: " + getInputType(avgTypes[k]) + " ( (output) " + l + " == " + getExpected(avgTypes[k]) + " (expected) )]"; assertEquals(msg, (Long)getExpected(avgTypes[k]), l); } else { Double f1 = (Double)((Tuple)output).get(0); String msg = "[Testing " + avgTypes[k] + " on input type: " + getInputType(avgTypes[k]) + " ( (output) " + f1 + " == " + getExpected(avgTypes[k]) + " (expected) )]"; assertEquals(msg, (Double)getExpected(avgTypes[k]), f1, 0.00001); } Long f2 = (Long)((Tuple)output).get(1); assertEquals("[Testing " + avgTypes[k] + " on input type: "+ inputType+"]Expected count to be 11", 11, f2.longValue()); } } @Test public void testAVGFinal() throws Exception { String[] avgTypes = {"AVGFinal", 
"DoubleAvgFinal", "LongAvgFinal", "IntAvgFinal", "FloatAvgFinal"}; String[] avgIntermediateTypes = {"AVGIntermediate", "DoubleAvgIntermediate", "LongAvgIntermediate", "IntAvgIntermediate", "FloatAvgIntermediate"}; for(int k = 0; k < avgTypes.length; k++) { EvalFunc<?> avg = evalFuncMap.get(avgTypes[k]); Tuple tup = inputMap.get(getInputType(avgTypes[k])); // To test AVGFinal, AVGIntermediate should first be called and // the output of AVGIntermediate should be supplied as input to // AVGFinal. To simulate this, we will call Intermediate twice // on the above tuple and collect the outputs and pass it to // Final. // get the right "Intermediate" EvalFunc EvalFunc<?> avgIntermediate = evalFuncMap.get(avgIntermediateTypes[k]); // The tuple we got above has a bag with input // values. Input to the Intermediate.exec() however comes // from the map which would put each value and a count of // 1 in a tuple and send it down. So lets create a bag with // tuples that have two fields - the value and a count 1. // The input has 10 values - lets put the first five of them // in the input to the first call of AVGIntermediate and the // remaining five in the second call. 
DataBag bg = (DataBag) tup.get(0); DataBag bg1 = bagFactory.newDefaultBag(); DataBag bg2 = bagFactory.newDefaultBag(); int i = 0; for (Tuple t: bg) { Tuple newTuple = tupleFactory.newTuple(2); newTuple.set(0, t.get(0)); if ( t.get(0) == null) newTuple.set(1, new Long(0)); else newTuple.set(1, new Long(1)); if(i < 5) { bg1.add(newTuple); } else { bg2.add(newTuple); } i++; } Tuple intermediateInput1 = tupleFactory.newTuple(); intermediateInput1.append(bg1); Object output1 = avgIntermediate.exec(intermediateInput1); Tuple intermediateInput2 = tupleFactory.newTuple(); intermediateInput2.append(bg2); Object output2 = avgIntermediate.exec(intermediateInput2); DataBag bag = Util.createBag(new Tuple[]{(Tuple)output1, (Tuple)output2}); Tuple finalTuple = TupleFactory.getInstance().newTuple(1); finalTuple.set(0, bag); Object output = avg.exec(finalTuple); String msg = "[Testing " + avgTypes[k] + " on input type: " + getInputType(avgTypes[k]) + " ( (output) " + output + " == " + getExpected(avgTypes[k]) + " (expected) )]"; assertEquals(msg, (Double)getExpected(avgTypes[k]), (Double)output, 0.00001); } } @Test public void testCOUNT() throws Exception { Integer input[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, null }; long expected = input.length - 1; EvalFunc<Long> count = new COUNT(); Tuple tup = Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), input); Long output = count.exec(tup); assertTrue(output == expected); } @Test public void testCOUNTIntermed() throws Exception { Integer input[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; DataBag intermediateInputBag = bagFactory.newDefaultBag(); // call initial and then Intermed for (Integer i : input) { Tuple t = tupleFactory.newTuple(i); DataBag b = bagFactory.newDefaultBag(); b.add(t); Tuple initialInput = tupleFactory.newTuple(b); EvalFunc<?> initial = new COUNT.Initial(); intermediateInputBag.add((Tuple)initial.exec(initialInput)); } EvalFunc<Tuple> countIntermed = new COUNT.Intermediate(); Tuple intermediateInput = 
tupleFactory.newTuple(intermediateInputBag); Tuple output = countIntermed.exec(intermediateInput); Long f1 = DataType.toLong(output.get(0)); assertEquals("Expected count to be 10", 10, f1.longValue()); } @Test public void testCOUNTFinal() throws Exception { long input[] = { 23, 38, 39 }; Tuple tup = Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), input); EvalFunc<Long> count = new COUNT.Final(); Long output = count.exec(tup); assertEquals("Expected count to be 100", 100, output.longValue()); } @Test public void testCOUNT_STAR() throws Exception { Integer input[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, null }; long expected = input.length; EvalFunc<Long> count = new COUNT_STAR(); Tuple tup = Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), input); Long output = count.exec(tup); assertTrue(output == expected); } @Test public void testCOUNT_STARIntermed() throws Exception { Integer input[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }; DataBag intermediateInputBag = bagFactory.newDefaultBag(); // call initial and then Intermed for (Integer i : input) { Tuple t = tupleFactory.newTuple(i); DataBag b = bagFactory.newDefaultBag(); b.add(t); Tuple initialInput = tupleFactory.newTuple(b); EvalFunc<?> initial = new COUNT_STAR.Initial(); intermediateInputBag.add((Tuple)initial.exec(initialInput)); } EvalFunc<Tuple> countIntermed = new COUNT_STAR.Intermediate(); Tuple intermediateInput = tupleFactory.newTuple(intermediateInputBag); Tuple output = countIntermed.exec(intermediateInput); Long f1 = DataType.toLong(output.get(0)); assertEquals("Expected count to be 10", 10, f1.longValue()); } @Test public void testCOUNT_STARFinal() throws Exception { long input[] = { 23, 38, 39 }; Tuple tup = Util.loadNestTuple(TupleFactory.getInstance().newTuple(1), input); EvalFunc<Long> count = new COUNT_STAR.Final(); Long output = count.exec(tup); assertEquals("Expected count to be 100", 100, output.longValue()); } @Test public void testSUM() throws Exception { String[] sumTypes = {"SUM", 
"DoubleSum", "LongSum", "IntSum", "FloatSum"}; for(int k = 0; k < sumTypes.length; k++) { EvalFunc<?> sum = evalFuncMap.get(sumTypes[k]); String inputType = getInputType(sumTypes[k]); Tuple tup = inputMap.get(inputType); Object output = sum.exec(tup); String msg = "[Testing " + sumTypes[k] + " on input type: " + getInputType(sumTypes[k]) + " ( (output) " + output + " == " + getExpected(sumTypes[k]) + " (expected) )]"; if(inputType == "Integer" || inputType == "Long") { assertEquals(msg, (Long)output, (Long)getExpected(sumTypes[k]), 0.00001); } else { assertEquals(msg, (Double)output, (Double)getExpected(sumTypes[k]), 0.00001); } } } @Test public void testSUMIntermed() throws Exception { String[] sumTypes = {"SUMIntermediate", "DoubleSumIntermediate", "LongSumIntermediate", "IntSumIntermediate", "FloatSumIntermediate"}; for(int k = 0; k < sumTypes.length; k++) { EvalFunc<?> sum = evalFuncMap.get(sumTypes[k]); String inputType = getInputType(sumTypes[k]); Tuple tup = inputMap.get(inputType); Object output = sum.exec(tup); String msg = "[Testing " + sumTypes[k] + " on input type: " + getInputType(sumTypes[k]) + " ( (output) " + ((Tuple)output).get(0) + " == " + getExpected(sumTypes[k]) + " (expected) )]"; if(inputType.equals("Integer") || inputType.equals("Long") || inputType.equals("IntegerAsLong")) { assertEquals(msg, (Long) ((Tuple)output).get(0), (Long)getExpected(sumTypes[k]), 0.00001); } else { assertEquals(msg, (Double) ((Tuple)output).get(0), (Double)getExpected(sumTypes[k]), 0.00001); } } } @Test public void testSUMFinal() throws Exception { String[] sumTypes = {"SUMFinal", "DoubleSumFinal", "LongSumFinal", "IntSumFinal", "FloatSumFinal"}; for(int k = 0; k < sumTypes.length; k++) { EvalFunc<?> sum = evalFuncMap.get(sumTypes[k]); String inputType = getInputType(sumTypes[k]); Tuple tup = inputMap.get(inputType); Object output = sum.exec(tup); String msg = "[Testing " + sumTypes[k] + " on input type: " + getInputType(sumTypes[k]) + " ( (output) " + output + " == 
" + getExpected(sumTypes[k]) + " (expected) )]"; if(inputType.equals("Integer") || inputType.equals("Long") || inputType.equals("IntegerAsLong")) { assertEquals(msg, (Long)output, (Long)getExpected(sumTypes[k]), 0.00001); } else { assertEquals(msg, (Double)output, (Double)getExpected(sumTypes[k]), 0.00001); } } } @Test public void testMIN() throws Exception { String[] minTypes = {"MIN", "LongMin", "IntMin", "FloatMin"}; for(int k = 0; k < minTypes.length; k++) { EvalFunc<?> min = evalFuncMap.get(minTypes[k]); String inputType = getInputType(minTypes[k]); Tuple tup = inputMap.get(inputType); Object output = min.exec(tup); String msg = "[Testing " + minTypes[k] + " on input type: " + getInputType(minTypes[k]) + " ( (output) " + output + " == " + getExpected(minTypes[k]) + " (expected) )]"; if(inputType == "ByteArray") { assertEquals(msg, (Double)output, (Double)getExpected(minTypes[k])); } else if(inputType == "Long") { assertEquals(msg, (Long)output, (Long)getExpected(minTypes[k])); } else if(inputType == "Integer") { assertEquals(msg, (Integer)output, (Integer)getExpected(minTypes[k])); } else if (inputType == "Double") { assertEquals(msg, (Double)output, (Double)getExpected(minTypes[k])); } else if (inputType == "Float") { assertEquals(msg, (Float)output, (Float)getExpected(minTypes[k])); } else if (inputType == "String") { assertEquals(msg, (String)output, (String)getExpected(minTypes[k])); } } } @Test public void testMINIntermediate() throws Exception { String[] minTypes = {"MINIntermediate", "LongMinIntermediate", "IntMinIntermediate", "FloatMinIntermediate"}; for(int k = 0; k < minTypes.length; k++) { EvalFunc<?> min = evalFuncMap.get(minTypes[k]); String inputType = getInputType(minTypes[k]); Tuple tup = inputMap.get(inputType); Object output = min.exec(tup); String msg = "[Testing " + minTypes[k] + " on input type: " + getInputType(minTypes[k]) + " ( (output) " + ((Tuple)output).get(0) + " == " + getExpected(minTypes[k]) + " (expected) )]"; if(inputType == 
"ByteArray") { assertEquals(msg, (Double)((Tuple)output).get(0), (Double)getExpected(minTypes[k])); } else if(inputType == "Long") { assertEquals(msg, (Long)((Tuple)output).get(0), (Long)getExpected(minTypes[k])); } else if(inputType == "Integer") { assertEquals(msg, (Integer)((Tuple)output).get(0), (Integer)getExpected(minTypes[k])); } else if (inputType == "Double") { assertEquals(msg, (Double)((Tuple)output).get(0), (Double)getExpected(minTypes[k])); } else if (inputType == "Float") { assertEquals(msg, (Float)((Tuple)output).get(0), (Float)getExpected(minTypes[k])); } else if (inputType == "String") { assertEquals(msg, (String)((Tuple)output).get(0), (String)getExpected(minTypes[k])); } } } @Test public void testMINFinal() throws Exception { String[] minTypes = {"MINFinal", "LongMinFinal", "IntMinFinal", "FloatMinFinal"}; for(int k = 0; k < minTypes.length; k++) { EvalFunc<?> min = evalFuncMap.get(minTypes[k]); String inputType = getInputType(minTypes[k]); Tuple tup = inputMap.get(inputType); Object output = min.exec(tup); String msg = "[Testing " + minTypes[k] + " on input type: " + getInputType(minTypes[k]) + " ( (output) " + output + " == " + getExpected(minTypes[k]) + " (expected) )]"; if(inputType == "ByteArray") { assertEquals(msg, (Double)output, (Double)getExpected(minTypes[k])); } else if(inputType == "Long") { assertEquals(msg, (Long)output, (Long)getExpected(minTypes[k])); } else if(inputType == "Integer") { assertEquals(msg, (Integer)output, (Integer)getExpected(minTypes[k])); } else if (inputType == "Double") { assertEquals(msg, (Double)output, (Double)getExpected(minTypes[k])); } else if (inputType == "Float") { assertEquals(msg, (Float)output, (Float)getExpected(minTypes[k])); } else if (inputType == "String") { assertEquals(msg, (String)output, (String)getExpected(minTypes[k])); } } } @Test public void testMAX() throws Exception { String[] maxTypes = {"MAX", "LongMax", "IntMax", "FloatMax"}; for(int k = 0; k < maxTypes.length; k++) { EvalFunc<?> 
max = evalFuncMap.get(maxTypes[k]); String inputType = getInputType(maxTypes[k]); Tuple tup = inputMap.get(inputType); Object output = max.exec(tup); String msg = "[Testing " + maxTypes[k] + " on input type: " + getInputType(maxTypes[k]) + " ( (output) " + output + " == " + getExpected(maxTypes[k]) + " (expected) )]"; if(inputType == "ByteArray") { assertEquals(msg, (Double)output, (Double)getExpected(maxTypes[k])); } else if(inputType == "Long") { assertEquals(msg, (Long)output, (Long)getExpected(maxTypes[k])); } else if(inputType == "Integer") { assertEquals(msg, (Integer)output, (Integer)getExpected(maxTypes[k])); } else if (inputType == "Double") { assertEquals(msg, (Double)output, (Double)getExpected(maxTypes[k])); } else if (inputType == "Float") { assertEquals(msg, (Float)output, (Float)getExpected(maxTypes[k])); } else if (inputType == "String") { assertEquals(msg, (String)output, (String)getExpected(maxTypes[k])); } } } @Test public void testMAXIntermed() throws Exception { String[] maxTypes = {"MAXIntermediate", "LongMaxIntermediate", "IntMaxIntermediate", "FloatMaxIntermediate"}; for(int k = 0; k < maxTypes.length; k++) { EvalFunc<?> max = evalFuncMap.get(maxTypes[k]); String inputType = getInputType(maxTypes[k]); Tuple tup = inputMap.get(inputType); Object output = max.exec(tup); String msg = "[Testing " + maxTypes[k] + " on input type: " + getInputType(maxTypes[k]) + " ( (output) " + ((Tuple)output).get(0) + " == " + getExpected(maxTypes[k]) + " (expected) )]"; if(inputType == "ByteArray") { assertEquals(msg, (Double)((Tuple)output).get(0), (Double)getExpected(maxTypes[k])); } else if(inputType == "Long") { assertEquals(msg, (Long)((Tuple)output).get(0), (Long)getExpected(maxTypes[k])); } else if(inputType == "Integer") { assertEquals(msg, (Integer)((Tuple)output).get(0), (Integer)getExpected(maxTypes[k])); } else if (inputType == "Double") { assertEquals(msg, (Double)((Tuple)output).get(0), (Double)getExpected(maxTypes[k])); } else if (inputType == 
"Float") { assertEquals(msg, (Float)((Tuple)output).get(0), (Float)getExpected(maxTypes[k])); } else if (inputType == "String") { assertEquals(msg, (String)((Tuple)output).get(0), (String)getExpected(maxTypes[k])); } } } @Test public void testMAXFinal() throws Exception { String[] maxTypes = {"MAXFinal", "LongMaxFinal", "IntMaxFinal", "FloatMaxFinal"}; for(int k = 0; k < maxTypes.length; k++) { EvalFunc<?> max = evalFuncMap.get(maxTypes[k]); String inputType = getInputType(maxTypes[k]); Tuple tup = inputMap.get(inputType); Object output = max.exec(tup); String msg = "[Testing " + maxTypes[k] + " on input type: " + getInputType(maxTypes[k]) + " ( (output) " + output + " == " + getExpected(maxTypes[k]) + " (expected) )]"; if(inputType == "ByteArray") { assertEquals(msg, (Double)output, (Double)getExpected(maxTypes[k])); } else if(inputType == "Long") { assertEquals(msg, (Long)output, (Long)getExpected(maxTypes[k])); } else if(inputType == "Integer") { assertEquals(msg, (Integer)output, (Integer)getExpected(maxTypes[k])); } else if (inputType == "Double") { assertEquals(msg, (Double)output, (Double)getExpected(maxTypes[k])); } else if (inputType == "Float") { assertEquals(msg, (Float)output, (Float)getExpected(maxTypes[k])); } else if (inputType == "String") { assertEquals(msg, (String)output, (String)getExpected(maxTypes[k])); } } } @Test public void testMathFuncs() throws Exception { Random generator = new Random(); generator.setSeed(System.currentTimeMillis()); Double delta = 0.1; // We assume that UDFs are stored in org.apache.pig.builtin // Change this test case if we add more hierarchy later\ // Also, we assume that we have a function with math function // associated with these UDF with a lowercase name String[] mathFuncs = { "SIN", "SINH", "ASIN", "COS", "COSH", "ACOS", "TAN", "TANH", "ATAN", "LOG", "LOG10", "SQRT", "CEIL", "EXP", "FLOOR", "CBRT" }; String udfPackage = "org.apache.pig.builtin."; //String[] mathNonStdFuncs = {}; EvalFunc<Double> evalFunc; Tuple 
tup; Double input, actual, expected; Method mathMethod; String msg; for(String func: mathFuncs) { evalFunc = (EvalFunc<Double>) Class.forName(udfPackage + func).newInstance(); tup = TupleFactory.getInstance().newTuple(1); // double value between 0.0 and 1.0 input = generator.nextDouble(); tup.set(0, input); mathMethod = Math.class.getDeclaredMethod(func.toLowerCase(), double.class); actual = evalFunc.exec(tup); expected = (Double)mathMethod.invoke(null, input); msg = "[Testing " + func + " on input: " + input + " ( (actual) " + actual + " == " + expected + " (expected) )]"; assertEquals(msg, actual, expected, delta); } } @Test public void testStringFuncs() throws Exception { // Since String functions are trivial we add test on per case basis String inputStr = "Hello World!"; String inputStrLower = "hello world!"; String inputStrUpper = "HELLO WORLD!"; String inputStrCamel = "hello World!"; String inputStroWitha = "Hella Warld!"; String inpuStrExtra = "Hello World! "; List<Object> l = new LinkedList<Object>(); l.add(inputStr); l.add("o"); String expected = null; Tuple input; String output; Integer intOutput; EvalFunc<String> strFunc; EvalFunc<Integer> intFunc; strFunc = new LCFIRST(); input = TupleFactory.getInstance().newTuple(inputStr); expected = inputStrCamel; output = strFunc.exec(input); assertTrue(output.equals(expected)); strFunc = new UCFIRST(); input = TupleFactory.getInstance().newTuple(inputStrCamel); expected = inputStr; output = strFunc.exec(input); assertTrue(output.equals(expected)); intFunc = new LAST_INDEX_OF(); input = TupleFactory.getInstance().newTuple(l); intOutput = intFunc.exec(input); assertTrue(intOutput.intValue()==7); intFunc = new INDEXOF(); input = TupleFactory.getInstance().newTuple(l); intOutput = intFunc.exec(input); assertTrue(intOutput.intValue()==4); strFunc = new UPPER(); input = TupleFactory.getInstance().newTuple(inputStr); expected = inputStrUpper; output = strFunc.exec(input); assertTrue(output.equals(expected)); strFunc = 
new LOWER(); input = TupleFactory.getInstance().newTuple(inputStr); expected = inputStrLower; output = strFunc.exec(input); assertTrue(output.equals(expected)); strFunc = new REPLACE(); l.clear(); l.add(inputStr); l.add("o"); l.add("a"); input = TupleFactory.getInstance().newTuple(l); expected = inputStroWitha; output = strFunc.exec(input); assertTrue(output.equals(expected)); strFunc = new SUBSTRING(); l.clear(); l.add(inputStr); l.add(1); l.add(5); input = TupleFactory.getInstance().newTuple(l); expected = "ello"; output = strFunc.exec(input); assertTrue(output.equals(expected)); strFunc = new TRIM(); input = TupleFactory.getInstance().newTuple(inpuStrExtra); expected = inputStr; output = strFunc.exec(input); assertTrue(output.equals(expected)); STRSPLIT splitter = new STRSPLIT(); Tuple test1 = TupleFactory.getInstance().newTuple(1); Tuple test2 = TupleFactory.getInstance().newTuple(2); Tuple test3 = TupleFactory.getInstance().newTuple(3); test2.set(0, "foo"); test2.set(1, ":"); Tuple splits = splitter.exec(test2); assertEquals("no matches should return tuple with original string", 1, splits.size()); assertEquals("no matches should return tuple with original string", "foo", splits.get(0)); // test default delimiter test1.set(0, "f ooo bar"); splits = splitter.exec(test1); assertEquals("split on default value ", 3, splits.size()); assertEquals("f", splits.get(0)); assertEquals("ooo", splits.get(1)); assertEquals("bar", splits.get(2)); // test trimming of whitespace test1.set(0, "foo bar "); splits = splitter.exec(test1); assertEquals("whitespace trimmed if no length arg", 2, splits.size()); // test forcing null matches with length param test3.set(0, "foo bar "); test3.set(1, "\\s"); test3.set(2, 10); splits = splitter.exec(test3); assertEquals("length forces empty string matches on end", 5, splits.size()); // test limiting results with limit test3.set(0, "foo:bar:baz"); test3.set(1, ":"); test3.set(2, 2); splits = splitter.exec(test3); assertEquals(2, 
splits.size()); assertEquals("foo", splits.get(0)); assertEquals("bar:baz", splits.get(1)); Tuple t1 = TupleFactory.getInstance().newTuple(3); t1.set(0, "/search/iy/term1/test"); t1.set(1, "^\\/search\\/iy\\/(.*?)\\/.*"); t1.set(2, 1); Tuple t2 = TupleFactory.getInstance().newTuple(3); t2.set(0, "/search/iy/term1/test"); t2.set(1, "^\\/search\\/iy\\/(.*?)\\/.*"); t2.set(2, 2); Tuple t3 = TupleFactory.getInstance().newTuple(3); t3.set(0, null); t3.set(1, "^\\/search\\/iy\\/(.*?)\\/.*"); t3.set(2, 2); REGEX_EXTRACT func = new REGEX_EXTRACT(); String r = func.exec(t1); assertTrue(r.equals("term1")); r = func.exec(t2); assertTrue(r==null); r = func.exec(t3); assertTrue(r==null); String matchRegex = "^(.+)\\b\\s+is a\\s+\\b(.+)$"; TupleFactory tupleFactory = TupleFactory.getInstance(); Tuple te1 = tupleFactory.newTuple(2); te1.set(0,"this is a match"); te1.set(1, matchRegex); Tuple te2 = tupleFactory.newTuple(2); te2.set(0, "no match"); te2.set(1, matchRegex); Tuple te3 = tupleFactory.newTuple(2); te3.set(0, null); te3.set(1, matchRegex); REGEX_EXTRACT_ALL funce = new REGEX_EXTRACT_ALL(); Tuple re = funce.exec(te1); assertEquals(re.size(), 2); assertEquals("this", re.get(0)); assertEquals("match", re.get(1)); re = funce.exec(te2); assertTrue(re==null); re = funce.exec(te3); assertTrue(re==null); } @Test public void testStatsFunc() throws Exception { COV cov = new COV("a","b"); DataBag dBag = DefaultBagFactory.getInstance().newDefaultBag(); Tuple tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 1.0); dBag.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 4.0); dBag.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 8.0); dBag.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 4.0); dBag.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 7.0); dBag.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 8.0); dBag.add(tup1); DataBag dBag1 = 
DefaultBagFactory.getInstance().newDefaultBag(); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 2.0); dBag1.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 2.0); dBag1.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 3.0); dBag1.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 3.0); dBag1.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 2.0); dBag1.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 4.0); dBag1.add(tup1); Tuple input = TupleFactory.getInstance().newTuple(2); input.set(0, dBag); input.set(1, dBag1); DataBag output = cov.exec(input); Iterator<Tuple> it = output.iterator(); Tuple ans = (Tuple)it.next(); assertEquals((String)ans.get(0),"a"); assertEquals((String)ans.get(1),"b"); assertEquals(1.11111, (Double)ans.get(2),0.0005); COR cor = new COR("a","b"); dBag = DefaultBagFactory.getInstance().newDefaultBag(); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 1.0); dBag.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 4.0); dBag.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 8.0); dBag.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 4.0); dBag.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 7.0); dBag.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 8.0); dBag.add(tup1); dBag1 = DefaultBagFactory.getInstance().newDefaultBag(); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 2.0); dBag1.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 2.0); dBag1.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 3.0); dBag1.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 3.0); dBag1.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 2.0); dBag1.add(tup1); tup1 = TupleFactory.getInstance().newTuple(1); tup1.set(0, 4.0); dBag1.add(tup1); input = 
TupleFactory.getInstance().newTuple(2); input.set(0, dBag); input.set(1, dBag1); output = cor.exec(input); it = output.iterator(); ans = (Tuple) it.next(); assertEquals((String)ans.get(0),"a"); assertEquals((String)ans.get(1),"b"); assertEquals(0.582222509739582, (Double)ans.get(2) ,0.0005); } private void checkItemsGT(Iterable<Tuple> tuples, int field, int limit) throws ExecException { for (Tuple t : tuples) { Long val = (Long) t.get(field); assertTrue("Value "+ val + " exceeded the expected limit", val > limit); } } @Test public void testMiscFunc() throws Exception { //TEST TOBAG TOBAG tb = new TOBAG(); //test output schema of udf Schema expectedSch = Schema.generateNestedSchema(DataType.BAG, DataType.INTEGER); //check schema of TOBAG when given input tuple having only integers Schema inputSch = new Schema(); inputSch.add(new FieldSchema(null, DataType.INTEGER)); assertEquals("schema of tobag when input has only ints", expectedSch, tb.outputSchema(inputSch)); //add another int column inputSch.add(new FieldSchema(null, DataType.INTEGER)); assertEquals("schema of tobag when input has only ints", expectedSch, tb.outputSchema(inputSch)); //add a long column inputSch.add(new FieldSchema(null, DataType.LONG)); //expect null inner schema expectedSch = Schema.generateNestedSchema(DataType.BAG, DataType.NULL); assertEquals("schema of tobag when input has ints and long", expectedSch, tb.outputSchema(inputSch)); //test schema when input is a tuple with inner schema Schema tupInSchema = new Schema(new FieldSchema("x", DataType.CHARARRAY)); inputSch = new Schema(); inputSch.add(new FieldSchema("a", tupInSchema, DataType.TUPLE)); Schema inputSchCp = new Schema(inputSch); inputSchCp.getField(0).alias = null; expectedSch = new Schema(new FieldSchema(null, inputSchCp, DataType.BAG)); assertEquals("schema of tobag when input has cols of type tuple ", expectedSch, tb.outputSchema(inputSch)); inputSch.add(new FieldSchema("b", tupInSchema, DataType.TUPLE)); assertEquals("schema of 
tobag when input has cols of type tuple ", expectedSch, tb.outputSchema(inputSch)); //add a column of type tuple with different inner schema tupInSchema = new Schema(new FieldSchema("x", DataType.BYTEARRAY)); inputSch.add(new FieldSchema("c", tupInSchema, DataType.TUPLE)); //expect null inner schema expectedSch = Schema.generateNestedSchema(DataType.BAG, DataType.NULL); assertEquals("schema of tobag when input has cols of type tuple with diff inner schema", expectedSch, tb.outputSchema(inputSch)); Tuple input = TupleFactory.getInstance().newTuple(); for (int i = 0; i < 100; ++i) { input.append(i); } //test null value in input input.append(null); Set<Integer> s = new HashSet<Integer>(); DataBag db = tb.exec(input); for (Tuple t : db) { s.add((Integer) t.get(0)); } // finally check the bag had everything we put in the tuple. assertEquals(101, s.size()); for (int i = 0; i < 100; ++i) { assertTrue(s.contains(i)); } assertTrue("null in tobag result", s.contains(null)); TOTUPLE tt = new TOTUPLE(); input = TupleFactory.getInstance().newTuple(); for (int i = 0; i < 100; ++i) { input.append(i); } Tuple output = tt.exec(input); assertTrue(!(input == output)); assertEquals(input, output); TOP top = new TOP(); TupleFactory tupleFactory = TupleFactory.getInstance(); BagFactory bagFactory = DefaultBagFactory.getInstance(); Tuple inputTuple = tupleFactory.newTuple(3); DataBag dBag = bagFactory.newDefaultBag(); // set N = 10 i.e retain top 10 tuples inputTuple.set(0, 10); // compare tuples by field number 1 inputTuple.set(1, 1); // set the data bag containing the tuples inputTuple.set(2, dBag); // generate tuples of the form (group-1, 1), (group-2, 2) ... 
for (long i = 0; i < 100; i++) { Tuple nestedTuple = tupleFactory.newTuple(2); nestedTuple.set(0, "group-" + i); nestedTuple.set(1, i); dBag.add(nestedTuple); } DataBag outBag = top.exec(inputTuple); assertEquals(outBag.size(), 10L); checkItemsGT(outBag, 1, 89); // two initial results Tuple init1 = (new TOP.Initial()).exec(inputTuple); Tuple init2 = (new TOP.Initial()).exec(inputTuple); // two intermediate results DataBag intermedBag = bagFactory.newDefaultBag(); intermedBag.add(init1); intermedBag.add(init2); Tuple intermedInput = tupleFactory.newTuple(intermedBag); Tuple intermedOutput1 = (new TOP.Intermed()).exec(intermedInput); Tuple intermedOutput2 = (new TOP.Intermed()).exec(intermedInput); checkItemsGT((DataBag)intermedOutput1.get(2), 1, 94); // final result DataBag finalInputBag = bagFactory.newDefaultBag(); finalInputBag.add(intermedOutput1); finalInputBag.add(intermedOutput2); Tuple finalInput = tupleFactory.newTuple(finalInputBag); outBag = (new TOP.Final()).exec(finalInput); assertEquals(outBag.size(), 10L); checkItemsGT(outBag, 1, 96); } @Test public void testDistinct() throws Exception { Integer[] inp = new Integer[] { 1, 2 , 3, 1 ,4, 5, 3}; DataBag inputBag = Util.createBagOfOneColumn(inp); EvalFunc<Tuple> initial = new Distinct.Initial(); DataBag intermedInputBg1 = bagFactory.newDefaultBag(); DataBag intermedInputBg2 = bagFactory.newDefaultBag(); int i = 0; for (Tuple t : inputBag) { Tuple initialOutput = initial.exec(tupleFactory.newTuple(t)); if(i < inp.length/2 ) { intermedInputBg1.add(initialOutput); } else { intermedInputBg2.add(initialOutput); } i++; } EvalFunc<Tuple> intermed = new Distinct.Intermediate(); DataBag finalInputBg = bagFactory.newDefaultBag(); finalInputBg.add(intermed.exec(tupleFactory.newTuple(intermedInputBg1))); finalInputBg.add(intermed.exec(tupleFactory.newTuple(intermedInputBg2))); EvalFunc<DataBag> fin = new Distinct.Final(); DataBag result = fin.exec(tupleFactory.newTuple(finalInputBg)); Integer[] exp = new Integer[] { 
1, 2, 3, 4, 5}; DataBag expectedBag = Util.createBagOfOneColumn(exp); assertEquals(expectedBag, result); } @Test public void testDistinctProgressNonAlgebraic() throws Exception { //This test is for the exec method in Distinct which is not //called currently. int inputSize = 2002; Integer[] inp = new Integer[inputSize]; for(int i = 0; i < inputSize; i+=2) { inp[i] = i/2; inp[i+1] = i/2; } DataBag inputBag = Util.createBagOfOneColumn(inp); EvalFunc<DataBag> distinct = new Distinct(); DataBag result = distinct.exec(tupleFactory.newTuple(inputBag)); Integer[] exp = new Integer[inputSize/2]; for(int j = 0; j < inputSize/2; ++j) { exp[j] = j; } DataBag expectedBag = Util.createBagOfOneColumn(exp); assertEquals(expectedBag, result); } @Test public void testCONCAT() throws Exception { // DataByteArray concat byte[] a = {1,2,3}; byte[] b = {4,5,6}; byte[] expected = {1,2,3,4,5,6}; DataByteArray dbaExpected = new DataByteArray(expected); DataByteArray dbaA = new DataByteArray(a); DataByteArray dbaB = new DataByteArray(b); EvalFunc<DataByteArray> concat = new CONCAT(); Tuple t = TupleFactory.getInstance().newTuple(2); t.set(0, dbaA); t.set(1, dbaB); DataByteArray result = concat.exec(t); String msg = "[Testing CONCAT on input type: bytearray]"; assertTrue(msg, result.equals(dbaExpected)); // String concat String s1 = "unit "; String s2 = "test"; String exp = "unit test"; EvalFunc<String> sConcat = new StringConcat(); Tuple ts = TupleFactory.getInstance().newTuple(2); ts.set(0, s1); ts.set(1, s2); String res = sConcat.exec(ts); msg = "[Testing StringConcat on input type: String]"; assertTrue(msg, res.equals(exp)); } @Test public void testMultiCONCAT() throws Exception { // DataByteArray concat byte[] a = {1,2,3}; byte[] b = {4,5,6}; byte[] c = {7,8,9}; byte[] d = {10,11,12}; byte[] e = {13,14,15}; byte[] expected = {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15}; DataByteArray dbaExpected = new DataByteArray(expected); DataByteArray dbaA = new DataByteArray(a); DataByteArray dbaB = new 
DataByteArray(b); DataByteArray dbaC = new DataByteArray(c); DataByteArray dbaD = new DataByteArray(d); DataByteArray dbaE = new DataByteArray(e); EvalFunc<DataByteArray> concat = new CONCAT(); Tuple t = TupleFactory.getInstance().newTuple(5); t.set(0, dbaA); t.set(1, dbaB); t.set(2, dbaC); t.set(3, dbaD); t.set(4, dbaE); DataByteArray result = concat.exec(t); String msg = "[Testing CONCAT on >2 tuples for input type: bytearray]"; assertTrue(msg, result.equals(dbaExpected)); // String concat String s1 = "high "; String s2 = "fives "; String s3 = "kick "; String s4 = "ass "; String s5 = "yo"; String exp = "high fives kick ass yo"; EvalFunc<String> sConcat = new StringConcat(); Tuple ts = TupleFactory.getInstance().newTuple(5); ts.set(0, s1); ts.set(1, s2); ts.set(2, s3); ts.set(3, s4); ts.set(4, s5); String res = sConcat.exec(ts); msg = "[Testing StringConcat on >2 tuples input type: String]"; assertTrue(msg, res.equals(exp)); } @Test public void testSIZE() throws Exception { // DataByteArray size byte[] a = {1,2,3}; DataByteArray dba = new DataByteArray(a); Long expected = new Long(3); Tuple t = TupleFactory.getInstance().newTuple(1); t.set(0, dba); EvalFunc<Long> size = new SIZE(); String msg = "[Testing SIZE on input type: bytearray]"; assertTrue(msg, expected.equals(size.exec(t))); // String size String s = "Unit test case"; expected = new Long(14); t.set(0, s); size = new StringSize(); msg = "[Testing StringSize on input type: String]"; assertTrue(msg, expected.equals(size.exec(t))); // Map size String[] mapContents = new String[]{"key1", "value1", "key2", "value2"}; Map<String, Object> map = Util.createMap(mapContents); expected = new Long(2); t.set(0, map); size = new MapSize(); msg = "[Testing MapSize on input type: Map]"; assertTrue(msg, expected.equals(size.exec(t))); // Bag size Tuple t1 = Util.createTuple(new String[]{"a", "b", "c"}); Tuple t2 = Util.createTuple(new String[]{"d", "e", "f"}); Tuple t3 = Util.createTuple(new String[]{"g", "h", "i"}); Tuple 
t4 = Util.createTuple(new String[]{"j", "k", "l"});
        DataBag b = Util.createBag(new Tuple[]{t1, t2, t3, t4});
        expected = new Long(4);
        t.set(0, b);
        size = new BagSize();
        msg = "[Testing BagSize on input type: Bag]";
        assertTrue(msg, expected.equals(size.exec(t)));

        // Tuple size
        expected = new Long(3);
        size = new TupleSize();
        msg = "[Testing TupleSize on input type: Tuple]";
        assertTrue(msg, expected.equals(size.exec(t1)));

        // Test for ARITY function.
        // It is deprecated but we still need to make sure it works
        ARITY arrity = new ARITY();
        msg = "[Testing ARRITY on input type: Tuple]";
        assertTrue(msg, expected.equals(new Long(arrity.exec(t1))));
    }

    // Builtin APPLY Functions
    // ========================

    // Builtin LOAD Functions
    // =======================

    /**
     * Loads small files through {@link PigStorage} wrapped in a
     * {@link ReadToEndLoader} and checks the number of fields per tuple:
     * colon-delimited single records (plain, with a leading delimiter, with a
     * trailing delimiter) and a bulk file read with the default delimiter.
     */
    @Test
    public void testLFPig() throws Exception {
        assertColonDelimitedArity("this:is:delimited:by:a:colon\n", 6);

        // Bulk load: loopCount * loopCount records of three fields each,
        // read with the default PigStorage delimiter.
        final int loopCount = 100;
        String[] input = new String[loopCount * loopCount];
        int n = 0;
        for (int i = 0; i < loopCount; i++) {
            for (int j = 0; j < loopCount; j++) {
                input[n++] = (i + "\t" + i + "\t" + j % 2);
            }
        }
        Util.createInputFile(cluster, "input.txt", input);
        try {
            LoadFunc p15 = new ReadToEndLoader(new PigStorage(),
                    ConfigurationUtil.toConfiguration(cluster.getProperties()), "input.txt", 0);
            int count = 0;
            while (true) {
                Tuple f15 = p15.getNext();
                if (f15 == null) {
                    break;
                }
                count++;
                assertEquals(3, f15.size());
            }
            assertEquals(loopCount * loopCount, count);
        } finally {
            // "input.txt" is reused by the following stages, so it must be
            // removed even when an assertion above fails.
            Util.deleteFile(cluster, "input.txt");
        }

        // A leading or trailing delimiter still yields six fields.
        assertColonDelimitedArity(":this:has:a:leading:colon\n", 6);
        assertColonDelimitedArity("this:has:a:trailing:colon:\n", 6);
    }

    /**
     * Writes {@code line} to "input.txt" on the cluster, loads it with a
     * colon-delimited {@link PigStorage} and asserts the size of the first
     * tuple. The file is deleted afterwards, whether or not the check passes.
     *
     * @param line          content of the single input record
     * @param expectedArity number of fields the first loaded tuple must have
     * @throws Exception if the file cannot be created, loaded or deleted
     */
    private void assertColonDelimitedArity(String line, int expectedArity) throws Exception {
        final String fileName = "input.txt";
        Util.createInputFile(cluster, fileName, new String[] {line});
        try {
            LoadFunc loader = new ReadToEndLoader(new PigStorage(":"),
                    ConfigurationUtil.toConfiguration(cluster.getProperties()), fileName, 0);
            Tuple first = loader.getNext();
            assertTrue("no tuple loaded for input: " + line, first != null);
            assertEquals(expectedArity, first.size());
        } finally {
            Util.deleteFile(cluster, fileName);
        }
    }

    /*
     * NOTE(review): disabled test, kept for reference. It appears to target an
     * older load/store API (bindTo, DataAtom) -- confirm before reviving or
     * removing it.
     *
    @Test
    public void testLFBin() throws Exception {

        BagFactory.init(new File("/tmp"));

        Tuple t1 = new Tuple(4);
        DataAtom a = new DataAtom("a");
        DataAtom b = new DataAtom("b");
        Tuple t2 = new Tuple(1);
        t2.setField(0,a);
        Tuple t3 = new Tuple(1);
        t3.setField(0, b);
        DataBag bag = BagFactory.getInstance().getNewBigBag();
        bag.add(t2);
        bag.add(t3);
        Tuple t4 = new Tuple(2);
        t4.setField(0, t2);
        t4.setField(1, t3);

        t1.setField(0, a);
        t1.setField(1, t2);
        t1.setField(2, bag);
        t1.setField(3, t4);

        Tuple t5 = new Tuple(4);
        DataAtom c = new DataAtom("the quick brown fox");
        DataAtom d = new DataAtom("jumps over the lazy dog");
        Tuple t6 = new Tuple(1);
        t6.setField(0,c);
        Tuple t7 = new Tuple(1);
        t7.setField(0, d);
        DataBag bag2 = BagFactory.getInstance().getNewBigBag();
        for(int i = 0; i < 10; i ++) {
            bag2.add(t6);
            bag2.add(t7);
        }
        Tuple t8 = new Tuple(2);
        t8.setField(0, t6);
        t8.setField(1, t7);

        t5.setField(0, c);
        t5.setField(1, t6);
        t5.setField(2, bag2);
        t5.setField(3, t8);

        OutputStream os = new FileOutputStream("/tmp/bintest.bin");
        StoreFunc s = new BinStorage();
        s.bindTo(os);
        s.putNext(t1);
        s.putNext(t5);
        s.finish();

        LoadFunc l = new BinStorage();
        InputStream is = FileLocalizer.open("/tmp/bintest.bin", new PigContext(ExecType.LOCAL));
        l.bindTo("/tmp/bintest.bin", new BufferedPositionedInputStream(is), 0, Long.MAX_VALUE);
        Tuple r1 = l.getNext();
        Tuple r2 = l.getNext();

        assertTrue(r1.equals(t1));
        assertTrue(r2.equals(t5));
    }
    */

    /**
     * Tests {@link TextLoader}. This also checks that {@link TextLoader} is
     * capable of reading data a couple of directories deep when the input
     * specified is the top-level directory, and that an empty file yields no
     * tuples.
     */
    @Test
    public void testLFText() throws Exception {
        String input1 = "This is some text.\nWith a newline in it.\n";
        String expected1 = "This is some text.";
        String expected2 = "With a newline in it.";
        final String nestedFile = "/tmp/testLFTextdir1/testLFTextdir2/testLFTest-input1.txt";

        Util.createInputFile(cluster, nestedFile, new String[] {input1});
        try {
            // Check that loading the top-level dir still reads the file a
            // couple of subdirs below.
            LoadFunc text1 = new ReadToEndLoader(new TextLoader(),
                    ConfigurationUtil.toConfiguration(cluster.getProperties()),
                    "/tmp/testLFTextdir1", 0);
            Tuple f1 = text1.getNext();
            Tuple f2 = text1.getNext();
            assertTrue("expected two tuples from " + nestedFile, f1 != null && f2 != null);
            assertEquals(expected1, f1.get(0).toString());
            assertEquals(expected2, f2.get(0).toString());
        } finally {
            // Remove the file under the exact path it was created with, so
            // the input does not outlive the test.
            Util.deleteFile(cluster, nestedFile);
        }

        final String emptyFile = "testLFTest-input2.txt";
        Util.createInputFile(cluster, emptyFile);
        try {
            LoadFunc text2 = new ReadToEndLoader(new TextLoader(),
                    ConfigurationUtil.toConfiguration(cluster.getProperties()), emptyFile, 0);
            Tuple f3 = text2.getNext();
            assertTrue(f3 == null);
        } finally {
            Util.deleteFile(cluster, emptyFile);
        }
    }

    /**
     * Stores a loaded relation through a Pig batch and checks that reading the
     * stored output back with {@link PigStorage} yields the original tuple.
     */
    @SuppressWarnings("unchecked")
    @Test
    public void testSFPig() throws Exception {
        final String inputFile = "testSFPig-input.txt";
        final String outputFile = "testSFPig-output.txt";

        String inputStr = "amy\tbob\tcharlene\tdavid\terin\tfrank";
        Util.createInputFile(cluster, inputFile, new String[] {inputStr});
        try {
            DataByteArray[] input = {
                new DataByteArray("amy"),
                new DataByteArray("bob"),
                new DataByteArray("charlene"),
                new DataByteArray("david"),
                new DataByteArray("erin"),
                new DataByteArray("frank")
            };
            Tuple f1 = Util.loadTuple(TupleFactory.getInstance().newTuple(input.length), input);

            String query = "a = load '" + inputFile + "';"
                    + "store a into '" + outputFile + "';";
            pigServer.setBatchOn();
            Util.registerMultiLineQuery(pigServer, query);
            pigServer.executeBatch();

            LoadFunc lfunc = new ReadToEndLoader(new PigStorage(),
                    ConfigurationUtil.toConfiguration(cluster.getProperties()), outputFile, 0);
            Tuple f2 = lfunc.getNext();
            assertEquals(f1, f2);
        } finally {
            // Clean up both files even if the batch or the comparison fails.
            Util.deleteFile(cluster, inputFile);
            Util.deleteFile(cluster, outputFile);
        }
    }

    /**
     * These are end-to-end tests to make sure that the function that maps
     * arguments to classes is properly set up. More comprehensive unit tests
     * are done in TestStringUDFs.
     */
    @Test
    @SuppressWarnings("unchecked")
    public void testStringUDFs() throws Exception {
        final String inputFile = "testStrUDFsIn.txt";
        String inputStr = "amy smith ";
        Util.createInputFile(cluster, inputFile, new String[] {inputStr});
        try {
            // test typed data
            pigServer.registerQuery("A = load 'testStrUDFsIn.txt' as (name: chararray);");
            pigServer.registerQuery("B = foreach A generate SUBSTRING(name, 0, 3), " +
                    "INDEXOF(name, 'a'), INDEXOF(name, 'a', 3), LAST_INDEX_OF(name, 'a'), REPLACE(name, 'a', 'b'), " +
                    "STRSPLIT(name), STRSPLIT(name, ' '), STRSPLIT(name, ' ', 0), TRIM(name);");

            Iterator<Tuple> it = pigServer.openIterator("B");
            assertTrue(it.hasNext());
            Tuple t = it.next();
            Tuple expected = Util.buildTuple("amy", "smith");
            assertTrue(!it.hasNext());
            assertEquals(9, t.size());
            assertEquals("amy", t.get(0));
            assertEquals(0, t.get(1));
            assertEquals(-1, t.get(2));
            assertEquals(0, t.get(3));
            assertEquals("bmy smith ", t.get(4));
            assertEquals(expected, t.get(5));
            assertEquals(expected, t.get(6));
            assertEquals(expected, t.get(7));
            assertEquals("amy smith", t.get(8));

            // test untyped data
            pigServer.registerQuery("A = load 'testStrUDFsIn.txt' as (name);");
            pigServer.registerQuery("B = foreach A generate SUBSTRING(name, 0, 3), " +
                    "LAST_INDEX_OF(name, 'a'), REPLACE(name, 'a', 'b'), TRIM(name);");

            it = pigServer.openIterator("B");
            assertTrue(it.hasNext());
            t = it.next();
            assertTrue(!it.hasNext());
            assertEquals(4, t.size());
            assertEquals("amy", t.get(0));
            assertEquals(0, t.get(1));
            assertEquals("bmy smith ", t.get(2));
            assertEquals("amy smith", t.get(3));
        } finally {
            // Like the other tests in this class, remove the input file so it
            // does not linger on the cluster.
            Util.deleteFile(cluster, inputFile);
        }
    }

    /**
     * Checks that {@link TOKENIZE} splits a string into one tuple per token
     * and returns {@code null} for a null field or an empty tuple.
     */
    @Test
    public void testTOKENIZE() throws Exception {
        TupleFactory tf = TupleFactory.getInstance();
        Tuple t1 = tf.newTuple(1);
        t1.set(0, "123 456\"789");
        Tuple t2 = tf.newTuple(1);
        t2.set(0, null);
        Tuple t3 = tf.newTuple(0);

        TOKENIZE f = new TOKENIZE();
        DataBag b = f.exec(t1);
        assertEquals(3, b.size());
        Iterator<Tuple> i = b.iterator();
        assertEquals("123", i.next().get(0));
        assertEquals("456", i.next().get(0));
        assertEquals("789", i.next().get(0));

        // A null field and a zero-size tuple both produce no bag at all.
        b = f.exec(t2);
        assertTrue(b == null);

        b = f.exec(t3);
        assertTrue(b == null);
    }

    /**
     * Checks {@link DIFF} on two bags (the result holds the elements present
     * in only one of them) and on two scalar values (empty result when equal,
     * both values when different).
     */
    @Test
    public void testDIFF() throws Exception {
        // Test it in the case with two bags.
        BagFactory bf = BagFactory.getInstance();
        TupleFactory tf = TupleFactory.getInstance();

        DataBag b1 = bf.newDefaultBag();
        DataBag b2 = bf.newDefaultBag();
        // Integer.valueOf keeps the argument boxed so the single-datum
        // newTuple(Object) overload is chosen; a bare int would instead be
        // taken as a tuple size.
        for (int i = 0; i < 10; i++) {
            b1.add(tf.newTuple(Integer.valueOf(i)));
        }
        for (int i = 0; i < 10; i += 2) {
            b2.add(tf.newTuple(Integer.valueOf(i)));
        }
        Tuple t = tf.newTuple(2);
        t.set(0, b1);
        t.set(1, b2);
        DIFF d = new DIFF();
        DataBag result = d.exec(t);

        assertEquals(5, result.size());
        Iterator<Tuple> i = result.iterator();
        int[] values = new int[5];
        for (int j = 0; j < 5; j++) {
            values[j] = (Integer) i.next().get(0);
        }
        // Sort before comparing: only the set of values is checked, not the
        // iteration order of the result bag.
        Arrays.sort(values);
        for (int j = 1; j < 10; j += 2) {
            assertEquals(j, values[j / 2]);
        }

        // Test it in the case of two objects that are equal
        t = tf.newTuple(2);
        t.set(0, Integer.valueOf(1));
        t.set(1, Integer.valueOf(1));
        result = d.exec(t);
        assertEquals(0, result.size());

        // Test it in the case of two objects that are not equal
        t = tf.newTuple(2);
        t.set(0, Integer.valueOf(1));
        t.set(1, Integer.valueOf(2));
        result = d.exec(t);
        assertEquals(2, result.size());
    }

    /**
     * @param typeFor function name for which the input type is sought
     * @return the entry registered for {@code typeFor} in {@code allowedInput},
     *         or {@code null} if there is none
     */
    private static String getInputType(String typeFor) {
        return allowedInput.get(typeFor);
    }

    /**
     * @param expectedFor function name for which the expected result is sought
     * @return the entry registered for {@code expectedFor} in
     *         {@code expectedMap}, or {@code null} if there is none
     */
    private Object getExpected(String expectedFor) {
        return expectedMap.get(expectedFor);
    }
}