/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.test; import static junit.framework.Assert.*; import java.util.Iterator; import java.util.List; import org.apache.pig.ExecType; import org.apache.pig.PigServer; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.builtin.BagToString; import org.apache.pig.builtin.BagToTuple; import org.apache.pig.builtin.mock.Storage.Data; import org.apache.pig.data.BagFactory; import org.apache.pig.data.DataBag; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.apache.pig.impl.logicalLayer.schema.Schema; import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; import static org.apache.pig.builtin.mock.Storage.*; import org.junit.Test; /** * * Test cases for BagToTuple and BagToString UDFs * * @author hluu * */ public class TestBuiltInBagToTupleOrString { private BagFactory bf = BagFactory.getInstance(); private TupleFactory tf = TupleFactory.getInstance(); @Test public void testNullInputBagToTupleUDF() throws Exception { BagToTuple udf = new BagToTuple(); Tuple udfInput = tf.newTuple(1); udfInput.set(0, null); Tuple output = udf.exec(udfInput); assertNull(output); } @Test public void testBasicBagToTupleUDF() throws Exception { Tuple t1 = tf.newTuple(2); t1.set(0, "a"); t1.set(1, 5); Tuple t2 = tf.newTuple(2); t2.set(0, "c"); t2.set(1, 6); DataBag bag = bf.newDefaultBag(); bag.add(t1); bag.add(t2); Tuple udfInput = tf.newTuple(1); udfInput.set(0, bag); // invoking UDF BagToTuple udf = new BagToTuple(); Tuple result = udf.exec(udfInput); int totalExpectedSize = t1.size() + t2.size(); assertEquals(totalExpectedSize, result.size()); for (int i = 0; i < t1.size(); i++) { assertEquals(t1.get(i), result.get(i)); } for (int i = 0; i < t2.size(); i++) { assertEquals(t2.get(i), result.get(t1.size() + i)); } } @Test public void testNonuniformTuplesInBagForBagToTupleUDF() throws Exception { Tuple t1 = tf.newTuple(2); t1.set(0, "a"); t1.set(1, 5); Tuple t2 = tf.newTuple(3); t2.set(0, "b"); t2.set(1, 6); t2.set(2, 7); Tuple t3 = tf.newTuple(4); t3.set(0, "c"); t3.set(1, 8); t3.set(2, 9.7); t3.set(3, 10); DataBag bag = bf.newDefaultBag(); bag.add(t1); bag.add(t2); bag.add(t3); Tuple udfInput = tf.newTuple(1); udfInput.set(0, bag); // invoking UDF BagToTuple udf = new BagToTuple(); Tuple outputTuple = udf.exec(udfInput); int totalExpectedSize = t1.size() + t2.size() + t3.size(); assertEquals(totalExpectedSize, outputTuple.size()); for (int i = 0; i < t1.size(); i++) { assertEquals(t1.get(i), outputTuple.get(i)); } for (int i = 0; i < t2.size(); i++) { assertEquals(t2.get(i), outputTuple.get(t1.size() + i)); } int startIndex = t1.size() + t2.size(); for (int i = 0; i < t3.size(); i++) { assertEquals(t3.get(i), outputTuple.get(startIndex + i)); } } @Test public void testNestedDataElementsForBagToTupleUDF() throws Exception { DataBag inputBag = buildBagWithNestedTupleAndBag(); BagToTuple udf = new BagToTuple(); Tuple udfInput = tf.newTuple(1); udfInput.set(0, inputBag); Tuple outputTuple = udf.exec(udfInput); Iterator<Tuple> inputBagIterator = inputBag.iterator(); Tuple firstTuple = inputBagIterator.next(); for (int i = 0; i < firstTuple.size(); i++) { assertEquals(firstTuple.get(i), outputTuple.get(i)); } Tuple secondTuple = inputBagIterator.next(); for (int i = 0; i < secondTuple.size(); i++) { assertEquals(secondTuple.get(i), outputTuple.get(firstTuple.size() + i)); } int startIndex = firstTuple.size() + secondTuple.size(); Tuple thirdTuple = inputBagIterator.next(); for (int i = 0; i < thirdTuple.size(); i++) { assertEquals(thirdTuple.get(i), outputTuple.get(startIndex + i)); } } @Test public void testOutputSchemaForBagToTupleUDF() throws Exception { Schema expectedSch = Schema.generateNestedSchema(DataType.TUPLE, DataType.INTEGER, DataType.CHARARRAY); FieldSchema tupSch = new FieldSchema(null, DataType.TUPLE); tupSch.schema = new Schema(); tupSch.schema.add(new FieldSchema(null, DataType.INTEGER)); tupSch.schema.add(new FieldSchema(null, DataType.CHARARRAY)); FieldSchema bagSch = new FieldSchema(null, DataType.BAG); bagSch.schema = new Schema(tupSch); Schema inputSch = new Schema(); inputSch.add(bagSch); BagToTuple udf = new BagToTuple(); Schema outputSchema = udf.outputSchema(inputSch); assertEquals("schema of BagToTuple input", expectedSch.size(), outputSchema.size()); assertTrue("schema of BagToTuple input", Schema.equals(expectedSch, outputSchema, false, true)); } @Test(expected=org.apache.pig.backend.executionengine.ExecException.class) public void testInvalidInputToBagToTupleUDF() throws Exception { TupleFactory tf = TupleFactory.getInstance(); Tuple udfInput = tf.newTuple(1); // input contains tuple instead of bag udfInput.set(0, tf.newTuple()); BagToTuple udf = new BagToTuple(); // expecting an exception because the input if of type Tuple, not DataBag udf.exec(udfInput); } @Test public void testNullInputBagToStringUDF() throws Exception { BagToString udf = new BagToString(); Tuple udfInput = tf.newTuple(1); udfInput.set(0, null); String output = udf.exec(udfInput); assertNull(output); } @Test(expected=org.apache.pig.backend.executionengine.ExecException.class) public void testInvalidInputForBagToStringUDF() throws Exception { TupleFactory tf = TupleFactory.getInstance(); Tuple udfInput = tf.newTuple(1); // input contains tuple instead of bag udfInput.set(0, tf.newTuple()); BagToString udf = new BagToString(); // expecting an exception because the input if of type Tuple, not DataBag udf.exec(udfInput); } @Test public void testUseDefaultDelimiterBagToStringUDF() throws Exception { BagFactory bf = BagFactory.getInstance(); TupleFactory tf = TupleFactory.getInstance(); Tuple t1 = tf.newTuple(2); t1.set(0, "a"); t1.set(1, 5); Tuple t2 = tf.newTuple(2); t2.set(0, "c"); t2.set(1, 6); DataBag bag = bf.newDefaultBag(); bag.add(t1); bag.add(t2); BagToString udf = new BagToString(); Tuple udfInput = tf.newTuple(1); udfInput.set(0, bag); String result = udf.exec(udfInput); assertEquals("a_5_c_6", result); } @Test public void testBasicBagToStringUDF() throws Exception { BagFactory bf = BagFactory.getInstance(); TupleFactory tf = TupleFactory.getInstance(); Tuple t1 = tf.newTuple(2); t1.set(0, "a"); t1.set(1, 5); Tuple t2 = tf.newTuple(2); t2.set(0, "c"); t2.set(1, 6); DataBag bag = bf.newDefaultBag(); bag.add(t1); bag.add(t2); BagToString udf = new BagToString(); Tuple udfInput = tf.newTuple(2); udfInput.set(0, bag); udfInput.set(1, "-"); String result = udf.exec(udfInput); assertEquals("a-5-c-6", result); } @Test public void testNestedTupleForBagToStringUDF() throws Exception { BagFactory bf = BagFactory.getInstance(); TupleFactory tf = TupleFactory.getInstance(); Tuple t1 = tf.newTuple(2); t1.set(0, "a"); t1.set(1, 5); Tuple nestedTuple = tf.newTuple(2); nestedTuple.set(0, "d"); nestedTuple.set(1, 7); Tuple t2 = tf.newTuple(3); t2.set(0, "c"); t2.set(1, 6); t2.set(2, nestedTuple); DataBag inputBag = bf.newDefaultBag(); inputBag.add(t1); inputBag.add(t2); BagToString udf = new BagToString(); Tuple udfInput = tf.newTuple(2); udfInput.set(0, inputBag); udfInput.set(1, "_"); String result = udf.exec(udfInput); assertEquals("a_5_c_6_(d,7)", result); } @Test public void testNestedDataElementsForBagToStringUDF() throws Exception { DataBag inputBag = buildBagWithNestedTupleAndBag(); BagToString udf = new BagToString(); Tuple udfInput = tf.newTuple(2); udfInput.set(0, inputBag); udfInput.set(1, "*"); String result = udf.exec(udfInput); assertEquals("a*5*c*6*(d,7)*{(in bag,10)}", result); } @Test(expected=java.lang.RuntimeException.class) public void testInvalidZeroInputToOutputSchemaForBagToTupleStringUDF() throws Exception { Schema inputSch = new Schema(); BagToString udf = new BagToString(); Schema outputSchema = udf.outputSchema(inputSch); assertEquals("schema of BagToTuple input", outputSchema.getField(0).type, DataType.CHARARRAY); } @Test public void testOutputSchemaForBagToTupleStringUDF() throws Exception { FieldSchema tupSch = new FieldSchema(null, DataType.TUPLE); tupSch.schema = new Schema(); tupSch.schema.add(new FieldSchema(null, DataType.INTEGER)); tupSch.schema.add(new FieldSchema(null, DataType.CHARARRAY)); FieldSchema bagSch = new FieldSchema(null, DataType.BAG); bagSch.schema = new Schema(tupSch); Schema inputSch = new Schema(); inputSch.add(bagSch); inputSch.add(new FieldSchema(null, DataType.CHARARRAY)); BagToString udf = new BagToString(); Schema outputSchema = udf.outputSchema(inputSch); assertEquals("schema of BagToTuple input", outputSchema.getField(0).type, DataType.CHARARRAY); } @Test public void testOutputSchemaWithDefaultDelimiterForBagToTupleStringUDF() throws Exception { FieldSchema tupSch = new FieldSchema(null, DataType.TUPLE); tupSch.schema = new Schema(); tupSch.schema.add(new FieldSchema(null, DataType.INTEGER)); tupSch.schema.add(new FieldSchema(null, DataType.CHARARRAY)); FieldSchema bagSch = new FieldSchema(null, DataType.BAG); bagSch.schema = new Schema(tupSch); Schema inputSch = new Schema(); inputSch.add(bagSch); BagToString udf = new BagToString(); Schema outputSchema = udf.outputSchema(inputSch); assertEquals("schema of BagToTuple input", outputSchema.getField(0).type, DataType.CHARARRAY); } @Test(expected=java.lang.RuntimeException.class) public void testInvalidOutputSchemaForBagToTupleStringUDF() throws Exception { FieldSchema tupSch = new FieldSchema(null, DataType.TUPLE); tupSch.schema = new Schema(); tupSch.schema.add(new FieldSchema(null, DataType.INTEGER)); tupSch.schema.add(new FieldSchema(null, DataType.CHARARRAY)); FieldSchema bagSch = new FieldSchema(null, DataType.BAG); bagSch.schema = new Schema(tupSch); Schema inputSch = new Schema(); inputSch.add(bagSch); inputSch.add(new FieldSchema(null, DataType.DOUBLE)); BagToString udf = new BagToString(); // expecting an exception because the delimiter is not of type Data.CHARARRAY udf.outputSchema(inputSch); } @Test public void testPigScriptForBagToTupleUDF() throws Exception { PigServer pigServer = new PigServer(ExecType.LOCAL); Data data = resetData(pigServer); // bag of chararray data.set("foo", "myBag:bag{t:(l:chararray)}", tuple(bag(tuple("a"), tuple("b"), tuple("c")))); pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();"); pigServer.registerQuery("B = FOREACH A GENERATE BagToTuple(myBag) as myBag;"); pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();"); assertEquals(schema("myBag:(l:chararray)"), data.getSchema("bar")); List<Tuple> out = data.get("bar"); assertEquals(tuple("a", "b","c"), out.get(0).get(0)); // bag of longs data = resetData(pigServer); data.set("foo", "myBag:bag{t:(l:long)}", tuple(bag(tuple(1), tuple(2), tuple(3)))); pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();"); pigServer.registerQuery("B = FOREACH A GENERATE BagToTuple(myBag) as myBag;"); pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();"); out = data.get("bar"); assertEquals(tuple(1, 2, 3), out.get(0).get(0)); } @Test public void testPigScriptMultipleElmementsPerTupleForBagTupleUDF() throws Exception { PigServer pigServer = new PigServer(ExecType.LOCAL); Data data = resetData(pigServer); data.set("foo", "myBag:bag{t:(l:chararray)}", tuple(bag(tuple("a", "b"), tuple("c", "d"), tuple("e", "f")))); pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();"); pigServer.registerQuery("B = FOREACH A GENERATE BagToTuple(myBag) as myBag;"); pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();"); List<Tuple> out = data.get("bar"); assertEquals(tuple("a", "b","c", "d", "e", "f"), out.get(0).get(0)); } @Test public void testPigScriptNestedTupleForBagToTupleDF() throws Exception { PigServer pigServer = new PigServer(ExecType.LOCAL); Data data = resetData(pigServer); Tuple nestedTuple = tuple(bag(tuple("c"), tuple("d"))); data.set("foo", "myBag:bag{t:(l:chararray)}", tuple(bag(tuple("a"), tuple("b"), nestedTuple, tuple("e")))); pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();"); pigServer.registerQuery("B = FOREACH A GENERATE BagToTuple(myBag) as myBag;"); pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();"); List<Tuple> out = data.get("bar"); assertEquals(tuple("a", "b",bag(tuple("c"), tuple("d")), "e"), out.get(0).get(0)); } @Test public void testPigScriptEmptyBagForBagToTupleUDF() throws Exception { PigServer pigServer = new PigServer(ExecType.LOCAL); Data data = resetData(pigServer); data.set("foo", "myBag:bag{t:(l:chararray)}", tuple(bag())); pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();"); pigServer.registerQuery("B = FOREACH A GENERATE BagToTuple(myBag) as myBag;"); pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();"); List<Tuple> out = data.get("bar"); // empty bag will generate empty tuple assertEquals(tuple(), out.get(0).get(0)); } @Test public void testPigScriptrForBagToStringUDF() throws Exception { PigServer pigServer = new PigServer(ExecType.LOCAL); Data data = resetData(pigServer); data.set("foo", "myBag:bag{t:(l:chararray)}", tuple(bag(tuple("a"), tuple("b"), tuple("c")))); pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();"); pigServer.registerQuery("B = FOREACH A GENERATE BagToString(myBag) as myBag;"); pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();"); pigServer.registerQuery("C = FOREACH A GENERATE BagToString(myBag, '==') as myBag;"); pigServer.registerQuery("STORE C INTO 'baz' USING mock.Storage();"); List<Tuple> out = data.get("bar"); assertEquals(schema("myBag:chararray"), data.getSchema("bar")); assertEquals(tuple("a_b_c"), out.get(0)); out = data.get("baz"); assertEquals(tuple("a==b==c"), out.get(0)); } @Test public void testPigScriptMultipleElmementsPerTupleForBagToStringUDF() throws Exception { PigServer pigServer = new PigServer(ExecType.LOCAL); Data data = resetData(pigServer); data.set("foo", "myBag:bag{t:(l:chararray)}", tuple(bag(tuple("a", "b"), tuple("c", "d"), tuple("e", "f")))); pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();"); pigServer.registerQuery("B = FOREACH A GENERATE BagToString(myBag) as myBag;"); pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();"); pigServer.registerQuery("C = FOREACH A GENERATE BagToString(myBag, '^') as myBag;"); pigServer.registerQuery("STORE C INTO 'baz' USING mock.Storage();"); List<Tuple> out = data.get("bar"); assertEquals(tuple("a_b_c_d_e_f"), out.get(0)); out = data.get("baz"); assertEquals(tuple("a^b^c^d^e^f"), out.get(0)); } @Test public void testPigScriptNestedTupleForBagToStringUDF() throws Exception { PigServer pigServer = new PigServer(ExecType.LOCAL); Data data = resetData(pigServer); Tuple nestedTuple = tuple(bag(tuple("c"), tuple("d"))); data.set("foo", "myBag:bag{t:(l:chararray)}", tuple(bag(tuple("a"), tuple("b"), nestedTuple, tuple("e")))); pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();"); pigServer.registerQuery("B = FOREACH A GENERATE BagToString(myBag) as myBag;"); pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();"); List<Tuple> out = data.get("bar"); assertEquals(tuple("a_b_{(c),(d)}_e"), out.get(0)); } @Test public void testPigScriptEmptyBagForBagToStringUDF() throws Exception { PigServer pigServer = new PigServer(ExecType.LOCAL); Data data = resetData(pigServer); data.set("foo", "myBag:bag{t:(l:chararray)}", tuple(bag())); pigServer.registerQuery("A = LOAD 'foo' USING mock.Storage();"); pigServer.registerQuery("B = FOREACH A GENERATE BagToString(myBag) as myBag;"); pigServer.registerQuery("STORE B INTO 'bar' USING mock.Storage();"); List<Tuple> out = data.get("bar"); // empty bag will generate empty string assertEquals(tuple(""), out.get(0)); } private DataBag buildBagWithNestedTupleAndBag() throws ExecException { Tuple t1 = tf.newTuple(2); t1.set(0, "a"); t1.set(1, 5); Tuple nestedTuple = tf.newTuple(2); nestedTuple.set(0, "d"); nestedTuple.set(1, 7); Tuple t2 = tf.newTuple(3); t2.set(0, "c"); t2.set(1, 6); t2.set(2, nestedTuple); DataBag nestedBag = bf.newDefaultBag(); Tuple tupleInNestedBag = tf.newTuple(2); tupleInNestedBag.set(0, "in bag"); tupleInNestedBag.set(1, 10); nestedBag.add(tupleInNestedBag); Tuple t3 = tf.newTuple(1); t3.set(0, nestedBag); DataBag bag = bf.newDefaultBag(); bag.add(t1); bag.add(t2); bag.add(t3); return bag; } }