/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.test;
import java.io.File;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;
import junit.framework.Assert;
import junit.framework.TestCase;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.io.FileLocalizer;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.Test;
/**
 * Tests for script UDFs (Jython and JavaScript) that are registered through
 * {@code PigServer.registerCode} and executed in map-reduce mode against a
 * {@link MiniCluster} shared by every test in this class.
 */
public class TestScriptUDF{
    static MiniCluster cluster = MiniCluster.buildCluster();
    private PigServer pigServer;

    TupleFactory mTf = TupleFactory.getInstance();
    BagFactory mBf = BagFactory.getInstance();

    /**
     * Creates a fresh map-reduce {@link PigServer} bound to the shared cluster
     * before each test.
     *
     * @throws Exception if the server cannot be created
     */
    @Before
    public void setUp() throws Exception{
        FileLocalizer.setR(new Random());
        pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
    }

    /**
     * Shuts the shared cluster down once all tests have run.
     *
     * @throws Exception if the shutdown fails
     */
    @AfterClass
    public static void oneTimeTearDown() throws Exception {
        cluster.shutDown();
    }

    /**
     * Asserts that {@code iter} yields exactly the given rows, in order, and
     * nothing after them. Each tuple is compared through its string form so a
     * failure reports both the expected and the actual row.
     *
     * @param iter     iterator over the alias under test
     * @param expected expected {@code Tuple.toString()} value of each row
     */
    private static void assertTuples(Iterator<Tuple> iter, String... expected) {
        for (int i = 0; i < expected.length; i++) {
            Assert.assertTrue("missing row " + i, iter.hasNext());
            Assert.assertEquals("row " + i, expected[i], iter.next().toString());
        }
        Assert.assertFalse("unexpected extra rows", iter.hasNext());
    }

    /**
     * Builds a tuple holding the given fields, in order.
     *
     * @param fields values appended to the new tuple
     * @return the populated tuple
     */
    private Tuple tupleOf(Object... fields) {
        Tuple tuple = mTf.newTuple();
        for (Object field : fields) {
            tuple.append(field);
        }
        return tuple;
    }

    /**
     * See PIG-928. Registers the same Jython script twice, once under the
     * {@code pig} namespace and once without a namespace, and checks that the
     * UDF is callable both ways.
     *
     * @throws Exception on any failure
     */
    @Test
    public void testPythonStandardScript() throws Exception{
        String[] script = {
            "#!/usr/bin/python",
            "@outputSchema(\"x:{t:(num:long)}\")",
            "def square(number):" ,
            "\treturn (number * number)"
        };
        String[] input = {
            "1\t3",
            "2\t4",
            "3\t5"
        };

        Util.createInputFile(cluster, "table_testPythonStandardScript", input);
        Util.createLocalInputFile( "testPythonStandardScript.py", script);

        // Test the namespace
        pigServer.registerCode("testPythonStandardScript.py", "jython", "pig");
        pigServer.registerQuery("A = LOAD 'table_testPythonStandardScript' as (a0:long, a1:long);");
        pigServer.registerQuery("B = foreach A generate pig.square(a0);");

        // Same script again, this time without a namespace
        pigServer.registerCode("testPythonStandardScript.py", "jython", null);
        pigServer.registerQuery("C = foreach A generate square(a0);");

        assertTuples(pigServer.openIterator("B"), "(1)", "(4)", "(9)");
        assertTuples(pigServer.openIterator("C"), "(1)", "(4)", "(9)");
    }

    /**
     * Registers a JavaScript script under the {@code myfuncs} namespace and
     * checks the {@code num} field produced by {@code complex} for each row.
     *
     * @throws Exception on any failure
     */
    @Test
    public void testJavascriptExampleScript() throws Exception{
        String[] script = {
            "helloworld.outputSchema = \"word:chararray\";",
            "function helloworld() {",
            "return 'Hello, World';",
            "}",
            "complex.outputSchema = \"word:chararray,num:long\";",
            "function complex(word) {",
            "return {word:word, num:word.length};",
            "}",
        };
        String[] input = {
            "one\t1",
            "two\t2",
            "three\t3"
        };

        Util.createInputFile(cluster, "table_testJavascriptExampleScript", input);
        Util.createLocalInputFile( "testJavascriptExampleScript.js", script);

        // Test the namespace
        pigServer.registerCode("testJavascriptExampleScript.js", "javascript", "myfuncs");
        pigServer.registerQuery("A = LOAD 'table_testJavascriptExampleScript' as (a0:chararray, a1:long);");
        pigServer.registerQuery("B = foreach A generate myfuncs.helloworld(), myfuncs.complex($0);");

        Iterator<Tuple> iter = pigServer.openIterator("B");
        // Lengths of "one", "two" and "three"
        int[] expectedLengths = {3, 3, 5};
        for (int expectedLength : expectedLengths) {
            Assert.assertTrue(iter.hasNext());
            Tuple t = iter.next();
            // Expected value goes first so a failure message is labelled correctly
            Assert.assertEquals(expectedLength, ((Tuple) t.get(1)).get(1));
        }
        Assert.assertFalse(iter.hasNext());
    }

    /**
     * See PIG-928. Uses {@code @outputSchemaFunction} so the declared output
     * schema comes from a schema function, then applies the same UDF to an int
     * column and to a double column.
     *
     * @throws Exception on any failure
     */
    @Test
    public void testPythonScriptWithSchemaFunction() throws Exception{
        String[] script = {
            "#!/usr/bin/python",
            "@outputSchemaFunction(\"squareSchema\")",
            "def square(number):" ,
            "\treturn (number * number)\n",
            "@schemaFunction(\"square\")",
            "def squareSchema(input):",
            "\treturn input "
        };
        String[] input = {
            "1\t3.0",
            "2\t4.0",
            "3\t5.0"
        };

        Util.createInputFile(cluster, "table_testPythonScriptWithSchemaFunction", input);
        Util.createLocalInputFile( "testPythonScriptWithSchemaFunction.py", script);

        // Test the namespace
        pigServer.registerCode("testPythonScriptWithSchemaFunction.py", "jython", "pig");
        pigServer.registerQuery("A = LOAD 'table_testPythonScriptWithSchemaFunction' as (a0:int, a1:double);");
        pigServer.registerQuery("B = foreach A generate pig.square(a0);");

        pigServer.registerCode("testPythonScriptWithSchemaFunction.py", "jython", null);
        pigServer.registerQuery("C = foreach A generate square(a1);");

        assertTuples(pigServer.openIterator("B"), "(1)", "(4)", "(9)");

        // The same python function will operate on double and try to get square of double
        // Since these are small double numbers we do not need to use delta to test the results
        assertTuples(pigServer.openIterator("C"), "(9.0)", "(16.0)", "(25.0)");
    }

    /**
     * See PIG-928. A Jython UDF without a schema decorator is expected to
     * return its result as a {@link DataByteArray}.
     *
     * @throws Exception on any failure
     */
    @Test
    public void testPythonScriptUDFNoDecorator() throws Exception{
        String[] script = {
            "#!/usr/bin/python",
            // No decorator means schema is null - bytearray...
            "def concat(word):" ,
            "\treturn word + word"
        };
        String[] input = {
            "hello\t1",
            "pig\t2",
            "world\t3"
        };

        Util.createInputFile(cluster, "table_testPythonScriptUDFNoDecorator", input);
        Util.createLocalInputFile( "testPythonScriptUDFNoDecorator.py", script);

        pigServer.registerCode("testPythonScriptUDFNoDecorator.py", "jython", "pig");
        pigServer.registerQuery("A = LOAD 'table_testPythonScriptUDFNoDecorator' as (a0, a1:int);");
        pigServer.registerQuery("B = foreach A generate pig.concat(a0);");

        Iterator<Tuple> iter = pigServer.openIterator("B");
        String[] expectedWords = {"hellohello", "pigpig", "worldworld"};
        for (String expectedWord : expectedWords) {
            Assert.assertTrue(iter.hasNext());
            Tuple t = iter.next();
            // Without a decorator the result must come back as a bytearray
            Assert.assertTrue("Default return type should be bytearray",
                    t.get(0) instanceof DataByteArray);
            Assert.assertEquals(expectedWord, t.get(0).toString().trim());
        }
        Assert.assertFalse(iter.hasNext());
    }

    /**
     * Passes a grouped bag to a Jython UDF and checks the bag it returns.
     *
     * @throws Exception on any failure
     */
    @Test
    public void testPythonScriptUDFBagInput() throws Exception{
        String[] script = {
            "#!/usr/bin/python",
            "@outputSchema(\"bag:{(y:{t:(len:int,word:chararray)})}\")",
            "def collect(bag):" ,
            "\toutBag = []",
            "\tfor word in bag:",
            // We need to wrap word inside a tuple for pig
            "\t\ttup=(len(bag), word[1])",
            "\t\toutBag.append(tup)",
            "\treturn outBag"
        };
        String[] input = {
            "1\thello",
            "2\tpig",
            "1\tworld",
            "1\tprogram",
            "2\thadoop"
        };

        Util.createInputFile(cluster, "table_testPythonScriptUDFBagInput", input);
        Util.createLocalInputFile( "testPythonScriptUDFBagInput.py", script);

        pigServer.registerCode("testPythonScriptUDFBagInput.py", "jython", "pig");
        pigServer.registerQuery("A = LOAD 'table_testPythonScriptUDFBagInput' as (a0:int, a1:chararray);");
        pigServer.registerQuery("B = group A by a0;");
        pigServer.registerQuery("C = foreach B generate pig.collect(A);");

        Iterator<Tuple> iter = pigServer.openIterator("C");

        String[] expected = new String[] {
            "({(3,hello),(3,world),(3,program)})",
            "({(2,hadoop),(2,pig)})"
        };
        Util.checkQueryOutputsAfterSortRecursive(iter, expected, "y: {(len:int, word:chararray)}");
    }

    /**
     * Passes a map to a Jython UDF that turns its entries into a bag of
     * (key, value) tuples.
     *
     * @throws Exception on any failure
     */
    @Test
    public void testPythonScriptUDFMapInput() throws Exception{
        String[] script = {
            "#!/usr/bin/python",
            "@outputSchema(\"bag:{(y:{t:(word:chararray)})}\")",
            "def maptobag(map):" ,
            "\toutBag = []",
            "\tfor k, v in map.iteritems():",
            // We need to wrap word inside a tuple for pig
            "\t\ttup = (k, v)",
            "\t\toutBag.append(tup)",
            "\treturn outBag"
        };
        String[] input = {
            "[1#hello,2#world]",
            "[3#pig,4#rocks]",
        };

        Util.createInputFile(cluster, "table_testPythonScriptUDFMapInput", input);
        Util.createLocalInputFile( "testPythonScriptUDFMapInput.py", script);

        pigServer.registerCode("testPythonScriptUDFMapInput.py", "jython", "pig");
        pigServer.registerQuery("A = LOAD 'table_testPythonScriptUDFMapInput' as (a0:map[]);");
        pigServer.registerQuery("B = foreach A generate pig.maptobag(a0);");

        Iterator<Tuple> iter = pigServer.openIterator("B");
        Assert.assertTrue(iter.hasNext());
        Tuple t = iter.next();

        // NOTE(review): this row is compared as a whole bag string, so it relies on
        // the order in which the UDF emits the map entries - confirm that is stable.
        DataBag bag = mBf.newDefaultBag();
        bag.add(tupleOf(1, "hello"));
        bag.add(tupleOf(2, "world"));
        Assert.assertEquals(bag.toString(), t.get(0).toString());

        // The second row is only checked for containment of each expected tuple
        Assert.assertTrue(iter.hasNext());
        t = iter.next();
        Assert.assertTrue(t.toString().contains(tupleOf(3, "pig").toString()));
        Assert.assertTrue(t.toString().contains(tupleOf(4, "rocks").toString()));

        Assert.assertFalse(iter.hasNext());
    }

    /**
     * Checks that a Jython UDF can return a map: each input value is replaced
     * by its length.
     *
     * @throws Exception on any failure
     */
    @Test
    public void testPythonScriptUDFMapOutput() throws Exception{
        String[] script = {
            "#!/usr/bin/python",
            "@outputSchema(\"mapint:[]\")",
            "def maptomapint(map):" ,
            "\toutMap = {}",
            "\tfor k, v in map.iteritems():",
            "\t\toutMap[k] = len(v)",
            "\treturn outMap"
        };
        String[] input = {
            "[1#hello,2#world]",
            "[3#pig,4#rocks]",
        };

        Util.createInputFile(cluster, "table_testPythonScriptUDFMapOutput", input);
        Util.createLocalInputFile( "testPythonScriptUDFMapOutput.py", script);

        pigServer.registerCode("testPythonScriptUDFMapOutput.py", "jython", "pig");
        pigServer.registerQuery("A = LOAD 'table_testPythonScriptUDFMapOutput' as (a0:map[]);");
        pigServer.registerQuery("B = foreach A generate pig.maptomapint(a0);");

        Iterator<Tuple> iter = pigServer.openIterator("B");
        Assert.assertTrue(iter.hasNext());
        Tuple t = iter.next();
        Assert.assertEquals(5, ((Map<?,?>)t.get(0)).get("1"));
        Assert.assertEquals(5, ((Map<?,?>)t.get(0)).get("2"));

        Assert.assertTrue(iter.hasNext());
        t = iter.next();
        Assert.assertEquals(3, ((Map<?,?>)t.get(0)).get("3"));
        Assert.assertEquals(5, ((Map<?,?>)t.get(0)).get("4"));

        Assert.assertFalse(iter.hasNext());
    }

    /**
     * Checks that a null argument reaches the Jython UDF as {@code None} and
     * that a {@code None} result comes back as a null field.
     *
     * @throws Exception on any failure
     */
    @Test
    public void testPythonScriptUDFNullInputOutput() throws Exception {
        String[] script = {
            "#!/usr/bin/python",
            "@outputSchema(\"bag:{(y:{t:(word:chararray)})}\")",
            "def multStr(cnt, str):" ,
            "\tif cnt != None and str != None:",
            "\t\treturn cnt * str",
            "\telse:",
            "\t\treturn None"
        };
        String[] input = {
            "3\thello",
            // Null input
            "\tworld",
        };

        Util.createInputFile(cluster, "table_testPythonScriptUDFNullInputOutput", input);
        Util.createLocalInputFile( "testPythonScriptUDFNullInputOutput.py", script);

        pigServer.registerCode("testPythonScriptUDFNullInputOutput.py", "jython", "pig");
        pigServer.registerQuery("A = LOAD 'table_testPythonScriptUDFNullInputOutput' as (a0:int, a1:chararray);");
        pigServer.registerQuery("B = foreach A generate pig.multStr(a0, a1);");

        Iterator<Tuple> iter = pigServer.openIterator("B");
        Assert.assertTrue(iter.hasNext());
        Tuple t = iter.next();
        Assert.assertEquals("hellohellohello", t.get(0).toString());

        Assert.assertTrue(iter.hasNext());
        t = iter.next();
        // UDF takes null and returns null
        Assert.assertNull(t.get(0));

        Assert.assertFalse(iter.hasNext());
    }

    /**
     * See Pig-1653. Registers the script through its absolute path instead of
     * a relative file name.
     *
     * @throws Exception on any failure
     */
    @Test
    public void testPythonAbsolutePath() throws Exception{
        String[] script = {
            "#!/usr/bin/python",
            "@outputSchema(\"x:{t:(num:long)}\")",
            "def square(number):" ,
            "\treturn (number * number)"
        };
        String[] input = {
            "1\t3",
            "2\t4",
            "3\t5"
        };

        Util.createInputFile(cluster, "table_testPythonAbsolutePath", input);
        File scriptFile = Util.createLocalInputFile( "testPythonAbsolutePath.py", script);

        // Test the namespace
        pigServer.registerCode(scriptFile.getAbsolutePath(), "jython", "pig");
        pigServer.registerQuery("A = LOAD 'table_testPythonAbsolutePath' as (a0:long, a1:long);");
        pigServer.registerQuery("B = foreach A generate pig.square(a0);");

        assertTuples(pigServer.openIterator("B"), "(1)", "(4)", "(9)");
    }

    /**
     * See Pig-1824. Tests import of wildcarded Java classes; this will not
     * work unless jython is configured with a valid cachedir, which is what
     * this tests.
     *
     * @throws Exception on any failure
     */
    @Test
    public void testPythonWilcardImport() throws Exception {
        // hadoop.fs.Path is in the classpath (always)
        String[] script = {
            "#!/usr/bin/python",
            "from org.apache.hadoop.fs import *",
            "p = Path('foo')",
            "@outputSchema(\"word:chararray\")",
            "def first(content):",
            " return content.split(' ')[0]"
        };
        String[] input = {
            "words words words",
            "talk talk talk"
        };

        Util.createInputFile(cluster, "table_testPythonWildcardImport", input);
        File scriptFile = Util.createLocalInputFile( "script.py", script);

        // Test the namespace
        pigServer.registerCode(scriptFile.getAbsolutePath(), "jython", "pig");
        pigServer.registerQuery("A = LOAD 'table_testPythonWildcardImport' as (a:chararray);");
        pigServer.registerQuery("B = foreach A generate pig.first(a);");

        assertTuples(pigServer.openIterator("B"), "(words)", "(talk)");
    }

    /**
     * See Pig-1824. Tests importing a second module/file from the local fs
     * from within the first module.
     *
     * NOTE: this unit test also covers the "import re" test case.
     * Not all users have a jython install, so there is no explicit unit test
     * for "import re".
     * To use a jython install, the Lib dir must be in the jython search path
     * via env variable JYTHON_HOME=jy_home or JYTHON_PATH=jy_home/Lib:...
     *
     * @throws Exception on any failure
     */
    @Test
    public void testPythonNestedImport() throws Exception {
        // Skip for hadoop 23 until PIG-2433 fixed
        if (Util.isHadoop23()) {
            return;
        }

        String[] scriptA = {
            "#!/usr/bin/python",
            "def square(number):" ,
            " return (number * number)"
        };
        String[] scriptB = {
            "#!/usr/bin/python",
            "import scriptA",
            "@outputSchema(\"x:{t:(num:double)}\")",
            "def sqrt(number):" ,
            " return (number ** .5)",
            "@outputSchema(\"x:{t:(num:long)}\")",
            "def square(number):" ,
            " return long(scriptA.square(number))"
        };
        String[] input = {
            "1\t3",
            "2\t4",
            "3\t5"
        };

        Util.createInputFile(cluster, "table_testPythonNestedImport", input);
        Util.createLocalInputFile("scriptA.py", scriptA);
        File scriptFileB = Util.createLocalInputFile("scriptB.py", scriptB);

        // Test the namespace: import B, which, in turn, imports A
        pigServer.registerCode(scriptFileB.getAbsolutePath(), "jython", "pig");
        pigServer.registerQuery("A = LOAD 'table_testPythonNestedImport' as (a0:long, a1:long);");
        pigServer.registerQuery("B = foreach A generate pig.square(a0);");

        assertTuples(pigServer.openIterator("B"), "(1)", "(4)", "(9)");
    }
}