/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.zebra.pig; import java.io.IOException; import java.util.Iterator; import java.util.List; import junit.framework.Assert; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.zebra.BaseTestCase; import org.apache.hadoop.zebra.io.BasicTable; import org.apache.hadoop.zebra.io.TableInserter; import org.apache.hadoop.zebra.io.TableScanner; import org.apache.hadoop.zebra.io.BasicTable.Reader.RangeSplit; import org.apache.hadoop.zebra.parser.ParseException; import org.apache.hadoop.zebra.schema.Schema; import org.apache.hadoop.zebra.types.TypesUtils; import org.apache.pig.ExecType; import org.apache.pig.PigServer; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.Tuple; import org.apache.pig.test.MiniCluster; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; /** * Note: * * Make sure you add the build/pig-0.1.0-dev-core.jar to the Classpath of the * app/debug configuration, when run this from inside the Eclipse. * */ public class TestBasicUnion extends BaseTestCase { final static String STR_SCHEMA1 = "a:string,b,c:string,e,f"; final static String STR_STORAGE1 = "[a];[c]"; final static String STR_SCHEMA2 = "a:string,b,d:string,f,e"; final static String STR_STORAGE2 = "[a,b];[d]"; final static String STR_SCHEMA3 = "b,a"; final static String STR_STORAGE3 = "[a];[b]"; final static String STR_SCHEMA4 = "b:string,a,c:string"; final static String STR_STORAGE4 = "[a,b];[c]"; final static String STR_SCHEMA5 = "b,a:string"; final static String STR_STORAGE5 = "[a,b]"; private static Path path1, path2, path3, path4, path5; @BeforeClass public static void setUp() throws Exception { init(); path1 = getTableFullPath("/TestBasicUnion1"); path2 = getTableFullPath("/TestBasicUnion2"); path3 = getTableFullPath("/TestBasicUnion3"); path4 = getTableFullPath("/TestBasicUnion4"); path5 = getTableFullPath("/TestBasicUnion5"); removeDir(path1); removeDir(path2); removeDir(path3); removeDir(path4); removeDir(path5); /* * create 1st basic table; */ BasicTable.Writer writer = new BasicTable.Writer(path1, STR_SCHEMA1, STR_STORAGE1, conf); Schema schema = writer.getSchema(); Tuple tuple = TypesUtils.createTuple(schema); final int numsBatch = 10; final int numsInserters = 2; TableInserter[] inserters = new TableInserter[numsInserters]; for (int i = 0; i < numsInserters; i++) { inserters[i] = writer.getInserter("ins" + i, false); } for (int b = 0; b < numsBatch; b++) { for (int i = 0; i < numsInserters; i++) { TypesUtils.resetTuple(tuple); for (int k = 0; k < tuple.size(); ++k) { try { if (k == 1 || k == 3 || k == 4) { // bytes type tuple.set(k, new DataByteArray(new String(b + "_" + i + "" + k).toString())); } else { tuple.set(k, b + "_" + i + "" + k); } } catch (ExecException e) { } }// k inserters[i].insert(new BytesWritable(("key1" + i).getBytes()), tuple); }// i }// b for (int i = 0; i < numsInserters; i++) { inserters[i].close(); } writer.close(); /* * create 2nd basic table; */ writer = new BasicTable.Writer(path2, STR_SCHEMA2, STR_STORAGE2, conf); schema = writer.getSchema(); tuple = TypesUtils.createTuple(schema); inserters = new TableInserter[numsInserters]; for (int i = 0; i < numsInserters; i++) { inserters[i] = writer.getInserter("ins" + i, false); } for (int b = 0; b < numsBatch; b++) { for (int i = 0; i < numsInserters; i++) { TypesUtils.resetTuple(tuple); for (int k = 0; k < tuple.size(); ++k) { try { if (k == 1 || k == 3 || k == 4) { tuple.set(k, new DataByteArray(new String(b + "_" + i + "" + k).toString())); } else { tuple.set(k, b + "_" + i + "" + k); } } catch (ExecException e) { } } inserters[i].insert(new BytesWritable(("key2" + i).getBytes()), tuple); } } for (int i = 0; i < numsInserters; i++) { inserters[i].close(); } writer.close(); /* * create 3rd basic table; */ writer = new BasicTable.Writer(path3, STR_SCHEMA3, STR_STORAGE3, conf); schema = writer.getSchema(); tuple = TypesUtils.createTuple(schema); inserters = new TableInserter[numsInserters]; for (int i = 0; i < numsInserters; i++) { inserters[i] = writer.getInserter("ins" + i, false); } for (int b = 0; b < numsBatch; b++) { for (int i = 0; i < numsInserters; i++) { TypesUtils.resetTuple(tuple); for (int k = 0; k < tuple.size(); ++k) { try { tuple.set(k, new DataByteArray(new String(b + "_" + i + "" + k).toString())); } catch (ExecException e) { } } inserters[i].insert(new BytesWritable(("key3" + i).getBytes()), tuple); } } for (int i = 0; i < numsInserters; i++) { inserters[i].close(); } writer.close(); /* * create 4th basic table; */ writer = new BasicTable.Writer(path4, STR_SCHEMA4, STR_STORAGE4, conf); schema = writer.getSchema(); tuple = TypesUtils.createTuple(schema); inserters = new TableInserter[numsInserters]; for (int i = 0; i < numsInserters; i++) { inserters[i] = writer.getInserter("ins" + i, false); } for (int b = 0; b < numsBatch; b++) { for (int i = 0; i < numsInserters; i++) { TypesUtils.resetTuple(tuple); for (int k = 0; k < tuple.size(); ++k) { try { if (k == 1) { tuple.set(k, new DataByteArray(new String(b + "_" + i + "" + k).toString())); } else { tuple.set(k, b + "_" + i + "" + k); } } catch (ExecException e) { } } inserters[i].insert(new BytesWritable(("key4" + i).getBytes()), tuple); } } for (int i = 0; i < numsInserters; i++) { inserters[i].close(); } writer.close(); /* * create 5th basic table; */ writer = new BasicTable.Writer(path5, STR_SCHEMA5, STR_STORAGE5, conf); schema = writer.getSchema(); tuple = TypesUtils.createTuple(schema); inserters = new TableInserter[numsInserters]; for (int i = 0; i < numsInserters; i++) { inserters[i] = writer.getInserter("ins" + i, false); } for (int b = 0; b < numsBatch; b++) { for (int i = 0; i < numsInserters; i++) { TypesUtils.resetTuple(tuple); for (int k = 0; k < tuple.size(); ++k) { try { if (k == 0) { tuple.set(k, new DataByteArray(new String(b + "_" + i + "" + k).toString())); } else { tuple.set(k, b + "_" + i + "" + k); } } catch (ExecException e) { } } inserters[i].insert(new BytesWritable(("key5" + i).getBytes()), tuple); } } for (int i = 0; i < numsInserters; i++) { inserters[i].close(); } writer.close(); } @AfterClass public static void tearDownOnce() throws Exception { pigServer.shutdown(); BasicTable.drop(path1, conf); BasicTable.drop(path2, conf); BasicTable.drop(path3, conf); BasicTable.drop(path4, conf); BasicTable.drop(path5, conf); } @Test public void testReader1() throws ExecException, IOException { pigServer.registerQuery(constructQuery(path1, path2, "'a, b, c, d'")); Iterator<Tuple> it = pigServer.openIterator("records"); int i = 0; int k = -1; Tuple cur = null; int t = -1; int j = -1; while (it.hasNext()) { cur = it.next(); System.out.println("cur: " + cur); // first table if (i <= 9) { System.out.println("first table first part: " + cur.toString()); Assert.assertEquals(i + "_00", cur.get(0)); Assert.assertEquals(i + "_01", cur.get(1).toString()); Assert.assertTrue(((cur.get(2) == null) || (cur.get(2) .equals(i + "_02")))); Assert.assertTrue(((cur.get(3) == null) || (cur.get(3) .equals(i + "_02")))); } if (i >= 10) { k++; } if (k <= 9 && k >= 0) { System.out.println("first table second part: : " + cur.toString()); Assert.assertEquals(k + "_10", cur.get(0)); Assert.assertEquals(k + "_11", cur.get(1).toString()); Assert.assertTrue(((cur.get(2) == null) || (cur.get(2) .equals(k + "_12")))); Assert.assertTrue(((cur.get(3) == null) || (cur.get(3) .equals(k + "_12")))); } // second table if (k >= 10) { t++; } if (t <= 9 && t >= 0) { System.out.println("second table first part: " + cur.toString()); Assert.assertEquals(t + "_00", cur.get(0)); Assert.assertEquals(t + "_01", cur.get(1).toString()); Assert.assertTrue(((cur.get(2) == null) || (cur.get(2) .equals(t + "_02")))); Assert.assertTrue(((cur.get(3) == null) || (cur.get(3) .equals(t + "_02")))); } if (t >= 10) { j++; } if (j <= 9 && j >= 0) { System.out.println("second table first part: " + cur.toString()); Assert.assertEquals(j + "_10", cur.get(0)); Assert.assertEquals(j + "_11", cur.get(1).toString()); Assert.assertTrue(((cur.get(2) == null) || (cur.get(2) .equals(j + "_12")))); Assert.assertTrue(((cur.get(3) == null) || (cur.get(3) .equals(j + "_12")))); } i++; }// while Assert.assertEquals(40, i); } @Test public void testReaderThroughIO() throws ExecException, IOException, ParseException { String projection1 = new String("a,b,c"); BasicTable.Reader reader = new BasicTable.Reader(path1, conf); reader.setProjection(projection1); List<RangeSplit> splits = reader.rangeSplit(1); TableScanner scanner = reader.getScanner(splits.get(0), true); scanner = reader.getScanner(splits.get(0), true); BytesWritable key = new BytesWritable(); Tuple RowValue = TypesUtils.createTuple(scanner.getSchema()); scanner.getKey(key); System.out.println("read record or record:" + RowValue.toString()); for (int i = 0; i <= 9; i++) { scanner.getValue(RowValue); System.out.println("read record or record:" + RowValue.toString()); Assert.assertEquals(i + "_00", RowValue.get(0)); Assert.assertEquals(i + "_01", RowValue.get(1).toString()); Assert.assertEquals(i + "_02", RowValue.get(2)); scanner.advance(); } for (int i = 0; i <= 9; i++) { scanner.getValue(RowValue); System.out.println("read record or record:" + RowValue.toString()); Assert.assertEquals(i + "_10", RowValue.get(0)); Assert.assertEquals(i + "_11", RowValue.get(1).toString()); Assert.assertEquals(i + "_12", RowValue.get(2)); scanner.advance(); } reader.close(); } // field c @Test public void testReader2() throws ExecException, IOException { pigServer.registerQuery(constructQuery(path1, path2, "'c'")); Iterator<Tuple> it = pigServer.openIterator("records"); int i = 0; int k = -1; Tuple cur = null; int t = -1; int j = -1; while (it.hasNext()) { cur = it.next(); System.out.println("cur: " + cur); if (i <= 9) { System.out.println("first table first part: " + cur.toString()); Assert.assertTrue(((cur.get(0) == null) || (cur.get(0) .equals(i + "_02")))); try { cur.get(1); Assert.fail("should throw index out of bound exception "); } catch (Exception e) { } } if (i >= 10) { k++; } if (k <= 9 && k >= 0) { System.out.println("first table second part: : " + cur.toString()); Assert.assertTrue(((cur.get(0) == null) || (cur.get(0) .equals(k + "_12")))); try { cur.get(1); Assert.fail("should throw index out of bound exception "); } catch (Exception e) { } } if (k >= 10) { t++; } if (t <= 9 && t >= 0) { System.out.println("second table first part: " + cur.toString()); Assert.assertTrue(((cur.get(0) == null) || (cur.get(0) .equals(t + "_02")))); try { cur.get(1); Assert.fail("should throw index out of bound exception "); } catch (Exception e) { } } if (t >= 10) { j++; } if (j <= 9 && j >= 0) { System.out.println("second table first part: " + cur.toString()); Assert.assertTrue(((cur.get(0) == null) || (cur.get(0) .equals(j + "_12")))); try { cur.get(1); Assert.fail("should throw index out of bound exception "); } catch (Exception e) { } } i++; }// while Assert.assertEquals(40, i); } // projection for common exist colum a @Test public void testReader3() throws ExecException, IOException { pigServer.registerQuery(constructQuery(path1, path2, "'a'")); Iterator<Tuple> it = pigServer.openIterator("records"); int i = 0; int k = -1; Tuple cur = null; int t = -1; int j = -1; while (it.hasNext()) { cur = it.next(); System.out.println("cur: " + cur); // first table if (i <= 9) { System.out.println("first table first part: " + cur.toString()); Assert.assertTrue(((cur.get(0) == null) || (cur.get(0) .equals(i + "_00")))); try { cur.get(1); Assert.fail("should throw index out of bound exception "); } catch (Exception e) { } } if (i >= 10) { k++; } if (k <= 9 && k >= 0) { System.out.println("first table second part: : " + cur.toString()); Assert.assertTrue(((cur.get(0) == null) || (cur.get(0) .equals(k + "_10")))); try { cur.get(1); Assert.fail("should throw index out of bound exception "); } catch (Exception e) { } } // second table if (k >= 10) { t++; } if (t <= 9 && t >= 0) { System.out.println("second table first part: " + cur.toString()); Assert.assertTrue(((cur.get(0) == null) || (cur.get(0) .equals(t + "_00")))); try { cur.get(1); Assert.fail("should throw index out of bound exception "); } catch (Exception e) { } } if (t >= 10) { j++; } if (j <= 9 && j >= 0) { System.out.println("second table first part: " + cur.toString()); Assert.assertTrue(((cur.get(0) == null) || (cur.get(0) .equals(j + "_10")))); try { cur.get(1); Assert.fail("should throw index out of bound exception "); } catch (Exception e) { } } i++; }// while Assert.assertEquals(40, i); } // some common fields @Test public void testReader4() throws ExecException, IOException { pigServer.registerQuery(constructQuery(path1, path2, "'a, b'")); Iterator<Tuple> it = pigServer.openIterator("records"); int i = 0; int k = -1; Tuple cur = null; int t = -1; int j = -1; while (it.hasNext()) { cur = it.next(); System.out.println("cur: " + cur); // first table if (i <= 9) { System.out.println("first table first part: " + cur.toString()); Assert.assertEquals(i + "_00", cur.get(0)); Assert.assertEquals(i + "_01", cur.get(1).toString()); try { cur.get(2); Assert.fail("should throw index out of bound exception"); } catch (Exception e) { } } if (i >= 10) { k++; } if (k <= 9 && k >= 0) { System.out.println("first table second part: : " + cur.toString()); Assert.assertEquals(k + "_10", cur.get(0)); Assert.assertEquals(k + "_11", cur.get(1).toString()); try { cur.get(2); Assert.fail("should throw index out of bound exception"); } catch (Exception e) { } } // second table if (k >= 10) { t++; } if (t <= 9 && t >= 0) { System.out.println("second table first part: " + cur.toString()); Assert.assertEquals(t + "_00", cur.get(0)); Assert.assertEquals(t + "_01", cur.get(1).toString()); try { cur.get(2); Assert.fail("should throw index out of bound exception"); } catch (Exception e) { } } if (t >= 10) { j++; } if (j <= 9 && j >= 0) { System.out.println("second table first part: " + cur.toString()); Assert.assertEquals(j + "_10", cur.get(0)); Assert.assertEquals(j + "_11", cur.get(1).toString()); try { cur.get(2); Assert.fail("should throw index out of bound exception"); } catch (Exception e) { } } i++; }// while Assert.assertEquals(40, i); } // common column, but different posion @Test public void testReader5() throws ExecException, IOException { pigServer.registerQuery(constructQuery(path1, path2, "'e,f'")); Iterator<Tuple> it = pigServer.openIterator("records"); int i = 0; int k = -1; Tuple cur = null; int t = -1; int j = -1; while (it.hasNext()) { cur = it.next(); System.out.println("cur: " + cur); // first table if (i <= 9) { System.out.println("first table first part: " + cur.toString()); Assert.assertTrue(((cur.get(0)).toString().equals(i + "_03") || (cur.get(0).toString() .equals(i + "_04")))); System.out.println("get1: " + cur.get(1)); Assert.assertTrue(((cur.get(1)).toString().equals(i + "_03") || (cur.get(1).toString() .equals(i + "_04")))); try { cur.get(2); Assert.fail("should throw out of index bound exception"); } catch (Exception e) { } } if (i >= 10) { k++; } if (k <= 9 && k >= 0) { System.out.println("first table second part: : " + cur.toString()); Assert.assertTrue(((cur.get(0).toString().equals(k + "_13")) || (cur.get(0).toString() .equals(k + "_14")))); Assert.assertTrue(((cur.get(1).toString().equals(k + "_13")) || (cur.get(1).toString() .equals(k + "_14")))); } // second table if (k >= 10) { t++; } if (t <= 9 && t >= 0) { System.out.println("second table first part: " + cur.toString()); Assert.assertTrue(((cur.get(0).toString().equals(t + "_03")) || (cur.get(0).toString() .equals(t + "_04")))); Assert.assertTrue(((cur.get(1).toString().equals(t + "_03")) || (cur.get(1).toString() .equals(t + "_04")))); } if (t >= 10) { j++; } if (j <= 9 && j >= 0) { System.out.println("second table first part: " + cur.toString()); Assert.assertTrue(((cur.get(0).toString().equals(j + "_13")) || (cur.get(0).toString() .equals(j + "_14")))); Assert.assertTrue(((cur.get(1).toString().equals(j + "_13")) || (cur.get(1).toString() .equals(j + "_14")))); } i++; }// while Assert.assertEquals(40, i); } @Test // union two tables with different column numbers and column positions public void testReader6() throws ExecException, IOException { pigServer.registerQuery(constructQuery(path1, path5, "'b,a'")); Iterator<Tuple> it = pigServer.openIterator("records"); int i = 0; int count = 0; Tuple cur = null; String[] exp1 = new String[] { "0_01", "1_01", "2_01", "3_01", "4_01", "5_01", "6_01", "7_01", "8_01", "9_01", "0_11", "1_11", "2_11", "3_11", "4_11", "5_11", "6_11", "7_11", "8_11", "9_11"}; String[] exp2 = new String[] { "0_00", "1_00", "2_00", "3_00", "4_00", "5_00", "6_00", "7_00", "8_00", "9_00", "0_10", "1_10", "2_10", "3_10", "4_10", "5_10", "6_10", "7_10", "8_10", "9_10"}; while (it.hasNext()) { count++; cur = it.next(); System.out.println("cur #" + i + ": " + cur); try { cur.get(2); Assert.fail("should throw index out of bound exception"); } catch (Exception e) { } //String b = (String)cur.get(0); String b = cur.get(0).toString(); String a = (String)cur.get(1); Assert.assertTrue( b.equals(exp1[i]) || b.equals(exp2[i]) ); Assert.assertTrue( a.equals(exp2[i]) || a.equals(exp1[i]) ); if( ++i == 20 ) i = 0; }// while Assert.assertEquals(40, count); } // both paths is hdfs:///../jars. mini cluster need to substr, real cluster // don't need to @Test public void testNeg1() throws ExecException, IOException { pigServer.registerQuery(constructQuery(path1, path2, "'a,b,c,d'")); Iterator<Tuple> it = pigServer.openIterator("records"); int cnt = 0; Tuple cur = it.next(); cnt++; while (it.hasNext()) { cur = it.next(); System.out.println(cur); cnt++; if (cnt == 1) { Assert.assertEquals("0_00", cur.get(0)); Assert.assertEquals("0_01", cur.get(1)); Assert .assertTrue(((cur.get(2) == null) || (cur.get(2).equals("0_02")))); Assert .assertTrue(((cur.get(3) == null) || (cur.get(3).equals("0_02")))); } if (cnt == 22) { Assert.assertEquals("1_00", cur.get(0)); Assert.assertEquals("1_01", cur.get(1).toString()); Assert .assertTrue(((cur.get(2) == null) || (cur.get(2).equals("1_02")))); Assert .assertTrue(((cur.get(3) == null) || (cur.get(3).equals("1_02")))); } } Assert.assertEquals(cnt, 40); } // non-existing column @Test public void testNeg2() throws ExecException, IOException { pigServer.registerQuery(constructQuery(path1, path2, "'a,f,x'")); Iterator<Tuple> it = pigServer.openIterator("records"); int cnt = 0; Tuple cur = it.next(); // cnt++; while (it.hasNext()) { cur = it.next(); System.out.println(cur); cnt++; if (cnt == 1) { Assert.assertEquals("1_00", cur.get(0)); System.out.println("neg2, cnt ==1: " +cur.get(1)); Assert .assertTrue(((cur.get(1) == null) || (cur.get(1).toString().equals("1_03"))||(cur.get(1).toString().equals("1_04")))); Assert.assertEquals(null, cur.get(2)); try { cur.get(3); Assert.fail("should throw index out of bound exception"); } catch (Exception e) { } } if (cnt == 21) { Assert.assertEquals("1_00", cur.get(0)); System.out.println("neg2, cnt ==22: " +cur.get(1)); Assert .assertTrue(((cur.get(1) == null) || (cur.get(1).toString().equals("1_04"))||(cur.get(1).toString().equals("1_03")))); Assert.assertEquals(null, cur.get(2)); try { cur.get(3); Assert.fail("should throw index out of bound exception"); } catch (Exception e) { } } } Assert.assertEquals(39, cnt); } @Test // 2 table with same column name but different type and different position. // should throw exception public void testNeg3() throws ExecException, IOException { try { pigServer.setValidateEachStatement(true); pigServer.registerQuery(constructQuery(path1, path3, "'a, b'")); Assert.fail("should throw exception"); } catch (Exception e) { } } @Test // union table1 and table4. they have same culumn name with differnt types , // should throw excepiton in union public void testNeg4() throws ExecException, IOException { try { pigServer.setValidateEachStatement(true); pigServer.registerQuery(constructQuery(path1, path4, "'a, b, c'")); Assert.fail("should throw exception"); } catch (Exception e) { } } protected String constructQuery(Path path1, Path path2, String schema) { return "records = LOAD '" + path1 + "," + path2 + "' USING org.apache.hadoop.zebra.pig.TableLoader(" + schema + ");"; } }