/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.zebra.pig; import java.io.IOException; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import junit.framework.Assert; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.zebra.io.BasicTable; import org.apache.hadoop.zebra.io.TableInserter; import org.apache.hadoop.zebra.io.TableScanner; import org.apache.hadoop.zebra.io.BasicTable.Reader.RangeSplit; import org.apache.hadoop.zebra.pig.TableStorer; import org.apache.hadoop.zebra.schema.Schema; import org.apache.hadoop.zebra.parser.ParseException; import org.apache.hadoop.zebra.types.TypesUtils; import org.apache.hadoop.zebra.BaseTestCase; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.backend.executionengine.ExecJob; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.Tuple; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; public class TestMergeJoinPrune extends BaseTestCase { final static int numsBatch = 1; final static int numsInserters = 1; static Path pathTable1; static Path pathTable2; final static String STR_SCHEMA1 = "a:int,b:float,c:long,d:double,e:string,f:bytes,r1:record(f1:string, f2:string),m1:map(string)"; final static String STR_SCHEMA2 = "m1:map(string),r1:record(f1:string, f2:string),f:bytes,e:string,d:double,c:long,b:float,a:int"; final static String STR_STORAGE1 = "[a, b, c]; [e, f]; [r1.f1]; [m1#{a}]"; final static String STR_STORAGE2 = "[a];[b]; [c]; [e]; [f]; [r1.f1]; [m1#{a}]"; private static int t1; @BeforeClass public static void setUp() throws Exception { init(); pathTable1 = getTableFullPath("TestMergeJoinPrune1"); pathTable2 = getTableFullPath("TestMergeJoinPrune2"); removeDir(pathTable1); removeDir(pathTable2); createFirstTable(); createSecondTable(); } public static void createFirstTable() throws IOException, ParseException { BasicTable.Writer writer = new BasicTable.Writer(pathTable1, STR_SCHEMA1, STR_STORAGE1, conf); Schema schema = writer.getSchema(); //System.out.println("typeName" + schema.getColumn("a").type.pigDataType()); Tuple tuple = TypesUtils.createTuple(schema); TableInserter[] inserters = new TableInserter[numsInserters]; for (int i = 0; i < numsInserters; i++) { inserters[i] = writer.getInserter("ins" + i, false); } Tuple tupRecord1; try { tupRecord1 = TypesUtils.createTuple(schema.getColumnSchema("r1") .getSchema()); } catch (ParseException e) { e.printStackTrace(); throw new IOException(e); } Map<String, String> m1 = new HashMap<String, String>(); for (int b = 0; b < numsBatch; b++) { for (int i = 0; i < numsInserters; i++) { TypesUtils.resetTuple(tupRecord1); TypesUtils.resetTuple(tuple); m1.clear(); try { // first row of the table , the biggest row if (i == 0 && b == 0) { tuple.set(0, 100); tuple.set(1, 100.1f); tuple.set(2, 100L); tuple.set(3, 50e+2); tuple.set(4, "something"); tuple.set(5, new DataByteArray("something")); } // the middle + 1 row of the table, the smallest row else if (i == 0 && b == (numsBatch / 2)) { tuple.set(0, -100); tuple.set(1, -100.1f); tuple.set(2, -100L); tuple.set(3, -50e+2); tuple.set(4, "so"); tuple.set(5, new DataByteArray("so")); } else { Float f = 1.1f; long l = 11; double d = 1.1; tuple.set(0, b); tuple.set(1, f); tuple.set(2, l); tuple.set(3, d); tuple.set(4, "some"); tuple.set(5, new DataByteArray("some")); } // insert record tupRecord1.set(0, "" + b); tupRecord1.set(1, "" + b); tuple.set(6, tupRecord1); // insert map m1.put("a", "" + b); m1.put("b", "" + b); m1.put("c", "" + b); tuple.set(7, m1); } catch (ExecException e) { e.printStackTrace(); } inserters[i].insert(new BytesWritable(("key_" + b).getBytes()), tuple); } } for (int i = 0; i < numsInserters; i++) { inserters[i].close(); } writer.close(); //check table is setup correctly String projection = new String("a,b,c,d,e,f,r1,m1"); BasicTable.Reader reader = new BasicTable.Reader(pathTable1, conf); reader.setProjection(projection); List<RangeSplit> splits = reader.rangeSplit(1); TableScanner scanner = reader.getScanner(splits.get(0), true); Tuple RowValue = TypesUtils.createTuple(scanner.getSchema()); scanner.getValue(RowValue); System.out.println("rowvalue size:"+RowValue.size()); System.out.println("read a : " + RowValue.get(0).toString()); System.out.println("read string: " + RowValue.get(1).toString()); scanner.advance(); if(!scanner.atEnd()) { scanner.getValue(RowValue); System.out.println("read float in 2nd row: "+ RowValue.get(1).toString()); System.out.println("done insert table"); } reader.close(); } public static void createSecondTable() throws IOException, ParseException { BasicTable.Writer writer = new BasicTable.Writer(pathTable2, STR_SCHEMA2, STR_STORAGE2, conf); Schema schema = writer.getSchema(); //System.out.println("typeName" + schema.getColumn("a").type.pigDataType()); Tuple tuple = TypesUtils.createTuple(schema); TableInserter[] inserters = new TableInserter[numsInserters]; for (int i = 0; i < numsInserters; i++) { inserters[i] = writer.getInserter("ins" + i, false); } Tuple tupRecord1; try { tupRecord1 = TypesUtils.createTuple(schema.getColumnSchema("r1") .getSchema()); } catch (ParseException e) { e.printStackTrace(); throw new IOException(e); } Map<String, String> m1 = new HashMap<String, String>(); for (int b = 0; b < numsBatch; b++) { for (int i = 0; i < numsInserters; i++) { TypesUtils.resetTuple(tupRecord1); TypesUtils.resetTuple(tuple); m1.clear(); try { // first row of the table , the biggest row if (i == 0 && b == 0) { tuple.set(7, 100); tuple.set(6, 100.1f); tuple.set(5, 100L); tuple.set(4, 50e+2); tuple.set(3, "something"); tuple.set(2, new DataByteArray("something")); } // the middle +1 row of the table, the smallest row else if (i == 0 && b == (numsBatch / 2)) { tuple.set(7, -100); tuple.set(6, -100.1f); tuple.set(5, -100L); tuple.set(4, -50e+2); tuple.set(3, "so"); tuple.set(2, new DataByteArray("so")); } else { Float f = 2.1f; long l = 12; double d = 2.1; tuple.set(7, b*2); tuple.set(6, f); tuple.set(5, l); tuple.set(4, d); tuple.set(3, "somee"); tuple.set(2, new DataByteArray("somee")); } // insert record tupRecord1.set(0, "" + b); tupRecord1.set(1, "" + b); tuple.set(1, tupRecord1); // insert map m1.put("a", "" + b); m1.put("b", "" + b); m1.put("c", "" + b); tuple.set(0, m1); } catch (ExecException e) { e.printStackTrace(); } inserters[i].insert(new BytesWritable(("key" + b).getBytes()), tuple); } } for (int i = 0; i < numsInserters; i++) { inserters[i].close(); } writer.close(); //check table is setup correctly String projection = new String("a,b,c,d,e,f,r1,m1"); BasicTable.Reader reader = new BasicTable.Reader(pathTable2, conf); reader.setProjection(projection); List<RangeSplit> splits = reader.rangeSplit(1); TableScanner scanner = reader.getScanner(splits.get(0), true); Tuple RowValue = TypesUtils.createTuple(scanner.getSchema()); scanner.getValue(RowValue); System.out.println("rowvalue size:"+RowValue.size()); System.out.println("read a : " + RowValue.get(7).toString()); System.out.println("read string: " + RowValue.get(6).toString()); scanner.advance(); if(!scanner.atEnd()) { scanner.getValue(RowValue); System.out.println("read float in 2nd row: "+ RowValue.get(6).toString()); System.out.println("done insert table"); } reader.close(); } @AfterClass public static void tearDown() throws Exception { pigServer.shutdown(); BasicTable.drop(pathTable1, conf); BasicTable.drop(pathTable2, conf); } public void verify(Iterator<Tuple> it3) throws ExecException { int row = 0; Tuple RowValue3 = null; while (it3.hasNext()) { RowValue3 = it3.next(); Assert.assertEquals(9, RowValue3.size()); row++; if (row == 100) { // smallest row, the middle row of original table Assert.assertEquals(-100, RowValue3.get(0));// a Assert.assertEquals(-100.1f, RowValue3.get(1)); // b Assert.assertEquals(-100L, RowValue3.get(2)); // c Assert.assertEquals(-5000.0, RowValue3.get(3)); // d Assert.assertEquals("so", RowValue3.get(4)); // e Assert.assertEquals("so", RowValue3.get(5).toString());// f Assert.assertEquals("" + numsBatch / 2, ((Tuple) RowValue3.get(6)) .get(0));// r Assert.assertEquals("" + numsBatch / 2, ((Tuple) RowValue3.get(6)) .get(1));// r Assert.assertEquals("" + numsBatch / 2, ((Map) RowValue3.get(7)) .get("a"));// m Assert.assertEquals("" + numsBatch / 2, ((Map) RowValue3.get(7)) .get("b"));// m Assert.assertEquals("" + numsBatch / 2, ((Map) RowValue3.get(7)) .get("c"));// m Assert.assertEquals(-100, RowValue3.get(15)); // a Assert.assertEquals(-100.1f, RowValue3.get(14)); // b Assert.assertEquals(-100L, RowValue3.get(13)); // c Assert.assertEquals(-5000.0, RowValue3.get(12)); // d Assert.assertEquals("so", RowValue3.get(11)); // e Assert.assertEquals("so", RowValue3.get(10).toString());// f Assert.assertEquals("" + numsBatch / 2, ((Tuple) RowValue3.get(9)) .get(0));// r Assert.assertEquals("" + numsBatch / 2, ((Tuple) RowValue3.get(9)) .get(1));// r Assert.assertEquals("" + numsBatch / 2, ((Map) RowValue3.get(8)) .get("a"));// m Assert.assertEquals("" + numsBatch / 2, ((Map) RowValue3.get(8)) .get("b"));// m Assert.assertEquals("" + numsBatch / 2, ((Map) RowValue3.get(8)) .get("c"));// m } // largest row, the first row of the original table if (row == 200) { Assert.assertEquals(100, RowValue3.get(0));// a Assert.assertEquals(100.1f, RowValue3.get(1)); // b Assert.assertEquals(100L, RowValue3.get(2)); // c Assert.assertEquals(5000.0, RowValue3.get(3)); // d Assert.assertEquals("something", RowValue3.get(4)); // e Assert.assertEquals("something", RowValue3.get(5).toString());// f Assert.assertEquals("" + 0, ((Tuple) RowValue3.get(6)) .get(0));// r Assert.assertEquals("" + 0, ((Tuple) RowValue3.get(6)) .get(1));// r Assert.assertEquals("" + 0, ((Map) RowValue3.get(7)) .get("a"));// m Assert.assertEquals("" + 0, ((Map) RowValue3.get(7)) .get("b"));// m Assert.assertEquals("" + 0, ((Map) RowValue3.get(7)) .get("c"));// m Assert.assertEquals(100, RowValue3.get(15)); // a Assert.assertEquals(100.1f, RowValue3.get(14)); // b Assert.assertEquals(100L, RowValue3.get(13)); // c Assert.assertEquals(5000.0, RowValue3.get(12)); // d Assert.assertEquals("something", RowValue3.get(11)); // e Assert.assertEquals("something", RowValue3.get(10).toString());// f Assert.assertEquals("" + 0, ((Tuple) RowValue3.get(9)).get(0));// r Assert.assertEquals("" + 0, ((Tuple) RowValue3.get(9)).get(1));// r Assert.assertEquals("" + 0, ((Map) RowValue3.get(8)).get("a"));// m Assert.assertEquals("" + 0, ((Map) RowValue3.get(8)).get("b"));// m Assert.assertEquals("" + 0, ((Map) RowValue3.get(8)).get("c"));// m } } // Assert.assertEquals(2, row); } public Iterator<Tuple> joinTable(String table1, String table2, String sortkey1, String sortkey2) throws IOException { String query1 = "records1 = LOAD '" + pathTable1.toString() + "' USING org.apache.hadoop.zebra.pig.TableLoader();"; System.out.println("query1:" + query1); pigServer.registerQuery(query1); String query2 = "records2 = LOAD '" + pathTable2.toString() + "' USING org.apache.hadoop.zebra.pig.TableLoader();"; System.out.println("query2:" + query2); pigServer.registerQuery(query2); String orderby1 = "sort1 = ORDER records1 BY " + sortkey1 + " ;"; String orderby2 = "sort2 = ORDER records2 BY " + sortkey2 + " ;"; pigServer.registerQuery(orderby1); pigServer.registerQuery(orderby2); t1++; String table1path = pathTable1.toString() + Integer.toString(t1); removeDir(new Path(table1path)); ExecJob pigJob =pigServer.store("sort1", table1path, TableStorer.class.getCanonicalName() + "('[a, b, c]; [d, e, f, r1, m1]')"); if (pigJob.getException() != null){ System.out.println("******pig job exception"+ pigJob.getException().getMessage()); } Assert.assertNull(pigJob.getException()); String query3 = "records1 = LOAD '" + table1path + "' USING org.apache.hadoop.zebra.pig.TableLoader('a, b, c, d, e, f, r1, m1', 'sorted');"; System.out.println("query3:" + query3); pigServer.registerQuery(query3); String foreach = "records11 = foreach records1 generate a as a, b as b, c as c, d as d, e as e, f as f, r1 as r1, m1#'a' as ma1;"; pigServer.registerQuery(foreach); /* * Table2 creation */ this.t1++; String table2path = this.pathTable2.toString() + Integer.toString(this.t1); removeDir(new Path(table2path)); pigJob = pigServer.store("sort2", table2path, TableStorer.class.getCanonicalName() + "('[a, b, c]; [d,e,f,r1,m1]')"); if (pigJob.getException() != null){ System.out.println("******pig job exception"+ pigJob.getException().getMessage()); } Assert.assertNull(pigJob.getException()); String query4 = "records2 = LOAD '" + table2path + "' USING org.apache.hadoop.zebra.pig.TableLoader();"; pigServer.registerQuery(query4); String filter = "records22 = foreach records2 generate " + sortkey2 + ";" ; pigServer.registerQuery(filter); String join = "joinRecords = JOIN records11 BY " + "(" + sortkey1 + ")" + " , records22 BY " + "("+ sortkey2 + ")"+" USING \"merge\";"; //TODO: can not use records22 pigServer.registerQuery(join); // check JOIN content Iterator<Tuple> it3 = pigServer.openIterator("joinRecords"); return it3; } @Test public void test1() throws ExecException, IOException { /* * join key: single integer column */ Iterator<Tuple> it3 = joinTable(this.pathTable1.toString(), this.pathTable2.toString(), "a","a" ); verify (it3); } // @Test public void test2() throws ExecException, IOException { /* * join key: single float column */ Iterator<Tuple> it3 = joinTable(this.pathTable1.toString(), this.pathTable2.toString(), "b","b" ); verify(it3); } // @Test public void test3() throws ExecException, IOException { /* * join key: single string column */ Iterator<Tuple> it3 = joinTable(this.pathTable1.toString(), this.pathTable2.toString(), "e","e" ); verify(it3); } // @Test public void test4() throws ExecException, IOException { /* * join key: single byte column */ Iterator<Tuple> it3 = joinTable(this.pathTable1.toString(), this.pathTable2.toString(), "f","f" ); verify(it3); } // @Test public void test5() throws ExecException, IOException { /* * join key: single double column */ Iterator<Tuple> it3 = joinTable(this.pathTable1.toString(), this.pathTable2.toString(), "d","d" ); verify(it3); } // @Test public void test6() throws ExecException, IOException { /* * join key: single long column */ Iterator<Tuple> it3 = joinTable(this.pathTable1.toString(), this.pathTable2.toString(), "c","c" ); verify(it3); } // @Test public void test7() throws ExecException, IOException { /* * 2 join keys: integer and float */ System.out.println ("helloo"); Iterator<Tuple> it3 = joinTable(this.pathTable1.toString(), this.pathTable2.toString(), "a,b","a,b" ); verify(it3); } // @Test public void test8() throws ExecException, IOException { /* * multiple join keys: integer, float, long, double, string, bytes */ Iterator<Tuple> it3 = joinTable(this.pathTable1.toString(), this.pathTable2.toString(), "a,b,c,d,e,f","a,b,c,d,e,f" ); verify(it3); } //@Test(expected = IOException.class) public void test9a() throws ExecException, IOException { /* * Negative test case, one join key is not primitive type which is a record * 2 join keys: integer and record */ Iterator<Tuple> it3 = joinTable(this.pathTable1.toString(), this.pathTable2.toString(), "a,r1,e","a,r1,e"); } //@Test(expected = IOException.class) public void test9b() throws ExecException, IOException { /* * Negative test case, one join key is not primitive type which is a record * 2 join keys: integer and map */ Iterator<Tuple> it3 = joinTable(this.pathTable1.toString(), this.pathTable2.toString(), "a,m1,e","a,m1,e"); } }