/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.zebra.pig;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.ArrayList;

import junit.framework.Assert;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.zebra.io.BasicTable;
import org.apache.hadoop.zebra.io.TableInserter;
import org.apache.hadoop.zebra.pig.TableStorer;
import org.apache.hadoop.zebra.schema.Schema;
import org.apache.hadoop.zebra.types.TypesUtils;
import org.apache.hadoop.zebra.BaseTestCase;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

/**
 * Merge-join tests in which the join keys are a subset of the keys the
 * tables were sorted on.
 *
 * <p>Two Zebra tables sharing one schema are written once in
 * {@link #setUp()} and registered with Pig under the aliases {@code table1}
 * and {@code table2}. Each test sorts both tables on {@code a,b,c}, stores
 * and reloads them as sorted tables, and then merge-joins them on a
 * test-specific key list.
 *
 * <p>{@code pigServer}, {@code conf}, {@code init()},
 * {@code getTableFullPath()} and {@code removeDir()} are not declared in this
 * file; presumably they are inherited from {@link BaseTestCase}.
 */
public class TestMergeJoinPartial extends BaseTestCase {

  final static String STR_SCHEMA1 = "a:int,b:float,c:long,d:double,e:string,f:bytes,m1:map(string)";
  final static String STR_STORAGE1 = "[a, b, c]; [e, f]; [m1#{a}]";

  /** Columns both tables are sorted on before being stored. */
  private static final String SORT_KEYS = "a,b,c";

  /** Constructor argument appended to the TableStorer class name when storing sorted tables. */
  private static final String SORTED_STORAGE_ARGS = "('[a, b, c]; [d, e, f, m1]')";

  /** Loader clause used when reading the stored tables back as sorted tables. */
  private static final String SORTED_LOADER_CLAUSE =
      "' USING org.apache.hadoop.zebra.pig.TableLoader('a, b, c, d, e, f, m1', 'sorted');";

  /** Suffix appended to the table paths so every test stores into fresh locations. */
  static int fileId = 0;

  private static Path pathTable1;
  private static Path pathTable2;

  /** Rows written to table 1; also used to build expected join output. */
  private static Object[][] table1;

  /** Rows written to table 2; also used to build expected join output. */
  private static Object[][] table2;

  /**
   * Creates both source tables and registers them with Pig as
   * {@code table1} and {@code table2}.
   *
   * @throws Exception if initialization, table creation or query registration fails
   */
  @BeforeClass
  public static void setUp() throws Exception {
    init();

    pathTable1 = getTableFullPath("TestMergeJoinPartial1");
    pathTable2 = getTableFullPath("TestMergeJoinPartial2");
    removeDir(pathTable1);
    removeDir(pathTable2);

    // Create table1 data
    Map<String, String> m1 = new HashMap<String, String>();
    m1.put("a","m1-a");
    m1.put("b","m1-b");

    table1 = new Object[][]{
      {5,    -3.25f, 1001L, 51e+2, "Zebra",  new DataByteArray("Zebra"),  m1},
      {-1,    3.25f, 1000L, 50e+2, "zebra",  new DataByteArray("zebra"),  m1},
      {1001, 100.0f, 1003L, 50e+2, "Apple",  new DataByteArray("Apple"),  m1},
      {1001, 101.0f, 1001L, 50e+2, "apple",  new DataByteArray("apple"),  m1},
      {1001,  50.0f, 1000L, 50e+2, "Pig",    new DataByteArray("Pig"),    m1},
      {1001,  52.0f, 1001L, 50e+2, "pig",    new DataByteArray("pig"),    m1},
      {1002,  28.0f, 1000L, 50e+2, "Hadoop", new DataByteArray("Hadoop"), m1},
      {1000,   0.0f, 1002L, 52e+2, "hadoop", new DataByteArray("hadoop"), m1}
    };

    // Create table1
    createTable(pathTable1, STR_SCHEMA1, STR_STORAGE1, table1);

    // Create table2 data
    Map<String, String> m2 = new HashMap<String, String>();
    m2.put("a","m2-a");
    m2.put("b","m2-b");

    table2 = new Object[][] {
      {15,    56.0f, 1004L, 50e+2, "green",  new DataByteArray("green"),  m2},
      {-1,   -99.0f, 1002L, 51e+2, "orange", new DataByteArray("orange"), m2},
      {1001, 100.0f, 1003L, 55e+2, "white",  new DataByteArray("white"),  m2},
      {1001, 102.0f, 1001L, 52e+2, "purple", new DataByteArray("purple"), m2},
      {1001,  50.0f, 1008L, 52e+2, "gray",   new DataByteArray("gray"),   m2},
      {1001,  53.0f, 1001L, 52e+2, "brown",  new DataByteArray("brown"),  m2},
      {2000,  33.0f, 1006L, 52e+2, "beige",  new DataByteArray("beige"),  m2}
    };

    // Create table2
    createTable(pathTable2, STR_SCHEMA1, STR_STORAGE1, table2);

    // Load table1
    String query1 = "table1 = LOAD '" + pathTable1.toString()
        + "' USING org.apache.hadoop.zebra.pig.TableLoader();";
    pigServer.registerQuery(query1);

    // Load table2
    String query2 = "table2 = LOAD '" + pathTable2.toString()
        + "' USING org.apache.hadoop.zebra.pig.TableLoader();";
    pigServer.registerQuery(query2);
  }

  /**
   * Writes {@code tableData} to a new table at {@code path}, one tuple per
   * row, keyed {@code "key0"}, {@code "key1"}, and so on.
   *
   * <p>The inserter and the writer are closed even when an insert fails, so
   * a failing fixture does not leave them open.
   *
   * @param path location of the table to create
   * @param schemaString schema passed to the table writer
   * @param storageString storage specification passed to the table writer
   * @param tableData rows to insert; each inner array holds one row's column values
   * @throws IOException if the table cannot be created or a row cannot be written
   */
  private static void createTable(Path path, String schemaString, String storageString,
      Object[][] tableData) throws IOException {
    BasicTable.Writer writer = new BasicTable.Writer(path, schemaString, storageString, conf);
    try {
      Schema schema = writer.getSchema();
      Tuple tuple = TypesUtils.createTuple(schema);
      TableInserter inserter = writer.getInserter("ins", false);
      try {
        for (int i = 0; i < tableData.length; ++i) {
          // The same tuple instance is reused for every row, so clear it first.
          TypesUtils.resetTuple(tuple);
          for (int k = 0; k < tableData[i].length; ++k) {
            tuple.set(k, tableData[i][k]);
            System.out.println("DEBUG: setting tuple k=" + k + "value= " + tableData[i][k]);
          }
          inserter.insert(new BytesWritable(("key" + i).getBytes()), tuple);
        }
      } finally {
        // Close the inserter before the writer, as on the normal path.
        inserter.close();
      }
    } finally {
      writer.close();
    }
  }

  /**
   * Shuts down the Pig server used by the tests.
   *
   * @throws Exception if shutdown fails
   */
  @AfterClass
  public static void tearDown() throws Exception {
    pigServer.shutdown();
  }

  /**
   * Sorts both source tables on {@code a,b,c}, stores and reloads them as
   * sorted tables ({@code records1}, {@code records2}), and registers the
   * alias {@code joinRecords} as their merge join on {@code joinKeys}.
   *
   * <p>Increments {@link #fileId} so that each call stores into paths not
   * used by earlier calls.
   *
   * @param joinKeys comma-separated column list used on both sides of the join
   * @throws IOException if a Pig statement cannot be registered or a store fails
   */
  private void prepareMergeJoin(String joinKeys) throws IOException {
    // Sort tables
    pigServer.registerQuery("sort1 = ORDER table1 BY " + SORT_KEYS + " ;");
    pigServer.registerQuery("sort2 = ORDER table2 BY " + SORT_KEYS + " ;");

    // Store sorted tables
    ++fileId; // increment filename suffix
    String storeFunc = TableStorer.class.getCanonicalName() + SORTED_STORAGE_ARGS;

    String pathSort1 = pathTable1.toString() + Integer.toString(fileId);
    pigServer.store("sort1", pathSort1, storeFunc);

    String pathSort2 = pathTable2.toString() + Integer.toString(fileId);
    pigServer.store("sort2", pathSort2, storeFunc);

    // Load sorted tables
    pigServer.registerQuery("records1 = LOAD '" + pathSort1 + SORTED_LOADER_CLAUSE);
    pigServer.registerQuery("records2 = LOAD '" + pathSort2 + SORTED_LOADER_CLAUSE);

    // Merge tables
    String join = "joinRecords = JOIN records1 BY " + "(" + joinKeys + ")"
        + " , records2 BY " + "(" + joinKeys + ")" + " USING \"merge\";";
    pigServer.registerQuery(join);
  }

  /**
   * Joins on the full sort key {@code a,b,c} and verifies the single
   * expected row: row 2 of each table is the only pair with equal
   * {@code (a, b, c)} values.
   */
  @Test
  public void test_merge_joint_17() throws ExecException, IOException {
    //
    // Multiple join where join keys are partial, and the order of keys is honored
    //
    prepareMergeJoin("a,b,c");

    // Verify merged tables
    ArrayList<ArrayList<Object>> resultTable = new ArrayList<ArrayList<Object>>();
    addResultRow(resultTable, table1[2], table2[2]); // set expected values for row1

    Iterator<Tuple> it = pigServer.openIterator("joinRecords");
    verifyTable(resultTable, it);
  }

  /**
   * Joins on {@code a,b}, the first two sort-key columns. The result is
   * only printed, not verified.
   */
  @Test
  public void test_merge_joint_22() throws ExecException, IOException {
    //
    // Multiple join where join keys are partial, and the order of keys is honored
    //
    // Known bug with partial key join
    //  - Need to add verification to this test once bug is fixed
    prepareMergeJoin("a,b");

    printTable("joinRecords");
  }

  /**
   * Joins on {@code a}, the first sort-key column. The result is only
   * printed, not verified.
   */
  @Test
  public void test_merge_joint_23() throws ExecException, IOException {
    //
    // Multiple join where join keys are partial, and the order of keys is honored
    //
    // Known bug with partial key join
    //  - Need to add verification to this test once bug is fixed
    prepareMergeJoin("a");

    printTable("joinRecords");
  }

  /**
   * Negative test: joins on {@code a,c}, which skips {@code b} and so is not
   * a leading subset of the {@code a,b,c} sort key. The test passes only if
   * an {@link IOException} is thrown.
   */
  @Test(expected = IOException.class)
  public void test_merge_joint_24() throws ExecException, IOException {
    //
    // Multiple join where join keys are partial, and the order of keys is honored
    // (negative test)
    //
    prepareMergeJoin("a,c");

    pigServer.openIterator("joinRecords"); // get iterator to trigger error
  }

  /**
   * Prints every field of every tuple of a Pig alias to standard output
   * (for debugging).
   *
   * @param tablename Pig alias to iterate over
   * @throws IOException if the alias cannot be opened or a field cannot be read
   */
  public void printTable(String tablename) throws IOException {
    Iterator<Tuple> rows = pigServer.openIterator(tablename);
    while (rows.hasNext()) {
      Tuple row = rows.next();
      System.out.println();

      for (int i = 0; i < row.size(); ++i) {
        System.out.println("DEBUG: " + tablename + " RowValue.get(" + i + ") = " + row.get(i));
      }
    }
  }

  /**
   * Appends one expected join row to {@code resultTable}: all values of
   * {@code leftRow} followed by all values of {@code rightRow}.
   *
   * @param resultTable expected-results table to append to
   * @param leftRow column values contributed by the left side of the join
   * @param rightRow column values contributed by the right side of the join
   */
  public void addResultRow(ArrayList<ArrayList<Object>> resultTable, Object[] leftRow,
      Object[] rightRow) {
    ArrayList<Object> resultRow = new ArrayList<Object>();

    for (int i = 0; i < leftRow.length; ++i) {
      resultRow.add(leftRow[i]);
    }
    for (int i = 0; i < rightRow.length; ++i) {
      resultRow.add(rightRow[i]);
    }

    resultTable.add(resultRow);
  }

  /**
   * Asserts that the tuples produced by {@code it} equal {@code resultTable}
   * row by row and field by field, in order, and that the row counts match.
   *
   * @param resultTable expected rows, in the order the iterator should return them
   * @param it iterator over the actual tuples
   * @throws IOException if a tuple field cannot be read
   */
  public void verifyTable(ArrayList<ArrayList<Object>> resultTable, Iterator<Tuple> it)
      throws IOException {
    int rowIndex = 0;

    while (it.hasNext()) {
      Tuple actualRow = it.next();

      // Report surplus rows as a test failure rather than an IndexOutOfBoundsException.
      Assert.assertTrue("Join returned more rows than the " + resultTable.size() + " expected",
          rowIndex < resultTable.size());

      ArrayList<Object> resultRow = resultTable.get(rowIndex);
      Assert.assertEquals(resultRow.size(), actualRow.size()); // verify expected tuple count
      System.out.println();

      for (int i = 0; i < actualRow.size(); ++i) {
        System.out.println("DEBUG: resultTable " + " RowValue.get(" + i + ") = "
            + actualRow.get(i) + " " + resultRow.get(i));
        Assert.assertEquals(resultRow.get(i), actualRow.get(i)); // verify each row value
      }
      ++rowIndex;
    }

    Assert.assertEquals(resultTable.size(), rowIndex); // verify expected row count
  }
}