/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.zebra.pig; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.PrintWriter; import java.util.Iterator; import java.util.ArrayList; import java.util.HashMap; import java.util.StringTokenizer; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.zebra.io.BasicTable; import org.apache.hadoop.zebra.io.TableInserter; import org.apache.hadoop.zebra.pig.TableStorer; import org.apache.hadoop.zebra.schema.Schema; import org.apache.hadoop.zebra.types.TypesUtils; import org.apache.hadoop.zebra.BaseTestCase; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.backend.executionengine.ExecJob; import org.apache.pig.data.DataByteArray; import org.apache.pig.data.Tuple; import junit.framework.Assert; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; public class TestOrderPreserveMultiTableGlob extends BaseTestCase { final static int NUMB_TABLE = 10; // number of tables for stress test final static int NUMB_TABLE_ROWS = 5; // number of rows for each table final static String TABLE_SCHEMA = "int1:int,str1:string,byte1:bytes"; final static String TABLE_STORAGE = "[int1,str1,byte1]"; static int fileId = 0; static int sortId = 0; protected static ExecJob pigJob; private static ArrayList<Path> pathTables; private static int totalTableRows =0; @BeforeClass public static void setUp() throws Exception { init(); pathTables = new ArrayList<Path>(); for (int i=0; i<NUMB_TABLE; ++i) { Path pathTable = getTableFullPath("TestOderPerserveMultiTable" + i); pathTables.add(pathTable); removeDir(pathTable); } // Create tables for (int i=0; i<NUMB_TABLE; ++i) { // Create table data Object[][] table = new Object[NUMB_TABLE_ROWS][3]; // three columns for (int j=0; j<NUMB_TABLE_ROWS; ++j) { table[j][0] = i; table[j][1] = new String("string" + j); table[j][2] = new DataByteArray("byte" + (NUMB_TABLE_ROWS - j)); ++totalTableRows; } // Create table createTable(pathTables.get(i), TABLE_SCHEMA, TABLE_STORAGE, table); // Load Table String query = "table" + i + " = LOAD '" + pathTables.get(i).toString() + "' USING org.apache.hadoop.zebra.pig.TableLoader();"; pigServer.registerQuery(query); } } private static void createTable(Path path, String schemaString, String storageString, Object[][] tableData) throws IOException { // // Create table from tableData array // BasicTable.Writer writer = new BasicTable.Writer(path, schemaString, storageString, conf); Schema schema = writer.getSchema(); Tuple tuple = TypesUtils.createTuple(schema); TableInserter inserter = writer.getInserter("ins", false); for (int i = 0; i < tableData.length; ++i) { TypesUtils.resetTuple(tuple); for (int k = 0; k < tableData[i].length; ++k) { tuple.set(k, tableData[i][k]); System.out.println("DEBUG: setting tuple k=" + k + "value= " + tableData[i][k]); } inserter.insert(new BytesWritable(("key" + i).getBytes()), tuple); } inserter.close(); writer.close(); } @AfterClass public static void tearDown() throws Exception { pigServer.shutdown(); } private Iterator<Tuple> testOrderPreserveUnion(ArrayList<String> inputTables, String sortkey, String columns) throws IOException { // // Test order preserve union from input tables and provided output columns // Assert.assertTrue("Table union requires two or more input tables", inputTables.size() >= 2); Path newPath = new Path(getCurrentMethodName()); ArrayList<String> pathList = new ArrayList<String>(); // Load and store each of the input tables for (int i=0; i<inputTables.size(); ++i) { String tablename = inputTables.get(i); String sortName = "sort" + ++sortId; // Sort tables String orderby = sortName + " = ORDER " + tablename + " BY " + sortkey + " ;"; pigServer.registerQuery(orderby); String sortPath = new String(newPath.toString() + ++fileId); // increment fileId suffix // Store sorted tables pigJob = pigServer.store(sortName, sortPath, TableStorer.class.getCanonicalName() + "('" + TABLE_STORAGE + "')"); Assert.assertNull(pigJob.getException()); pathList.add(sortPath); // add table path to list } String paths = new String(); paths += newPath.toString() + "{"; fileId = 0; for (String path:pathList) paths += ++fileId + ","; paths = paths.substring(0, paths.lastIndexOf(",")); // remove trailing comma paths += "}"; String queryLoad = "records1 = LOAD '" + paths + "' USING org.apache.hadoop.zebra.pig.TableLoader('" + columns + "', 'sorted');"; System.out.println("queryLoad: " + queryLoad); pigServer.registerQuery(queryLoad); // Return iterator Iterator<Tuple> it1 = pigServer.openIterator("records1"); return it1; } @Test public void test_sorted_union_multi_table() throws ExecException, IOException { // // Test sorted union // // Create input tables for order preserve union ArrayList<String> inputTables = new ArrayList<String>(); // Input tables for (int i=0; i<NUMB_TABLE; ++i) { inputTables.add("table" + i); // add input table } // Test with input tables and provided output columns testOrderPreserveUnion(inputTables, "int1", "int1, str1, byte1, source_table"); // Create results table for verification HashMap<Integer, ArrayList<ArrayList<Object>>> resultTable = new HashMap<Integer, ArrayList<ArrayList<Object>>>(); // The ordering from FileInputFormat glob expansion. int[] tblIndexList = {0, 9, 1, 2, 3, 4, 5, 6, 7, 8}; for (int i=0; i<NUMB_TABLE; ++i) { ArrayList<ArrayList<Object>> rows = new ArrayList<ArrayList<Object>>(); for (int j=0; j<NUMB_TABLE_ROWS; ++j) { ArrayList<Object> resultRow = new ArrayList<Object>(); resultRow.add(tblIndexList[i]); // int1 resultRow.add(new String("string" + j)); // str1 resultRow.add(new DataByteArray("byte" + (NUMB_TABLE_ROWS - j))); // byte1 rows.add(resultRow); } resultTable.put(i, rows); } // Verify union table Iterator<Tuple> it = pigServer.openIterator("records1"); int numbRows = verifyTable(resultTable, 0, 3, it); Assert.assertEquals(totalTableRows, numbRows); // Print Table //printTable("records1"); } /** * Return the name of the routine that called getCurrentMethodName * */ private String getCurrentMethodName() { ByteArrayOutputStream baos = new ByteArrayOutputStream(); PrintWriter pw = new PrintWriter(baos); (new Throwable()).printStackTrace(pw); pw.flush(); String stackTrace = baos.toString(); pw.close(); StringTokenizer tok = new StringTokenizer(stackTrace, "\n"); tok.nextToken(); // 'java.lang.Throwable' tok.nextToken(); // 'at ...getCurrentMethodName' String l = tok.nextToken(); // 'at ...<caller to getCurrentRoutine>' // Parse line 3 tok = new StringTokenizer(l.trim(), " <("); String t = tok.nextToken(); // 'at' t = tok.nextToken(); // '...<caller to getCurrentRoutine>' return t; } }