// TestMergeJoinPartial.java (example from the spork-streaming-master source tree)
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.zebra.pig;

import java.io.IOException;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.ArrayList;

import junit.framework.Assert;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.zebra.io.BasicTable;
import org.apache.hadoop.zebra.io.TableInserter;
import org.apache.hadoop.zebra.pig.TableStorer;
import org.apache.hadoop.zebra.schema.Schema;
import org.apache.hadoop.zebra.types.TypesUtils;
import org.apache.hadoop.zebra.BaseTestCase;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.Tuple;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;


/**
 * Exercises Pig merge joins over two Zebra tables that have been sorted on
 * {@code (a, b, c)}, joining on the full sort key, on a leading prefix of it,
 * and on a non-prefix subset of it.
 *
 * <p>{@link #setUp()} writes two small tables and registers them with the
 * shared {@code pigServer} as {@code table1} and {@code table2}. Every test
 * then sorts both tables, stores the sorted copies under a fresh numeric
 * suffix, reloads them with the {@code 'sorted'} loader hint and registers a
 * merge join under the alias {@code joinRecords}.
 */
public class TestMergeJoinPartial extends BaseTestCase {

	final static String STR_SCHEMA1 = "a:int,b:float,c:long,d:double,e:string,f:bytes,m1:map(string)";
	final static String STR_STORAGE1 = "[a, b, c]; [e, f]; [m1#{a}]";

	/** Columns both tables are ordered by before being stored. */
	private static final String SORT_KEYS = "a,b,c";

	/** Constructor-argument string appended to the storer class name when storing sorted tables. */
	private static final String SORTED_STORAGE = "('[a, b, c]; [d, e, f, m1]')";

	/** Tail of the LOAD statement used to read a sorted table back. */
	private static final String SORTED_LOADER =
		"' USING org.apache.hadoop.zebra.pig.TableLoader('a, b, c, d, e, f, m1', 'sorted');";

	/** Suffix appended to the table paths so each test stores to a new location. */
	static int fileId = 0;

	private static Path pathTable1;
	private static Path pathTable2;

	/** Rows written to table1; also used to build expected join results. */
	private static Object[][] table1;
	/** Rows written to table2; also used to build expected join results. */
	private static Object[][] table2;

	/**
	 * Creates both input tables on disk and registers them with Pig as
	 * {@code table1} and {@code table2}.
	 *
	 * @throws Exception if initialization, table creation or query registration fails
	 */
	@BeforeClass
	public static void setUp() throws Exception {
		init();

		pathTable1 = getTableFullPath("TestMergeJoinPartial1");
		pathTable2 = getTableFullPath("TestMergeJoinPartial2");
		removeDir(pathTable1);
		removeDir(pathTable2);

		// Create table1 data
		Map<String, String> m1 = new HashMap<String, String>();
		m1.put("a","m1-a");
		m1.put("b","m1-b");

		table1 = new Object[][]{
			{5,		-3.25f,	1001L,	51e+2,	"Zebra",	new DataByteArray("Zebra"),		m1},
			{-1,	3.25f,	1000L,	50e+2,	"zebra",	new DataByteArray("zebra"),		m1},
			{1001,	100.0f,	1003L,	50e+2,	"Apple",	new DataByteArray("Apple"),		m1},
			{1001,	101.0f,	1001L,	50e+2,	"apple",	new DataByteArray("apple"),		m1},
			{1001,	50.0f,	1000L,	50e+2,	"Pig",		new DataByteArray("Pig"),		m1},
			{1001,	52.0f,	1001L,	50e+2,	"pig",		new DataByteArray("pig"),		m1},
			{1002,	28.0f,	1000L,	50e+2,	"Hadoop",	new DataByteArray("Hadoop"),	m1},
			{1000,	0.0f,	1002L,	52e+2,	"hadoop",	new DataByteArray("hadoop"),	m1} };

		// Create table1
		createTable(pathTable1, STR_SCHEMA1, STR_STORAGE1, table1);

		// Create table2 data
		Map<String, String> m2 = new HashMap<String, String>();
		m2.put("a","m2-a");
		m2.put("b","m2-b");

		table2 = new Object[][] {
			{15,	56.0f,	1004L,	50e+2,	"green",	new DataByteArray("green"),		m2},
			{-1,	-99.0f,	1002L,	51e+2,	"orange",	new DataByteArray("orange"),	m2},
			{1001,	100.0f,	1003L,	55e+2,	"white",	new DataByteArray("white"),		m2},
			{1001,	102.0f,	1001L,	52e+2,	"purple",	new DataByteArray("purple"),	m2},
			{1001,	50.0f,	1008L,	52e+2,	"gray",		new DataByteArray("gray"),		m2},
			{1001,	53.0f,	1001L,	52e+2,	"brown",	new DataByteArray("brown"),		m2},
			{2000,	33.0f,	1006L,	52e+2,	"beige",	new DataByteArray("beige"),		m2} };

		// Create table2
		createTable(pathTable2, STR_SCHEMA1, STR_STORAGE1, table2);

		// Load table1
		String query1 = "table1 = LOAD '" + pathTable1.toString() + "' USING org.apache.hadoop.zebra.pig.TableLoader();";
		pigServer.registerQuery(query1);

		// Load table2
		String query2 = "table2 = LOAD '" + pathTable2.toString() + "' USING org.apache.hadoop.zebra.pig.TableLoader();";
		pigServer.registerQuery(query2);
	}

	/**
	 * Writes {@code tableData} to a new table at {@code path}, one tuple per
	 * row, keyed {@code "key" + rowIndex}.
	 *
	 * <p>The inserter and the writer are closed in {@code finally} blocks so
	 * that a failed insert does not leave them open.
	 *
	 * @param path          location of the table to create
	 * @param schemaString  column schema passed to the writer
	 * @param storageString storage hint passed to the writer
	 * @param tableData     rows to insert; each inner array holds one row's column values
	 * @throws IOException if the table cannot be created or written
	 */
	private static void createTable(Path path, String schemaString, String storageString, Object[][] tableData)
			throws IOException {
		BasicTable.Writer writer = new BasicTable.Writer(path, schemaString, storageString, conf);
		try {
			Schema schema = writer.getSchema();
			Tuple tuple = TypesUtils.createTuple(schema);
			TableInserter inserter = writer.getInserter("ins", false);
			try {
				for (int i = 0; i < tableData.length; ++i) {
					// The same tuple instance is reused for every row, so clear it first.
					TypesUtils.resetTuple(tuple);
					for (int k = 0; k < tableData[i].length; ++k) {
						tuple.set(k, tableData[i][k]);
						System.out.println("DEBUG: setting tuple k=" + k + " value=" + tableData[i][k]);
					}
					inserter.insert(new BytesWritable(("key" + i).getBytes()), tuple);
				}
			} finally {
				inserter.close();
			}
		} finally {
			writer.close();
		}
	}

	/**
	 * Shuts down the shared Pig server once all tests have run.
	 *
	 * @throws Exception if shutdown fails
	 */
	@AfterClass
	public static void tearDown() throws Exception {
		pigServer.shutdown();
	}

	/**
	 * Sorts both tables on {@link #SORT_KEYS}, stores and reloads the sorted
	 * copies, and registers a merge join of them under the alias
	 * {@code joinRecords}.
	 *
	 * <p>The statements are issued in the same order for every test: both
	 * ORDER BYs, both stores, both loads, then the JOIN.
	 *
	 * @param joinKeys comma-separated column list used as the join key on both sides
	 * @throws IOException if any Pig statement or store fails
	 */
	private void registerMergeJoin(String joinKeys) throws IOException {
		// Sort tables
		pigServer.registerQuery("sort1 = ORDER table1 BY " + SORT_KEYS + " ;");
		pigServer.registerQuery("sort2 = ORDER table2 BY " + SORT_KEYS + " ;");

		// Store sorted tables
		++fileId;  // increment filename suffix
		String suffix = Integer.toString(fileId);
		String storer = TableStorer.class.getCanonicalName() + SORTED_STORAGE;

		String pathSort1 = pathTable1.toString() + suffix;
		pigServer.store("sort1", pathSort1, storer);

		String pathSort2 = pathTable2.toString() + suffix;
		pigServer.store("sort2", pathSort2, storer);

		// Load sorted tables
		pigServer.registerQuery("records1 = LOAD '" + pathSort1 + SORTED_LOADER);
		pigServer.registerQuery("records2 = LOAD '" + pathSort2 + SORTED_LOADER);

		// Merge tables
		pigServer.registerQuery("joinRecords = JOIN records1 BY (" + joinKeys + ") , records2 BY (" + joinKeys + ")" +
			" USING \"merge\";");
	}

	/**
	 * Merge join on the full sort key {@code (a, b, c)}.
	 *
	 * <p>The only key present in both tables is (1001, 100.0f, 1003L), held by
	 * {@code table1[2]} and {@code table2[2]}, so exactly one joined row is expected.
	 */
	@Test
	public void test_merge_joint_17() throws ExecException, IOException {
		registerMergeJoin("a,b,c");

		// Verify merged tables
		ArrayList<ArrayList<Object>> resultTable = new ArrayList<ArrayList<Object>>();
		addResultRow(resultTable, table1[2], table2[2]);  // set expected values for row1

		Iterator<Tuple> it = pigServer.openIterator("joinRecords");
		verifyTable(resultTable, it);
	}

	/**
	 * Merge join on the two-column prefix {@code (a, b)} of the sort key.
	 *
	 * <p>Known bug with partial key join: the result is only printed.
	 * Verification needs to be added to this test once the bug is fixed.
	 */
	@Test
	public void test_merge_joint_22() throws ExecException, IOException {
		registerMergeJoin("a,b");
		printTable("joinRecords");
	}

	/**
	 * Merge join on the one-column prefix {@code (a)} of the sort key.
	 *
	 * <p>Known bug with partial key join: the result is only printed.
	 * Verification needs to be added to this test once the bug is fixed.
	 */
	@Test
	public void test_merge_joint_23() throws ExecException, IOException {
		registerMergeJoin("a");
		printTable("joinRecords");
	}

	/**
	 * Negative test: merge join on {@code (a, c)}, which skips {@code b} and so
	 * is not a leading prefix of the sort key. An {@link IOException} is expected.
	 */
	@Test(expected = IOException.class)
	public void test_merge_joint_24() throws ExecException, IOException {
		registerMergeJoin("a,c");
		pigServer.openIterator("joinRecords");  // get iterator to trigger error
	}

	/**
	 * Prints every field of every tuple of a Pig alias to standard output (for debugging).
	 *
	 * @param tablename Pig alias to iterate over
	 * @throws IOException if the alias cannot be opened or a field cannot be read
	 */
	public void printTable(String tablename) throws IOException {
		Iterator<Tuple> rows = pigServer.openIterator(tablename);
		while (rows.hasNext()) {
			Tuple row = rows.next();
			System.out.println();

			for (int i = 0; i < row.size(); ++i) {
				System.out.println("DEBUG: " + tablename + " RowValue.get(" + i + ") = " + row.get(i));
			}
		}
	}

	/**
	 * Appends one expected joined row to {@code resultTable}: all values of
	 * {@code leftRow} followed by all values of {@code rightRow}.
	 *
	 * @param resultTable expected-results table to append to
	 * @param leftRow     column values from the left side of the join
	 * @param rightRow    column values from the right side of the join
	 */
	public void addResultRow(ArrayList<ArrayList<Object>> resultTable, Object[] leftRow, Object[] rightRow) {
		ArrayList<Object> resultRow = new ArrayList<Object>(leftRow.length + rightRow.length);

		for (Object value : leftRow) {
			resultRow.add(value);
		}
		for (Object value : rightRow) {
			resultRow.add(value);
		}

		resultTable.add(resultRow);
	}

	/**
	 * Asserts that the tuples produced by {@code it} match {@code resultTable}
	 * row for row and field for field, in order, and that the row counts agree.
	 *
	 * @param resultTable expected rows, each a list of expected field values
	 * @param it          iterator over the tuples actually returned
	 * @throws IOException if a tuple field cannot be read
	 */
	public void verifyTable(ArrayList<ArrayList<Object>> resultTable, Iterator<Tuple> it) throws IOException {
		int rowIndex = 0;

		while (it.hasNext()) {
			Tuple rowValues = it.next();

			// Fail with a readable message, rather than an IndexOutOfBoundsException,
			// when more rows come back than were expected.
			Assert.assertTrue("returned more rows than the " + resultTable.size() + " expected",
					rowIndex < resultTable.size());

			ArrayList<Object> resultRow = resultTable.get(rowIndex);
			Assert.assertEquals(resultRow.size(), rowValues.size());  // verify expected tuple count
			System.out.println();

			for (int i = 0; i < rowValues.size(); ++i) {
				System.out.println("DEBUG: resultTable " + " RowValue.get(" + i + ") = " + rowValues.get(i) +
						" " + resultRow.get(i));
				Assert.assertEquals(resultRow.get(i), rowValues.get(i));  // verify each row value
			}
			++rowIndex;
		}
		Assert.assertEquals(resultTable.size(), rowIndex);  // verify expected row count
	}

}