/*
 * (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.operator;

import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockProperties;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.ColumnType;
import com.linkedin.cubert.block.DataType;
import com.linkedin.cubert.block.TupleStoreBlock;
import com.linkedin.cubert.plan.physical.CubertStrings;
import com.linkedin.cubert.plan.physical.TestContext;
import com.linkedin.cubert.utils.DataGenerator;
import com.linkedin.cubert.utils.RawTupleStore;
import com.linkedin.cubert.utils.TupleStore;

import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.MapContext;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.codehaus.jackson.node.ArrayNode;
import org.codehaus.jackson.node.JsonNodeFactory;
import org.codehaus.jackson.node.ObjectNode;
import org.testng.Assert;
import org.testng.annotations.Test;

/**
 * Tests the hash join operator ({@link HashJoinOperator}).
 *
 * Created by spyne on 10/30/14.
 */
public class TestHashJoinOperator
{
    private final int nRows = 1000;

    final DataGenerator dataGenerator = new DataGenerator();
    final String lBlockName = "lBlock";
    final String rBlockName = "rBlock";

    public static void setup(Boolean useCompactSerialization) throws IOException
    {
        final Configuration conf = new Configuration();
        conf.set(CubertStrings.USE_COMPACT_SERIALIZATION, useCompactSerialization.toString());
        PhaseContext.create((MapContext) new TestContext(), conf);
    }

    @Test
    public void testInnerHashJoin() throws IOException, InterruptedException
    {
        setup(false);

        BlockSchema schema = DataGenerator.createBlockSchema();
        dataGenerator.setMIN_INT(0);
        dataGenerator.setMAX_INT(1000000);
        dataGenerator.setMIN_STRING_LENGTH(5);
        dataGenerator.setMAX_STRING_LENGTH(10);
        final List<Tuple> tuples = dataGenerator.generateSequentialTuples(nRows, schema);

        /* Create the tuple schemas */
        final BlockSchema lSchema =
                new BlockSchema(new ColumnType[] {
                        new ColumnType("Integer", DataType.INT),
                        new ColumnType("Long", DataType.LONG) });
        final BlockSchema rSchema =
                new BlockSchema(new ColumnType[] {
                        new ColumnType("Integer", DataType.INT),
                        new ColumnType("Double", DataType.DOUBLE),
                        new ColumnType("String", DataType.STRING) });
        /* The output schema prefixes each column with its source block name and "___" */
        final BlockSchema operatorSchema =
                new BlockSchema(new ColumnType[] {
                        new ColumnType(lBlockName + "___" + "Integer", DataType.INT),
                        new ColumnType(lBlockName + "___" + "Long", DataType.LONG),
                        new ColumnType(rBlockName + "___" + "Integer", DataType.INT),
                        new ColumnType(rBlockName + "___" + "Double", DataType.DOUBLE),
                        new ColumnType(rBlockName + "___" + "String", DataType.STRING) });

        final TupleStore lStore = new RawTupleStore(lSchema);
        final TupleStore rStore = new RawTupleStore(rSchema);

        /* For this test, the "Integer" field is used as the join key */
        for (Tuple t : tuples)
        {
            Tuple lt = extractFields(t, schema, lSchema);
            Tuple rt = extractFields(t, schema, rSchema);
            lStore.addToStore(lt);
            rStore.addToStore(rt);
        }
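
        /*
         * Since the tuples were generated sequentially, the i-th left tuple and the
         * i-th right tuple carry the same "Integer" value, so the inner join should
         * emit exactly one output row per input tuple, in input order (verified by
         * the assertions below).
         */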
        HashJoinOperator operator =
                createHashJoinOperator(lSchema, rSchema, operatorSchema, lStore, rStore);

        for (Tuple t : tuples)
        {
            Tuple output = operator.next();
            Assert.assertEquals(5, output.size());
            Assert.assertEquals(t.get(0), output.get(0));
            Assert.assertEquals(t.get(0), output.get(2));
            Assert.assertEquals(t.get(1), output.get(1));
            Assert.assertEquals(t.get(2), output.get(3));
            Assert.assertEquals(t.get(3), output.get(4));
        }
        Assert.assertNull(operator.next());
    }

    @Test
    public void testInnerHashJoinRightBlockEmpty() throws IOException, InterruptedException
    {
        setup(false);

        BlockSchema schema = DataGenerator.createBlockSchema();
        dataGenerator.setMIN_INT(0);
        dataGenerator.setMAX_INT(1000000);
        dataGenerator.setMIN_STRING_LENGTH(5);
        dataGenerator.setMAX_STRING_LENGTH(10);
        final List<Tuple> tuples = dataGenerator.generateSequentialTuples(nRows, schema);

        /* Create the tuple schemas */
        final BlockSchema lSchema =
                new BlockSchema(new ColumnType[] {
                        new ColumnType("Integer", DataType.INT),
                        new ColumnType("Long", DataType.LONG) });
        final BlockSchema rSchema =
                new BlockSchema(new ColumnType[] {
                        new ColumnType("Integer", DataType.INT),
                        new ColumnType("Double", DataType.DOUBLE),
                        new ColumnType("String", DataType.STRING) });
        final BlockSchema operatorSchema =
                new BlockSchema(new ColumnType[] {
                        new ColumnType(lBlockName + "___" + "Integer", DataType.INT),
                        new ColumnType(lBlockName + "___" + "Long", DataType.LONG),
                        new ColumnType(rBlockName + "___" + "Integer", DataType.INT),
                        new ColumnType(rBlockName + "___" + "Double", DataType.DOUBLE),
                        new ColumnType(rBlockName + "___" + "String", DataType.STRING) });

        final TupleStore lStore = new RawTupleStore(lSchema);
        final TupleStore rStore = new RawTupleStore(rSchema);

        /* Populate only the left store; the right block remains empty */
        for (Tuple t : tuples)
        {
            Tuple lt = extractFields(t, schema, lSchema);
            lStore.addToStore(lt);
        }

        HashJoinOperator operator =
                createHashJoinOperator(lSchema, rSchema, operatorSchema, lStore, rStore);

        /* An inner join against an empty right block must produce no output */
        Assert.assertNull(operator.next());
    }

    @Test
    public void testInnerHashJoinLeftBlockEmpty() throws IOException, InterruptedException
    {
        setup(false);

        BlockSchema schema = DataGenerator.createBlockSchema();
        dataGenerator.setMIN_INT(0);
        dataGenerator.setMAX_INT(1000000);
        dataGenerator.setMIN_STRING_LENGTH(5);
        dataGenerator.setMAX_STRING_LENGTH(10);
        final List<Tuple> tuples = dataGenerator.generateSequentialTuples(nRows, schema);

        /* Create the tuple schemas */
        final BlockSchema lSchema =
                new BlockSchema(new ColumnType[] {
                        new ColumnType("Integer", DataType.INT),
                        new ColumnType("Long", DataType.LONG) });
        final BlockSchema rSchema =
                new BlockSchema(new ColumnType[] {
                        new ColumnType("Integer", DataType.INT),
                        new ColumnType("Double", DataType.DOUBLE),
                        new ColumnType("String", DataType.STRING) });
        final BlockSchema operatorSchema =
                new BlockSchema(new ColumnType[] {
                        new ColumnType(lBlockName + "___" + "Integer", DataType.INT),
                        new ColumnType(lBlockName + "___" + "Long", DataType.LONG),
                        new ColumnType(rBlockName + "___" + "Integer", DataType.INT),
                        new ColumnType(rBlockName + "___" + "Double", DataType.DOUBLE),
                        new ColumnType(rBlockName + "___" + "String", DataType.STRING) });

        final TupleStore lStore = new RawTupleStore(lSchema);
        final TupleStore rStore = new RawTupleStore(rSchema);

        /* Populate only the right store; the left block remains empty */
        for (Tuple t : tuples)
        {
            Tuple rt = extractFields(t, schema, rSchema);
            rStore.addToStore(rt);
        }

        HashJoinOperator operator =
                createHashJoinOperator(lSchema, rSchema, operatorSchema, lStore, rStore);
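
        /* An inner join against an empty left block must likewise produce no output */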
        Assert.assertNull(operator.next());
    }

    private Tuple extractFields(final Tuple source,
                                final BlockSchema srcSchema,
                                final BlockSchema dstSchema) throws ExecException
    {
        Tuple output = DataGenerator.newTuple(dstSchema.getNumColumns());
        String[] columnNames = dstSchema.getColumnNames();
        for (int i = 0; i < columnNames.length; i++)
        {
            output.set(i, source.get(srcSchema.getIndex(columnNames[i])));
        }
        return output;
    }

    public HashJoinOperator createHashJoinOperator(BlockSchema lSchema,
                                                   BlockSchema rSchema,
                                                   BlockSchema operatorSchema,
                                                   TupleStore lStore,
                                                   TupleStore rStore) throws IOException,
            InterruptedException
    {
        /* Create blocks backed by the tuple stores */
        final Block lBlock =
                new TupleStoreBlock(lStore, new BlockProperties(lBlockName,
                                                                lSchema,
                                                                (BlockProperties) null));
        final Block rBlock =
                new TupleStoreBlock(rStore, new BlockProperties(rBlockName,
                                                                rSchema,
                                                                (BlockProperties) null));

        /* Perform the hash join */
        Map<String, Block> input = new HashMap<String, Block>();
        input.put(lBlockName, lBlock);
        input.put(rBlockName, rBlock);

        /*
         * Build the operator's JSON configuration: the two block names plus, for each
         * side, the list of join key columns ("Integer" on both sides here).
         */
        ObjectNode root = new ObjectNode(JsonNodeFactory.instance);
        root.put("leftBlock", lBlockName);
        root.put("rightBlock", rBlockName);

        final ArrayNode joinKeys = new ArrayNode(JsonNodeFactory.instance);
        joinKeys.add("Integer");
        root.put("leftJoinKeys", joinKeys);
        root.put("rightJoinKeys", joinKeys);

        BlockProperties props =
                new BlockProperties("Joined", operatorSchema, (BlockProperties) null);

        HashJoinOperator operator = new HashJoinOperator();
        operator.setInput(input, root, props);
        return operator;
    }
}