/* (c) 2014 LinkedIn Corp. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use * this file except in compliance with the License. You may obtain a copy of the * License at http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. */ package com.linkedin.cubert.operator; import com.linkedin.cubert.block.Block; import com.linkedin.cubert.block.BlockProperties; import com.linkedin.cubert.block.BlockSchema; import com.linkedin.cubert.block.PivotedBlock; import com.linkedin.cubert.utils.CommonUtils; import com.linkedin.cubert.utils.TupleUtils; import java.io.IOException; import java.util.Arrays; import java.util.Map; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.codehaus.jackson.JsonNode; import static com.linkedin.cubert.utils.JsonUtils.asArray; import static com.linkedin.cubert.utils.JsonUtils.getText; /** * @author Maneesh Varshney */ public class RSJoinOperator implements TupleOperator { public static final String TAG_COLUMN = "___tag"; private PivotedBlock pivotedBlock; private Joiner joiner; @Override public void setInput(Map<String, Block> input, JsonNode json, BlockProperties props) throws IOException, InterruptedException { Block block = input.values().iterator().next(); String[] joinKeys = asArray(json, "joinKeys"); pivotedBlock = new PivotedBlock(block, joinKeys); joiner = new Joiner(pivotedBlock, json); joiner.newPivot(); } @Override public Tuple next() throws IOException, InterruptedException { while (true) { Tuple tuple = joiner.next(); if (tuple != null) return tuple; if (!pivotedBlock.advancePivot()) return null; joiner.newPivot(); } } static final class Joiner { private final Block block; private final int tagIndex; private final Tuple rightTuple; private final Tuple output; private final boolean isLeftOuter; private final String[] joinKeys; private Tuple leftTuple; public Joiner(PivotedBlock block, JsonNode json) { this.block = block; BlockSchema schema = block.getProperties().getSchema(); tagIndex = schema.getNumColumns() - 1; rightTuple = TupleFactory.getInstance().newTuple(schema.getNumColumns()); output = TupleFactory.getInstance().newTuple(schema.getNumColumns() - 1); isLeftOuter = (json.has("joinType") && getText(json, "joinType").equalsIgnoreCase("LEFT OUTER")); joinKeys = asArray(json, "joinKeys"); } void newPivot() throws IOException, InterruptedException { // we have just advanced the pivot // fetch the first tuple from this pivot Tuple tuple = block.next(); // if this tuple has tag 0 (that is, row from right table is found) if (tuple.get(tagIndex).equals(0)) { // make a copy of this right tuple TupleUtils.copy(tuple, rightTuple); // fetch the next row (this will be the row from left table) tuple = block.next(); if (tuple != null && tuple.get(tagIndex).equals(0)) { // found duplicate value in dimension table. Error out! BlockSchema schema = block.getProperties().getSchema(); Map<String, Integer> indexMap = schema.getIndexMap(); // exclude TAG_COLUMN String[] values1 = new String[indexMap.size() - 1]; String[] values2 = new String[indexMap.size() - 1]; String[] columns = new String[indexMap.size() - 1]; int i = 0; for (Map.Entry<String, Integer> entry : indexMap.entrySet()) { String columName = entry.getKey(); if (columName.equalsIgnoreCase(TAG_COLUMN)) { continue; } columns[i] = columName; values1[i] = String.valueOf(rightTuple.get(entry.getValue())); values2[i] = String.valueOf(tuple.get(entry.getValue())); i++; } String message = String.format("Duplicate keys found in dimension table.\n\t%s\n\t%s", Arrays.toString(CommonUtils.zip(columns, values1, ":")), Arrays.toString(CommonUtils.zip(columns, values2, ":"))); throw new RuntimeException(message); } } else { // if we haven't seen any row from right table if (isLeftOuter) { // if this is outer join, assign nulls to the rightTuple for (int i = 0; i < rightTuple.size(); i++) rightTuple.set(i, null); } else { // inner join: drain the current pivot while ((tuple = block.next()) != null) ; } } leftTuple = tuple; } Tuple next() throws IOException, InterruptedException { if (leftTuple == null) return null; int rightTupleStart = (Integer) leftTuple.get(tagIndex); int tupleEnd = output.size(); // copy the columns from the left tuple for (int i = 0; i < rightTupleStart; i++) { output.set(i, leftTuple.get(i)); } // copy the columns from the right tuple for (int i = rightTupleStart; i < tupleEnd; i++) { output.set(i, rightTuple.get(i)); } // after copying, fetch the next tuple leftTuple = block.next(); return output; } } @Override public PostCondition getPostCondition(Map<String, PostCondition> preConditions, JsonNode json) throws PreconditionException { PostCondition preCondition = preConditions.values().iterator().next(); BlockSchema schema = preCondition.getSchema(); // remove the ___tag columns BlockSchema outputSchema = schema.getComplementSubset(new String[]{TAG_COLUMN}); return new PostCondition(outputSchema, preCondition.getPartitionKeys(), preCondition.getSortKeys()); } }