/* (c) 2014 LinkedIn Corp. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use * this file except in compliance with the License. You may obtain a copy of the * License at http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR * CONDITIONS OF ANY KIND, either express or implied. */ package com.linkedin.cubert.operator; import java.io.IOException; import java.nio.ByteBuffer; import java.util.HashMap; import java.util.Map; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.pig.backend.executionengine.ExecException; import org.apache.pig.data.Tuple; import org.apache.pig.data.TupleFactory; import org.codehaus.jackson.JsonNode; import org.codehaus.jackson.node.ArrayNode; import org.codehaus.jackson.node.ObjectNode; import com.linkedin.cubert.block.Block; import com.linkedin.cubert.block.BlockSchema; import com.linkedin.cubert.block.BlockUtils; import com.linkedin.cubert.block.TupleOperatorBlock; import com.linkedin.cubert.io.BlockSerializationType; import com.linkedin.cubert.io.rubix.RubixMemoryBlock; import com.linkedin.cubert.utils.JsonUtils; import com.linkedin.cubert.utils.Pair; import com.linkedin.cubert.utils.RewriteUtils; /** * Given an input block with a pivoted column and a side meta-data block, create vector * blocks for each input in metadata file. * this <code>BlockOperator</code> outputs * multiple blocks (one combined block for each vector from meta-data block) * * @author Mani Parkhe */ public class CollateVectorBlockOperator implements BlockOperator { private RubixMemoryBlock inputBlock; private Map<Object, Pair<Integer, Integer>> coord2offsets = new HashMap<Object, Pair<Integer, Integer>>(); private String metaRelationName; private Block matchingMetaBlock; private int[] coordinateColumnIndexes; private String identifierColumnName = null; private int identifierColumnIndex; private JsonNode jsonForCombine = null; private CombineOperator combineOp = new CombineOperator(); private TupleOperatorBlock combinedBlock = null; private Map<String, Block> inputGenerator = new HashMap<String, Block>(); private JsonNode jsonForGenerate = null; private GenerateOperator genOp = null; private TupleOperatorBlock generatedBlock = null; @SuppressWarnings("unchecked") private Class<Tuple> valueClass = (Class<Tuple>) TupleFactory.getInstance() .newTuple() .getClass(); @Override public void setInput(Configuration conf, Map<String, Block> input, JsonNode json) throws IOException, InterruptedException { // #1. input block inputBlock = (RubixMemoryBlock) input.get(JsonUtils.getText(json, "inputBlock")); // #2. lookup column String lookupColumn = json.get("lookupColumn").getTextValue(); BlockSchema inputSchema = inputBlock.getProperties().getSchema(); coord2offsets = BlockUtils.generateColumnIndex(inputBlock, lookupColumn); // #3. meta data relation name metaRelationName = new String(JsonUtils.getText(json, "metaRelationName")); matchingMetaBlock = (Block) input.get(metaRelationName); BlockSchema metaBlockSchema = matchingMetaBlock.getProperties().getSchema(); // #4. find indexes for coordinate column names in meta relation's schema String[] coordinateColumns = JsonUtils.asArray(json.get("coordinateColumns")); coordinateColumnIndexes = new int[coordinateColumns.length]; int idx = 0; for (String s : JsonUtils.asArray(json.get("coordinateColumns"))) coordinateColumnIndexes[idx++] = metaBlockSchema.getIndex(s); // #5. find index of identifier column in meta relation's schema identifierColumnName = new String(JsonUtils.getText(json, "identifierColumn")); identifierColumnIndex = metaBlockSchema.getIndex(identifierColumnName); // #6. combine columns ArrayNode combineColumns = (ArrayNode) json.get("combineColumns"); // setup info for sort operator /* * jsonForSort = JsonUtils.cloneNode(json); ((ObjectNode) * jsonForSort).put("sortBy", combineColumns); sortedBlock = new * TupleOperatorBlock(sortOp); */ // setup info for combiner operator jsonForCombine = JsonUtils.createObjectNode(); ((ObjectNode) jsonForCombine).put("pivotBy", combineColumns); ((ObjectNode) jsonForCombine).put("schema", inputSchema.toJson()); combinedBlock = new TupleOperatorBlock(combineOp, null); // setup info for generate operator jsonForGenerate = JsonUtils.createObjectNode(); } @Override public Block next() throws IOException, InterruptedException { Tuple metaDataTuple = matchingMetaBlock.next(); if (metaDataTuple == null) return null; // Done System.out.println("Collate Vector: metadata tuple = " + metaDataTuple.toString()); return generateVectorBlock(metaDataTuple); } private Block generateVectorBlock(Tuple metaDataTuple) throws ExecException, IOException, InterruptedException { Map<String, Block> inputBlocksToCombiner = new HashMap<String, Block>(); for (int i : coordinateColumnIndexes) { Object coordinate = metaDataTuple.get(i); Block coordBlock = createCoordinateBlock(coordinate); if (coordBlock == null) continue; inputBlocksToCombiner.put(coordinate.toString(), coordBlock); } // No data for this vector -- proceed to next one. if (inputBlocksToCombiner.size() == 0) return this.next(); if (inputBlocksToCombiner.size() != coordinateColumnIndexes.length) { System.out.println("CollateVectorBlock: Found fewer input blocks than number of co-ordinates "); return this.next(); } // Combine individual blocks Object vectorIdentifier = metaDataTuple.get(identifierColumnIndex); if (!(vectorIdentifier instanceof Integer || vectorIdentifier instanceof String)) throw new RuntimeException("Unexpected data-type for identifier column"); Block combinedBlock = createCombinedBlock(inputBlocksToCombiner); /* * // Prepare input args for sort operator inputSorter.clear(); * inputSorter.put("combined_block", combinedBlock); * * // Setup sort operator object sortOp.setInput(inputSorter, jsonForSort); */ // Prepare input arguments for generator operator ArrayNode outputTupleJson = createJsonForGenerate(vectorIdentifier); JsonNode thisGenJson = JsonUtils.cloneNode(jsonForGenerate); ((ObjectNode) thisGenJson).put("outputTuple", outputTupleJson); inputGenerator.clear(); inputGenerator.put("combined_block", combinedBlock); // Setup generate operator object. genOp = new GenerateOperator(); genOp.setInput(inputGenerator, thisGenJson, null); // Return tuple operator block that contains this generate op. generatedBlock = new TupleOperatorBlock(genOp, null); // TODO: generatedBlock.setProperty("identifierColumn", vectorIdentifier); // System.out.println("CollateVectorBlock: finished setInput"); return generatedBlock; } private ArrayNode createJsonForGenerate(Object vectorIdentifier) { ArrayNode outputTupleJson = JsonUtils.createArrayNode(); // + First duplicate existing schema for (String s : inputBlock.getProperties().getSchema().getColumnNames()) { outputTupleJson.add(RewriteUtils.createProjectionExpressionNode(s, s)); } // + Add the new generated column JsonNode constNode; if (vectorIdentifier instanceof String) constNode = RewriteUtils.createStringConstant((String) vectorIdentifier); else constNode = RewriteUtils.createIntegerConstant((Integer) vectorIdentifier); String outColName = metaRelationName + "___" + identifierColumnName; outputTupleJson.add(JsonUtils.createObjectNode("col_name", outColName, "expression", constNode)); return outputTupleJson; } private Block createCombinedBlock(Map<String, Block> inputCombiner) throws IOException, InterruptedException { // Special case -- only one block. if (inputCombiner.size() == 1) { return inputCombiner.values().iterator().next(); } // Setup combine operator object combineOp.setInput(inputCombiner, jsonForCombine, null); // Return tuple operator block that contains the combine op. return combinedBlock; } private Block createCoordinateBlock(Object coordinate) throws IOException, InterruptedException { Pair<Integer, Integer> offsets = coord2offsets.get(coordinate); if (offsets == null) return null; int start = offsets.getFirst(); int len = offsets.getSecond() - start; ByteBuffer inMemBuffer = ByteBuffer.wrap(inputBlock.getByteBuffer().array(), start, len); RubixMemoryBlock miniBlock = new RubixMemoryBlock(null, PhaseContext.getConf(), inMemBuffer, valueClass, (CompressionCodec) null, BlockSerializationType.DEFAULT); // This is because the schema and pivotBy attributes which are set in // jsonForCombine also apply to this. miniBlock.configure(jsonForCombine); return miniBlock; } @Override public PostCondition getPostCondition(Map<String, PostCondition> preConditions, JsonNode json) throws PreconditionException { // TODO Auto-generated method stub return null; } }