/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.io.rubix;

import java.io.IOException;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;

import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.BlockWriter;
import com.linkedin.cubert.block.CommonContext;
import com.linkedin.cubert.operator.PhaseContext;

/**
 * Writes a block in the Rubix file format.
 *
 * @author Maneesh Varshney
 */
public class RubixBlockWriter implements BlockWriter
{
    private BlockSchema outputSchema;

    // Number of blocks written so far by this task; used to make generated
    // block ids unique within a reducer (see createBlockId()).
    private static long blockCount = 0;

    @Override
    public void configure(JsonNode json) throws JsonParseException,
            JsonMappingException,
            IOException
    {
        outputSchema = new BlockSchema(json.get("schema"));
    }

    @Override
    public void write(Block block, CommonContext context) throws IOException,
            InterruptedException
    {
        // increment the "blocks written" counter
        // PhaseContext.getCounter("FileSystemCounters", "Rubix Blocks Written")
        //             .increment(1);

        Tuple outputTuple =
                TupleFactory.getInstance().newTuple(outputSchema.getNumColumns());

        // Map each output column to its index in the input block's schema.
        int[] outputFieldIndex = new int[outputSchema.getNumColumns()];
        for (int i = 0; i < outputFieldIndex.length; i++)
        {
            outputFieldIndex[i] =
                    block.getProperties().getSchema().getIndex(outputSchema.getName(i));
        }

        Tuple partitionKey = block.getProperties().getPartitionKey();
        if (partitionKey == null)
        {
            /*
             * This is a case where the block has no data and no partition key
             * information. In this case: generate an empty block with an empty
             * partition key. This ensures that reducers create at least 1 block.
             */
            partitionKey = TupleFactory.getInstance().newTuple(0);
        }

        long blockId = block.getProperties().getBlockId();
        if (blockId < 0)
        {
            blockId = createBlockId();
            blockCount++;
        }

        Tuple compositeKey = TupleFactory.getInstance().newTuple(2);
        compositeKey.set(0, partitionKey);
        compositeKey.set(1, blockId);

        // Write the first tuple with the composite (partition key, block id) key
        Tuple tuple = block.next();
        if (tuple == null) // i.e. the block is empty
        {
            context.write(compositeKey, null);
            return;
        }

        projectColumns(tuple, outputTuple, outputFieldIndex);
        context.write(compositeKey, outputTuple);

        // Write the remaining tuples with a null key
        while ((tuple = block.next()) != null)
        {
            projectColumns(tuple, outputTuple, outputFieldIndex);
            context.write(null, outputTuple);
        }
    }

    // Copy the projected columns from the input tuple into the output tuple,
    // using the precomputed input-schema indexes.
    private void projectColumns(Tuple input, Tuple output, int[] fieldIndex) throws ExecException
    {
        for (int i = 0; i < fieldIndex.length; i++)
        {
            output.set(i, input.get(fieldIndex[i]));
        }
    }

    // Pack the reducer task id into the high 32 bits and the per-task block
    // counter into the low 32 bits, yielding a block id unique across reducers.
    private long createBlockId()
    {
        long reducerId =
                PhaseContext.getRedContext().getTaskAttemptID().getTaskID().getId();
        return (reducerId << 32) | blockCount;
    }
}