/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.block;

import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;

import com.linkedin.cubert.operator.BlockOperator;
import com.linkedin.cubert.operator.PhaseContext;
import com.linkedin.cubert.operator.PostCondition;
import com.linkedin.cubert.operator.PreconditionException;
import com.linkedin.cubert.utils.CommonUtils;
import com.linkedin.cubert.utils.DefaultSortAlgo;
import com.linkedin.cubert.utils.FileCache;
import com.linkedin.cubert.utils.JsonUtils;
import com.linkedin.cubert.utils.SerializedTupleStore;
import com.linkedin.cubert.utils.TupleUtils;

/**
 * A BlockOperator that groups tuples from a single input block into output
 * blocks according to the configured blockgen strategy: BY_ROW, BY_SIZE,
 * BY_PARTITION_KEY, BY_BLOCK_ID or BY_INDEX.
*
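 * A minimal driver sketch (illustrative only; assumes the caller supplies a
 * Hadoop {@code conf}, a blockgen {@code json} configuration node, and an
 * upstream {@code Block}):
 *
 * <pre>
 * BlockOperator op = new CreateBlockOperator();
 * op.setInput(conf, Collections.singletonMap("input", upstreamBlock), json);
 * Block block;
 * while ((block = op.next()) != null)
 * {
 *     Tuple tuple;
 *     while ((tuple = block.next()) != null)
 *     {
 *         // process each tuple of the block
 *     }
 * }
 * </pre>
 *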
* @author Maneesh Varshney
*
*/
public class CreateBlockOperator implements BlockOperator
{
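
    /**
     * Decides when the block under construction is "full", based on the
     * configured blockgen type and its threshold value.
     */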
private static class CostFunction
{
protected final long blockgenValue;
protected final BlockgenType type;
CostFunction(long blockgenValue, BlockgenType type)
{
this.blockgenValue = blockgenValue;
this.type = type;
}
protected boolean isCostExceeded(long numTuples,
long numPartitionKeys,
long storeSize)
{
switch (type)
{
            case BY_BLOCK_ID:
            case BY_INDEX:
                return numPartitionKeys > 0;
case BY_PARTITION_KEY:
return numPartitionKeys >= blockgenValue;
case BY_ROW:
return numTuples >= blockgenValue;
case BY_SIZE:
return storeSize >= blockgenValue;
case USER_DEFINED:
break;
default:
break;
}
return false;
}
}
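
    /**
     * A Block that buffers tuples in a SerializedTupleStore so that they can
     * be sorted on the pivot keys before being returned.
     */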
static final class StoredBlock implements Block
{
private final PivotedBlock input;
private Iterator<Tuple> iterator;
private int numPartitionKeys;
private final SerializedTupleStore store;
private final BlockProperties props;
StoredBlock(PivotedBlock input,
String[] partitionKeys,
String[] sortKeys,
CostFunction costFunction) throws IOException, InterruptedException
{
this.input = input;
BlockSchema inputSchema = input.getProperties().getSchema();
props = new BlockProperties("", inputSchema, input.getProperties());
props.setBlockId(-1);
            // If sorting is not needed, use the simpler SerializedTupleStore
            // (which does not maintain an offset list).
if (sortKeys == null)
store = new SerializedTupleStore(inputSchema);
else
store = new SerializedTupleStore(inputSchema, sortKeys);
            // Fetch the first tuple; its partition-key columns define this
            // block's partition key.
Tuple tuple = input.next();
if (tuple == null)
{
// If there is no input data, set "empty iterator"
iterator = new Iterator<Tuple>()
{
@Override
public boolean hasNext()
{
return false;
}
@Override
public Tuple next()
{
return null;
}
@Override
public void remove()
{
}
};
return;
}
Tuple partitionKey =
TupleUtils.extractTuple(tuple, inputSchema, partitionKeys);
props.setPartitionKey(partitionKey);
store.addToStore(tuple);
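            // Keep pulling whole pivots (partition-key groups) into the store
            // until the cost function reports the block is full or the input
            // is exhausted.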
while (true)
{
while ((tuple = input.next()) != null)
store.addToStore(tuple);
numPartitionKeys++;
if (costFunction.isCostExceeded(store.getNumTuples(),
numPartitionKeys,
store.size()))
break;
if (!input.advancePivot())
break;
}
System.out.println("Store size is " + store.size());
if (sortKeys != null)
store.sort(new DefaultSortAlgo<Tuple>());
iterator = store.iterator();
}
@Override
public void configure(JsonNode json) throws IOException,
InterruptedException
{
}
@Override
public Tuple next() throws IOException,
InterruptedException
{
if (iterator.hasNext())
return iterator.next();
else
{
store.clear();
return null;
}
}
@Override
public void rewind() throws IOException
{
}
@Override
public BlockProperties getProperties()
{
return props;
}
}
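
    /**
     * A Block that streams tuples straight from the pivoted input without
     * buffering; applicable when the tuples do not need to be re-sorted.
     */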
static class StreamingBlock implements Block
{
protected BlockProperties props;
protected final PivotedBlock input;
private long numPartitionKeys;
private long numTuples;
protected Tuple firstTuple;
private boolean isFirstTuple = true;
private final CostFunction costFunction;
StreamingBlock(PivotedBlock input,
String[] partitionKeys,
CostFunction costFunction) throws IOException,
InterruptedException
{
this.input = input;
this.costFunction = costFunction;
BlockSchema inputSchema = input.getProperties().getSchema();
props = new BlockProperties("", inputSchema, input.getProperties());
props.setBlockId(-1);
firstTuple = input.next();
if (firstTuple == null)
{
return;
}
Tuple partitionKey =
TupleUtils.extractTuple(firstTuple, inputSchema, partitionKeys);
props.setPartitionKey(partitionKey);
numPartitionKeys++;
}
@Override
public void configure(JsonNode json) throws IOException
{
}
@Override
public Tuple next() throws IOException,
InterruptedException
{
if (isFirstTuple)
{
isFirstTuple = false;
if (firstTuple == null)
return null;
numTuples++;
return firstTuple;
}
Tuple tuple = input.next();
if (tuple != null)
{
numTuples++;
return tuple;
}
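            // The current pivot (partition-key group) is exhausted: stop if
            // the cost budget is spent; otherwise advance to the next group.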
if (costFunction.isCostExceeded(numTuples, numPartitionKeys, 0))
return null;
if (!input.advancePivot())
return null;
numPartitionKeys++;
return next();
}
@Override
public void rewind() throws IOException
{
}
@Override
public BlockProperties getProperties()
{
return props;
}
}
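
    /**
     * A StreamingBlock whose block id and partition key come from a cached
     * Index, and which projects the synthetic BLOCK_ID column out of the
     * output tuples.
     */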
private static class ByIndexBlock extends StreamingBlock
{
private Index index;
private int[] colIndex;
private Tuple outputTuple;
ByIndexBlock(PivotedBlock input, String[] partitionKeys, CostFunction costFunction) throws IOException,
InterruptedException
{
super(input, partitionKeys, costFunction);
BlockSchema inputSchema = input.getProperties().getSchema();
BlockSchema outputSchema =
inputSchema.getComplementSubset(new String[] { "BLOCK_ID" });
colIndex = new int[outputSchema.getNumColumns()];
for (int i = 0; i < colIndex.length; i++)
colIndex[i] = inputSchema.getIndex(outputSchema.getName(i));
outputTuple = TupleFactory.getInstance().newTuple(colIndex.length);
props = new BlockProperties("", outputSchema, input.getProperties());
props.setBlockId(-1);
}
@Override
public void configure(JsonNode json) throws IOException
{
String indexName = JsonUtils.getText(json, "index");
try
{
index = FileCache.getCachedIndex(indexName);
}
catch (ClassNotFoundException e)
{
throw new RuntimeException(e);
}
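            // If this reducer received data, read the BLOCK_ID column of the
            // first tuple and configure the block id and partition key from
            // the index.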
if (firstTuple != null)
{
int blockIdIndex = input.getProperties().getSchema().getIndex("BLOCK_ID");
configure((Long) firstTuple.get(blockIdIndex));
}
}
@Override
public Tuple next() throws IOException,
InterruptedException
{
Tuple out = super.next();
if (out == null)
return out;
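            // Copy all columns except BLOCK_ID into the reusable output
            // tuple.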
for (int i = 0; i < colIndex.length; i++)
outputTuple.set(i, out.get(colIndex[i]));
return outputTuple;
}
        /**
         * This configure method is used when empty blocks must be created to
         * match their counterparts in the parent (indexed) relation: it sets
         * the block ID and looks up the corresponding partition key from the
         * index.
         *
         * @param blockId
         *            block ID to be set
         */
public void configure(long blockId)
{
Tuple partitionKey = index.getEntry(blockId).getKey();
props.setPartitionKey(partitionKey);
props.setBlockId(blockId);
}
}
private JsonNode inputJson;
private PivotedBlock inputBlock;
private BlockgenType blockgenType;
private long blockgenValue;
private String[] partitionKeys;
private String[] sortKeys;
private final Set<Long> relevantBlockIds = new HashSet<Long>();
private Map<Long, Integer> blockIdPartitionMap;
private int reducerId = -1;
private int nReducers = -1;
    private boolean firstBlock = true;
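
    /**
     * Validates that exactly one non-null input block is provided, reads the
     * blockgen configuration (type, value, partition and pivot keys) from the
     * JSON node, and wraps the input in a PivotedBlock over the partition
     * keys. For BY_INDEX blockgen, also determines this reducer's id and the
     * set of block ids it is responsible for.
     */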
@Override
public void setInput(Configuration conf, Map<String, Block> input, JsonNode json) throws IOException,
InterruptedException
{
if (input.size() == 0)
throw new IllegalArgumentException("No input block is provided");
if (input.size() != 1)
throw new IllegalArgumentException("This operator operates on only one input block. ("
+ input.size() + " provided)");
this.inputJson = json;
Block block = input.values().iterator().next();
if (block == null)
throw new IllegalArgumentException("The specified block for ["
+ input.keySet().iterator().next() + "] is null");
blockgenType =
BlockgenType.valueOf(JsonUtils.getText(json, "blockgenType")
.toUpperCase());
if (json.has("blockgenValue"))
{
blockgenValue = json.get("blockgenValue").getLongValue();
}
partitionKeys = JsonUtils.asArray(json.get("partitionKeys"));
if (json.has("pivotKeys"))
sortKeys = JsonUtils.asArray(json, "pivotKeys");
        // If the sort keys are a prefix of the partition keys, or vice versa,
        // there is no need to sort the block.
if (CommonUtils.isPrefix(partitionKeys, sortKeys)
|| CommonUtils.isPrefix(sortKeys, partitionKeys))
sortKeys = null;
inputBlock = new PivotedBlock(block, partitionKeys);
if (blockgenType == BlockgenType.BY_INDEX)
{
if (PhaseContext.isMapper())
{
throw new RuntimeException("Expecting Reduce Context while performing LOAD BLOCK");
}
nReducers = conf.getInt("mapred.reduce.tasks", -1);
if (nReducers < 0)
throw new RuntimeException("Unable to determine number of reducers.");
reducerId =
PhaseContext.getRedContext().getTaskAttemptID().getTaskID().getId();
retrieveRelevantBlockIds(json);
}
}
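
    /**
     * Loads the cached index and records the block ids whose partition maps
     * to the current reducer.
     */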
private void retrieveRelevantBlockIds(JsonNode json) throws IOException
{
final Index index;
try
{
String indexName = JsonUtils.getText(json, "index");
index = FileCache.getCachedIndex(indexName);
}
catch (ClassNotFoundException e)
{
throw new RuntimeException(e);
}
blockIdPartitionMap = index.getBlockIdPartitionMap(nReducers);
for (Long blockId : index.getAllBlockIds())
{
if (isRelevant(blockId))
{
relevantBlockIds.add(blockId);
}
}
}
    /**
     * The relevance check relies on the fact that an INDEX-based join shuffles
     * data on the BLOCK_ID, so each block id maps to exactly one reducer.
     *
     * @param blockId the block ID to test
     * @return true if the block is relevant to the current reducer
     */
private boolean isRelevant(Long blockId)
{
return blockIdPartitionMap.get(blockId) == reducerId;
}
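
    /**
     * Returns the next output block, or null once the input is exhausted. For
     * BY_INDEX blockgen, empty blocks are emitted for any remaining relevant
     * block ids after the input runs out.
     */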
@Override
public Block next() throws IOException,
InterruptedException
{
if (!firstBlock && !inputBlock.advancePivot())
{
            /*
             * Blockgen BY_INDEX case: the input is exhausted, so create empty
             * blocks for any block IDs still relevant to this reducer.
             */
if (blockgenType == BlockgenType.BY_INDEX && relevantBlockIds.size() > 0)
{
final Long nextBlockId = relevantBlockIds.iterator().next();
final ByIndexBlock nextBlock = (ByIndexBlock) createBlock();
nextBlock.configure(inputJson);
nextBlock.configure(nextBlockId);
relevantBlockIds.remove(nextBlockId);
return nextBlock;
}
return null;
}
final Block nextBlock = createBlock();
nextBlock.configure(inputJson);
        /*
         * This check handles the blockgen BY_INDEX case where the reducer
         * receives no data at all but is still responsible for generating
         * empty blocks for its relevant block ids. In that case firstBlock is
         * still true, so the empty-block branch above was never taken. For a
         * ByIndexBlock only, configure(blockId) sets the block id and the
         * partition key from the index.
         */
if (firstBlock && relevantBlockIds.size() > 0
&& nextBlock instanceof ByIndexBlock
&& ((ByIndexBlock) nextBlock).firstTuple == null)
{
final Long nextBlockId = relevantBlockIds.iterator().next();
((ByIndexBlock) nextBlock).configure(nextBlockId);
}
relevantBlockIds.remove(nextBlock.getProperties().getBlockId());
if (firstBlock)
firstBlock = false;
return nextBlock;
}
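
    /**
     * Instantiates the Block implementation for the configured blockgen type:
     * index-driven (ByIndexBlock), streaming (StreamingBlock), or buffered
     * and optionally sorted (StoredBlock).
     */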
private Block createBlock() throws IOException,
InterruptedException
{
CostFunction costFunction = new CostFunction(blockgenValue, blockgenType);
switch (blockgenType)
{
case BY_INDEX:
return new ByIndexBlock(inputBlock, partitionKeys, costFunction);
        case BY_PARTITION_KEY:
        case BY_ROW:
            if (sortKeys == null)
                return new StreamingBlock(inputBlock, partitionKeys, costFunction);
            else
                return new StoredBlock(inputBlock, partitionKeys, sortKeys, costFunction);
case BY_BLOCK_ID:
return new ByIndexBlock(inputBlock, partitionKeys, costFunction);
case BY_SIZE:
return new StoredBlock(inputBlock, partitionKeys, sortKeys, costFunction);
case USER_DEFINED:
break;
default:
break;
}
return null;
}
@Override
public PostCondition getPostCondition(Map<String, PostCondition> preConditions,
JsonNode json) throws PreconditionException
{
// TODO Auto-generated method stub
return null;
}
}