package com.linkedin.cubert.operator;
import java.io.IOException;
import java.util.Arrays;
import java.util.Map;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.codehaus.jackson.JsonNode;
import com.linkedin.cubert.block.Block;
import com.linkedin.cubert.block.BlockProperties;
import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.utils.CommonUtils;
import com.linkedin.cubert.utils.JsonUtils;
/**
* Tuple operator that generates a rank for each tuple. This is an analytical-function
* operator: it consumes a sorted block and assigns ranks within each "partition by"
* sub-block, restarting at 1 whenever the groupBy key combination changes. See the
* example below.
*
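* <p>
* A hypothetical configuration and data sketch (column names and values are
* illustrative, not taken from any real job):
*
* <pre>{@code
* // JSON properties:
* //   { "groupBy": ["memberId"], "orderBy": ["score"], "rankAs": "rank" }
* //
* // Input block, partitioned by memberId and sorted on (memberId, score):
* //   (m1, 10)  (m1, 20)  (m2, 5)
* //
* // Output tuples (rank appended as the last column, restarting per group):
* //   (m1, 10, 1)  (m1, 20, 2)  (m2, 5, 1)
* }</pre>
*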
* @author Mani Parkhe
*/
public class RankOperator implements TupleOperator
{
private BlockSchema outputSchema = null;
private Tuple outputTuple;
private int rankColumnIndex;
private long rank = Long.MIN_VALUE;
private Block inputBlock = null;
private Block currentBlock;
private PivotBlockOperator pivotBlockOp;
/**
* {@inheritDoc}
*
* @see com.linkedin.cubert.operator.TupleOperator#setInput(java.util.Map,
* org.codehaus.jackson.JsonNode, com.linkedin.cubert.block.BlockProperties)
*/
@Override
public void setInput(Map<String, Block> input, JsonNode json, BlockProperties props) throws IOException,
InterruptedException
{
inputBlock = input.values().iterator().next();
BlockSchema inputSchema = inputBlock.getProperties().getSchema();
outputSchema = getOutputSchema(json, inputSchema);
outputTuple = TupleFactory.getInstance().newTuple(outputSchema.getNumColumns());
rankColumnIndex = outputSchema.getNumColumns() - 1;
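// Pivot the sorted input into consecutive sub-blocks, one per distinct
// combination of the groupBy columns; the rank restarts for each sub-block.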
pivotBlockOp = new PivotBlockOperator();
String[] groupByColumns = JsonUtils.asArray(JsonUtils.get(json, "groupBy"));
pivotBlockOp.setInput(inputBlock, groupByColumns, false);
resetState();
}
/**
* Creates the output schema: the input schema with a rank column (named by the
* user via the "rankAs" property) appended at the end.
*
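* <p>
* For example (hypothetical column names, using the "type name" schema-string
* convention this method already relies on):
*
* <pre>{@code
* // inputSchema: "int memberId, double score", rankAs = "rank"
* // result:      "int memberId, double score, long rank"
* }</pre>
*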
* @param json operator configuration; must contain the "rankAs" property
* @param inputSchema schema of the input block
* @return the input schema with a long rank column appended
*/
private BlockSchema getOutputSchema(JsonNode json, BlockSchema inputSchema)
{
String rankColumnType = "long " + JsonUtils.getText(json, "rankAs");
return inputSchema.append(new BlockSchema(rankColumnType));
}
/**
* This is the meat of the operator. While the current pivot block has tuples,
* increment the rank and emit the tuple with the rank appended. When the current
* block runs out, fetch the next pivot block and reset the rank. Since the
* preconditions dictate that the block is partitioned by the GROUP BY keys (or a
* subset of them) and sorted on GROUP BY + ORDER BY keys, there is no need to
* compare tuples here.
*
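* <p>
* A hypothetical caller (variable name illustrative) drains the operator like
* any other TupleOperator:
*
* <pre>{@code
* Tuple t;
* while ((t = rankOperator.next()) != null)
* {
*     // t holds the input columns plus the rank in the last position.
*     // Note: the operator reuses its output tuple across calls, so copy
*     // the tuple if it must be retained.
* }
* }</pre>
*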
* @see com.linkedin.cubert.operator.TupleOperator#next()
*/
@Override
public Tuple next() throws IOException,
InterruptedException
{
if (currentBlock == null)
{
if (!nextBlock())
return null;
}
// If the current pivot block is exhausted, reset state and recurse to fetch the next one.
Tuple tuple = currentBlock.next();
if (tuple == null)
{
resetState();
return this.next();
}
// Valid tuple: increment the rank and guard against overflow.
rank++;
if (rank < 0)
throw new RuntimeException("Rank overflow!");
for (int i = 0; i < rankColumnIndex; i++)
outputTuple.set(i, tuple.get(i));
outputTuple.set(rankColumnIndex, rank);
return outputTuple;
}
/*
* Get next pivot block.
*/
private boolean nextBlock() throws IOException,
InterruptedException
{
// Fetch the next pivot block; null signals that the input is exhausted.
currentBlock = pivotBlockOp.next();
return currentBlock != null;
}
private void resetState()
{
rank = 0; // next() pre-increments before assigning, so the first emitted rank is 1.
currentBlock = null;
}
/**
* {@inheritDoc}
*
* @see com.linkedin.cubert.operator.TupleOperator#getPostCondition(java.util.Map,
* org.codehaus.jackson.JsonNode)
*/
@Override
public PostCondition getPostCondition(Map<String, PostCondition> preConditions,
JsonNode json) throws PreconditionException
{
String inputBlockName = JsonUtils.getText(json, "input");
PostCondition inputBlockPostCondition = preConditions.get(inputBlockName);
BlockSchema inputSchema = inputBlockPostCondition.getSchema();
String[] groupByKeys = JsonUtils.asArray(JsonUtils.get(json, "groupBy"));
String[] orderByKeys = JsonUtils.asArray(JsonUtils.get(json, "orderBy"));
String[] expectedSortOrder = CommonUtils.concat(groupByKeys, orderByKeys);
// Check: all orderBy and groupBy columns exist in inputBlock's schema
for (String colName : expectedSortOrder)
{
if (!inputSchema.hasIndex(colName))
{
String msg =
String.format("Input block '%s' is missing expected column '%s'",
inputBlockName,
colName);
throw new PreconditionException(PreconditionExceptionType.COLUMN_NOT_PRESENT,
msg);
}
}
// Check: partition order. The partition keys must equal the group-by keys or be
// a prefix of them.
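// For example, with groupBy = [a, b], partition keys [a] or [a, b] pass this
// check, while [b] or [a, b, c] fail it.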
String[] partitionKeys = inputBlockPostCondition.getPartitionKeys();
if (groupByKeys.length > 0
&& (partitionKeys == null || partitionKeys.length == 0 || !CommonUtils.isPrefix(groupByKeys,
partitionKeys)))
{
throw new PreconditionException(PreconditionExceptionType.INVALID_PARTITION_KEYS,
String.format("Found=%s, Expected=%s",
partitionKeys == null
? "[null]"
: Arrays.toString(partitionKeys),
Arrays.toString(groupByKeys)));
}
// Check: sort order. The block must be sorted on GROUP BY + ORDER BY keys, i.e.,
// expectedSortOrder must be a prefix of the block's sort keys.
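// For example, with groupBy = [a] and orderBy = [b], sort keys [a, b] or
// [a, b, c] pass this check, while [a] or [b, a] fail it.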
String[] sortKeys = inputBlockPostCondition.getSortKeys();
if (expectedSortOrder.length > 0
&& (sortKeys == null || !CommonUtils.isPrefix(sortKeys, expectedSortOrder)))
{
throw new PreconditionException(PreconditionExceptionType.INVALID_SORT_KEYS,
String.format("Found=%s, Expected=%s",
sortKeys == null
? "[null]"
: Arrays.toString(sortKeys),
Arrays.toString(expectedSortOrder)));
}
BlockSchema schema = getOutputSchema(json, inputSchema);
return new PostCondition(schema, partitionKeys, sortKeys);
}
}