/* (c) 2014 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use
* this file except in compliance with the License. You may obtain a copy of the
* License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed
* under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
* CONDITIONS OF ANY KIND, either express or implied.
*/
package com.linkedin.cubert.plan.physical;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Partitioner;

import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.JsonParseException;
import org.codehaus.jackson.map.JsonMappingException;
import org.codehaus.jackson.map.ObjectMapper;

import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.BlockUtils;
import com.linkedin.cubert.utils.JsonUtils;
import com.linkedin.cubert.utils.Pair;

/**
 * Partitions the data during the shuffle stage.
 * <p>
 * The partition keys may come from the shuffle key tuple or from the value tuple:
 * {@code setConf()} records, for each partition key, which tuple it lives in and at
 * what index, and {@code getPartition()} assembles those columns into a scratch tuple
 * whose block id selects the reducer.
 * <p>
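 * A minimal wiring sketch, assuming a standard {@code org.apache.hadoop.mapreduce.Job}
 * (the shuffle JSON that {@code setConf()} reads must be placed in the job
 * configuration separately):
 *
 * <pre>
 * Job job = Job.getInstance(new Configuration());
 * job.setPartitionerClass(CubertPartitioner.class);
 * </pre>
 *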
* @author Maneesh Varshney
*
*/
public class CubertPartitioner<V> extends Partitioner<Tuple, V> implements Configurable
{
    // Job configuration, stored so that getConf() can honor the Configurable contract.
    private Configuration conf;

    // Scratch tuple, reused across getPartition() calls, that holds the partition key
    // columns of the current record.
    private Tuple keyTuple;

    // For each partition key, a (fromKey, columnIndex) pair: fromKey is true when the
    // column comes from the shuffle key tuple, and false when it comes from the value
    // tuple.
    private final ArrayList<Pair<Boolean, Integer>> partitionKeyIndex =
            new ArrayList<Pair<Boolean, Integer>>();
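
    /**
     * Reads the shuffle instructions (JSON stored in the job configuration under
     * CubertStrings.JSON_SHUFFLE) and computes, for each partition key, whether that
     * column is found in the shuffle key tuple or in the value tuple, along with its
     * index within that tuple.
     */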
@Override
public void setConf(Configuration conf)
{
        this.conf = conf;

        String jsonStr = conf.get(CubertStrings.JSON_SHUFFLE);
        if (jsonStr == null)
        {
            throw new IllegalArgumentException(CubertStrings.JSON_SHUFFLE
                    + " is not set in the job configuration");
        }
try
{
            JsonNode json = new ObjectMapper().readValue(jsonStr, JsonNode.class);

            // The shuffle key is made up of the pivot keys; the remaining columns of
            // the full schema form the value.
            String[] pivotKeys = JsonUtils.asArray(json.get("pivotKeys"));
            BlockSchema fullSchema = new BlockSchema(json.get("schema"));
            BlockSchema keySchema = fullSchema.getSubset(pivotKeys);
            BlockSchema valueSchema = fullSchema.getComplementSubset(pivotKeys);

            // Locate each partition key in either the key schema or the value schema,
            // and remember which side it came from.
            String[] partitionKeys = JsonUtils.asArray(json.get("partitionKeys"));
            keyTuple = TupleFactory.getInstance().newTuple(partitionKeys.length);

for (int i = 0; i < partitionKeys.length; i++)
{
if (keySchema.hasIndex(partitionKeys[i]))
{
partitionKeyIndex.add(new Pair<Boolean, Integer>(true,
keySchema.getIndex(partitionKeys[i])));
}
else
{
partitionKeyIndex.add(new Pair<Boolean, Integer>(false,
valueSchema.getIndex(partitionKeys[i])));
}
}
}
        catch (IOException e)
        {
            // JsonParseException and JsonMappingException are both subclasses of
            // IOException, so this single handler covers all parsing failures.
            throw new RuntimeException(e);
        }
}

    @Override
    public Configuration getConf()
    {
        // Return the configuration stored in setConf(); returning null would violate
        // the Configurable contract.
        return conf;
    }
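
    /**
     * Assembles the partition key columns of the current record into the scratch tuple
     * and maps the tuple's block id onto one of the numPartitions reducers.
     */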
@Override
public int getPartition(Tuple key, V value, int numPartitions)
{
try
{
            // Copy each partition key column, from the key tuple or the value tuple as
            // recorded in setConf(), into the scratch tuple.
            for (int i = 0; i < partitionKeyIndex.size(); i++)
            {
                Pair<Boolean, Integer> index = partitionKeyIndex.get(i);
                Object o;
                if (index.getFirst())
                    o = key.get(index.getSecond());
                else
                    o = ((Tuple) value).get(index.getSecond());
                keyTuple.set(i, o);
            }
}
        catch (ExecException e)
        {
            // Silently swallowing this error would partition the record on stale data;
            // surface it as a runtime failure instead.
            throw new RuntimeException(e);
        }
        // The arithmetic must be done in long: if the block id were an int equal to
        // Integer.MIN_VALUE (-2147483648), both Math.abs() and multiplication by -1
        // would still return a negative number.
        long hashcode = BlockUtils.getBlockId(keyTuple);
        int partition = (int) (hashcode % numPartitions);

        // Guard against a negative block id, which would otherwise yield a negative
        // (and thus invalid) partition number.
        return partition < 0 ? partition + numPartitions : partition;
}
}