/* (c) 2014 LinkedIn Corp. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use
 * this file except in compliance with the License. You may obtain a copy of the
 * License at http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed
 * under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied.
 */

package com.linkedin.cubert.plan.physical;

import java.io.IOException;
import java.util.Map;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.Tuple;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;

import com.linkedin.cubert.block.BlockSchema;
import com.linkedin.cubert.block.Index;
import com.linkedin.cubert.utils.FileCache;
import com.linkedin.cubert.utils.JsonUtils;

/**
 * Partitioner for BLOCKGEN BY INDEX. This partitioner expects a BLOCK_ID within the key
 * when shuffling. It extracts the reducerId from the BLOCK_ID (the upper 32 bits) and
 * uses it to route the tuple to the corresponding reducer.
 * <p>
 * This partitioner ensures that the skew of the partition keys in blockgen by index
 * is identical to the skew in the original blockgen.
 *
 * @author Maneesh Varshney
 *
 * @param <V>
 */
public class ByIndexPartitioner<V> extends Partitioner<Tuple, V> implements Configurable
{
    private Configuration conf;
    private String indexName;
    private int blockIdIndex;
    private Map<Long, Integer> blockIdReducerMap;

    @Override
    public int getPartition(Tuple key, V value, int numPartitions)
    {
        // Lazily build the BLOCK_ID -> reducer map on first use; the cached index is
        // only available once the task is running.
        if (blockIdReducerMap == null)
        {
            prepareBlockIdReducerMap();
        }

        long blockId;
        try
        {
            blockId = (Long) key.get(blockIdIndex);
        }
        catch (ExecException e)
        {
            throw new RuntimeException(e);
        }

        Integer partition = blockIdReducerMap.get(blockId);
        if (partition == null)
        {
            // Fail with a descriptive message instead of an opaque NullPointerException
            // on unboxing when a BLOCK_ID is missing from the index.
            throw new RuntimeException("BLOCK_ID " + blockId + " is not present in the index");
        }

        return partition;
    }

    @Override
    public void setConf(Configuration conf)
    {
        this.conf = conf;
        try
        {
            final String jsonStr = conf.get(CubertStrings.JSON_SHUFFLE);
            final JsonNode json = new ObjectMapper().readValue(jsonStr, JsonNode.class);

            final String[] pivotKeys = JsonUtils.asArray(json.get("pivotKeys"));
            final BlockSchema fullSchema = new BlockSchema(json.get("schema"));
            final BlockSchema keySchema = fullSchema.getSubset(pivotKeys);

            blockIdIndex = keySchema.getIndex("BLOCK_ID");
            indexName = JsonUtils.getText(json, "index");
        }
        catch (IOException e)
        {
            // JsonParseException and JsonMappingException are subclasses of IOException.
            // Fail fast: without the shuffle JSON this partitioner cannot route tuples.
            throw new RuntimeException("Unable to parse shuffle JSON from configuration", e);
        }
    }

    private void prepareBlockIdReducerMap()
    {
        Index index;
        try
        {
            index = FileCache.getCachedIndex(indexName);
        }
        catch (Exception e)
        {
            throw new RuntimeException(e);
        }

        int nReducers = conf.getInt("mapred.reduce.tasks", -1);
        if (nReducers == -1)
        {
            throw new RuntimeException("Unable to obtain number of reducers");
        }

        blockIdReducerMap = index.getBlockIdPartitionMap(nReducers);
        System.out.println("Successfully created BlockId versus reducer map. #blocks: "
                + blockIdReducerMap.size() + " #reducers: " + nReducers);
    }

    @Override
    public Configuration getConf()
    {
        return conf;
    }
}
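
/*
 * Illustrative sketch (assumption, not part of the original file): the Javadoc above
 * states that the reducer id occupies the upper 32 bits of a BLOCK_ID. The minimal
 * non-public class below demonstrates that bit layout, assuming the lower 32 bits
 * hold a per-reducer block sequence number. The names BlockIdLayoutSketch,
 * makeBlockId, and extractReducerId are hypothetical and do not exist in Cubert.
 */
class BlockIdLayoutSketch
{
    // Pack a reducer id into the upper 32 bits and a sequence number into the lower 32.
    // (The lower-32-bit sequence number is an assumption for illustration.)
    static long makeBlockId(int reducerId, int blockSequence)
    {
        return ((long) reducerId << 32) | (blockSequence & 0xFFFFFFFFL);
    }

    // Recover the reducer id by shifting the upper 32 bits down (unsigned shift),
    // mirroring the routing decision the partitioner's precomputed map encodes.
    static int extractReducerId(long blockId)
    {
        return (int) (blockId >>> 32);
    }

    public static void main(String[] args)
    {
        long blockId = makeBlockId(7, 42);
        System.out.println("blockId=" + blockId + " reducerId=" + extractReducerId(blockId));
        // Prints: blockId=30064771114 reducerId=7
    }
}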