package com.github.elazarl.multireducers;
import com.google.common.primitives.Ints;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.util.ReflectionUtils;
import java.util.ArrayList;
import java.util.List;
/**
 * Partitioner for {@link PerMapperOutputKey} keys that forwards each record to one of several
 * configured inner partitioners.
 *
 * <p>The inner partitioner is chosen by the key's {@code targetReducer} index. Its result is
 * shifted by the sum of the reducer counts configured for all lower indices, and the shifted
 * value is finally wrapped into the range of partitions the job actually has.
 *
 * <p>The inner partitioner classes are read from {@link #CONF_KEY} and the per-index reducer
 * counts from {@link #NUM_REDUCERS_KEY}; both are loaded in {@link #setConf(Configuration)}.
 *
 * @param <T> type of the map output value handed through to the inner partitioner
 */
public class MultiPartitioner<T> extends Partitioner<PerMapperOutputKey, T>
        implements Configurable {

    /** Configuration key holding the list of inner partitioner classes. */
    public static final String CONF_KEY = "com.github.elazarl.multireducers.partitioners";

    /** Configuration key holding the list of reducer counts, one integer per inner partitioner. */
    public static final String NUM_REDUCERS_KEY = "com.github.elazarl.multireducers.reducers.number";

    /** Configuration most recently passed to {@link #setConf(Configuration)}. */
    Configuration conf;

    /** Inner partitioners, indexed by {@code targetReducer}; populated by {@link #setConf(Configuration)}. */
    List<Partitioner<Object, Object>> partitioners;

    /** Reducer count per inner partitioner, indexed like {@link #partitioners}. */
    List<Integer> numReducers;

    /**
     * Computes the partition for a record by delegating to the inner partitioner selected by the key.
     *
     * @param perMapperOutputKey wrapper key; its {@code targetReducer} selects the inner partitioner
     *                           and its {@code data} is the key given to that partitioner
     * @param value              the record's value, passed through unchanged
     * @param numPartitions      number of partitions available to the job
     * @return the selected partitioner's result, offset by the reducer counts of all lower
     *         indices, modulo {@code numPartitions}
     * @throws IndexOutOfBoundsException if {@code targetReducer} has no matching entry in the
     *                                   configured partitioners or reducer counts
     */
    @Override
    public int getPartition(PerMapperOutputKey perMapperOutputKey, T value, int numPartitions) {
        final int target = perMapperOutputKey.targetReducer;

        // Partitions of inner partitioner k start right after those of partitioners 0..k-1.
        int base = 0;
        for (int idx = 0; idx < target; idx++) {
            base += numReducers.get(idx);
        }

        final Partitioner<Object, Object> delegate = partitioners.get(target);
        final int logicalReducers = numReducers.get(target);
        final int local = delegate.getPartition(perMapperOutputKey.data, value, logicalReducers);

        // Wrap around when there are fewer "physical" reducers than the configured counts add up
        // to; several sub-reducers then share the same physical reducer.
        return (base + local) % numPartitions;
    }

    /**
     * Stores the configuration, instantiates every partitioner class listed under
     * {@link #CONF_KEY} and reads the reducer counts listed under {@link #NUM_REDUCERS_KEY}.
     *
     * @param conf job configuration to read both settings from
     */
    @SuppressWarnings("unchecked")
    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;

        final Class<Partitioner>[] configuredClasses = (Class<Partitioner>[]) conf.getClasses(CONF_KEY);
        final int count = configuredClasses.length;

        partitioners = new ArrayList<Partitioner<Object, Object>>(count);
        for (int idx = 0; idx < count; idx++) {
            // ReflectionUtils creates the instance with this configuration.
            partitioners.add(ReflectionUtils.newInstance(configuredClasses[idx], conf));
        }

        final int[] reducerCounts = conf.getInts(NUM_REDUCERS_KEY);
        numReducers = Ints.asList(reducerCounts);
    }

    /**
     * @return the configuration most recently passed to {@link #setConf(Configuration)}
     */
    @Override
    public Configuration getConf() {
        return conf;
    }
}