package ch.unibe.scg.cells.hadoop;
import javax.inject.Provider;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.MRJobConfig;
import com.google.inject.AbstractModule;
/**
 * Internal testing. Don't use.
 *
 * <p>Guice module that binds {@link Configuration} to a provider producing an HBase/MapReduce
 * configuration with hard-coded cluster coordinates and job tuning values.
 */
public final class UnibeModule extends AbstractModule {
	/** Builds a fresh {@link Configuration} with the hard-coded cluster and job settings. */
	static class UnibeConfigurationProvider implements Provider<Configuration> {
		/**
		 * Creates a new HBase configuration and applies connection and performance settings.
		 *
		 * @return a newly created configuration; every call builds a new instance.
		 */
		@Override
		public Configuration get() {
			Configuration ret = HBaseConfiguration.create();

			// Cluster coordinates.
			ret.set("hbase.master", "leela.unibe.ch:60000");
			ret.set("hbase.zookeeper.quorum", "leela.unibe.ch");
			ret.setInt("hbase.zookeeper.property.clientPort", 2181);
			ret.setBoolean("fs.automatic.close", false);

			// Performance settings.
			ret.setLong(MRJobConfig.MAP_MEMORY_MB, 4000L);
			ret.set(MRJobConfig.MAP_JAVA_OPTS, "-Xmx3500m");
			ret.setLong(MRJobConfig.REDUCE_MEMORY_MB, 4000L);
			ret.set(MRJobConfig.REDUCE_JAVA_OPTS, "-Xmx3500m");
			// adjust this when total memory of cluster changes
			ret.setInt(MRJobConfig.NUM_REDUCES, 27);
			ret.setInt(MRJobConfig.MAP_FAILURES_MAX_PERCENT, 99);
			ret.setBoolean(MRJobConfig.MAP_SPECULATIVE, false);
			ret.setInt(MRJobConfig.JVM_NUMTASKS_TORUN, -1);
			ret.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
			ret.set(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class.getName());
			ret.setInt(MRJobConfig.IO_SORT_MB, 500);
			ret.setInt(MRJobConfig.IO_SORT_FACTOR, 50);
			ret.setFloat(MRJobConfig.MAP_SORT_SPILL_PERCENT, 0.9f);
			// as suggested on p. 27: http://www.slideshare.net/cloudera/mr-perf
			ret.setInt(MRJobConfig.REDUCE_MERGE_INMEM_THRESHOLD, 0);
			ret.setBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, false);
			// don't try a failed pack file a second time
			ret.setInt(MRJobConfig.MAP_MAX_ATTEMPTS, 1);
			// Wait until all map tasks are completed before starting reducers.
			// The value is the fraction of finished maps required (default: 0.05),
			// so "all maps" is 1.0 -- setting 0.05 would merely restate the default.
			ret.setFloat(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0f);

			return ret;
		}
	}

	/** Binds {@link Configuration} to {@link UnibeConfigurationProvider}. */
	@Override
	protected void configure() {
		bind(Configuration.class).toProvider(UnibeConfigurationProvider.class);
	}
}