package ch.unibe.scg.cells.benchmarks;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.MRJobConfig;

import ch.unibe.scg.cells.benchmarks.CellsInMemoryWordCountBenchmark.FileContent;
import ch.unibe.scg.cells.benchmarks.CellsInMemoryWordCountBenchmark.WordCount;
import ch.unibe.scg.cells.hadoop.HadoopPipeline;
import ch.unibe.scg.cells.hadoop.Table;
import ch.unibe.scg.cells.hadoop.TableAdmin;
import ch.unibe.scg.cells.hadoop.UnibeModule;

import com.google.inject.Guice;
import com.google.inject.Injector;
import com.google.protobuf.ByteString;

/**
 * Benchmarks Cells performance on a cluster using the word-count problem.
 * The input HDFS folder can be specified via a command-line argument.
 */
public final class CellsHadoopWordCountBenchmark {
    /**
     * Runs a word-count benchmark on the cluster. The input folder can be specified
     * with the first argument; the default is "hdfs://haddock.unibe.ch/tmp/books".
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        String input = HadoopBenchmark.INPUT_PATH;
        if (args.length > 0) {
            input = args[0];
        }

        Injector inj = Guice.createInjector(new UnibeModule());
        final ByteString family = ByteString.copyFromUtf8("f");

        // Cap container memory for mappers and reducers, keeping the JVM heap
        // below the container limit to leave headroom for non-heap memory.
        Configuration c = inj.getInstance(Configuration.class);
        c.setLong(MRJobConfig.MAP_MEMORY_MB, 1400L);
        c.set(MRJobConfig.MAP_JAVA_OPTS, "-Xmx1100m");
        c.setLong(MRJobConfig.REDUCE_MEMORY_MB, 1400L);
        c.set(MRJobConfig.REDUCE_JAVA_OPTS, "-Xmx1100m");
        c.setLong(MRJobConfig.NUM_REDUCES, 2);

        // The temporary table receives the word counts and is dropped on close.
        try (Table<WordCount> tab = inj.getInstance(TableAdmin.class).createTemporaryTable(family)) {
            // Reuse the configuration that carries the memory settings above,
            // rather than fetching a possibly fresh instance from the injector.
            HadoopPipeline<FileContent, WordCount> pipe = HadoopPipeline.fromHDFSToTable(
                    c, RawFileFormat.class, new Path(input), tab);
            CellsInMemoryWordCountBenchmark.run(pipe);
        }
    }
}
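
// A sketch of how this benchmark might be launched on the cluster, assuming the
// class is packaged into a jar. The jar name "cells-benchmarks.jar" is
// hypothetical (adjust to your build output); the standard
// `hadoop jar <jar> <main-class> <args>` form is used, and the HDFS path shown
// is the default input folder from the javadoc above:
//
//   hadoop jar cells-benchmarks.jar \
//       ch.unibe.scg.cells.benchmarks.CellsHadoopWordCountBenchmark \
//       hdfs://haddock.unibe.ch/tmp/books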