package nyse.topthreestocksbyvolume; import java.io.PrintStream; import java.net.URI; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configured; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.SnappyCodec; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.util.GenericOptionsParser; import org.apache.hadoop.util.Tool; import org.apache.hadoop.util.ToolRunner; import nyse.comparators.LongPairPrimitiveGroupingComparator; import nyse.comparators.LongPairPrimitiveSortingComparator; import nyse.keyvalues.LongPairPrimitive; import nyse.partitioners.FirstKeyLongPairPartitioner; public class TopThreeStocksByVolumePerDayCompressDriver extends Configured implements Tool { @Override public int run(String[] arg0) throws Exception { Configuration conf = getConf(); GenericOptionsParser parser = new GenericOptionsParser(conf, arg0); String[] args = parser.getRemainingArgs(); conf.setBoolean("mapreduce.compress.map.output", true); conf.setClass("mapreduce.map.output.compress.codec", SnappyCodec.class, CompressionCodec.class); Job job = Job.getInstance(conf); job.setJarByClass(getClass()); FileSystem fs = FileSystem.get(URI.create(args[0]), conf); Path path = new Path(args[0] + args[1]); //arg0[1] NYSE_201[2-3] FileStatus[] status = fs.globStatus(path); Path[] paths = FileUtil.stat2Paths(status); for(Path p : paths) { System.out.println(p.toString()); FileInputFormat.addInputPath(job, p); } job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(TopThreeStocksByVolumePerDayMapper.class); job.setMapOutputKeyClass(LongPairPrimitive.class); job.setMapOutputValueClass(Text.class); job.setPartitionerClass(FirstKeyLongPairPartitioner.class); job.setGroupingComparatorClass(LongPairPrimitiveGroupingComparator.class); job.setSortComparatorClass(LongPairPrimitiveSortingComparator.class); job.setNumReduceTasks(6); job.setReducerClass(TopThreeStocksByVolumePerDayReducer.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); FileOutputFormat.setOutputPath(job, new Path(args[2])); FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class); return job.waitForCompletion(true) ? 0 : 1; } public static void main(String[] args) throws Exception { System.exit(ToolRunner.run(new TopThreeStocksByVolumePerDayCompressDriver(), args)); } }