package nyse.avgstockvolpermonth;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import nyse.keyvalues.LongPair;
import nyse.keyvalues.TextPair;

/**
 * Driver for the average-stock-volume-per-month MapReduce job.
 *
 * <p>Expected arguments:
 * <ol start="0">
 *   <li>{@code args[0]} — filesystem base URI (prefix for the input glob)</li>
 *   <li>{@code args[1]} — input glob pattern, e.g. {@code NYSE_201[2-3]}</li>
 *   <li>{@code args[2]} — job output directory</li>
 *   <li>{@code args[3]} — directory in which the total-order partition file is written</li>
 * </ol>
 *
 * <p>Input files are combined via {@link CombineTextInputFormat}; reducer input is
 * globally ordered using {@link TotalOrderPartitioner} with a sampled partition file.
 */
public class AvgStockVolumePerMonthDriver extends Configured implements Tool {

    /** Upper bound (bytes) on a combined input split; keeps mapper count reasonable
     *  when many small input files are globbed together. */
    private static final long MAX_SPLIT_SIZE_BYTES = 32000000L;

    /**
     * Partitions map output by trade month (key's first field, e.g. "2012-01").
     *
     * <p>NOTE(review): this hash-style partitioner is an alternative to the
     * {@link TotalOrderPartitioner} configured in {@link #run}; it is kept for
     * experimentation but is not active while total-order partitioning is set.
     */
    public static class MonthPartitioner extends Partitioner<TextPair, LongPair> {

        @Override
        public int getPartition(TextPair key, LongPair value, int numPartitions) {
            // Strip the dash so "2012-01" becomes the digits-only string "201201".
            String tradeMonth = key.getFirst().toString().replace("-", "");
            // Integer.parseInt replaces the deprecated new Integer(String) boxing
            // constructor; the digits-only month is non-negative, so the modulo
            // result is a valid partition index.
            return Integer.parseInt(tradeMonth) % numPartitions;
        }
    }

    /**
     * Configures and submits the job.
     *
     * @param args CLI arguments as documented on the class
     * @return 0 on success, 1 on failure
     * @throws Exception if job setup, sampling, or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);
        job.setJarByClass(getClass());

        // Expand the input glob (args[0] + args[1], e.g. ".../NYSE_201[2-3]")
        // into concrete paths and register each one as a job input.
        FileSystem fs = FileSystem.get(URI.create(args[0]), conf);
        Path globPath = new Path(args[0] + args[1]);
        FileStatus[] status = fs.globStatus(globPath);
        Path[] inputPaths = FileUtil.stat2Paths(status);
        for (Path p : inputPaths) {
            System.out.println(p.toString());
            FileInputFormat.addInputPath(job, p);
        }

        // Pack many small files into splits capped at MAX_SPLIT_SIZE_BYTES.
        job.setInputFormatClass(CombineTextInputFormat.class);
        CombineTextInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE_BYTES);

        job.setMapperClass(AvgStockVolPerMonthMapper.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(LongPair.class);

        // Total-order partitioning: sample the input to build a partition file so
        // that reducer outputs are globally sorted by key. (The original code also
        // set MonthPartitioner here, which was immediately overridden by this call
        // and therefore had no effect; the dead call has been removed.)
        job.setPartitionerClass(TotalOrderPartitioner.class);
        Path partitionDir = new Path(args[3]);
        Path partitionFile = new Path(partitionDir, "partitioning");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);

        double samplingFrequency = 0.10; // probability that any given record is sampled
        int numReduceTasks = 4;
        int numSamples = 10000;          // maximum records retained by the sampler
        InputSampler.Sampler<TextPair, LongPair> sampler =
                new InputSampler.RandomSampler<TextPair, LongPair>(samplingFrequency, numSamples);
        InputSampler.writePartitionFile(job, sampler);

        job.setReducerClass(AvgStockVolPerMonthReducer.class);
        job.setNumReduceTasks(numReduceTasks);
        job.setOutputKeyClass(TextPair.class);
        job.setOutputValueClass(LongPair.class);
        FileOutputFormat.setOutputPath(job, new Path(args[2]));

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Entry point: delegates to {@link ToolRunner} so generic Hadoop options are parsed. */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new AvgStockVolumePerMonthDriver(), args));
    }
}