package mia.clustering.ch12.twitter; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.TextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.mahout.common.HadoopUtil; import org.apache.mahout.common.Parameters; public class ByKeyGroupingJob { private ByKeyGroupingJob() {} public static void startJob(Parameters params) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = new Configuration(); conf.set("job.parameters", params.toString()); conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," + "org.apache.hadoop.io.serializer.WritableSerialization"); String input = params.get("input"); Job job = new Job(conf, "Generating dataset based from input" + input); job.setJarByClass(ByKeyGroupingJob.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(input)); Path outPath = new Path(params.get("output")); FileOutputFormat.setOutputPath(job, outPath); HadoopUtil.delete(conf, outPath); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(ByKeyMapper.class); job.setCombinerClass(ByKeyReducer.class); job.setReducerClass(ByKeyReducer.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); job.waitForCompletion(true); } }