package com.ontology2.bakemono.mapmap;

import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import com.google.common.collect.PeekingIterator;
import com.ontology2.bakemono.Main;
import com.ontology2.bakemono.uniq.Uniq;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;

import java.util.List;

/**
 * Base class for Hadoop tools that run a mapper chosen by the subclass,
 * followed by the {@link Uniq} reducer, writing gzip-compressed text output.
 *
 * Command-line usage: [-r reduceTasks] input [input ...] output
 */
public abstract class UniqTool implements Tool {

    protected abstract Class<? extends Mapper> getMapperClass();

    protected abstract String getJobName();

    private Configuration conf;

    @Override
    public Configuration getConf() {
        return this.conf;
    }

    @Override
    public void setConf(Configuration arg0) {
        this.conf = arg0;
    }

    @Override
    public int run(String[] arg0) throws Exception {
        try {
            PeekingIterator<String> a = Iterators.peekingIterator(Iterators.forArray(arg0));
            Integer reduceTasks = parseRArgument(a);
            if (!a.hasNext())
                usage();

            // The last path on the command line is the output; the rest are inputs.
            List<String> paths = Lists.newArrayList(a);
            String output = paths.remove(paths.size() - 1);

            // Compress intermediate map output with gzip to cut shuffle traffic.
            conf.set("mapred.compress.map.output", "true");
            conf.set("mapred.output.compression.type", "BLOCK");
            conf.set("mapred.map.output.compression.codec",
                    "org.apache.hadoop.io.compress.GzipCodec");

            Job job = new Job(conf, getJobName());
            job.setSpeculativeExecution(false);
            job.setJarByClass(this.getClass());
            job.setMapperClass(getMapperClass());
            job.setReducerClass(Uniq.class);

            if (reduceTasks == null) {
                reduceTasks = 29; // about right for AWS runs
            }
            job.setNumReduceTasks(reduceTasks);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(LongWritable.class);

            for (String input : paths) {
                FileInputFormat.addInputPath(job, new Path(input));
            }
            FileOutputFormat.setOutputPath(job, new Path(output));
            FileOutputFormat.setCompressOutput(job, true);
            FileOutputFormat.setOutputCompressorClass(job, GzipCodec.class);

            // Gotcha: the FileOutputFormat compression settings above must be in
            // place before the output format class is set here, because the output
            // format is configured against the job as it stands a moment from now.
            job.setOutputFormatClass(TextOutputFormat.class);

            return job.waitForCompletion(true) ? 0 : 1;
        } catch (Main.IncorrectUsageException iue) {
            return 2;
        }
    }

    public static Integer parseRArgument(PeekingIterator<String> a)
            throws Main.IncorrectUsageException {
        Integer reduceTasks = null;
        while (a.hasNext() && a.peek().startsWith("-")) {
            String flagName = a.next().substring(1);
            if (!a.hasNext())
                usage();

            String flagValue = a.next();
            if ("r".equals(flagName)) {
                reduceTasks = Integer.parseInt(flagValue);
            } else {
                usage();
            }
        }
        return reduceTasks;
    }

    private static void usage() throws Main.IncorrectUsageException {
        throw new Main.IncorrectUsageException("incorrect arguments");
    }
}
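
// ---------------------------------------------------------------------------
// A minimal sketch of a concrete subclass, showing how UniqTool is meant to be
// extended. Everything below is illustrative: ExampleUniqTool, ExampleMapper,
// and the "exampleUniq" job name are hypothetical, not part of the original
// code. The only contract is that the mapper emit the Text/LongWritable pairs
// that UniqTool.run() configures as the map output types.
// ---------------------------------------------------------------------------
class ExampleUniqTool extends UniqTool {

    // Hypothetical pass-through mapper: emits each input line as a Text key
    // with a count of 1, which the Uniq reducer then collapses to distinct keys.
    public static class ExampleMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        private static final LongWritable ONE = new LongWritable(1L);

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws java.io.IOException, InterruptedException {
            context.write(value, ONE);
        }
    }

    @Override
    protected Class<? extends Mapper> getMapperClass() {
        return ExampleMapper.class;
    }

    @Override
    protected String getJobName() {
        return "exampleUniq";
    }
}

// A typical invocation goes through ToolRunner, which calls setConf() before
// run(); the -r flag and the trailing input/output paths match what
// UniqTool.run() parses (this project's Main may wire tools up differently):
//
//   int rc = org.apache.hadoop.util.ToolRunner.run(new Configuration(),
//           new ExampleUniqTool(), new String[] {"-r", "10", "in1", "in2", "out"});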