package com.manning.hip.ch13; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; public final class ArrayOutOfBoundsImproved { private static final Logger log = LoggerFactory.getLogger( ArrayOutOfBoundsImproved.class); public static void main(String... args) throws Exception { runJob(args[0], args[1]); } public static void runJob(String input, String output) throws Exception { Configuration conf = new Configuration(); conf.set("keep.failed.task.files", "true"); Job job = new Job(conf); job.setJarByClass(ArrayOutOfBoundsImproved.class); job.setMapperClass(Map.class); job.setReducerClass(Reduce.class); job.setInputFormatClass(KeyValueTextInputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); Path outputPath = new Path(output); FileInputFormat.setInputPaths(job, input); FileOutputFormat.setOutputPath(job, outputPath); outputPath.getFileSystem(conf).delete(outputPath, true); job.waitForCompletion(true); } public static class Map extends Mapper<Text, Text, Text, Text> { protected Text outputValue = new Text(); protected int failedRecords; public static enum Counters { FAILED_RECORDS } @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); log.info("Input split = {}", context.getInputSplit()); } @Override protected void map(Text key, Text value, Context context) throws IOException, InterruptedException { if(log.isDebugEnabled()) { log.debug("Input K[{}],V[{}]", key, value); } try { String id = StringUtils.split(value.toString())[5]; outputValue.set(id); if(log.isDebugEnabled()) { log.debug("Output K[{}],V[{}]", key, value); } context.write(key, outputValue); } catch(Exception t) { processError(context, t, key, value); } } protected void processError(Context c, Throwable t, Text k, Text v) { log.error("Caught exception processing key[" + k + "], value[" + v + "]", t); c.getCounter(Counters.FAILED_RECORDS).increment(1); c.setStatus("Records with failures = " + (++failedRecords)); } } public static class Reduce extends Reducer<Text, Text, Text, Text> { public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { if(log.isDebugEnabled()) { log.debug("Input K[{}]", key); } for (Text val : values) { if(log.isDebugEnabled()) { log.debug("Input V[{}]", val); log.debug("Output K[{}],V[{}]", key, val); } context.write(key, val); } } } }