package com.github.elazarl.multireducers; import com.google.common.collect.Lists; import org.apache.hadoop.fs.CommonConfigurationKeys; import org.apache.hadoop.io.NullWritable; import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat; import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat; import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner; import java.lang.reflect.Method; import java.lang.reflect.ParameterizedType; import java.lang.reflect.Type; import java.util.*; /** * MultiJob is a helper class that helps you configure a multiplexed job. */ public class MultiJob { public static final String MULTIREDUCERS_HAVE_OUTPUT_FORMAT = "com.github.elazarl.multireducers.have.output.format"; public static final String OUTPUT_FORMAT_PATH = "com.github.elazarl.multireducers.outputFormatPath"; public static final String OUTPUT_FORMAT_PROPERTIES = "com.github.elazarl.multireducers.outputFormat.properties"; public static final String DISABLE_JOB_PREFIX = "com.github.elazarl.multireducers.disable.job."; public static final String DISABLE_JOB_BY_INDEX_PREFIX = "com.github.elazarl.multireducers.disable.job.index."; public static final String JOB_IDS_CONF_KEY = "com.github.elazarl.multireducers.job.ids"; public static final String REDIRECT_TO_REDUCER = "com.github.elazarl.multireducers.redirect.to.reducer"; static public MultiJobBuilder create() { return new MultiJobBuilder(); } static public MultiJobBuilder createWithId(String id) { return new MultiJobBuilder(id); } static public class MultiJobBuilder { private String id = "~always.disabled~"; private MultiJobBuilder(){} private MultiJobBuilder(String id) { this.id = id; } private boolean skipVerification = false; private Class<? extends Mapper> mapper = Mapper.class; private Class<?> mapperOutputKey; private Class<?> mapperOutputValue; private Class<? extends Reducer> reducer = Reducer.class; private String redirectReducerToJobId; private int numReducers = 0; private Class<? extends Reducer> combiner = Reducer.class; private Class<? extends Partitioner> partitioner = HashPartitioner.class; private Class<? extends OutputFormat> outputFormat = NullOutputFormat.class; private Class<?> outputFormatKey = NullWritable.class; private Class<?> outputFormatValue = NullWritable.class; private Class<? extends RawComparator> comparator = MultiComparator.NoComparator.class; private List<MultiOutputFormat.Property> outputFormatProperties = Lists.newArrayList(); public MultiJobBuilder skipJobVerificationCanCauseRuntimeErrorsIKnowWhatImDoing() { this.skipVerification = true; return this; } public MultiJobBuilder withMapper(Class<? extends Mapper> mapper, Class<?> outputKey, Class<?> outputValue) { this.mapper = mapper; this.mapperOutputKey = outputKey; this.mapperOutputValue = outputValue; return this; } public MultiJobBuilder redirectReducerTo(String jobId) { this.redirectReducerToJobId = jobId; return this; } public MultiJobBuilder withReducer(Class<? extends Reducer> reducer, int numReducers) { this.reducer = reducer; this.numReducers = numReducers; return this; } public MultiJobBuilder withCombiner(Class<? extends Reducer> combiner) { this.combiner = combiner; return this; } public MultiJobBuilder withPartitioner(Class<? extends Partitioner> partitioner) { this.partitioner = partitioner; return this; } public MultiJobBuilder withComparator(Class<? extends RawComparator> comparator) { this.comparator = comparator; return this; } public MultiJobBuilder withOutputFormat(Class<? extends OutputFormat> outputFormat, Class<?> outputFormatKey, Class<?> outputFormatValue, MultiOutputFormat.Property... properties) { this.outputFormat = outputFormat; this.outputFormatKey = outputFormatKey; this.outputFormatValue = outputFormatValue; Collections.addAll(this.outputFormatProperties, properties); return this; } public boolean addTo(Job job) { int jobIndex = job.getConfiguration().getStringCollection(MultiMapper.CONF_KEY).size(); if (job.getConfiguration().getBoolean(DISABLE_JOB_PREFIX + reducer.getName(), false) || job.getConfiguration().getBoolean(DISABLE_JOB_PREFIX + mapper.getName(), false) || job.getConfiguration().getBoolean(DISABLE_JOB_BY_INDEX_PREFIX + jobIndex, false) || job.getConfiguration().getBoolean(DISABLE_JOB_PREFIX + id, false)) { return false; } if (!outputFormat.equals(NullOutputFormat.class)) { job.getConfiguration().setBoolean(MULTIREDUCERS_HAVE_OUTPUT_FORMAT, true); } verifyJobIsSound(); MultiOutputFormat.addOutputFormat(job, outputFormat, outputFormatProperties.toArray( new MultiOutputFormat.Property[outputFormatProperties.size()])); appendTo(job, JOB_IDS_CONF_KEY, id); appendTo(job, MultiMapper.CONF_KEY, mapper); if (redirectReducerToJobId != null) { appendTo(job, MultiReducer.CONF_KEY, NopReducer.class); List<String> ids = Lists.newArrayList( job.getConfiguration().getTrimmedStringCollection(JOB_IDS_CONF_KEY)); int i = ids.indexOf(redirectReducerToJobId); if (i == -1) { throw new IllegalArgumentException("Cannot find job ID of " + redirectReducerToJobId + " have " + ids); } appendTo(job, REDIRECT_TO_REDUCER, i); } else { appendTo(job, MultiReducer.CONF_KEY, reducer); appendTo(job, REDIRECT_TO_REDUCER, jobIndex); } appendTo(job, MultiPartitioner.NUM_REDUCERS_KEY, numReducers); appendTo(job, MultiCombiner.CONF_KEY, combiner); appendTo(job, MultiPartitioner.CONF_KEY, partitioner); appendTo(job, MultiReducer.INPUT_KEY_CLASSES, mapperOutputKey); appendTo(job, MultiReducer.INPUT_VALUE_CLASSES, mapperOutputValue); appendTo(job, MultiComparator.CONF_KEY, comparator); ensureJobSet(job); return true; } // return exception, or null if all is well private RuntimeException reduceMethodNotSoundError(String who, Class<?> clazz, Method reduce) { Class<?> key = reduce.getParameterTypes()[0]; if (!key.isAssignableFrom(mapperOutputKey)) { return new IllegalArgumentException("Map output key " + mapperOutputKey.getName() + ", but " + who + " " + clazz.getName() + " expects " + key.getName() + " and it cannot be assigned"); } Type reduceIterator = reduce.getGenericParameterTypes()[1]; Class<?> iteratorParameter = getTypeParameter(reduceIterator); if (iteratorParameter != null && !iteratorParameter.isAssignableFrom(mapperOutputValue)) { return new IllegalArgumentException("Map output value " + mapperOutputValue.getName() + ", but " + who + " " + clazz.getName() + " expects " + iteratorParameter.getName() + " and it cannot be assigned"); } return null; } private void verifyJobIsSound() { if (skipVerification) { return; } verifyReduceMethodIsSound("reducer", reducer); verifyReduceMethodIsSound("combiner", combiner); } private void verifyReduceMethodIsSound(String who, Class<? extends Reducer> clazz) { RuntimeException error = null; for (Class<?> c = clazz; c != Reducer.class; c = c.getSuperclass()) { List<Method> methods = Methods.getAllWithName(c, "reduce"); removeIrrelevantReduceMethods(methods); if (methods.isEmpty()) { // Our reducer does not implement methods other than // reduce(Object, Iterable, Context). // This means you inherit from Reduce without type parameters. // hence, the reducer can accept any type, and is always valid. continue; } for (Method reduceMethod : methods) { // we found a method with reduce signature error = reduceMethodNotSoundError(who, clazz, reduceMethod); if (error == null) { return; } } } if (error != null) { throw error; } } // for type like Iterator<Foo> returns Foo class private Class<?> getTypeParameter(Type type) { if (!(type instanceof ParameterizedType)) { return null; } ParameterizedType parameterizedType = (ParameterizedType) type; Type innerType = parameterizedType.getActualTypeArguments()[0]; if (!(innerType instanceof Class)) { return null; } return (Class<?>)innerType; } /** * removeIrrelevantReduceMethods removes: * 1. methods not of the form reduce(K, Iterator<V>, Reducer.Context) * from the list. * 2. Methods of the form reduce(Object, Iterator, Reducer.Context), * which are generated by type erasure. * @param methods list of methods to remove irrelevant reduce methods from */ private void removeIrrelevantReduceMethods(List<Method> methods) { Iterator<Method> it = methods.iterator(); while (it.hasNext()) { Method method = it.next(); Class<?>[] params = method.getParameterTypes(); if (params.length != 3 || !params[1].equals(Iterable.class) || !params[2].isAssignableFrom(Reducer.Context.class)) { // user added a reduce class that does not override parent it.remove(); } else if (method.getParameterTypes()[0].equals(Object.class) && method.getGenericParameterTypes()[1] instanceof Class) { // reduce class from type erasure // class Foo extends Reducer<Text, Text, Text, Text>{} // would have two methods: // reduce(Text, Iterable<Text>, Context) // and // reduce(Object, Iterable, Context) // which is added due to type erasure. This method always accepts // any input, hence irrelevant for validity check. it.remove(); } } } } private static void ensureJobSet(Job job) { if (job.getConfiguration().getBoolean(MULTIREDUCERS_HAVE_OUTPUT_FORMAT, false)) { // we need to use the TextOutputFormat, since otherwise the FileOutputCommitter won't run LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); } else { job.setOutputFormatClass(NullOutputFormat.class); } job.setOutputFormatClass(MultiOutputFormat.class); job.setReducerClass(MultiReducer.class); job.setMapperClass(MultiMapper.class); job.setMapOutputKeyClass(PerMapperOutputKey.class); job.setMapOutputValueClass(PerMapperOutputValue.class); job.setSortComparatorClass(MultiComparator.class); job.setPartitionerClass(MultiPartitioner.class); List<Class<?>> serializations = Arrays.asList( job.getConfiguration().getClasses(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY)); if (serializations.indexOf(MultiSerializer.class) == -1) { appendTo(job, CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, MultiSerializer.class); } for (Class<?> aClass : job.getConfiguration().getClasses(MultiCombiner.CONF_KEY)) { if (!aClass.equals(Reducer.class)) { job.setCombinerClass(MultiCombiner.class); } } } public static void appendTo(Job jobConf, String key, int n) { appendTo(jobConf, key, "" + n); } public static void appendTo(Job jobConf, String key, Class<?> clazz) { appendTo(jobConf, key, clazz.getName()); } public static void appendTo(Job jobConf, String key, String val) { Collection<String> src = jobConf.getConfiguration().getStringCollection(key); src.add(val); jobConf.getConfiguration().setStrings(key, src.toArray(new String[src.size()])); } }