/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.internal.app.runtime.batch.dataset.output;
import co.cask.cdap.app.metrics.MapReduceMetrics;
import co.cask.cdap.app.verification.AbstractVerifier;
import co.cask.cdap.common.conf.ConfigurationUtil;
import co.cask.cdap.common.lang.ClassLoaders;
import com.google.common.base.Preconditions;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.StatusReporter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskCounter;
import org.apache.hadoop.mapreduce.TaskInputOutputContext;
import org.apache.hadoop.mapreduce.task.JobContextImpl;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import org.apache.hadoop.util.ReflectionUtils;
import java.io.Closeable;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* The MultipleOutputs class simplifies writing output data to multiple outputs.
* It has been adapted from org.apache.hadoop.mapreduce.lib.output.MultipleOutputs.
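 *
 * <p>A minimal usage sketch (the output name and key/value types are illustrative):
 * <pre>{@code
 * public class ExampleMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
 *   private MultipleOutputs mos;
 *
 *   protected void setup(Context context) {
 *     mos = new MultipleOutputs(context);
 *   }
 *
 *   protected void map(LongWritable key, Text value, Context context)
 *       throws IOException, InterruptedException {
 *     mos.write("csvOut", key, value);
 *   }
 *
 *   protected void cleanup(Context context) {
 *     mos.close();
 *   }
 * }
 * }</pre>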
*/
public class MultipleOutputs implements Closeable {
private static final String MULTIPLE_OUTPUTS = "hconf.mapreduce.multipleoutputs";
private static final String PREFIXED_CONF_PREFIX = "hconf.named.";
private static final String MO_PREFIX = MULTIPLE_OUTPUTS + ".namedOutput.";
private static final String FORMAT = ".format";
private static final String KEY = ".key";
private static final String VALUE = ".value";
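  // For a named output "csvOut", the configuration produced by addNamedOutput looks like the following
  // (values are illustrative; named output names are stored as a space-separated list):
  //   hconf.mapreduce.multipleoutputs                            = " csvOut"
  //   hconf.mapreduce.multipleoutputs.namedOutput.csvOut.format  = <OutputFormat class name>
  //   hconf.mapreduce.multipleoutputs.namedOutput.csvOut.key     = <key class name>
  //   hconf.mapreduce.multipleoutputs.namedOutput.csvOut.value   = <value class name>
  //   hconf.named.csvOut.<property>                              = <per-output configuration value>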
/**
* Cache for the taskContexts
*/
private final Map<String, TaskAttemptContext> taskContexts = new HashMap<>();
// instance code, to be used from Mapper/Reducer code
private final TaskInputOutputContext context;
private final Set<String> namedOutputs;
private final Map<String, RecordWriter<?, ?>> recordWriters;
  /**
   * Checks the existence of a named output within a collection of named outputs.
   *
   * @throws IllegalArgumentException if the named output is already defined when it is expected to be absent,
   *                                  or is not defined when it is expected to exist
   */
private static void checkNamedOutputName(String namedOutput, Collection<String> namedOutputs, boolean expectToExist) {
if (!expectToExist && namedOutputs.contains(namedOutput)) {
// this shouldn't happen, because it is already protected against in BasicMapReduceContext#addOutput
throw new IllegalArgumentException("Named output '" + namedOutput + "' already defined");
} else if (expectToExist && !namedOutputs.contains(namedOutput)) {
throw new IllegalArgumentException("Named output '" + namedOutput + "' not defined");
}
}
  // Returns the list of named output names configured for the job.
static List<String> getNamedOutputsList(JobContext job) {
Iterable<String> parts =
Splitter.on(" ").omitEmptyStrings().split(job.getConfiguration().get(MULTIPLE_OUTPUTS, ""));
return Lists.newArrayList(parts);
}
  // Returns the OutputFormat class for a named output.
static Class<? extends OutputFormat> getNamedOutputFormatClass(JobContext job, String namedOutput) {
return job.getConfiguration().getClass(MO_PREFIX + namedOutput + FORMAT, null, OutputFormat.class);
}
// Returns the key class for a named output.
private static Class<?> getNamedOutputKeyClass(JobContext job, String namedOutput) {
return job.getConfiguration().getClass(MO_PREFIX + namedOutput + KEY, null, Object.class);
}
// Returns the value class for a named output.
private static Class<?> getNamedOutputValueClass(JobContext job, String namedOutput) {
return job.getConfiguration().getClass(MO_PREFIX + namedOutput + VALUE, null, Object.class);
}
/**
* Adds a named output for the job.
*
* @param job job to add the named output
   * @param namedOutput named output name; it must consist only of ASCII letters, numbers,
   *                    underscores, or hyphens
* @param outputFormatClass name of the OutputFormat class.
* @param keyClass key class
* @param valueClass value class
* @param outputConfigs configurations for the output
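   *
   * <p>For example (the output name, classes, and configurations are illustrative):
   * <pre>{@code
   * MultipleOutputs.addNamedOutput(job, "csvOut", TextOutputFormat.class.getName(),
   *                                LongWritable.class, Text.class,
   *                                Collections.<String, String>emptyMap());
   * }</pre>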
*/
@SuppressWarnings("unchecked")
public static void addNamedOutput(Job job, String namedOutput, String outputFormatClass,
Class<?> keyClass, Class<?> valueClass, Map<String, String> outputConfigs) {
assertValidName(namedOutput);
checkNamedOutputName(namedOutput, getNamedOutputsList(job), false);
Configuration conf = job.getConfiguration();
conf.set(MULTIPLE_OUTPUTS, conf.get(MULTIPLE_OUTPUTS, "") + " " + namedOutput);
conf.set(MO_PREFIX + namedOutput + FORMAT, outputFormatClass);
conf.setClass(MO_PREFIX + namedOutput + KEY, keyClass, Object.class);
conf.setClass(MO_PREFIX + namedOutput + VALUE, valueClass, Object.class);
ConfigurationUtil.setNamedConfigurations(conf, computePrefixName(namedOutput), outputConfigs);
}
private static String computePrefixName(String outputName) {
// suffix the outputName with an '.', so that one outputName being a prefix of another outputName doesn't cause
// conflicts when scanning for properties
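    // e.g. computePrefixName("csvOut") returns "hconf.named.csvOut."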
return PREFIXED_CONF_PREFIX + outputName + ".";
}
private static void assertValidName(String name) {
// use the same check as used on datasets when they're created, since the output name can be any dataset name
Preconditions.checkArgument(AbstractVerifier.isId(name),
"Name '%s' must consist only of ASCII letters, numbers, _, or -.", name);
}
/**
   * Creates and initializes multiple outputs support.
   * It should be instantiated in the Mapper/Reducer setup method.
*
* @param context the TaskInputOutputContext object
*/
public MultipleOutputs(TaskInputOutputContext context) {
this.context = context;
namedOutputs = Collections.unmodifiableSet(
new HashSet<>(MultipleOutputs.getNamedOutputsList(context)));
recordWriters = new HashMap<>();
}
/**
* Write key and value to the namedOutput.
*
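   * <p>For example, using the illustrative named output from the class example:
   * {@code mos.write("csvOut", new LongWritable(1L), new Text("a,b,c"));}
   *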
* @param namedOutput the named output name
* @param key the key
* @param value the value
*/
@SuppressWarnings("unchecked")
public <K, V> void write(String namedOutput, K key, V value) throws IOException, InterruptedException {
checkNamedOutputName(namedOutput, namedOutputs, true);
getRecordWriter(namedOutput).write(key, value);
}
  // By being synchronized, MultipleOutputs can be used with a MultithreadedMapper.
@SuppressWarnings("unchecked")
private synchronized RecordWriter getRecordWriter(String namedOutput) throws IOException, InterruptedException {
// look for record-writer in the cache
RecordWriter writer = recordWriters.get(namedOutput);
// If not in cache, create a new one
if (writer == null) {
// get the record writer from context output format
TaskAttemptContext taskContext = getContext(namedOutput);
Class<? extends OutputFormat<?, ?>> outputFormatClass;
try {
outputFormatClass = taskContext.getOutputFormatClass();
} catch (ClassNotFoundException e) {
throw new IOException(e);
}
ClassLoader outputFormatClassLoader = outputFormatClass.getClassLoader();
// This is needed in case the OutputFormat's classloader conflicts with the program classloader (for example,
// TableOutputFormat).
ClassLoader oldClassLoader = ClassLoaders.setContextClassLoader(outputFormatClassLoader);
try {
// We use ReflectionUtils to instantiate the OutputFormat, because it also calls setConf on the object, if it
// is a org.apache.hadoop.conf.Configurable.
OutputFormat<?, ?> outputFormat =
ReflectionUtils.newInstance(outputFormatClass, taskContext.getConfiguration());
writer = new MeteredRecordWriter<>(outputFormat.getRecordWriter(taskContext), context);
} finally {
ClassLoaders.setContextClassLoader(oldClassLoader);
}
// add the record-writer to the cache
recordWriters.put(namedOutput, writer);
}
return writer;
}
  // Creates a TaskAttemptContext for the named output, with the output format
  // and the output key/value types set in its configuration.
  private synchronized TaskAttemptContext getContext(String namedOutput) throws IOException {
    TaskAttemptContext taskContext = taskContexts.get(namedOutput);
    if (taskContext != null) {
      return taskContext;
    }
    taskContext = getNamedTaskContext(context, namedOutput);
    taskContexts.put(namedOutput, taskContext);
    return taskContext;
  }
static TaskAttemptContext getNamedTaskContext(TaskAttemptContext context, String namedOutput) throws IOException {
Job job = getNamedJob(context, namedOutput);
return new TaskAttemptContextImpl(job.getConfiguration(),
context.getTaskAttemptID(), new WrappedStatusReporter(context));
}
static JobContext getNamedJobContext(JobContext context, String namedOutput) throws IOException {
Job job = getNamedJob(context, namedOutput);
return new JobContextImpl(job.getConfiguration(), job.getJobID());
}
private static Job getNamedJob(JobContext context, String namedOutput) throws IOException {
    // The following trick leverages the instantiation of a record writer via the job, thus
    // supporting arbitrary output formats: a copy of the configuration is made, and the named
    // output's format and key/value classes are set on that copy.
Job job = Job.getInstance(context.getConfiguration());
job.setOutputFormatClass(getNamedOutputFormatClass(context, namedOutput));
job.setOutputKeyClass(getNamedOutputKeyClass(context, namedOutput));
job.setOutputValueClass(getNamedOutputValueClass(context, namedOutput));
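    // overlay the configurations that were registered for this named output onto the new job's configuration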
Configuration conf = job.getConfiguration();
Map<String, String> namedConfigurations = ConfigurationUtil.getNamedConfigurations(context.getConfiguration(),
computePrefixName(namedOutput));
ConfigurationUtil.setAll(namedConfigurations, conf);
return job;
}
/**
* Wraps RecordWriter to increment output counters.
*
 * Normally, the user calls context#write(key, value), where the context is a Hadoop class that increments
 * the output record counter in addition to writing the record.
 * In the case of multiple outputs, the user calls context#write(outputName, key, value), where the context
 * is a CDAP class whose write does not go through Hadoop's context#write. Because of that, the counters
 * for output records are not automatically incremented, so this wrapper increments them explicitly.
*/
private static class MeteredRecordWriter<K, V> extends RecordWriter<K, V> {
private final RecordWriter<K, V> writer;
private final String groupName;
private final String counterName;
private final TaskInputOutputContext context;
public MeteredRecordWriter(RecordWriter<K, V> writer, TaskInputOutputContext context) {
this.writer = writer;
this.context = context;
this.groupName = TaskCounter.class.getName();
this.counterName = getCounterName(context);
}
public void write(K key, V value) throws IOException, InterruptedException {
context.getCounter(groupName, counterName).increment(1);
writer.write(key, value);
}
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
writer.close(context);
}
private String getCounterName(TaskInputOutputContext context) {
MapReduceMetrics.TaskType taskType = MapReduceMetrics.TaskType.from(context.getTaskAttemptID().getTaskType());
switch (taskType) {
case Mapper:
return TaskCounter.MAP_OUTPUT_RECORDS.name();
case Reducer:
return TaskCounter.REDUCE_OUTPUT_RECORDS.name();
default:
throw new IllegalArgumentException("Illegal task type: " + taskType);
}
}
}
private static class WrappedStatusReporter extends StatusReporter {
    private final TaskAttemptContext context;
public WrappedStatusReporter(TaskAttemptContext context) {
this.context = context;
}
@Override
public Counter getCounter(Enum<?> name) {
return context.getCounter(name);
}
@Override
public Counter getCounter(String group, String name) {
return context.getCounter(group, name);
}
@Override
public void progress() {
context.progress();
}
@Override
public float getProgress() {
return context.getProgress();
}
@Override
public void setStatus(String status) {
context.setStatus(status);
}
}
/**
* Closes all the opened outputs.
   * This should be called from the cleanup method of a map/reduce task.
*/
public void close() {
closeRecordWriters(recordWriters.values(), context);
}
  /**
   * Closes a collection of RecordWriters. Close is attempted on every writer even if an earlier close fails;
   * the first exception is wrapped in a RuntimeException and rethrown after all writers have been attempted,
   * with any subsequent exceptions attached to it as suppressed exceptions.
   *
   * @param recordWriters the collection of RecordWriters to close
   * @param context the context to pass to each RecordWriter's close method
   */
public static void closeRecordWriters(Iterable<RecordWriter<?, ?>> recordWriters,
TaskAttemptContext context) {
RuntimeException ex = null;
for (RecordWriter writer : recordWriters) {
try {
writer.close(context);
} catch (IOException | InterruptedException e) {
if (ex == null) {
ex = new RuntimeException(e);
} else {
ex.addSuppressed(e);
}
}
}
if (ex != null) {
throw ex;
}
}
}