/*
* Copyright © 2015-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.internal.app.runtime.batch.dataset.partitioned;
import co.cask.cdap.api.dataset.lib.DynamicPartitioner;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionedFileSet;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;
import co.cask.cdap.api.dataset.lib.Partitioning;
import co.cask.cdap.common.conf.Constants;
import co.cask.cdap.common.lang.InstantiatorFactory;
import co.cask.cdap.data2.dataset2.lib.partitioned.PartitionedFileSetDataset;
import co.cask.cdap.internal.app.runtime.batch.BasicMapReduceTaskContext;
import co.cask.cdap.internal.app.runtime.batch.MapReduceClassLoader;
import co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputs;
import com.google.common.reflect.TypeToken;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.InvalidJobConfException;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.security.TokenCache;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * This class extends {@link FileOutputFormat} and allows writing dynamically to multiple partitions of a
 * {@link PartitionedFileSet} Dataset.
 *
 * This class is used in {@link PartitionedFileSetDataset}, where it is referred to by name, because data-fabric
 * does not depend on app-fabric, while this class needs access to the app-fabric class
 * {@link BasicMapReduceTaskContext}.
*
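 * For illustration only, a minimal {@link DynamicPartitioner} might derive a partition key from each record
 * as sketched below. This is a hypothetical example: the Hadoop key/value types and the partition field name
 * "time" are assumptions of this sketch, not requirements of this class. The partitioner class is then set in
 * the dataset's output arguments (see {@link PartitionedFileSetArguments#DYNAMIC_PARTITIONER_CLASS_NAME}).
 *
 * <pre>{@code
 * public static final class TimePartitioner extends DynamicPartitioner<LongWritable, Text> {
 *   public PartitionKey getPartitionKey(LongWritable key, Text value) {
 *     // the record's key is assumed to carry a timestamp in milliseconds
 *     return PartitionKey.builder().addLongField("time", key.get()).build();
 *   }
 * }
 * }</pre>
 *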
* @param <K> Type of key
* @param <V> Type of value
*/
@SuppressWarnings("unused")
public class DynamicPartitioningOutputFormat<K, V> extends FileOutputFormat<K, V> {
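  // delegate output format that does the actual writing; created lazily by getFileOutputFormat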
private FileOutputFormat<K, V> fileOutputFormat;
  /**
   * Create a composite record writer that writes key/value data to different output files, one per partition
   * key returned by the configured {@link DynamicPartitioner}.
   *
   * @return a composite record writer
   * @throws IOException if the task context or a delegate record writer cannot be created
   */
@Override
public RecordWriter<K, V> getRecordWriter(final TaskAttemptContext job) throws IOException {
final String outputName = FileOutputFormat.getOutputName(job);
Configuration configuration = job.getConfiguration();
Class<? extends DynamicPartitioner> partitionerClass = configuration
.getClass(PartitionedFileSetArguments.DYNAMIC_PARTITIONER_CLASS_NAME, null, DynamicPartitioner.class);
@SuppressWarnings("unchecked")
final DynamicPartitioner<K, V> dynamicPartitioner =
new InstantiatorFactory(false).get(TypeToken.of(partitionerClass)).create();
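    // the program's MapReduceClassLoader gives access to the CDAP task context for this task attempt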
MapReduceClassLoader classLoader = MapReduceClassLoader.getFromConfiguration(configuration);
final BasicMapReduceTaskContext<K, V> taskContext = classLoader.getTaskContextProvider().get(job);
String outputDatasetName = configuration.get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
PartitionedFileSet outputDataset = taskContext.getDataset(outputDatasetName);
final Partitioning partitioning = outputDataset.getPartitioning();
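    // let the partitioner initialize itself, with access to the CDAP task context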
dynamicPartitioner.initialize(taskContext);
return new RecordWriter<K, V>() {
// a cache storing the record writers for different output files.
Map<PartitionKey, RecordWriter<K, V>> recordWriters = new HashMap<>();
      @Override
      public void write(K key, V value) throws IOException, InterruptedException {
PartitionKey partitionKey = dynamicPartitioner.getPartitionKey(key, value);
RecordWriter<K, V> rw = this.recordWriters.get(partitionKey);
if (rw == null) {
String relativePath = PartitionedFileSetDataset.getOutputPath(partitionKey, partitioning);
String finalPath = relativePath + "/" + outputName;
// if we don't have the record writer yet for the final path, create one and add it to the cache
rw = getBaseRecordWriter(getTaskAttemptContext(job, finalPath));
this.recordWriters.put(partitionKey, rw);
}
rw.write(key, value);
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
try {
          List<RecordWriter<?, ?>> recordWriters = new ArrayList<>(this.recordWriters.values());
MultipleOutputs.closeRecordWriters(recordWriters, context);
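          // flush any buffered dataset operations before the task completes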
taskContext.flushOperations();
} catch (Exception e) {
throw new IOException(e);
} finally {
dynamicPartitioner.destroy();
}
}
};
}
private boolean isAvroOutputFormat(FileOutputFormat<K, V> fileOutputFormat) {
String className = fileOutputFormat.getClass().getName();
    // compare class names as Strings in order to avoid a dependency on the Avro libraries here
return "org.apache.avro.mapreduce.AvroKeyOutputFormat".equals(className)
|| "org.apache.avro.mapreduce.AvroKeyValueOutputFormat".equals(className);
}
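  /**
   * Creates a TaskAttemptContext whose configuration points the output at the given file name, so that the
   * delegate output format writes to that partition's directory under the job-specific output path.
   */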
private TaskAttemptContext getTaskAttemptContext(TaskAttemptContext context,
String newOutputName) throws IOException {
    Job job = Job.getInstance(context.getConfiguration());
FileOutputFormat.setOutputName(job, newOutputName);
// CDAP-4806 We must set this parameter in addition to calling FileOutputFormat#setOutputName, because
// AvroKeyOutputFormat/AvroKeyValueOutputFormat use a different parameter for the output name than FileOutputFormat.
if (isAvroOutputFormat(getFileOutputFormat(context))) {
job.getConfiguration().set("avro.mo.config.namedOutput", newOutputName);
}
Path jobOutputPath = createJobSpecificPath(FileOutputFormat.getOutputPath(job), context);
FileOutputFormat.setOutputPath(job, jobOutputPath);
return new TaskAttemptContextImpl(job.getConfiguration(), context.getTaskAttemptID());
}
  /**
   * @return a RecordWriter obtained from the delegate output format, for the given TaskAttemptContext
   *         (configured for a particular file name)
   * @throws IOException if the delegate output format fails to create a record writer
   */
protected RecordWriter<K, V> getBaseRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
return getFileOutputFormat(job).getRecordWriter(job);
}
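  // lazily instantiates and caches the delegate output format class named in the job configuration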
private FileOutputFormat<K, V> getFileOutputFormat(TaskAttemptContext job) {
if (fileOutputFormat == null) {
Class<? extends FileOutputFormat> delegateOutputFormat = job.getConfiguration()
.getClass(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_FORMAT_CLASS_NAME, null, FileOutputFormat.class);
@SuppressWarnings("unchecked")
FileOutputFormat<K, V> fileOutputFormat =
new InstantiatorFactory(false).get(TypeToken.of(delegateOutputFormat)).create();
this.fileOutputFormat = fileOutputFormat;
}
return fileOutputFormat;
}
  // suffixes a Path with a job-specific string, so that multiple jobs can safely write under the same directory
private static Path createJobSpecificPath(Path path, JobContext jobContext) {
String outputPathSuffix = "_temporary_" + jobContext.getJobID().getId();
return new Path(path, outputPathSuffix);
}
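  // the committer operates on the job-specific output path; see createJobSpecificPath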
@Override
public synchronized FileOutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException {
final Path jobSpecificOutputPath = createJobSpecificPath(getOutputPath(context), context);
return new DynamicPartitioningOutputCommitter(jobSpecificOutputPath, context);
}
@Override
public void checkOutputSpecs(JobContext job) throws IOException {
    // Ensure that the output directory is set; we deliberately do not require that it does not yet exist
Path outDir = getOutputPath(job);
if (outDir == null) {
throw new InvalidJobConfException("Output directory not set.");
}
// get delegation token for outDir's file system
TokenCache.obtainTokensForNamenodes(job.getCredentials(),
new Path[]{outDir}, job.getConfiguration());
    // We permit multiple jobs to write to the same output directory; this works because each job writes to
    // a distinct, job-specific path within that directory. See the createJobSpecificPath method and its usages.
    // Additionally, check that the output dataset and the dynamic partitioner class name have been set in the conf.
if (job.getConfiguration().get(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET) == null) {
throw new InvalidJobConfException("The job configuration does not contain required property: "
+ Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_DATASET);
}
Class<? extends DynamicPartitioner> partitionerClass = job.getConfiguration()
.getClass(PartitionedFileSetArguments.DYNAMIC_PARTITIONER_CLASS_NAME, null, DynamicPartitioner.class);
if (partitionerClass == null) {
throw new InvalidJobConfException("The job configuration does not contain required property: "
+ PartitionedFileSetArguments.DYNAMIC_PARTITIONER_CLASS_NAME);
}
Class<? extends FileOutputFormat> delegateOutputFormatClass = job.getConfiguration()
.getClass(Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_FORMAT_CLASS_NAME, null, FileOutputFormat.class);
if (delegateOutputFormatClass == null) {
throw new InvalidJobConfException("The job configuration does not contain required property: "
+ Constants.Dataset.Partitioned.HCONF_ATTR_OUTPUT_FORMAT_CLASS_NAME);
}
}
}