/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.internal.app.runtime.batch.dataset;
import co.cask.cdap.api.data.batch.BatchWritable;
import com.google.common.reflect.TypeToken;
import com.google.gson.Gson;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.OutputFormat;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
import java.lang.reflect.Type;
import java.util.Map;
/**
* An abstract base implementation of {@link OutputFormat} for writing to a {@link BatchWritable} from a batch job.
*
* @param <KEY> type of the key
* @param <VALUE> type of the value
*/
public abstract class AbstractBatchWritableOutputFormat<KEY, VALUE> extends OutputFormat<KEY, VALUE> {

  private static final Gson GSON = new Gson();
  private static final Type DATASET_ARGS_TYPE = new TypeToken<Map<String, String>>() { }.getType();

  // Job configuration keys holding the output dataset's name and its JSON-encoded arguments.
  private static final String DATASET_NAME = "output.datasetoutputformat.dataset.name";
  private static final String DATASET_ARGS = "output.datasetoutputformat.dataset.args";

  /**
   * Records the output dataset in the given {@link Configuration}. The name is stored as-is; the
   * arguments are stored as a JSON string.
   *
   * @param hConf configuration to modify
   * @param datasetName name of the dataset
   * @param datasetArgs arguments for the dataset
   */
  public static void setDataset(Configuration hConf, String datasetName, Map<String, String> datasetArgs) {
    hConf.set(DATASET_NAME, datasetName);
    String serializedArgs = GSON.toJson(datasetArgs, DATASET_ARGS_TYPE);
    hConf.set(DATASET_ARGS, serializedArgs);
  }

  /**
   * Reads the dataset name and arguments back from the task's configuration, asks the subclass for a
   * {@link CloseableBatchWritable} through {@link #createBatchWritable}, and wraps it in a {@link RecordWriter}.
   *
   * @param context the hadoop task context
   * @return a record writer that forwards every record to the created batch writable
   */
  @Override
  public RecordWriter<KEY, VALUE> getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException {
    Configuration hConf = context.getConfiguration();
    String name = hConf.get(DATASET_NAME);
    String argsJson = hConf.get(DATASET_ARGS);
    Map<String, String> args = GSON.fromJson(argsJson, DATASET_ARGS_TYPE);
    CloseableBatchWritable<KEY, VALUE> writable = createBatchWritable(context, name, args);
    return new BatchWritableRecordWriter<>(writable);
  }

  /**
   * Verifies that both the dataset name and the dataset arguments are present in the job configuration.
   *
   * @param context the hadoop job context
   * @throws IOException if either of the two dataset settings is absent
   */
  @Override
  public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException {
    Configuration hConf = context.getConfiguration();
    boolean datasetConfigured = hConf.get(DATASET_NAME) != null && hConf.get(DATASET_ARGS) != null;
    if (!datasetConfigured) {
      throw new IOException("Dataset configurations are missing in the job configuration");
    }
  }

  /**
   * Always returns a fresh committer whose operations are all no-ops.
   *
   * @param context the hadoop task context (not consulted)
   */
  @Override
  public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException {
    return new NoopOutputCommitter();
  }

  /**
   * Subclasses need to implement this method to return a {@link BatchWritable} for writing records to
   * the given dataset.
   *
   * @param context the hadoop task context
   * @param datasetName name of the dataset to write to
   * @param datasetArgs arguments of the dataset to write to
   */
  protected abstract CloseableBatchWritable<KEY, VALUE> createBatchWritable(TaskAttemptContext context,
                                                                            String datasetName,
                                                                            Map<String, String> datasetArgs);

  /**
   * A {@link RecordWriter} that hands every record to, and is closed together with, a
   * {@link CloseableBatchWritable}.
   */
  private static final class BatchWritableRecordWriter<K, V> extends RecordWriter<K, V> {

    private final CloseableBatchWritable<K, V> writable;

    private BatchWritableRecordWriter(CloseableBatchWritable<K, V> writable) {
      this.writable = writable;
    }

    @Override
    public void write(K key, V value) throws IOException, InterruptedException {
      writable.write(key, value);
    }

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
      // The task context is not needed; closing the underlying writable is all that happens here.
      writable.close();
    }
  }

  /**
   * An {@link OutputCommitter} in which every operation is a no-op.
   */
  private static final class NoopOutputCommitter extends OutputCommitter {

    @Override
    public boolean needsTaskCommit(TaskAttemptContext taskContext) throws IOException {
      // Individual task work is not committed. Work is committed on job level, and operations are
      // flushed on RecordWriter.close.
      return false;
    }

    @Override
    public void setupJob(JobContext jobContext) throws IOException {
      // Intentionally empty; see the comment in needsTaskCommit().
    }

    @Override
    public void setupTask(TaskAttemptContext taskContext) throws IOException {
      // Intentionally empty; see the comment in needsTaskCommit().
    }

    @Override
    public void commitTask(TaskAttemptContext taskContext) throws IOException {
      // Intentionally empty; see the comment in needsTaskCommit().
    }

    @Override
    public void abortTask(TaskAttemptContext taskContext) throws IOException {
      // Intentionally empty; see the comment in needsTaskCommit().
    }
  }
}