/*
* The Alluxio Open Foundation licenses this work under the Apache License, version 2.0
* (the "License"). You may not use this work except in compliance with the License, which is
* available at www.apache.org/licenses/LICENSE-2.0
*
* This software is distributed on an "AS IS" basis, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied, as more fully set forth in the License.
*
* See the NOTICE file distributed with this work for information regarding copyright ownership.
*/
package alluxio.hadoop.mapreduce;
import alluxio.annotation.PublicApi;
import alluxio.client.keyvalue.KeyValueSystem;
import alluxio.exception.AlluxioException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import javax.annotation.concurrent.ThreadSafe;
/**
* Extension of {@link FileOutputCommitter} where creating, completing, or deleting a {@link
* KeyValueSystem} in different phases of a job's or task's lifecycle is considered.
* <p>
* This committer must be used along with {@link KeyValueOutputFormat} to merge the key-value stores
* created by each Reducer into one key-value store under the MapReduce output directory.
*/
@PublicApi
@ThreadSafe
public final class KeyValueOutputCommitter extends FileOutputCommitter {
  private static final Logger LOG = LoggerFactory.getLogger(KeyValueOutputCommitter.class);
  // Shared, stateless entry point into the Alluxio key-value client; safe as a class constant.
  private static final KeyValueSystem KEY_VALUE_SYSTEM = KeyValueSystem.Factory.create();
  /**
   * Constructor.
   *
   * @param outputPath the job's output path, or null if the output committer is a noop
   * @param taskContext the task's context
   * @throws IOException if the underlying {@link FileOutputCommitter} fails to initialize
   */
  public KeyValueOutputCommitter(Path outputPath, TaskAttemptContext taskContext)
      throws IOException {
    super(outputPath, taskContext);
  }
  /**
   * @param taskContext MapReduce task configuration
   * @return true if the task output directory exists, otherwise false
   * @throws IOException if the filesystem cannot be accessed
   */
  @Override
  public boolean needsTaskCommit(TaskAttemptContext taskContext) throws IOException {
    Path taskOutputPath = new Path(KeyValueOutputFormat.getTaskOutputURI(taskContext).toString());
    FileSystem fs = taskOutputPath.getFileSystem(taskContext.getConfiguration());
    return fs.exists(taskOutputPath);
  }
  /**
   * {@inheritDoc}
   * <p/>
   * Merges the completed key-value store under the task's temporary output directory to the
   * key-value store at job output directory, then calls {@link
   * FileOutputCommitter#commitTask(TaskAttemptContext)}.
   *
   * @param taskContext MapReduce task configuration
   * @throws IOException if the merge fails (wrapping any {@link AlluxioException}) or the
   *         superclass commit fails
   */
  @Override
  public void commitTask(TaskAttemptContext taskContext) throws IOException {
    try {
      KEY_VALUE_SYSTEM.mergeStore(KeyValueOutputFormat.getTaskOutputURI(taskContext),
          KeyValueOutputFormat.getJobOutputURI(taskContext));
    } catch (AlluxioException e) {
      // Wrap so callers only need to handle IOException, per the FileOutputCommitter contract.
      throw new IOException(e);
    }
    super.commitTask(taskContext);
  }
  /**
   * {@inheritDoc}
   * <p/>
   * Deletes the completed key-value stores under the task's temporary output directory, and then
   * calls {@link FileOutputCommitter#abortTask(TaskAttemptContext)}.
   *
   * @param taskContext MapReduce task configuration
   */
  @Override
  public void abortTask(TaskAttemptContext taskContext) {
    // TODO(binfan): in Hadoop 1.x FileOutputCommitter#abortTask doesn't throw IOException. To
    // keep the code compile with early Hadoop versions, we catch this exception.
    try {
      try {
        KEY_VALUE_SYSTEM.deleteStore(KeyValueOutputFormat.getTaskOutputURI(taskContext));
      } catch (AlluxioException e) {
        throw new IOException(e);
      }
      super.abortTask(taskContext);
    } catch (IOException e) {
      // Bug fix: the task context was previously passed where the Throwable belonged, so the
      // actual failure cause was swallowed. Log the attempt id AND the exception stack trace.
      LOG.error("Failed to abort task {}", taskContext.getTaskAttemptID(), e);
    }
  }
  /**
   * @return the temp directory name
   */
  public static String getPendingDirName() {
    // Due to Hadoop 1 support we stick with the deprecated version. If we drop support for it
    // FileOutputCommitter.PENDING_DIR_NAME will be the new one.
    return FileOutputCommitter.TEMP_DIR_NAME;
  }
}