/**
* Copyright 2012 - CommonCrawl Foundation
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
**/
package org.commoncrawl.mapred.ec2.parser;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputCommitter;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TaskAttemptContext;
import org.apache.hadoop.mapred.TaskAttemptID;
import org.apache.hadoop.util.StringUtils;
import org.commoncrawl.util.CCStringUtils;
import org.commoncrawl.util.TaskDataUtils;
import org.commoncrawl.util.TaskDataUtils.TaskDataClient;
/**
* A clone of portions of the FileOutputCommitter in Hadoop used to help
* isolate issues we were having with running the job on EMR using the S3N
* FileSystem.
*
* @author rana
*
*/
public class OutputCommitter extends FileOutputCommitter {
static String _taskDataCommitKey = null;
static String _taskDataCommitValue = null;
// hack...
static void setTaskDataCommitInfo(String key,String value) {
_taskDataCommitKey = key;
_taskDataCommitValue = value;
}
TaskDataClient _taskDataClient;
@Override
public void setupJob(org.apache.hadoop.mapred.JobContext context) throws IOException {
LOG.info("Setup Job Called on Custom Committer");
super.setupJob(context);
}
@Override
public void setupTask(TaskAttemptContext context) throws IOException {
super.setupTask(context);
_taskDataClient = TaskDataUtils.getTaskDataClientForTask(context.getJobConf());
};
Path getTempTaskOutputPath(TaskAttemptContext taskContext) {
JobConf conf = taskContext.getJobConf();
Path outputPath = FileOutputFormat.getOutputPath(conf);
if (outputPath != null) {
Path p = new Path(outputPath,
(FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR +
"_" + taskContext.getTaskAttemptID().toString()));
try {
FileSystem fs = p.getFileSystem(conf);
return p.makeQualified(fs);
} catch (IOException ie) {
LOG.warn(StringUtils .stringifyException(ie));
return p;
}
}
return null;
}
@Override
public void abortTask(TaskAttemptContext context) throws IOException {
super.abortTask(context);
if (_taskDataClient != null) {
try {
_taskDataClient.shutdown();
}
finally {
_taskDataClient = null;
_taskDataCommitKey = null;
_taskDataCommitValue = null;
}
}
}
@Override
public void commitTask(TaskAttemptContext context)
throws IOException {
LOG.info("Commit Called on Task:" + context.getTaskAttemptID().toString());
Path taskOutputPath = getTempTaskOutputPath(context);
TaskAttemptID attemptId = context.getTaskAttemptID();
JobConf job = context.getJobConf();
if (taskOutputPath != null) {
FileSystem fs = taskOutputPath.getFileSystem(job);
LOG.info("FileSystem for commit for Task:"+ attemptId + " is:" + fs.getUri());
context.getProgressible().progress();
if (fs.exists(taskOutputPath)) {
Path jobOutputPath = taskOutputPath.getParent().getParent();
// Move the task outputs to their final place
moveTaskOutputs(context, fs, jobOutputPath, taskOutputPath);
// Delete the temporary task-specific output directory
if (!fs.delete(taskOutputPath, true)) {
LOG.info("Failed to delete the temporary output" +
" directory of task: " + attemptId + " - " + taskOutputPath);
}
LOG.info("Saved output of task '" + attemptId + "' to " + jobOutputPath);
}
}
if (_taskDataClient != null) {
try {
if (_taskDataCommitKey != null && _taskDataCommitValue != null) {
_taskDataClient.updateTaskData(_taskDataCommitKey,_taskDataCommitValue);
}
_taskDataClient.shutdown();
}
finally {
_taskDataClient = null;
_taskDataCommitKey = null;
_taskDataCommitValue = null;
}
}
}
private void moveTaskOutputs(TaskAttemptContext context,
FileSystem fs,
Path jobOutputDir,
Path taskOutput)
throws IOException {
TaskAttemptID attemptId = context.getTaskAttemptID();
context.getProgressible().progress();
if (fs.isFile(taskOutput)) {
Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput,
getTempTaskOutputPath(context));
LOG.info("Renaming:" + taskOutput + " to:" + finalOutputPath);
if (!fs.rename(taskOutput, finalOutputPath)) {
LOG.info("Rename Failed for:" + taskOutput + " to:" + finalOutputPath + " Trying Delete and then Rename");
if (!fs.delete(finalOutputPath, true)) {
throw new IOException("Failed to delete earlier output of task: " +
attemptId);
}
LOG.info("Renaming:" + taskOutput + " to: " + finalOutputPath);
if (!fs.rename(taskOutput, finalOutputPath)) {
throw new IOException("Failed to save output of task: " +
attemptId);
}
}
LOG.info("Moved " + taskOutput + " to " + finalOutputPath);
} else if(fs.getFileStatus(taskOutput).isDir()) {
FileStatus[] paths = fs.listStatus(taskOutput);
Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput,
getTempTaskOutputPath(context));
LOG.info("Moving " + taskOutput + " to " + finalOutputPath);
fs.mkdirs(finalOutputPath);
if (paths != null) {
for (FileStatus path : paths) {
LOG.info("Moving " + path.getPath());
moveTaskOutputs(context, fs, jobOutputDir, path.getPath());
}
}
}
}
private Path getFinalPath(Path jobOutputDir, Path taskOutput,
Path taskOutputPath) throws IOException {
URI taskOutputUri = taskOutput.toUri();
URI relativePath = taskOutputPath.toUri().relativize(taskOutputUri);
if (taskOutputUri == relativePath) {//taskOutputPath is not a parent of taskOutput
throw new IOException("Can not get the relative path: base = " +
taskOutputPath + " child = " + taskOutput);
}
if (relativePath.getPath().length() > 0) {
return new Path(jobOutputDir, relativePath.getPath());
} else {
return jobOutputDir;
}
}
@Override
public boolean needsTaskCommit(TaskAttemptContext context)
throws IOException {
LOG.info("COMMITTER- Needs Commit Called on:" + context.getTaskAttemptID().toString());
try {
Path taskOutputPath = getTempTaskOutputPath(context);
if (taskOutputPath != null) {
context.getProgressible().progress();
FileSystem fs = FileSystem.get(context.getJobConf());
LOG.info("COMMITTER - Default FS is:" + fs.getUri());
// Get the file-system for the task output directory
FileSystem fsFromPath = taskOutputPath.getFileSystem(context.getJobConf());
// since task output path is created on demand,
// if it exists, task needs a commit
LOG.info("COMMITTER - Checking if outputPath Exists:" + taskOutputPath + " for task:" +context.getTaskAttemptID().toString() );
if (fs.exists(taskOutputPath)) {
LOG.info("Needs Commit Returning TRUE");
return true;
}
}
} catch (IOException ioe) {
throw ioe;
}
LOG.info("COMMITTER Needs Commit Returning FALSE");
return false;
}
}