/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tap.hadoop;

import java.io.IOException;
import java.net.URI;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;

import cascading.tap.Tap;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;

/**
 * Static helpers that manage the {@code _temporary} working directory beneath a job output path:
 * creating it, pointing {@code mapred.work.output.dir} at a task specific sub-directory, moving the
 * task output into the job output path on commit, and removing the temporary directory afterwards.
 * <p/>
 * Work paths are reference counted in {@link #pathCounts}; {@link #setupTask(JobConf)} increments the
 * count and {@link #commitTask(JobConf)} only moves files once the count drops back to zero.
 */
public class Hadoop18TapUtil
  {
  private static final Logger LOG = Logger.getLogger( Hadoop18TapUtil.class );

  /** Name of the directory, relative to the job output path, that holds uncommitted task output. */
  private static final String TEMPORARY_PATH = "_temporary";

  /** Number of outstanding {@link #setupTask(JobConf)} calls per work path; guarded by the class lock. */
  private static final Map<String, AtomicInteger> pathCounts = new HashMap<String, AtomicInteger>();

  /**
   * Prepares the given configuration for writing: stuffs a fake {@code mapred.task.id} if none is set,
   * creates the temporary directory, and sets {@code mapred.work.output.dir}.
   * <p/>
   * Should only be called if not in a Flow.
   * <p/>
   * Does nothing if no output path is configured or its file system cannot be resolved.
   *
   * @param conf the job configuration to read from and update
   * @throws IOException if the temporary directory cannot be checked or created
   */
  public static void setupJob( JobConf conf ) throws IOException
    {
    Path outputPath = FileOutputFormat.getOutputPath( conf );

    if( outputPath == null )
      return;

    if( getFSSafe( conf, outputPath ) == null )
      return;

    if( conf.get( "mapred.task.id" ) == null ) // need to stuff a fake id
      {
      String mapper = conf.getBoolean( "mapred.task.is.map", true ) ? "m" : "r";

      // format the long timestamp directly; narrowing it to an int saturates at Integer.MAX_VALUE,
      // which made every generated id identical
      conf.set( "mapred.task.id", String.format( "attempt_%012d_0000_%s_000000_0", System.currentTimeMillis(), mapper ) );
      }

    makeTempPath( conf );

    if( writeDirectlyToWorkingPath( conf, outputPath ) )
      {
      LOG.info( "writing directly to output path: " + outputPath );
      setWorkOutputPath( conf, outputPath );
      return;
      }

    // "mapred.work.output.dir"
    Path taskOutputPath = getTaskOutputPath( conf );
    setWorkOutputPath( conf, taskOutputPath );
    }

  /**
   * Registers one more writer for the work path found in {@code mapred.work.output.dir}.
   * <p/>
   * Does nothing if the work path is unset or its file system cannot be resolved.
   *
   * @param conf the job configuration holding the work path and task id
   * @throws IOException declared for callers; not thrown by the current implementation
   */
  static synchronized void setupTask( JobConf conf ) throws IOException
    {
    String workpath = conf.get( "mapred.work.output.dir" );

    if( workpath == null )
      return;

    FileSystem fs = getFSSafe( conf, new Path( workpath ) );

    if( fs == null )
      return;

    String taskId = conf.get( "mapred.task.id" );

    LOG.info( "setting up task: '" + taskId + "' - " + workpath );

    AtomicInteger integer = pathCounts.get( workpath );

    if( integer == null )
      {
      integer = new AtomicInteger();
      pathCounts.put( workpath, integer );
      }

    integer.incrementAndGet();
    }

  /**
   * Reports whether the work path in {@code mapred.work.output.dir} exists and so has output to commit.
   *
   * @param conf the job configuration holding the work path
   * @return true if the work path is set, its file system resolves, and the path exists
   * @throws IOException if the existence check fails
   */
  public static boolean needsTaskCommit( JobConf conf ) throws IOException
    {
    String workpath = conf.get( "mapred.work.output.dir" );

    if( workpath == null )
      return false;

    Path taskOutputPath = new Path( workpath );
    FileSystem fs = getFSSafe( conf, taskOutputPath );

    return fs != null && fs.exists( taskOutputPath );
    }

  /**
   * Copies all files from the task output path to the job output path, then deletes the task output path.
   * <p/>
   * The move only happens for the call that releases the last outstanding {@link #setupTask(JobConf)}
   * registration of the work path. Nothing is moved when output is written directly to the working path.
   *
   * @param conf the job configuration holding the work path and task id
   * @throws IOException if moving or deleting the task output fails
   */
  public static void commitTask( JobConf conf ) throws IOException
    {
    String workpath = conf.get( "mapred.work.output.dir" );

    if( workpath == null ) // no work path configured, nothing to commit
      return;

    Path taskOutputPath = new Path( workpath );
    FileSystem fs = getFSSafe( conf, taskOutputPath );

    if( fs == null )
      return;

    if( !releaseWorkPath( workpath ) )
      return;

    String taskId = conf.get( "mapred.task.id" );

    LOG.info( "committing task: '" + taskId + "' - " + taskOutputPath );

    if( writeDirectlyToWorkingPath( conf, taskOutputPath ) )
      return;

    if( !fs.exists( taskOutputPath ) )
      return;

    // the task path is <job output>/_temporary/_<task id>, so the job output is two levels up
    Path jobOutputPath = taskOutputPath.getParent().getParent();

    // Move the task outputs to their final place
    moveTaskOutputs( conf, fs, jobOutputPath, taskOutputPath );

    // Delete the temporary task-specific output directory
    if( !fs.delete( taskOutputPath, true ) )
      LOG.info( "failed to delete the temporary output directory of task: '" + taskId + "' - " + taskOutputPath );

    LOG.info( "saved output of task '" + taskId + "' to " + jobOutputPath );
    }

  /**
   * Releases one registration of the given work path, under the same class lock used by
   * {@link #setupTask(JobConf)}.
   *
   * @param workpath the raw {@code mapred.work.output.dir} value used as the registration key
   * @return true if the caller should commit, i.e. this was the last registration or none was recorded
   */
  private static synchronized boolean releaseWorkPath( String workpath )
    {
    AtomicInteger count = pathCounts.get( workpath );

    if( count == null )
      {
      // setupTask never registered this path; treat the caller as the only writer rather than failing
      LOG.warn( "no task setup was recorded for work path, committing anyway: " + workpath );
      return true;
      }

    return count.decrementAndGet() == 0;
    }

  /**
   * Called from flow step to remove temp dirs.
   *
   * @param conf the job configuration used to resolve the file system
   * @param tap  the tap whose path contains the temporary directory
   * @throws IOException if the temporary directory cannot be checked or deleted
   */
  public static void cleanupTap( JobConf conf, Tap tap ) throws IOException
    {
    cleanTempPath( conf, tap.getPath() );
    }

  /**
   * Removes the temporary directory beneath the configured output path.
   * <p/>
   * May only be called once. Should only be called if not in a flow; returns immediately when
   * {@link #isInflow(JobConf)} is true.
   *
   * @param conf the job configuration holding the output path
   * @throws IOException if the temporary directory cannot be checked or deleted
   */
  static void cleanupJob( JobConf conf ) throws IOException
    {
    if( isInflow( conf ) )
      return;

    Path outputPath = FileOutputFormat.getOutputPath( conf );

    cleanTempPath( conf, outputPath );
    }

  /**
   * Deletes the temporary directory beneath the given output path, if both exist.
   *
   * @param conf       the job configuration used to resolve the file system
   * @param outputPath the job output path, may be null
   * @throws IOException if an existence check or the delete fails
   */
  private static synchronized void cleanTempPath( JobConf conf, Path outputPath ) throws IOException
    {
    // do the clean up of temporary directory
    if( outputPath == null )
      return;

    FileSystem fileSys = getFSSafe( conf, outputPath );

    if( fileSys == null )
      return;

    if( !fileSys.exists( outputPath ) )
      return;

    Path tmpDir = new Path( outputPath, TEMPORARY_PATH );

    LOG.info( "deleting temp path " + tmpDir );

    if( fileSys.exists( tmpDir ) )
      fileSys.delete( tmpDir, true );
    }

  /**
   * Resolves the file system for the given path without propagating failures.
   *
   * @param conf   the job configuration used to resolve the file system
   * @param tmpDir the path whose file system is wanted
   * @return the file system, or null if resolving it threw an IOException
   */
  private static FileSystem getFSSafe( JobConf conf, Path tmpDir )
    {
    try
      {
      return tmpDir.getFileSystem( conf );
      }
    catch( IOException ignored )
      {
      // deliberately swallowed: callers treat a null file system as "nothing to do"
      }

    return null;
    }

  /**
   * Reports whether the configuration carries a {@code cascading.flow.step} value.
   *
   * @param conf the job configuration to inspect
   * @return true if {@code cascading.flow.step} is set
   */
  static boolean isInflow( JobConf conf )
    {
    return conf.get( "cascading.flow.step" ) != null;
    }

  /**
   * Builds the task specific path {@code <output path>/_temporary/_<task id>}.
   *
   * @param conf the job configuration holding the output path and task id
   * @return the path qualified against its file system, or unqualified if the file system cannot be resolved
   */
  private static Path getTaskOutputPath( JobConf conf )
    {
    String taskId = conf.get( "mapred.task.id" );

    Path p = new Path( FileOutputFormat.getOutputPath( conf ), TEMPORARY_PATH + Path.SEPARATOR + "_" + taskId );

    try
      {
      FileSystem fs = p.getFileSystem( conf );
      return p.makeQualified( fs );
      }
    catch( IOException ie )
      {
      return p;
      }
    }

  /**
   * Stores the given directory, resolved against the configured working directory, in
   * {@code mapred.work.output.dir}.
   *
   * @param conf      the job configuration to update
   * @param outputDir the directory tasks should write to
   */
  static void setWorkOutputPath( JobConf conf, Path outputDir )
    {
    outputDir = new Path( conf.getWorkingDirectory(), outputDir );
    conf.set( "mapred.work.output.dir", outputDir.toString() );
    }

  /**
   * Creates the temporary directory beneath the configured output path if it does not exist yet.
   * A failed mkdirs is logged as an error, not thrown.
   *
   * @param conf the job configuration holding the output path
   * @throws IOException if the file system cannot be resolved or queried
   */
  public static void makeTempPath( JobConf conf ) throws IOException
    {
    // create job specific temporary directory in output path
    Path outputPath = FileOutputFormat.getOutputPath( conf );

    if( outputPath == null )
      return;

    Path tmpDir = new Path( outputPath, TEMPORARY_PATH );
    FileSystem fileSys = tmpDir.getFileSystem( conf );

    if( !fileSys.exists( tmpDir ) && !fileSys.mkdirs( tmpDir ) )
      LOG.error( "mkdirs failed to create " + tmpDir.toString() );
    }

  /**
   * Recursively moves a file or directory from the task output tree to the matching location beneath
   * the job output directory.
   *
   * @param conf         the job configuration holding the task id and output path
   * @param fs           the file system both paths live on
   * @param jobOutputDir the final job output directory
   * @param taskOutput   the file or directory inside the task output tree to move
   * @throws IOException if a rename fails and the existing target cannot be replaced
   */
  private static void moveTaskOutputs( JobConf conf, FileSystem fs, Path jobOutputDir, Path taskOutput ) throws IOException
    {
    String taskId = conf.get( "mapred.task.id" );

    if( fs.isFile( taskOutput ) )
      {
      Path finalOutputPath = getFinalPath( jobOutputDir, taskOutput, getTaskOutputPath( conf ) );

      if( !fs.rename( taskOutput, finalOutputPath ) )
        {
        // the rename failed, so delete whatever is at the target and retry once
        if( !fs.delete( finalOutputPath, true ) )
          throw new IOException( "Failed to delete earlier output of task: " + taskId );

        if( !fs.rename( taskOutput, finalOutputPath ) )
          throw new IOException( "Failed to save output of task: " + taskId );
        }

      LOG.debug( "Moved " + taskOutput + " to " + finalOutputPath );
      }
    else if( fs.getFileStatus( taskOutput ).isDir() )
      {
      FileStatus[] paths = fs.listStatus( taskOutput );
      Path finalOutputPath = getFinalPath( jobOutputDir, taskOutput, getTaskOutputPath( conf ) );

      fs.mkdirs( finalOutputPath );

      if( paths != null )
        {
        for( FileStatus path : paths )
          moveTaskOutputs( conf, fs, jobOutputDir, path.getPath() );
        }
      }
    }

  /**
   * Maps a path inside the task output tree to the corresponding path beneath the job output directory.
   *
   * @param jobOutputDir   the final job output directory
   * @param taskOutput     the file or directory inside the task output tree
   * @param taskOutputPath the root of the task output tree
   * @return the job output directory itself when taskOutput is the root, otherwise the relative child of it
   * @throws IOException if taskOutput cannot be expressed relative to taskOutputPath
   */
  private static Path getFinalPath( Path jobOutputDir, Path taskOutput, Path taskOutputPath ) throws IOException
    {
    URI taskOutputUri = taskOutput.toUri();
    URI relativePath = taskOutputPath.toUri().relativize( taskOutputUri );

    // identity comparison is intended: URI.relativize hands back the given instance when it cannot relativize
    if( taskOutputUri == relativePath )
      {
      // taskOutputPath is not a parent of taskOutput
      throw new IOException( "Can not get the relative path: base = " + taskOutputPath + " child = " + taskOutput );
      }

    if( relativePath.getPath().length() > 0 )
      return new Path( jobOutputDir, relativePath.getPath() );

    return jobOutputDir;
    }

  /**
   * Used in AWS EMR to disable temp paths on some file systems, s3.
   * <p/>
   * Reads the boolean {@code mapred.output.direct.<FileSystem simple class name>}, defaulting to false.
   *
   * @param conf the job configuration to read the flag from
   * @param path the path whose file system class selects the flag
   * @return true if direct output is enabled for the file system of the given path
   */
  private static boolean writeDirectlyToWorkingPath( JobConf conf, Path path )
    {
    FileSystem fs = getFSSafe( conf, path );

    if( fs == null )
      return false;

    boolean result = conf.getBoolean( "mapred.output.direct." + fs.getClass().getSimpleName(), false );

    if( result )
      LOG.info( "output direct is enabled for this fs: " + fs.getName() );

    return result;
    }
  }