/*
 * Copyright (c) 2007-2010 Concurrent, Inc. All Rights Reserved.
 *
 * Project and contact information: http://www.cascading.org/
 *
 * This file is part of the Cascading project.
 *
 * Cascading is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Cascading is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Cascading. If not, see <http://www.gnu.org/licenses/>.
 */

package cascading.tap;

import java.beans.ConstructorProperties;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Map;

import cascading.flow.hadoop.HadoopUtil;
import cascading.scheme.Scheme;
import cascading.scheme.SequenceFile;
import cascading.tap.hadoop.TapCollector;
import cascading.tap.hadoop.TapIterator;
import cascading.tuple.Fields;
import cascading.tuple.TupleEntryCollector;
import cascading.tuple.TupleEntryIterator;
import cascading.tuple.hadoop.TupleSerialization;
import cascading.util.Util;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3native.NativeS3FileSystem;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;

/**
 * Class Hfs is the base class for all Hadoop file system access. Use {@link Dfs}, {@link Lfs}, or {@link S3fs}
 * for resources specific to the Hadoop Distributed file system, the Local file system, or Amazon S3, respectively.
 * <p/>
 * Use the Hfs class if the 'kind' of resource is unknown at design time. To use, prefix a scheme to the 'stringPath',
 * where <code>hdfs://...</code> will denote Dfs, <code>file://...</code> will denote Lfs, and
 * <code>s3://aws_id:aws_secret@bucket/...</code> will denote S3fs.
 * <p/>
 * Call {@link #setTemporaryDirectory(java.util.Map, String)} to use a temporary file directory path
 * other than the current Hadoop default path.
 */
public class Hfs extends Tap
  {
  /** Field LOG */
  private static final Logger LOG = Logger.getLogger( Hfs.class );

  /** Field serialVersionUID */
  private static final long serialVersionUID = 1L;
  /** Field TEMPORARY_DIRECTORY */
  private static final String TEMPORARY_DIRECTORY = "cascading.tmp.dir";

  /** Field stringPath */
  String stringPath;
  /** Field uriScheme */
  transient URI uriScheme;
  /** Field path */
  transient Path path;
  /** Field statuses */
  private transient FileStatus[] statuses;

  /**
   * Method setTemporaryDirectory sets the temporary directory on the given properties object.
   *
   * @param properties of type Map<Object,Object>
   * @param tempDir    of type String
   */
  public static void setTemporaryDirectory( Map<Object, Object> properties, String tempDir )
    {
    properties.put( TEMPORARY_DIRECTORY, tempDir );
    }
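  // A hedged usage sketch (not part of the original source): setting the
  // temporary directory on a properties map before a flow is wired up. The
  // HashMap and the "/tmp/cascading" path are illustrative assumptions; the
  // map would typically be handed to a FlowConnector afterwards.
  //
  //   Map<Object, Object> properties = new HashMap<Object, Object>();
  //   Hfs.setTemporaryDirectory( properties, "/tmp/cascading" );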
  /**
   * Method getTemporaryDirectory returns the configured temporary directory from the given properties object.
   *
   * @param properties of type Map<Object,Object>
   * @return a String or null if not set
   */
  public static String getTemporaryDirectory( Map<Object, Object> properties )
    {
    return (String) properties.get( TEMPORARY_DIRECTORY );
    }

  protected Hfs()
    {
    }

  @ConstructorProperties({"scheme"})
  protected Hfs( Scheme scheme )
    {
    super( scheme );
    }

  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param fields     of type Fields
   * @param stringPath of type String
   */
  @ConstructorProperties({"fields", "stringPath"})
  public Hfs( Fields fields, String stringPath )
    {
    super( new SequenceFile( fields ) );
    setStringPath( stringPath );
    }

  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param fields     of type Fields
   * @param stringPath of type String
   * @param replace    of type boolean
   */
  @ConstructorProperties({"fields", "stringPath", "replace"})
  public Hfs( Fields fields, String stringPath, boolean replace )
    {
    super( new SequenceFile( fields ), replace ? SinkMode.REPLACE : SinkMode.KEEP );
    setStringPath( stringPath );
    }

  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param fields     of type Fields
   * @param stringPath of type String
   * @param sinkMode   of type SinkMode
   */
  @ConstructorProperties({"fields", "stringPath", "sinkMode"})
  public Hfs( Fields fields, String stringPath, SinkMode sinkMode )
    {
    super( new SequenceFile( fields ), sinkMode );
    setStringPath( stringPath );

    if( sinkMode == SinkMode.UPDATE )
      throw new IllegalArgumentException( "updates are not supported" );
    }

  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param scheme     of type Scheme
   * @param stringPath of type String
   */
  @ConstructorProperties({"scheme", "stringPath"})
  public Hfs( Scheme scheme, String stringPath )
    {
    super( scheme );
    setStringPath( stringPath );
    }

  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param scheme     of type Scheme
   * @param stringPath of type String
   * @param replace    of type boolean
   */
  @ConstructorProperties({"scheme", "stringPath", "replace"})
  public Hfs( Scheme scheme, String stringPath, boolean replace )
    {
    super( scheme, replace ? SinkMode.REPLACE : SinkMode.KEEP );
    setStringPath( stringPath );
    }
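  // A hedged construction sketch (not in the original source): the field name
  // and paths are illustrative. The scheme prefix on stringPath selects the
  // backing file system at runtime, and the Fields constructors store tuples
  // as Hadoop SequenceFiles.
  //
  //   Tap source = new Hfs( new Fields( "line" ), "hdfs://namenode/logs/input" );
  //   Tap sink = new Hfs( new Fields( "line" ), "file:///tmp/output", SinkMode.REPLACE );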
  /**
   * Constructor Hfs creates a new Hfs instance.
   *
   * @param scheme     of type Scheme
   * @param stringPath of type String
   * @param sinkMode   of type SinkMode
   */
  @ConstructorProperties({"scheme", "stringPath", "sinkMode"})
  public Hfs( Scheme scheme, String stringPath, SinkMode sinkMode )
    {
    super( scheme, sinkMode );
    setStringPath( stringPath );
    }

  protected void setStringPath( String stringPath )
    {
    this.stringPath = Util.normalizeUrl( stringPath );
    }

  protected void setUriScheme( URI uriScheme )
    {
    this.uriScheme = uriScheme;
    }

  public URI getURIScheme( JobConf jobConf ) throws IOException
    {
    if( uriScheme != null )
      return uriScheme;

    uriScheme = makeURIScheme( jobConf );

    return uriScheme;
    }

  protected URI makeURIScheme( JobConf jobConf ) throws IOException
    {
    try
      {
      URI uriScheme = null;

      if( LOG.isDebugEnabled() )
        LOG.debug( "handling path: " + stringPath );

      URI uri = new URI( stringPath );
      String schemeString = uri.getScheme();
      String authority = uri.getAuthority();

      if( LOG.isDebugEnabled() )
        {
        LOG.debug( "found scheme: " + schemeString );
        LOG.debug( "found authority: " + authority );
        }

      if( schemeString != null && authority != null )
        uriScheme = new URI( schemeString + "://" + uri.getAuthority() );
      else if( schemeString != null )
        uriScheme = new URI( schemeString + ":///" );
      else
        uriScheme = getDefaultFileSystemURIScheme( jobConf );

      if( LOG.isDebugEnabled() )
        LOG.debug( "using uri scheme: " + uriScheme );

      return uriScheme;
      }
    catch( URISyntaxException exception )
      {
      throw new TapException( "could not determine scheme from path: " + getPath(), exception );
      }
    }

  /**
   * Method getDefaultFileSystemURIScheme returns the URI scheme for the default Hadoop FileSystem.
   *
   * @param jobConf of type JobConf
   * @return URI
   * @throws IOException when the default FileSystem cannot be instantiated
   */
  public URI getDefaultFileSystemURIScheme( JobConf jobConf ) throws IOException
    {
    return getDefaultFileSystem( jobConf ).getUri();
    }

  @Override
  public boolean isWriteDirect()
    {
    return super.isWriteDirect() || stringPath != null && stringPath.matches( "(^https?://.*$)|(^s3tp://.*$)" );
    }

  protected FileSystem getDefaultFileSystem( JobConf jobConf ) throws IOException
    {
    return FileSystem.get( jobConf );
    }

  protected FileSystem getFileSystem( JobConf jobConf ) throws IOException
    {
    return FileSystem.get( getURIScheme( jobConf ), jobConf );
    }

  /** @see Tap#getPath() */
  @Override
  public Path getPath()
    {
    if( path != null )
      return path;

    if( stringPath == null )
      throw new IllegalStateException( "path not initialized" );

    path = new Path( stringPath );

    return path;
    }

  @Override
  public Path getQualifiedPath( JobConf conf ) throws IOException
    {
    return getPath().makeQualified( getFileSystem( conf ) );
    }

  @Override
  public void sourceInit( JobConf conf ) throws IOException
    {
    Path qualifiedPath = getQualifiedPath( conf );

    for( Path existingPath : FileInputFormat.getInputPaths( conf ) )
      {
      if( existingPath.equals( qualifiedPath ) )
        throw new TapException( "may not add duplicate paths, found: " + existingPath );
      }

    FileInputFormat.addInputPath( conf, qualifiedPath );

    super.sourceInit( conf );

    makeLocal( conf, qualifiedPath, "forcing job to local mode, via source: " );

    TupleSerialization.setSerializations( conf ); // allows Hfs to be used independent of Flow
    }
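  // A hedged sketch of how makeURIScheme() resolves the configured path into a
  // scheme URI (not part of the original source; the host name is illustrative):
  //
  //   "hdfs://namenode:8020/logs/input"  ->  "hdfs://namenode:8020"
  //   "file:///tmp/output"               ->  "file:///"
  //   "/bare/path"                       ->  URI of the default FileSystem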
" ); TupleSerialization.setSerializations( conf ); // allows Hfs to be used independent of Flow } private void makeLocal( JobConf conf, Path qualifiedPath, String infoMessage ) { if( !conf.get( "mapred.job.tracker", "" ).equalsIgnoreCase( "local" ) && qualifiedPath.toUri().getScheme().equalsIgnoreCase( "file" ) ) { if( LOG.isInfoEnabled() ) LOG.info( infoMessage + toString() ); conf.set( "mapred.job.tracker", "local" ); // force job to run locally } } @Override public boolean makeDirs( JobConf conf ) throws IOException { if( LOG.isDebugEnabled() ) LOG.debug( "making dirs: " + getQualifiedPath( conf ) ); return getFileSystem( conf ).mkdirs( getPath() ); } @Override public boolean deletePath( JobConf conf ) throws IOException { if( LOG.isDebugEnabled() ) LOG.debug( "deleting: " + getQualifiedPath( conf ) ); // do not delete the root directory if( getQualifiedPath( conf ).depth() == 0 ) return true; FileSystem fileSystem = getFileSystem( conf ); try { return fileSystem.delete( getPath(), true ); } catch( NullPointerException exception ) { // hack to get around npe thrown when fs reaches root directory if( !( fileSystem instanceof NativeS3FileSystem ) ) throw exception; } return true; } @Override public boolean pathExists( JobConf conf ) throws IOException { return getFileSystem( conf ).exists( getPath() ); } @Override public long getPathModified( JobConf conf ) throws IOException { FileStatus fileStatus = getFileSystem( conf ).getFileStatus( getPath() ); if( !fileStatus.isDir() ) return fileStatus.getModificationTime(); makeStatuses( conf ); // statuses is empty, return 0 if( statuses == null || statuses.length == 0 ) return 0; long date = 0; // filter out directories as we don't recurse into sub dirs for( FileStatus status : statuses ) { if( !status.isDir() ) date = Math.max( date, status.getModificationTime() ); } return date; } protected Path getTempPath( JobConf conf ) { String tempDir = conf.get( TEMPORARY_DIRECTORY ); if( tempDir == null ) tempDir = conf.get( "hadoop.tmp.dir" ); return new Path( tempDir ); } protected String makeTemporaryPathDir( String name ) { // _ is treated as a hidden file, so wipe them out name = name.replaceAll( "^[_\\W\\s]+", "" ); if( name.isEmpty() ) name = "temp-path"; return name.replaceAll( "[\\W\\s]+", "_" ) + Integer.toString( (int) ( 10000000 * Math.random() ) ); } /** * Given a file-system object, it makes an array of paths * * @param conf of type JobConf * @throws IOException on failure */ private void makeStatuses( JobConf conf ) throws IOException { if( statuses != null ) return; statuses = getFileSystem( conf ).listStatus( getPath() ); } /** @see Object#toString() */ @Override public String toString() { if( stringPath != null ) return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[\"" + Util.sanitizeUrl( stringPath ) + "\"]"; // sanitize else return getClass().getSimpleName() + "[\"" + getScheme() + "\"]" + "[not initialized]"; } /** @see Tap#equals(Object) */ @Override public boolean equals( Object object ) { if( this == object ) return true; if( object == null || getClass() != object.getClass() ) return false; if( !super.equals( object ) ) return false; Hfs hfs = (Hfs) object; if( stringPath != null ? !stringPath.equals( hfs.stringPath ) : hfs.stringPath != null ) return false; return true; } /** @see Tap#hashCode() */ @Override public int hashCode() { int result = super.hashCode(); result = 31 * result + ( stringPath != null ? 
  /** @see Tap#hashCode() */
  @Override
  public int hashCode()
    {
    int result = super.hashCode();
    result = 31 * result + ( stringPath != null ? stringPath.hashCode() : 0 );
    return result;
    }

  public TupleEntryIterator openForRead( JobConf conf ) throws IOException
    {
    Map<Object, Object> properties = HadoopUtil.createProperties( conf );

    properties.remove( "mapred.input.dir" );

    conf = HadoopUtil.createJobConf( properties, null );

    return new TupleEntryIterator( getSourceFields(), new TapIterator( this, conf ) );
    }

  public TupleEntryCollector openForWrite( JobConf conf ) throws IOException
    {
    return new TapCollector( this, conf );
    }
  }
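// A hedged end-to-end sketch (not part of the original source): reading tuples
// directly from an Hfs tap via openForRead(), outside of a Flow. The path is
// illustrative and assumes the data was written as a Cascading SequenceFile
// with a single "line" field.
//
//   Hfs tap = new Hfs( new Fields( "line" ), "hdfs://namenode/logs/input" );
//   TupleEntryIterator iterator = tap.openForRead( new JobConf() );
//
//   while( iterator.hasNext() )
//     System.out.println( iterator.next().getTuple() );
//
//   iterator.close();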