package com.scaleunlimited.cascading.hadoop;

import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.s3.S3FileSystem;
import org.apache.hadoop.fs.s3native.NativeS3FileSystem;
import org.apache.hadoop.mapred.JobConf;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.scaleunlimited.cascading.BasePath;

/**
 * A {@link BasePath} implementation backed by a Hadoop {@link FileSystem} and
 * {@link Path}.
 *
 * <p>Most query-style methods ({@link #isFile()}, {@link #isDirectory()},
 * {@link #exists()}, {@link #mkdirs()}, {@link #delete(boolean)}) log any
 * exception and return {@code false} rather than propagating it.
 */
public class HadoopPath extends BasePath {
    private static final Logger LOGGER = LoggerFactory.getLogger(HadoopPath.class);

    /** Suffix of the marker file that sits next to an S3 "folder" (see {@link #delete(boolean)}). */
    private static final String DISTCP_S3_FOLDER_MARKER_FILENAME_SUFFIX = "_$folder$";

    /** Maximum time to wait for an S3 deletion to become visible before giving up. */
    private static final long S3_DELETION_LATENCY_MILLISECONDS = 5 * 60 * 1000L;

    /** Pause between successive existence checks while waiting for an S3 deletion. */
    private static final long S3_DELETION_RETRY_MILLISECONDS = 10 * 1000L;

    private Configuration _conf;
    private Path _hadoopPath;
    private FileSystem _hadoopFS;

    /**
     * Creates a path using a default {@link Configuration}.
     *
     * @param path absolute or relative path string
     * @throws IOException if the file system for the path cannot be obtained
     */
    public HadoopPath(String path) throws IOException {
        this(path, new Configuration());
    }

    /**
     * Creates a path using the given configuration. A relative path is
     * resolved against the file system's working directory when the
     * configuration is local, and against the home directory otherwise.
     *
     * @param path absolute or relative path string
     * @param conf configuration used to look up the file system
     * @throws IOException if the file system for the path cannot be obtained
     */
    public HadoopPath(String path, Configuration conf) throws IOException {
        super(path);

        _conf = conf;

        Path relativePath = new Path(path);
        _hadoopFS = relativePath.getFileSystem(_conf);

        if (!relativePath.isAbsolute()) {
            Path parent;
            if (HadoopUtils.isConfigLocal(_conf)) {
                // When running locally, there is a concept of a working directory
                // that we need to keep using.
                parent = _hadoopFS.getWorkingDirectory();
            } else {
                // We always resolve paths relative to the home directory, as
                // using the "working directory" seems fragile - this seems to
                // change in some situations, even though we don't provide a
                // way for the caller to set it in the _hadoopFS that we use.
                parent = _hadoopFS.getHomeDirectory();
            }

            _hadoopPath = new Path(parent, relativePath);
        } else {
            _hadoopPath = relativePath;
        }
    }

    /**
     * Creates a path for {@code subdir} inside {@code parent}, using a default
     * {@link Configuration}.
     *
     * @param parent parent path
     * @param subdir name of the child, relative to {@code parent}
     * @throws IOException if the file system for the path cannot be obtained
     */
    public HadoopPath(BasePath parent, String subdir) throws IOException {
        this(parent, subdir, new Configuration());
    }

    /**
     * Creates a path for {@code subdir} inside {@code parent}, using the given
     * configuration.
     *
     * @param parent parent path
     * @param subdir name of the child, relative to {@code parent}
     * @param conf configuration used to look up the file system
     * @throws IOException if the file system for the path cannot be obtained
     */
    public HadoopPath(BasePath parent, String subdir, Configuration conf) throws IOException {
        // Delegate with the caller's configuration so that _conf is the one
        // that was passed in (it is reused by list() to build child paths).
        this(parent.getPath() + "/" + subdir, conf);

        // Re-derive the file system and Hadoop path from the parent's absolute
        // path, so the child is always anchored to the parent's location.
        Path parentPath = new Path(parent.getAbsolutePath());
        _hadoopFS = parentPath.getFileSystem(conf);
        _hadoopPath = new Path(parentPath, subdir);
    }

    /**
     * @return the underlying Hadoop {@link Path}
     */
    public Path getHadoopPath() {
        return _hadoopPath;
    }

    /**
     * @return the string form of the underlying Hadoop path
     */
    @Override
    public String getAbsolutePath() {
        return _hadoopPath.toString();
    }

    /**
     * @return what the file system reports for {@code isFile}, or {@code false}
     *         if the check throws an {@link IOException} (which is logged)
     */
    @Override
    public boolean isFile() {
        try {
            return _hadoopFS.isFile(_hadoopPath);
        } catch (IOException e) {
            LOGGER.error("Exception getting information about Hadoop path: " + e.getMessage(), e);
            return false;
        }
    }

    /**
     * @return whether the file status of this path is a directory, or
     *         {@code false} if fetching the status throws an
     *         {@link IOException} (which is logged)
     */
    @Override
    public boolean isDirectory() {
        try {
            return _hadoopFS.getFileStatus(_hadoopPath).isDir();
        } catch (IOException e) {
            LOGGER.error("Exception getting information about Hadoop path: " + e.getMessage(), e);
            return false;
        }
    }

    /**
     * @return what the file system reports for {@code exists}, or {@code false}
     *         if the check throws an {@link IOException} (which is logged)
     */
    @Override
    public boolean exists() {
        try {
            return _hadoopFS.exists(_hadoopPath);
        } catch (IOException e) {
            LOGGER.error("Exception getting information about Hadoop path: " + e.getMessage(), e);
            return false;
        }
    }

    /**
     * @return the result of the file system's {@code mkdirs}, or {@code false}
     *         if it throws an {@link IOException} (which is logged)
     */
    @Override
    public boolean mkdirs() {
        try {
            return _hadoopFS.mkdirs(_hadoopPath);
        } catch (IOException e) {
            LOGGER.error("Exception creating directory for Hadoop path: " + e.getMessage(), e);
            return false;
        }
    }

    /**
     * Renames this path to the given target using this path's file system.
     *
     * @param path target path; must be a {@code HadoopPath}
     * @return the result of the file system's {@code rename}
     * @throws IllegalArgumentException if {@code path} is not a {@code HadoopPath}
     * @throws IOException if the rename fails with an I/O error
     */
    @Override
    public boolean rename(BasePath path) throws IOException {
        if (!(path instanceof HadoopPath)) {
            throw new IllegalArgumentException("HadoopPath can only be renamed to another HadoopPath.");
        }

        return _hadoopFS.rename(_hadoopPath, ((HadoopPath) path)._hadoopPath);
    }

    /**
     * Deletes this path. On an S3 file system this additionally polls until
     * the path no longer exists (up to
     * {@link #S3_DELETION_LATENCY_MILLISECONDS}) and removes the
     * {@code _$folder$} marker file of a deleted folder.
     *
     * @param isRecursive passed through to the file system's {@code delete}
     * @return {@code true} only if the delete call returned true and the path
     *         no longer exists afterwards; {@code false} otherwise, including
     *         when any exception is thrown or the waiting thread is interrupted
     */
    @Override
    public boolean delete(boolean isRecursive) {
        try {
            // Try really hard to make things easy for S3 folder deletions
            // by ensuring that folder names always end with a slash.
            Path parentPath = _hadoopPath.getParent();
            String targetName = _hadoopPath.getName();
            Path targetPath = _hadoopPath;
            boolean isFolder = !(_hadoopFS.isFile(targetPath));
            if (isFolder && (!(targetName.endsWith(Path.SEPARATOR)))) {
                targetName = targetName + Path.SEPARATOR;
                targetPath = new Path(parentPath, targetName);
            }
            boolean result = _hadoopFS.delete(targetPath, isRecursive);

            // Apparently, a folder delete in S3 can return true, even though
            // the folder might still exist. :(
            if (result) {

                // Give S3 some time to finish deleting the object so we
                // won't think the deletion failed when it is really going to
                // succeed soon.
                if ((_hadoopFS instanceof S3FileSystem)
                        || (_hadoopFS instanceof NativeS3FileSystem)) {
                    long shouldBeCompleteTime = System.currentTimeMillis();
                    while (exists()) {
                        long extraTime = (System.currentTimeMillis() - shouldBeCompleteTime);
                        if (extraTime > S3_DELETION_LATENCY_MILLISECONDS) {
                            String message = String.format(
                                    "I have patiently waited %d seconds, but S3 still has not finished deleting %s - aborting!",
                                    (extraTime / 1000),
                                    _hadoopPath);
                            LOGGER.error(message);
                            return false;
                        }
                        String message = String.format(
                                "S3 still has not finished deleting %s after %d seconds, continuing to wait...",
                                _hadoopPath,
                                (extraTime / 1000));
                        LOGGER.error(message);
                        Thread.sleep(S3_DELETION_RETRY_MILLISECONDS);
                    }

                    // Ensure that the special folder file associated with
                    // (i.e., defining) S3 "folders" is always deleted as well.
                    if (isFolder) {
                        // Strip the trailing separator before appending the marker suffix.
                        String folderMarkerFileName = (targetName.substring(0, targetName.length() - 1)
                                + DISTCP_S3_FOLDER_MARKER_FILENAME_SUFFIX);
                        Path folderMarkerPath = new Path(parentPath, folderMarkerFileName);
                        if (_hadoopFS.exists(folderMarkerPath)) {
                            boolean folderMarkerDeleteResult =
                                    _hadoopFS.delete(folderMarkerPath, false);
                            String message = String.format(
                                    "Extra [Native]S3FileSystem.delete of %s returned %s",
                                    folderMarkerPath,
                                    folderMarkerDeleteResult);
                            LOGGER.info(message);
                        }
                    }
                }

                if (exists()) {
                    String message = String.format(
                            "FileSystem.delete of %s returned true but exists still returns true",
                            _hadoopPath);
                    LOGGER.error(message);
                    return false;
                }
            } else {
                String message = String.format(
                        "Initial FileSystem.delete of %s returned false",
                        _hadoopPath);
                LOGGER.debug(message);
                return false;
            }

            // Pass the result of the target deletion on to the caller
            return result;
        } catch (InterruptedException e) {
            // Restore the interrupt status so callers further up the stack can
            // still observe that this thread was asked to stop.
            Thread.currentThread().interrupt();
            LOGGER.error("Interrupted while deleting Hadoop path: " + e.getMessage(), e);
            return false;
        } catch (Exception e) {
            LOGGER.error("Exception deleting Hadoop path: " + e.getMessage(), e);
            return false;
        }
    }

    /**
     * @return the final component of the underlying Hadoop path
     */
    @Override
    public String getName() {
        return _hadoopPath.getName();
    }

    /**
     * Lists the entries that the file system reports for this path.
     *
     * @return one {@code HadoopPath} per returned {@link FileStatus}, each
     *         created with this path's configuration
     * @throws Exception if listing or constructing a child path fails
     */
    @Override
    public BasePath[] list() throws Exception {
        FileStatus[] files = _hadoopFS.listStatus(_hadoopPath);
        HadoopPath[] result = new HadoopPath[files.length];
        for (int i = 0; i < files.length; i++) {
            result[i] = new HadoopPath(files[i].getPath().toString(), _conf);
        }

        return result;
    }

    /**
     * @return {@code true} if the URI scheme of this path's file system is
     *         {@code file}
     */
    @Override
    public boolean isLocal() {
        // A URI scheme is the part before "://" (e.g. "file", "hdfs"), so it
        // can never equal "file://". Schemes are case-insensitive.
        String scheme = _hadoopFS.getUri().getScheme();
        return "file".equalsIgnoreCase(scheme);
    }

    /**
     * @return the result of the file system's {@code createNewFile}
     * @throws IOException if the file system call fails
     */
    @Override
    public boolean createNewFile() throws IOException {
        return _hadoopFS.createNewFile(_hadoopPath);
    }

    /**
     * @return a stream opened by the file system for reading this path
     * @throws IOException if the path cannot be opened
     */
    @Override
    public InputStream openInputStream() throws IOException {
        return _hadoopFS.open(_hadoopPath);
    }

    /**
     * @return a stream created by the file system for writing this path, with
     *         the overwrite flag set to {@code true}
     * @throws IOException if the path cannot be created
     */
    @Override
    public OutputStream openOutputStream() throws IOException {
        return _hadoopFS.create(_hadoopPath, true);
    }

    /**
     * @return the same string as {@link #getAbsolutePath()}
     */
    @Override
    public String toString() {
        return getAbsolutePath();
    }
}