package org.commoncrawl.util;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.fs.s3native.NativeS3FileSystem;
import org.apache.hadoop.util.Progressable;

import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;

/**
 * Beginnings of a replacement for the S3N FileSystem.
 *
 * <p>Extends {@link NativeS3FileSystem} and overrides {@link #open(Path, int)}
 * and {@code create(...)} so that reads go through
 * {@code S3SeekableResilientInputStream} and writes go through
 * {@code S3MultipartUploadStream}. All other operations are inherited.
 *
 * <p>AWS credentials are read from the Hadoop configuration properties
 * {@code fs.s3n.awsAccessKeyId} and {@code fs.s3n.awsSecretAccessKey}.
 *
 * @author rana
 *
 */
public class S3NFileSystem extends NativeS3FileSystem {

  /** Configuration property holding the AWS access key id. */
  private static final String ACCESS_KEY_PROPERTY = "fs.s3n.awsAccessKeyId";

  /** Configuration property holding the AWS secret access key. */
  private static final String SECRET_KEY_PROPERTY = "fs.s3n.awsSecretAccessKey";

  /**
   * S3 client created in {@link #initialize(URI, Configuration)} and shut
   * down in {@link #close()}; {@code null} until initialization succeeds.
   */
  AmazonS3Client _s3Client;

  /**
   * Initializes the parent file system and then creates the S3 client from
   * the credentials found in the configuration.
   *
   * @param uri  file system URI, passed through to the superclass
   * @param conf Hadoop configuration, passed through to the superclass
   * @throws IOException if the superclass initialization fails
   */
  @Override
  public void initialize(URI uri, Configuration conf) throws IOException {
    super.initialize(uri, conf);
    // NOTE(review): either property may be absent from the configuration, in
    // which case null is handed to BasicAWSCredentials - confirm that callers
    // always set both keys.
    _s3Client = new AmazonS3Client(
        new BasicAWSCredentials(accessKey(), secretKey()));
  }

  /**
   * Closes the parent file system and shuts down the S3 client.
   *
   * <p>The client is shut down even when the superclass {@code close()}
   * throws, and the shutdown is skipped when the client was never created.
   *
   * @throws IOException if the superclass {@code close()} fails
   */
  @Override
  public void close() throws IOException {
    try {
      super.close();
    } finally {
      // guard: initialize() may not have run (or may have failed early)
      if (_s3Client != null) {
        _s3Client.shutdown();
      }
    }
  }

  /**
   * Opens the given path for reading through an
   * {@code S3SeekableResilientInputStream}.
   *
   * @param f          path to open; relative paths are resolved against the
   *                   working directory before being qualified
   * @param bufferSize buffer size forwarded to the underlying stream
   * @return an input stream wrapping the S3 reader
   * @throws IOException if the underlying stream cannot be created
   */
  @Override
  public FSDataInputStream open(Path f, int bufferSize) throws IOException {
    // make absolute, then qualify with this file system's scheme/authority
    Path qualifiedPath = makeQualified(makeAbsolute(f));

    // NOTE(review): the meaning of the trailing 100 is defined by
    // S3SeekableResilientInputStream, which is not visible here - confirm.
    return new FSDataInputStream(
        new S3SeekableResilientInputStream(
            qualifiedPath.toUri(),
            accessKey(),
            secretKey(),
            bufferSize,
            100));
  }

  /**
   * Resolves a relative path against the current working directory.
   *
   * @param path path to resolve
   * @return {@code path} itself when already absolute, otherwise a new path
   *         rooted at the working directory
   */
  private Path makeAbsolute(Path path) {
    if (path.isAbsolute()) {
      return path;
    }
    return new Path(getWorkingDirectory(), path);
  }

  /** Returns the AWS access key id from the configuration (may be null). */
  private String accessKey() {
    return getConf().get(ACCESS_KEY_PROPERTY);
  }

  /** Returns the AWS secret access key from the configuration (may be null). */
  private String secretKey() {
    return getConf().get(SECRET_KEY_PROPERTY);
  }

  /**
   * Creates the given path for writing through an
   * {@code S3MultipartUploadStream}.
   *
   * <p>{@code permission}, {@code replication}, {@code blockSize} and
   * {@code progress} are accepted for interface compatibility but are not
   * used by this implementation.
   *
   * @param f           path to create; relative paths are resolved against
   *                    the working directory before being qualified
   * @param permission  unused
   * @param overwrite   when {@code false}, creation fails if the path
   *                    already exists; also forwarded to the upload stream
   * @param bufferSize  used to size the upload queue (see below)
   * @param replication unused
   * @param blockSize   unused
   * @param progress    unused
   * @return an output stream wrapping the multipart uploader
   * @throws IOException if the path exists and {@code overwrite} is
   *                     {@code false}, or if the upload stream cannot be
   *                     created
   */
  @Override
  public FSDataOutputStream create(Path f, FsPermission permission,
      boolean overwrite, int bufferSize, short replication, long blockSize,
      Progressable progress) throws IOException {

    // Only pay for the existence lookup when its answer can matter.
    if (!overwrite && exists(f)) {
      throw new IOException("File already exists:"+f);
    }

    Path qualifiedPath = makeQualified(makeAbsolute(f));

    // The queue-size argument is never smaller than the stream's default, and
    // grows with the requested buffer size measured in default-sized parts.
    int maxQueuedCount = Math.max(
        S3MultipartUploadStream.DEFAULT_MAX_QUEUED_COUNT,
        bufferSize / S3MultipartUploadStream.DEFAULT_PART_BUFFER_SIZE);

    // NOTE(review): the null fifth argument is defined by
    // S3MultipartUploadStream, which is not visible here - confirm its meaning.
    return new FSDataOutputStream(
        new S3MultipartUploadStream(
            qualifiedPath.toUri(),
            accessKey(),
            secretKey(),
            overwrite,
            null,
            S3MultipartUploadStream.DEFAULT_UPLOADER_COUNT,
            S3MultipartUploadStream.DEFAULT_PART_BUFFER_SIZE,
            maxQueuedCount,
            S3MultipartUploadStream.DEFAULT_MAX_RETRIES_PER_PART),
        statistics);
  }
}