package org.commoncrawl.util;
import java.io.EOFException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.PositionedReadable;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.IOUtils;
import com.amazonaws.auth.BasicAWSCredentials;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.ObjectMetadata;
/**
* A resilient blocking InputStream that reads data off of S3. Resilient - as in able to transparently recover from
* IO errors when reading data from S3.
*
* @author rana
*
*/
public class S3SeekableResilientInputStream extends InputStream implements Seekable, PositionedReadable {
URI _uri;
String _s3AccessKey;
String _s3Secret;
S3InputStream _s3Stream;
int _bufferSize;
// position of the file cursor
long _cursorPos = 0;
// position of the stream
long _streamPos = 0;
long _streamLength=-1;
int _retryCounts;
int _maxRetries;
/** logging **/
private static final Log LOG = LogFactory.getLog(S3SeekableResilientInputStream.class);
public S3SeekableResilientInputStream(URI uri,String s3AccessKey,String s3Secret,int bufferSize,int maxRetries)throws IOException{
_uri = uri;
_bufferSize = bufferSize;
_s3AccessKey = s3AccessKey;
_s3Secret = s3Secret;
_maxRetries = maxRetries;
_streamLength = getFileLength(uri, s3AccessKey, s3Secret);
restartStream();
}
private static long getFileLength(URI uri,String s3AccessKey,String s3Secret)throws IOException {
BasicAWSCredentials credentials
= new BasicAWSCredentials(
s3AccessKey,
s3Secret);
AmazonS3Client s3Client = new AmazonS3Client(credentials);
try {
ObjectMetadata metadata = s3Client.getObjectMetadata(uri.getHost(), uri.getPath().substring(1));
return metadata.getContentLength();
}
finally {
s3Client.shutdown();
}
}
private void restartStream()throws IOException {
if (_s3Stream != null) {
_s3Stream.close();
_s3Stream = null;
}
LOG.info("Restart Stream:" + _uri.toString() + " at Position:"+ _streamPos);
_s3Stream = new S3InputStream(_uri, _s3AccessKey, _s3Secret,_bufferSize,_streamPos);
}
@Override
public int read() throws IOException {
IOException lastException = null;
if (_streamPos != _cursorPos) {
_streamPos = _cursorPos;
restartStream();
}
do {
try {
int bytesRead = _s3Stream.read();
if (bytesRead != -1) {
_streamPos++;
_cursorPos++;
}
else if (_streamPos != _streamLength) {
throw new EOFException();
}
return bytesRead;
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
lastException = e;
}
if (++_retryCounts < _maxRetries) {
restartStream();
}
else {
LOG.error("Retry Count for Stream:" + _uri.toString() + " Exceeded!");
break;
}
}
while (true);
throw lastException;
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
if (_streamPos != _cursorPos) {
_streamPos = _cursorPos;
restartStream();
}
int bytesRead = streamRead(b, off, len);
if (bytesRead != -1)
_cursorPos += bytesRead;
return bytesRead;
}
private int streamRead(byte[] b, int off, int len) throws IOException {
IOException lastException = null;
do {
try {
int bytesRead = _s3Stream.read(b, off, len);
if (bytesRead != -1) {
_streamPos += bytesRead;
//LOG.info("Stream:" + _uri.toString() + " Pos:" + _streamPos);
}
else if (_streamPos != _streamLength) {
throw new EOFException();
}
return bytesRead;
}
catch (IOException e) {
LOG.error(CCStringUtils.stringifyException(e));
lastException = e;
}
if (++_retryCounts < _maxRetries) {
restartStream();
}
else {
LOG.error("Retry Count for Stream:" + _uri.toString() + " Exceeded!");
break;
}
}
while (true);
throw lastException;
}
@Override
public void close() throws IOException {
if (_s3Stream != null) {
_s3Stream.close();
_s3Stream = null;
}
}
@Override
public void seek(long pos) throws IOException {
_cursorPos = pos;
if (_streamPos != _cursorPos) {
_streamPos = _cursorPos;
restartStream();
}
}
@Override
public long getPos() throws IOException {
return _cursorPos;
}
@Override
public boolean seekToNewSource(long targetPos) throws IOException {
seek(targetPos);
return true;
}
@Override
public int read(long position, byte[] buffer, int offset, int length)throws IOException {
LOG.info("PREAD pos:" + position + " originalPos:" + _streamPos + " URI:" + _uri.toString());
if (_streamPos != position) {
_streamPos = position;
restartStream();
}
return streamRead(buffer,offset,length);
}
@Override
public void readFully(long position, byte[] buffer, int offset, int length)
throws IOException {
LOG.info("PREAD pos:" + position + " originalPos:" + _streamPos + " URI:" + _uri.toString());
if (_streamPos != position) {
_streamPos = position;
restartStream();
}
int toRead = length;
while (toRead > 0) {
int ret = streamRead(buffer, offset, toRead);
if (ret < 0) {
throw new IOException("Premature EOF from inputStream");
}
toRead -= ret;
offset += ret;
}
}
@Override
public void readFully(long position, byte[] buffer) throws IOException {
readFully(position, buffer, 0, buffer.length);
}
}