package water.persist;
import java.io.*;
import java.net.SocketTimeoutException;
import java.util.Arrays;
import java.util.Properties;
import water.*;
import water.Job.ProgressMonitor;
import water.api.Constants.Extensions;
import water.fvec.FileVec;
import water.fvec.Vec;
import water.util.Log;
import water.util.RIStream;
import com.amazonaws.*;
import com.amazonaws.auth.*;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.*;
import com.google.common.base.Objects;
import com.google.common.io.ByteStreams;
/** Persistence backend for S3 */
public final class PersistS3 extends Persist {
// Hint appended to the error raised by getClient() when no credentials file
// was named on the command line.
private static final String HELP = "You can specify a credentials properties file with the -aws_credentials command line switch.";
// Prefix that every H2O key naming an S3 object starts with, and its length.
private static final String KEY_PREFIX = "s3://";
private static final int KEY_PREFIX_LEN = KEY_PREFIX.length();
// Monitor guarding the one-time creation of the shared client in getClient().
private static final Object _lock = new Object();
// Shared S3 client, created lazily by getClient(). It is volatile because
// getClient() reads it outside the synchronized block.
private static volatile AmazonS3 _s3;
/**
 * Returns the shared S3 client, creating it on first use.
 * <p>
 * Creation is guarded by double-checked locking on {@code _lock}; the client is
 * built from {@link H2OAWSCredentialsProviderChain} and {@link #s3ClientCfg()}.
 *
 * @return the shared S3 client, never null
 * @throws RuntimeException if the client cannot be created; the exception is
 *         passed through {@code Log.err} and carries the original failure as
 *         its cause
 */
public static AmazonS3 getClient() {
  if( _s3 == null ) {
    synchronized( _lock ) {
      if( _s3 == null ) {
        try {
          _s3 = new AmazonS3Client(new H2OAWSCredentialsProviderChain(), s3ClientCfg());
        } catch( Throwable e ) {
          StringBuilder msg = new StringBuilder();
          msg.append(e.getMessage()).append('\n');
          msg.append("Unable to load S3 credentials.");
          // Only suggest the command line switch when it was not used.
          if( H2O.OPT_ARGS.aws_credentials == null ) msg.append(' ').append(HELP);
          // Chain the original failure so its type and stack trace are not lost.
          throw Log.err(new RuntimeException(msg.toString(), e));
        }
      }
    }
  }
  return _s3;
}
/**
 * Credentials provider chain that includes the H2O-specific, file-based
 * {@link H2OArgCredentialsProvider} in addition to stock AWS providers.
 * <p>
 * The providers are handed to the superclass constructor in this order:
 * H2O argument/file provider, instance profile, environment variables,
 * Java system properties.
 */
public static class H2OAWSCredentialsProviderChain extends AWSCredentialsProviderChain {
public H2OAWSCredentialsProviderChain() {
super(new H2OArgCredentialsProvider(),
new InstanceProfileCredentialsProvider(),
new EnvironmentVariableCredentialsProvider(),
new SystemPropertiesCredentialsProvider());
}
}
/**
 * Credentials provider that reads AWS credentials from a properties file.
 * <p>
 * The file is the one named by the <code>aws_credentials</code> command line
 * option (<code>H2O.OPT_ARGS.aws_credentials</code>) or, when that option is
 * not set, {@link #DEFAULT_CREDENTIALS_LOCATION}.
 */
static class H2OArgCredentialsProvider implements AWSCredentialsProvider {
  /** Credentials file used when none was named on the command line. */
  public static final String DEFAULT_CREDENTIALS_LOCATION = "AwsCredentials.properties";

  /**
   * Loads the credentials from the configured properties file. The file is
   * re-read on every call.
   *
   * @return credentials parsed from the properties file
   * @throws AmazonClientException if reading the file fails with an
   *         {@link IOException}; that exception is attached as the cause
   */
  @Override public AWSCredentials getCredentials() {
    File credentials = new File(Objects.firstNonNull(H2O.OPT_ARGS.aws_credentials, DEFAULT_CREDENTIALS_LOCATION));
    try {
      return new PropertiesCredentials(credentials);
    } catch (IOException e) {
      // Keep the I/O failure as the cause so the reason (missing file,
      // permissions, ...) is visible to whoever reports this exception.
      throw new AmazonClientException("Unable to load AWS credentials from file " + credentials, e);
    }
  }

  /** Does nothing: no state is cached between {@link #getCredentials()} calls. */
  @Override public void refresh() {}

  /** Returns the simple class name of this provider. */
  @Override
  public String toString() {
    return getClass().getSimpleName();
  }
}
/**
 * {@link RIStream} whose underlying stream is a ranged S3 GET of the object
 * named by an H2O key. Each call to {@link #open(long)} issues a new request
 * starting at the given offset and ending at {@code _to}.
 */
public static final class H2SO3InputStream extends RIStream {
  Key _k;        // H2O key of the S3 object being read
  long _to;      // last byte position requested
  String[] _bk;  // { bucket, key } as returned by decodeKey(_k)

  /** Opens a stream over the object from offset 0 up to its last byte. */
  public H2SO3InputStream(Key k, ProgressMonitor pmon) {
    this(k, pmon, 0, Long.MAX_VALUE);
  }

  /**
   * Opens a stream over the byte range {@code from..to} of the object; the
   * upper bound is capped at the length of the Value stored in the DKV under
   * {@code k}, minus one.
   */
  public H2SO3InputStream(Key k, ProgressMonitor pmon, long from, long to) {
    super(from, pmon);
    _k = k;
    long lastByte = DKV.get(k).length() - 1;
    _to = Math.min(lastByte, to);
    _bk = decodeKey(k);
    open();
  }

  /** Issues a ranged GET for bytes {@code offset.._to} and returns its content stream. */
  @Override protected InputStream open(long offset) {
    AmazonS3 client = getClient();
    GetObjectRequest request = new GetObjectRequest(_bk[0], _bk[1]).withRange(offset, _to);
    return client.getObject(request).getObjectContent();
  }
}
/**
 * Opens a stream over the whole S3 object named by the given H2O key.
 *
 * @param k    H2O key of the S3 object
 * @param pmon progress monitor handed to the stream
 * @return a new {@link H2SO3InputStream} for {@code k}
 */
public static InputStream openStream(Key k, ProgressMonitor pmon) throws IOException {
  InputStream stream = new H2SO3InputStream(k, pmon);
  return stream;
}
/**
 * Puts a Value for the given S3 object into the DKV and returns its key.
 * <p>
 * The Value is created with backend type {@code Value.S3} and has
 * {@code setdsk()} called on it before it is stored.
 *
 * @param obj summary of the S3 object (bucket, key and size are read)
 * @return the H2O key under which the Value was stored
 */
public static Key loadKey(S3ObjectSummary obj) throws IOException {
  final Key h2oKey = encodeKey(obj.getBucketName(), obj.getKey());
  // Plain Value. Note the object's long size is narrowed to int here.
  final Value s3Value = new Value(h2oKey, (int) obj.getSize(), Value.S3);
  s3Value.setdsk();
  DKV.put(h2oKey, s3Value);
  return h2oKey;
}
// file implementation -------------------------------------------------------
// Reads the v._max bytes backing Value v from S3 into a newly allocated
// buffer and returns it. When the first key byte is Key.DVEC the read starts
// at FileVec.chunkOffset(key) within the S3 object; otherwise it starts at
// offset 0. Any IOException during the request or the read causes the whole
// read to be retried after a pause (see ignoreAndWait); there is no upper
// bound on the number of attempts. The stream is closed after every attempt.
@Override public byte[] load(Value v) {
long start_io_ms = System.currentTimeMillis();
byte[] b = MemoryManager.malloc1(v._max);
Key k = v._key;
long skip = 0;
// Skip offset based on chunk number
if(k._kb[0] == Key.DVEC)
skip = FileVec.chunkOffset(k); // The offset
// To complicate matters, S3 likes to reset connections when H2O hits it
// too hard. We "fix" this by just trying again, assuming we're getting
// hit with a bogus resource limit (H2O doing a parse looks like a DDOS to
// Amazon S3).
S3ObjectInputStream s = null;
while( true ) { // Loop, in case we get premature EOF's
try {
long start_ns = System.nanoTime(); // Blocking i/o call timing - without counting repeats
s = getObjectForKey(k, skip, v._max).getObjectContent();
ByteStreams.readFully(s, b); // delegate work to Google (it reads the byte buffer in a cycle as we did)
assert v.isPersisted();
TimeLine.record_IOclose(start_ns, start_io_ms, 1/* read */, v._max, Value.S3);
return b;
// EOF and socket timeouts are retried without printing the exception.
// Every other IOException is retried as well, but is passed on with
// printException == true.
} catch( EOFException e ) {
ignoreAndWait(e, false);
} catch( SocketTimeoutException e ) {
ignoreAndWait(e, false);
} catch( IOException e ) {
ignoreAndWait(e, true);
} finally {
// Close the stream of this attempt; a failure to close is ignored.
try {
if( s != null ) s.close();
} catch( IOException e ) {}
}
}
}
// Hands the exception to H2O.ignore together with the "S3 reset" message and
// then pauses the calling thread for 500 ms before the caller retries.
private static void ignoreAndWait(final Exception e, boolean printException) {
  H2O.ignore(e, "Hit the S3 reset problem, waiting and retrying...", printException);
  final long pauseMs = 500;
  try {
    Thread.sleep(pauseMs);
  } catch( InterruptedException interrupted ) {
    // Ignored on purpose: an interrupt merely cuts the pause short. The
    // interrupt status is not restored.
  }
}
// Storing to S3 is not implemented: a no-op for keys that are not homed on
// this node, and an "unimplemented" failure for keys that are.
@Override public void store(Value v) {
  if( v._key.home() ) {
    throw H2O.unimpl(); // VA only
  }
}
/**
 * Builds the H2O key that refers to the given S3 bucket and object key.
 *
 * @param bucket
 *          S3 bucket name
 * @param key
 *          S3 object key within the bucket
 * @return H2O key pointing to the given bucket and key
 */
public static Key encodeKey(String bucket, String key) {
  return encodeKeyImpl(bucket, key);
}
/**
 * Splits an H2O key that refers to an S3 object into its bucket name and
 * S3 object key.
 *
 * @param k
 *          H2O key to decode
 * @return two-element array: bucket name first, S3 object key second
 */
public static String[] decodeKey(Key k) {
  String[] bucketAndKey = decodeKeyImpl(k);
  return bucketAndKey;
}
// private static boolean checkBijection(Key k, String bucket, String key) {
// Key en = encodeKeyImpl(bucket, key);
// String[] de = decodeKeyImpl(k);
// boolean res = Arrays.equals(k._kb, en._kb) && bucket.equals(de[0]) && key.equals(de[1]);
// assert res : "Bijection failure:" + "\n\tKey 1:" + k + "\n\tKey 2:" + en + "\n\tBkt 1:" + bucket + "\n\tBkt 2:"
// + de[0] + "\n\tStr 1:" + key + "\n\tStr 2:" + de[1] + "";
// return res;
// }
// Builds the key name "s3://<bucket>/<key>" and turns it into an H2O key.
private static Key encodeKeyImpl(String bucket, String key) {
  StringBuilder name = new StringBuilder(KEY_PREFIX);
  name.append(bucket).append('/').append(key);
  return Key.make(name.toString());
}
// Decodes an H2O key of the form "s3://<bucket>/<key>" into { bucket, key }.
// For keys whose first byte is Key.DVEC, the leading Vec.KEY_PREFIX_LEN bytes
// are skipped before the name is decoded.
// Throws IllegalArgumentException when the name has no '/' separating the
// bucket from the object key.
private static String[] decodeKeyImpl(Key k) {
  byte[] nameBytes = (k._kb[0] == Key.DVEC)
      ? Arrays.copyOfRange(k._kb, Vec.KEY_PREFIX_LEN, k._kb.length)
      : k._kb;
  String s = new String(nameBytes);
  assert s.startsWith(KEY_PREFIX) : "Attempting to decode non s3 key: " + k;
  s = s.substring(KEY_PREFIX_LEN);
  // The separator must be searched for AFTER the prefix is stripped: the
  // prefix "s3://" itself contains '/', so checking the full string can
  // never detect a missing bucket/key separator.
  int dlm = s.indexOf('/');
  if( dlm < 0 ) throw new IllegalArgumentException("Attempting to decode non s3 key: " + k);
  String bucket = s.substring(0, dlm);
  String key = s.substring(dlm + 1);
  return new String[] { bucket, key };
}
// Requests 'length' bytes starting at 'offset' from the S3 object named by
// the given H2O key and returns the resulting S3 object.
private static S3Object getObjectForKey(Key k, long offset, long length) throws IOException {
  final String[] bucketAndKey = decodeKey(k);
  // The end of the range is treated as inclusive, hence the -1
  // (the original author flagged this reading of the docs as unconfirmed).
  final long lastByte = offset + length - 1;
  GetObjectRequest request = new GetObjectRequest(bucketAndKey[0], bucketAndKey[1]).withRange(offset, lastByte);
  return getClient().getObject(request);
}
// Fetches the S3 object metadata for the object named by the given H2O key.
private static ObjectMetadata getObjectMetadataForKey(Key k) {
  final String[] parts = decodeKey(k);
  assert (parts.length == 2);
  final AmazonS3 client = getClient();
  return client.getObjectMetadata(parts[0], parts[1]);
}
/** Name of the system property that, when set, overrides the S3 client's socket timeout. */
public final static String S3_SOCKET_TIMEOUT_PROP = "water.s3.socketTimeout";
/** Name of the system property that, when set, overrides the S3 client's connection timeout. */
public final static String S3_CONNECTION_TIMEOUT_PROP = "water.s3.connectionTimeout";
/** Name of the system property that, when set, overrides the S3 client's maximal error retry count. */
public final static String S3_MAX_ERROR_RETRY_PROP = "water.s3.maxErrorRetry";
/** Name of the system property that, when set, overrides the S3 client's maximal number of connections. */
public final static String S3_MAX_HTTP_CONNECTIONS_PROP = "water.s3.maxHttpConnections";
/**
 * Builds the client configuration used for the shared S3 client.
 * <p>
 * Starts from the SDK defaults and applies each of the four
 * {@code water.s3.*} system properties that is set. The protocol is always
 * set to {@code Protocol.HTTP}.
 *
 * @return the client configuration
 * @throws IllegalArgumentException if one of the properties is set to a value
 *         that is not an integer
 */
static ClientConfiguration s3ClientCfg() {
  ClientConfiguration cfg = new ClientConfiguration();
  Integer socketTimeout = intSystemProperty(S3_SOCKET_TIMEOUT_PROP);
  if( socketTimeout != null ) cfg.setSocketTimeout(socketTimeout);
  Integer connectionTimeout = intSystemProperty(S3_CONNECTION_TIMEOUT_PROP);
  if( connectionTimeout != null ) cfg.setConnectionTimeout(connectionTimeout);
  Integer maxErrorRetry = intSystemProperty(S3_MAX_ERROR_RETRY_PROP);
  if( maxErrorRetry != null ) cfg.setMaxErrorRetry(maxErrorRetry);
  Integer maxConnections = intSystemProperty(S3_MAX_HTTP_CONNECTIONS_PROP);
  if( maxConnections != null ) cfg.setMaxConnections(maxConnections);
  // NOTE(review): plain HTTP rather than HTTPS is selected here - confirm
  // this is still intended.
  cfg.setProtocol(Protocol.HTTP);
  return cfg;
}

// Reads an integer system property. Returns null when the property is not
// set, and fails with a descriptive message when it is set but not a valid
// integer (instead of a bare NullPointerException from unboxing null).
private static Integer intSystemProperty(String name) {
  String raw = System.getProperty(name);
  if( raw == null ) return null;
  try {
    return Integer.decode(raw);
  } catch( NumberFormatException e ) {
    throw new IllegalArgumentException("System property " + name + " must be an integer, but was '" + raw + "'", e);
  }
}
// TODO needed if storing ice to S3
// Not supported by the S3 backend: always throws UnsupportedOperationException.
@Override public String getPath() {
throw new UnsupportedOperationException();
}
// Not supported by the S3 backend: always throws UnsupportedOperationException.
@Override public void clear() {
throw new UnsupportedOperationException();
}
// Not supported by the S3 backend: always throws UnsupportedOperationException.
@Override public void loadExisting() {
throw new UnsupportedOperationException();
}
// Not supported by the S3 backend: always throws UnsupportedOperationException.
@Override public void delete(Value v) {
throw new UnsupportedOperationException();
}
}