package water.persist;

import java.io.*;
import java.net.SocketTimeoutException;
import java.util.Arrays;
import java.util.Properties;

import water.*;
import water.Job.ProgressMonitor;
import water.api.Constants.Extensions;
import water.fvec.FileVec;
import water.fvec.Vec;
import water.util.Log;
import water.util.RIStream;

import com.amazonaws.*;
import com.amazonaws.auth.*;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3Client;
import com.amazonaws.services.s3.model.*;
import com.google.common.base.Objects;
import com.google.common.io.ByteStreams;

/** Persistence backend for S3 */
public final class PersistS3 extends Persist {
  private static final String HELP = "You can specify a credentials properties file with the -aws_credentials command line switch.";

  private static final String KEY_PREFIX = "s3://";
  private static final int KEY_PREFIX_LEN = KEY_PREFIX.length();

  private static final Object _lock = new Object();
  private static volatile AmazonS3 _s3;

  /** Returns the shared S3 client, lazily created with double-checked locking. */
  public static AmazonS3 getClient() {
    if( _s3 == null ) {
      synchronized( _lock ) {
        if( _s3 == null ) {
          try {
            _s3 = new AmazonS3Client(new H2OAWSCredentialsProviderChain(), s3ClientCfg());
          } catch( Throwable e ) {
            StringBuilder msg = new StringBuilder();
            msg.append(e.getMessage()).append('\n');
            msg.append("Unable to load S3 credentials.");
            if( H2O.OPT_ARGS.aws_credentials == null ) msg.append(HELP);
            throw Log.err(new RuntimeException(msg.toString()));
          }
        }
      }
    }
    return _s3;
  }

  /** Modified version of the default credentials provider chain that first
   *  consults the H2O-specific, file-based credentials provider. */
  public static class H2OAWSCredentialsProviderChain extends AWSCredentialsProviderChain {
    public H2OAWSCredentialsProviderChain() {
      super(new H2OArgCredentialsProvider(),
            new InstanceProfileCredentialsProvider(),
            new EnvironmentVariableCredentialsProvider(),
            new SystemPropertiesCredentialsProvider());
    }
  }
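  // A minimal sketch of the expected credentials file, assuming the AWS SDK's
  // PropertiesCredentials format (a plain Java properties file with
  // `accessKey` and `secretKey` entries); the key values below are the
  // standard AWS documentation examples, not real credentials:
  //
  //   accessKey=AKIAIOSFODNN7EXAMPLE
  //   secretKey=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY
  //
  // If the file is absent or unreadable, the chain above falls through to
  // instance-profile, environment-variable, and system-property credentials,
  // in that order.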
  /** A simple credentials provider reading file-based credentials from the
   *  command-line argument <code>--aws_credentials</code>. */
  static class H2OArgCredentialsProvider implements AWSCredentialsProvider {
    // Default location of the AWS credentials file
    public static final String DEFAULT_CREDENTIALS_LOCATION = "AwsCredentials.properties";

    @Override public AWSCredentials getCredentials() {
      File credentials = new File(Objects.firstNonNull(H2O.OPT_ARGS.aws_credentials, DEFAULT_CREDENTIALS_LOCATION));
      try {
        return new PropertiesCredentials(credentials);
      } catch( IOException e ) {
        throw new AmazonClientException("Unable to load AWS credentials from file " + credentials);
      }
    }

    @Override public void refresh() {}

    @Override public String toString() { return getClass().getSimpleName(); }
  }

  public static final class H2SO3InputStream extends RIStream {
    Key _k;
    long _to;
    String[] _bk;

    @Override protected InputStream open(long offset) {
      return getClient().getObject(new GetObjectRequest(_bk[0], _bk[1]).withRange(offset, _to)).getObjectContent();
    }

    public H2SO3InputStream(Key k, ProgressMonitor pmon) {
      this(k, pmon, 0, Long.MAX_VALUE);
    }

    public H2SO3InputStream(Key k, ProgressMonitor pmon, long from, long to) {
      super(from, pmon);
      _k = k;
      _to = Math.min(DKV.get(k).length() - 1, to);
      _bk = decodeKey(k);
      open();
    }
  }

  public static InputStream openStream(Key k, ProgressMonitor pmon) throws IOException {
    return new H2SO3InputStream(k, pmon);
  }

  public static Key loadKey(S3ObjectSummary obj) throws IOException {
    Key k = encodeKey(obj.getBucketName(), obj.getKey());
    long size = obj.getSize();
    Value val = new Value(k, (int) size, Value.S3); // Plain Value
    val.setdsk();
    DKV.put(k, val);
    return k;
  }

  // file implementation -------------------------------------------------------

  // Read up to v._max bytes of the Value. The Value should already be
  // persisted to disk. A racing delete can trigger a failure where we get a
  // null return, but no crash (although one could argue that a racing
  // load & delete is a bug no matter what).
  @Override public byte[] load(Value v) {
    long start_io_ms = System.currentTimeMillis();
    byte[] b = MemoryManager.malloc1(v._max);
    Key k = v._key;
    long skip = 0;
    // Skip offset based on chunk number
    if( k._kb[0] == Key.DVEC ) skip = FileVec.chunkOffset(k);
    // To complicate matters, S3 likes to reset connections when H2O hits it
    // too hard. We "fix" this by just trying again, assuming we're hitting a
    // bogus resource limit (H2O doing a parse looks like a DDoS to Amazon S3).
    S3ObjectInputStream s = null;
    while( true ) { // Loop, in case we get premature EOFs
      try {
        long start_ns = System.nanoTime(); // Blocking i/o call timing - without counting repeats
        s = getObjectForKey(k, skip, v._max).getObjectContent();
        ByteStreams.readFully(s, b); // Delegate the read loop to Guava; it fills the whole buffer
        assert v.isPersisted();
        TimeLine.record_IOclose(start_ns, start_io_ms, 1/* read */, v._max, Value.S3);
        return b;
        // Retry quietly on EOFs and socket timeouts; log all other
        // IOExceptions, but still retry.
      } catch( EOFException e ) {
        ignoreAndWait(e, false);
      } catch( SocketTimeoutException e ) {
        ignoreAndWait(e, false);
      } catch( IOException e ) {
        ignoreAndWait(e, true);
      } finally {
        try {
          if( s != null ) s.close();
        } catch( IOException e ) {}
      }
    }
  }

  private static void ignoreAndWait(final Exception e, boolean printException) {
    H2O.ignore(e, "Hit the S3 reset problem, waiting and retrying...", printException);
    try {
      Thread.sleep(500);
    } catch( InterruptedException ie ) {}
  }
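  // A self-contained sketch of the ranged-GET pattern used by load() above,
  // assuming only the AWS SDK v1 client; the bucket and key names are
  // hypothetical:
  //
  //   GetObjectRequest req = new GetObjectRequest("my-bucket", "data/part0.csv")
  //       .withRange(0, 4095);                          // inclusive byte range
  //   S3ObjectInputStream in = getClient().getObject(req).getObjectContent();
  //   try {
  //     byte[] buf = new byte[4096];
  //     ByteStreams.readFully(in, buf);                 // read exactly 4096 bytes
  //   } finally {
  //     in.close();
  //   }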
  // Store Value v to disk.
  @Override public void store(Value v) {
    if( !v._key.home() ) return;
    throw H2O.unimpl(); // VA only
  }

  /**
   * Creates the H2O key for the given S3 bucket and key. Returns the H2O key,
   * or null if the key cannot be created.
   *
   * @param bucket Bucket name
   * @param key S3 key name
   * @return H2O key pointing to the given bucket and key.
   */
  public static Key encodeKey(String bucket, String key) {
    Key res = encodeKeyImpl(bucket, key);
    // assert checkBijection(res, bucket, key);
    return res;
  }

  /**
   * Decodes the given H2O key into the S3 bucket and key name. Returns an
   * array of two strings: the first is the bucket name and the second is the
   * key name.
   *
   * @param k Key to be decoded.
   * @return Pair (array) of bucket name and key name.
   */
  public static String[] decodeKey(Key k) {
    return decodeKeyImpl(k);
    // assert checkBijection(k, res[0], res[1]);
    // return res;
  }

  // private static boolean checkBijection(Key k, String bucket, String key) {
  //   Key en = encodeKeyImpl(bucket, key);
  //   String[] de = decodeKeyImpl(k);
  //   boolean res = Arrays.equals(k._kb, en._kb) && bucket.equals(de[0]) && key.equals(de[1]);
  //   assert res : "Bijection failure:" + "\n\tKey 1:" + k + "\n\tKey 2:" + en + "\n\tBkt 1:" + bucket + "\n\tBkt 2:"
  //       + de[0] + "\n\tStr 1:" + key + "\n\tStr 2:" + de[1] + "";
  //   return res;
  // }

  private static Key encodeKeyImpl(String bucket, String key) {
    return Key.make(KEY_PREFIX + bucket + '/' + key);
  }

  private static String[] decodeKeyImpl(Key k) {
    String s = new String((k._kb[0] == Key.DVEC) ? Arrays.copyOfRange(k._kb, Vec.KEY_PREFIX_LEN, k._kb.length) : k._kb);
    assert s.startsWith(KEY_PREFIX) && s.indexOf('/') >= 0 : "Attempting to decode non-S3 key: " + k;
    s = s.substring(KEY_PREFIX_LEN);
    int dlm = s.indexOf('/');
    String bucket = s.substring(0, dlm);
    String key = s.substring(dlm + 1);
    return new String[] { bucket, key };
  }

  // Gets the S3 object associated with the key; the returned object reads
  // 'length' bytes starting at 'offset'.
  private static S3Object getObjectForKey(Key k, long offset, long length) throws IOException {
    String[] bk = decodeKey(k);
    GetObjectRequest r = new GetObjectRequest(bk[0], bk[1]);
    r.setRange(offset, offset + length - 1); // HTTP byte ranges are inclusive of both endpoints
    return getClient().getObject(r);
  }
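  // Example of the s3:// key bijection implemented above; the bucket and key
  // names are hypothetical:
  //
  //   Key k = PersistS3.encodeKey("my-bucket", "data/train.csv");
  //   // k maps to "s3://my-bucket/data/train.csv"
  //   String[] bk = PersistS3.decodeKey(k);
  //   // bk[0] == "my-bucket", bk[1] == "data/train.csv"
  //
  // Only the first '/' after the "s3://" prefix delimits the bucket from the
  // key, so S3 keys may themselves contain '/'.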
  // Gets the object metadata associated with the given key.
  private static ObjectMetadata getObjectMetadataForKey(Key k) {
    String[] bk = decodeKey(k);
    assert (bk.length == 2);
    return getClient().getObjectMetadata(bk[0], bk[1]);
  }

  /** S3 socket timeout property name */
  public final static String S3_SOCKET_TIMEOUT_PROP = "water.s3.socketTimeout";
  /** S3 connection timeout property name */
  public final static String S3_CONNECTION_TIMEOUT_PROP = "water.s3.connectionTimeout";
  /** S3 maximum error-retry count property name */
  public final static String S3_MAX_ERROR_RETRY_PROP = "water.s3.maxErrorRetry";
  /** S3 maximum number of HTTP connections property name */
  public final static String S3_MAX_HTTP_CONNECTIONS_PROP = "water.s3.maxHttpConnections";

  static ClientConfiguration s3ClientCfg() {
    ClientConfiguration cfg = new ClientConfiguration();
    Properties prop = System.getProperties();
    if( prop.containsKey(S3_SOCKET_TIMEOUT_PROP) )
      cfg.setSocketTimeout(Integer.getInteger(S3_SOCKET_TIMEOUT_PROP));
    if( prop.containsKey(S3_CONNECTION_TIMEOUT_PROP) )
      cfg.setConnectionTimeout(Integer.getInteger(S3_CONNECTION_TIMEOUT_PROP));
    if( prop.containsKey(S3_MAX_ERROR_RETRY_PROP) )
      cfg.setMaxErrorRetry(Integer.getInteger(S3_MAX_ERROR_RETRY_PROP));
    if( prop.containsKey(S3_MAX_HTTP_CONNECTIONS_PROP) )
      cfg.setMaxConnections(Integer.getInteger(S3_MAX_HTTP_CONNECTIONS_PROP));
    cfg.setProtocol(Protocol.HTTP);
    return cfg;
  }

  // TODO: needed if storing ice to S3
  @Override public String getPath() {
    throw new UnsupportedOperationException();
  }

  @Override public void clear() {
    throw new UnsupportedOperationException();
  }

  @Override public void loadExisting() {
    throw new UnsupportedOperationException();
  }

  @Override public void delete(Value v) {
    throw new UnsupportedOperationException();
  }
}
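// A hedged usage sketch for the tuning properties read by s3ClientCfg();
// the property values and the h2o.jar invocation below are illustrative, not
// documented defaults:
//
//   java -Dwater.s3.socketTimeout=60000 \
//        -Dwater.s3.connectionTimeout=10000 \
//        -Dwater.s3.maxErrorRetry=10 \
//        -Dwater.s3.maxHttpConnections=128 \
//        -jar h2o.jar
//
// Each property is read once, when the shared client is first constructed,
// so it must be set before the first S3 access.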