package thredds.crawlabledataset.s3; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.List; import java.util.Objects; import java.util.concurrent.TimeUnit; import com.amazonaws.services.s3.model.ObjectListing; import com.amazonaws.services.s3.model.ObjectMetadata; import com.amazonaws.services.s3.model.S3ObjectSummary; import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import thredds.crawlabledataset.CrawlableDataset; import thredds.crawlabledataset.CrawlableDatasetFile; /** * CrawlableDataset implementation that allows THREDDS to interact with datasets stored on Amazon S3. * * @author danfruehauf * @author cwardgar * @since 2015/08/23 */ public class CrawlableDatasetAmazonS3 extends CrawlableDatasetFile { private static final Logger logger = LoggerFactory.getLogger(CrawlableDatasetAmazonS3.class); private static final long ENTRY_EXPIRATION_TIME = 1; // In hours. private static final long MAX_SUMMARY_ENTRIES = 10000; private static final Cache<S3URI, S3ObjectSummary> objectSummaryCache = CacheBuilder.newBuilder() .expireAfterAccess(ENTRY_EXPIRATION_TIME, TimeUnit.HOURS) .maximumSize(MAX_SUMMARY_ENTRIES) .build(); private static ThreddsS3Client defaultThreddsS3Client = new CachingThreddsS3Client(new ThreddsS3ClientImpl()); private final S3URI s3uri; private final ThreddsS3Client threddsS3Client; // A configObject is required by the superclass constructor (CrawlableDatasetFile). // However, it is ignored; in fact, a warning is emitted if it ISN'T null. // So, for convenience, we're providing constructors without the configObject parameter. public CrawlableDatasetAmazonS3(String path) { this(path, null); } public CrawlableDatasetAmazonS3(String path, Object configObject) { // Used reflectively by CrawlableDatasetFactory this(new S3URI(path), configObject); } public CrawlableDatasetAmazonS3(S3URI s3uri) { this(s3uri, null); } public CrawlableDatasetAmazonS3(S3URI s3uri, Object configObject) { this(s3uri, configObject, defaultThreddsS3Client); } public CrawlableDatasetAmazonS3(S3URI s3uri, Object configObject, ThreddsS3Client threddsS3Client) { super(s3uri.toString(), configObject); this.s3uri = s3uri; this.threddsS3Client = threddsS3Client; } //////////////////////////////////////// Static //////////////////////////////////////// public static void setDefaultThreddsS3Client(ThreddsS3Client threddsS3Client) { defaultThreddsS3Client = threddsS3Client; } public static void clearCache() { objectSummaryCache.invalidateAll(); } //////////////////////////////////////// Getters //////////////////////////////////////// public S3URI getS3URI() { return s3uri; } public ThreddsS3Client getThreddsS3Client() { return threddsS3Client; } //////////////////////////////////////// CrawlableDatasetFile //////////////////////////////////////// @Override public File getFile() { try { return threddsS3Client.saveObjectToFile(s3uri, s3uri.getTempFile()); } catch (IOException e) { logger.error(String.format("Could not save S3 object '%s' to file.", s3uri), e); return null; } } //////////////////////////////////////// CrawlableDataset //////////////////////////////////////// @Override public String getPath() { return s3uri.toString(); } @Override public String getName() { return s3uri.getBaseName(); } @Override public CrawlableDataset getParentDataset() { return new CrawlableDatasetAmazonS3(s3uri.getParent(), getConfigObject(), threddsS3Client); } @Override public CrawlableDataset getDescendant(String relativePath) { return new CrawlableDatasetAmazonS3(s3uri.getChild(relativePath), getConfigObject(), threddsS3Client); } @Override public boolean exists() { return objectSummaryCache.getIfPresent(s3uri) != null || threddsS3Client.getObjectMetadata(s3uri) != null || threddsS3Client.listObjects(s3uri) != null; } @Override public boolean isCollection() { return objectSummaryCache.getIfPresent(s3uri) == null && threddsS3Client.listObjects(s3uri) != null; } @Override public List<CrawlableDataset> listDatasets() throws IOException { boolean isCachedObject = objectSummaryCache.getIfPresent(s3uri) != null; // Cached objects aren't collections. ObjectListing objectListing; if (isCachedObject || (objectListing = threddsS3Client.listObjects(s3uri)) == null) { String tmpMsg = String.format("'%s' is not a collection dataset.", s3uri); logger.error("listDatasets(): " + tmpMsg); throw new IllegalStateException(tmpMsg); } List<CrawlableDataset> crawlableDsets = new ArrayList<>(); for (final S3ObjectSummary objectSummary : objectListing.getObjectSummaries()) { S3URI childS3uri = new S3URI(objectSummary.getBucketName(), objectSummary.getKey()); CrawlableDatasetAmazonS3 crawlableDset = new CrawlableDatasetAmazonS3(childS3uri, getConfigObject(), threddsS3Client); crawlableDsets.add(crawlableDset); // Add summary to the cache. The cache will be queried in length() and lastModified(). objectSummaryCache.put(childS3uri, objectSummary); } for (String commonPrefix : objectListing.getCommonPrefixes()) { S3URI childS3uri = new S3URI(s3uri.getBucket(), commonPrefix); CrawlableDatasetAmazonS3 crawlableDset = new CrawlableDatasetAmazonS3(childS3uri, getConfigObject(), threddsS3Client); crawlableDsets.add(crawlableDset); } assert !crawlableDsets.isEmpty() : "This is a collection and collections shouldn't be empty."; return crawlableDsets; } /** * Returns the size of the dataset, in bytes. Will be zero if this dataset is a collection or non-existent. * * @return the size of the dataset */ @Override public long length() { // If the summary is already in the cache, return it. // It'll have been added by a listDatasets() call on the parent directory. S3ObjectSummary objectSummary = objectSummaryCache.getIfPresent(s3uri); if (objectSummary != null) { return objectSummary.getSize(); } /* Get the metadata directly from S3. This will be expensive. * We get punished hard if length() and/or lastModified() is called on a bunch of datasets without * listDatasets() first being called on their parent directory. * * So, is the right thing to do here "getParentDataset().listDatasets()" and then query the cache again? * Perhaps, but listDatasets() throws an IOException, and length() and lastModified() do not. * We would have to change their signatures and the upstream client code to make it work. */ ObjectMetadata metadata = threddsS3Client.getObjectMetadata(s3uri); if (metadata != null) { return metadata.getContentLength(); } else { // "this" may be a collection or non-existent. In both cases, we return 0. return 0; } } /** * Returns the date that the dataset was last modified. Will be null if the dataset is a collection or non-existent. * * @return the date that the dataset was last modified */ @Override public Date lastModified() { S3ObjectSummary objectSummary = objectSummaryCache.getIfPresent(s3uri); if (objectSummary != null) { return objectSummary.getLastModified(); } ObjectMetadata metadata = threddsS3Client.getObjectMetadata(s3uri); if (metadata != null) { return metadata.getLastModified(); } else { // "this" may be a collection or non-existent. In both cases, we return null. return null; } } //////////////////////////////////////// Object //////////////////////////////////////// @Override public String toString() { return String.format("CrawlableDatasetAmazonS3{'%s'}", s3uri); } // Not considering threddsS3Client in either of these becaue ThreddsS3Client doesn't implement equals or hashCode. // It's very hard to provide those methods because ThreddsS3Client contains a AmazonS3Client data member, and that // class doesn't implement those methods either. // Still, it's probably okay because we almost always only care about the S3URI. @Override public boolean equals(Object other) { if (this == other) { return true; } else if (other == null || getClass() != other.getClass()) { return false; } CrawlableDatasetAmazonS3 that = (CrawlableDatasetAmazonS3) other; return Objects.equals(this.getS3URI(), that.getS3URI()) && Objects.equals(this.getConfigObject(), that.getConfigObject()); } @Override public int hashCode() { return Arrays.deepHashCode(new Object[] { this.getS3URI(), this.getConfigObject() }); } }