/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.fs.s3a; import java.io.IOException; import java.io.OutputStream; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Callable; import java.util.concurrent.ExecutionException; import java.util.concurrent.ExecutorService; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; import com.amazonaws.AmazonClientException; import com.amazonaws.event.ProgressEvent; import com.amazonaws.event.ProgressEventType; import com.amazonaws.event.ProgressListener; import com.amazonaws.services.s3.model.CompleteMultipartUploadResult; import com.amazonaws.services.s3.model.PartETag; import com.amazonaws.services.s3.model.PutObjectRequest; import com.amazonaws.services.s3.model.PutObjectResult; import com.amazonaws.services.s3.model.UploadPartRequest; import com.google.common.base.Preconditions; import com.google.common.util.concurrent.Futures; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.ListeningExecutorService; import com.google.common.util.concurrent.MoreExecutors; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.io.retry.RetryPolicies; import org.apache.hadoop.io.retry.RetryPolicy; import org.apache.hadoop.util.Progressable; import static org.apache.hadoop.fs.s3a.S3AUtils.*; import static org.apache.hadoop.fs.s3a.Statistic.*; /** * Upload files/parts directly via different buffering mechanisms: * including memory and disk. * * If the stream is closed and no update has started, then the upload * is instead done as a single PUT operation. * * Unstable: statistics and error handling might evolve. */ @InterfaceAudience.Private @InterfaceStability.Unstable class S3ABlockOutputStream extends OutputStream { private static final Logger LOG = LoggerFactory.getLogger(S3ABlockOutputStream.class); /** Owner FileSystem. */ private final S3AFileSystem fs; /** Object being uploaded. */ private final String key; /** Size of all blocks. */ private final int blockSize; /** Callback for progress. */ private final ProgressListener progressListener; private final ListeningExecutorService executorService; /** * Retry policy for multipart commits; not all AWS SDK versions retry that. */ private final RetryPolicy retryPolicy = RetryPolicies.retryUpToMaximumCountWithProportionalSleep( 5, 2000, TimeUnit.MILLISECONDS); /** * Factory for blocks. */ private final S3ADataBlocks.BlockFactory blockFactory; /** Preallocated byte buffer for writing single characters. */ private final byte[] singleCharWrite = new byte[1]; /** Multipart upload details; null means none started. */ private MultiPartUpload multiPartUpload; /** Closed flag. */ private final AtomicBoolean closed = new AtomicBoolean(false); /** Current data block. Null means none currently active */ private S3ADataBlocks.DataBlock activeBlock; /** Count of blocks uploaded. */ private long blockCount = 0; /** Statistics to build up. */ private final S3AInstrumentation.OutputStreamStatistics statistics; /** * Write operation helper; encapsulation of the filesystem operations. */ private final S3AFileSystem.WriteOperationHelper writeOperationHelper; /** * An S3A output stream which uploads partitions in a separate pool of * threads; different {@link S3ADataBlocks.BlockFactory} * instances can control where data is buffered. * * @param fs S3AFilesystem * @param key S3 object to work on. * @param executorService the executor service to use to schedule work * @param progress report progress in order to prevent timeouts. If * this object implements {@code ProgressListener} then it will be * directly wired up to the AWS client, so receive detailed progress * information. * @param blockSize size of a single block. * @param blockFactory factory for creating stream destinations * @param statistics stats for this stream * @param writeOperationHelper state of the write operation. * @throws IOException on any problem */ S3ABlockOutputStream(S3AFileSystem fs, String key, ExecutorService executorService, Progressable progress, long blockSize, S3ADataBlocks.BlockFactory blockFactory, S3AInstrumentation.OutputStreamStatistics statistics, S3AFileSystem.WriteOperationHelper writeOperationHelper) throws IOException { this.fs = fs; this.key = key; this.blockFactory = blockFactory; this.blockSize = (int) blockSize; this.statistics = statistics; this.writeOperationHelper = writeOperationHelper; Preconditions.checkArgument(blockSize >= Constants.MULTIPART_MIN_SIZE, "Block size is too small: %d", blockSize); this.executorService = MoreExecutors.listeningDecorator(executorService); this.multiPartUpload = null; this.progressListener = (progress instanceof ProgressListener) ? (ProgressListener) progress : new ProgressableListener(progress); // create that first block. This guarantees that an open + close sequence // writes a 0-byte entry. createBlockIfNeeded(); LOG.debug("Initialized S3ABlockOutputStream for {}" + " output to {}", writeOperationHelper, activeBlock); } /** * Demand create a destination block. * @return the active block; null if there isn't one. * @throws IOException on any failure to create */ private synchronized S3ADataBlocks.DataBlock createBlockIfNeeded() throws IOException { if (activeBlock == null) { blockCount++; if (blockCount>= Constants.MAX_MULTIPART_COUNT) { LOG.error("Number of partitions in stream exceeds limit for S3: " + Constants.MAX_MULTIPART_COUNT + " write may fail."); } activeBlock = blockFactory.create(blockCount, this.blockSize, statistics); } return activeBlock; } /** * Synchronized accessor to the active block. * @return the active block; null if there isn't one. */ private synchronized S3ADataBlocks.DataBlock getActiveBlock() { return activeBlock; } /** * Predicate to query whether or not there is an active block. * @return true if there is an active block. */ private synchronized boolean hasActiveBlock() { return activeBlock != null; } /** * Clear the active block. */ private void clearActiveBlock() { if (activeBlock != null) { LOG.debug("Clearing active block"); } synchronized (this) { activeBlock = null; } } /** * Check for the filesystem being open. * @throws IOException if the filesystem is closed. */ void checkOpen() throws IOException { if (closed.get()) { throw new IOException("Filesystem " + writeOperationHelper + " closed"); } } /** * The flush operation does not trigger an upload; that awaits * the next block being full. What it does do is call {@code flush() } * on the current block, leaving it to choose how to react. * @throws IOException Any IO problem. */ @Override public synchronized void flush() throws IOException { checkOpen(); S3ADataBlocks.DataBlock dataBlock = getActiveBlock(); if (dataBlock != null) { dataBlock.flush(); } } /** * Writes a byte to the destination. If this causes the buffer to reach * its limit, the actual upload is submitted to the threadpool. * @param b the int of which the lowest byte is written * @throws IOException on any problem */ @Override public synchronized void write(int b) throws IOException { singleCharWrite[0] = (byte)b; write(singleCharWrite, 0, 1); } /** * Writes a range of bytes from to the memory buffer. If this causes the * buffer to reach its limit, the actual upload is submitted to the * threadpool and the remainder of the array is written to memory * (recursively). * @param source byte array containing * @param offset offset in array where to start * @param len number of bytes to be written * @throws IOException on any problem */ @Override public synchronized void write(byte[] source, int offset, int len) throws IOException { S3ADataBlocks.validateWriteArgs(source, offset, len); checkOpen(); if (len == 0) { return; } S3ADataBlocks.DataBlock block = createBlockIfNeeded(); int written = block.write(source, offset, len); int remainingCapacity = block.remainingCapacity(); if (written < len) { // not everything was written —the block has run out // of capacity // Trigger an upload then process the remainder. LOG.debug("writing more data than block has capacity -triggering upload"); uploadCurrentBlock(); // tail recursion is mildly expensive, but given buffer sizes must be MB. // it's unlikely to recurse very deeply. this.write(source, offset + written, len - written); } else { if (remainingCapacity == 0) { // the whole buffer is done, trigger an upload uploadCurrentBlock(); } } } /** * Start an asynchronous upload of the current block. * @throws IOException Problems opening the destination for upload * or initializing the upload. */ private synchronized void uploadCurrentBlock() throws IOException { Preconditions.checkState(hasActiveBlock(), "No active block"); LOG.debug("Writing block # {}", blockCount); if (multiPartUpload == null) { LOG.debug("Initiating Multipart upload"); multiPartUpload = new MultiPartUpload(); } try { multiPartUpload.uploadBlockAsync(getActiveBlock()); } finally { // set the block to null, so the next write will create a new block. clearActiveBlock(); } } /** * Close the stream. * * This will not return until the upload is complete * or the attempt to perform the upload has failed. * Exceptions raised in this method are indicative that the write has * failed and data is at risk of being lost. * @throws IOException on any failure. */ @Override public void close() throws IOException { if (closed.getAndSet(true)) { // already closed LOG.debug("Ignoring close() as stream is already closed"); return; } S3ADataBlocks.DataBlock block = getActiveBlock(); boolean hasBlock = hasActiveBlock(); LOG.debug("{}: Closing block #{}: current block= {}", this, blockCount, hasBlock ? block : "(none)"); try { if (multiPartUpload == null) { if (hasBlock) { // no uploads of data have taken place, put the single block up. // This must happen even if there is no data, so that 0 byte files // are created. putObject(); } } else { // there has already been at least one block scheduled for upload; // put up the current then wait if (hasBlock && block.hasData()) { //send last part uploadCurrentBlock(); } // wait for the partial uploads to finish final List<PartETag> partETags = multiPartUpload.waitForAllPartUploads(); // then complete the operation multiPartUpload.complete(partETags); } LOG.debug("Upload complete for {}", writeOperationHelper); } catch (IOException ioe) { writeOperationHelper.writeFailed(ioe); throw ioe; } finally { closeAll(LOG, block, blockFactory); LOG.debug("Statistics: {}", statistics); closeAll(LOG, statistics); clearActiveBlock(); } // All end of write operations, including deleting fake parent directories writeOperationHelper.writeSuccessful(); } /** * Upload the current block as a single PUT request; if the buffer * is empty a 0-byte PUT will be invoked, as it is needed to create an * entry at the far end. * @throws IOException any problem. */ private void putObject() throws IOException { LOG.debug("Executing regular upload for {}", writeOperationHelper); final S3ADataBlocks.DataBlock block = getActiveBlock(); int size = block.dataSize(); final S3ADataBlocks.BlockUploadData uploadData = block.startUpload(); final PutObjectRequest putObjectRequest = uploadData.hasFile() ? writeOperationHelper.newPutRequest(uploadData.getFile()) : writeOperationHelper.newPutRequest(uploadData.getUploadStream(), size); long transferQueueTime = now(); BlockUploadProgress callback = new BlockUploadProgress( block, progressListener, transferQueueTime); putObjectRequest.setGeneralProgressListener(callback); statistics.blockUploadQueued(size); ListenableFuture<PutObjectResult> putObjectResult = executorService.submit(new Callable<PutObjectResult>() { @Override public PutObjectResult call() throws Exception { PutObjectResult result; try { // the putObject call automatically closes the input // stream afterwards. result = writeOperationHelper.putObject(putObjectRequest); } finally { closeAll(LOG, uploadData, block); } return result; } }); clearActiveBlock(); //wait for completion try { putObjectResult.get(); } catch (InterruptedException ie) { LOG.warn("Interrupted object upload", ie); Thread.currentThread().interrupt(); } catch (ExecutionException ee) { throw extractException("regular upload", key, ee); } } @Override public String toString() { final StringBuilder sb = new StringBuilder( "S3ABlockOutputStream{"); sb.append(writeOperationHelper.toString()); sb.append(", blockSize=").append(blockSize); // unsynced access; risks consistency in exchange for no risk of deadlock. S3ADataBlocks.DataBlock block = activeBlock; if (block != null) { sb.append(", activeBlock=").append(block); } sb.append('}'); return sb.toString(); } private void incrementWriteOperations() { fs.incrementWriteOperations(); } /** * Current time in milliseconds. * @return time */ private long now() { return System.currentTimeMillis(); } /** * Get the statistics for this stream. * @return stream statistics */ S3AInstrumentation.OutputStreamStatistics getStatistics() { return statistics; } /** * Multiple partition upload. */ private class MultiPartUpload { private final String uploadId; private final List<ListenableFuture<PartETag>> partETagsFutures; MultiPartUpload() throws IOException { this.uploadId = writeOperationHelper.initiateMultiPartUpload(); this.partETagsFutures = new ArrayList<>(2); LOG.debug("Initiated multi-part upload for {} with " + "id '{}'", writeOperationHelper, uploadId); } /** * Upload a block of data. * This will take the block * @param block block to upload * @throws IOException upload failure */ private void uploadBlockAsync(final S3ADataBlocks.DataBlock block) throws IOException { LOG.debug("Queueing upload of {}", block); final int size = block.dataSize(); final S3ADataBlocks.BlockUploadData uploadData = block.startUpload(); final int currentPartNumber = partETagsFutures.size() + 1; final UploadPartRequest request = writeOperationHelper.newUploadPartRequest( uploadId, currentPartNumber, size, uploadData.getUploadStream(), uploadData.getFile()); long transferQueueTime = now(); BlockUploadProgress callback = new BlockUploadProgress( block, progressListener, transferQueueTime); request.setGeneralProgressListener(callback); statistics.blockUploadQueued(block.dataSize()); ListenableFuture<PartETag> partETagFuture = executorService.submit(new Callable<PartETag>() { @Override public PartETag call() throws Exception { // this is the queued upload operation LOG.debug("Uploading part {} for id '{}'", currentPartNumber, uploadId); // do the upload PartETag partETag; try { partETag = fs.uploadPart(request).getPartETag(); LOG.debug("Completed upload of {} to part {}", block, partETag.getETag()); LOG.debug("Stream statistics of {}", statistics); } finally { // close the stream and block closeAll(LOG, uploadData, block); } return partETag; } }); partETagsFutures.add(partETagFuture); } /** * Block awaiting all outstanding uploads to complete. * @return list of results * @throws IOException IO Problems */ private List<PartETag> waitForAllPartUploads() throws IOException { LOG.debug("Waiting for {} uploads to complete", partETagsFutures.size()); try { return Futures.allAsList(partETagsFutures).get(); } catch (InterruptedException ie) { LOG.warn("Interrupted partUpload", ie); Thread.currentThread().interrupt(); return null; } catch (ExecutionException ee) { //there is no way of recovering so abort //cancel all partUploads LOG.debug("While waiting for upload completion", ee); LOG.debug("Cancelling futures"); for (ListenableFuture<PartETag> future : partETagsFutures) { future.cancel(true); } //abort multipartupload this.abort(); throw extractException("Multi-part upload with id '" + uploadId + "' to " + key, key, ee); } } /** * This completes a multipart upload. * Sometimes it fails; here retries are handled to avoid losing all data * on a transient failure. * @param partETags list of partial uploads * @throws IOException on any problem */ private CompleteMultipartUploadResult complete(List<PartETag> partETags) throws IOException { int retryCount = 0; AmazonClientException lastException; String operation = String.format("Completing multi-part upload for key '%s'," + " id '%s' with %s partitions ", key, uploadId, partETags.size()); do { try { LOG.debug(operation); return writeOperationHelper.completeMultipartUpload( uploadId, partETags); } catch (AmazonClientException e) { lastException = e; statistics.exceptionInMultipartComplete(); } } while (shouldRetry(operation, lastException, retryCount++)); // this point is only reached if the operation failed more than // the allowed retry count throw translateException(operation, key, lastException); } /** * Abort a multi-part upload. Retries are attempted on failures. * IOExceptions are caught; this is expected to be run as a cleanup process. */ public void abort() { int retryCount = 0; AmazonClientException lastException; fs.incrementStatistic(OBJECT_MULTIPART_UPLOAD_ABORTED); String operation = String.format("Aborting multi-part upload for '%s', id '%s", writeOperationHelper, uploadId); do { try { LOG.debug(operation); writeOperationHelper.abortMultipartUpload(uploadId); return; } catch (AmazonClientException e) { lastException = e; statistics.exceptionInMultipartAbort(); } } while (shouldRetry(operation, lastException, retryCount++)); // this point is only reached if the operation failed more than // the allowed retry count LOG.warn("Unable to abort multipart upload, you may need to purge " + "uploaded parts", lastException); } /** * Predicate to determine whether a failed operation should * be attempted again. * If a retry is advised, the exception is automatically logged and * the filesystem statistic {@link Statistic#IGNORED_ERRORS} incremented. * The method then sleeps for the sleep time suggested by the sleep policy; * if the sleep is interrupted then {@code Thread.interrupted()} is set * to indicate the thread was interrupted; then false is returned. * * @param operation operation for log message * @param e exception raised. * @param retryCount number of retries already attempted * @return true if another attempt should be made */ private boolean shouldRetry(String operation, AmazonClientException e, int retryCount) { try { RetryPolicy.RetryAction retryAction = retryPolicy.shouldRetry(e, retryCount, 0, true); boolean retry = retryAction == RetryPolicy.RetryAction.RETRY; if (retry) { fs.incrementStatistic(IGNORED_ERRORS); LOG.info("Retrying {} after exception ", operation, e); Thread.sleep(retryAction.delayMillis); } return retry; } catch (InterruptedException ex) { Thread.currentThread().interrupt(); return false; } catch (Exception ignored) { return false; } } } /** * The upload progress listener registered for events returned * during the upload of a single block. * It updates statistics and handles the end of the upload. * Transfer failures are logged at WARN. */ private final class BlockUploadProgress implements ProgressListener { private final S3ADataBlocks.DataBlock block; private final ProgressListener nextListener; private final long transferQueueTime; private long transferStartTime; /** * Track the progress of a single block upload. * @param block block to monitor * @param nextListener optional next progress listener * @param transferQueueTime time the block was transferred * into the queue */ private BlockUploadProgress(S3ADataBlocks.DataBlock block, ProgressListener nextListener, long transferQueueTime) { this.block = block; this.transferQueueTime = transferQueueTime; this.nextListener = nextListener; } @Override public void progressChanged(ProgressEvent progressEvent) { ProgressEventType eventType = progressEvent.getEventType(); long bytesTransferred = progressEvent.getBytesTransferred(); int size = block.dataSize(); switch (eventType) { case REQUEST_BYTE_TRANSFER_EVENT: // bytes uploaded statistics.bytesTransferred(bytesTransferred); break; case TRANSFER_PART_STARTED_EVENT: transferStartTime = now(); statistics.blockUploadStarted(transferStartTime - transferQueueTime, size); incrementWriteOperations(); break; case TRANSFER_PART_COMPLETED_EVENT: statistics.blockUploadCompleted(now() - transferStartTime, size); break; case TRANSFER_PART_FAILED_EVENT: statistics.blockUploadFailed(now() - transferStartTime, size); LOG.warn("Transfer failure of block {}", block); break; default: // nothing } if (nextListener != null) { nextListener.progressChanged(progressEvent); } } } /** * Bridge from AWS {@code ProgressListener} to Hadoop {@link Progressable}. */ private static class ProgressableListener implements ProgressListener { private final Progressable progress; public ProgressableListener(Progressable progress) { this.progress = progress; } public void progressChanged(ProgressEvent progressEvent) { if (progress != null) { progress.progress(); } } } }