/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.facebook.presto.execution.buffer; import com.facebook.presto.OutputBuffers.OutputBufferId; import com.google.common.collect.ImmutableList; import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.SettableFuture; import io.airlift.units.DataSize; import javax.annotation.concurrent.GuardedBy; import javax.annotation.concurrent.Immutable; import javax.annotation.concurrent.ThreadSafe; import java.util.ArrayList; import java.util.Collection; import java.util.LinkedList; import java.util.List; import java.util.Optional; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import static com.facebook.presto.execution.buffer.BufferResult.emptyResults; import static com.google.common.base.MoreObjects.toStringHelper; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkState; import static com.google.common.base.Verify.verify; import static com.google.common.util.concurrent.Futures.immediateFuture; import static java.lang.Math.toIntExact; import static java.util.Objects.requireNonNull; @ThreadSafe class ClientBuffer { private final String taskInstanceId; private final OutputBufferId bufferId; private final AtomicLong rowsAdded = new AtomicLong(); private final AtomicLong pagesAdded = new AtomicLong(); private final AtomicLong bufferedBytes = new AtomicLong(); @GuardedBy("this") private final AtomicLong currentSequenceId = new AtomicLong(); @GuardedBy("this") private final LinkedList<SerializedPageReference> pages = new LinkedList<>(); @GuardedBy("this") private boolean noMorePages; // destroyed is set when the client sends a DELETE to the buffer // this is an acknowledgement that the client has observed the end of the buffer @GuardedBy("this") private final AtomicBoolean destroyed = new AtomicBoolean(); @GuardedBy("this") private PendingRead pendingRead; public ClientBuffer(String taskInstanceId, OutputBufferId bufferId) { this.taskInstanceId = requireNonNull(taskInstanceId, "taskInstanceId is null"); this.bufferId = requireNonNull(bufferId, "bufferId is null"); } public BufferInfo getInfo() { // // NOTE: this code must be lock free so state machine updates do not hang // @SuppressWarnings("FieldAccessNotGuarded") boolean destroyed = this.destroyed.get(); @SuppressWarnings("FieldAccessNotGuarded") long sequenceId = this.currentSequenceId.get(); // if destroyed the buffered page count must be zero regardless of observation ordering in this lock free code int bufferedPages = destroyed ? 0 : Math.max(toIntExact(pagesAdded.get() - sequenceId), 0); PageBufferInfo pageBufferInfo = new PageBufferInfo(bufferId.getId(), bufferedPages, bufferedBytes.get(), rowsAdded.get(), pagesAdded.get()); return new BufferInfo(bufferId, destroyed, bufferedPages, sequenceId, pageBufferInfo); } public boolean isDestroyed() { // // NOTE: this code must be lock free so state machine updates do not hang // @SuppressWarnings("FieldAccessNotGuarded") boolean destroyed = this.destroyed.get(); return destroyed; } public void destroy() { List<SerializedPageReference> removedPages; PendingRead pendingRead; synchronized (this) { removedPages = ImmutableList.copyOf(pages); pages.clear(); bufferedBytes.getAndSet(0); noMorePages = true; destroyed.set(true); pendingRead = this.pendingRead; this.pendingRead = null; } removedPages.forEach(SerializedPageReference::dereferencePage); if (pendingRead != null) { pendingRead.completeResultFutureWithEmpty(); } } public void enqueuePages(Collection<SerializedPageReference> pages) { PendingRead pendingRead; synchronized (this) { // ignore pages after no more pages is set // this can happen with limit queries if (noMorePages) { return; } addPages(pages); pendingRead = this.pendingRead; this.pendingRead = null; } // we just added a page, so process the pending read if (pendingRead != null) { processRead(pendingRead); } } private synchronized void addPages(Collection<SerializedPageReference> pages) { pages.forEach(SerializedPageReference::addReference); this.pages.addAll(pages); long rowCount = pages.stream().mapToLong(SerializedPageReference::getPositionCount).sum(); rowsAdded.addAndGet(rowCount); pagesAdded.addAndGet(pages.size()); long bytesAdded = pages.stream().mapToLong(SerializedPageReference::getRetainedSizeInBytes).sum(); bufferedBytes.addAndGet(bytesAdded); } public ListenableFuture<BufferResult> getPages(long sequenceId, DataSize maxSize) { return getPages(sequenceId, maxSize, Optional.empty()); } public ListenableFuture<BufferResult> getPages(long sequenceId, DataSize maxSize, Optional<PagesSupplier> pagesSupplier) { checkArgument(sequenceId >= 0, "Invalid sequence id"); // acknowledge pages first, out side of locks to not trigger callbacks while holding the lock acknowledgePages(sequenceId); // attempt to load some data before processing the read pagesSupplier.ifPresent(supplier -> loadPagesIfNecessary(supplier, maxSize)); PendingRead oldPendingRead = null; try { synchronized (this) { // save off the old pending read so we can abort it out side of the lock oldPendingRead = this.pendingRead; this.pendingRead = null; // Return results immediately if we have data, there will be no more data, or this is // an out of order request if (!pages.isEmpty() || noMorePages || sequenceId != currentSequenceId.get()) { return immediateFuture(processRead(sequenceId, maxSize)); } // otherwise, wait for more data to arrive pendingRead = new PendingRead(taskInstanceId, sequenceId, maxSize); return pendingRead.getResultFuture(); } } finally { if (oldPendingRead != null) { // Each buffer is private to a single client, and each client should only have one outstanding // read. Therefore, we abort the existing read since it was most likely abandoned by the client. oldPendingRead.completeResultFutureWithEmpty(); } } } public void setNoMorePages() { PendingRead pendingRead; synchronized (this) { // ignore duplicate calls if (noMorePages) { return; } noMorePages = true; pendingRead = this.pendingRead; this.pendingRead = null; } // there will be no more pages, so process the pending read if (pendingRead != null) { processRead(pendingRead); } } public void loadPagesIfNecessary(PagesSupplier pagesSupplier) { requireNonNull(pagesSupplier, "pagesSupplier is null"); // Get the max size from the current pending read, which may not be the // same pending read instance by the time pages are loaded but this is // safe since the size is rechecked before returning pages. DataSize maxSize; synchronized (this) { if (pendingRead == null) { return; } maxSize = pendingRead.getMaxSize(); } boolean dataAdded = loadPagesIfNecessary(pagesSupplier, maxSize); if (dataAdded) { PendingRead pendingRead; synchronized (this) { pendingRead = this.pendingRead; } if (pendingRead != null) { processRead(pendingRead); } } } /** * If there no data, attempt to load some from the pages supplier. */ private boolean loadPagesIfNecessary(PagesSupplier pagesSupplier, DataSize maxSize) { checkState(!Thread.holdsLock(this), "Can not load pages while holding a lock on this"); List<SerializedPageReference> pageReferences; synchronized (this) { if (noMorePages) { return false; } if (!pages.isEmpty()) { return false; } // The page supplier has incremented the page reference count, and addPages below also increments // the reference count, so we need to drop the page supplier reference. The call dereferencePage // is performed outside of synchronized to avoid making a callback while holding a lock. pageReferences = pagesSupplier.getPages(maxSize); // add the pages to this buffer, which will increase the reference count addPages(pageReferences); // check for no more pages if (!pagesSupplier.mayHaveMorePages()) { noMorePages = true; } } // sent pages will have an initial reference count, so drop it pageReferences.forEach(SerializedPageReference::dereferencePage); return !pageReferences.isEmpty(); } private void processRead(PendingRead pendingRead) { checkState(!Thread.holdsLock(this), "Can not process pending read while holding a lock on this"); if (pendingRead.getResultFuture().isDone()) { return; } BufferResult bufferResult = processRead(pendingRead.getSequenceId(), pendingRead.getMaxSize()); pendingRead.getResultFuture().set(bufferResult); } /** * @return a result with at least one page if we have pages in buffer, empty result otherwise */ private synchronized BufferResult processRead(long sequenceId, DataSize maxSize) { // When pages are added to the partition buffer they are effectively // assigned an id starting from zero. When a read is processed, the // "token" is the id of the page to start the read from, so the first // step of the read is to acknowledge, and drop all pages up to the // provided sequenceId. Then pages starting from the sequenceId are // returned with the sequenceId of the next page to read. // // Since the buffer API is asynchronous there are a number of problems // that can occur our of order request (typically from retries due to // request failures): // - Request to read pages that have already been acknowledged. // Simply, send an result with no pages and the requested sequenceId, // and since the client has already acknowledge the pages, it will // ignore the out of order response. // - Request to read after the buffer has been destroyed. When the // buffer is destroyed all pages are dropped, so the read sequenceId // appears to be off the end of the queue. Normally a read past the // end of the queue would be be an error, but this specific case is // detected and handled. The client is sent an empty response with // the finished flag set and next token is the max acknowledged page // when the buffer is destroyed. // // if request is for pages before the current position, just return an empty result if (sequenceId < currentSequenceId.get()) { return emptyResults(taskInstanceId, sequenceId, false); } // if this buffer is finished, notify the client of this, so the client // will destroy this buffer if (pages.isEmpty() && noMorePages) { return emptyResults(taskInstanceId, currentSequenceId.get(), true); } // if request is for pages after the current position, there is a bug somewhere // a read call is always proceeded by acknowledge pages, which // will advance the sequence id to at least the request position, unless // the buffer is destroyed, and in that case the buffer will be empty with // no more pages set, which is checked above verify(sequenceId == currentSequenceId.get(), "Invalid sequence id"); // read the new pages long maxBytes = maxSize.toBytes(); List<SerializedPage> result = new ArrayList<>(); long bytes = 0; for (SerializedPageReference page : pages) { bytes += page.getRetainedSizeInBytes(); // break (and don't add) if this page would exceed the limit if (!result.isEmpty() && bytes > maxBytes) { break; } result.add(page.getSerializedPage()); } return new BufferResult(taskInstanceId, sequenceId, sequenceId + result.size(), false, result); } /** * Drops pages up to the specified sequence id */ private void acknowledgePages(long sequenceId) { checkState(!Thread.holdsLock(this), "Can not acknowledge pages while holding a lock on this"); List<SerializedPageReference> removedPages = new ArrayList<>(); synchronized (this) { if (destroyed.get()) { return; } // if pages have already been acknowledged, just ignore this long oldCurrentSequenceId = currentSequenceId.get(); if (sequenceId < oldCurrentSequenceId) { return; } int pagesToRemove = toIntExact(sequenceId - oldCurrentSequenceId); checkArgument(pagesToRemove <= pages.size(), "Invalid sequence id"); long bytesRemoved = 0; for (int i = 0; i < pagesToRemove; i++) { SerializedPageReference removedPage = pages.removeFirst(); removedPages.add(removedPage); bytesRemoved += removedPage.getRetainedSizeInBytes(); } // update current sequence id verify(currentSequenceId.compareAndSet(oldCurrentSequenceId, oldCurrentSequenceId + pagesToRemove)); // update memory tracking verify(bufferedBytes.addAndGet(-bytesRemoved) >= 0); } // dereference outside of synchronized to avoid making a callback while holding a lock removedPages.forEach(SerializedPageReference::dereferencePage); } @Override public String toString() { @SuppressWarnings("FieldAccessNotGuarded") long sequenceId = currentSequenceId.get(); @SuppressWarnings("FieldAccessNotGuarded") boolean destroyed = this.destroyed.get(); return toStringHelper(this) .add("bufferId", bufferId) .add("sequenceId", sequenceId) .add("destroyed", destroyed) .toString(); } @Immutable private static class PendingRead { private final String taskInstanceId; private final long sequenceId; private final DataSize maxSize; private final SettableFuture<BufferResult> resultFuture = SettableFuture.create(); private PendingRead(String taskInstanceId, long sequenceId, DataSize maxSize) { this.taskInstanceId = requireNonNull(taskInstanceId, "taskInstanceId is null"); this.sequenceId = sequenceId; this.maxSize = maxSize; } public long getSequenceId() { return sequenceId; } public DataSize getMaxSize() { return maxSize; } public SettableFuture<BufferResult> getResultFuture() { return resultFuture; } public void completeResultFutureWithEmpty() { resultFuture.set(emptyResults(taskInstanceId, sequenceId, false)); } } public interface PagesSupplier { /** * Gets pages up to the specified size limit or a single page that exceeds the size limit. */ List<SerializedPageReference> getPages(DataSize maxSize); /** * @return true if more pages may be produced; false otherwise */ boolean mayHaveMorePages(); } }