/**
* Copyright 2016 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
package com.github.ambry.router;
import com.github.ambry.clustermap.ClusterMap;
import com.github.ambry.clustermap.PartitionId;
import com.github.ambry.clustermap.ReplicaId;
import com.github.ambry.commons.BlobId;
import com.github.ambry.commons.ByteBufferAsyncWritableChannel;
import com.github.ambry.commons.ResponseHandler;
import com.github.ambry.commons.ServerErrorCode;
import com.github.ambry.config.RouterConfig;
import com.github.ambry.messageformat.BlobProperties;
import com.github.ambry.messageformat.BlobType;
import com.github.ambry.messageformat.MetadataContentSerDe;
import com.github.ambry.network.Port;
import com.github.ambry.network.RequestInfo;
import com.github.ambry.network.ResponseInfo;
import com.github.ambry.notification.NotificationBlobType;
import com.github.ambry.notification.NotificationSystem;
import com.github.ambry.protocol.PutRequest;
import com.github.ambry.protocol.PutResponse;
import com.github.ambry.protocol.RequestOrResponse;
import com.github.ambry.store.StoreKey;
import com.github.ambry.utils.Pair;
import com.github.ambry.utils.Time;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.atomic.AtomicReference;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* PutOperation class is responsible for maintaining the state associated with a put operation, and completing it.
* The associated object of a put operation ends up as one or more chunks, depending on its size. If the size is less
* than max put chunk size, then the operation will involve a single chunk. Those that are larger are split into
* multiple chunks and distributed across partitions individually. Each chunk will be a blob at the server and will
* have its own blob id. For such composite blobs, an associated metadata chunk will be created that consists of
* the blob ids of every data chunk; and the blob id of this metadata chunk will become the blob id of the overall
* object.
*
* For simple blobs (blobs that are under the max put chunk size), only a single chunk is created and held
* within a single PutChunk object which is used to complete the whole operation. No metadata chunks are created. The
* blob id of the single data chunk will be the blob id of the blob.
*
* For composite blobs, a PutOperation consists of a fixed number of PutChunk objects and a MetadataPutChunk object.
* Each PutChunk holds one of the chunks of the object at any time and attempts to put the chunk successfully, and then
* possibly moves on to hold another chunk of the object. This is repeated until either the operation fails or until all
* the chunks are put successfully. If all the chunks are put successfully, the MetadataPutChunk is created to put
* the metadata chunk consisting of the blob ids of all the data chunks, and the successful put of the metadata
* chunk successfully completes the operation.
*/
class PutOperation {
// Operation arguments.
private final RouterConfig routerConfig;
private final NonBlockingRouterMetrics routerMetrics;
private final ClusterMap clusterMap;
private final ResponseHandler responseHandler;
private final NotificationSystem notificationSystem;
// the BlobProperties as passed in by the caller; per-chunk BlobProperties are derived from these.
private final BlobProperties passedInBlobProperties;
private final byte[] userMetadata;
private final ReadableStreamChannel channel;
private final ByteBufferAsyncWritableChannel chunkFillerChannel;
private final FutureResult<String> futureResult;
private final Callback<String> callback;
private final RouterCallback routerCallback;
private final Time time;
// the finalized BlobProperties of the overall blob; getBlobProperties() throws until this is set.
// NOTE(review): the assignment is not visible in this part of the file — presumably set once the overall
// blob size is known; confirm.
private BlobProperties finalBlobProperties;
// Parameters associated with the state.
// the list of PutChunks that will be used to hold chunks that are sent out. A PutChunk will only hold one chunk at
// any time, but will be reused as and when the operation on the chunk is complete.
final ConcurrentLinkedQueue<PutChunk> putChunks;
// the total size of the object (the overall blob). This will be initialized to -1 indicating that the value is
// not yet determined. Once the chunk filling is complete, this will have the actual size of the data read
// from the channel.
private long blobSize = -1;
// total bytes of this object that has been filled so far by the ChunkFillerThread.
private long bytesFilledSoFar;
// the reference to the chunk in putChunks that was most recently filled or became eligible for getting filled.
private PutChunk chunkToFill;
// counter for tracking the chunks being filled (-1 until the first chunk starts filling).
private int chunkCounter;
// the current ByteBuffer/position in the chunkFillerChannel.
private ByteBuffer channelReadBuffer;
// indicates whether chunk filling is complete and successful. Volatile: written by the channel-read callback
// (see startReadingFromChannel) and read by the ChunkFiller thread (see fillChunks).
private volatile boolean chunkFillingCompletedSuccessfully = false;
// the metadata chunk for this operation. There will always be a metadata chunk that tracks the data chunks.
// However, if the operation completes and results in only one data chunk, then the metadata chunk will not be sent
// out.
private final MetadataPutChunk metadataPutChunk;
// denotes whether the operation is complete. Volatile for cross-thread visibility.
private volatile boolean operationCompleted = false;
// the blob id of the overall blob. This will be set if and when the operation is successful.
private BlobId blobId;
// the cause for failure of this operation. This will be set if and when the operation encounters an irrecoverable
// failure.
private final AtomicReference<Exception> operationException = new AtomicReference<Exception>();
// map of correlation id to the PutChunk that issued the request, to hand over responses quickly.
private final Map<Integer, PutChunk> correlationIdToPutChunk = new HashMap<Integer, PutChunk>();
// The time at which the operation was submitted.
private final long submissionTimeMs;
// The point in time at which the most recent wait for free chunk availability started.
private long startTimeForChunkAvailabilityWaitMs;
// The point in time at which the most recent wait for channel data availability started.
private long startTimeForChannelDataAvailabilityMs;
// The time spent in waiting for a chunk to become available to be filled when the channel had data.
private long waitTimeForCurrentChunkAvailabilityMs;
// The time spent by a chunk for data to be available in the channel.
private long waitTimeForChannelDataAvailabilityMs;
private static final Logger logger = LoggerFactory.getLogger(PutOperation.class);
/**
 * Construct a PutOperation with the given parameters. For any operation, based on the max chunk size for puts,
 * an object contained within the {@link ReadableStreamChannel} will either be put as a single blob if its size is
 * less than the max chunk size; or will be split into as many chunks as required, each of which is no larger than
 * the max put chunk size, plus a single metadata blob containing the information about each of these chunks.
 * @param routerConfig the {@link RouterConfig} containing the configs for put operations.
 * @param routerMetrics The {@link NonBlockingRouterMetrics} to be used for reporting metrics.
 * @param clusterMap the {@link ClusterMap} of the cluster
 * @param responseHandler the {@link ResponseHandler} responsible for failure detection.
 * @param notificationSystem the {@link NotificationSystem} to use for blob creation notifications.
 * @param userMetadata the userMetadata associated with the put operation.
 * @param channel the {@link ReadableStreamChannel} containing the blob data.
 * @param futureResult the future that will contain the result of the operation.
 * @param callback the callback that is to be called when the operation completes.
 * @param routerCallback The {@link RouterCallback} to use for callbacks to the router.
 * @param writableChannelEventListener the {@link ByteBufferAsyncWritableChannel.ChannelEventListener} to be
 *                                     notified of events on the chunk filler channel.
 * @param time the Time instance to use.
 * @param blobProperties the BlobProperties associated with the put operation.
 * @throws RouterException if there is an error in constructing the PutOperation with the given parameters.
 */
PutOperation(RouterConfig routerConfig, NonBlockingRouterMetrics routerMetrics, ClusterMap clusterMap,
    ResponseHandler responseHandler, NotificationSystem notificationSystem, byte[] userMetadata,
    ReadableStreamChannel channel, FutureResult<String> futureResult, Callback<String> callback,
    RouterCallback routerCallback, ByteBufferAsyncWritableChannel.ChannelEventListener writableChannelEventListener,
    Time time, BlobProperties blobProperties) throws RouterException {
  // record the submission time first so latency is measured from the moment of construction.
  submissionTimeMs = time.milliseconds();
  this.routerConfig = routerConfig;
  this.routerMetrics = routerMetrics;
  this.clusterMap = clusterMap;
  this.responseHandler = responseHandler;
  this.notificationSystem = notificationSystem;
  this.passedInBlobProperties = blobProperties;
  this.userMetadata = userMetadata;
  this.channel = channel;
  this.futureResult = futureResult;
  this.callback = callback;
  this.routerCallback = routerCallback;
  this.time = time;
  bytesFilledSoFar = 0;
  // -1 means no chunk has started filling yet; incremented before each chunk is prepared for building.
  chunkCounter = -1;
  putChunks = new ConcurrentLinkedQueue<>();
  metadataPutChunk = new MetadataPutChunk();
  chunkFillerChannel = new ByteBufferAsyncWritableChannel(writableChannelEventListener);
}
/**
 * Start reading from the channel containing the data for this operation.
 * The data is read asynchronously into the chunkFillerChannel, from which the ChunkFiller thread fills chunks.
 */
void startReadingFromChannel() {
  channel.readInto(chunkFillerChannel, new Callback<Long>() {
    @Override
    public void onCompletion(Long result, Exception exception) {
      if (exception != null) {
        // the read from the source channel failed irrecoverably: fail the whole operation and wake up the poller.
        setOperationExceptionAndComplete(exception);
        routerCallback.onPollReady();
      } else {
        // the read completed; the result is the total number of bytes read, which is the final blob size.
        blobSize = result;
        chunkFillingCompletedSuccessfully = true;
      }
      // no more data will be written to the filler channel in either case.
      chunkFillerChannel.close();
    }
  });
}
/**
 * Returns whether the operation has completed (either successfully or with an irrecoverable error).
 * @return whether the operation has completed.
 */
boolean isOperationComplete() {
  return operationCompleted;
}
/**
 * Notify for overall blob creation if the operation is complete and the blob was put successfully. Also ensure that
 * notifications have been sent out for all successfully put data chunks.
 */
void maybeNotifyForBlobCreation() {
  if (isOperationComplete()) {
    // a non-empty list means this blob is composite: either it had more than one data chunk, or the overall
    // operation failed after some data chunks were successfully put (see getSuccessfullyPutChunkIdsIfComposite).
    boolean composite = !getSuccessfullyPutChunkIdsIfComposite().isEmpty();
    if (composite) {
      // NOTE(review): presumably the first data chunk's creation notification was deferred and is sent here;
      // confirm against MetadataPutChunk.notifyForFirstChunkCreation().
      metadataPutChunk.notifyForFirstChunkCreation();
    }
    if (blobId != null) {
      // blobId is only set when the operation succeeds overall, so notify for the creation of the whole blob.
      notificationSystem.onBlobCreated(getBlobIdString(), getBlobProperties(), getUserMetadata(),
          composite ? NotificationBlobType.Composite : NotificationBlobType.Simple);
    }
  }
}
/**
 * For this operation, create and populate put requests for chunks (in the form of {@link RequestInfo}) to
 * send out.
 * The metadata chunk is polled first; data chunks are only polled while the metadata chunk is neither Ready
 * nor Complete.
 * @param requestRegistrationCallback the {@link RequestRegistrationCallback} to call for every request that gets
 *                                    created as part of this poll operation.
 */
void poll(RequestRegistrationCallback<PutOperation> requestRegistrationCallback) {
  if (operationCompleted) {
    return;
  }
  metadataPutChunk.poll(requestRegistrationCallback);
  if (metadataPutChunk.isComplete()) {
    if (getNumDataChunks() > 1) {
      // composite blob: the metadata chunk's completion completes the overall operation.
      onChunkOperationComplete(metadataPutChunk);
    }
    // NOTE(review): for a single data chunk no metadata chunk is sent out (see class javadoc); confirm how
    // isComplete() behaves for MetadataPutChunk in that case.
  } else if (!metadataPutChunk.isReady()) {
    for (PutChunk chunk : putChunks) {
      if (chunk.isReady()) {
        chunk.poll(requestRegistrationCallback);
        if (chunk.isComplete()) {
          onChunkOperationComplete(chunk);
          // After each chunk is processed, check whether the operation itself has completed
          if (operationCompleted) {
            return;
          }
        }
      }
    }
  }
}
/**
 * Handle the given {@link ResponseInfo} by handing it over to the correct {@link PutChunk} that issued the request.
 * @param responseInfo the {@link ResponseInfo} to be handled.
 * @param putResponse the {@link PutResponse} associated with this response.
 */
void handleResponse(ResponseInfo responseInfo, PutResponse putResponse) {
  // remove (not get): each correlation id receives exactly one response.
  // NOTE(review): this assumes every incoming response's correlation id was registered in
  // correlationIdToPutChunk; a missing entry would throw an NPE below — confirm callers guarantee this.
  PutChunk putChunk = correlationIdToPutChunk.remove(
      ((RequestOrResponse) responseInfo.getRequestInfo().getRequest()).getCorrelationId());
  putChunk.handleResponse(responseInfo, putResponse);
  if (putChunk.isComplete()) {
    onChunkOperationComplete(putChunk);
  }
}
/**
 * Called when the operation on a {@link PutChunk} is complete: that is, the chunk is successfully put or there was
 * an irrecoverable error in doing so. The {@link PutChunk} guarantees that in the former case,
 * the blobId of the chunk is set; and in the latter case, it is null.
 * @param chunk the {@link PutChunk} that has completed its operation.
 */
private void onChunkOperationComplete(PutChunk chunk) {
  if (chunk.getChunkBlobId() == null) {
    // the overall operation has failed if any of the chunk fails.
    if (chunk.getChunkException() == null) {
      logger.error("Operation on chunk failed, but no exception was set");
    }
    // parameterized logging: the message is only built when the level is enabled.
    logger.error("Failed putting chunk at index: {}, failing the entire operation", chunk.getChunkIndex());
    operationCompleted = true;
  } else if (chunk != metadataPutChunk) {
    // a data chunk has succeeded: record its id in the metadata chunk and maybe send a per-chunk notification.
    logger.trace("Successfully put chunk with blob id: {}", chunk.getChunkBlobId());
    metadataPutChunk.addChunkId(chunk.getChunkBlobId(), chunk.getChunkIndex());
    metadataPutChunk.maybeNotifyForChunkCreation(chunk);
  } else {
    // the metadata chunk succeeded, which completes the overall operation; its blob id is the blob's id.
    blobId = chunk.getChunkBlobId();
    if (chunk.failedAttempts > 0) {
      logger.trace("Slipped put succeeded for chunk: {}", chunk.getChunkBlobId());
      routerMetrics.slippedPutSuccessCount.inc();
    } else {
      logger.trace("Successfully put chunk: {}", chunk.getChunkBlobId());
    }
    operationCompleted = true;
  }
  routerMetrics.putChunkOperationLatencyMs.update(time.milliseconds() - chunk.chunkReadyAtMs);
  // free the PutChunk so the ChunkFiller thread can reuse it for a subsequent chunk.
  chunk.clear();
}
/**
 * Returns whether chunk filling is complete (successfully or otherwise).
 * Filling is done either when all the data was read from the channel, or when the operation has terminated
 * (in which case no further filling is useful).
 * @return true if chunk filling is complete, false otherwise.
 */
boolean isChunkFillingDone() {
  if (chunkFillingCompletedSuccessfully) {
    return true;
  }
  return operationCompleted;
}
/**
 * This method runs in the context of the ChunkFiller thread. As long as there are chunks available to
 * be written to, it gets the chunk that is to be filled and keeps filling it with the data from the
 * chunkFillerChannel, if there is any. Any exception encountered here fails the whole operation with an
 * {@link RouterErrorCode#UnexpectedInternalError} (unless it already is a {@link RouterException}).
 */
void fillChunks() {
  try {
    // NOTE(review): this local shadows the chunkToFill field; getChunkToFill() updates the field and returns
    // the same reference, so the two stay in sync.
    PutChunk chunkToFill;
    while (!isChunkFillingDone()) {
      // Attempt to fill a chunk
      if (channelReadBuffer == null) {
        // fetch the next buffer from the channel (timeout of 0 — presumably a non-blocking poll; confirm).
        channelReadBuffer = chunkFillerChannel.getNextChunk(0);
      }
      if (channelReadBuffer != null) {
        // data is available, so any wait for channel data that was in progress ends now.
        maybeStopTrackingWaitForChannelDataTime();
        chunkToFill = getChunkToFill();
        if (chunkToFill == null) {
          // channel has data, but no chunks are free to be filled yet.
          maybeStartTrackingWaitForChunkTime();
          break;
        } else {
          // channel has data, and there is a chunk that can be filled.
          maybeStopTrackingWaitForChunkTime();
          bytesFilledSoFar += chunkToFill.fillFrom(channelReadBuffer);
          if (chunkToFill.isReady()) {
            // the chunk is full and ready to be sent out; wake up the poller.
            routerCallback.onPollReady();
            updateChunkFillerWaitTimeMetrics();
          }
          if (!channelReadBuffer.hasRemaining()) {
            // the source buffer is exhausted; hand it back to the channel and fetch a new one next iteration.
            chunkFillerChannel.resolveOldestChunk(null);
            channelReadBuffer = null;
          }
        }
      } else {
        // channel does not have more data yet.
        if (getFreeChunk() != null) {
          // this means there is a chunk available to be filled, but no data in the channel.
          maybeStartTrackingWaitForChannelDataTime();
        }
        break;
      }
    }
    if (chunkFillingCompletedSuccessfully) {
      // all data has been read from the channel; seal the last (possibly partially filled) chunk.
      PutChunk lastChunk = getBuildingChunk();
      if (lastChunk != null) {
        // the very first chunk (index 0) is sealed even when empty — presumably so that zero-length blobs
        // still produce a chunk; an empty trailing chunk of a multi-chunk blob carries no data and is dropped.
        if (chunkCounter != 0 && lastChunk.buf.position() == 0) {
          logger.trace("The last buffer(s) received from chunkFillerChannel have no data, discarding them.");
        } else {
          lastChunk.onFillComplete(true);
          updateChunkFillerWaitTimeMetrics();
        }
      }
      routerCallback.onPollReady();
    }
  } catch (Exception e) {
    // wrap unexpected exceptions so callers always observe a RouterException, then fail the operation.
    RouterException routerException = e instanceof RouterException ? (RouterException) e
        : new RouterException("PutOperation fillChunks encountered unexpected error", e,
            RouterErrorCode.UnexpectedInternalError);
    routerMetrics.chunkFillerUnexpectedErrorCount.inc();
    routerCallback.onPollReady();
    setOperationExceptionAndComplete(routerException);
  }
}
/**
 * Called whenever the channel has data but no free or building chunk is available to be filled.
 * Marks the start of the wait unless a wait is already being tracked.
 */
private void maybeStartTrackingWaitForChunkTime() {
  if (startTimeForChunkAvailabilityWaitMs != 0) {
    // a wait is already being tracked; keep the original start point.
    return;
  }
  // first moment since the last chunk filling (if any) that filling is blocked on chunk availability.
  startTimeForChunkAvailabilityWaitMs = time.milliseconds();
}
/**
 * Called whenever the channel has data and there is a free or building chunk available to be filled.
 * Records the elapsed wait (if one was in progress) and resets the tracking state.
 */
private void maybeStopTrackingWaitForChunkTime() {
  if (startTimeForChunkAvailabilityWaitMs == 0) {
    // no wait was in progress; nothing to record.
    return;
  }
  // a chunk finally became available for filling; record how long the wait lasted.
  waitTimeForCurrentChunkAvailabilityMs = time.milliseconds() - startTimeForChunkAvailabilityWaitMs;
  startTimeForChunkAvailabilityWaitMs = 0;
}
/**
 * Called whenever a chunk is available to be filled but there is no data available in the channel.
 * Marks the start of the wait unless a wait is already being tracked.
 */
private void maybeStartTrackingWaitForChannelDataTime() {
  if (startTimeForChannelDataAvailabilityMs != 0) {
    // a wait is already being tracked; keep the original start point.
    return;
  }
  // first moment since the last read from the channel that data became unavailable in the channel.
  startTimeForChannelDataAvailabilityMs = time.milliseconds();
}
/**
 * Called whenever data becomes available in the channel.
 * Accumulates the elapsed wait (if one was in progress) and resets the tracking state.
 */
private void maybeStopTrackingWaitForChannelDataTime() {
  if (startTimeForChannelDataAvailabilityMs == 0) {
    // no wait was in progress; nothing to record.
    return;
  }
  // data finally became available in the channel; accumulate how long the wait lasted.
  waitTimeForChannelDataAvailabilityMs += time.milliseconds() - startTimeForChannelDataAvailabilityMs;
  startTimeForChannelDataAvailabilityMs = 0;
}
/**
 * Update metrics related to how long a channel had to wait for a chunk to become available for filling, and
 * how long the chunk had to wait for data to become available in the channel.
 * Both accumulators are reset so that the next chunk's waits are measured independently.
 */
private void updateChunkFillerWaitTimeMetrics() {
  routerMetrics.waitTimeForFreeChunkAvailabilityMs.update(waitTimeForCurrentChunkAvailabilityMs);
  routerMetrics.waitTimeForChannelDataAvailabilityMs.update(waitTimeForChannelDataAvailabilityMs);
  waitTimeForCurrentChunkAvailabilityMs = 0;
  waitTimeForChannelDataAvailabilityMs = 0;
}
/**
 * Get the chunk to be filled. At most one chunk for an operation will ever be in Building state. If there is such
 * a chunk, that is returned. If not, if there is a Free chunk, that is returned. If no Free chunks are available
 * either, then null is returned.
 * @return the chunk to fill, or null if there are no chunks eligible for filling.
 * @throws RouterException with {@link RouterErrorCode#BlobTooLarge} if the chunk index would overflow.
 */
private PutChunk getChunkToFill() throws RouterException {
  // keep filling the chunk that is currently Building; otherwise grab a Free chunk to start a new one.
  if (chunkToFill == null || !chunkToFill.isBuilding()) {
    chunkToFill = getFreeChunk();
    if (chunkToFill != null) {
      if (chunkCounter == Integer.MAX_VALUE) {
        // incrementing further would overflow the chunk index; the blob cannot be represented.
        throw new RouterException("Blob is too large", RouterErrorCode.BlobTooLarge);
      }
      chunkCounter++;
      chunkToFill.prepareForBuilding(chunkCounter, routerConfig.routerMaxPutChunkSizeBytes);
    }
  }
  return chunkToFill;
}
/**
 * Get a chunk that can be filled: an existing Free chunk is preferred, otherwise a new PutChunk is created
 * as long as the in-memory chunk limit has not been reached.
 * @return A free chunk, if one is available; null otherwise.
 */
private PutChunk getFreeChunk() {
  // reuse an existing Free chunk if one exists.
  for (PutChunk candidate : putChunks) {
    if (candidate.isFree()) {
      return candidate;
    }
  }
  // otherwise grow the pool, but never beyond the router-wide in-memory chunk cap.
  if (putChunks.size() < NonBlockingRouter.MAX_IN_MEM_CHUNKS) {
    PutChunk newChunk = new PutChunk();
    putChunks.add(newChunk);
    return newChunk;
  }
  return null;
}
/**
 * Get the PutChunk that is in Building state. Note that there can be at most one such PutChunk at any time.
 * @return the PutChunk that is in Building state; null if no PutChunk is in Building state.
 */
private PutChunk getBuildingChunk() {
  for (PutChunk candidate : putChunks) {
    if (candidate.isBuilding()) {
      // at most one chunk can ever be Building, so return as soon as it is found.
      return candidate;
    }
  }
  return null;
}
/**
 * Return the number of data chunks that this operation resulted in. This method should only be called once the
 * chunk filling has completed (which is when the final size is determined).
 * @return the number of data chunks that this operation resulted in.
 * @throws IllegalStateException if the chunk filling has not yet completed (thrown by {@link #getBlobSize()}).
 */
int getNumDataChunks() {
  return RouterUtils.getNumChunksForBlobAndChunkSize(getBlobSize(), routerConfig.routerMaxPutChunkSizeBytes);
}
/**
 * @return the size of the blob in this operation. This method should only be called once the chunk filling has
 * completed (which is when the final size is determined).
 * @throws IllegalStateException if chunk filling has not yet completed successfully.
 */
long getBlobSize() {
  if (chunkFillingCompletedSuccessfully) {
    return blobSize;
  }
  throw new IllegalStateException("Request for blob size before chunk fill completion");
}
/**
 * Return the blobId string associated with this operation if it was successful, or null if it failed. This should
 * (obviously) only be called once the operation is complete.
 * @return the blobId if the operation is successful; null otherwise.
 */
String getBlobIdString() {
  if (blobId == null) {
    // blobId is only set on overall success.
    return null;
  }
  return blobId.getID();
}
/**
 * Return the {@link BlobProperties} associated with this operation.
 * @return the {@link BlobProperties} associated with this operation.
 * @throws IllegalStateException if the final blob properties have not yet been determined.
 */
BlobProperties getBlobProperties() {
  if (finalBlobProperties != null) {
    return finalBlobProperties;
  }
  throw new IllegalStateException("blob properties has not yet been finalized");
}
/**
 * Return the userMetadata associated with this operation, exactly as passed in by the caller.
 * @return the userMetadata associated with this operation.
 */
byte[] getUserMetadata() {
  return userMetadata;
}
/**
 * Return the {@link Callback} associated with this operation (may be null if the caller passed none).
 * @return the {@link Callback} associated with this operation.
 */
Callback<String> getCallback() {
  return callback;
}
/**
 * Return the {@link FutureResult} that will eventually carry the blob id (or the failure) of this operation.
 * @return the {@link FutureResult} associated with this operation.
 */
FutureResult<String> getFuture() {
  return futureResult;
}
/**
 * The exception associated with this operation if it failed; null otherwise.
 * Thread-safe: the exception is held in an {@link AtomicReference}.
 * @return exception associated with this operation if it failed; null otherwise.
 */
Exception getOperationException() {
  return operationException.get();
}
/**
 * The time at which this operation was submitted, in milliseconds (as reported by the injected {@link Time}).
 * @return the time at which the operation was submitted.
 */
long getSubmissionTimeMs() {
  return submissionTimeMs;
}
/**
 * @return the service ID for this put operation, taken from the caller-supplied {@link BlobProperties}.
 */
String getServiceId() {
  return passedInBlobProperties.getServiceId();
}
/**
 * If this is a composite object, fill the list with successfully put chunk ids.
 * If the overall operation failed, the successfully put chunks are treated as part of a composite blob.
 * @return the list of successfully put chunk ids if this is a composite object, empty list otherwise.
 */
List<StoreKey> getSuccessfullyPutChunkIdsIfComposite() {
  List<StoreKey> chunkIds = metadataPutChunk.getSuccessfullyPutChunkIds();
  boolean operationSucceeded = blobId != null && getOperationException() == null;
  if (operationSucceeded && chunkIds.size() <= 1) {
    // a successful blob with at most one data chunk is simple, not composite.
    return Collections.emptyList();
  }
  return chunkIds;
}
/**
 * Set the irrecoverable exception associated with this operation. When this is called, the operation has failed.
 * @param exception the irrecoverable exception associated with this operation.
 */
void setOperationExceptionAndComplete(Exception exception) {
  // set the exception before publishing completion so that any thread observing operationCompleted == true
  // (a volatile read) also sees the exception.
  operationException.set(exception);
  operationCompleted = true;
}
/**
* PutChunk is responsible for storing chunks to be put, managing their state and completing the operation on the
* chunks. A PutChunk object is not really associated with one single chunk of data. Instead, it acts as a holder that
* handles a chunk of data and takes it to completion, and once done, moves on to handle more chunks of data. This is
* why there is a reference to the "current chunk" in the comments.
*/
class PutChunk {
// the position of the current chunk in the overall blob (-1 while this PutChunk is Free).
private int chunkIndex;
// the blobId of the current chunk.
protected BlobId chunkBlobId;
// the BlobProperties to associate with this chunk.
private BlobProperties chunkBlobProperties;
// the most recent time at which this chunk became Free.
private long chunkFreeAtMs;
// the most recent time at which this chunk became ready.
private long chunkReadyAtMs;
// The exception encountered while putting the current chunk. Not all errors are irrecoverable. An error may or
// may not get overridden by a subsequent error, and this variable is meant to store the most relevant error.
private RouterException chunkException;
// the state of the current chunk. Volatile: read by the ChunkFiller thread and written on the
// request/response handling side (see clear()).
protected volatile ChunkState state;
// the ByteBuffer that has the data for the current chunk.
protected ByteBuffer buf;
// the OperationTracker used to track the status of requests for the current chunk.
protected OperationTracker operationTracker;
// the number of times a put was attempted for the current chunk; used to decide whether a slipped put
// should be attempted (see checkAndMaybeComplete()).
private int failedAttempts;
// the partitionId chosen for the current chunk.
private PartitionId partitionId;
// the list of partitions already attempted for this chunk, so that slipped puts avoid retrying them.
private List<PartitionId> attemptedPartitionIds = new ArrayList<PartitionId>();
// map of correlation id to the request metadata for every request issued for the current chunk.
private final Map<Integer, ChunkPutRequestInfo> correlationIdToChunkPutRequestInfo =
    new TreeMap<Integer, ChunkPutRequestInfo>();
// list of buffers that were once associated with this chunk and are not yet freed.
private final List<DefunctBufferInfo> defunctBufferInfos = new ArrayList<>();
private final Logger logger = LoggerFactory.getLogger(PutChunk.class);
/**
 * Construct a PutChunk. A new PutChunk starts out in the Free state; clear() performs the full initialization.
 */
public PutChunk() {
  clear();
}
/**
 * Clear the state to make way for a new data chunk.
 * The write to {@code state} publishes this chunk back to the ChunkFiller thread, so it must be the very
 * last statement of this method.
 */
void clear() {
  chunkIndex = -1;
  chunkBlobId = null;
  chunkException = null;
  failedAttempts = 0;
  partitionId = null;
  attemptedPartitionIds.clear();
  // retire the buffer if any request that used it is still being sent (see maybeUpdateDefunctBufferInfos).
  maybeUpdateDefunctBufferInfos();
  correlationIdToChunkPutRequestInfo.clear();
  // record the Free timestamp BEFORE publishing the state: once state becomes Free, the ChunkFiller thread
  // may immediately pick this chunk up, and writing chunkFreeAtMs afterwards could race with it (the original
  // code wrote it after the state change, contradicting the "last statement" requirement below).
  chunkFreeAtMs = time.milliseconds();
  // this assignment should be the last statement as this immediately makes this chunk available to the
  // ChunkFiller thread for filling.
  state = ChunkState.Free;
}
/**
 * Go through the list of requests for which responses were not received, and if there are any that are not yet
 * sent out completely, add the associated buffer to the defunct list for freeing in the future.
 */
private void maybeUpdateDefunctBufferInfos() {
  // lazily allocated: in the common case every request has finished sending and nothing needs tracking.
  ArrayList<PutRequest> requestsAwaitingSendCompletion = null;
  for (Map.Entry<Integer, ChunkPutRequestInfo> entry : correlationIdToChunkPutRequestInfo.entrySet()) {
    if (!entry.getValue().putRequest.isSendComplete()) {
      if (requestsAwaitingSendCompletion == null) {
        requestsAwaitingSendCompletion = new ArrayList<>();
      }
      requestsAwaitingSendCompletion.add(entry.getValue().putRequest);
    }
  }
  if (requestsAwaitingSendCompletion != null) {
    // This means that the buffer associated with this PutChunk could get read by the NetworkClient in the
    // future and assigning this PutChunk to a subsequent chunk of the overall blob could lead to this buffer
    // getting read and written concurrently, or other undefined behavior. There are multiple ways to handle this,
    // and the simplest way is to set the buf to null so that it gets allocated afresh if/when this PutChunk gets
    // assigned for a subsequent chunk of the overall blob. Every time this chunk gets polled, an attempt to clear
    // out the list will be made.
    defunctBufferInfos.add(new DefunctBufferInfo(buf, requestsAwaitingSendCompletion));
    buf = null;
  }
}
/**
 * Iterate defunctBufferInfos and free up entries for which all associated requests have completed sending.
 */
private void maybeFreeDefunctBuffers() {
  for (Iterator<DefunctBufferInfo> iter = defunctBufferInfos.iterator(); iter.hasNext(); ) {
    boolean canBeFreed = true;
    for (PutRequest putRequest : iter.next().putRequests) {
      if (!putRequest.isSendComplete()) {
        // one incomplete send is enough to keep the buffer alive; no need to check the rest.
        canBeFreed = false;
        break;
      }
    }
    if (canBeFreed) {
      // this is where the buffer will be freed if the buffer pool is used. For now, simply remove the reference.
      iter.remove();
    }
  }
}
/**
 * @return the position of the current chunk in the overall blob (-1 while this PutChunk is Free).
 */
int getChunkIndex() {
  return chunkIndex;
}
/**
 * @return the current {@link ChunkState} of this PutChunk.
 */
ChunkState getState() {
  return state;
}
/**
 * @return the {@link BlobId} of the current chunk - valid only after the chunk is successfully put.
 * It is set when the chunk is prepared for sending, and reset to null if the put ultimately fails
 * (see checkAndMaybeComplete()) or when the chunk is cleared.
 */
BlobId getChunkBlobId() {
  return chunkBlobId;
}
/**
 * @return the {@link RouterException}, if any, encountered for the current chunk; null if none was recorded
 * (or the chunk eventually succeeded).
 */
RouterException getChunkException() {
  return chunkException;
}
/**
 * @return true if this PutChunk is free so a chunk of the overall blob can be filled in.
 */
boolean isFree() {
  return state == ChunkState.Free;
}
/**
 * @return true if this PutChunk is being built, i.e. currently filled with a chunk of the overall blob.
 */
boolean isBuilding() {
  return state == ChunkState.Building;
}
/**
 * @return true if this PutChunk is fully built and ready to have its put requests sent out.
 */
boolean isReady() {
  return state == ChunkState.Ready;
}
/**
 * @return true if the operation on the current chunk is complete (successfully or not).
 */
boolean isComplete() {
  return state == ChunkState.Complete;
}
/**
 * Prepare this chunk for building, that is, for being filled with data from the channel.
 * @param chunkIndex the position in the overall blob that this chunk is going to be in.
 * @param size size to allocate memory for the buffer that will hold the data for this chunk.
 */
private void prepareForBuilding(int chunkIndex, int size) {
  this.chunkIndex = chunkIndex;
  if (buf == null) {
    // first use, or the previous buffer was retired as defunct (see maybeUpdateDefunctBufferInfos).
    buf = ByteBuffer.allocate(size);
  } else {
    // reuse the existing buffer. size is always routerConfig.routerMaxPutChunkSizeBytes (see getChunkToFill),
    // so it never exceeds the capacity allocated above.
    buf.clear();
    buf.limit(size);
  }
  state = ChunkState.Building;
}
/**
 * Prepare this chunk (that is completely built), for puts.
 * Picks a partition (avoiding previously attempted ones on retries), creates the chunk's blob id and per-chunk
 * BlobProperties, and initializes a fresh operation tracker. Any failure here fails the whole operation.
 */
private void prepareForSending() {
  try {
    // if this is part of a retry, make sure no previously attempted partitions are retried.
    if (partitionId != null) {
      attemptedPartitionIds.add(partitionId);
    }
    partitionId = getPartitionForPut(attemptedPartitionIds);
    chunkBlobId = new BlobId(partitionId);
    // the chunk's properties mirror the caller-supplied properties, except that the size is this chunk's size.
    chunkBlobProperties = new BlobProperties(buf.remaining(), passedInBlobProperties.getServiceId(),
        passedInBlobProperties.getOwnerId(), passedInBlobProperties.getContentType(),
        passedInBlobProperties.isPrivate(), passedInBlobProperties.getTimeToLiveInSeconds(),
        passedInBlobProperties.getCreationTimeInMs());
    operationTracker = new SimpleOperationTracker(routerConfig.routerDatacenterName, partitionId, false,
        routerConfig.routerPutSuccessTarget, routerConfig.routerPutRequestParallelism);
    // drop request metadata from any previous attempt; requests for the new attempt start fresh.
    correlationIdToChunkPutRequestInfo.clear();
    state = ChunkState.Ready;
  } catch (RouterException e) {
    setOperationExceptionAndComplete(e);
  } catch (Exception e) {
    // wrap unexpected failures so the operation always fails with a RouterException.
    setOperationExceptionAndComplete(new RouterException("Operation tracker could not be initialized", e,
        RouterErrorCode.UnexpectedInternalError));
  }
}
/**
 * Do the actions required when the chunk has been completely built: flip the buffer from fill mode to drain
 * mode and ready the chunk for sending.
 * @param updateMetric whether chunk fill completion metrics should be updated.
 */
void onFillComplete(boolean updateMetric) {
  buf.flip();
  prepareForSending();
  chunkReadyAtMs = time.milliseconds();
  if (updateMetric) {
    // the fill time is measured from the moment this chunk last became Free.
    routerMetrics.chunkFillTimeMs.update(chunkReadyAtMs - chunkFreeAtMs);
  }
}
/**
 * Fill the buffer of the current chunk with the data from the given {@link ByteBuffer}.
 * Transfers as many bytes as both the source and the chunk buffer allow; if this fills the chunk, it is
 * sealed and readied for sending.
 * @param channelReadBuffer the {@link ByteBuffer} from which to read data.
 * @return the number of bytes transferred in this operation.
 */
int fillFrom(ByteBuffer channelReadBuffer) {
  int bytesToTransfer = Math.min(channelReadBuffer.remaining(), buf.remaining());
  // cap the source buffer's limit so that exactly bytesToTransfer bytes are copied; when the source holds
  // no more than the chunk can take, the capped limit equals the existing limit and nothing changes.
  int savedLimit = channelReadBuffer.limit();
  channelReadBuffer.limit(channelReadBuffer.position() + bytesToTransfer);
  buf.put(channelReadBuffer);
  channelReadBuffer.limit(savedLimit);
  if (!buf.hasRemaining()) {
    // the chunk is full: seal it and move it towards Ready.
    onFillComplete(true);
  }
  return bytesToTransfer;
}
/**
 * Check if the operation on the chunk is eligible for completion, if so complete it. On failure, a slipped
 * put to a fresh partition is attempted until {@code routerMaxSlippedPutAttempts} is exceeded, at which point
 * the whole operation is failed with the chunk's exception.
 */
void checkAndMaybeComplete() {
  boolean done = false;
  // Now, check if this chunk is done.
  if (operationTracker.isDone()) {
    if (!operationTracker.hasSucceeded()) {
      failedAttempts++;
      if (failedAttempts <= routerConfig.routerMaxSlippedPutAttempts) {
        // Parameterized logging avoids the string concatenation cost when trace is disabled.
        logger.trace("Attempt to put chunk with id: {} failed, attempting slipped put", chunkBlobId);
        routerMetrics.slippedPutAttemptCount.inc();
        prepareForSending();
      } else {
        // this chunk could not be successfully put. The whole operation has to fail.
        chunkBlobId = null;
        setOperationExceptionAndComplete(chunkException);
        done = true;
      }
    } else {
      chunkException = null;
      done = true;
    }
  }
  if (done) {
    state = ChunkState.Complete;
  }
}
/**
 * This is one of two main entry points to this class, the other being
 * {@link #handleResponse(ResponseInfo, PutResponse)}.
 * Apart from fetching requests to send out, this also checks for timeouts of issued requests,
 * status of the operation and anything else that needs to be done within this PutChunk. The callers guarantee
 * that this method is called on all the PutChunks of an operation until either the operation,
 * or the chunk operation is completed.
 * @param requestRegistrationCallback the {@link RequestRegistrationCallback} to call for every request that gets
 *                                    created as part of this poll operation.
 */
void poll(RequestRegistrationCallback<PutOperation> requestRegistrationCallback) {
  maybeFreeDefunctBuffers();
  cleanupExpiredInFlightRequests();
  checkAndMaybeComplete();
  if (isComplete()) {
    // Nothing more to send for this chunk.
    return;
  }
  fetchRequests(requestRegistrationCallback);
}
/**
 * Clean up requests sent out by this operation that have now timed out. Entries are checked in iteration
 * order (correlation id order, which tracks send time), so the sweep stops at the first entry that has not
 * timed out.
 */
private void cleanupExpiredInFlightRequests() {
  // Snapshot "now" once so every entry in this sweep is judged against the same instant (previously
  // time.milliseconds() was re-read on each iteration).
  long currentTimeMs = time.milliseconds();
  Iterator<Map.Entry<Integer, ChunkPutRequestInfo>> inFlightRequestsIterator =
      correlationIdToChunkPutRequestInfo.entrySet().iterator();
  while (inFlightRequestsIterator.hasNext()) {
    Map.Entry<Integer, ChunkPutRequestInfo> entry = inFlightRequestsIterator.next();
    if (currentTimeMs - entry.getValue().startTimeMs > routerConfig.routerRequestTimeoutMs) {
      onErrorResponse(entry.getValue().replicaId);
      // Do not notify this as a failure to the response handler, as this timeout could simply be due to
      // connection unavailability. If there is indeed a network error, the NetworkClient will provide an error
      // response and the response handler will be notified accordingly.
      chunkException = new RouterException("Timed out waiting for a response", RouterErrorCode.OperationTimedOut);
      inFlightRequestsIterator.remove();
    } else {
      // the entries are ordered by correlation id and time. Break on the first request that has not timed out.
      break;
    }
  }
}
/**
 * Fetch {@link PutRequest}s to send for the current data chunk. One request is created per replica handed out
 * by the operation tracker's iterator; each is registered for response correlation and timeout tracking, then
 * passed to the registration callback for sending.
 */
private void fetchRequests(RequestRegistrationCallback<PutOperation> requestRegistrationCallback) {
  Iterator<ReplicaId> replicaIterator = operationTracker.getReplicaIterator();
  while (replicaIterator.hasNext()) {
    ReplicaId replicaId = replicaIterator.next();
    String hostname = replicaId.getDataNodeId().getHostname();
    Port port = replicaId.getDataNodeId().getPortToConnectTo();
    PutRequest putRequest = createPutRequest();
    RouterRequestInfo request = new RouterRequestInfo(hostname, port, putRequest, replicaId);
    int correlationId = putRequest.getCorrelationId();
    // Track the request so its response can be correlated and a timeout can be detected.
    correlationIdToChunkPutRequestInfo.put(correlationId,
        new ChunkPutRequestInfo(replicaId, putRequest, time.milliseconds()));
    correlationIdToPutChunk.put(correlationId, this);
    requestRegistrationCallback.registerRequestToSend(PutOperation.this, request);
    replicaIterator.remove();
    if (RouterUtils.isRemoteReplica(routerConfig, replicaId)) {
      // Bug fix: the log message was missing its {} placeholder, so the datacenter name argument was
      // silently dropped by SLF4J.
      logger.trace("Making request to a remote replica in {}", replicaId.getDataNodeId().getDatacenterName());
      routerMetrics.crossColoRequestCount.inc();
    }
    routerMetrics.getDataNodeBasedMetrics(replicaId.getDataNodeId()).putRequestRate.mark();
  }
}
/**
 * Create and return the {@link PutRequest} associated with the current chunk. This method should only be called
 * when the chunk is in ready state.
 * @return the created {@link PutRequest}.
 */
protected PutRequest createPutRequest() {
  // A fresh correlation id is drawn for every request, including retries on other replicas.
  int correlationId = NonBlockingRouter.correlationIdGenerator.incrementAndGet();
  return new PutRequest(correlationId, routerConfig.routerHostname, chunkBlobId, chunkBlobProperties,
      ByteBuffer.wrap(userMetadata), buf.duplicate(), buf.remaining(), BlobType.DataBlob);
}
/**
 * Choose a random {@link PartitionId} for putting the current chunk and return it.
 * @param partitionIdsToExclude the list of {@link PartitionId}s that should be excluded from consideration.
 * @return the chosen {@link PartitionId}
 * @throws RouterException if no writable partition remains after exclusions.
 */
protected PartitionId getPartitionForPut(List<PartitionId> partitionIdsToExclude) throws RouterException {
  // getWritablePartitionIds creates and returns a new list, so removing entries here is safe.
  List<? extends PartitionId> candidates = clusterMap.getWritablePartitionIds();
  candidates.removeAll(partitionIdsToExclude);
  if (candidates.isEmpty()) {
    throw new RouterException("No writable partitions available.", RouterErrorCode.AmbryUnavailable);
  }
  int chosenIndex = ThreadLocalRandom.current().nextInt(candidates.size());
  return candidates.get(chosenIndex);
}
/**
 * This method is the entry point for handling responses received for requests sent out on behalf of this chunk.
 * For puts, processing involves determining whether the request was successful, notifying the operation
 * tracker so it can track the status of the operation, and notifying the response handler for failure detection.
 * Finally, a check is done to determine whether the operation on the chunk is eligible for completion,
 * if so the operation is completed right away.
 * @param responseInfo the response received for a request sent out on behalf of this chunk.
 * @param putResponse the {@link PutResponse} associated with this response.
 */
void handleResponse(ResponseInfo responseInfo, PutResponse putResponse) {
  int correlationId = ((PutRequest) responseInfo.getRequestInfo().getRequest()).getCorrelationId();
  ChunkPutRequestInfo chunkPutRequestInfo = correlationIdToChunkPutRequestInfo.remove(correlationId);
  if (chunkPutRequestInfo == null) {
    // Ignore right away. This could mean:
    // - the response is valid for this chunk, but was timed out and removed from the map.
    // - the response is for an earlier attempt of this chunk (slipped put scenario). And the map was cleared
    //   before attempting the slipped put.
    // - the response is for an earlier chunk held by this PutChunk.
    return;
  }
  long requestLatencyMs = time.milliseconds() - chunkPutRequestInfo.startTimeMs;
  routerMetrics.routerRequestLatencyMs.update(requestLatencyMs);
  routerMetrics.getDataNodeBasedMetrics(chunkPutRequestInfo.replicaId.getDataNodeId()).putRequestLatencyMs.update(
      requestLatencyMs);
  // Classify the response; the checks below are in the same order as before, flattened from nested if/else.
  boolean isSuccessful;
  if (responseInfo.getError() != null) {
    setChunkException(new RouterException("Operation timed out", RouterErrorCode.OperationTimedOut));
    isSuccessful = false;
  } else if (putResponse == null) {
    setChunkException(new RouterException("Response deserialization received an unexpected error",
        RouterErrorCode.UnexpectedInternalError));
    isSuccessful = false;
  } else if (putResponse.getCorrelationId() != correlationId) {
    // The NetworkClient associates a response with a request based on the fact that only one request is sent
    // out over a connection id, and the response received on a connection id must be for the latest request
    // sent over it. The check here ensures that is indeed the case. If not, log an error and fail this request.
    // There is no other way to handle it.
    routerMetrics.unknownReplicaResponseError.inc();
    logger.error("The correlation id in the PutResponse {} is not the same as the correlation id in the "
        + "associated PutRequest: {}", putResponse.getCorrelationId(), correlationId);
    setChunkException(new RouterException("Unexpected internal error", RouterErrorCode.UnexpectedInternalError));
    isSuccessful = false;
    // we do not notify the ResponseHandler responsible for failure detection as this is an unexpected error.
  } else {
    ServerErrorCode putError = putResponse.getError();
    if (putError == ServerErrorCode.No_Error) {
      logger.trace("The putRequest was successful");
      isSuccessful = true;
    } else {
      // chunkException will be set within processServerError.
      processServerError(putError);
      isSuccessful = false;
    }
  }
  if (isSuccessful) {
    operationTracker.onResponse(chunkPutRequestInfo.replicaId, true);
    if (RouterUtils.isRemoteReplica(routerConfig, chunkPutRequestInfo.replicaId)) {
      // Bug fix: the log message was missing its {} placeholder, so the datacenter name argument was
      // silently dropped by SLF4J.
      logger.trace("Cross colo request successful for remote replica in {}",
          chunkPutRequestInfo.replicaId.getDataNodeId().getDatacenterName());
      routerMetrics.crossColoSuccessCount.inc();
    }
  } else {
    onErrorResponse(chunkPutRequestInfo.replicaId);
  }
  checkAndMaybeComplete();
}
/**
 * Perform the necessary actions when a request to a replica fails: bump the router-wide and per-node error
 * counters and record the failed response with the operation tracker.
 * @param replicaId the {@link ReplicaId} associated with the failed response.
 */
void onErrorResponse(ReplicaId replicaId) {
  routerMetrics.routerRequestErrorCount.inc();
  routerMetrics.getDataNodeBasedMetrics(replicaId.getDataNodeId()).putRequestErrorCount.inc();
  operationTracker.onResponse(replicaId, false);
}
/**
 * Possibly set the exception for this chunk using the given exception. Calling this method with an exception does
 * not necessarily result in that being set as the chunkException. The idea is to set the most relevant exception
 * in case of errors.
 * NOTE(review): the current implementation unconditionally overwrites any previously set chunkException — no
 * relevance comparison happens here, contrary to what the paragraph above suggests. Confirm whether precedence
 * handling is intended to live here or at the call sites.
 * @param exception the exception that may be set as the chunkException.
 */
private void setChunkException(RouterException exception) {
  chunkException = exception;
}
/**
 * Process an error received from the server. The idea is to convert from the ServerErrorCode to a RouterErrorCode
 * and record it as the chunk exception.
 * @param error the ServerErrorCode received from a response to a request.
 */
private void processServerError(ServerErrorCode error) {
  // for puts, all errors are effectively Ambry_Unavailable. One could argue that certain errors
  // are retryable and are eligible for slipped puts, but what could those be? Partition_ReadOnly and
  // BlobId_Already_Exists are outliers (should not really happen) that those should really
  // result in Ambry_Unavailable or UnexpectedInternalError.
  // However, for metrics, we will need to distinguish them here.
  // Bug fix: the log message was missing its {} placeholder, so the error code argument was silently
  // dropped by SLF4J.
  logger.trace("Server returned an error: {}", error);
  setChunkException(new RouterException("Could not complete operation, server returned: " + error,
      RouterErrorCode.AmbryUnavailable));
}
/**
 * A class that holds information about requests sent out by this PutChunk: the target replica, the request
 * itself, and the time the request was created (used for timeout detection).
 */
private class ChunkPutRequestInfo {
  // the replica this request was sent to.
  final ReplicaId replicaId;
  // the request that was sent.
  final PutRequest putRequest;
  // the creation time of the request, compared against the request timeout during cleanup.
  final long startTimeMs;
  /**
   * Construct a ChunkPutRequestInfo
   * @param replicaId the replica to which this request is being sent.
   * @param putRequest the {@link PutRequest} being sent to the replica.
   * @param startTimeMs the time at which this request was created.
   */
  ChunkPutRequestInfo(ReplicaId replicaId, PutRequest putRequest, long startTimeMs) {
    this.replicaId = replicaId;
    this.putRequest = putRequest;
    this.startTimeMs = startTimeMs;
  }
}
/**
 * Class that holds the buffer of a chunk that will no longer be used and is kept around only because the
 * associated requests are not yet completely sent out.
 */
private class DefunctBufferInfo {
  // the buffer that is now defunct, but not yet freed.
  final ByteBuffer buf;
  // Requests that are reading from this buffer; the buffer must stay alive until they are fully sent out.
  final List<PutRequest> putRequests;
  /**
   * Construct a DefunctBufferInfo
   * @param buf the buffer that is now defunct and waiting to be freed.
   * @param putRequests the requests associated with this buffer whose send completion blocks the freeing of this
   *                    buffer.
   */
  DefunctBufferInfo(ByteBuffer buf, List<PutRequest> putRequests) {
    this.buf = buf;
    this.putRequests = putRequests;
  }
}
}
/**
 * MetadataPutChunk responsible for maintaining the state of the metadata chunk and completing the chunk operation
 * on it.
 */
private class MetadataPutChunk extends PutChunk {
  // Maps each successfully put data chunk's position in the blob to its id; TreeMap keeps them in blob order.
  final TreeMap<Integer, StoreKey> indexToChunkIds;
  // Holds the first data chunk's id and properties, deferred until we know whether the blob is simple or composite.
  Pair<? extends StoreKey, BlobProperties> firstChunkIdAndProperties = null;

  /**
   * Initialize the MetadataPutChunk.
   */
  MetadataPutChunk() {
    indexToChunkIds = new TreeMap<>();
    // metadata blob is in building state.
    state = ChunkState.Building;
  }

  /**
   * Add the given blobId of a successfully put data chunk to the metadata at its position in the overall blob.
   * @param chunkBlobId the blobId of the associated data chunk
   * @param chunkIndex the position of the associated data chunk in the overall blob.
   */
  void addChunkId(BlobId chunkBlobId, int chunkIndex) {
    indexToChunkIds.put(chunkIndex, chunkBlobId);
  }

  /**
   * Call {@link NotificationSystem#onBlobCreated(String, BlobProperties, byte[], NotificationBlobType)} for this
   * chunk, unless it is the first chunk, in which case it might be an entire simple blob. In that case, save
   * the {@link BlobProperties} from the first chunk.
   * @param chunk the {@link PutChunk} created.
   */
  void maybeNotifyForChunkCreation(PutChunk chunk) {
    if (chunk.chunkIndex == 0) {
      firstChunkIdAndProperties = new Pair<>(chunk.chunkBlobId, chunk.chunkBlobProperties);
    } else {
      notificationSystem.onBlobCreated(chunk.chunkBlobId.getID(), chunk.chunkBlobProperties, userMetadata,
          NotificationBlobType.DataChunk);
    }
  }

  /**
   * Notify for the creation of the first chunk. To be called after the overall operation is completed if the overall
   * blob is composite. If no first chunk was put successfully, this will do nothing.
   */
  void notifyForFirstChunkCreation() {
    // Bug fix: previously this dereferenced firstChunkIdAndProperties unconditionally, throwing an NPE when
    // no first chunk was ever put — contradicting the documented "do nothing" behavior.
    if (firstChunkIdAndProperties != null) {
      String chunkId = firstChunkIdAndProperties.getFirst().getID();
      BlobProperties chunkProperties = firstChunkIdAndProperties.getSecond();
      notificationSystem.onBlobCreated(chunkId, chunkProperties, userMetadata, NotificationBlobType.DataChunk);
    }
  }

  @Override
  void poll(RequestRegistrationCallback<PutOperation> requestRegistrationCallback) {
    // Once every data chunk id has been collected, the metadata chunk can be finalized for sending.
    if (isBuilding() && chunkFillingCompletedSuccessfully && indexToChunkIds.size() == getNumDataChunks()) {
      finalizeMetadataChunk();
    }
    if (isReady()) {
      super.poll(requestRegistrationCallback);
    }
  }

  /**
   * To be called when chunk filling completes successfully. Finalizing involves preparing the metadata chunk
   * for sending if this blob is composite, or marking the operation complete if this is a simple blob.
   */
  private void finalizeMetadataChunk() {
    finalBlobProperties =
        new BlobProperties(getBlobSize(), passedInBlobProperties.getServiceId(), passedInBlobProperties.getOwnerId(),
            passedInBlobProperties.getContentType(), passedInBlobProperties.isPrivate(),
            passedInBlobProperties.getTimeToLiveInSeconds(), passedInBlobProperties.getCreationTimeInMs());
    if (getNumDataChunks() > 1) {
      // values returned are in the right order as TreeMap returns them in key-order.
      List<StoreKey> orderedChunkIdList = new ArrayList<>(indexToChunkIds.values());
      buf = MetadataContentSerDe.serializeMetadataContent(routerConfig.routerMaxPutChunkSizeBytes, getBlobSize(),
          orderedChunkIdList);
      onFillComplete(false);
    } else {
      // Simple blob: the single data chunk's id is the blob id and no metadata chunk needs to be sent.
      blobId = (BlobId) indexToChunkIds.get(0);
      state = ChunkState.Complete;
      operationCompleted = true;
    }
  }

  /**
   * @return a list of all of the successfully put chunk ids associated with this blob
   */
  List<StoreKey> getSuccessfullyPutChunkIds() {
    return new ArrayList<>(indexToChunkIds.values());
  }

  /**
   * {@inheritDoc}
   *
   * In constructing the put request for the metadata blob, MetadataPutChunk serializes the metadata.
   * @return the created {@link PutRequest}.
   */
  @Override
  protected PutRequest createPutRequest() {
    return new PutRequest(NonBlockingRouter.correlationIdGenerator.incrementAndGet(), routerConfig.routerHostname,
        chunkBlobId, finalBlobProperties, ByteBuffer.wrap(userMetadata), buf.duplicate(), buf.remaining(),
        BlobType.MetadataBlob);
  }
}
/**
 * Different states of a PutChunk.
 */
enum ChunkState {
  /**
   * The Chunk is free and can be filled with data.
   */
  Free,
  /**
   * The Chunk is being built. It may have some data but is not yet ready to be sent.
   */
  Building,
  /**
   * The Chunk is ready to be sent out.
   */
  Ready,
  /**
   * The Chunk is complete.
   */
  Complete,
}
}