/*
 * Licensed to ElasticSearch and Shay Banon under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. ElasticSearch licenses this
 * file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.elasticsearch.action.support.replication;

import org.elasticsearch.ElasticSearchException;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.*;
import org.elasticsearch.action.support.TransportAction;
import org.elasticsearch.cluster.ClusterChangedEvent;
import org.elasticsearch.cluster.ClusterService;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.TimeoutClusterStateListener;
import org.elasticsearch.cluster.action.shard.ShardStateAction;
import org.elasticsearch.cluster.block.ClusterBlockException;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.node.DiscoveryNodes;
import org.elasticsearch.cluster.routing.ShardIterator;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Streamable;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.IndexShardMissingException;
import org.elasticsearch.index.engine.DocumentAlreadyExistsException;
import org.elasticsearch.index.engine.VersionConflictEngineException;
import org.elasticsearch.index.shard.IllegalIndexShardStateException;
import org.elasticsearch.indices.IndexMissingException;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.node.NodeClosedException;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.*;

import java.io.IOException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import static org.elasticsearch.ExceptionsHelper.detailedMessage;

/**
 */
public abstract class TransportShardReplicationOperationAction<Request extends ShardReplicationOperationRequest, ReplicaRequest extends ActionRequest, Response extends ActionResponse>
        extends TransportAction<Request, Response> {

    protected final TransportService transportService;
    protected final ClusterService clusterService;
    protected final IndicesService indicesService;
    protected final ShardStateAction shardStateAction;
    protected final ReplicationType defaultReplicationType;
    protected final WriteConsistencyLevel defaultWriteConsistencyLevel;
    protected final TransportRequestOptions transportOptions;

    final String transportAction;
    final String transportReplicaAction;
    final String executor;
    final boolean checkWriteConsistency;
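    /**
     * Wires the action into the transport layer: registers one transport handler for the primary-level
     * action and one for the per-replica action, and reads the default replication type and write
     * consistency level from the node settings.
     */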
    protected TransportShardReplicationOperationAction(Settings settings, TransportService transportService,
                                                       ClusterService clusterService, IndicesService indicesService,
                                                       ThreadPool threadPool, ShardStateAction shardStateAction) {
        super(settings, threadPool);
        this.transportService = transportService;
        this.clusterService = clusterService;
        this.indicesService = indicesService;
        this.shardStateAction = shardStateAction;

        this.transportAction = transportAction();
        this.transportReplicaAction = transportReplicaAction();
        this.executor = executor();
        this.checkWriteConsistency = checkWriteConsistency();

        transportService.registerHandler(transportAction, new OperationTransportHandler());
        transportService.registerHandler(transportReplicaAction, new ReplicaOperationTransportHandler());

        this.transportOptions = transportOptions();

        this.defaultReplicationType = ReplicationType.fromString(settings.get("action.replication_type", "sync"));
        this.defaultWriteConsistencyLevel = WriteConsistencyLevel.fromString(settings.get("action.write_consistency", "quorum"));
    }

    @Override
    protected void doExecute(Request request, ActionListener<Response> listener) {
        new AsyncShardOperationAction(request, listener).start();
    }

    protected abstract Request newRequestInstance();

    protected abstract ReplicaRequest newReplicaRequestInstance();

    protected abstract Response newResponseInstance();

    protected abstract String transportAction();

    protected abstract String executor();

    protected abstract PrimaryResponse<Response, ReplicaRequest> shardOperationOnPrimary(ClusterState clusterState, PrimaryOperationRequest shardRequest);

    protected abstract void shardOperationOnReplica(ReplicaOperationRequest shardRequest);

    /**
     * Called once the replica operations have been dispatched on the primary node.
     */
    protected void postPrimaryOperation(Request request, PrimaryResponse<Response, ReplicaRequest> response) {
    }

    protected abstract ShardIterator shards(ClusterState clusterState, Request request) throws ElasticSearchException;

    protected abstract boolean checkWriteConsistency();

    protected abstract ClusterBlockException checkGlobalBlock(ClusterState state, Request request);

    protected abstract ClusterBlockException checkRequestBlock(ClusterState state, Request request);

    /**
     * Resolves the request. By default this simply resolves the concrete index the request operates on
     * (in case it refers to an aliased one). If resolving the request means it should be executed in a
     * different way, return <tt>false</tt> here to indicate that execution should not continue.
     */
    protected boolean resolveRequest(ClusterState state, Request request, ActionListener<Response> listener) {
        request.index(state.metaData().concreteIndex(request.index()));
        return true;
    }

    protected TransportRequestOptions transportOptions() {
        return TransportRequestOptions.EMPTY;
    }

    /**
     * Should the operations be skipped on the replicas. Defaults to <tt>false</tt>, meaning operations
     * will also be executed on the replicas.
     */
    protected boolean ignoreReplicas() {
        return false;
    }

    private String transportReplicaAction() {
        return transportAction() + "/replica";
    }

    protected boolean retryPrimaryException(Throwable e) {
        Throwable cause = ExceptionsHelper.unwrapCause(e);
        return cause instanceof IndexShardMissingException ||
                cause instanceof IllegalIndexShardStateException ||
                cause instanceof IndexMissingException;
    }

    /**
     * Should an exception be ignored when the operation is performed on the replica.
     */
    boolean ignoreReplicaException(Throwable e) {
        Throwable cause = ExceptionsHelper.unwrapCause(e);
        if (cause instanceof IllegalIndexShardStateException) {
            return true;
        }
        if (cause instanceof IndexMissingException) {
            return true;
        }
        if (cause instanceof IndexShardMissingException) {
            return true;
        }
        if (cause instanceof ConnectTransportException) {
            return true;
        }
        // on version conflict or document missing, it means that a newer change has
        // crept into the replica, and it's fine
        if (cause instanceof VersionConflictEngineException) {
            return true;
        }
        // same here
        if (cause instanceof DocumentAlreadyExistsException) {
            return true;
        }
        return false;
    }
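    /**
     * Transport handler for the primary-level action: runs the full replication operation for requests
     * forwarded to this node, and streams the response (or failure) back over the transport channel.
     */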
    class OperationTransportHandler extends BaseTransportRequestHandler<Request> {

        @Override
        public Request newInstance() {
            return newRequestInstance();
        }

        @Override
        public String executor() {
            return ThreadPool.Names.SAME;
        }

        @Override
        public void messageReceived(final Request request, final TransportChannel channel) throws Exception {
            // no need to have a threaded listener since we just send back a response
            request.listenerThreaded(false);
            // if we end up executing the operation locally, run it on a thread since we do not fork otherwise
            request.operationThreaded(true);
            execute(request, new ActionListener<Response>() {
                @Override
                public void onResponse(Response result) {
                    try {
                        channel.sendResponse(result);
                    } catch (Exception e) {
                        onFailure(e);
                    }
                }

                @Override
                public void onFailure(Throwable e) {
                    try {
                        channel.sendResponse(e);
                    } catch (Exception e1) {
                        logger.warn("Failed to send response for " + transportAction, e1);
                    }
                }
            });
        }
    }

    class ReplicaOperationTransportHandler extends BaseTransportRequestHandler<ReplicaOperationRequest> {

        @Override
        public ReplicaOperationRequest newInstance() {
            return new ReplicaOperationRequest();
        }

        @Override
        public String executor() {
            return executor;
        }

        @Override
        public void messageReceived(final ReplicaOperationRequest request, final TransportChannel channel) throws Exception {
            shardOperationOnReplica(request);
            channel.sendResponse(TransportResponse.Empty.INSTANCE);
        }
    }

    protected class PrimaryOperationRequest implements Streamable {

        public int shardId;

        public Request request;

        public PrimaryOperationRequest() {
        }

        public PrimaryOperationRequest(int shardId, Request request) {
            this.shardId = shardId;
            this.request = request;
        }

        @Override
        public void readFrom(StreamInput in) throws IOException {
            shardId = in.readVInt();
            request = newRequestInstance();
            request.readFrom(in);
        }

        @Override
        public void writeTo(StreamOutput out) throws IOException {
            out.writeVInt(shardId);
            request.writeTo(out);
        }
    }

    protected class ReplicaOperationRequest extends TransportRequest {

        public int shardId;

        public ReplicaRequest request;

        public ReplicaOperationRequest() {
        }

        public ReplicaOperationRequest(int shardId, ReplicaRequest request) {
            super(request);
            this.shardId = shardId;
            this.request = request;
        }

        @Override
        public void readFrom(StreamInput in) throws IOException {
            super.readFrom(in);
            shardId = in.readVInt();
            request = newReplicaRequestInstance();
            request.readFrom(in);
        }

        @Override
        public void writeTo(StreamOutput out) throws IOException {
            super.writeTo(out);
            out.writeVInt(shardId);
            request.writeTo(out);
        }
    }
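    /**
     * Drives a single replication operation: resolves the primary shard from the cluster state, executes
     * the operation on it (locally, or by forwarding the request to the node that holds the primary), and
     * then replicates the result to the assigned replicas. If the primary is not available yet, the
     * operation is re-tried whenever the cluster state changes, until the request timeout expires.
     */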
    protected class AsyncShardOperationAction {

        private final ActionListener<Response> listener;
        private final Request request;
        private DiscoveryNodes nodes;
        private ShardIterator shardIt;
        private final AtomicBoolean primaryOperationStarted = new AtomicBoolean();
        private final ReplicationType replicationType;

        AsyncShardOperationAction(Request request, ActionListener<Response> listener) {
            this.request = request;
            this.listener = listener;

            if (request.replicationType() != ReplicationType.DEFAULT) {
                replicationType = request.replicationType();
            } else {
                replicationType = defaultReplicationType;
            }
        }

        public void start() {
            start(false);
        }

        /**
         * Returns <tt>true</tt> if the action started to be performed on the primary (or is done).
         */
        public boolean start(final boolean fromClusterEvent) throws ElasticSearchException {
            final ClusterState clusterState = clusterService.state();
            nodes = clusterState.nodes();
            try {
                ClusterBlockException blockException = checkGlobalBlock(clusterState, request);
                if (blockException != null) {
                    if (blockException.retryable()) {
                        retry(fromClusterEvent, blockException);
                        return false;
                    } else {
                        throw blockException;
                    }
                }
                // check if we need to execute, and if not, return
                if (!resolveRequest(clusterState, request, listener)) {
                    return true;
                }
                blockException = checkRequestBlock(clusterState, request);
                if (blockException != null) {
                    if (blockException.retryable()) {
                        retry(fromClusterEvent, blockException);
                        return false;
                    } else {
                        throw blockException;
                    }
                }
                shardIt = shards(clusterState, request);
            } catch (Exception e) {
                listener.onFailure(e);
                return true;
            }

            // no shards in the iterator, might be the window between index gateway recovery and shard routing initialization
            if (shardIt.size() == 0) {
                retry(fromClusterEvent, null);
                return false;
            }

            boolean foundPrimary = false;
            ShardRouting shardX;
            while ((shardX = shardIt.nextOrNull()) != null) {
                final ShardRouting shard = shardX;
                // we only deal with the primary shard here...
                if (!shard.primary()) {
                    continue;
                }
                if (!shard.active() || !nodes.nodeExists(shard.currentNodeId())) {
                    retry(fromClusterEvent, null);
                    return false;
                }

                // check here for write consistency
                if (checkWriteConsistency) {
                    WriteConsistencyLevel consistencyLevel = defaultWriteConsistencyLevel;
                    if (request.consistencyLevel() != WriteConsistencyLevel.DEFAULT) {
                        consistencyLevel = request.consistencyLevel();
                    }
                    int requiredNumber = 1;
                    if (consistencyLevel == WriteConsistencyLevel.QUORUM && shardIt.size() > 2) {
                        // a quorum only makes sense for more than 2 shard copies; otherwise it is 1 shard with 1 replica,
                        // and the quorum is 1 (which is what requiredNumber is initialized to)
                        requiredNumber = (shardIt.size() / 2) + 1;
                    } else if (consistencyLevel == WriteConsistencyLevel.ALL) {
                        requiredNumber = shardIt.size();
                    }

                    if (shardIt.sizeActive() < requiredNumber) {
                        retry(fromClusterEvent, null);
                        return false;
                    }
                }

                if (!primaryOperationStarted.compareAndSet(false, true)) {
                    return true;
                }

                foundPrimary = true;
                if (shard.currentNodeId().equals(nodes.localNodeId())) {
                    if (request.operationThreaded()) {
                        request.beforeLocalFork();
                        threadPool.executor(executor).execute(new Runnable() {
                            @Override
                            public void run() {
                                performOnPrimary(shard.id(), fromClusterEvent, shard, clusterState);
                            }
                        });
                    } else {
                        performOnPrimary(shard.id(), fromClusterEvent, shard, clusterState);
                    }
                } else {
                    DiscoveryNode node = nodes.get(shard.currentNodeId());
                    transportService.sendRequest(node, transportAction, request, transportOptions, new BaseTransportResponseHandler<Response>() {

                        @Override
                        public Response newInstance() {
                            return newResponseInstance();
                        }

                        @Override
                        public String executor() {
                            return ThreadPool.Names.SAME;
                        }

                        @Override
                        public void handleResponse(Response response) {
                            listener.onResponse(response);
                        }

                        @Override
                        public void handleException(TransportException exp) {
                            // if we got disconnected from the node, or the node / shard is not in the right state (being closed)
                            if (exp.unwrapCause() instanceof ConnectTransportException || exp.unwrapCause() instanceof NodeClosedException ||
                                    retryPrimaryException(exp)) {
                                primaryOperationStarted.set(false);
                                // we already marked the operation as started when we executed it (and removed the cluster
                                // state listener), so pass false here to re-add the listener
                                retry(false, null);
                            } else {
                                listener.onFailure(exp);
                            }
                        }
                    });
                }
                break;
            }
            // we should never get here, but here we go
            if (!foundPrimary) {
                retry(fromClusterEvent, null);
                return false;
            }
            return true;
        }
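        /**
         * Schedules a retry of the whole operation: registers a {@link TimeoutClusterStateListener} so the
         * operation is attempted again on every cluster state change, and fails the listener with an
         * {@link UnavailableShardsException} (or the given failure) if the request timeout elapses first.
         * When already called from the cluster state listener ({@code fromClusterEvent} is <tt>true</tt>),
         * nothing is registered again; the existing listener stays in place.
         */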
        void retry(boolean fromClusterEvent, @Nullable final Throwable failure) {
            if (!fromClusterEvent) {
                // make it a threaded operation so we fork off the cluster state (discovery) listener thread
                request.beforeLocalFork();
                request.operationThreaded(true);
                clusterService.add(request.timeout(), new TimeoutClusterStateListener() {
                    @Override
                    public void postAdded() {
                        if (start(true)) {
                            // if we managed to start and perform the operation on the primary, we can remove this listener
                            clusterService.remove(this);
                        }
                    }

                    @Override
                    public void onClose() {
                        clusterService.remove(this);
                        listener.onFailure(new NodeClosedException(nodes.localNode()));
                    }

                    @Override
                    public void clusterChanged(ClusterChangedEvent event) {
                        if (start(true)) {
                            // if we managed to start and perform the operation on the primary, we can remove this listener
                            clusterService.remove(this);
                        }
                    }

                    @Override
                    public void onTimeout(TimeValue timeValue) {
                        // just to be on the safe side, see if we can start it now?
                        if (start(true)) {
                            clusterService.remove(this);
                            return;
                        }
                        clusterService.remove(this);
                        Throwable listenerFailure = failure;
                        if (listenerFailure == null) {
                            if (shardIt == null) {
                                listenerFailure = new UnavailableShardsException(null, "no available shards: Timeout waiting for [" + timeValue + "], request: " + request.toString());
                            } else {
                                listenerFailure = new UnavailableShardsException(shardIt.shardId(), "[" + shardIt.size() + "] shards, [" + shardIt.sizeActive() + "] active : Timeout waiting for [" + timeValue + "], request: " + request.toString());
                            }
                        }
                        listener.onFailure(listenerFailure);
                    }
                });
            }
        }

        void performOnPrimary(int primaryShardId, boolean fromDiscoveryListener, final ShardRouting shard, ClusterState clusterState) {
            try {
                PrimaryResponse<Response, ReplicaRequest> response = shardOperationOnPrimary(clusterState, new PrimaryOperationRequest(primaryShardId, request));
                performReplicas(response);
            } catch (Exception e) {
                // shard has not been allocated yet, retry it here
                if (retryPrimaryException(e)) {
                    primaryOperationStarted.set(false);
                    retry(fromDiscoveryListener, null);
                    return;
                }
                if (e instanceof ElasticSearchException && ((ElasticSearchException) e).status() == RestStatus.CONFLICT) {
                    if (logger.isTraceEnabled()) {
                        logger.trace(shard.shortSummary() + ": Failed to execute [" + request + "]", e);
                    }
                } else {
                    if (logger.isDebugEnabled()) {
                        logger.debug(shard.shortSummary() + ": Failed to execute [" + request + "]", e);
                    }
                }
                listener.onFailure(e);
            }
        }
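        /**
         * Fans the operation out to the assigned replicas (including relocation targets). A shared counter
         * starts at the number of replica copies plus one for the postPrimaryOperation callback; each
         * completed replica operation decrements it, and the listener is notified once it reaches zero.
         * For async replication the counter is pushed to {@code Integer.MIN_VALUE} so the listener is
         * notified without waiting for the replicas.
         */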
        void performReplicas(final PrimaryResponse<Response, ReplicaRequest> response) {
            if (ignoreReplicas() || shardIt.size() == 1 /* no replicas */) {
                postPrimaryOperation(request, response);
                listener.onResponse(response.response());
                return;
            }

            // initialize the counter
            int replicaCounter = shardIt.assignedReplicasIncludingRelocating();

            if (replicaCounter == 0) {
                postPrimaryOperation(request, response);
                listener.onResponse(response.response());
                return;
            }

            if (replicationType == ReplicationType.ASYNC) {
                postPrimaryOperation(request, response);
                // async replication, notify the listener
                listener.onResponse(response.response());
                // now, trick the counter so it won't decrease to 0 and notify the listeners
                replicaCounter = Integer.MIN_VALUE;
            }

            // we add one to the replica count to do the postPrimaryOperation
            replicaCounter++;
            AtomicInteger counter = new AtomicInteger(replicaCounter);

            shardIt.reset(); // reset the iterator
            ShardRouting shard;
            while ((shard = shardIt.nextOrNull()) != null) {
                // if it is unassigned, nothing to do here...
                if (shard.unassigned()) {
                    continue;
                }

                // if the shard is the primary and it is relocating, perform the operation on the relocation
                // target as well, but not again on the primary itself (we already executed it there)
                boolean doOnlyOnRelocating = false;
                if (shard.primary()) {
                    if (shard.relocating()) {
                        doOnlyOnRelocating = true;
                    } else {
                        continue;
                    }
                }

                // we index on a replica that is initializing as well since we might not have got the event
                // yet that it was started. We will get an IllegalIndexShardStateException if it is not started,
                // and that's fine, we will ignore it
                if (!doOnlyOnRelocating) {
                    performOnReplica(response, counter, shard, shard.currentNodeId());
                }
                if (shard.relocating()) {
                    performOnReplica(response, counter, shard, shard.relocatingNodeId());
                }
            }

            // now do the postPrimary operation, and check if the listener needs to be invoked
            postPrimaryOperation(request, response);
            // we also invoke here in case replicas finish before postPrimaryAction does
            if (counter.decrementAndGet() == 0) {
                listener.onResponse(response.response());
            }
        }

        void performOnReplica(final PrimaryResponse<Response, ReplicaRequest> response, final AtomicInteger counter, final ShardRouting shard, String nodeId) {
            // if we don't have that node, it means that it might have failed and will be created again,
            // in this case, we don't have to do the operation, and just let it fail over
            if (!nodes.nodeExists(nodeId)) {
                if (counter.decrementAndGet() == 0) {
                    listener.onResponse(response.response());
                }
                return;
            }

            final ReplicaOperationRequest shardRequest = new ReplicaOperationRequest(shardIt.shardId().id(), response.replicaRequest());
            if (!nodeId.equals(nodes.localNodeId())) {
                DiscoveryNode node = nodes.get(nodeId);
                transportService.sendRequest(node, transportReplicaAction, shardRequest, transportOptions, new EmptyTransportResponseHandler(ThreadPool.Names.SAME) {
                    @Override
                    public void handleResponse(TransportResponse.Empty vResponse) {
                        finishIfPossible();
                    }

                    @Override
                    public void handleException(TransportException exp) {
                        if (!ignoreReplicaException(exp.unwrapCause())) {
                            logger.warn("Failed to perform " + transportAction + " on replica " + shardIt.shardId(), exp);
                            shardStateAction.shardFailed(shard, "Failed to perform [" + transportAction + "] on replica, message [" + detailedMessage(exp) + "]");
                        }
                        finishIfPossible();
                    }

                    private void finishIfPossible() {
                        if (counter.decrementAndGet() == 0) {
                            listener.onResponse(response.response());
                        }
                    }
                });
            } else {
                if (request.operationThreaded()) {
                    request.beforeLocalFork();
                    threadPool.executor(executor).execute(new Runnable() {
                        @Override
                        public void run() {
                            try {
                                shardOperationOnReplica(shardRequest);
                            } catch (Exception e) {
                                if (!ignoreReplicaException(e)) {
                                    logger.warn("Failed to perform " + transportAction + " on replica " + shardIt.shardId(), e);
                                    shardStateAction.shardFailed(shard, "Failed to perform [" + transportAction + "] on replica, message [" + detailedMessage(e) + "]");
                                }
                            }
                            if (counter.decrementAndGet() == 0) {
                                listener.onResponse(response.response());
                            }
                        }
                    });
                } else {
                    try {
                        shardOperationOnReplica(shardRequest);
                    } catch (Exception e) {
                        if (!ignoreReplicaException(e)) {
                            logger.warn("Failed to perform " + transportAction + " on replica " + shardIt.shardId(), e);
                            shardStateAction.shardFailed(shard, "Failed to perform [" + transportAction + "] on replica, message [" + detailedMessage(e) + "]");
                        }
                    }
                    if (counter.decrementAndGet() == 0) {
                        listener.onResponse(response.response());
                    }
                }
            }
        }
    }
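    /**
     * The result of executing the operation on the primary shard: the response to eventually send back to
     * the caller, the request to replay on each replica, and an optional payload that concrete actions can
     * attach and read back via {@link #payload()}.
     */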
    public static class PrimaryResponse<Response, ReplicaRequest> {
        private final ReplicaRequest replicaRequest;
        private final Response response;
        private final Object payload;

        public PrimaryResponse(ReplicaRequest replicaRequest, Response response, Object payload) {
            this.replicaRequest = replicaRequest;
            this.response = response;
            this.payload = payload;
        }

        public ReplicaRequest replicaRequest() {
            return this.replicaRequest;
        }

        public Response response() {
            return response;
        }

        public Object payload() {
            return payload;
        }
    }
}