/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.elasticsearch.snapshots;
import com.carrotsearch.hppc.cursors.ObjectObjectCursor;
import org.apache.logging.log4j.message.ParameterizedMessage;
import org.apache.logging.log4j.util.Supplier;
import org.apache.lucene.index.IndexCommit;
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.cluster.ClusterChangedEvent;
import org.elasticsearch.cluster.ClusterState;
import org.elasticsearch.cluster.ClusterStateApplier;
import org.elasticsearch.cluster.ClusterStateTaskConfig;
import org.elasticsearch.cluster.ClusterStateTaskExecutor;
import org.elasticsearch.cluster.SnapshotsInProgress;
import org.elasticsearch.cluster.SnapshotsInProgress.ShardSnapshotStatus;
import org.elasticsearch.cluster.SnapshotsInProgress.State;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.collect.ImmutableOpenMap;
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.ByteSizeValue;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.AbstractRunnable;
import org.elasticsearch.index.engine.SnapshotFailedEngineException;
import org.elasticsearch.index.shard.IndexEventListener;
import org.elasticsearch.index.shard.IndexShard;
import org.elasticsearch.index.shard.IndexShardState;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.snapshots.IndexShardSnapshotFailedException;
import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus;
import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus.Stage;
import org.elasticsearch.indices.IndicesService;
import org.elasticsearch.repositories.IndexId;
import org.elasticsearch.repositories.Repository;
import org.elasticsearch.threadpool.ThreadPool;
import org.elasticsearch.transport.EmptyTransportResponseHandler;
import org.elasticsearch.transport.TransportChannel;
import org.elasticsearch.transport.TransportRequest;
import org.elasticsearch.transport.TransportRequestHandler;
import org.elasticsearch.transport.TransportResponse;
import org.elasticsearch.transport.TransportService;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.Executor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.function.Function;
import java.util.stream.Collectors;
import static java.util.Collections.emptyMap;
import static java.util.Collections.unmodifiableMap;
import static org.elasticsearch.cluster.SnapshotsInProgress.completed;
/**
* This service runs on data and master nodes and controls currently snapshotted shards on these nodes. It is responsible for
* starting and stopping shard level snapshots
*/
public class SnapshotShardsService extends AbstractLifecycleComponent implements ClusterStateApplier, IndexEventListener {
public static final String UPDATE_SNAPSHOT_ACTION_NAME = "internal:cluster/snapshot/update_snapshot";
private final ClusterService clusterService;
private final IndicesService indicesService;
private final SnapshotsService snapshotsService;
private final TransportService transportService;
private final ThreadPool threadPool;
private final Lock shutdownLock = new ReentrantLock();
private final Condition shutdownCondition = shutdownLock.newCondition();
private volatile Map<Snapshot, SnapshotShards> shardSnapshots = emptyMap();
private final SnapshotStateExecutor snapshotStateExecutor = new SnapshotStateExecutor();
@Inject
public SnapshotShardsService(Settings settings, ClusterService clusterService, SnapshotsService snapshotsService, ThreadPool threadPool,
TransportService transportService, IndicesService indicesService) {
super(settings);
this.indicesService = indicesService;
this.snapshotsService = snapshotsService;
this.transportService = transportService;
this.clusterService = clusterService;
this.threadPool = threadPool;
if (DiscoveryNode.isDataNode(settings)) {
// this is only useful on the nodes that can hold data
// addLowPriorityApplier to make sure that Repository will be created before snapshot
clusterService.addLowPriorityApplier(this);
}
if (DiscoveryNode.isMasterNode(settings)) {
// This needs to run only on nodes that can become masters
transportService.registerRequestHandler(UPDATE_SNAPSHOT_ACTION_NAME, UpdateIndexShardSnapshotStatusRequest::new, ThreadPool.Names.SAME, new UpdateSnapshotStateRequestHandler());
}
}
@Override
protected void doStart() {
}
@Override
protected void doStop() {
shutdownLock.lock();
try {
while(!shardSnapshots.isEmpty() && shutdownCondition.await(5, TimeUnit.SECONDS)) {
// Wait for at most 5 second for locally running snapshots to finish
}
} catch (InterruptedException ex) {
Thread.currentThread().interrupt();
} finally {
shutdownLock.unlock();
}
}
@Override
protected void doClose() {
clusterService.removeApplier(this);
}
@Override
public void applyClusterState(ClusterChangedEvent event) {
try {
SnapshotsInProgress prev = event.previousState().custom(SnapshotsInProgress.TYPE);
SnapshotsInProgress curr = event.state().custom(SnapshotsInProgress.TYPE);
if ((prev == null && curr != null) || (prev != null && prev.equals(curr) == false)) {
processIndexShardSnapshots(event);
}
String masterNodeId = event.state().nodes().getMasterNodeId();
if (masterNodeId != null && masterNodeId.equals(event.previousState().nodes().getMasterNodeId()) == false) {
syncShardStatsOnNewMaster(event);
}
} catch (Exception e) {
logger.warn("Failed to update snapshot state ", e);
}
}
@Override
public void beforeIndexShardClosed(ShardId shardId, @Nullable IndexShard indexShard, Settings indexSettings) {
// abort any snapshots occurring on the soon-to-be closed shard
Map<Snapshot, SnapshotShards> snapshotShardsMap = shardSnapshots;
for (Map.Entry<Snapshot, SnapshotShards> snapshotShards : snapshotShardsMap.entrySet()) {
Map<ShardId, IndexShardSnapshotStatus> shards = snapshotShards.getValue().shards;
if (shards.containsKey(shardId)) {
logger.debug("[{}] shard closing, abort snapshotting for snapshot [{}]", shardId, snapshotShards.getKey().getSnapshotId());
shards.get(shardId).abort();
}
}
}
/**
* Returns status of shards that are snapshotted on the node and belong to the given snapshot
* <p>
* This method is executed on data node
* </p>
*
* @param snapshot snapshot
* @return map of shard id to snapshot status
*/
public Map<ShardId, IndexShardSnapshotStatus> currentSnapshotShards(Snapshot snapshot) {
SnapshotShards snapshotShards = shardSnapshots.get(snapshot);
if (snapshotShards == null) {
return null;
} else {
return snapshotShards.shards;
}
}
/**
* Checks if any new shards should be snapshotted on this node
*
* @param event cluster state changed event
*/
private void processIndexShardSnapshots(ClusterChangedEvent event) {
SnapshotsInProgress snapshotsInProgress = event.state().custom(SnapshotsInProgress.TYPE);
Map<Snapshot, SnapshotShards> survivors = new HashMap<>();
// First, remove snapshots that are no longer there
for (Map.Entry<Snapshot, SnapshotShards> entry : shardSnapshots.entrySet()) {
final Snapshot snapshot = entry.getKey();
if (snapshotsInProgress != null && snapshotsInProgress.snapshot(snapshot) != null) {
survivors.put(entry.getKey(), entry.getValue());
} else {
// abort any running snapshots of shards for the removed entry;
// this could happen if for some reason the cluster state update for aborting
// running shards is missed, then the snapshot is removed is a subsequent cluster
// state update, which is being processed here
for (IndexShardSnapshotStatus snapshotStatus : entry.getValue().shards.values()) {
if (snapshotStatus.stage() == Stage.INIT || snapshotStatus.stage() == Stage.STARTED) {
snapshotStatus.abort();
}
}
}
}
// For now we will be mostly dealing with a single snapshot at a time but might have multiple simultaneously running
// snapshots in the future
Map<Snapshot, Map<ShardId, IndexShardSnapshotStatus>> newSnapshots = new HashMap<>();
// Now go through all snapshots and update existing or create missing
final String localNodeId = event.state().nodes().getLocalNodeId();
final DiscoveryNode masterNode = event.state().nodes().getMasterNode();
final Map<Snapshot, Map<String, IndexId>> snapshotIndices = new HashMap<>();
if (snapshotsInProgress != null) {
for (SnapshotsInProgress.Entry entry : snapshotsInProgress.entries()) {
snapshotIndices.put(entry.snapshot(),
entry.indices().stream().collect(Collectors.toMap(IndexId::getName, Function.identity())));
if (entry.state() == State.STARTED) {
Map<ShardId, IndexShardSnapshotStatus> startedShards = new HashMap<>();
SnapshotShards snapshotShards = shardSnapshots.get(entry.snapshot());
for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shard : entry.shards()) {
// Add all new shards to start processing on
if (localNodeId.equals(shard.value.nodeId())) {
if (shard.value.state() == State.INIT && (snapshotShards == null || !snapshotShards.shards.containsKey(shard.key))) {
logger.trace("[{}] - Adding shard to the queue", shard.key);
startedShards.put(shard.key, new IndexShardSnapshotStatus());
}
}
}
if (!startedShards.isEmpty()) {
newSnapshots.put(entry.snapshot(), startedShards);
if (snapshotShards != null) {
// We already saw this snapshot but we need to add more started shards
Map<ShardId, IndexShardSnapshotStatus> shards = new HashMap<>();
// Put all shards that were already running on this node
shards.putAll(snapshotShards.shards);
// Put all newly started shards
shards.putAll(startedShards);
survivors.put(entry.snapshot(), new SnapshotShards(unmodifiableMap(shards)));
} else {
// Brand new snapshot that we haven't seen before
survivors.put(entry.snapshot(), new SnapshotShards(unmodifiableMap(startedShards)));
}
}
} else if (entry.state() == State.ABORTED) {
// Abort all running shards for this snapshot
SnapshotShards snapshotShards = shardSnapshots.get(entry.snapshot());
if (snapshotShards != null) {
for (ObjectObjectCursor<ShardId, ShardSnapshotStatus> shard : entry.shards()) {
IndexShardSnapshotStatus snapshotStatus = snapshotShards.shards.get(shard.key);
if (snapshotStatus != null) {
switch (snapshotStatus.stage()) {
case INIT:
case STARTED:
snapshotStatus.abort();
break;
case FINALIZE:
logger.debug("[{}] trying to cancel snapshot on shard [{}] that is finalizing, letting it finish", entry.snapshot(), shard.key);
break;
case DONE:
logger.debug("[{}] trying to cancel snapshot on the shard [{}] that is already done, updating status on the master", entry.snapshot(), shard.key);
updateIndexShardSnapshotStatus(entry.snapshot(), shard.key,
new ShardSnapshotStatus(localNodeId, State.SUCCESS), masterNode);
break;
case FAILURE:
logger.debug("[{}] trying to cancel snapshot on the shard [{}] that has already failed, updating status on the master", entry.snapshot(), shard.key);
updateIndexShardSnapshotStatus(entry.snapshot(), shard.key,
new ShardSnapshotStatus(localNodeId, State.FAILED, snapshotStatus.failure()), masterNode);
break;
default:
throw new IllegalStateException("Unknown snapshot shard stage " + snapshotStatus.stage());
}
}
}
}
}
}
}
// Update the list of snapshots that we saw and tried to started
// If startup of these shards fails later, we don't want to try starting these shards again
shutdownLock.lock();
try {
shardSnapshots = unmodifiableMap(survivors);
if (shardSnapshots.isEmpty()) {
// Notify all waiting threads that no more snapshots
shutdownCondition.signalAll();
}
} finally {
shutdownLock.unlock();
}
// We have new shards to starts
if (newSnapshots.isEmpty() == false) {
Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT);
for (final Map.Entry<Snapshot, Map<ShardId, IndexShardSnapshotStatus>> entry : newSnapshots.entrySet()) {
Map<String, IndexId> indicesMap = snapshotIndices.get(entry.getKey());
assert indicesMap != null;
for (final Map.Entry<ShardId, IndexShardSnapshotStatus> shardEntry : entry.getValue().entrySet()) {
final ShardId shardId = shardEntry.getKey();
try {
final IndexShard indexShard = indicesService.indexServiceSafe(shardId.getIndex()).getShardOrNull(shardId.id());
final IndexId indexId = indicesMap.get(shardId.getIndexName());
assert indexId != null;
executor.execute(new AbstractRunnable() {
@Override
public void doRun() {
snapshot(indexShard, entry.getKey(), indexId, shardEntry.getValue());
updateIndexShardSnapshotStatus(entry.getKey(), shardId,
new ShardSnapshotStatus(localNodeId, State.SUCCESS), masterNode);
}
@Override
public void onFailure(Exception e) {
logger.warn((Supplier<?>) () -> new ParameterizedMessage("[{}] [{}] failed to create snapshot", shardId, entry.getKey()), e);
updateIndexShardSnapshotStatus(entry.getKey(), shardId,
new ShardSnapshotStatus(localNodeId, State.FAILED, ExceptionsHelper.detailedMessage(e)), masterNode);
}
});
} catch (Exception e) {
updateIndexShardSnapshotStatus(entry.getKey(), shardId,
new ShardSnapshotStatus(localNodeId, State.FAILED, ExceptionsHelper.detailedMessage(e)), masterNode);
}
}
}
}
}
/**
* Creates shard snapshot
*
* @param snapshot snapshot
* @param snapshotStatus snapshot status
*/
private void snapshot(final IndexShard indexShard, final Snapshot snapshot, final IndexId indexId, final IndexShardSnapshotStatus snapshotStatus) {
Repository repository = snapshotsService.getRepositoriesService().repository(snapshot.getRepository());
ShardId shardId = indexShard.shardId();
if (!indexShard.routingEntry().primary()) {
throw new IndexShardSnapshotFailedException(shardId, "snapshot should be performed only on primary");
}
if (indexShard.routingEntry().relocating()) {
// do not snapshot when in the process of relocation of primaries so we won't get conflicts
throw new IndexShardSnapshotFailedException(shardId, "cannot snapshot while relocating");
}
if (indexShard.state() == IndexShardState.CREATED || indexShard.state() == IndexShardState.RECOVERING) {
// shard has just been created, or still recovering
throw new IndexShardSnapshotFailedException(shardId, "shard didn't fully recover yet");
}
try {
// we flush first to make sure we get the latest writes snapshotted
IndexCommit snapshotIndexCommit = indexShard.acquireIndexCommit(true);
try {
repository.snapshotShard(indexShard, snapshot.getSnapshotId(), indexId, snapshotIndexCommit, snapshotStatus);
if (logger.isDebugEnabled()) {
StringBuilder sb = new StringBuilder();
sb.append(" index : version [").append(snapshotStatus.indexVersion()).append("], number_of_files [").append(snapshotStatus.numberOfFiles()).append("] with total_size [").append(new ByteSizeValue(snapshotStatus.totalSize())).append("]\n");
logger.debug("snapshot ({}) completed to {}, took [{}]\n{}", snapshot, repository,
TimeValue.timeValueMillis(snapshotStatus.time()), sb);
}
} finally {
indexShard.releaseIndexCommit(snapshotIndexCommit);
}
} catch (SnapshotFailedEngineException e) {
throw e;
} catch (IndexShardSnapshotFailedException e) {
throw e;
} catch (Exception e) {
throw new IndexShardSnapshotFailedException(shardId, "Failed to snapshot", e);
}
}
/**
* Checks if any shards were processed that the new master doesn't know about
*/
private void syncShardStatsOnNewMaster(ClusterChangedEvent event) {
SnapshotsInProgress snapshotsInProgress = event.state().custom(SnapshotsInProgress.TYPE);
if (snapshotsInProgress == null) {
return;
}
final String localNodeId = event.state().nodes().getLocalNodeId();
final DiscoveryNode masterNode = event.state().nodes().getMasterNode();
for (SnapshotsInProgress.Entry snapshot : snapshotsInProgress.entries()) {
if (snapshot.state() == State.STARTED || snapshot.state() == State.ABORTED) {
Map<ShardId, IndexShardSnapshotStatus> localShards = currentSnapshotShards(snapshot.snapshot());
if (localShards != null) {
ImmutableOpenMap<ShardId, ShardSnapshotStatus> masterShards = snapshot.shards();
for(Map.Entry<ShardId, IndexShardSnapshotStatus> localShard : localShards.entrySet()) {
ShardId shardId = localShard.getKey();
IndexShardSnapshotStatus localShardStatus = localShard.getValue();
ShardSnapshotStatus masterShard = masterShards.get(shardId);
if (masterShard != null && masterShard.state().completed() == false) {
// Master knows about the shard and thinks it has not completed
if (localShardStatus.stage() == Stage.DONE) {
// but we think the shard is done - we need to make new master know that the shard is done
logger.debug("[{}] new master thinks the shard [{}] is not completed but the shard is done locally, updating status on the master", snapshot.snapshot(), shardId);
updateIndexShardSnapshotStatus(snapshot.snapshot(), shardId,
new ShardSnapshotStatus(localNodeId, State.SUCCESS), masterNode);
} else if (localShard.getValue().stage() == Stage.FAILURE) {
// but we think the shard failed - we need to make new master know that the shard failed
logger.debug("[{}] new master thinks the shard [{}] is not completed but the shard failed locally, updating status on master", snapshot.snapshot(), shardId);
updateIndexShardSnapshotStatus(snapshot.snapshot(), shardId,
new ShardSnapshotStatus(localNodeId, State.FAILED, localShardStatus.failure()), masterNode);
}
}
}
}
}
}
}
/**
* Stores the list of shards that has to be snapshotted on this node
*/
private static class SnapshotShards {
private final Map<ShardId, IndexShardSnapshotStatus> shards;
private SnapshotShards(Map<ShardId, IndexShardSnapshotStatus> shards) {
this.shards = shards;
}
}
/**
* Internal request that is used to send changes in snapshot status to master
*/
public static class UpdateIndexShardSnapshotStatusRequest extends TransportRequest {
private Snapshot snapshot;
private ShardId shardId;
private ShardSnapshotStatus status;
public UpdateIndexShardSnapshotStatusRequest() {
}
public UpdateIndexShardSnapshotStatusRequest(Snapshot snapshot, ShardId shardId, ShardSnapshotStatus status) {
this.snapshot = snapshot;
this.shardId = shardId;
this.status = status;
}
@Override
public void readFrom(StreamInput in) throws IOException {
super.readFrom(in);
snapshot = new Snapshot(in);
shardId = ShardId.readShardId(in);
status = new ShardSnapshotStatus(in);
}
@Override
public void writeTo(StreamOutput out) throws IOException {
super.writeTo(out);
snapshot.writeTo(out);
shardId.writeTo(out);
status.writeTo(out);
}
public Snapshot snapshot() {
return snapshot;
}
public ShardId shardId() {
return shardId;
}
public ShardSnapshotStatus status() {
return status;
}
@Override
public String toString() {
return snapshot + ", shardId [" + shardId + "], status [" + status.state() + "]";
}
}
/**
* Updates the shard status
*/
public void updateIndexShardSnapshotStatus(Snapshot snapshot, ShardId shardId, ShardSnapshotStatus status, DiscoveryNode master) {
UpdateIndexShardSnapshotStatusRequest request = new UpdateIndexShardSnapshotStatusRequest(snapshot, shardId, status);
try {
transportService.sendRequest(master, UPDATE_SNAPSHOT_ACTION_NAME, request, EmptyTransportResponseHandler.INSTANCE_SAME);
} catch (Exception e) {
logger.warn((Supplier<?>) () -> new ParameterizedMessage("[{}] [{}] failed to update snapshot state", request.snapshot(), request.status()), e);
}
}
/**
* Updates the shard status on master node
*
* @param request update shard status request
*/
private void innerUpdateSnapshotState(final UpdateIndexShardSnapshotStatusRequest request) {
logger.trace("received updated snapshot restore state [{}]", request);
clusterService.submitStateUpdateTask(
"update snapshot state",
request,
ClusterStateTaskConfig.build(Priority.NORMAL),
snapshotStateExecutor,
(source, e) -> logger.warn((Supplier<?>) () -> new ParameterizedMessage("[{}][{}] failed to update snapshot status to [{}]",
request.snapshot(), request.shardId(), request.status()), e));
}
class SnapshotStateExecutor implements ClusterStateTaskExecutor<UpdateIndexShardSnapshotStatusRequest> {
@Override
public ClusterTasksResult<UpdateIndexShardSnapshotStatusRequest> execute(ClusterState currentState, List<UpdateIndexShardSnapshotStatusRequest> tasks) throws Exception {
final SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
if (snapshots != null) {
int changedCount = 0;
final List<SnapshotsInProgress.Entry> entries = new ArrayList<>();
for (SnapshotsInProgress.Entry entry : snapshots.entries()) {
ImmutableOpenMap.Builder<ShardId, ShardSnapshotStatus> shards = ImmutableOpenMap.builder();
boolean updated = false;
for (UpdateIndexShardSnapshotStatusRequest updateSnapshotState : tasks) {
if (entry.snapshot().equals(updateSnapshotState.snapshot())) {
logger.trace("[{}] Updating shard [{}] with status [{}]", updateSnapshotState.snapshot(), updateSnapshotState.shardId(), updateSnapshotState.status().state());
if (updated == false) {
shards.putAll(entry.shards());
updated = true;
}
shards.put(updateSnapshotState.shardId(), updateSnapshotState.status());
changedCount++;
}
}
if (updated) {
if (completed(shards.values()) == false) {
entries.add(new SnapshotsInProgress.Entry(entry, shards.build()));
} else {
// Snapshot is finished - mark it as done
// TODO: Add PARTIAL_SUCCESS status?
SnapshotsInProgress.Entry updatedEntry = new SnapshotsInProgress.Entry(entry, State.SUCCESS, shards.build());
entries.add(updatedEntry);
// Finalize snapshot in the repository
snapshotsService.endSnapshot(updatedEntry);
logger.info("snapshot [{}] is done", updatedEntry.snapshot());
}
} else {
entries.add(entry);
}
}
if (changedCount > 0) {
logger.trace("changed cluster state triggered by {} snapshot state updates", changedCount);
final SnapshotsInProgress updatedSnapshots = new SnapshotsInProgress(entries.toArray(new SnapshotsInProgress.Entry[entries.size()]));
return ClusterTasksResult.<UpdateIndexShardSnapshotStatusRequest>builder().successes(tasks).build(
ClusterState.builder(currentState).putCustom(SnapshotsInProgress.TYPE, updatedSnapshots).build());
}
}
return ClusterTasksResult.<UpdateIndexShardSnapshotStatusRequest>builder().successes(tasks).build(currentState);
}
}
/**
* Transport request handler that is used to send changes in snapshot status to master
*/
class UpdateSnapshotStateRequestHandler implements TransportRequestHandler<UpdateIndexShardSnapshotStatusRequest> {
@Override
public void messageReceived(UpdateIndexShardSnapshotStatusRequest request, final TransportChannel channel) throws Exception {
innerUpdateSnapshotState(request);
channel.sendResponse(TransportResponse.Empty.INSTANCE);
}
}
}