/* * Licensed to Elasticsearch under one or more contributor * license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright * ownership. Elasticsearch licenses this file to you under * the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ package org.elasticsearch.indices.store; import org.apache.lucene.store.StoreRateLimiting; import org.elasticsearch.cluster.*; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.routing.IndexRoutingTable; import org.elasticsearch.cluster.routing.IndexShardRoutingTable; import org.elasticsearch.cluster.routing.ShardRouting; import org.elasticsearch.common.collect.Tuple; import org.elasticsearch.common.component.AbstractComponent; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.ByteSizeUnit; import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException; import org.elasticsearch.index.IndexService; import org.elasticsearch.index.shard.IndexShard; import org.elasticsearch.index.shard.IndexShardState; import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.indices.IndicesService; import org.elasticsearch.node.settings.NodeSettingsService; import org.elasticsearch.threadpool.ThreadPool; import org.elasticsearch.transport.*; import java.io.Closeable; import java.io.IOException; import java.util.ArrayList; import java.util.EnumSet; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; /** * */ public class IndicesStore extends AbstractComponent implements ClusterStateListener, Closeable { public static final String INDICES_STORE_THROTTLE_TYPE = "indices.store.throttle.type"; public static final String INDICES_STORE_THROTTLE_MAX_BYTES_PER_SEC = "indices.store.throttle.max_bytes_per_sec"; public static final String INDICES_STORE_DELETE_SHARD_TIMEOUT = "indices.store.delete.shard.timeout"; public static final String ACTION_SHARD_EXISTS = "internal:index/shard/exists"; private static final EnumSet<IndexShardState> ACTIVE_STATES = EnumSet.of(IndexShardState.STARTED, IndexShardState.RELOCATED); class ApplySettings implements NodeSettingsService.Listener { @Override public void onRefreshSettings(Settings settings) { String rateLimitingType = settings.get(INDICES_STORE_THROTTLE_TYPE, IndicesStore.this.rateLimitingType); // try and parse the type StoreRateLimiting.Type.fromString(rateLimitingType); if (!rateLimitingType.equals(IndicesStore.this.rateLimitingType)) { logger.info("updating indices.store.throttle.type from [{}] to [{}]", IndicesStore.this.rateLimitingType, rateLimitingType); IndicesStore.this.rateLimitingType = rateLimitingType; IndicesStore.this.rateLimiting.setType(rateLimitingType); } ByteSizeValue rateLimitingThrottle = settings.getAsBytesSize(INDICES_STORE_THROTTLE_MAX_BYTES_PER_SEC, IndicesStore.this.rateLimitingThrottle); if (!rateLimitingThrottle.equals(IndicesStore.this.rateLimitingThrottle)) { logger.info("updating indices.store.throttle.max_bytes_per_sec from [{}] to [{}], note, type is [{}]", IndicesStore.this.rateLimitingThrottle, rateLimitingThrottle, IndicesStore.this.rateLimitingType); IndicesStore.this.rateLimitingThrottle = rateLimitingThrottle; IndicesStore.this.rateLimiting.setMaxRate(rateLimitingThrottle); } } } private final NodeSettingsService nodeSettingsService; private final IndicesService indicesService; private final ClusterService clusterService; private final TransportService transportService; private volatile String rateLimitingType; private volatile ByteSizeValue rateLimitingThrottle; private final StoreRateLimiting rateLimiting = new StoreRateLimiting(); private final ApplySettings applySettings = new ApplySettings(); private TimeValue deleteShardTimeout; @Inject public IndicesStore(Settings settings, NodeSettingsService nodeSettingsService, IndicesService indicesService, ClusterService clusterService, TransportService transportService) { super(settings); this.nodeSettingsService = nodeSettingsService; this.indicesService = indicesService; this.clusterService = clusterService; this.transportService = transportService; transportService.registerRequestHandler(ACTION_SHARD_EXISTS, ShardActiveRequest.class, ThreadPool.Names.SAME, new ShardActiveRequestHandler()); // we don't limit by default (we default to CMS's auto throttle instead): this.rateLimitingType = settings.get("indices.store.throttle.type", StoreRateLimiting.Type.NONE.name()); rateLimiting.setType(rateLimitingType); this.rateLimitingThrottle = settings.getAsBytesSize("indices.store.throttle.max_bytes_per_sec", new ByteSizeValue(10240, ByteSizeUnit.MB)); rateLimiting.setMaxRate(rateLimitingThrottle); this.deleteShardTimeout = settings.getAsTime(INDICES_STORE_DELETE_SHARD_TIMEOUT, new TimeValue(30, TimeUnit.SECONDS)); logger.debug("using indices.store.throttle.type [{}], with index.store.throttle.max_bytes_per_sec [{}]", rateLimitingType, rateLimitingThrottle); nodeSettingsService.addListener(applySettings); if (DiscoveryNode.dataNode(settings)) { clusterService.addLast(this); } } IndicesStore() { super(Settings.EMPTY); nodeSettingsService = null; indicesService = null; this.clusterService = null; this.transportService = null; } public StoreRateLimiting rateLimiting() { return this.rateLimiting; } @Override public void close() { nodeSettingsService.removeListener(applySettings); if (DiscoveryNode.dataNode(settings)) { clusterService.remove(this); } } @Override public void clusterChanged(ClusterChangedEvent event) { /* * No more relocated shard to remove with elassandra. if (!event.routingTableChanged()) { return; } if (event.state().blocks().disableStatePersistence()) { return; } for (IndexRoutingTable indexRoutingTable : event.state().routingTable()) { // Note, closed indices will not have any routing information, so won't be deleted for (IndexShardRoutingTable indexShardRoutingTable : indexRoutingTable) { if (shardCanBeDeleted(event.state(), indexShardRoutingTable)) { ShardId shardId = indexShardRoutingTable.shardId(); if (indicesService.canDeleteShardContent(shardId, event.state().getMetaData().index(shardId.getIndex()))) { deleteShardIfExistElseWhere(event.state(), indexShardRoutingTable); } } } } */ } boolean shardCanBeDeleted(ClusterState state, IndexShardRoutingTable indexShardRoutingTable) { // a shard can be deleted if all its copies are active, and its not allocated on this node if (indexShardRoutingTable.size() == 0) { // should not really happen, there should always be at least 1 (primary) shard in a // shard replication group, in any case, protected from deleting something by mistake return false; } for (ShardRouting shardRouting : indexShardRoutingTable) { // be conservative here, check on started, not even active if (!shardRouting.started()) { return false; } // if the allocated or relocation node id doesn't exists in the cluster state it may be a stale node, // make sure we don't do anything with this until the routing table has properly been rerouted to reflect // the fact that the node does not exists DiscoveryNode node = state.nodes().get(shardRouting.currentNodeId()); if (node == null) { return false; } if (shardRouting.relocatingNodeId() != null) { node = state.nodes().get(shardRouting.relocatingNodeId()); if (node == null) { return false; } } // check if shard is active on the current node or is getting relocated to the our node String localNodeId = state.getNodes().localNode().id(); if (localNodeId.equals(shardRouting.currentNodeId()) || localNodeId.equals(shardRouting.relocatingNodeId())) { return false; } } return true; } // TODO will have to ammend this for shadow replicas so we don't delete the shared copy... private void deleteShardIfExistElseWhere(ClusterState state, IndexShardRoutingTable indexShardRoutingTable) { List<Tuple<DiscoveryNode, ShardActiveRequest>> requests = new ArrayList<>(indexShardRoutingTable.size()); String indexUUID = state.getMetaData().index(indexShardRoutingTable.shardId().getIndex()).getIndexUUID(); ClusterName clusterName = state.getClusterName(); for (ShardRouting shardRouting : indexShardRoutingTable) { // Node can't be null, because otherwise shardCanBeDeleted() would have returned false DiscoveryNode currentNode = state.nodes().get(shardRouting.currentNodeId()); assert currentNode != null; requests.add(new Tuple<>(currentNode, new ShardActiveRequest(clusterName, indexUUID, shardRouting.shardId(), deleteShardTimeout))); if (shardRouting.relocatingNodeId() != null) { DiscoveryNode relocatingNode = state.nodes().get(shardRouting.relocatingNodeId()); assert relocatingNode != null; requests.add(new Tuple<>(relocatingNode, new ShardActiveRequest(clusterName, indexUUID, shardRouting.shardId(), deleteShardTimeout))); } } ShardActiveResponseHandler responseHandler = new ShardActiveResponseHandler(indexShardRoutingTable.shardId(), state, requests.size()); for (Tuple<DiscoveryNode, ShardActiveRequest> request : requests) { logger.trace("{} sending shard active check to {}", request.v2().shardId, request.v1()); transportService.sendRequest(request.v1(), ACTION_SHARD_EXISTS, request.v2(), responseHandler); } } private class ShardActiveResponseHandler implements TransportResponseHandler<ShardActiveResponse> { private final ShardId shardId; private final int expectedActiveCopies; private final ClusterState clusterState; private final AtomicInteger awaitingResponses; private final AtomicInteger activeCopies; public ShardActiveResponseHandler(ShardId shardId, ClusterState clusterState, int expectedActiveCopies) { this.shardId = shardId; this.expectedActiveCopies = expectedActiveCopies; this.clusterState = clusterState; this.awaitingResponses = new AtomicInteger(expectedActiveCopies); this.activeCopies = new AtomicInteger(); } @Override public ShardActiveResponse newInstance() { return new ShardActiveResponse(); } @Override public void handleResponse(ShardActiveResponse response) { logger.trace("{} is {}active on node {}", shardId, response.shardActive ? "" : "not ", response.node); if (response.shardActive) { activeCopies.incrementAndGet(); } if (awaitingResponses.decrementAndGet() == 0) { allNodesResponded(); } } @Override public void handleException(TransportException exp) { logger.debug("shards active request failed for {}", exp, shardId); if (awaitingResponses.decrementAndGet() == 0) { allNodesResponded(); } } @Override public String executor() { return ThreadPool.Names.SAME; } private void allNodesResponded() { if (activeCopies.get() != expectedActiveCopies) { logger.trace("not deleting shard {}, expected {} active copies, but only {} found active copies", shardId, expectedActiveCopies, activeCopies.get()); return; } ClusterState latestClusterState = clusterService.state(); if (clusterState.getVersion() != latestClusterState.getVersion()) { logger.trace("not deleting shard {}, the latest cluster state version[{}] is not equal to cluster state before shard active api call [{}]", shardId, latestClusterState.getVersion(), clusterState.getVersion()); return; } clusterService.submitStateUpdateTask("indices_store ([" + shardId + "] active fully on other nodes)", new ClusterStateNonMasterUpdateTask() { @Override public ClusterState execute(ClusterState currentState) throws Exception { if (clusterState.getVersion() != currentState.getVersion()) { logger.trace("not deleting shard {}, the update task state version[{}] is not equal to cluster state before shard active api call [{}]", shardId, currentState.getVersion(), clusterState.getVersion()); return currentState; } try { indicesService.deleteShardStore("no longer used", shardId, currentState); } catch (Throwable ex) { logger.debug("{} failed to delete unallocated shard, ignoring", ex, shardId); } return currentState; } @Override public void onFailure(String source, Throwable t) { logger.error("{} unexpected error during deletion of unallocated shard", t, shardId); } }); } } private class ShardActiveRequestHandler extends TransportRequestHandler<ShardActiveRequest> { @Override public void messageReceived(final ShardActiveRequest request, final TransportChannel channel) throws Exception { IndexShard indexShard = getShard(request); // make sure shard is really there before register cluster state observer if (indexShard == null) { channel.sendResponse(new ShardActiveResponse(false, clusterService.localNode())); } else { // create observer here. we need to register it here because we need to capture the current cluster state // which will then be compared to the one that is applied when we call waitForNextChange(). if we create it // later we might miss an update and wait forever in case no new cluster state comes in. // in general, using a cluster state observer here is a workaround for the fact that we cannot listen on shard state changes explicitly. // instead we wait for the cluster state changes because we know any shard state change will trigger or be // triggered by a cluster state change. ClusterStateObserver observer = new ClusterStateObserver(clusterService, request.timeout, logger); // check if shard is active. if so, all is good boolean shardActive = shardActive(indexShard); if (shardActive) { channel.sendResponse(new ShardActiveResponse(true, clusterService.localNode())); } else { // shard is not active, might be POST_RECOVERY so check if cluster state changed inbetween or wait for next change observer.waitForNextChange(new ClusterStateObserver.Listener() { @Override public void onNewClusterState(ClusterState state) { sendResult(shardActive(getShard(request))); } @Override public void onClusterServiceClose() { sendResult(false); } @Override public void onTimeout(TimeValue timeout) { sendResult(shardActive(getShard(request))); } public void sendResult(boolean shardActive) { try { channel.sendResponse(new ShardActiveResponse(shardActive, clusterService.localNode())); } catch (IOException e) { logger.error("failed send response for shard active while trying to delete shard {} - shard will probably not be removed", e, request.shardId); } catch (EsRejectedExecutionException e) { logger.error("failed send response for shard active while trying to delete shard {} - shard will probably not be removed", e, request.shardId); } } }, new ClusterStateObserver.ValidationPredicate() { @Override protected boolean validate(ClusterState newState) { // the shard is not there in which case we want to send back a false (shard is not active), so the cluster state listener must be notified // or the shard is active in which case we want to send back that the shard is active // here we could also evaluate the cluster state and get the information from there. we // don't do it because we would have to write another method for this that would have the same effect IndexShard indexShard = getShard(request); return indexShard == null || shardActive(indexShard); } }); } } } private boolean shardActive(IndexShard indexShard) { if (indexShard != null) { return ACTIVE_STATES.contains(indexShard.state()); } return false; } private IndexShard getShard(ShardActiveRequest request) { ClusterName thisClusterName = clusterService.state().getClusterName(); if (!thisClusterName.equals(request.clusterName)) { logger.trace("shard exists request meant for cluster[{}], but this is cluster[{}], ignoring request", request.clusterName, thisClusterName); return null; } ShardId shardId = request.shardId; IndexService indexService = indicesService.indexService(shardId.index().getName()); if (indexService != null && indexService.indexUUID().equals(request.indexUUID)) { return indexService.shard(shardId.id()); } return null; } } public static class ShardActiveRequest extends TransportRequest { protected TimeValue timeout = null; private ClusterName clusterName; private String indexUUID; private ShardId shardId; public ShardActiveRequest() { } ShardActiveRequest(ClusterName clusterName, String indexUUID, ShardId shardId, TimeValue timeout) { this.shardId = shardId; this.indexUUID = indexUUID; this.clusterName = clusterName; this.timeout = timeout; } @Override public void readFrom(StreamInput in) throws IOException { super.readFrom(in); clusterName = ClusterName.readClusterName(in); indexUUID = in.readString(); shardId = ShardId.readShardId(in); timeout = new TimeValue(in.readLong(), TimeUnit.MILLISECONDS); } @Override public void writeTo(StreamOutput out) throws IOException { super.writeTo(out); clusterName.writeTo(out); out.writeString(indexUUID); shardId.writeTo(out); out.writeLong(timeout.millis()); } } private static class ShardActiveResponse extends TransportResponse { private boolean shardActive; private DiscoveryNode node; ShardActiveResponse() { } ShardActiveResponse(boolean shardActive, DiscoveryNode node) { this.shardActive = shardActive; this.node = node; } @Override public void readFrom(StreamInput in) throws IOException { super.readFrom(in); shardActive = in.readBoolean(); node = DiscoveryNode.readNode(in); } @Override public void writeTo(StreamOutput out) throws IOException { super.writeTo(out); out.writeBoolean(shardActive); node.writeTo(out); } } }