/* * Licensed to Crate under one or more contributor license agreements. * See the NOTICE file distributed with this work for additional * information regarding copyright ownership. Crate licenses this file * to you under the Apache License, Version 2.0 (the "License"); you may * not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. * * However, if you have executed another commercial license agreement * with Crate these terms will supersede the license and you may use the * software solely pursuant to the terms of the relevant commercial * agreement. */ package io.crate.cluster.gracefulstop; import com.google.common.annotations.VisibleForTesting; import io.crate.action.sql.SQLOperations; import io.crate.operation.collect.stats.JobsLogs; import io.crate.settings.CrateSetting; import io.crate.types.DataTypes; import org.elasticsearch.action.ActionListener; import org.elasticsearch.action.admin.cluster.health.ClusterHealthRequest; import org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse; import org.elasticsearch.action.admin.cluster.health.TransportClusterHealthAction; import org.elasticsearch.action.admin.cluster.settings.ClusterUpdateSettingsRequest; import org.elasticsearch.action.admin.cluster.settings.ClusterUpdateSettingsResponse; import org.elasticsearch.action.admin.cluster.settings.TransportClusterUpdateSettingsAction; import org.elasticsearch.cluster.ClusterChangedEvent; import org.elasticsearch.cluster.ClusterStateListener; import org.elasticsearch.cluster.node.DiscoveryNode; import org.elasticsearch.cluster.node.DiscoveryNodes; import org.elasticsearch.cluster.service.ClusterService; import org.elasticsearch.common.Priority; import org.elasticsearch.common.collect.MapBuilder; import org.elasticsearch.common.component.AbstractLifecycleComponent; import org.elasticsearch.common.inject.Inject; import org.elasticsearch.common.inject.Singleton; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.TimeValue; import org.elasticsearch.threadpool.ThreadPool; import sun.misc.Signal; import sun.misc.SignalHandler; import javax.annotation.Nullable; import java.util.HashMap; import java.util.Map; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @Singleton public class DecommissioningService extends AbstractLifecycleComponent implements SignalHandler, ClusterStateListener { static final String DECOMMISSION_PREFIX = "crate.internal.decommission."; public static final CrateSetting<Settings> DECOMMISSION_INTERNAL_SETTING_GROUP = CrateSetting.of(Setting.groupSetting( DECOMMISSION_PREFIX, Setting.Property.NodeScope, Setting.Property.Dynamic), DataTypes.OBJECT); public static final CrateSetting<DataAvailability> GRACEFUL_STOP_MIN_AVAILABILITY_SETTING = CrateSetting.of(new Setting<>( "cluster.graceful_stop.min_availability", DataAvailability.PRIMARIES.name(), DataAvailability::of, Setting.Property.Dynamic, Setting.Property.NodeScope), DataTypes.STRING); public static final CrateSetting<Boolean> GRACEFUL_STOP_REALLOCATE_SETTING = CrateSetting.of(Setting.boolSetting( "cluster.graceful_stop.reallocate", true, Setting.Property.Dynamic, Setting.Property.NodeScope), DataTypes.BOOLEAN); public static final CrateSetting<Boolean> GRACEFUL_STOP_FORCE_SETTING = CrateSetting.of(Setting.boolSetting( "cluster.graceful_stop.force", false, Setting.Property.Dynamic, Setting.Property.NodeScope), DataTypes.BOOLEAN); public static final CrateSetting<TimeValue> GRACEFUL_STOP_TIMEOUT_SETTING = CrateSetting.of(Setting.positiveTimeSetting( "cluster.graceful_stop.timeout", new TimeValue(7_200_000), Setting.Property.Dynamic, Setting.Property.NodeScope), DataTypes.STRING); private final ClusterService clusterService; private final JobsLogs jobsLogs; private final ThreadPool threadPool; private final SQLOperations sqlOperations; private final TransportClusterHealthAction healthAction; private final TransportClusterUpdateSettingsAction updateSettingsAction; private TimeValue gracefulStopTimeout; private Boolean forceStop; private DataAvailability dataAvailability; @Inject public DecommissioningService(Settings settings, final ClusterService clusterService, JobsLogs jobsLogs, ThreadPool threadPool, SQLOperations sqlOperations, final TransportClusterHealthAction healthAction, final TransportClusterUpdateSettingsAction updateSettingsAction) { super(settings); this.clusterService = clusterService; this.jobsLogs = jobsLogs; this.threadPool = threadPool; this.sqlOperations = sqlOperations; this.healthAction = healthAction; this.updateSettingsAction = updateSettingsAction; gracefulStopTimeout = GRACEFUL_STOP_TIMEOUT_SETTING.setting().get(settings); forceStop = GRACEFUL_STOP_FORCE_SETTING.setting().get(settings); dataAvailability = GRACEFUL_STOP_MIN_AVAILABILITY_SETTING.setting().get(settings); ClusterSettings clusterSettings = clusterService.getClusterSettings(); clusterSettings.addSettingsUpdateConsumer(GRACEFUL_STOP_TIMEOUT_SETTING.setting(), this::setGracefulStopTimeout); clusterSettings.addSettingsUpdateConsumer(GRACEFUL_STOP_FORCE_SETTING.setting(), this::setGracefulStopForce); clusterSettings.addSettingsUpdateConsumer(GRACEFUL_STOP_MIN_AVAILABILITY_SETTING.setting(), this::setDataAvailability); try { Signal signal = new Signal("USR2"); Signal.handle(signal, this); } catch (IllegalArgumentException e) { logger.warn("SIGUSR2 signal not supported on {}.", System.getProperty("os.name"), e); } } private void removeRemovedNodes(ClusterChangedEvent event) { if (!event.localNodeMaster() || !event.nodesRemoved()) { return; } Map<String, Object> removedDecommissionedNodes = getRemovedDecommissionedNodes( event.nodesDelta(), event.state().metaData().transientSettings()); if (removedDecommissionedNodes != null) { updateSettingsAction.execute(new ClusterUpdateSettingsRequest().transientSettings(removedDecommissionedNodes)); } } @Nullable private static Map<String, Object> getRemovedDecommissionedNodes(DiscoveryNodes.Delta nodesDelta, Settings transientSettings) { Map<String, Object> toRemove = null; for (DiscoveryNode discoveryNode : nodesDelta.removedNodes()) { Map<String, String> asMap = DECOMMISSION_INTERNAL_SETTING_GROUP.setting().get(transientSettings).getAsMap(); String nodeId = discoveryNode.getId(); if (asMap.containsKey(nodeId)) { if (toRemove == null) { toRemove = new HashMap<>(); } toRemove.put(DECOMMISSION_PREFIX + nodeId, null); } } return toRemove; } @Override protected void doStart() { // add listener here to avoid guice proxy errors if the ClusterService could not be build clusterService.add(this); } @Override public void clusterChanged(ClusterChangedEvent event) { removeRemovedNodes(event); } private void decommission() { // fail on new requests so that clients don't use this node anymore sqlOperations.disable(); /* * setting this setting will cause the {@link DecommissionAllocationDecider} to prevent allocations onto this node * * nodeIds are part of the key to prevent conflicts if other nodes are being decommissioned in parallel */ Settings settings = Settings.builder().put( DECOMMISSION_PREFIX + clusterService.localNode().getId(), true).build(); updateSettingsAction.execute(new ClusterUpdateSettingsRequest().transientSettings(settings), new ActionListener<ClusterUpdateSettingsResponse>() { @Override public void onResponse(ClusterUpdateSettingsResponse clusterUpdateSettingsResponse) { // changing settings triggers AllocationService.reroute -> shards will be relocated // NOTE: it waits for ALL relocating shards, not just those that involve THIS node. ClusterHealthRequest request = new ClusterHealthRequest() .waitForNoRelocatingShards(true) .waitForEvents(Priority.LANGUID) .timeout(gracefulStopTimeout); if (dataAvailability == DataAvailability.FULL) { request = request.waitForGreenStatus(); } else { request = request.waitForYellowStatus(); } final long startTime = System.nanoTime(); healthAction.execute(request, new ActionListener<ClusterHealthResponse>() { @Override public void onResponse(ClusterHealthResponse clusterHealthResponse) { exitIfNoActiveRequests(startTime); } @Override public void onFailure(Exception e) { forceStopOrAbort(e); } }); } @Override public void onFailure(Exception e) { logger.error("Couldn't set settings. Graceful shutdown failed", e); } }); } void forceStopOrAbort(@Nullable Throwable e) { if (forceStop) { exit(); } else { logger.warn("Aborting graceful shutdown due to error", e); removeDecommissioningSetting(); sqlOperations.enable(); } } void exitIfNoActiveRequests(final long startTime) { if (jobsLogs.activeRequests() == 0L) { exit(); return; } if (System.nanoTime() - startTime > gracefulStopTimeout.nanos()) { forceStopOrAbort(new TimeoutException("gracefulStopTimeout reached - waited too long for pending requests to finish")); return; } logger.info("There are still active requests on this node, delaying graceful shutdown"); // use scheduler instead of busy loop to avoid blocking a listener thread threadPool.scheduler().schedule(new Runnable() { @Override public void run() { exitIfNoActiveRequests(startTime); } }, 5, TimeUnit.SECONDS); } void exit() { System.exit(0); } @VisibleForTesting protected void removeDecommissioningSetting() { Map<String, Object> settingsToRemove = MapBuilder.<String, Object>newMapBuilder() .put(DECOMMISSION_PREFIX + clusterService.localNode().getId(), null) .map(); updateSettingsAction.execute(new ClusterUpdateSettingsRequest().transientSettings(settingsToRemove)); } @Override protected void doStop() { clusterService.remove(this); } @Override protected void doClose() { } @Override public void handle(Signal signal) { if (dataAvailability == DataAvailability.NONE) { System.exit(0); } else { decommission(); } } private void setGracefulStopTimeout(TimeValue gracefulStopTimeout) { this.gracefulStopTimeout = gracefulStopTimeout; } private void setGracefulStopForce(boolean forceStop) { this.forceStop = forceStop; } private void setDataAvailability(DataAvailability dataAvailability) { this.dataAvailability = dataAvailability; } }