/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.ambari.server.agent; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import org.apache.ambari.server.AmbariException; import org.apache.ambari.server.events.ClusterConfigChangedEvent; import org.apache.ambari.server.events.MaintenanceModeEvent; import org.apache.ambari.server.events.ServiceComponentInstalledEvent; import org.apache.ambari.server.events.ServiceComponentRecoveryChangedEvent; import org.apache.ambari.server.events.ServiceComponentUninstalledEvent; import org.apache.ambari.server.events.publishers.AmbariEventPublisher; import org.apache.ambari.server.state.Cluster; import org.apache.ambari.server.state.Clusters; import org.apache.ambari.server.state.Config; import org.apache.ambari.server.state.ConfigHelper; import org.apache.ambari.server.state.Host; import org.apache.ambari.server.state.MaintenanceState; import org.apache.ambari.server.state.Service; import org.apache.ambari.server.state.ServiceComponentHost; import org.apache.commons.lang.StringUtils; import com.google.common.eventbus.AllowConcurrentEvents; import com.google.common.eventbus.Subscribe; import com.google.inject.Inject; import com.google.inject.Singleton; @Singleton public class RecoveryConfigHelper { /** * Recovery related configuration */ public static final String RECOVERY_ENABLED_KEY = "recovery_enabled"; public static final String RECOVERY_TYPE_KEY = "recovery_type"; public static final String RECOVERY_TYPE_DEFAULT = "AUTO_START"; public static final String RECOVERY_LIFETIME_MAX_COUNT_KEY = "recovery_lifetime_max_count"; public static final String RECOVERY_LIFETIME_MAX_COUNT_DEFAULT = "12"; public static final String RECOVERY_MAX_COUNT_KEY = "recovery_max_count"; public static final String RECOVERY_MAX_COUNT_DEFAULT = "6"; public static final String RECOVERY_WINDOW_IN_MIN_KEY = "recovery_window_in_minutes"; public static final String RECOVERY_WINDOW_IN_MIN_DEFAULT = "60"; public static final String RECOVERY_RETRY_GAP_KEY = "recovery_retry_interval"; public static final String RECOVERY_RETRY_GAP_DEFAULT = "5"; @Inject private Clusters clusters; /** * Cluster --> Host --> Timestamp */ private ConcurrentHashMap<String, ConcurrentHashMap<String, Long>> timestampMap; @Inject public RecoveryConfigHelper(AmbariEventPublisher eventPublisher) { eventPublisher.register(this); timestampMap = new ConcurrentHashMap<>(); } public RecoveryConfig getDefaultRecoveryConfig() throws AmbariException { return getRecoveryConfig(null, null); } public RecoveryConfig getRecoveryConfig(String clusterName, String hostname) throws AmbariException { long now = System.currentTimeMillis(); if (StringUtils.isNotEmpty(clusterName)) { // Insert or update timestamp for cluster::host ConcurrentHashMap<String, Long> hostTimestamp = timestampMap.get(clusterName); if (hostTimestamp == null) { hostTimestamp = new ConcurrentHashMap<>(); timestampMap.put(clusterName, hostTimestamp); } if (StringUtils.isNotEmpty(hostname)) { hostTimestamp.put(hostname, now); } } AutoStartConfig autoStartConfig = new AutoStartConfig(clusterName); RecoveryConfig recoveryConfig = new RecoveryConfig(); recoveryConfig.setMaxCount(autoStartConfig.getNodeRecoveryMaxCount()); recoveryConfig.setMaxLifetimeCount(autoStartConfig.getNodeRecoveryLifetimeMaxCount()); recoveryConfig.setRetryGap(autoStartConfig.getNodeRecoveryRetryGap()); recoveryConfig.setType(autoStartConfig.getNodeRecoveryType()); recoveryConfig.setWindowInMinutes(autoStartConfig.getNodeRecoveryWindowInMin()); recoveryConfig.setRecoveryTimestamp(now); if (autoStartConfig.isRecoveryEnabled()) { recoveryConfig.setEnabledComponents(StringUtils.join(autoStartConfig.getEnabledComponents(hostname), ',')); } return recoveryConfig; } /** * Computes if the recovery configuration was updated since the last time it was sent to the agent. * * @param clusterName - Name of the cluster which the host belongs to. * @param hostname - Host name from agent. * @param recoveryTimestamp - Time when the recovery configuration was last sent to the agent. Agent * stores this value and sends it during each heartbeat. -1 if agent was * restarted or configuration was not sent to the agent since it started. * @return */ public boolean isConfigStale(String clusterName, String hostname, long recoveryTimestamp) { // Look up the last updated timestamp for the clusterName-->hostname-->timestamp if // it is available. If found, compare it with the timestamp from the agent. It the timestamp // is different from the timestamp sent by the agent, the recovery config on the agent // side is stale and should be sent to the agent during this heartbeat. if (StringUtils.isEmpty(clusterName)) { throw new IllegalArgumentException("clusterName cannot be empty or null."); } if (StringUtils.isEmpty(hostname)) { throw new IllegalArgumentException("hostname cannot be empty or null."); } ConcurrentHashMap<String, Long> hostTimestamp = timestampMap.get(clusterName); if (hostTimestamp == null) { return true; } Long timestamp = hostTimestamp.get(hostname); /* * An agent that did not get the configuration during registration because it * was not yet a part of a cluster but now is will not have an entry. */ if (timestamp == null) { return true; } if (timestamp.longValue() != recoveryTimestamp) { return true; } return false; } /** * Maintenance mode of a host, service or service component host changed. * @param event * @throws AmbariException */ @Subscribe @AllowConcurrentEvents public void handleMaintenanceModeEvent(MaintenanceModeEvent event) throws AmbariException { if (event.getHost() != null) { /* * If any one component in the host is recovery enabled, * invalidate the host timestamp. */ Cluster cluster = clusters.getCluster(event.getClusterId()); if (cluster == null) { return; } Host host = event.getHost(); List<ServiceComponentHost> scHosts = cluster.getServiceComponentHosts(host.getHostName()); for (ServiceComponentHost sch : scHosts) { if (sch.isRecoveryEnabled()) { invalidateRecoveryTimestamp(sch.getClusterName(), sch.getHostName()); break; } } } else if (event.getService() != null) { /** * Simply invalidate all the hosts in the cluster. * The recovery config will be sent to all the hosts * even if some of the hosts do not have components * in recovery mode. * Looping through all the hosts and its components * to determine which host to send the recovery config * may not be efficient. */ Service service = event.getService(); invalidateRecoveryTimestamp(service.getCluster().getClusterName(), null); } else if (event.getServiceComponentHost() != null) { ServiceComponentHost sch = event.getServiceComponentHost(); if (sch.isRecoveryEnabled()) { invalidateRecoveryTimestamp(sch.getClusterName(), sch.getHostName()); } } } /** * A service component was installed on a host. * @param event * @throws AmbariException */ @Subscribe @AllowConcurrentEvents public void handleServiceComponentInstalledEvent(ServiceComponentInstalledEvent event) throws AmbariException { if (event.isRecoveryEnabled()) { Cluster cluster = clusters.getClusterById(event.getClusterId()); if (cluster != null) { invalidateRecoveryTimestamp(cluster.getClusterName(), event.getHostName()); } } } /** * A service component was uninstalled from a host. * @param event * @throws AmbariException */ @Subscribe @AllowConcurrentEvents public void handleServiceComponentUninstalledEvent(ServiceComponentUninstalledEvent event) throws AmbariException { if (event.isRecoveryEnabled()) { Cluster cluster = clusters.getClusterById(event.getClusterId()); if (cluster != null) { invalidateRecoveryTimestamp(cluster.getClusterName(), event.getHostName()); } } } /** * Recovery enabled was turned on or off. * @param event */ @Subscribe @AllowConcurrentEvents public void handleServiceComponentRecoveryChangedEvent(ServiceComponentRecoveryChangedEvent event) { invalidateRecoveryTimestamp(event.getClusterName(), null); } /** * Cluster-env configuration changed. * @param event */ @Subscribe @AllowConcurrentEvents public void handleClusterEnvConfigChangedEvent(ClusterConfigChangedEvent event) { if (StringUtils.equals(event.getConfigType(), ConfigHelper.CLUSTER_ENV)) { invalidateRecoveryTimestamp(event.getClusterName(), null); } } private void invalidateRecoveryTimestamp(String clusterName, String hostname) { if (StringUtils.isNotEmpty(clusterName)) { ConcurrentHashMap<String, Long> hostTimestamp = timestampMap.get(clusterName); if (hostTimestamp != null) { if (StringUtils.isNotEmpty(hostname)) { // Clear the time stamp for the specified host in this cluster hostTimestamp.put(hostname, 0L); } else { // Clear the time stamp for all hosts in this cluster for(Map.Entry<String, Long> hostEntry : hostTimestamp.entrySet()) { hostEntry.setValue(0L); } } } } } /** * Helper class to get auto start configuration */ class AutoStartConfig { private Cluster cluster; private Map<String, String> configProperties; public AutoStartConfig(String clusterName) throws AmbariException { if (StringUtils.isNotEmpty(clusterName)) { cluster = clusters.getCluster(clusterName); } if (cluster != null) { Config config = cluster.getDesiredConfigByType(getConfigType()); if (config != null) { configProperties = config.getProperties(); } } if (configProperties == null) { configProperties = new HashMap<>(); } } /** * Get a list of enabled components for the specified host and cluster. Filter by * Maintenance Mode OFF, so that agent does not auto start components that are in * maintenance mode. * @return */ private List<String> getEnabledComponents(String hostname) throws AmbariException { List<String> enabledComponents = new ArrayList<>(); if (cluster == null) { return enabledComponents; } Host host = clusters.getHost(hostname); if (host == null) { return enabledComponents; } // if host is in maintenance mode then ignore all the components for auto start if (host.getMaintenanceState(cluster.getClusterId()) == MaintenanceState.ON) { return enabledComponents; } List<ServiceComponentHost> scHosts = cluster.getServiceComponentHosts(hostname); for (ServiceComponentHost sch : scHosts) { if (sch.isRecoveryEnabled()) { Service service = cluster.getService(sch.getServiceName()); // service should not be in maintenance mode if (service.getMaintenanceState() == MaintenanceState.OFF) { // Keep the components that are not in maintenance mode. if (sch.getMaintenanceState() == MaintenanceState.OFF) { enabledComponents.add(sch.getServiceComponentName()); } } } } return enabledComponents; } /** * The configuration type name. * @return */ private String getConfigType() { return "cluster-env"; } /** * Get a value indicating whether the cluster supports recovery. * * @return True or false. */ private boolean isRecoveryEnabled() { return Boolean.parseBoolean(getProperty(RECOVERY_ENABLED_KEY, "false")); } /** * Get the node recovery type. The only supported value is AUTO_START. * @return */ private String getNodeRecoveryType() { return getProperty(RECOVERY_TYPE_KEY, RECOVERY_TYPE_DEFAULT); } /** * Get configured max count of recovery attempt allowed per host component in a window * This is reset when agent is restarted. * @return */ private String getNodeRecoveryMaxCount() { return getProperty(RECOVERY_MAX_COUNT_KEY, RECOVERY_MAX_COUNT_DEFAULT); } /** * Get configured max lifetime count of recovery attempt allowed per host component. * This is reset when agent is restarted. * @return */ private String getNodeRecoveryLifetimeMaxCount() { return getProperty(RECOVERY_LIFETIME_MAX_COUNT_KEY, RECOVERY_LIFETIME_MAX_COUNT_DEFAULT); } /** * Get configured window size in minutes * @return */ private String getNodeRecoveryWindowInMin() { return getProperty(RECOVERY_WINDOW_IN_MIN_KEY, RECOVERY_WINDOW_IN_MIN_DEFAULT); } /** * Get the configured retry gap between tries per host component * @return */ private String getNodeRecoveryRetryGap() { return getProperty(RECOVERY_RETRY_GAP_KEY, RECOVERY_RETRY_GAP_DEFAULT); } /** * Get the property value for the specified key. If not present, return default value. * @param key The key for which property value is required. * @param defaultValue Default value to return if key is not found. * @return */ private String getProperty(String key, String defaultValue) { if (configProperties.containsKey(key)) { return configProperties.get(key); } return defaultValue; } } }