/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.ambari.server.agent; import static org.apache.ambari.server.agent.ExecutionCommand.KeyNames.COMMAND_TIMEOUT; import static org.apache.ambari.server.agent.ExecutionCommand.KeyNames.HOOKS_FOLDER; import static org.apache.ambari.server.agent.ExecutionCommand.KeyNames.JDK_LOCATION; import static org.apache.ambari.server.agent.ExecutionCommand.KeyNames.SCRIPT; import static org.apache.ambari.server.agent.ExecutionCommand.KeyNames.SCRIPT_TYPE; import static org.apache.ambari.server.agent.ExecutionCommand.KeyNames.SERVICE_PACKAGE_FOLDER; import static org.apache.ambari.server.agent.ExecutionCommand.KeyNames.STACK_NAME; import static org.apache.ambari.server.agent.ExecutionCommand.KeyNames.STACK_VERSION; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import org.apache.ambari.server.AmbariException; import org.apache.ambari.server.RoleCommand; import org.apache.ambari.server.actionmanager.ActionManager; import org.apache.ambari.server.api.services.AmbariMetaInfo; import org.apache.ambari.server.configuration.Configuration; import org.apache.ambari.server.controller.AmbariManagementController; import org.apache.ambari.server.state.Cluster; import org.apache.ambari.server.state.Clusters; import org.apache.ambari.server.state.CommandScriptDefinition; import org.apache.ambari.server.state.ComponentInfo; import org.apache.ambari.server.state.Config; import org.apache.ambari.server.state.ConfigHelper; import org.apache.ambari.server.state.DesiredConfig; import org.apache.ambari.server.state.Host; import org.apache.ambari.server.state.HostState; import org.apache.ambari.server.state.Service; import org.apache.ambari.server.state.ServiceComponent; import org.apache.ambari.server.state.ServiceComponentHost; import org.apache.ambari.server.state.ServiceInfo; import org.apache.ambari.server.state.StackId; import org.apache.ambari.server.state.StackInfo; import org.apache.ambari.server.state.State; import org.apache.ambari.server.state.fsm.InvalidStateTransitionException; import org.apache.ambari.server.state.host.HostHeartbeatLostEvent; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import com.google.inject.Injector; /** * Monitors the node state and heartbeats. */ public class HeartbeatMonitor implements Runnable { private static Log LOG = LogFactory.getLog(HeartbeatMonitor.class); private Clusters clusters; private ActionQueue actionQueue; private ActionManager actionManager; private final int threadWakeupInterval; //1 minute private volatile boolean shouldRun = true; private Thread monitorThread = null; private final ConfigHelper configHelper; private final AmbariMetaInfo ambariMetaInfo; private final AmbariManagementController ambariManagementController; private final Configuration configuration; private final AgentRequests agentRequests; public HeartbeatMonitor(Clusters clusters, ActionQueue aq, ActionManager am, int threadWakeupInterval, Injector injector) { this.clusters = clusters; actionQueue = aq; actionManager = am; this.threadWakeupInterval = threadWakeupInterval; configHelper = injector.getInstance(ConfigHelper.class); ambariMetaInfo = injector.getInstance(AmbariMetaInfo.class); ambariManagementController = injector.getInstance( AmbariManagementController.class); configuration = injector.getInstance(Configuration.class); agentRequests = new AgentRequests(); } public void shutdown() { shouldRun = false; } public void start() { monitorThread = new Thread(this, "ambari-hearbeat-monitor"); monitorThread.start(); } void join(long millis) throws InterruptedException { monitorThread.join(millis); } public boolean isAlive() { return monitorThread.isAlive(); } public AgentRequests getAgentRequests() { return agentRequests; } @Override public void run() { while (shouldRun) { try { doWork(); LOG.trace("Putting monitor to sleep for " + threadWakeupInterval + " " + "milliseconds"); Thread.sleep(threadWakeupInterval); } catch (InterruptedException ex) { LOG.warn("Scheduler thread is interrupted going to stop", ex); shouldRun = false; } catch (Exception ex) { LOG.warn("Exception received", ex); } catch (Throwable t) { LOG.warn("ERROR", t); } } } //Go through all the nodes, check for last heartbeat or any waiting state //If heartbeat is lost, update node clusters state, purge the action queue //notify action manager for node failure. private void doWork() throws InvalidStateTransitionException, AmbariException { List<Host> allHosts = clusters.getHosts(); long now = System.currentTimeMillis(); for (Host hostObj : allHosts) { if (hostObj.getState() == HostState.HEARTBEAT_LOST) { //do not check if host already known be lost continue; } String host = hostObj.getHostName(); HostState hostState = hostObj.getState(); String hostname = hostObj.getHostName(); long lastHeartbeat = 0; try { lastHeartbeat = clusters.getHost(host).getLastHeartbeatTime(); } catch (AmbariException e) { LOG.warn("Exception in getting host object; Is it fatal?", e); } if (lastHeartbeat + 2 * threadWakeupInterval < now) { LOG.warn("Heartbeat lost from host " + host); //Heartbeat is expired hostObj.handleEvent(new HostHeartbeatLostEvent(host)); // mark all components that are not clients with unknown status for (Cluster cluster : clusters.getClustersForHost(hostObj.getHostName())) { for (ServiceComponentHost sch : cluster.getServiceComponentHosts(hostObj.getHostName())) { Service s = cluster.getService(sch.getServiceName()); ServiceComponent sc = s.getServiceComponent(sch.getServiceComponentName()); if (!sc.isClientComponent() && !sch.getState().equals(State.INIT) && !sch.getState().equals(State.INSTALLING) && !sch.getState().equals(State.INSTALL_FAILED) && !sch.getState().equals(State.UNINSTALLED) && !sch.getState().equals(State.DISABLED)) { LOG.warn("Setting component state to UNKNOWN for component " + sc.getName() + " on " + host); sch.setState(State.UNKNOWN); } } } //Purge action queue actionQueue.dequeueAll(host); //notify action manager actionManager.handleLostHost(host); } if (hostState == HostState.WAITING_FOR_HOST_STATUS_UPDATES) { long timeSpentInState = hostObj.getTimeInState(); if (timeSpentInState + 5 * threadWakeupInterval < now) { //Go back to init, the agent will be asked to register again in the next heartbeat LOG.warn("timeSpentInState + 5*threadWakeupInterval < now, Go back to init"); hostObj.setState(HostState.INIT); } } // Get status of service components List<StatusCommand> cmds = generateStatusCommands(hostname); LOG.trace("Generated " + cmds.size() + " status commands for host: " + hostname); if (cmds.isEmpty()) { // Nothing to do } else { for (StatusCommand command : cmds) { actionQueue.enqueue(hostname, command); } } } } /** * @param hostname * @return list of commands to get status of service components on a concrete host */ public List<StatusCommand> generateStatusCommands(String hostname) throws AmbariException { List<StatusCommand> cmds = new ArrayList<>(); for (Cluster cl : clusters.getClustersForHost(hostname)) { Map<String, DesiredConfig> desiredConfigs = cl.getDesiredConfigs(); for (ServiceComponentHost sch : cl.getServiceComponentHosts(hostname)) { switch (sch.getState()) { case INIT: case INSTALLING: case STARTING: case STOPPING: //don't send commands until component is installed at least continue; default: StatusCommand statusCmd = createStatusCommand(hostname, cl, sch, desiredConfigs); cmds.add(statusCmd); } } } return cmds; } /** * Generates status command and fills all appropriate fields. * @throws AmbariException */ private StatusCommand createStatusCommand(String hostname, Cluster cluster, ServiceComponentHost sch, Map<String, DesiredConfig> desiredConfigs) throws AmbariException { String serviceName = sch.getServiceName(); String componentName = sch.getServiceComponentName(); StackId stackId = cluster.getDesiredStackVersion(); ServiceInfo serviceInfo = ambariMetaInfo.getService(stackId.getStackName(), stackId.getStackVersion(), serviceName); ComponentInfo componentInfo = ambariMetaInfo.getComponent( stackId.getStackName(), stackId.getStackVersion(), serviceName, componentName); StackInfo stackInfo = ambariMetaInfo.getStack(stackId.getStackName(), stackId.getStackVersion()); Map<String, Map<String, String>> configurations = new TreeMap<>(); Map<String, Map<String, Map<String, String>>> configurationAttributes = new TreeMap<>(); // get the cluster config for type '*-env' // apply config group overrides //Config clusterConfig = cluster.getDesiredConfigByType(GLOBAL); Collection<Config> clusterConfigs = cluster.getAllConfigs(); // creating list with desired config types to validate if cluster config actual Set<String> desiredConfigTypes = desiredConfigs.keySet(); // Apply global properties for this host from all config groups Map<String, Map<String, String>> allConfigTags = configHelper .getEffectiveDesiredTags(cluster, hostname); for(Config clusterConfig: clusterConfigs) { String configType = clusterConfig.getType(); if(!configType.endsWith("-env") || !desiredConfigTypes.contains(configType)) { continue; } // cluster config for 'global' Map<String, String> props = new HashMap<>(clusterConfig.getProperties()); Map<String, Map<String, String>> configTags = new HashMap<>(); for (Map.Entry<String, Map<String, String>> entry : allConfigTags.entrySet()) { if (entry.getKey().equals(clusterConfig.getType())) { configTags.put(clusterConfig.getType(), entry.getValue()); } } Map<String, Map<String, String>> properties = configHelper .getEffectiveConfigProperties(cluster, configTags); if (!properties.isEmpty()) { for (Map<String, String> propertyMap : properties.values()) { props.putAll(propertyMap); } } configurations.put(clusterConfig.getType(), props); Map<String, Map<String, String>> attrs = new TreeMap<>(); configHelper.cloneAttributesMap(clusterConfig.getPropertiesAttributes(), attrs); Map<String, Map<String, Map<String, String>>> attributes = configHelper .getEffectiveConfigAttributes(cluster, configTags); for (Map<String, Map<String, String>> attributesMap : attributes.values()) { configHelper.cloneAttributesMap(attributesMap, attrs); } configurationAttributes.put(clusterConfig.getType(), attrs); } StatusCommand statusCmd = new StatusCommand(); statusCmd.setClusterName(cluster.getClusterName()); statusCmd.setServiceName(serviceName); statusCmd.setComponentName(componentName); statusCmd.setConfigurations(configurations); statusCmd.setConfigurationAttributes(configurationAttributes); statusCmd.setHostname(hostname); // If Agent wants the command and the States differ statusCmd.setDesiredState(sch.getDesiredState()); statusCmd.setHasStaleConfigs(configHelper.isStaleConfigs(sch, desiredConfigs)); if (getAgentRequests().shouldSendExecutionDetails(hostname, componentName)) { LOG.info(componentName + " is at " + sch.getState() + " adding more payload per agent ask"); statusCmd.setPayloadLevel(StatusCommand.StatusCommandPayload.EXECUTION_COMMAND); } // Fill command params Map<String, String> commandParams = statusCmd.getCommandParams(); String commandTimeout = configuration.getDefaultAgentTaskTimeout(false); CommandScriptDefinition script = componentInfo.getCommandScript(); if (serviceInfo.getSchemaVersion().equals(AmbariMetaInfo.SCHEMA_VERSION_2)) { if (script != null) { commandParams.put(SCRIPT, script.getScript()); commandParams.put(SCRIPT_TYPE, script.getScriptType().toString()); if (script.getTimeout() > 0) { commandTimeout = String.valueOf(script.getTimeout()); } } else { String message = String.format("Component %s of service %s has not " + "command script defined", componentName, serviceName); throw new AmbariException(message); } } commandParams.put(COMMAND_TIMEOUT, commandTimeout); commandParams.put(SERVICE_PACKAGE_FOLDER, serviceInfo.getServicePackageFolder()); commandParams.put(HOOKS_FOLDER, stackInfo.getStackHooksFolder()); // Fill host level params Map<String, String> hostLevelParams = statusCmd.getHostLevelParams(); hostLevelParams.put(JDK_LOCATION, ambariManagementController.getJdkResourceUrl()); hostLevelParams.put(STACK_NAME, stackId.getStackName()); hostLevelParams.put(STACK_VERSION, stackId.getStackVersion()); if (statusCmd.getPayloadLevel() == StatusCommand.StatusCommandPayload.EXECUTION_COMMAND) { ExecutionCommand ec = ambariManagementController.getExecutionCommand(cluster, sch, RoleCommand.START); statusCmd.setExecutionCommand(ec); LOG.debug(componentName + " has more payload for execution command"); } return statusCmd; } }