/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ambari.server.checks;
import java.text.MessageFormat;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.ambari.server.AmbariException;
import org.apache.ambari.server.controller.PrereqCheckRequest;
import org.apache.ambari.server.orm.models.HostComponentSummary;
import org.apache.ambari.server.state.Cluster;
import org.apache.ambari.server.state.ComponentInfo;
import org.apache.ambari.server.state.Host;
import org.apache.ambari.server.state.MaintenanceState;
import org.apache.ambari.server.state.Service;
import org.apache.ambari.server.state.ServiceComponent;
import org.apache.ambari.server.state.StackId;
import org.apache.ambari.server.state.State;
import org.apache.ambari.server.state.stack.PrereqCheckStatus;
import org.apache.ambari.server.state.stack.PrerequisiteCheck;
import org.apache.ambari.server.state.stack.upgrade.UpgradeType;
import org.apache.commons.lang.StringUtils;
import com.google.inject.Singleton;
/**
* The {@link ServicesUpCheck} class is used to ensure that services are up
* "enough" before an upgrade begins. This class uses the following rules:
* <ul>
* <li>If the component is a CLIENT, it will skip it</li>
* <li>If the component is a MASTER then it must be online and not in MM</li>
* <li>If the component is a SLAVE
* <ul>
* <li>If the cardinality is 1+, then determine if {@value #SLAVE_THRESHOLD} %
* are running. Hosts in MM are counted as being "up" since they are not part of
* the upgrade.</li>
* <li>If the cardinality is 0, then all instances must be online. Hosts in MM
* are counted as being "up" since they are not part of the upgrade.</li>
* </ul>
* </ul>
* It seems counter-intuitive to have a liveliness check which allows a
* percentage of the slaves to be down. The goal is to be able to start an
* upgrade, even if some slave components on healthy hosts are down. We still
* want hosts to be scehdule for upgrade of their other components.
*/
@Singleton
@UpgradeCheck(
group = UpgradeCheckGroup.LIVELINESS,
order = 2.0f,
required = { UpgradeType.ROLLING, UpgradeType.NON_ROLLING, UpgradeType.HOST_ORDERED })
public class ServicesUpCheck extends AbstractCheckDescriptor {
private static final float SLAVE_THRESHOLD = 0.5f;
/**
* Constructor.
*/
public ServicesUpCheck() {
super(CheckDescription.SERVICES_UP);
}
/**
* {@inheritDoc}
*/
@Override
public void perform(PrerequisiteCheck prerequisiteCheck, PrereqCheckRequest request)
throws AmbariException {
final String clusterName = request.getClusterName();
final Cluster cluster = clustersProvider.get().getCluster(clusterName);
List<String> errorMessages = new ArrayList<>();
Set<String> failedServiceNames = new HashSet<>();
StackId stackId = cluster.getCurrentStackVersion();
for (Map.Entry<String, Service> serviceEntry : cluster.getServices().entrySet()) {
final Service service = serviceEntry.getValue();
// Ignore services like Tez that are clientOnly.
if (service.isClientOnlyService()) {
continue;
}
Map<String, ServiceComponent> serviceComponents = service.getServiceComponents();
for (Map.Entry<String, ServiceComponent> component : serviceComponents.entrySet()) {
ServiceComponent serviceComponent = component.getValue();
// In Services like HDFS, ignore components like HDFS Client
if (serviceComponent.isClientComponent()) {
continue;
}
// skip if the component is not part of the finalization version check
if (!serviceComponent.isVersionAdvertised()) {
continue;
}
// TODO, add more logic that checks the Upgrade Pack.
// These components are not in the upgrade pack and do not advertise a
// version:
// ZKFC, Ambari Metrics, Kerberos, Atlas (right now).
// Generally, if it advertises a version => in the upgrade pack.
// So it can be in the Upgrade Pack but not advertise a version.
List<HostComponentSummary> hostComponentSummaries = HostComponentSummary.getHostComponentSummaries(
service.getName(), serviceComponent.getName());
// not installed, nothing to do
if (hostComponentSummaries.isEmpty()) {
continue;
}
// non-master, "true" slaves with cardinality 1+
boolean checkThreshold = false;
if (!serviceComponent.isMasterComponent()) {
ComponentInfo componentInfo = ambariMetaInfo.get().getComponent(stackId.getStackName(),
stackId.getStackVersion(), serviceComponent.getServiceName(),
serviceComponent.getName());
String cardinality = componentInfo.getCardinality();
// !!! check if there can be more than one. This will match, say,
// datanodes but not ZKFC
if (null != cardinality
&& (cardinality.equals("ALL") || cardinality.matches("[1-9].*"))) {
checkThreshold = true;
}
}
// check threshold for slaves which have a non-0 cardinality
if (checkThreshold) {
int total = hostComponentSummaries.size();
int up = 0;
int down = 0;
for (HostComponentSummary summary : hostComponentSummaries) {
if (isConsideredDown(cluster, serviceComponent, summary)) {
down++;
} else {
up++;
}
}
if ((float) down / total > SLAVE_THRESHOLD) { // arbitrary
failedServiceNames.add(service.getName());
String message = MessageFormat.format(
"{0}: {1} out of {2} {3} are started; there should be {4,number,percent} started before upgrading.",
service.getName(), up, total, serviceComponent.getName(), SLAVE_THRESHOLD);
errorMessages.add(message);
}
} else {
for (HostComponentSummary summary : hostComponentSummaries) {
if (isConsideredDown(cluster, serviceComponent, summary)) {
failedServiceNames.add(service.getName());
String message = MessageFormat.format("{0}: {1} (in {2} on host {3})",
service.getName(), serviceComponent.getName(), summary.getCurrentState(),
summary.getHostName());
errorMessages.add(message);
break;
}
}
}
}
}
if (!errorMessages.isEmpty()) {
prerequisiteCheck.setFailedOn(new LinkedHashSet<>(failedServiceNames));
prerequisiteCheck.setStatus(PrereqCheckStatus.FAIL);
prerequisiteCheck.setFailReason(
"The following Service Components should be in a started state. Please invoke a service Stop and full Start and try again. "
+ StringUtils.join(errorMessages, ", "));
}
}
/**
* Gets whether this component should be considered as being "down" for the
* purposes of this check. Component type, maintenance mode, and state are
* taken into account.
*
* @param cluster
* the cluster
* @param serviceComponent
* the component
* @param summary
* a summary of the state of the component on a host
* @return {@code true} if the host component should be considered as failing
* this test.
* @throws AmbariException
*/
private boolean isConsideredDown(Cluster cluster, ServiceComponent serviceComponent,
HostComponentSummary summary) throws AmbariException {
Host host = clustersProvider.get().getHostById(summary.getHostId());
MaintenanceState maintenanceState = host.getMaintenanceState(cluster.getClusterId());
// non-MASTER components in maintenance mode should not count
if (maintenanceState == MaintenanceState.ON && !serviceComponent.isMasterComponent()) {
return false;
}
State desiredState = summary.getDesiredState();
State currentState = summary.getCurrentState();
switch (desiredState) {
case INSTALLED:
case STARTED:
return currentState != State.STARTED;
default:
return false;
}
}
}