/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ambari.server.topology;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import javax.inject.Inject;
import org.apache.ambari.server.AmbariException;
import org.apache.ambari.server.actionmanager.HostRoleCommand;
import org.apache.ambari.server.actionmanager.HostRoleStatus;
import org.apache.ambari.server.api.services.stackadvisor.StackAdvisorBlueprintProcessor;
import org.apache.ambari.server.configuration.Configuration;
import org.apache.ambari.server.controller.AmbariServer;
import org.apache.ambari.server.controller.RequestStatusResponse;
import org.apache.ambari.server.controller.ShortTaskStatus;
import org.apache.ambari.server.controller.internal.ArtifactResourceProvider;
import org.apache.ambari.server.controller.internal.BaseClusterRequest;
import org.apache.ambari.server.controller.internal.CalculatedStatus;
import org.apache.ambari.server.controller.internal.CredentialResourceProvider;
import org.apache.ambari.server.controller.internal.ProvisionClusterRequest;
import org.apache.ambari.server.controller.internal.RequestImpl;
import org.apache.ambari.server.controller.internal.ScaleClusterRequest;
import org.apache.ambari.server.controller.internal.Stack;
import org.apache.ambari.server.controller.spi.NoSuchParentResourceException;
import org.apache.ambari.server.controller.spi.RequestStatus;
import org.apache.ambari.server.controller.spi.Resource;
import org.apache.ambari.server.controller.spi.ResourceAlreadyExistsException;
import org.apache.ambari.server.controller.spi.ResourceProvider;
import org.apache.ambari.server.controller.spi.SystemException;
import org.apache.ambari.server.controller.spi.UnsupportedPropertyException;
import org.apache.ambari.server.events.AmbariEvent;
import org.apache.ambari.server.events.ClusterConfigFinishedEvent;
import org.apache.ambari.server.events.HostsRemovedEvent;
import org.apache.ambari.server.events.RequestFinishedEvent;
import org.apache.ambari.server.events.publishers.AmbariEventPublisher;
import org.apache.ambari.server.orm.dao.HostRoleCommandStatusSummaryDTO;
import org.apache.ambari.server.orm.dao.SettingDAO;
import org.apache.ambari.server.orm.entities.SettingEntity;
import org.apache.ambari.server.orm.entities.StageEntity;
import org.apache.ambari.server.security.authorization.AuthorizationHelper;
import org.apache.ambari.server.state.Host;
import org.apache.ambari.server.state.SecurityType;
import org.apache.ambari.server.state.host.HostImpl;
import org.apache.ambari.server.state.quicklinksprofile.QuickLinksProfile;
import org.apache.ambari.server.topology.tasks.ConfigureClusterTask;
import org.apache.ambari.server.topology.tasks.ConfigureClusterTaskFactory;
import org.apache.ambari.server.topology.validators.TopologyValidatorService;
import org.apache.ambari.server.utils.RetryHelper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.eventbus.Subscribe;
import com.google.inject.Singleton;
import com.google.inject.persist.Transactional;
/**
* Manages all cluster provisioning actions on the cluster topology.
*/
//todo: cluster isolation
@Singleton
public class TopologyManager {
/**
 * internal token for topology related async tasks
 */
public static final String INTERNAL_AUTH_TOKEN = "internal_topology_token";
// Config tags used to mark initial vs. topology-resolved cluster configuration.
public static final String INITIAL_CONFIG_TAG = "INITIAL";
public static final String TOPOLOGY_RESOLVED_TAG = "TOPOLOGY_RESOLVED";
// Alias under which the KDC administrator credential is submitted and looked up.
public static final String KDC_ADMIN_CREDENTIAL = "kdc.admin.credential";
private static final String CLUSTER_ENV_CONFIG_TYPE_NAME = "cluster-env";
private static final String CLUSTER_CONFIG_TASK_MAX_TIME_IN_MILLIS_PROPERTY_NAME = "cluster_configure_task_timeout";
// Persistence facade for topology state; assigned in setPersistedState() after injection.
private PersistedState persistedState;
// Single-threaded executor used for the manager's own async work (e.g. config requests).
private final ExecutorService executor = Executors.newSingleThreadExecutor();
private final Executor taskExecutor; // executes TopologyTasks
// True when topology tasks may be created on a dedicated thread pool (see Configuration ctor).
private final boolean parallelTaskCreationEnabled;
// Host names that should not be offered to logical requests.
private Collection<String> hostsToIgnore = new HashSet<>();
// Registered hosts not yet matched to any logical request; guarded by synchronized(availableHosts).
private final List<HostImpl> availableHosts = new LinkedList<>();
// Host name -> request that explicitly reserved it; guarded by synchronized(reservedHosts).
private final Map<String, LogicalRequest> reservedHosts = new HashMap<>();
// All logical requests known to this manager, keyed by request id.
private final Map<Long, LogicalRequest> allRequests = new HashMap<>();
// priority is given to oldest outstanding requests
private final Collection<LogicalRequest> outstandingRequests = new ArrayList<>();
//todo: currently only support a single cluster
private Map<Long, ClusterTopology> clusterTopologyMap = new HashMap<>();
@Inject
private StackAdvisorBlueprintProcessor stackAdvisorBlueprintProcessor;
@Inject
private LogicalRequestFactory logicalRequestFactory;
@Inject
private AmbariContext ambariContext;
// Lock protecting the lazy initialization performed in ensureInitialized().
private final Object initializationLock = new Object();
@Inject
private SecurityConfigurationFactory securityConfigurationFactory;
@Inject
private ConfigureClusterTaskFactory configureClusterTaskFactory;
@Inject
private AmbariEventPublisher ambariEventPublisher;
@Inject
private SettingDAO settingDAO;
@Inject
private TopologyValidatorService topologyValidatorService;
/**
 * A boolean not cached thread-local (volatile) to prevent double-checked
 * locking on the synchronized keyword.
 */
private volatile boolean isInitialized;
private final static Logger LOG = LoggerFactory.getLogger(TopologyManager.class);
/**
 * Stores request that belongs to blueprint creation
 */
private Map<Long, LogicalRequest> clusterProvisionWithBlueprintCreateRequests = new HashMap<>();
/**
 * Flag to show whether blueprint is already finished or not. It is used for shortcuts.
 */
private Map<Long, Boolean> clusterProvisionWithBlueprintCreationFinished = new HashMap<>();
/**
 * No-arg constructor: task creation is sequential, reusing the manager's
 * single-threaded executor for TopologyTasks.
 */
public TopologyManager() {
  parallelTaskCreationEnabled = false;
  taskExecutor = executor;
}
/**
 * Guice-injected constructor. Enables parallel TopologyTask creation only when
 * the feature is switched on in {@code configuration} AND the configured pool
 * size is greater than one; otherwise falls back to the sequential executor.
 *
 * @param configuration server configuration supplying the parallelism settings
 */
@Inject
public TopologyManager(Configuration configuration) {
  int poolSize = configuration.getParallelTopologyTaskCreationThreadPoolSize();
  parallelTaskCreationEnabled = configuration.isParallelTopologyTaskCreationEnabled() && poolSize > 1;
  if (parallelTaskCreationEnabled) {
    taskExecutor = Executors.newFixedThreadPool(poolSize);
  } else {
    taskExecutor = executor;
  }
}
// executed by the IoC framework after creating the object (guice)
@Inject
private void register() {
  // subscribe this instance to the Ambari event bus (e.g. for @Subscribe onRequestFinished)
  ambariEventPublisher.register(this);
}
@Inject
private void setPersistedState() {
  // resolved through AmbariContext rather than injected directly
  persistedState = ambariContext.getPersistedTopologyState();
}
//todo: can't call in constructor.
//todo: Very important that this occurs prior to any usage
/**
 * Lazily initializes in-memory state on first use by replaying all persisted
 * requests. Uses double-checked locking on the volatile {@code isInitialized}
 * flag so the already-initialized fast path is lock-free.
 */
private void ensureInitialized() {
  if (!isInitialized) {
    synchronized (initializationLock) {
      if (!isInitialized) {
        // rebuild request/topology maps from persisted state
        replayRequests(persistedState.getAllRequests());
        // ensure KERBEROS_CLIENT is present in each hostgroup even if it's not in original BP
        for(ClusterTopology clusterTopology : clusterTopologyMap.values()) {
          if (clusterTopology.isClusterKerberosEnabled()) {
            addKerberosClient(clusterTopology);
          }
        }
        isInitialized = true;
      }
    }
  }
}
/**
 * Called when heartbeat processing finishes a request. If the finished request
 * is a tracked blueprint provision request, records its completion and logs the
 * outcome once.
 *
 * @param event event describing the finished request
 */
@Subscribe
public void onRequestFinished(RequestFinishedEvent event) {
  if (event.getType() != AmbariEvent.AmbariEventType.REQUEST_FINISHED
      || clusterProvisionWithBlueprintCreateRequests.isEmpty()
      || Boolean.TRUE.equals(clusterProvisionWithBlueprintCreationFinished.get(event.getClusterId()))) {
    return;
  }
  if (isClusterProvisionWithBlueprintFinished(event.getClusterId())) {
    clusterProvisionWithBlueprintCreationFinished.put(event.getClusterId(), Boolean.TRUE);
    // fetch the request and blueprint name once instead of repeating the map lookups per log call
    LogicalRequest provisionRequest = clusterProvisionWithBlueprintCreateRequests.get(event.getClusterId());
    String blueprintName = clusterTopologyMap.get(event.getClusterId()).getBlueprint().getName();
    if (isLogicalRequestSuccessful(provisionRequest)) {
      LOG.info("Cluster creation request id={} using Blueprint {} successfully completed for cluster id={}",
        provisionRequest.getRequestId(), blueprintName, event.getClusterId());
    } else {
      LOG.info("Cluster creation request id={} using Blueprint {} failed for cluster id={}",
        provisionRequest.getRequestId(), blueprintName, event.getClusterId());
    }
  }
}
/**
 * Returns if provision request for a cluster is tracked
 * @param clusterId id of the cluster
 * @return true if a blueprint provision request is tracked for the cluster
 */
public boolean isClusterProvisionWithBlueprintTracked(long clusterId) {
  return clusterProvisionWithBlueprintCreateRequests.containsKey(clusterId);
}
/**
 * Returns if the provision request for a cluster is finished.
 * Note that this method returns false if the request is not tracked.
 * See {@link TopologyManager#isClusterProvisionWithBlueprintTracked(long)}
 * @param clusterId id of the cluster
 * @return true if the tracked blueprint provision request has finished
 */
public boolean isClusterProvisionWithBlueprintFinished(long clusterId) {
  if (!isClusterProvisionWithBlueprintTracked(clusterId)) {
    return false; // no blueprint request is running
  }
  // shortcut: once recorded as finished, skip recomputing the request status.
  // Null-safe single lookup (same idiom as onRequestFinished) instead of containsKey + get.
  if (Boolean.TRUE.equals(clusterProvisionWithBlueprintCreationFinished.get(clusterId))) {
    return true;
  }
  return isLogicalRequestFinished(clusterProvisionWithBlueprintCreateRequests.get(clusterId));
}
/**
 * Provisions a new cluster from a blueprint-based cluster template request:
 * builds the topology, applies Kerberos-related adjustments when requested,
 * validates the topology, creates the Ambari resources, persists a logical
 * request (with retry) and starts processing it.
 *
 * @param request the provision cluster request
 * @return status of the created logical request
 * @throws InvalidTopologyException if the topology is invalid or the KDC admin credential is missing
 * @throws AmbariException on resource creation / persistence failures
 */
public RequestStatusResponse provisionCluster(final ProvisionClusterRequest request) throws InvalidTopologyException, AmbariException {
  ensureInitialized();
  final ClusterTopology topology = new ClusterTopologyImpl(ambariContext, request);
  final String clusterName = request.getClusterName();
  final Stack stack = topology.getBlueprint().getStack();
  final String repoVersion = request.getRepositoryVersion();
  // get the id prior to creating ambari resources which increments the counter
  final Long provisionId = ambariContext.getNextRequestId();
  SecurityType securityType = null;
  Credential credential = null;
  SecurityConfiguration securityConfiguration = processSecurityConfiguration(request);
  if (securityConfiguration != null && securityConfiguration.getType() == SecurityType.KERBEROS) {
    securityType = SecurityType.KERBEROS;
    addKerberosClient(topology);
    // refresh default stack config after adding KERBEROS_CLIENT component to topology
    topology.getBlueprint().getConfiguration().setParentConfiguration(stack.getConfiguration(topology.getBlueprint().getServices()));
    // the KDC admin credential is mandatory for a Kerberos-enabled provision request
    credential = request.getCredentialsMap().get(KDC_ADMIN_CREDENTIAL);
    if (credential == null) {
      throw new InvalidTopologyException(KDC_ADMIN_CREDENTIAL + " is missing from request.");
    }
  }
  topologyValidatorService.validateTopologyConfiguration(topology);
  // create resources
  ambariContext.createAmbariResources(topology, clusterName, securityType, repoVersion);
  if (securityConfiguration != null && securityConfiguration.getDescriptor() != null) {
    submitKerberosDescriptorAsArtifact(clusterName, securityConfiguration.getDescriptor());
  }
  if (credential != null) {
    submitCredential(clusterName, credential);
  }
  long clusterId = ambariContext.getClusterId(clusterName);
  topology.setClusterId(clusterId);
  request.setClusterId(clusterId);
  // set recommendation strategy
  topology.setConfigRecommendationStrategy(request.getConfigRecommendationStrategy());
  // set provision action requested
  topology.setProvisionAction(request.getProvisionAction());
  // persist request
  LogicalRequest logicalRequest = RetryHelper.executeWithRetry(new Callable<LogicalRequest>() {
    @Override
    public LogicalRequest call() throws Exception {
      LogicalRequest logicalRequest = processAndPersistProvisionClusterTopologyRequest(request, topology, provisionId);
      return logicalRequest;
    }
  }
  );
  clusterTopologyMap.put(clusterId, topology);
  addClusterConfigRequest(topology, new ClusterConfigurationRequest(ambariContext, topology, true,
    stackAdvisorBlueprintProcessor, securityType == SecurityType.KERBEROS));
  // Notify listeners that cluster configuration finished
  executor.submit(new Callable<Boolean>() {
    @Override
    public Boolean call() throws Exception {
      ambariEventPublisher.publish(new ClusterConfigFinishedEvent(clusterName));
      return Boolean.TRUE;
    }
  });
  // Process the logical request
  processRequest(request, topology, logicalRequest);
  //todo: this should be invoked as part of a generic lifecycle event which could possibly
  //todo: be tied to cluster state
  ambariContext.persistInstallStateForUI(clusterName, stack.getName(), stack.getVersion());
  // track the provision request so completion can be detected in onRequestFinished
  clusterProvisionWithBlueprintCreateRequests.put(clusterId, logicalRequest);
  return getRequestStatus(logicalRequest.getRequestId());
}
/**
 * Saves the quick links profile to the DB as an Ambari setting. Creates a new setting entity or updates the existing
 * one.
 * @param quickLinksProfileJson the quicklinks profile in Json format
 */
void saveOrUpdateQuickLinksProfile(String quickLinksProfileJson) {
  SettingEntity settingEntity = settingDAO.findByName(QuickLinksProfile.SETTING_NAME_QUICKLINKS_PROFILE);
  final boolean isNew = (settingEntity == null);
  if (isNew) {
    settingEntity = new SettingEntity();
    settingEntity.setName(QuickLinksProfile.SETTING_NAME_QUICKLINKS_PROFILE);
    settingEntity.setSettingType(QuickLinksProfile.SETTING_TYPE_AMBARI_SERVER);
  }
  // common fields set once for both the create and update paths (was duplicated before)
  settingEntity.setContent(quickLinksProfileJson);
  settingEntity.setUpdatedBy(AuthorizationHelper.getAuthenticatedName());
  settingEntity.setUpdateTimestamp(System.currentTimeMillis());
  if (isNew) {
    settingDAO.create(settingEntity);
  } else {
    settingDAO.merge(settingEntity);
  }
}
/**
 * Submits the KDC admin credential to the cluster via the Credential resource provider.
 *
 * @param clusterName name of the cluster the credential belongs to
 * @param credential  the KDC admin credential
 * @throws RuntimeException if the credential resource cannot be created
 */
private void submitCredential(String clusterName, Credential credential) {
  ResourceProvider provider =
    ambariContext.getClusterController().ensureResourceProvider(Resource.Type.Credential);
  Map<String, Object> properties = new HashMap<>();
  properties.put(CredentialResourceProvider.CREDENTIAL_CLUSTER_NAME_PROPERTY_ID, clusterName);
  properties.put(CredentialResourceProvider.CREDENTIAL_ALIAS_PROPERTY_ID, KDC_ADMIN_CREDENTIAL);
  properties.put(CredentialResourceProvider.CREDENTIAL_PRINCIPAL_PROPERTY_ID, credential.getPrincipal());
  properties.put(CredentialResourceProvider.CREDENTIAL_KEY_PROPERTY_ID, credential.getKey());
  properties.put(CredentialResourceProvider.CREDENTIAL_TYPE_PROPERTY_ID, credential.getType().name());
  org.apache.ambari.server.controller.spi.Request request = new RequestImpl(Collections.<String>emptySet(),
    Collections.singleton(properties), Collections.<String, String>emptyMap(), null);
  try {
    RequestStatus status = provider.createResources(request);
    if (status.getStatus() != RequestStatus.Status.Complete) {
      // messages previously referred to "kerberos_descriptor artifact" -- copy-paste
      // from submitKerberosDescriptorAsArtifact; this method submits a credential
      throw new RuntimeException("Failed to attach credential to cluster!");
    }
  } catch (SystemException | UnsupportedPropertyException | NoSuchParentResourceException e) {
    // preserve the original exception as the cause instead of flattening it into the message
    throw new RuntimeException("Failed to attach credential to cluster: " + e, e);
  } catch (ResourceAlreadyExistsException e) {
    throw new RuntimeException("Failed to attach credential to cluster as resource already exists.", e);
  }
}
/**
 * Retrieve security info from Blueprint if missing from Cluster Template request.
 * When the blueprint's security config references an external Kerberos descriptor,
 * the referenced configuration is loaded and returned instead.
 *
 * @param request the provision cluster request
 * @return the effective security configuration, or null if none is defined anywhere
 */
private SecurityConfiguration processSecurityConfiguration(ProvisionClusterRequest request) {
  LOG.debug("Getting security configuration from the request ...");
  SecurityConfiguration requestSecurity = request.getSecurityConfiguration();
  if (requestSecurity != null) {
    return requestSecurity;
  }
  // todo - perform this logic at request creation instead!
  LOG.debug("There's no security configuration in the request, retrieving it from the associated blueprint");
  SecurityConfiguration blueprintSecurity = request.getBlueprint().getSecurity();
  boolean hasKerberosDescriptorReference = blueprintSecurity != null
      && blueprintSecurity.getType() == SecurityType.KERBEROS
      && blueprintSecurity.getDescriptorReference() != null;
  if (hasKerberosDescriptorReference) {
    return securityConfigurationFactory.loadSecurityConfigurationByReference(blueprintSecurity.getDescriptorReference());
  }
  return blueprintSecurity;
}
/**
 * Submits the Kerberos descriptor to the cluster as a "kerberos_descriptor" artifact
 * via the Artifact resource provider, polling until the creation request completes.
 *
 * @param clusterName name of the cluster the artifact is attached to
 * @param descriptor  the Kerberos descriptor as a JSON string (embedded verbatim in the request body)
 * @throws RuntimeException if the artifact resource cannot be created
 */
private void submitKerberosDescriptorAsArtifact(String clusterName, String descriptor) {
  ResourceProvider artifactProvider =
    ambariContext.getClusterController().ensureResourceProvider(Resource.Type.Artifact);
  Map<String, Object> properties = new HashMap<>();
  properties.put(ArtifactResourceProvider.ARTIFACT_NAME_PROPERTY, "kerberos_descriptor");
  properties.put("Artifacts/cluster_name", clusterName);
  Map<String, String> requestInfoProps = new HashMap<>();
  requestInfoProps.put(org.apache.ambari.server.controller.spi.Request.REQUEST_INFO_BODY_PROPERTY,
    "{\"" + ArtifactResourceProvider.ARTIFACT_DATA_PROPERTY + "\": " + descriptor + "}");
  org.apache.ambari.server.controller.spi.Request request = new RequestImpl(Collections.<String>emptySet(),
    Collections.singleton(properties), requestInfoProps, null);
  try {
    RequestStatus status = artifactProvider.createResources(request);
    try {
      // NOTE(review): this poll assumes status.getStatus() is updated asynchronously
      // by the provider -- confirm; otherwise the loop either exits at once or spins
      while (status.getStatus() != RequestStatus.Status.Complete) {
        LOG.info("Waiting for kerberos_descriptor artifact creation.");
        Thread.sleep(100);
      }
    } catch (InterruptedException e) {
      LOG.info("Wait for resource creation interrupted!");
      // restore the interrupt status so callers up the stack can observe the interruption
      Thread.currentThread().interrupt();
    }
    if (status.getStatus() != RequestStatus.Status.Complete) {
      throw new RuntimeException("Failed to attach kerberos_descriptor artifact to cluster!");
    }
  } catch (SystemException | UnsupportedPropertyException | NoSuchParentResourceException e) {
    // preserve the original exception as the cause instead of flattening it into the message
    throw new RuntimeException("Failed to attach kerberos_descriptor artifact to cluster: " + e, e);
  } catch (ResourceAlreadyExistsException e) {
    throw new RuntimeException("Failed to attach kerberos_descriptor artifact to cluster as resource already exists.", e);
  }
}
/**
 * Adds hosts to an existing blueprint-provisioned cluster: validates the new host
 * names, updates the topology's host groups, persists a logical request (with
 * retry) and starts processing it.
 *
 * @param request the scale request describing host groups and hosts to add
 * @return status of the created logical request
 * @throws InvalidTopologyException if the cluster was not created via blueprint or a host already belongs to it
 * @throws AmbariException on persistence / resource failures
 */
public RequestStatusResponse scaleHosts(final ScaleClusterRequest request)
  throws InvalidTopologyException, AmbariException {
  ensureInitialized();
  LOG.info("TopologyManager.scaleHosts: Entering");
  String clusterName = request.getClusterName();
  long clusterId = ambariContext.getClusterId(clusterName);
  final ClusterTopology topology = clusterTopologyMap.get(clusterId);
  if (topology == null) {
    // only clusters provisioned through this manager have an in-memory topology
    throw new InvalidTopologyException("Unable to retrieve cluster topology for cluster. This is most likely a " +
      "result of trying to scale a cluster via the API which was created using " +
      "the Ambari UI. At this time only clusters created via the API using a " +
      "blueprint can be scaled with this API. If the cluster was originally created " +
      "via the API as described above, please file a Jira for this matter.");
  }
  // reject hosts that are already members of the cluster
  hostNameCheck(request, topology);
  request.setClusterId(clusterId);
  // this registers/updates all request host groups
  topology.update(request);
  final Long requestId = ambariContext.getNextRequestId();
  LogicalRequest logicalRequest = RetryHelper.executeWithRetry(new Callable<LogicalRequest>() {
    @Override
    public LogicalRequest call() throws Exception {
      LogicalRequest logicalRequest = processAndPersistTopologyRequest(request, topology, requestId);
      return logicalRequest;
    }
  }
  );
  processRequest(request, topology, logicalRequest);
  return getRequestStatus(logicalRequest.getRequestId());
}
/**
 * Removes all pending (not yet matched to a host) host requests of a logical
 * request and updates the topology's requested host counts to the number of
 * hosts currently joined.
 *
 * @param clusterName name of the cluster the logical request belongs to
 * @param requestId   id of the logical request to prune
 * @throws IllegalArgumentException if the cluster, its topology, or the logical request cannot be found
 */
public void removePendingHostRequests(String clusterName, long requestId) {
  ensureInitialized();
  LOG.info("TopologyManager.removePendingHostRequests: Entering");
  long clusterId;
  try {
    clusterId = ambariContext.getClusterId(clusterName);
  } catch (AmbariException e) {
    LOG.error("Unable to retrieve clusterId", e);
    // preserve the underlying failure as the cause instead of discarding it
    throw new IllegalArgumentException("Unable to retrieve clusterId", e);
  }
  ClusterTopology topology = clusterTopologyMap.get(clusterId);
  if (topology == null) {
    throw new IllegalArgumentException("Unable to retrieve cluster topology for cluster");
  }
  LogicalRequest logicalRequest = allRequests.get(requestId);
  if (logicalRequest == null) {
    throw new IllegalArgumentException("No Logical Request found for requestId: " + requestId);
  }
  // null argument appears to select every pending host request -- see LogicalRequest#removePendingHostRequests
  Collection<HostRequest> pendingHostRequests = logicalRequest.removePendingHostRequests(null);
  if (!logicalRequest.hasPendingHostRequests()) {
    // NOTE(review): outstandingRequests is accessed under synchronized(outstandingRequests)
    // elsewhere in this class but not here -- confirm intended
    outstandingRequests.remove(logicalRequest);
  }
  persistedState.removeHostRequests(pendingHostRequests);
  // set current host count to number of currently connected hosts
  for (HostGroupInfo currentHostGroupInfo : topology.getHostGroupInfo().values()) {
    currentHostGroupInfo.setRequestedCount(currentHostGroupInfo.getHostNames().size());
  }
  LOG.info("TopologyManager.removePendingHostRequests: Exit");
}
/**
 * Creates and persists a {@see PersistedTopologyRequest} and a {@see LogicalRequest} for the
 * provided provision cluster request and topology, saving the quick links profile first
 * when one is supplied.
 * @param request Provision cluster request to create a logical request for.
 * @param topology Cluster topology
 * @param logicalRequestId The Id for the created logical request
 * @return Logical request created.
 */
@Transactional
protected LogicalRequest processAndPersistProvisionClusterTopologyRequest(ProvisionClusterRequest request, ClusterTopology topology, Long logicalRequestId)
  throws InvalidTopologyException, AmbariException {
  String quickLinksProfileJson = request.getQuickLinksProfileJson();
  if (quickLinksProfileJson != null) {
    saveOrUpdateQuickLinksProfile(quickLinksProfileJson);
  }
  return processAndPersistTopologyRequest(request, topology, logicalRequestId);
}
/**
 * Creates and persists a {@see PersistedTopologyRequest} and a {@see LogicalRequest} for the provided request and topology.
 * @param request {@see ProvisionClusterRequest} or {@see ScaleClusterRequest} to create a logical request for.
 * @param topology Cluster topology
 * @param logicalRequestId The Id for the created logical request
 * @return Logical request created.
 */
@Transactional
protected LogicalRequest processAndPersistTopologyRequest(BaseClusterRequest request, ClusterTopology topology, Long logicalRequestId)
  throws InvalidTopologyException, AmbariException {
  // persist the raw topology request first, then build the logical request from it
  PersistedTopologyRequest persisted = persistedState.persistTopologyRequest(request);
  return createLogicalRequest(persisted, topology, logicalRequestId);
}
/**
 * Verifies that none of the hosts named in the scale request are already members
 * of the cluster.
 *
 * @param request  the scale request whose host names are checked
 * @param topology current cluster topology
 * @throws InvalidTopologyException if any requested host already belongs to the cluster
 */
private void hostNameCheck(ScaleClusterRequest request, ClusterTopology topology) throws InvalidTopologyException {
  Set<String> requestedHostNames = new HashSet<>();
  for (HostGroupInfo hostGroupInfo : request.getHostGroupInfo().values()) {
    requestedHostNames.addAll(hostGroupInfo.getHostNames());
  }
  for (String hostName : requestedHostNames) {
    // check if host exists already
    if (topology.getHostGroupForHost(hostName) != null) {
      throw new InvalidTopologyException("Host " + hostName + " cannot be added, because it is already in the cluster");
    }
  }
}
/**
 * Called when a host registers with the server. The host is offered first to the
 * request that explicitly reserved it, then to outstanding logical requests in
 * order; if nobody accepts it, it is queued in the available-hosts list.
 *
 * @param host the newly registered host
 * @param associatedWithCluster true if the host is already part of a cluster
 */
public void onHostRegistered(HostImpl host, boolean associatedWithCluster) {
  ensureInitialized();
  LOG.info("TopologyManager.onHostRegistered: Entering");
  if (associatedWithCluster || isHostIgnored(host.getHostName())) {
    LOG.info("TopologyManager.onHostRegistered: host = {} is already associated with the cluster or is currently being processed", host.getHostName());
    return;
  }
  boolean matchedToRequest = false;
  String hostName = host.getHostName();
  // The lock ordering in this method must always be the same ordering as TopologyManager.processRequest
  // TODO: Locking strategies for TopologyManager should be reviewed and possibly rewritten in a future release
  synchronized (availableHosts) {
    synchronized (reservedHosts) {
      if (reservedHosts.containsKey(hostName)) {
        LogicalRequest request = reservedHosts.remove(hostName);
        HostOfferResponse response = request.offer(host);
        if (response.getAnswer() != HostOfferResponse.Answer.ACCEPTED) {
          // a reserved host must be accepted by the reserving request; anything else is a programming error
          throw new RuntimeException("LogicalRequest declined host offer of explicitly requested host: " + hostName);
        }
        LOG.info("TopologyManager.onHostRegistered: processing accepted host offer for reserved host = {}", hostName);
        processAcceptedHostOffer(getClusterTopology(request.getClusterId()), response, host);
        matchedToRequest = true;
      }
    }
    // can be true if host was reserved
    if (!matchedToRequest) {
      synchronized (outstandingRequests) {
        Iterator<LogicalRequest> outstandingRequestIterator = outstandingRequests.iterator();
        while (!matchedToRequest && outstandingRequestIterator.hasNext()) {
          LogicalRequest request = outstandingRequestIterator.next();
          HostOfferResponse hostOfferResponse = request.offer(host);
          switch (hostOfferResponse.getAnswer()) {
            case ACCEPTED:
              matchedToRequest = true;
              LOG.info("TopologyManager.onHostRegistered: processing accepted host offer for matched host = {}", hostName);
              processAcceptedHostOffer(getClusterTopology(request.getClusterId()), hostOfferResponse, host);
              break;
            case DECLINED_DONE:
              // request needs no more hosts; drop it from the outstanding list
              LOG.info("TopologyManager.onHostRegistered: DECLINED_DONE received for host = {}", hostName);
              outstandingRequestIterator.remove();
              break;
            case DECLINED_PREDICATE:
              // request still wants hosts, just not this one
              LOG.info("TopologyManager.onHostRegistered: DECLINED_PREDICATE received for host = {}", hostName);
              break;
          }
        }
      }
    }
    if (!matchedToRequest) {
      // avoid duplicate entries when a host re-registers (same host id already queued)
      boolean addToAvailableList = true;
      for (HostImpl registered : availableHosts) {
        if (registered.getHostId() == host.getHostId()) {
          LOG.info("Host {} re-registered, will not be added to the available hosts list", hostName);
          addToAvailableList = false;
          break;
        }
      }
      if (addToAvailableList) {
        LOG.info("TopologyManager: Queueing available host {}", hostName);
        availableHosts.add(host);
      }
    }
  }
}
/**
 * Through this method {@see TopologyManager} gets notified when a connection to a host in the cluster is lost.
 * The passed host will be excluded from scheduling any tasks onto it as it can't be reached.
 * @param host the host whose heartbeat was lost
 */
public void onHostHeartBeatLost(Host host) {
  // NOTE(review): presumably skips processing while the server controller is not yet
  // available (e.g. during startup) -- confirm
  if (AmbariServer.getController() == null) {
    return;
  }
  ensureInitialized();
  synchronized (availableHosts) {
    LOG.info("Hearbeat for host {} lost thus removing it from available hosts.", host.getHostName());
    availableHosts.remove(host);
  }
}
/**
 * Returns the logical request with the given id, or null if none is known.
 * @param requestId logical request id
 * @return the matching {@link LogicalRequest}, or null
 */
public LogicalRequest getRequest(long requestId) {
  ensureInitialized();
  return allRequests.get(requestId);
}
/**
 * Returns the logical requests matching the given ids; an empty id collection
 * selects all known requests. Unknown ids are silently skipped.
 *
 * @param requestIds ids to look up, or empty for all
 * @return the matching logical requests
 */
public Collection<LogicalRequest> getRequests(Collection<Long> requestIds) {
  ensureInitialized();
  if (requestIds.isEmpty()) {
    return allRequests.values();
  }
  Collection<LogicalRequest> found = new ArrayList<>();
  for (long requestId : requestIds) {
    LogicalRequest logicalRequest = allRequests.get(requestId);
    if (logicalRequest != null) {
      found.add(logicalRequest);
    }
  }
  return found;
}
/**
 * Currently we are just returning all stages for all requests
 * and relying on the StageResourceProvider to convert each to a resource and do a predicate eval on each.
 */
public Collection<StageEntity> getStages() {
  ensureInitialized();
  Collection<StageEntity> allStages = new ArrayList<>();
  for (LogicalRequest request : allRequests.values()) {
    allStages.addAll(request.getStageEntities());
  }
  return allStages;
}
/**
 * Returns the commands of the given logical request, or an empty list when the
 * request is unknown.
 *
 * @param requestId logical request id
 * @return the request's commands, possibly empty
 */
public Collection<HostRoleCommand> getTasks(long requestId) {
  ensureInitialized();
  LogicalRequest logicalRequest = allRequests.get(requestId);
  if (logicalRequest == null) {
    return Collections.<HostRoleCommand>emptyList();
  }
  return logicalRequest.getCommands();
}
/**
 * Returns the combined commands of all the given logical requests; unknown ids
 * contribute nothing.
 *
 * @param requestIds logical request ids
 * @return all commands across the requested requests
 */
public Collection<HostRoleCommand> getTasks(Collection<Long> requestIds) {
  ensureInitialized();
  Collection<HostRoleCommand> commands = new ArrayList<>();
  for (long requestId : requestIds) {
    commands.addAll(getTasks(requestId));
  }
  return commands;
}
/**
 * Returns the per-stage status summaries of the given logical request, or an
 * empty map when the request is unknown.
 *
 * @param requestId logical request id
 * @return stage id -> status summary, possibly empty
 */
public Map<Long, HostRoleCommandStatusSummaryDTO> getStageSummaries(Long requestId) {
  ensureInitialized();
  LogicalRequest logicalRequest = allRequests.get(requestId);
  if (logicalRequest == null) {
    return Collections.<Long, HostRoleCommandStatusSummaryDTO>emptyMap();
  }
  return logicalRequest.getStageSummaries();
}
/**
 * Returns the status of the given logical request, or null when the request is
 * unknown.
 *
 * @param requestId logical request id
 * @return the request status, or null
 */
public RequestStatusResponse getRequestStatus(long requestId) {
  ensureInitialized();
  LogicalRequest logicalRequest = allRequests.get(requestId);
  if (logicalRequest == null) {
    return null;
  }
  return logicalRequest.getRequestStatus();
}
/**
 * Returns the statuses of the given logical requests; ids of unknown requests
 * are silently skipped.
 *
 * @param ids logical request ids
 * @return statuses of the known requests
 */
public Collection<RequestStatusResponse> getRequestStatus(Collection<Long> ids) {
  ensureInitialized();
  List<RequestStatusResponse> responses = new ArrayList<>();
  for (long requestId : ids) {
    RequestStatusResponse response = getRequestStatus(requestId);
    if (response != null) {
      responses.add(response);
    }
  }
  return responses;
}
/**
 * Returns the in-memory topology of the given cluster, or null if the cluster
 * was not provisioned/replayed through this manager.
 * @param clusterId cluster id
 * @return the cluster's {@link ClusterTopology}, or null
 */
public ClusterTopology getClusterTopology(Long clusterId) {
  ensureInitialized();
  return clusterTopologyMap.get(clusterId);
}
/**
 * Gets a map of components keyed by host which have operations in the
 * {@link HostRoleStatus#PENDING} state. This could either be because hosts
 * have not registered or because the operations are actually waiting to be
 * queued.
 *
 * @return a mapping of host with pending components.
 */
public Map<String, Collection<String>> getPendingHostComponents() {
  ensureInitialized();
  Map<String, Collection<String>> hostComponentMap = new HashMap<>();
  for (LogicalRequest logicalRequest : allRequests.values()) {
    Map<Long, HostRoleCommandStatusSummaryDTO> summary = logicalRequest.getStageSummaries();
    CalculatedStatus calculatedStatus = CalculatedStatus.statusFromStageSummary(summary, summary.keySet());
    // a request is in progress either when its calculated status says so, or when
    // it has no tasks at all and no end time yet
    boolean inProgress = calculatedStatus.getStatus().isInProgress()
        || (summary.isEmpty() && logicalRequest.getEndTime() <= 0);
    if (!inProgress) {
      continue;
    }
    for (Map.Entry<String, Collection<String>> entry : logicalRequest.getProjectedTopology().entrySet()) {
      String hostName = entry.getKey();
      Collection<String> components = hostComponentMap.get(hostName);
      if (components == null) {
        components = new HashSet<>();
        hostComponentMap.put(hostName, components);
      }
      components.addAll(entry.getValue());
    }
  }
  return hostComponentMap;
}
/**
 * Finalizes the topology for a new logical request and offers every currently
 * available host to it; hosts reserved for other requests are skipped. If the
 * request still needs hosts afterwards it is queued as outstanding so future
 * host registrations can satisfy it.
 *
 * @param request        the originating topology request
 * @param topology       cluster topology the request applies to
 * @param logicalRequest the logical request to match hosts against
 * @throws AmbariException on finalization failures
 */
private void processRequest(TopologyRequest request, ClusterTopology topology, final LogicalRequest logicalRequest)
  throws AmbariException {
  LOG.info("TopologyManager.processRequest: Entering");
  finalizeTopology(request, topology);
  boolean requestHostComplete = false;
  //todo: overall synchronization. Currently we have nested synchronization here
  // The lock ordering in this method must always be the same ordering as TopologyManager.onHostRegistered
  // TODO: Locking strategies for TopologyManager should be reviewed and possibly rewritten in a future release
  synchronized (availableHosts) {
    Iterator<HostImpl> hostIterator = availableHosts.iterator();
    while (!requestHostComplete && hostIterator.hasNext()) {
      HostImpl host = hostIterator.next();
      synchronized (reservedHosts) {
        String hostname = host.getHostName();
        if (reservedHosts.containsKey(hostname)) {
          if (logicalRequest.equals(reservedHosts.get(hostname))) {
            // host is registered to this request, remove it from reserved map
            LOG.info("TopologyManager.processRequest: host name = {} is mapped to LogicalRequest ID = {} and will be removed from the reserved hosts.",
              hostname, logicalRequest.getRequestId());
            reservedHosts.remove(hostname);
          } else {
            // host is registered with another request, don't offer
            //todo: clean up logic
            LOG.info("TopologyManager.processRequest: host name = {} is registered with another request, and will not be offered to LogicalRequest ID = {}",
              hostname, logicalRequest.getRequestId());
            continue;
          }
        }
      }
      LOG.info("TopologyManager.processRequest: offering host name = {} to LogicalRequest ID = {}",
        host.getHostName(), logicalRequest.getRequestId());
      HostOfferResponse response = logicalRequest.offer(host);
      switch (response.getAnswer()) {
        case ACCEPTED:
          //todo: when host matches last host it returns ACCEPTED so we don't know that logical request is no
          //todo: longer outstanding until we call offer again. This is really only an issue if we need to
          //todo: deal specifically with outstanding hosts other than calling offer. Also, failure handling
          //todo: may affect this behavior??
          hostIterator.remove();
          LOG.info("TopologyManager.processRequest: host name = {} was ACCEPTED by LogicalRequest ID = {} , host has been removed from available hosts.",
            host.getHostName(), logicalRequest.getRequestId());
          processAcceptedHostOffer(getClusterTopology(logicalRequest.getClusterId()), response, host);
          break;
        case DECLINED_DONE:
          // the request needs no more hosts; stop offering
          requestHostComplete = true;
          LOG.info("TopologyManager.processRequest: host name = {} was DECLINED_DONE by LogicalRequest ID = {}",
            host.getHostName(), logicalRequest.getRequestId());
          break;
        case DECLINED_PREDICATE:
          // the request still needs hosts, just not this one
          LOG.info("TopologyManager.processRequest: host name = {} was DECLINED_PREDICATE by LogicalRequest ID = {}",
            host.getHostName(), logicalRequest.getRequestId());
          break;
      }
    }
    if (!requestHostComplete) {
      // not all required hosts have been matched (see earlier comment regarding outstanding logical requests)
      LOG.info("TopologyManager.processRequest: not all required hosts have been matched, so adding LogicalRequest ID = {} to outstanding requests",
        logicalRequest.getRequestId());
      synchronized (outstandingRequests) {
        outstandingRequests.add(logicalRequest);
      }
    }
  }
}
/**
 * Creates a logical request for the given persisted topology request, persists it,
 * registers it in the in-memory request registry, and reserves all hosts it names.
 *
 * @param request   persisted topology request backing the logical request
 * @param topology  cluster topology the request applies to
 * @param requestId id to assign to the new logical request
 * @return the newly created logical request
 * @throws AmbariException on creation failure
 */
@Transactional
protected LogicalRequest createLogicalRequest(final PersistedTopologyRequest request, ClusterTopology topology, Long requestId)
throws AmbariException {
  final LogicalRequest newRequest =
      logicalRequestFactory.createRequest(requestId, request.getRequest(), topology);

  // Persist first, then make the request visible through the in-memory registry.
  persistedState.persistLogicalRequest(newRequest, request.getId());
  allRequests.put(newRequest.getRequestId(), newRequest);
  LOG.info("TopologyManager.createLogicalRequest: created LogicalRequest with ID = {} and completed persistence of this request.",
      newRequest.getRequestId());

  // Reserve every host named by the request so other requests cannot claim them.
  synchronized (reservedHosts) {
    for (String reservedHostName : newRequest.getReservedHosts()) {
      reservedHosts.put(reservedHostName, newRequest);
    }
  }
  return newRequest;
}
/**
 * Handles a host offer that was ACCEPTED by a logical request: adds the host to the
 * topology, applies template rack info, persists the host-request -> host mapping
 * (with retry), and queues the host's install/start tasks.
 *
 * @param topology cluster topology the host joins
 * @param response the accepting offer response (carries host group and host request id)
 * @param host     the accepted host
 */
private void processAcceptedHostOffer(final ClusterTopology topology, final HostOfferResponse response, final HostImpl host) {
  final String hostName = host.getHostName();
  try {
    topology.addHostToTopology(response.getHostGroupName(), hostName);
    // update the host with the rack info if applicable
    updateHostWithRackInfo(topology, response, host);
  } catch (InvalidTopologyException | NoSuchHostGroupException e) {
    // host already registered, or invalid host group — both are internal errors at this point
    throw new RuntimeException("An internal error occurred while performing request host registration: " + e, e);
  }
  // persist the host request -> hostName association; retried because the DB may be
  // transiently unavailable during provisioning
  try {
    RetryHelper.executeWithRetry(new Callable<Void>() {
      @Override
      public Void call() throws Exception {
        persistTopologyHostRegistration(response.getHostRequestId(), host);
        return null;
      }
    });
  } catch (AmbariException e) {
    LOG.error("Exception occurred while registering host name", e);
    throw new RuntimeException(e);
  }
  LOG.info("TopologyManager.processAcceptedHostOffer: queue tasks for host = {} which responded {}", hostName, response.getAnswer());
  if (parallelTaskCreationEnabled) {
    executor.execute(new Runnable() { // do not start until cluster config done
      @Override
      public void run() {
        queueHostTasks(topology, response, hostName);
      }
    });
  } else {
    queueHostTasks(topology, response, hostName);
  }
}
/**
 * Persists the association of a topology host request with the concrete host name,
 * and records the host in the topology host info store. Both writes run in a single
 * transaction so they commit or roll back together.
 *
 * @param hostRequestId id of the host request being fulfilled
 * @param host          the registered host that satisfied the request
 */
@Transactional
protected void persistTopologyHostRegistration(long hostRequestId, final HostImpl host) {
persistedState.registerHostName(hostRequestId, host.getHostName());
persistedState.registerInTopologyHostInfo(host);
}
/**
 * Queues the install/start tasks for an accepted host by delegating to the offer
 * response, which knows which tasks to execute.
 *
 * @param topology cluster topology the host belongs to
 * @param response the accepting offer response carrying the tasks
 * @param hostName name of the host the tasks are queued for
 */
private void queueHostTasks(ClusterTopology topology, HostOfferResponse response, String hostName) {
  // Fixed log tag: previously logged as "processAcceptedHostOffer", which misattributed this message.
  LOG.info("TopologyManager.queueHostTasks: queueing tasks for host = {}", hostName);
  response.executeTasks(taskExecutor, hostName, topology, ambariContext);
}
/**
 * Applies the rack info declared in the cluster creation template (if any) to the
 * given host, and notifies the controller of the rack change.
 *
 * @param topology cluster topology carrying the template's host group info
 * @param response offer response identifying the host group
 * @param host     host to update
 */
private void updateHostWithRackInfo(ClusterTopology topology, HostOfferResponse response, HostImpl host) {
  // Rack info supplied for this host via the cluster creation template, if any.
  String templateRackInfo = topology.getHostGroupInfo()
      .get(response.getHostGroupName())
      .getHostRackInfo()
      .get(host.getHostName());
  if (templateRackInfo == null) {
    return;
  }
  host.setRackInfo(templateRackInfo);
  try {
    // todo: do we need this in case of blueprints?
    ambariContext.getController().registerRackChange(ambariContext.getClusterName(topology.getClusterId()));
  } catch (AmbariException e) {
    LOG.error("Could not register rack change for cluster id {}", topology.getClusterId());
    LOG.error("Exception during rack change: ", e);
  }
}
/**
 * Replays persisted topology requests on startup: rebuilds the in-memory topology map,
 * the provision request cache, the outstanding/reserved host bookkeeping, and re-adds
 * completed hosts to their topologies. Also re-submits the cluster configuration
 * request for the first topology whose configuration was never resolved.
 *
 * @param persistedRequests logical requests grouped by their cluster topology,
 *                          as loaded from the persisted state
 */
private void replayRequests(Map<ClusterTopology, List<LogicalRequest>> persistedRequests) {
LOG.info("TopologyManager.replayRequests: Entering");
boolean configChecked = false;
for (Map.Entry<ClusterTopology, List<LogicalRequest>> requestEntry : persistedRequests.entrySet()) {
ClusterTopology topology = requestEntry.getKey();
clusterTopologyMap.put(topology.getClusterId(), topology);
// update provision request cache
LogicalRequest provisionRequest = persistedState.getProvisionRequest(topology.getClusterId());
if(provisionRequest != null) {
clusterProvisionWithBlueprintCreateRequests.put(topology.getClusterId(), provisionRequest);
clusterProvisionWithBlueprintCreationFinished.put(topology.getClusterId(),
isLogicalRequestFinished(clusterProvisionWithBlueprintCreateRequests.get(topology.getClusterId())));
}
for (LogicalRequest logicalRequest : requestEntry.getValue()) {
allRequests.put(logicalRequest.getRequestId(), logicalRequest);
if (logicalRequest.hasPendingHostRequests()) {
// request still needs hosts: restore it as outstanding and re-reserve its named hosts
outstandingRequests.add(logicalRequest);
for (String reservedHost : logicalRequest.getReservedHosts()) {
reservedHosts.put(reservedHost, logicalRequest);
}
// completed host requests are host requests which have been mapped to a host
// and the host has been added to the cluster
for (HostRequest hostRequest : logicalRequest.getCompletedHostRequests()) {
try {
String hostName = hostRequest.getHostName();
topology.addHostToTopology(hostRequest.getHostgroupName(), hostName);
hostsToIgnore.add(hostName);
LOG.info("TopologyManager.replayRequests: host name = {} has been added to cluster and to ignore list.", hostName);
} catch (InvalidTopologyException e) {
LOG.warn("Attempted to add host to multiple host groups while replaying requests: " + e, e);
} catch (NoSuchHostGroupException e) {
LOG.warn("Failed to add host to topology while replaying requests: " + e, e);
}
}
}
}
// NOTE(review): the config-resolved check only considers the first topology iterated;
// map iteration order determines which cluster is checked — confirm this is intended.
if (!configChecked) {
configChecked = true;
if (!ambariContext.isTopologyResolved(topology.getClusterId())) {
LOG.info("TopologyManager.replayRequests: no config with TOPOLOGY_RESOLVED found, adding cluster config request");
addClusterConfigRequest(topology, new ClusterConfigurationRequest(
ambariContext, topology, false, stackAdvisorBlueprintProcessor));
}
}
}
LOG.info("TopologyManager.replayRequests: Exit");
}
/**
 * Checks whether every task of the given logical request has reached a completed state.
 *
 * @param logicalRequest the request to inspect; may be {@code null}
 * @return true if all the tasks in the logical request are in completed state,
 *         false otherwise (including when the request is {@code null})
 */
private boolean isLogicalRequestFinished(LogicalRequest logicalRequest) {
  if (logicalRequest == null) {
    return false;
  }
  for (ShortTaskStatus ts : logicalRequest.getRequestStatus().getTasks()) {
    // Short-circuit on the first task that has not yet completed; the original
    // kept scanning the remaining tasks after the answer was already known.
    if (!HostRoleStatus.valueOf(ts.getStatus()).isCompletedState()) {
      return false;
    }
  }
  return true;
}
/**
 * Returns whether every task in the logical request finished with status COMPLETED.
 * NOTE(review): unlike {@code isLogicalRequestFinished}, a {@code null} request yields
 * {@code true} here — the asymmetry looks deliberate but should be confirmed with callers.
 *
 * @param logicalRequest request to inspect; may be {@code null}
 * @return true if all tasks are COMPLETED (or the request is {@code null}), false otherwise
 */
private boolean isLogicalRequestSuccessful(LogicalRequest logicalRequest) {
if(logicalRequest != null) {
for(ShortTaskStatus ts : logicalRequest.getRequestStatus().getTasks()) {
if(HostRoleStatus.valueOf(ts.getStatus()) != HostRoleStatus.COMPLETED) {
return false;
}
}
}
return true;
}
//todo: this should invoke a callback on each 'service' in the topology
// Intentionally a no-op at present; kept as an extension point called from processRequest
// before hosts are offered to the logical request.
private void finalizeTopology(TopologyRequest request, ClusterTopology topology) {
}
// Returns true if the host is in the ignore list, and REMOVES it as a side effect —
// each ignored host is consumed exactly once by this check.
private boolean isHostIgnored(String host) {
return hostsToIgnore.remove(host);
}
/**
 * Adds the KERBEROS_CLIENT component to every host group of the blueprint.
 * Invoked when Kerberos is enabled for the cluster.
 *
 * @param topology cluster topology whose blueprint host groups are updated
 */
private void addKerberosClient(ClusterTopology topology) {
  Collection<HostGroup> hostGroups = topology.getBlueprint().getHostGroups().values();
  for (HostGroup hostGroup : hostGroups) {
    hostGroup.addComponent("KERBEROS_CLIENT");
  }
}
/**
 * Register the configuration task which is responsible for configuration topology resolution
 * and setting the updated configuration on the cluster. This task needs to be submitted to the
 * executor before any host requests to ensure that no install or start tasks are executed prior
 * to configuration being set on the cluster.
 *
 * @param topology             cluster topology
 * @param configurationRequest configuration request to be executed
 */
private void addClusterConfigRequest(ClusterTopology topology, ClusterConfigurationRequest configurationRequest) {
  String timeoutStr = topology.getConfiguration().getPropertyValue(CLUSTER_ENV_CONFIG_TYPE_NAME,
      CLUSTER_CONFIG_TASK_MAX_TIME_IN_MILLIS_PROPERTY_NAME);
  long timeout = 1000 * 60 * 30; // default: 30 minutes
  long delay = 1000; // retry delay between attempts, ms
  if (timeoutStr != null) {
    try {
      timeout = Long.parseLong(timeoutStr);
      LOG.debug("ConfigureClusterTask timeout set to: {}", timeout);
    } catch (NumberFormatException e) {
      // A malformed user-supplied property must not abort cluster provisioning; keep the default.
      LOG.warn("Could not parse {} value '{}'; using default timeout of {} ms",
          CLUSTER_CONFIG_TASK_MAX_TIME_IN_MILLIS_PROPERTY_NAME, timeoutStr, timeout);
    }
  } else {
    LOG.debug("No timeout constraints found in configuration. Wired defaults will be applied.");
  }
  ConfigureClusterTask configureClusterTask = configureClusterTaskFactory.createConfigureClusterTask(topology,
      configurationRequest);
  // NOTE(review): the scheduled pool created here is never shut down explicitly — confirm
  // AsyncCallableService owns and terminates it.
  AsyncCallableService<Boolean> asyncCallableService = new AsyncCallableService<>(configureClusterTask, timeout, delay,
      Executors.newScheduledThreadPool(1));
  executor.submit(asyncCallableService);
}
/**
 * Removes a host from the available hosts when the host gets deleted.
 *
 * @param hostsRemovedEvent the event containing the removed host names
 */
@Subscribe
public void processHostRemovedEvent(HostsRemovedEvent hostsRemovedEvent) {
  if (hostsRemovedEvent.getHostNames().isEmpty()) {
    LOG.warn("Missing host name from host removed event [{}] !", hostsRemovedEvent);
    return;
  }
  LOG.info("Removing hosts [{}] from available hosts on hosts removed event.", hostsRemovedEvent.getHostNames());
  Set<HostImpl> toBeRemoved = new HashSet<>();
  // synchronization is required here as the list may be modified concurrently. See comments in this whole class.
  synchronized (availableHosts) {
    // Collect matches first, then remove, to avoid mutating availableHosts while iterating it.
    for (HostImpl hostImpl : availableHosts) {
      if (hostsRemovedEvent.getHostNames().contains(hostImpl.getHostName())) {
        toBeRemoved.add(hostImpl);
      }
    }
    if (toBeRemoved.isEmpty()) {
      // message reworded from the original ungrammatical "No any host [...] found"
      LOG.debug("None of the hosts [{}] were found in available hosts", hostsRemovedEvent.getHostNames());
    } else {
      for (HostImpl host : toBeRemoved) {
        availableHosts.remove(host);
        LOG.info("Removed host: [{}] from available hosts", host.getHostName());
      }
    }
  }
}
}