package com.sequenceiq.cloudbreak.core.bootstrap.service; import static com.sequenceiq.cloudbreak.common.type.OrchestratorConstants.SWARM; import static com.sequenceiq.cloudbreak.core.bootstrap.service.ClusterDeletionBasedExitCriteriaModel.clusterDeletionBasedExitCriteriaModel; import static com.sequenceiq.cloudbreak.orchestrator.container.DockerContainer.MUNCHAUSEN; import static com.sequenceiq.cloudbreak.service.PollingResult.EXIT; import static com.sequenceiq.cloudbreak.service.PollingResult.TIMEOUT; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; import javax.inject.Inject; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.stereotype.Component; import com.sequenceiq.cloudbreak.cloud.scheduler.CancellationException; import com.sequenceiq.cloudbreak.core.CloudbreakException; import com.sequenceiq.cloudbreak.core.bootstrap.service.container.ContainerBootstrapApiCheckerTask; import com.sequenceiq.cloudbreak.core.bootstrap.service.container.ContainerClusterAvailabilityCheckerTask; import com.sequenceiq.cloudbreak.core.bootstrap.service.container.ContainerOrchestratorResolver; import com.sequenceiq.cloudbreak.core.bootstrap.service.container.context.ContainerBootstrapApiContext; import com.sequenceiq.cloudbreak.core.bootstrap.service.container.context.ContainerOrchestratorClusterContext; import com.sequenceiq.cloudbreak.core.bootstrap.service.host.HostBootstrapApiCheckerTask; import com.sequenceiq.cloudbreak.core.bootstrap.service.host.HostClusterAvailabilityCheckerTask; import com.sequenceiq.cloudbreak.core.bootstrap.service.host.HostOrchestratorResolver; import com.sequenceiq.cloudbreak.core.bootstrap.service.host.context.HostBootstrapApiContext; import com.sequenceiq.cloudbreak.core.bootstrap.service.host.context.HostOrchestratorClusterContext; import com.sequenceiq.cloudbreak.domain.InstanceMetaData; import com.sequenceiq.cloudbreak.domain.Orchestrator; import com.sequenceiq.cloudbreak.domain.Stack; import com.sequenceiq.cloudbreak.orchestrator.container.ContainerOrchestrator; import com.sequenceiq.cloudbreak.orchestrator.exception.CloudbreakOrchestratorCancelledException; import com.sequenceiq.cloudbreak.orchestrator.exception.CloudbreakOrchestratorException; import com.sequenceiq.cloudbreak.orchestrator.host.HostOrchestrator; import com.sequenceiq.cloudbreak.orchestrator.model.GatewayConfig; import com.sequenceiq.cloudbreak.orchestrator.model.Node; import com.sequenceiq.cloudbreak.repository.OrchestratorRepository; import com.sequenceiq.cloudbreak.repository.StackRepository; import com.sequenceiq.cloudbreak.service.GatewayConfigService; import com.sequenceiq.cloudbreak.service.PollingResult; import com.sequenceiq.cloudbreak.service.PollingService; @Component public class ClusterBootstrapper { private static final Logger LOGGER = LoggerFactory.getLogger(ClusterBootstrapper.class); private static final int POLL_INTERVAL = 5000; private static final int MAX_POLLING_ATTEMPTS = 500; @Inject private StackRepository stackRepository; @Inject private OrchestratorRepository orchestratorRepository; @Inject private PollingService<ContainerBootstrapApiContext> containerBootstrapApiPollingService; @Inject private PollingService<HostBootstrapApiContext> hostBootstrapApiPollingService; @Inject private ContainerBootstrapApiCheckerTask containerBootstrapApiCheckerTask; @Inject private HostBootstrapApiCheckerTask hostBootstrapApiCheckerTask; @Inject private PollingService<ContainerOrchestratorClusterContext> containerClusterAvailabilityPollingService; @Inject private PollingService<HostOrchestratorClusterContext> hostClusterAvailabilityPollingService; @Inject private ContainerClusterAvailabilityCheckerTask containerClusterAvailabilityCheckerTask; @Inject private HostClusterAvailabilityCheckerTask hostClusterAvailabilityCheckerTask; @Inject private ClusterBootstrapperErrorHandler clusterBootstrapperErrorHandler; @Inject private ContainerOrchestratorResolver containerOrchestratorResolver; @Inject private HostOrchestratorResolver hostOrchestratorResolver; @Inject private GatewayConfigService gatewayConfigService; @Inject private ContainerConfigService containerConfigService; @Inject private OrchestratorTypeResolver orchestratorTypeResolver; public void bootstrapMachines(Long stackId) throws CloudbreakException { Stack stack = stackRepository.findOneWithLists(stackId); String stackOrchestratorType = stack.getOrchestrator().getType(); OrchestratorType orchestratorType = orchestratorTypeResolver.resolveType(stackOrchestratorType); if (orchestratorType.hostOrchestrator()) { bootstrapOnHost(stack); } else if (orchestratorTypeResolver.resolveType(stack.getOrchestrator()).containerOrchestrator()) { LOGGER.info("Skipping bootstrap of the machines because the stack's orchestrator type is '{}'.", stackOrchestratorType); } else if (orchestratorType.containerOrchestrator()) { bootstrapContainers(stack); } else { LOGGER.error("Orchestrator not found: {}", stackOrchestratorType); throw new CloudbreakException("HostOrchestrator not found: " + stackOrchestratorType); } } @SuppressWarnings("unchecked") public void bootstrapOnHost(Stack stack) throws CloudbreakException { Set<Node> nodes = new HashSet<>(); for (InstanceMetaData instanceMetaData : stack.getRunningInstanceMetaData()) { addNode(nodes, instanceMetaData); } try { HostOrchestrator hostOrchestrator = hostOrchestratorResolver.get(stack.getOrchestrator().getType()); List<GatewayConfig> allGatewayConfig = new ArrayList<>(); Boolean enableKnox = stack.getCluster().getGateway().getEnableGateway(); for (InstanceMetaData gateway : stack.getGatewayInstanceMetadata()) { GatewayConfig gatewayConfig = gatewayConfigService.getGatewayConfig(stack, gateway, enableKnox); allGatewayConfig.add(gatewayConfig); PollingResult bootstrapApiPolling = hostBootstrapApiPollingService.pollWithTimeoutSingleFailure( hostBootstrapApiCheckerTask, new HostBootstrapApiContext(stack, gatewayConfig, hostOrchestrator), POLL_INTERVAL, MAX_POLLING_ATTEMPTS); validatePollingResultForCancellation(bootstrapApiPolling, "Polling of bootstrap API was cancelled."); } hostOrchestrator.bootstrap(allGatewayConfig, nodes, clusterDeletionBasedExitCriteriaModel(stack.getId(), null)); InstanceMetaData primaryGateway = stack.getPrimaryGatewayInstance(); GatewayConfig gatewayConfig = gatewayConfigService.getGatewayConfig(stack, primaryGateway, enableKnox); String gatewayIp = gatewayConfigService.getGatewayIp(stack, primaryGateway); PollingResult allNodesAvailabilityPolling = hostClusterAvailabilityPollingService.pollWithTimeoutSingleFailure( hostClusterAvailabilityCheckerTask, new HostOrchestratorClusterContext(stack, hostOrchestrator, gatewayConfig, nodes), POLL_INTERVAL, MAX_POLLING_ATTEMPTS); validatePollingResultForCancellation(allNodesAvailabilityPolling, "Polling of all nodes availability was cancelled."); Orchestrator orchestrator = stack.getOrchestrator(); orchestrator.setApiEndpoint(gatewayIp + ":" + stack.getGatewayPort()); orchestrator.setType(hostOrchestrator.name()); orchestratorRepository.save(orchestrator); if (TIMEOUT.equals(allNodesAvailabilityPolling)) { clusterBootstrapperErrorHandler.terminateFailedNodes(hostOrchestrator, null, stack, gatewayConfig, nodes); } } catch (Exception e) { throw new CloudbreakException(e); } } public void bootstrapContainers(Stack stack) throws CloudbreakException { Set<Node> nodes = new HashSet<>(); for (InstanceMetaData instanceMetaData : stack.getRunningInstanceMetaData()) { nodes.add(new Node(instanceMetaData.getPrivateIp(), instanceMetaData.getPublicIpWrapper())); } try { InstanceMetaData gatewayInstance = stack.getPrimaryGatewayInstance(); GatewayConfig gatewayConfig = gatewayConfigService.getGatewayConfig(stack, gatewayInstance, stack.getCluster().getGateway().getEnableGateway()); String gatewayIp = gatewayConfigService.getGatewayIp(stack, gatewayInstance); ContainerOrchestrator containerOrchestrator = containerOrchestratorResolver.get(SWARM); PollingResult bootstrapApiPolling = containerBootstrapApiPollingService.pollWithTimeoutSingleFailure( containerBootstrapApiCheckerTask, new ContainerBootstrapApiContext(stack, gatewayConfig, containerOrchestrator), POLL_INTERVAL, MAX_POLLING_ATTEMPTS); validatePollingResultForCancellation(bootstrapApiPolling, "Polling of bootstrap API was cancelled."); List<Set<Node>> nodeMap = prepareBootstrapSegments(nodes, containerOrchestrator.getMaxBootstrapNodes(), gatewayIp); containerOrchestrator.bootstrap(gatewayConfig, containerConfigService.get(stack, MUNCHAUSEN), nodeMap.get(0), stack.getConsulServers(), clusterDeletionBasedExitCriteriaModel(stack.getId(), null)); if (nodeMap.size() > 1) { PollingResult gatewayAvailability = containerClusterAvailabilityPollingService.pollWithTimeoutSingleFailure( containerClusterAvailabilityCheckerTask, new ContainerOrchestratorClusterContext(stack, containerOrchestrator, gatewayConfig, nodeMap.get(0)), POLL_INTERVAL, MAX_POLLING_ATTEMPTS); validatePollingResultForCancellation(gatewayAvailability, "Polling of gateway node availability was cancelled."); for (int i = 1; i < nodeMap.size(); i++) { containerOrchestrator.bootstrapNewNodes(gatewayConfig, containerConfigService.get(stack, MUNCHAUSEN), nodeMap.get(i), clusterDeletionBasedExitCriteriaModel(stack.getId(), null)); PollingResult agentAvailability = containerClusterAvailabilityPollingService.pollWithTimeoutSingleFailure( containerClusterAvailabilityCheckerTask, new ContainerOrchestratorClusterContext(stack, containerOrchestrator, gatewayConfig, nodeMap.get(i)), POLL_INTERVAL, MAX_POLLING_ATTEMPTS); validatePollingResultForCancellation(agentAvailability, "Polling of agent nodes availability was cancelled."); } } PollingResult allNodesAvailabilityPolling = containerClusterAvailabilityPollingService.pollWithTimeoutSingleFailure( containerClusterAvailabilityCheckerTask, new ContainerOrchestratorClusterContext(stack, containerOrchestrator, gatewayConfig, nodes), POLL_INTERVAL, MAX_POLLING_ATTEMPTS); validatePollingResultForCancellation(allNodesAvailabilityPolling, "Polling of all nodes availability was cancelled."); Orchestrator orchestrator = new Orchestrator(); orchestrator.setApiEndpoint(gatewayIp + ":" + stack.getGatewayPort()); orchestrator.setType("SWARM"); orchestratorRepository.save(orchestrator); stack.setOrchestrator(orchestrator); stackRepository.save(stack); if (TIMEOUT.equals(allNodesAvailabilityPolling)) { clusterBootstrapperErrorHandler.terminateFailedNodes(null, containerOrchestrator, stack, gatewayConfig, nodes); } } catch (CloudbreakOrchestratorCancelledException e) { throw new CancellationException(e.getMessage()); } catch (CloudbreakOrchestratorException e) { throw new CloudbreakException(e); } } public void bootstrapNewNodes(Long stackId, Set<String> upscaleCandidateAddresses) throws CloudbreakException { Stack stack = stackRepository.findOneWithLists(stackId); Set<Node> nodes = new HashSet<>(); Set<Node> allNodes = new HashSet<>(); for (InstanceMetaData instanceMetaData : stack.getRunningInstanceMetaData()) { if (upscaleCandidateAddresses.contains(instanceMetaData.getPrivateIp())) { nodes.add(new Node(instanceMetaData.getPrivateIp(), instanceMetaData.getPublicIpWrapper(), instanceMetaData.getDiscoveryFQDN())); } addNode(allNodes, instanceMetaData); } try { InstanceMetaData gatewayInstance = stack.getPrimaryGatewayInstance(); OrchestratorType orchestratorType = orchestratorTypeResolver.resolveType(stack.getOrchestrator().getType()); if (orchestratorType.hostOrchestrator()) { List<GatewayConfig> allGatewayConfigs = gatewayConfigService.getAllGatewayConfigs(stack); bootstrapNewNodesOnHost(stack, allGatewayConfigs, nodes, allNodes); } else if (orchestratorType.containerOrchestrator()) { GatewayConfig gatewayConfig = gatewayConfigService.getGatewayConfig(stack, gatewayInstance, stack.getCluster().getGateway().getEnableGateway()); bootstrapNewNodesInContainer(stack, gatewayInstance, nodes, gatewayConfig); } } catch (CloudbreakOrchestratorCancelledException e) { throw new CancellationException(e.getMessage()); } catch (CloudbreakOrchestratorException e) { throw new CloudbreakException(e); } } private void bootstrapNewNodesOnHost(Stack stack, List<GatewayConfig> allGatewayConfigs, Set<Node> nodes, Set<Node> allNodes) throws CloudbreakException, CloudbreakOrchestratorException { HostOrchestrator hostOrchestrator = hostOrchestratorResolver.get(stack.getOrchestrator().getType()); Boolean enableKnox = stack.getCluster().getGateway().getEnableGateway(); for (InstanceMetaData gateway : stack.getGatewayInstanceMetadata()) { GatewayConfig gatewayConfig = gatewayConfigService.getGatewayConfig(stack, gateway, enableKnox); PollingResult bootstrapApiPolling = hostBootstrapApiPollingService.pollWithTimeoutSingleFailure( hostBootstrapApiCheckerTask, new HostBootstrapApiContext(stack, gatewayConfig, hostOrchestrator), POLL_INTERVAL, MAX_POLLING_ATTEMPTS); validatePollingResultForCancellation(bootstrapApiPolling, "Polling of bootstrap API was cancelled."); } Set<InstanceMetaData> runningInstanceMetaData = stack.getRunningInstanceMetaData(); nodes.forEach(n -> n.setHostGroup(runningInstanceMetaData.stream() .filter(i -> i.getPrivateIp().equals(n.getPrivateIp())).findFirst().get().getInstanceGroupName())); hostOrchestrator.bootstrapNewNodes(allGatewayConfigs, nodes, allNodes, clusterDeletionBasedExitCriteriaModel(stack.getId(), null)); InstanceMetaData primaryGateway = stack.getPrimaryGatewayInstance(); GatewayConfig gatewayConfig = gatewayConfigService.getGatewayConfig(stack, primaryGateway, enableKnox); PollingResult allNodesAvailabilityPolling = hostClusterAvailabilityPollingService.pollWithTimeoutSingleFailure(hostClusterAvailabilityCheckerTask, new HostOrchestratorClusterContext(stack, hostOrchestrator, gatewayConfig, nodes), POLL_INTERVAL, MAX_POLLING_ATTEMPTS); validatePollingResultForCancellation(allNodesAvailabilityPolling, "Polling of new nodes availability was cancelled."); if (TIMEOUT.equals(allNodesAvailabilityPolling)) { clusterBootstrapperErrorHandler.terminateFailedNodes(hostOrchestrator, null, stack, gatewayConfig, nodes); } } private void bootstrapNewNodesInContainer(Stack stack, InstanceMetaData gatewayInstance, Set<Node> nodes, GatewayConfig gatewayConfig) throws CloudbreakException, CloudbreakOrchestratorException { ContainerOrchestrator containerOrchestrator = containerOrchestratorResolver.get(SWARM); String gatewayIpToTls = gatewayConfigService.getGatewayIp(stack, gatewayInstance); List<Set<Node>> nodeMap = prepareBootstrapSegments(nodes, containerOrchestrator.getMaxBootstrapNodes(), gatewayIpToTls); for (Set<Node> aNodeMap : nodeMap) { containerOrchestrator.bootstrapNewNodes(gatewayConfig, containerConfigService.get(stack, MUNCHAUSEN), aNodeMap, clusterDeletionBasedExitCriteriaModel(stack.getId(), null)); PollingResult newNodesAvailabilityPolling = containerClusterAvailabilityPollingService.pollWithTimeoutSingleFailure( containerClusterAvailabilityCheckerTask, new ContainerOrchestratorClusterContext(stack, containerOrchestrator, gatewayConfig, aNodeMap), POLL_INTERVAL, MAX_POLLING_ATTEMPTS); validatePollingResultForCancellation(newNodesAvailabilityPolling, "Polling of new nodes availability was cancelled."); } PollingResult pollingResult = containerClusterAvailabilityPollingService.pollWithTimeoutSingleFailure( containerClusterAvailabilityCheckerTask, new ContainerOrchestratorClusterContext(stack, containerOrchestrator, gatewayConfig, nodes), POLL_INTERVAL, MAX_POLLING_ATTEMPTS); validatePollingResultForCancellation(pollingResult, "Polling of new nodes availability was cancelled."); if (TIMEOUT.equals(pollingResult)) { clusterBootstrapperErrorHandler.terminateFailedNodes(null, containerOrchestrator, stack, gatewayConfig, nodes); } } private List<Set<Node>> prepareBootstrapSegments(Set<Node> nodes, int maxBootstrapNodes, String gatewayIp) { List<Set<Node>> result = new ArrayList<>(); Set<Node> newNodes = new HashSet<>(); Node gatewayNode = getGateWayNode(nodes, gatewayIp); if (gatewayNode != null) { newNodes.add(gatewayNode); } for (Node node : nodes) { if (!gatewayIp.equals(node.getPublicIp())) { newNodes.add(node); if (newNodes.size() >= maxBootstrapNodes) { result.add(newNodes); newNodes = new HashSet<>(); } } } if (!newNodes.isEmpty()) { result.add(newNodes); } return result; } private Node getGateWayNode(Set<Node> nodes, String gatewayIp) { for (Node node : nodes) { if (gatewayIp.equals(node.getPublicIp())) { return node; } } return null; } private void validatePollingResultForCancellation(PollingResult pollingResult, String cancelledMessage) { if (EXIT.equals(pollingResult)) { throw new CancellationException(cancelledMessage); } } private void addNode(Set<Node> nodes, InstanceMetaData instanceMetaData) { if (instanceMetaData.getPrivateIp() == null && instanceMetaData.getPublicIpWrapper() == null) { LOGGER.warn("Skipping instancemetadata because the public ip and private ip are null '{}'.", instanceMetaData); } else { Node node = new Node(instanceMetaData.getPrivateIp(), instanceMetaData.getPublicIpWrapper(), instanceMetaData.getDiscoveryFQDN()); node.setHostGroup(instanceMetaData.getInstanceGroupName()); nodes.add(node); } } }