/* * Copyright © 2016 Cask Data, Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not * use this file except in compliance with the License. You may obtain a copy of * the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations under * the License. */ package co.cask.cdap.master.startup; import co.cask.cdap.common.conf.CConfiguration; import co.cask.cdap.common.conf.Constants; import com.google.common.base.Joiner; import com.google.common.util.concurrent.ThreadFactoryBuilder; import com.google.inject.Inject; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.yarn.api.records.NodeId; import org.apache.hadoop.yarn.api.records.NodeReport; import org.apache.hadoop.yarn.api.records.Resource; import org.apache.hadoop.yarn.client.api.YarnClient; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.HashSet; import java.util.List; import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; /** * Checks that YARN is available and has enough resources to run all system services. */ // class is picked up through classpath examination @SuppressWarnings("unused") class YarnCheck extends AbstractMasterCheck { private static final Logger LOG = LoggerFactory.getLogger(YarnCheck.class); private final Configuration hConf; @Inject private YarnCheck(CConfiguration cConf, Configuration hConf) { super(cConf); this.hConf = hConf; } @Override public void run() { int yarnConnectTimeout = cConf.getInt(Constants.Startup.YARN_CONNECT_TIMEOUT_SECONDS, 60); LOG.info("Checking YARN availability -- may take up to {} seconds.", yarnConnectTimeout); final YarnClient yarnClient = YarnClient.createYarnClient(); yarnClient.init(hConf); List<NodeReport> nodeReports; // if yarn is not up, yarnClient.start() will hang. ExecutorService executorService = Executors.newSingleThreadExecutor( new ThreadFactoryBuilder().setNameFormat("startup-checker").build()); try { Future<List<NodeReport>> result = executorService.submit(new Callable<List<NodeReport>>() { @Override public List<NodeReport> call() throws Exception { yarnClient.start(); return yarnClient.getNodeReports(); } }); nodeReports = result.get(yarnConnectTimeout, TimeUnit.SECONDS); LOG.info(" YARN availability successfully verified."); } catch (Exception e) { throw new RuntimeException( "Unable to get status of YARN nodemanagers. " + "Please check that YARN is running " + "and that the correct Hadoop configuration (core-site.xml, yarn-site.xml) and libraries " + "are included in the CDAP master classpath.", e); } finally { try { yarnClient.stop(); } catch (Exception e) { LOG.warn("Error stopping yarn client.", e); } finally { executorService.shutdown(); } } checkResources(nodeReports); } private void checkResources(List<NodeReport> nodeReports) { LOG.info("Checking that YARN has enough resources to run all system services."); int memoryCapacity = 0; int vcoresCapacity = 0; int memoryUsed = 0; int vcoresUsed = 0; int availableNodes = 0; for (NodeReport nodeReport : nodeReports) { NodeId nodeId = nodeReport.getNodeId(); LOG.debug("Got report for node {}", nodeId); if (!nodeReport.getNodeState().isUnusable()) { Resource nodeCapability = nodeReport.getCapability(); Resource nodeUsed = nodeReport.getUsed(); // some versions of hadoop return null, others do not if (nodeCapability != null) { LOG.debug("node {} resource capability: memory = {}, vcores = {}", nodeId, nodeCapability.getMemory(), nodeCapability.getVirtualCores()); memoryCapacity += nodeCapability.getMemory(); vcoresCapacity += nodeCapability.getVirtualCores(); } if (nodeUsed != null) { LOG.debug("node {} resources used: memory = {}, vcores = {}", nodeId, nodeUsed.getMemory(), nodeUsed.getVirtualCores()); memoryUsed += nodeUsed.getMemory(); vcoresUsed += nodeUsed.getVirtualCores(); } availableNodes++; } } LOG.debug("YARN resource capacity: {} MB of memory and {} virtual cores.", memoryCapacity, vcoresCapacity); LOG.debug("YARN resources used: {} MB of memory and {} virtual cores.", memoryUsed, vcoresUsed); // calculate memory and vcores required by CDAP int requiredMemoryMB = 0; int requiredVCores = 0; Set<String> invalidKeys = new HashSet<>(); for (ServiceResourceKeys serviceResourceKeys : systemServicesResourceKeys) { boolean hasConfigError = false; int instances = 0; int memoryMB = 0; int vcores = 0; try { instances = cConf.getInt(serviceResourceKeys.getInstancesKey()); } catch (Exception e) { invalidKeys.add(serviceResourceKeys.getInstancesKey()); hasConfigError = true; } try { memoryMB = cConf.getInt(serviceResourceKeys.getMemoryKey()); } catch (Exception e) { invalidKeys.add(serviceResourceKeys.getMemoryKey()); hasConfigError = true; } try { vcores = cConf.getInt(serviceResourceKeys.getVcoresKey()); } catch (Exception e) { invalidKeys.add(serviceResourceKeys.getVcoresKey()); hasConfigError = true; } if (!hasConfigError) { LOG.debug("Resource settings for system service {}: {}={}, {}={}, {}={}", serviceResourceKeys.getServiceName(), serviceResourceKeys.getInstancesKey(), instances, serviceResourceKeys.getMemoryKey(), memoryMB, serviceResourceKeys.getVcoresKey(), vcores); requiredMemoryMB += memoryMB * instances; requiredVCores += vcores * instances; } } if (!invalidKeys.isEmpty()) { throw new RuntimeException( "YARN resources check failed to invalid config settings for keys: " + Joiner.on(',').join(invalidKeys)); } LOG.debug("{} MB of memory and {} virtual cores are required.", requiredMemoryMB, requiredVCores); int availableMemoryMB = memoryCapacity - memoryUsed; int availableVCores = vcoresCapacity - vcoresUsed; boolean memoryOK = requiredMemoryMB <= availableMemoryMB; // if this is negative or zero just assume its not using vcores boolean vcoresOK = vcoresCapacity <= 0 || requiredVCores <= availableVCores; if (!memoryOK && !vcoresOK) { LOG.warn( "Services require {} MB of memory and {} vcores, " + "but the cluster only has {} MB of memory and {} vcores available.", requiredMemoryMB, requiredVCores, availableMemoryMB, availableVCores); } else if (!memoryOK) { LOG.warn( "Services require {} MB of memory but the cluster only has {} MB of memory available.", requiredMemoryMB, availableMemoryMB); } else if (!vcoresOK) { LOG.warn( "Services require {} vcores but the cluster only has {} vcores available.", requiredVCores, availableVCores); } else { LOG.info(" YARN resources successfully verified."); } } }