package water.hadoop;
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.*;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.exceptions.YarnException;
/**
 * Prints a best-effort diagnosis of why an H2O job could not obtain its
 * requested YARN containers.
 *
 * <p>The diagnostic queries the YARN ResourceManager for node reports and
 * queue information, prints a summary of both, and then compares the job's
 * request (number of containers, container memory, one virtual core per
 * container) against the cluster and queue capacities it computed.
 * All output goes to {@code System.out}.
 */
public class H2OYarnDiagnostic {
  // These are known up front.
  Configuration conf;
  String applicationId;
  String queueName;
  int numNodes;
  int nodeMemoryMb;
  int nodeVirtualCores;
  int numNodesStarted;

  // Fill these in as we process the queue information, and remember the
  // answers for printing helpful diagnostics.
  int queueAvailableMemory;
  int queueAvailableVirtualCores;

  // Column justification selectors understood by prettyPrintString().
  final int LEFT = 0;
  final int RIGHT = 1;

  // Number of ERROR/WARNING diagnosis lines printed so far; when it is still
  // zero at the end of printDiagnosis() a generic fallback message is shown.
  private int numPrinted = 0;

  /**
   * Command-line entry point.
   *
   * @param args applicationId, queueName, numNodes, nodeMemoryMb, numNodesStarted
   * @throws Exception if a numeric argument cannot be parsed or the diagnostic fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length != 5) {
      System.out.println("usage: applicationId queueName numNodes nodeMemoryMb numNodesStarted");
      System.exit(1);
    }

    // parseInt avoids the needless boxing of Integer.valueOf; both throw
    // NumberFormatException on malformed input.
    diagnose(args[0], args[1], Integer.parseInt(args[2]), Integer.parseInt(args[3]), Integer.parseInt(args[4]));
  }

  /**
   * The assumption is this method doesn't get called unless a problem occurred.
   *
   * @param applicationId YARN application id of the job being diagnosed (stored, not otherwise used here)
   * @param queueName YARN queue name
   * @param numNodes Requested number of worker containers (not including AM)
   * @param nodeMemoryMb Requested worker container size
   * @param numNodesStarted Number of containers that actually got started before giving up
   * @throws Exception
   */
  public static void diagnose(String applicationId, String queueName, int numNodes, int nodeMemoryMb, int numNodesStarted) throws Exception {
    H2OYarnDiagnostic client = new H2OYarnDiagnostic();
    client.applicationId = applicationId;
    client.queueName = queueName;
    client.numNodes = numNodes;
    client.nodeMemoryMb = nodeMemoryMb;
    // Each worker container is assumed to request exactly one virtual core.
    client.nodeVirtualCores = 1;
    client.numNodesStarted = numNodesStarted;
    client.run();
  }

  /** Creates a diagnostic backed by a default {@link YarnConfiguration}. */
  public H2OYarnDiagnostic() throws Exception {
    this(new YarnConfiguration());
  }

  private H2OYarnDiagnostic(Configuration conf) throws Exception {
    this.conf = conf;
  }

  /**
   * Connects to YARN, prints the cluster/queue reports and the diagnosis.
   * The client is always stopped before returning, including on the
   * "queue not found" early exit and when an exception propagates.
   */
  private void run() throws IOException, YarnException {
    YarnClient yarnClient = YarnClient.createYarnClient();
    yarnClient.init(conf);
    yarnClient.start();

    try {
      List<NodeReport> clusterNodeReports = yarnClient.getNodeReports();
      List<QueueInfo> rootQueues = yarnClient.getRootQueueInfos();
      QueueInfo queueInfo = yarnClient.getQueueInfo(this.queueName);
      if (queueInfo == null) {
        printErrorDiagnosis("Queue not found (" + this.queueName + ")");
        return;
      }

      System.out.println("");
      printYarnClusterMetrics(yarnClient);
      System.out.println("");
      printClusterNodeReports(clusterNodeReports);
      System.out.println("");
      printQueueInfo(queueInfo);
      System.out.println("");
      printQueueCapacity(clusterNodeReports, queueInfo, rootQueues);
      System.out.println("");
      printDiagnosis(clusterNodeReports);
    }
    finally {
      // Release the ResourceManager connection held by the client.
      yarnClient.stop();
    }
  }

  /** Pads {@code s} to {@code width} characters, left- or right-justified. */
  private String prettyPrintString(String s, int width, int justification) {
    if (justification == LEFT) {
      return String.format("%-" + width + "s", s);
    }
    else {
      return String.format("%" + width + "s", s);
    }
  }

  /** Formats a megabyte count as gigabytes with one decimal and a " GB" suffix. */
  private String prettyPrintMb(long mb) {
    return prettyPrintMb(mb, true);
  }

  /**
   * Formats a megabyte count as gigabytes with one decimal place.
   *
   * @param mb amount in megabytes (long so cluster-wide totals cannot overflow)
   * @param printUnits whether to append " GB"
   */
  private String prettyPrintMb(long mb, boolean printUnits) {
    double gb = (double)mb / 1024.0;
    String s = String.format("%.1f", gb);
    if (printUnits) {
      s += " GB";
    }
    return s;
  }

  /** Formats a queue capacity value with two decimal places. */
  private String prettyPrintCapacity(float cap) {
    return String.format("%.2f", cap);
  }

  /** Prints the number of node managers reported by the cluster metrics. */
  private void printYarnClusterMetrics(YarnClient yarnClient) throws IOException, YarnException {
    System.out.println("----- YARN cluster metrics -----");
    YarnClusterMetrics clusterMetrics = yarnClient.getYarnClusterMetrics();
    System.out.println("Number of YARN worker nodes: " + clusterMetrics.getNumNodeManagers());
  }

  /**
   * Prints one aligned row per node: address, rack, state, container count,
   * memory used / capability and vcores used / capability.
   */
  private void printClusterNodeReports(List<NodeReport> clusterNodeReports) throws IOException, YarnException {
    List<String> nodes = new ArrayList<String>();
    List<String> racks = new ArrayList<String>();
    List<String> states = new ArrayList<String>();
    List<String> containers = new ArrayList<String>();
    List<String> gbs = new ArrayList<String>();
    List<String> vcores = new ArrayList<String>();

    System.out.println("----- Nodes -----");
    for (NodeReport node : clusterNodeReports) {
      Resource capability = node.getCapability();
      Resource used = node.getUsed();
      nodes.add("Node: http://" + node.getHttpAddress());
      racks.add("Rack: " + node.getRackName());
      states.add("" + node.getNodeState());
      containers.add("" + node.getNumContainers() + " containers used");
      gbs.add(prettyPrintMb(used.getMemory(), false) + " / " + prettyPrintMb(capability.getMemory(), false) + " GB used");
      vcores.add(used.getVirtualCores() + " / " + capability.getVirtualCores() + " vcores used");
    }

    List<List<String>> cols = new ArrayList<List<String>>();
    cols.add(nodes);
    cols.add(racks);
    cols.add(states);
    cols.add(containers);
    cols.add(gbs);
    cols.add(vcores);
    int numCols = cols.size();

    // One entry per column, in the same order as cols.
    int[] colJustifications = {LEFT, LEFT, RIGHT, RIGHT, RIGHT, RIGHT};

    // Each column is as wide as its longest cell.
    int[] colWidths = new int[numCols];
    for (int j = 0; j < numCols; j++) {
      for (String s : cols.get(j)) {
        if (s.length() > colWidths[j]) {
          colWidths[j] = s.length();
        }
      }
    }

    for (int i = 0; i < nodes.size(); i++) {
      for (int j = 0; j < numCols; j++) {
        // The first two columns are separated by a space, the rest by commas.
        if (j == 1) {
          System.out.print(" ");
        }
        else if (j > 1) {
          System.out.print(", ");
        }
        System.out.print(prettyPrintString(cols.get(j).get(i), colWidths[j], colJustifications[j]));
      }
      System.out.println("");
    }
  }

  /**
   * Prints the given queue, the applications it reports, and then
   * (recursively) each of its child queues.
   */
  private void printQueueInfo2(QueueInfo queueInfo) throws IOException, YarnException {
    List<ApplicationReport> applications = queueInfo.getApplications();

    System.out.println("Queue name: " + queueInfo.getQueueName());
    System.out.println(" Queue state: " + queueInfo.getQueueState());
    System.out.println(" Current capacity: " + prettyPrintCapacity(queueInfo.getCurrentCapacity()));
    System.out.println(" Capacity: " + prettyPrintCapacity(queueInfo.getCapacity()));
    System.out.println(" Maximum capacity: " + prettyPrintCapacity(queueInfo.getMaximumCapacity()));
    System.out.println(" Application count: " + applications.size());

    if (applications.size() > 0) {
      System.out.println(" ----- Applications in this queue -----");
    }
    for (ApplicationReport ar : applications) {
      System.out.println(" Application ID: " + ar.getApplicationId()
              + " (" + ar.getName() + ")");
      System.out.println(" Started: " + ar.getUser()
              + " (" + new Date(ar.getStartTime()).toString() + ")");
      // The state is only worth a line when the application is not running.
      YarnApplicationState as = ar.getYarnApplicationState();
      if (as != YarnApplicationState.RUNNING) {
        System.out.println(" Application state: " + ar.getYarnApplicationState());
      }
      System.out.println(" Tracking URL: " + ar.getTrackingUrl());
      System.out.println(" Queue name: " + ar.getQueue());
      ApplicationResourceUsageReport ur = ar.getApplicationResourceUsageReport();
      System.out.println(" Used/Reserved containers: "
              + ur.getNumUsedContainers()
              + " / " + ur.getNumReservedContainers()
      );
      System.out.println(" Needed/Used/Reserved memory: "
              + prettyPrintMb(ur.getNeededResources().getMemory())
              + " / " + prettyPrintMb(ur.getUsedResources().getMemory())
              + " / " + prettyPrintMb(ur.getReservedResources().getMemory())
      );
      System.out.println(" Needed/Used/Reserved vcores: "
              + ur.getNeededResources().getVirtualCores()
              + " / " + ur.getUsedResources().getVirtualCores()
              + " / " + ur.getReservedResources().getVirtualCores()
      );
    }

    // calcFractionalCapability() already treats a null child list as
    // possible, so guard here the same way.
    List<QueueInfo> childQueues = queueInfo.getChildQueues();
    if (childQueues != null) {
      for (QueueInfo q : childQueues) {
        printQueueInfo2(q);
      }
    }
  }

  /** Prints the "Queues" section header followed by the queue tree rooted at {@code queueInfo}. */
  private void printQueueInfo(QueueInfo queueInfo) throws IOException, YarnException {
    System.out.println("----- Queues -----");
    if (queueInfo == null) {
      System.out.println("Queue '" + this.queueName + "' not found");
      return;
    }

    printQueueInfo2(queueInfo);
  }

  /**
   * Estimates the share of {@code startingValue} that belongs to the queue
   * named {@code queueToFind}, by scaling the value by each queue's capacity
   * relative to its siblings on the way down the queue tree.
   *
   * @param startingValue the amount of resource available at the top of {@code queues}
   * @param queues the top-level queues to search (children are searched recursively)
   * @param queueToFind name compared against {@link QueueInfo#getQueueName()}
   * @return the estimated share, or -1.0 if {@code queues} is null
   * @throws RuntimeException if the queue is not found anywhere in the tree
   */
  private double calcFractionalCapability(double startingValue, List<QueueInfo> queues, String queueToFind) {
    if (queues == null) {
      return -1.0;
    }

    double rv = findFractionalCapability(startingValue, queues, queueToFind);
    if (rv < 0) {
      // Only give up once the whole tree has been searched.
      throw new RuntimeException("Diagnostic failed, please contact support@h2o.ai");
    }
    return rv;
  }

  /**
   * Recursive search behind {@link #calcFractionalCapability}.
   *
   * @return the estimated share (>= 0) if the queue is found at this level or
   *         below, otherwise -1.0 so the caller can keep searching siblings
   */
  private double findFractionalCapability(double startingValue, List<QueueInfo> queues, String queueToFind) {
    if (queues == null || queues.isEmpty()) {
      return -1.0;
    }

    double totalCapacityAtLevel = 0;
    for (QueueInfo qi : queues) {
      totalCapacityAtLevel += qi.getCapacity();
    }

    // Prefer a match at this level before descending.
    for (QueueInfo qi : queues) {
      if (qi.getQueueName().equals(queueToFind)) {
        return scaleByCapacity(startingValue, qi.getCapacity(), totalCapacityAtLevel);
      }
    }

    for (QueueInfo qi : queues) {
      double newStartingValue = scaleByCapacity(startingValue, qi.getCapacity(), totalCapacityAtLevel);
      double rv = findFractionalCapability(newStartingValue, qi.getChildQueues(), queueToFind);
      // >= 0 (not > 0): a queue whose computed share is zero was still found.
      if (rv >= 0) {
        return rv;
      }
    }

    return -1.0;
  }

  /**
   * Returns {@code value * capacity / totalCapacity}, or 0.0 when the level
   * has no capacity at all (avoids producing NaN from 0/0).
   */
  private static double scaleByCapacity(double value, double capacity, double totalCapacity) {
    if (totalCapacity <= 0) {
      return 0.0;
    }
    return value * capacity / totalCapacity;
  }

  /**
   * Sums the used memory reported by every application in the queue and,
   * recursively, in its child queues.
   */
  private int calcUsedMemory(QueueInfo queueInfo) {
    if (queueInfo == null) {
      return 0;
    }

    int memory = 0;
    for (ApplicationReport ar : queueInfo.getApplications()) {
      memory += ar.getApplicationResourceUsageReport().getUsedResources().getMemory();
    }

    List<QueueInfo> childQueues = queueInfo.getChildQueues();
    if (childQueues != null) {
      for (QueueInfo qi : childQueues) {
        memory += calcUsedMemory(qi);
      }
    }

    return memory;
  }

  /**
   * Sums the used virtual cores reported by every application in the queue
   * and, recursively, in its child queues.
   */
  private int calcUsedVirtualCores(QueueInfo queueInfo) {
    if (queueInfo == null) {
      return 0;
    }

    int vc = 0;
    for (ApplicationReport ar : queueInfo.getApplications()) {
      vc += ar.getApplicationResourceUsageReport().getUsedResources().getVirtualCores();
    }

    List<QueueInfo> childQueues = queueInfo.getChildQueues();
    if (childQueues != null) {
      for (QueueInfo qi : childQueues) {
        // Must recurse on vcores; recursing on memory would add MB to a core count.
        vc += calcUsedVirtualCores(qi);
      }
    }

    return vc;
  }

  /**
   * Prints the approximate utilization of the queue and records the
   * remaining memory / vcores (clamped at zero) in
   * {@link #queueAvailableMemory} and {@link #queueAvailableVirtualCores}.
   * Only nodes in the RUNNING state count towards cluster capacity.
   */
  private void printQueueCapacity(List<NodeReport> clusterNodeReports, QueueInfo queueInfo, List<QueueInfo> rootQueues) throws IOException, YarnException {
    // long accumulators so very large clusters cannot overflow the MB total.
    long clusterMemory = 0;
    long clusterVirtualCores = 0;
    for (NodeReport node : clusterNodeReports) {
      if (node.getNodeState() == NodeState.RUNNING) {
        Resource capability = node.getCapability();
        clusterMemory += capability.getMemory();
        clusterVirtualCores += capability.getVirtualCores();
      }
    }

    String queueToFind = queueInfo.getQueueName();
    double queueCapacityMemory = calcFractionalCapability(clusterMemory, rootQueues, queueToFind);
    double queueCapacityVirtualCores = calcFractionalCapability(clusterVirtualCores, rootQueues, queueToFind);

    int queueUsedMemory = calcUsedMemory(queueInfo);
    int queueUsedVirtualCores = calcUsedVirtualCores(queueInfo);

    System.out.println("Queue '" + queueInfo.getQueueName() + "' approximate utilization: "
            + prettyPrintMb(queueUsedMemory, false) + " / " + prettyPrintMb((int)queueCapacityMemory, false) + " GB used, "
            + queueUsedVirtualCores + " / " + (int)queueCapacityVirtualCores + " vcores used");

    this.queueAvailableMemory = Math.max(0, (int)queueCapacityMemory - queueUsedMemory);
    this.queueAvailableVirtualCores = Math.max(0, (int)queueCapacityVirtualCores - queueUsedVirtualCores);
  }

  private void printBar() {
    System.out.println("----------------------------------------------------------------------");
  }

  /** Prints an ERROR line and counts it as a reported diagnosis. */
  private void printErrorDiagnosis(String s) {
    System.out.println("ERROR: " + s);
    numPrinted++;
  }

  /** Prints a WARNING line and counts it as a reported diagnosis. */
  private void printWarningDiagnosis(String s) {
    System.out.println("WARNING: " + s);
    numPrinted++;
  }

  /**
   * Compares the job request against the RUNNING nodes of the cluster and
   * against the queue availability computed by printQueueCapacity(), printing
   * one ERROR/WARNING line per detected problem, or a generic fallback
   * message when nothing specific was found.
   */
  private void printDiagnosis(List<NodeReport> clusterNodeReports) throws IOException, YarnException {
    printBar();
    System.out.println("");

    // Gather everything needed from the RUNNING nodes in a single pass.
    boolean containerFitsOnSomeNode = false;
    long clusterMemoryMb = 0;
    long clusterVirtualCores = 0;
    for (NodeReport node : clusterNodeReports) {
      if (node.getNodeState() != NodeState.RUNNING) {
        continue;
      }
      Resource capability = node.getCapability();
      if (capability.getMemory() >= this.nodeMemoryMb) {
        containerFitsOnSomeNode = true;
      }
      clusterMemoryMb += capability.getMemory();
      clusterVirtualCores += capability.getVirtualCores();
    }

    // Computed as long: numNodes * nodeMemoryMb can exceed the int range.
    long jobMb = (long)this.numNodes * this.nodeMemoryMb;
    long jobVirtualCores = (long)this.numNodes * this.nodeVirtualCores;

    // Check if the requested container size exceeds the available space on any node.
    if (! containerFitsOnSomeNode) {
      printErrorDiagnosis("Job container memory request (" + prettyPrintMb(nodeMemoryMb) + ") does not fit on any YARN cluster node");
    }

    // Check if the requested job cumulative container size exceeds the space in the cluster.
    if (clusterMemoryMb < jobMb) {
      printErrorDiagnosis("Job memory request (" + prettyPrintMb(jobMb) + ") exceeds available YARN cluster memory (" + prettyPrintMb(clusterMemoryMb) + ")");
    }

    // Check if there are at least N virtual cores available in the cluster.
    if (clusterVirtualCores < jobVirtualCores) {
      printErrorDiagnosis("YARN cluster available virtual cores (" + clusterVirtualCores + ") < requested H2O containers (" + jobVirtualCores + ")");
    }

    // Queue availability messages should just be warnings, rather than
    // errors, since we don't *really* know how the scheduler allocates
    // resources, and the calculation of partial capacities is a best
    // guess.  But the info could be helpful, so show it.

    // Check if the requested job cumulative container size exceeds the space in the queue.
    if (this.queueAvailableMemory < jobMb) {
      printWarningDiagnosis("Job memory request (" + prettyPrintMb(jobMb) + ") exceeds queue available memory capacity (" + prettyPrintMb(this.queueAvailableMemory) + ")");
    }

    // Check if the requested job cumulative virtual cores exceeds the space in the queue.
    if (this.queueAvailableVirtualCores < jobVirtualCores) {
      printWarningDiagnosis("Job virtual cores request (" + jobVirtualCores + ") exceeds queue available virtual cores capacity (" + this.queueAvailableVirtualCores + ")");
    }

    if ((numNodesStarted > 0) && (numNodesStarted < numNodes)) {
      printErrorDiagnosis("Only " + numNodesStarted + " out of the requested " + numNodes + " worker containers were started due to YARN cluster resource limitations");
    }

    // Default warning.
    if (numPrinted == 0) {
      System.out.println("ERROR: Unable to start any H2O nodes; please contact your YARN administrator.");
      System.out.println("");
      System.out.println(" A common cause for this is the requested container size (" + prettyPrintMb(this.nodeMemoryMb) + ")");
      System.out.println(" exceeds the following YARN settings:");
      System.out.println("");
      System.out.println(" yarn.nodemanager.resource.memory-mb");
      System.out.println(" yarn.scheduler.maximum-allocation-mb");
    }

    System.out.println("");
    printBar();
  }
}