/*******************************************************************************
 * ===========================================================
 * Ankush : Big Data Cluster Management Solution
 * ===========================================================
 *
 * (C) Copyright 2014, by Impetus Technologies
 *
 * This is free software; you can redistribute it and/or modify it under
 * the terms of the GNU Lesser General Public License (LGPL v3) as
 * published by the Free Software Foundation;
 *
 * This software is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this software; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 ******************************************************************************/
package com.impetus.ankush2.hadoop.monitor;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import net.neoremind.sshxcute.core.Result;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.JobStatus;
import org.apache.hadoop.mapred.JobTracker.State;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TIPStatus;
import org.apache.hadoop.mapred.TaskID;
import org.apache.hadoop.mapred.TaskReport;

import com.impetus.ankush.common.exception.AnkushException;
import com.impetus.ankush.common.scripting.AnkushTask;
import com.impetus.ankush2.common.scripting.impl.ReadConfProperty;
import com.impetus.ankush2.constant.Constant;
import com.impetus.ankush2.framework.config.ClusterConfig;
import com.impetus.ankush2.framework.config.ComponentConfig;
import com.impetus.ankush2.framework.config.NodeConfig;
import com.impetus.ankush2.hadoop.utils.HadoopConstants;
import com.impetus.ankush2.hadoop.utils.HadoopUtils;
import com.impetus.ankush2.logger.AnkushLogger;

/**
 * Provides Hadoop (MRv1) job and cluster metrics by querying the JobTracker
 * through a {@link JobClient}.
 *
 * @author Akhil
 */
public class JobStatusProvider {

    /** The cluster config. */
    private ClusterConfig clusterConfig;

    /** The Hadoop component config. */
    private ComponentConfig compConfig;

    /** The log. */
    private AnkushLogger LOG = new AnkushLogger(JobStatusProvider.class);

    /** Client used to talk to the JobTracker. */
    private JobClient jobClient;

    /** The JobTracker RPC port. */
    private String jobTrackerRpcPort;

    /** The JobTracker host. */
    private String jobTrackerHost;

    /** The Constant JOB_STATE_PREP. */
    public static final int JOB_STATE_PREP = 4;

    /** The Constant JOB_STATE_RUNNING. */
    public static final int JOB_STATE_RUNNING = 1;

    /** The Constant JOB_STATE_SUCCEEDED. */
    public static final int JOB_STATE_SUCCEEDED = 2;

    /** The Constant JOB_STATE_FAILED. */
    public static final int JOB_STATE_FAILED = 3;

    /** The Constant JOB_STATE_KILLED. */
    public static final int JOB_STATE_KILLED = 5;

    /**
     * @param clusterConfig
     *            the cluster configuration
     * @param compConfig
     *            the Hadoop component configuration
     */
    public JobStatusProvider(ClusterConfig clusterConfig,
            ComponentConfig compConfig) {
        super();
        this.clusterConfig = clusterConfig;
        this.compConfig = compConfig;
        // JobTracker RPC port.
        this.jobTrackerRpcPort = HadoopUtils
                .getJobTrackerRpcPort(this.compConfig);
        // JobTracker host.
        this.jobTrackerHost = HadoopUtils.getJobTrackerHost(this.compConfig);
        this.jobClient = getJobClient(jobTrackerHost, jobTrackerRpcPort);
    }
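    /*
     * Usage sketch (illustrative only; assumes a ClusterConfig and
     * ComponentConfig already populated for a deployed MRv1 cluster):
     *
     *   JobStatusProvider provider = new JobStatusProvider(clusterConf,
     *           hadoopCompConf);
     *   Map<String, Object> metrics = provider.getJobMetrics();
     *   List<Map<String, Object>> jobs = provider.listAllJobs();
     */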
    /**
     * Gets the Hadoop job metrics.
     *
     * @return a map of cluster-level Map-Reduce metrics
     * @throws AnkushException
     *             if the JobTracker could not be contacted
     */
    public Map<String, Object> getJobMetrics() throws AnkushException {
        String errMsg = "Unable to fetch Hadoop metrics, could not connect to Hadoop JobClient.";
        try {
            if (jobClient == null) {
                HadoopUtils.addAndLogError(this.LOG, this.clusterConfig,
                        errMsg, Constant.Component.Name.HADOOP);
                throw new AnkushException(errMsg);
            }
            LOG.info("Fetching Hadoop metrics information.." + jobClient);
            // Map for storing Hadoop job metrics information.
            Map<String, Object> hadoopJobMetrics = new LinkedHashMap<String, Object>();
            // Get status information about the Map-Reduce cluster.
            ClusterStatus clusterStatus = jobClient.getClusterStatus();
            // The current state of the JobTracker.
            State jobTrackerState = clusterStatus.getJobTrackerState();
            // Number of currently running map tasks in the cluster.
            int mapTasks = clusterStatus.getMapTasks();
            // Maximum capacity for running map tasks in the cluster.
            int maxMapTasks = clusterStatus.getMaxMapTasks();
            // Maximum capacity for running reduce tasks in the cluster.
            int maxReduceTasks = clusterStatus.getMaxReduceTasks();
            // Number of currently running reduce tasks in the cluster.
            int reduceTasks = clusterStatus.getReduceTasks();
            // Number of active task trackers in the cluster.
            int taskTrackers = clusterStatus.getTaskTrackers();
            // Number of blacklisted task trackers in the cluster.
            int blackListedTrackers = clusterStatus.getBlacklistedTrackers();
            long ttExpiryInterval = clusterStatus.getTTExpiryInterval();
            int defaultMaps = 0;
            int defaultReduces = 0;
            try {
                defaultMaps = jobClient.getDefaultMaps();
                defaultReduces = jobClient.getDefaultReduces();
            } catch (Exception e) {
                LOG.error(e.getMessage(), e);
            }
            // Putting Hadoop metrics information in the map.
            hadoopJobMetrics.put("jobTrackerState",
                    String.valueOf(jobTrackerState));
            hadoopJobMetrics.put("defaultMaps", String.valueOf(defaultMaps));
            hadoopJobMetrics.put("defaultReduces",
                    String.valueOf(defaultReduces));
            hadoopJobMetrics.put("mapTasks", String.valueOf(mapTasks));
            hadoopJobMetrics.put("reduceTasks", String.valueOf(reduceTasks));
            hadoopJobMetrics.put("maxMapTasksCapacity",
                    String.valueOf(maxMapTasks));
            hadoopJobMetrics.put("maxReduceTasksCapacity",
                    String.valueOf(maxReduceTasks));
            hadoopJobMetrics.put("taskTrackers", String.valueOf(taskTrackers));
            hadoopJobMetrics.put("blackListedTrackers",
                    String.valueOf(blackListedTrackers));
            hadoopJobMetrics.put("taskTrackerExpiryInterval",
                    String.valueOf(ttExpiryInterval));
            hadoopJobMetrics.put("schedulerType", getSchedulerType());
            // Jobs that have been submitted to the cluster.
            int totalJobSubmission = 0;
            JobStatus[] jobStatus = jobClient.getAllJobs();
            if (jobStatus != null) {
                totalJobSubmission = jobStatus.length;
            }
            List<Map<String, Object>> allJobsList = listAllJobs();
            int totalJobRunning = getRunningJobList(allJobsList).size();
            int completedJobs = getCompletedJobs(allJobsList).size();
            hadoopJobMetrics.put("totalJobSubmission",
                    String.valueOf(totalJobSubmission));
            hadoopJobMetrics.put("totalJobRunning",
                    String.valueOf(totalJobRunning));
            hadoopJobMetrics.put("totalJobsCompleted",
                    String.valueOf(completedJobs));
            return hadoopJobMetrics;
        } catch (AnkushException e) {
            throw e;
        } catch (Exception e) {
            HadoopUtils.addAndLogError(this.LOG, this.clusterConfig, errMsg,
                    Constant.Component.Name.HADOOP, e);
            throw new AnkushException(errMsg);
        }
    }
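    /*
     * The metrics map returned above looks roughly like this (values are
     * illustrative; all values are stored as strings):
     *
     *   jobTrackerState           -> "RUNNING"
     *   mapTasks / reduceTasks    -> currently running task counts
     *   maxMapTasksCapacity /
     *   maxReduceTasksCapacity    -> cluster-wide slot capacities
     *   taskTrackers              -> active tracker count
     *   blackListedTrackers       -> blacklisted tracker count
     *   totalJobSubmission        -> jobs submitted so far
     *   totalJobRunning /
     *   totalJobsCompleted        -> derived from listAllJobs()
     */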
    /**
     * Gets the status of a single Hadoop job.
     *
     * @param jobId
     *            the job id
     * @return the map of job status values, or null if no such job exists
     * @throws AnkushException
     *             if the JobTracker could not be contacted
     */
    public Map<String, Object> getJobStatus(String jobId)
            throws AnkushException {
        String errMsg = "Unable to fetch Hadoop job status, could not connect to Hadoop JobClient.";
        try {
            return getJobDetails(jobClient, jobId);
        } catch (Exception e) {
            HadoopUtils.addAndLogError(this.LOG, this.clusterConfig, errMsg,
                    Constant.Component.Name.HADOOP, e);
            throw new AnkushException(errMsg);
        }
    }

    /**
     * Lists all jobs submitted to the cluster.
     *
     * @return the list of job report maps
     * @throws AnkushException
     *             if the JobTracker could not be contacted
     * @throws InterruptedException
     */
    public List<Map<String, Object>> listAllJobs() throws AnkushException,
            InterruptedException {
        String errMsg = "Unable to fetch Hadoop jobs list, could not connect to Hadoop JobClient.";
        // List of maps for storing job status information.
        List<Map<String, Object>> jobReports = new ArrayList<Map<String, Object>>();
        try {
            if (jobClient == null) {
                HadoopUtils.addAndLogError(this.LOG, this.clusterConfig,
                        errMsg, Constant.Component.Name.HADOOP);
                throw new AnkushException(errMsg);
            }
            // Get the jobs that are submitted.
            JobStatus[] jobStatus = jobClient.getAllJobs();
            // Iterating over the list of all submitted jobs.
            for (JobStatus jobSts : jobStatus) {
                jobReports.add(getJobReport(jobSts));
            }
        } catch (AnkushException e) {
            throw e;
        } catch (Exception e) {
            HadoopUtils.addAndLogError(this.LOG, this.clusterConfig, errMsg,
                    Constant.Component.Name.HADOOP, e);
            throw new AnkushException(errMsg);
        }
        return jobReports;
    }

    /**
     * Builds a status report map for a single job.
     *
     * @param jobSts
     *            the job status
     * @return the job report map
     * @throws IOException
     */
    private Map<String, Object> getJobReport(JobStatus jobSts)
            throws IOException {
        // Map for storing job information.
        Map<String, Object> jobReport = new HashMap<String, Object>();
        // The job id of the job.
        org.apache.hadoop.mapred.JobID jobId = jobSts.getJobID();
        // A RunningJob object to track an ongoing Map-Reduce job.
        RunningJob job = jobClient.getJob(jobId);
        String jobName = "";
        if (job != null) {
            // The name of the job.
            jobName = job.getJobName();
        }
        // Percentage of progress in maps.
        float mapProgress = jobSts.mapProgress() * 100;
        // Percentage of progress in reduces.
        float reduceProgress = jobSts.reduceProgress() * 100;
        int mapTotal = 0;
        int reduceTotal = 0;
        int mapComp = 0;
        int reduceComp = 0;
        // Count completed map and reduce tasks.
        try {
            // Current state of the map tasks of the job.
            TaskReport[] mapTaskReports = jobClient.getMapTaskReports(jobId);
            // Total number of map tasks.
            mapTotal = mapTaskReports.length;
            // Iterating over the map tasks.
            for (TaskReport taskReport : mapTaskReports) {
                // The current state of a map TaskInProgress as seen by the
                // JobTracker.
                TIPStatus currentStatus = taskReport.getCurrentStatus();
                if (currentStatus == TIPStatus.COMPLETE) {
                    mapComp++;
                }
            }
            // Current state of the reduce tasks of the job.
            TaskReport[] reduceTaskReport = jobClient
                    .getReduceTaskReports(jobId);
            // Total number of reduce tasks.
            reduceTotal = reduceTaskReport.length;
            // Iterating over the reduce tasks.
            for (TaskReport taskReport : reduceTaskReport) {
                // The current state of a reduce TaskInProgress as seen by the
                // JobTracker.
                TIPStatus currentStatus = taskReport.getCurrentStatus();
                if (currentStatus == TIPStatus.COMPLETE) {
                    reduceComp++;
                }
            }
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
        }
        // Percentage of progress in setup.
        float setupProgress = jobSts.setupProgress() * 100;
        // Progress made on cleanup.
        float cleanupProgress = jobSts.cleanupProgress() * 100;
        // Diagnostic information on why the job might have failed.
        String failureInfo = jobSts.getFailureInfo();
        // Putting job status information in the map.
        jobReport.put("jobId", jobId.toString());
        jobReport.put("jobName", jobName);
        jobReport.put("jobPriority", jobSts.getJobPriority().toString());
        jobReport.put("jobStartTime", jobSts.getStartTime());
        jobReport.put("userName", jobSts.getUsername());
        jobReport.put("jobComplete", jobSts.isJobComplete());
        jobReport.put("mapProgress", mapProgress);
        jobReport.put("reduceProgress", reduceProgress);
        jobReport.put("mapTotal", mapTotal);
        jobReport.put("reduceTotal", reduceTotal);
        jobReport.put("mapCompleted", mapComp);
        jobReport.put("reduceCompleted", reduceComp);
        jobReport.put("setupProgress", setupProgress);
        jobReport.put("cleanupProgress", cleanupProgress);
        jobReport.put("schedulingInfo", jobSts.getSchedulingInfo());
        jobReport.put("jobState",
                JobStatus.getJobRunState(jobSts.getRunState()));
        jobReport.put("failureInfo", failureInfo);
        // Guard against a null RunningJob (e.g. a retired job).
        if (job != null) {
            jobReport.put("jobFile", job.getJobFile());
            jobReport.put("trackingURL", job.getTrackingURL());
        }
        jobReport.putAll(getDetailedJobReport(jobId));
        return jobReport;
    }
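    /*
     * Each job report built above carries, per job (illustrative subset):
     *
     *   jobId / jobName / jobPriority / userName -> identification
     *   mapProgress, reduceProgress              -> percentages (0-100)
     *   mapTotal, mapCompleted, reduceTotal,
     *   reduceCompleted                          -> task counts
     *   jobState                                 -> e.g. "RUNNING", "SUCCEEDED"
     *
     * plus the counters and per-phase task reports merged in from
     * getDetailedJobReport().
     */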
    /**
     * Builds a detailed report (counters and per-phase task reports) for a
     * job.
     *
     * @param jobId
     *            the job id
     * @return the detailed job report map
     * @throws IOException
     */
    private Map<String, Object> getDetailedJobReport(
            org.apache.hadoop.mapred.JobID jobId) throws IOException {
        Map<String, Object> jobDetailedReport = new HashMap<String, Object>();
        RunningJob job = jobClient.getJob(jobId);
        if (job == null) {
            // No RunningJob handle (e.g. a retired job); return an empty
            // report rather than risking a NullPointerException.
            return jobDetailedReport;
        }
        Counters counters = job.getCounters();
        List<Map<String, Object>> counterList = new ArrayList<Map<String, Object>>();
        for (Group group : counters) {
            Map<String, Object> counterMap = new HashMap<String, Object>();
            counterMap.put("name", group.getDisplayName());
            List<Map<String, Object>> subCounters = new ArrayList<Map<String, Object>>();
            for (Counter counter : group) {
                Map<String, Object> subCounter = new HashMap<String, Object>();
                subCounter.put("name", counter.getDisplayName());
                subCounter.put("value", counter.getCounter());
                subCounters.add(subCounter);
            }
            counterMap.put("subCounters", subCounters);
            counterList.add(counterMap);
        }
        jobDetailedReport.put("counters", counterList);
        jobDetailedReport.put("mapReport",
                getTaskReport(jobClient.getMapTaskReports(jobId)));
        jobDetailedReport.put("reduceReport",
                getTaskReport(jobClient.getReduceTaskReports(jobId)));
        jobDetailedReport.put("cleanupReport",
                getTaskReport(jobClient.getCleanupTaskReports(jobId)));
        jobDetailedReport.put("setupReport",
                getTaskReport(jobClient.getSetupTaskReports(jobId)));
        return jobDetailedReport;
    }
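    /*
     * The "counters" entry built above is a list of counter groups; the
     * values below are purely illustrative:
     *
     *   [ { name        : "Map-Reduce Framework",
     *       subCounters : [ { name : "Map input records",     value : 1000 },
     *                       { name : "Reduce output records", value : 250 } ] } ]
     */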
    /**
     * Gets the details of a single job by its id.
     *
     * @param jobClient
     *            the job client
     * @param jobId
     *            the job id
     * @return the job report map, or null if no job with the given id was
     *         found
     * @throws AnkushException
     */
    public Map<String, Object> getJobDetails(JobClient jobClient, String jobId)
            throws AnkushException {
        String errMsg = "Unable to fetch Hadoop job details, could not connect to Hadoop JobClient.";
        try {
            if (jobClient != null) {
                // Get the jobs that are submitted and look for the given id.
                JobStatus[] jobStatus = jobClient.getAllJobs();
                for (JobStatus jobSts : jobStatus) {
                    // Return the report of the job whose id matches.
                    if (jobSts.getJobID().toString().equals(jobId)) {
                        return getJobReport(jobSts);
                    }
                }
            }
        } catch (Exception e) {
            HadoopUtils.addAndLogError(this.LOG, this.clusterConfig, errMsg,
                    Constant.Component.Name.HADOOP, e);
            throw new AnkushException(errMsg);
        }
        return null;
    }

    /**
     * Gets the scheduler type configured on the JobTracker (default, Fair,
     * or Capacity).
     *
     * @return the scheduler type
     */
    private String getSchedulerType() {
        String schedulerType = "default";
        try {
            String hadoopConfPath = HadoopUtils
                    .getHadoopConfDir(this.compConfig);
            String mapredFilePath = hadoopConfPath
                    + HadoopConstants.FileName.ConfigurationFile.XML_MAPRED_SITE;
            AnkushTask readXmlProperty = new ReadConfProperty(
                    "mapred.jobtracker.taskScheduler", mapredFilePath,
                    Constant.File_Extension.XML,
                    this.clusterConfig.getAgentInstallDir());
            NodeConfig jobTrackerNode = this.clusterConfig.getNodes().get(
                    HadoopUtils.getJobTrackerHost(compConfig));
            Result result = jobTrackerNode.getConnection()
                    .exec(readXmlProperty);
            if (result.isSuccess && result.sysout != null) {
                if (result.sysout.contains("FairScheduler")) {
                    schedulerType = "Fair";
                }
                if (result.sysout.contains("CapacityTaskScheduler")) {
                    schedulerType = "Capacity";
                }
            }
        } catch (Exception e) {
            HadoopUtils.addAndLogError(
                    this.LOG,
                    this.clusterConfig,
                    "Could not get scheduler type from "
                            + HadoopConstants.FileName.ConfigurationFile.XML_MAPRED_SITE
                            + " file.", Constant.Component.Name.HADOOP, e);
        }
        // Returns the scheduler type.
        return schedulerType;
    }

    /**
     * Gets the running jobs from the given list.
     *
     * @param allJobsList
     *            the list of all jobs
     * @return the list of running jobs
     */
    private List<Map<String, Object>> getRunningJobList(
            List<Map<String, Object>> allJobsList) {
        return getJobList("RUNNING", allJobsList);
    }

    /**
     * Gets the completed jobs from the given list.
     *
     * @param allJobsList
     *            the list of all jobs
     * @return the list of completed jobs
     */
    private List<Map<String, Object>> getCompletedJobs(
            List<Map<String, Object>> allJobsList) {
        return getJobList("SUCCEEDED", allJobsList);
    }

    /**
     * Filters the job list by state.
     *
     * @param state
     *            the job state, e.g. "RUNNING" or "SUCCEEDED"
     * @param allJobsList
     *            the list of all jobs
     * @return the jobs in the given state
     */
    private List<Map<String, Object>> getJobList(String state,
            List<Map<String, Object>> allJobsList) {
        // List of maps for storing the matching job information.
        List<Map<String, Object>> jobList = new ArrayList<Map<String, Object>>();
        // Iterating over all jobs submitted to the cluster.
        for (Map<String, Object> jAct : allJobsList) {
            // Extracting the job state.
            String jobState = (String) jAct.get("jobState");
            if (jobState.equals(state)) {
                jobList.add(jAct);
            }
        }
        return jobList;
    }

    /**
     * Builds a job client connected to the JobTracker at the given host and
     * port.
     *
     * @param host
     *            the JobTracker host
     * @param port
     *            the JobTracker RPC port
     * @return the JobClient, or null if the connection could not be made
     */
    private JobClient getJobClient(String host, String port) {
        // JobClient is the primary interface for the user-job to interact
        // with the JobTracker.
        JobClient jobClient = null;
        if (host == null) {
            host = "localhost";
        }
        LOG.info("Requesting job client..");
        try {
            // Provides access to configuration parameters.
            Configuration conf = new Configuration();
            LOG.info("JobClient : " + host + " & port : " + port);
            // Build a job client, connecting to the indicated JobTracker.
            jobClient = new JobClient(new InetSocketAddress(host,
                    Integer.parseInt(port)), new JobConf(conf));
            // Set the configuration to be used by this object.
            jobClient.setConf(conf);
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
        }
        return jobClient;
    }
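    /*
     * Note (assumption about the MRv1 client API): building the client from
     * an InetSocketAddress as above should be equivalent to setting
     * "mapred.job.tracker" to "host:port" on the JobConf and using the
     * JobClient(JobConf) constructor, which resolves the JobTracker address
     * from that property.
     */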
    /**
     * Builds an aggregated report for the given task reports.
     *
     * @param taskReports
     *            the task reports
     * @return the aggregated task report map
     */
    private Map<String, Object> getTaskReport(TaskReport[] taskReports) {
        Map<String, Object> taskReportsInfo = new HashMap<String, Object>();
        try {
            // A report on the state of the tasks.
            if (taskReports != null) {
                LOG.info("Total tasks : " + taskReports.length);
                List<Map<String, Object>> taskLists = new ArrayList<Map<String, Object>>();
                int completeTask = 0;
                int failedTask = 0;
                int killedTask = 0;
                int runningTask = 0;
                int pendingTask = 0;
                Map<String, Object[]> diagInfo = new HashMap<String, Object[]>();
                // Iterating over the task reports.
                for (TaskReport mtr : taskReports) {
                    // Map for storing the task details.
                    Map<String, Object> taskReport = new HashMap<String, Object>();
                    // The current status of the task.
                    TIPStatus currentStatus = mtr.getCurrentStatus();
                    // Checking for task's current status COMPLETE.
                    if (currentStatus == TIPStatus.COMPLETE) {
                        completeTask++;
                    }
                    // Checking for task's current status KILLED.
                    if (currentStatus == TIPStatus.KILLED) {
                        killedTask++;
                    }
                    // Checking for task's current status RUNNING.
                    if (currentStatus == TIPStatus.RUNNING) {
                        runningTask++;
                    }
                    // Checking for task's current status PENDING.
                    if (currentStatus == TIPStatus.PENDING) {
                        pendingTask++;
                    }
                    // Checking for task's current status FAILED.
                    if (currentStatus == TIPStatus.FAILED) {
                        failedTask++;
                    }
                    // The id of the task.
                    TaskID taskId = mtr.getTaskID();
                    float progress = mtr.getProgress();
                    // The most recent state.
                    String state = mtr.getState();
                    // Putting values in the map.
                    taskReport.put("taskId", taskId.toString());
                    taskReport.put("successfulTaskAttempt", mtr
                            .getSuccessfulTaskAttempt().toString());
                    taskReport.put("startTime", mtr.getStartTime());
                    taskReport.put("finishTime", mtr.getFinishTime());
                    taskReport.put("progress", progress * 100);
                    taskReport.put("state", state);
                    taskReport.put("currentStatus", currentStatus);
                    Counters counters = mtr.getCounters();
                    List<Map<String, Object>> countersList = new ArrayList<Map<String, Object>>();
                    for (Group group : counters) {
                        Map<String, Object> counterMap = new HashMap<String, Object>();
                        counterMap.put("name", group.getDisplayName());
                        List<Map<String, Object>> subCounters = new ArrayList<Map<String, Object>>();
                        for (Counter counter : group) {
                            Map<String, Object> subCounter = new HashMap<String, Object>();
                            subCounter.put("name", counter.getDisplayName());
                            subCounter.put("value", counter.getCounter());
                            subCounters.add(subCounter);
                        }
                        counterMap.put("subCounters", subCounters);
                        countersList.add(counterMap);
                    }
                    taskReport.put("counters", countersList);
                    taskLists.add(taskReport);
                    // A list of error messages.
                    String[] diagnostics = mtr.getDiagnostics();
                    if (diagnostics != null) {
                        int count = 0;
                        // Iterating over the list of error messages.
                        for (String di : diagnostics) {
                            Object[] diagStatus = new Object[2];
                            diagStatus[0] = taskId;
                            diagStatus[1] = di;
                            diagInfo.put(taskId + "_" + count, diagStatus);
                            count++;
                        }
                    }
                }
                // Putting aggregated values in the map.
                taskReportsInfo.put("completedTask", completeTask);
                taskReportsInfo.put("pendingTask", pendingTask);
                taskReportsInfo.put("killedTask", killedTask);
                taskReportsInfo.put("runningTask", runningTask);
                taskReportsInfo.put("failedTask", failedTask);
                taskReportsInfo.put("failedOrKilledTask", failedTask
                        + killedTask);
                taskReportsInfo.put("diagInfo", diagInfo);
                taskReportsInfo.put("tasks", taskLists);
            }
        } catch (Exception e) {
            HadoopUtils.addAndLogError(this.LOG, this.clusterConfig,
                    "Could not get task report",
                    Constant.Component.Name.HADOOP, e);
        }
        return taskReportsInfo;
    }
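    /*
     * The aggregated task report built above looks roughly like this
     * (illustrative):
     *
     *   completedTask / failedTask / killedTask /
     *   runningTask / pendingTask -> per-status counts
     *   failedOrKilledTask        -> failedTask + killedTask
     *   diagInfo                  -> { "<taskId>_<n>" : [taskId, message] }
     *   tasks                     -> per-task detail maps (id, attempt,
     *                                progress, state, counters)
     */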
}