/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hive.ql.exec;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.errors.ErrorAndSolution;
import org.apache.hadoop.hive.ql.exec.errors.TaskLogProcessor;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TaskCompletionEvent;

/**
 * JobDebugger takes a RunningJob that has failed, finds the task with the
 * most failed attempts (a task is typically retried up to 4 times before the
 * job is failed), and outputs debugging information for it to the Hive CLI.
 */
public class JobDebugger implements Runnable {
  private final JobConf conf;
  private final RunningJob rj;
  private final LogHelper console;
  private final Map<String, List<List<String>>> stackTraces;
  // Mapping from task ID to the number of failures
  private final Map<String, Integer> failures = new HashMap<String, Integer>();
  private final Set<String> successes = new HashSet<String>(); // Successful task IDs
  private final Map<String, TaskInfo> taskIdToInfo = new HashMap<String, TaskInfo>();

  // Used for showJobFailDebugInfo
  private static class TaskInfo {
    String jobId;
    Set<String> logUrls;

    public TaskInfo(String jobId) {
      this.jobId = jobId;
      logUrls = new HashSet<String>();
    }

    public void addLogUrl(String logUrl) {
      logUrls.add(logUrl);
    }

    public Set<String> getLogUrls() {
      return logUrls;
    }

    public String getJobId() {
      return jobId;
    }
  }

  public JobDebugger(JobConf conf, RunningJob rj, LogHelper console) {
    this.conf = conf;
    this.rj = rj;
    this.console = console;
    this.stackTraces = null;
  }

  public JobDebugger(JobConf conf, RunningJob rj, LogHelper console,
      Map<String, List<List<String>>> stackTraces) {
    this.conf = conf;
    this.rj = rj;
    this.console = console;
    this.stackTraces = stackTraces;
  }

  public void run() {
    try {
      showJobFailDebugInfo();
    } catch (IOException e) {
      console.printError(e.getMessage());
    }
  }

  // Builds a direct tasktracker log URL for a task attempt, requesting only
  // the last 8KB of the log.
  private String getTaskAttemptLogUrl(String taskTrackerHttpAddress, String taskAttemptId) {
    return taskTrackerHttpAddress + "/tasklog?taskid=" + taskAttemptId + "&start=-8193";
  }
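  /**
   * TaskLogGrabber pages through the job's TaskCompletionEvents: the call
   * returns only a bounded batch per invocation, so the loop advances
   * startIndex until an empty batch comes back. For each attempt it records
   * the log URL and a per-task failure count. It is a Runnable so that
   * showJobFailDebugInfo can run it on a separate thread and bound how long
   * log collection is allowed to take.
   */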
  class TaskLogGrabber implements Runnable {

    public void run() {
      try {
        getTaskLogs();
      } catch (IOException e) {
        console.printError(e.getMessage());
      }
    }

    private void getTaskLogs() throws IOException {
      int startIndex = 0;
      while (true) {
        TaskCompletionEvent[] taskCompletions = rj.getTaskCompletionEvents(startIndex);
        if (taskCompletions == null || taskCompletions.length == 0) {
          break;
        }

        boolean more = true;
        boolean firstError = true;
        for (TaskCompletionEvent t : taskCompletions) {
          // getTaskJobIDs returns Strings for compatibility with Hadoop
          // versions without TaskID or TaskAttemptID
          String[] taskJobIds = ShimLoader.getHadoopShims().getTaskJobIDs(t);

          if (taskJobIds == null) {
            console.printError("Task attempt info is unavailable in this Hadoop version");
            more = false;
            break;
          }

          // For each task completion event, get the associated task id,
          // job id and the logs
          String taskId = taskJobIds[0];
          String jobId = taskJobIds[1];
          if (firstError) {
            console.printError("Examining task ID: " + taskId + " (and more) from job " + jobId);
            firstError = false;
          }

          TaskInfo ti = taskIdToInfo.get(taskId);
          if (ti == null) {
            ti = new TaskInfo(jobId);
            taskIdToInfo.put(taskId, ti);
          }
          // These tasks should have come from the same job.
          assert (ti.getJobId() != null && ti.getJobId().equals(jobId));

          String taskAttemptLogUrl = ShimLoader.getHadoopShims().getTaskAttemptLogUrl(
              conf, t.getTaskTrackerHttp(), t.getTaskId());
          if (taskAttemptLogUrl != null) {
            ti.addLogUrl(taskAttemptLogUrl);
          }

          // If a task failed, then keep track of the total number of failures
          // for that task (typically, a task gets re-run up to 4 times if it
          // fails).
          if (t.getTaskStatus() != TaskCompletionEvent.Status.SUCCEEDED) {
            Integer failAttempts = failures.get(taskId);
            if (failAttempts == null) {
              failAttempts = Integer.valueOf(0);
            }
            failAttempts = Integer.valueOf(failAttempts.intValue() + 1);
            failures.put(taskId, failAttempts);
          } else {
            successes.add(taskId);
          }
        }
        if (!more) {
          break;
        }
        startIndex += taskCompletions.length;
      }
    }
  }
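  /**
   * Drives the debugging pass: launches a TaskLogGrabber on a separate
   * thread and waits up to the TASKLOG_DEBUG_TIMEOUT setting for it to
   * finish, drops tasks whose retries eventually succeeded, and then reports
   * only the task with the highest failure count, feeding its attempt logs
   * through TaskLogProcessor to extract stack traces and known
   * error/solution pairs.
   */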
  @SuppressWarnings("deprecation")
  private void showJobFailDebugInfo() throws IOException {
    console.printError("Error during job, obtaining debugging information...");

    // Loop to get all task completion events because getTaskCompletionEvents
    // only returns a subset per call
    TaskLogGrabber tlg = new TaskLogGrabber();
    Thread t = new Thread(tlg);
    try {
      t.start();
      t.join(HiveConf.getIntVar(conf, HiveConf.ConfVars.TASKLOG_DEBUG_TIMEOUT));
      // join(timeout) returns silently when the timeout elapses, so detect
      // a still-running grabber explicitly.
      if (t.isAlive()) {
        console.printError("Timed out trying to finish grabbing task log URLs, "
            + "some task info may be missing");
      }
    } catch (InterruptedException e) {
      console.printError("Interrupted while grabbing task log URLs, "
          + "some task info may be missing");
    }

    // Remove failures for tasks that succeeded
    for (String task : successes) {
      failures.remove(task);
    }

    if (failures.isEmpty()) {
      return;
    }

    // Find the highest failure count
    int maxFailures = 0;
    for (Integer failCount : failures.values()) {
      if (maxFailures < failCount.intValue()) {
        maxFailures = failCount.intValue();
      }
    }

    // Display the error message for the tasks with the highest failure count
    String jtUrl = JobTrackerURLResolver.getURL(conf);

    for (String task : failures.keySet()) {
      if (failures.get(task).intValue() == maxFailures) {
        TaskInfo ti = taskIdToInfo.get(task);
        String jobId = ti.getJobId();
        String taskUrl = jtUrl + "/taskdetails.jsp?jobid=" + jobId + "&tipid=" + task;

        TaskLogProcessor tlp = new TaskLogProcessor(conf);
        for (String logUrl : ti.getLogUrls()) {
          tlp.addTaskAttemptLogUrl(logUrl);
        }

        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.JOB_DEBUG_CAPTURE_STACKTRACES)
            && stackTraces != null) {
          if (!stackTraces.containsKey(jobId)) {
            stackTraces.put(jobId, new ArrayList<List<String>>());
          }
          stackTraces.get(jobId).addAll(tlp.getStackTraces());
        }

        if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.SHOW_JOB_FAIL_DEBUG_INFO)) {
          List<ErrorAndSolution> errors = tlp.getErrors();

          StringBuilder sb = new StringBuilder();
          // We use a StringBuilder and then call printError only once, as
          // printError will write to both stderr and the error log file. In
          // situations where both the stderr and the log file output are
          // simultaneously sent to a single stream, this looks cleaner.
          sb.append("\n");
          sb.append("Task with the most failures (" + maxFailures + "):\n");
          sb.append("-----\n");
          sb.append("Task ID:\n  " + task + "\n\n");
          sb.append("URL:\n  " + taskUrl + "\n");

          for (ErrorAndSolution e : errors) {
            sb.append("\n");
            sb.append("Possible error:\n  " + e.getError() + "\n\n");
            sb.append("Solution:\n  " + e.getSolution() + "\n");
          }
          sb.append("-----\n");

          console.printError(sb.toString());
        }

        // Only print out one task because that's good enough for debugging.
        break;
      }
    }
  }
}
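// A minimal usage sketch (the caller below is hypothetical; in Hive the
// driver that submitted the MapReduce job runs the debugger once RunningJob
// reports failure). Because JobDebugger implements Runnable, the caller can
// bound the total time spent scraping logs instead of blocking indefinitely:
//
//   Map<String, List<List<String>>> stackTraces =
//       new HashMap<String, List<List<String>>>();
//   Thread debugThread = new Thread(new JobDebugger(conf, rj, console, stackTraces));
//   debugThread.start();
//   debugThread.join();  // or join(timeoutMs) to cap total debugging time
//   // stackTraces now maps job IDs to captured traces (populated only when
//   // JOB_DEBUG_CAPTURE_STACKTRACES is enabled)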