/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.TaskTracker.TaskInProgress;
import org.apache.hadoop.syscall.LinuxSystemCall;
import org.apache.hadoop.util.ProcessTree;
import org.apache.hadoop.util.ProcfsBasedProcessTree;
import org.apache.hadoop.util.ResourceCalculatorPlugin;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Manages memory usage of tasks running under this TT. Kills any task-trees
 * that overflow and over-step memory limits.
 *
 * <p>Tasks are registered and unregistered through {@link #addTask} and
 * {@link #removeTask}, which only queue the request; the monitoring loop in
 * {@link #run()} applies the queued requests to its tracking map at the start
 * of every cycle.
 */
class TaskMemoryManagerThread extends Thread {

  /** Per-task memory limit, in MB, used when the job does not configure one. */
  public static final int TASK_MAX_PHYSICAL_MEMORY_MB_DEFAULT = 5 * 1024;

  private static final Log LOG =
      LogFactory.getLog(TaskMemoryManagerThread.class);

  // Both are left null by the test-only constructor.
  private TaskTracker taskTracker;
  private ResourceCalculatorPlugin resourceCalculator;

  // Interval (ms) restored by resetSleepInterval().
  private final long monitoringInterval;
  // Time (ms) slept between two monitoring cycles. Written through the
  // public setters and read by the monitoring loop, hence volatile.
  private volatile long sleepInterval;

  // The amounts of memory are all in bytes
  private final long maxMemoryAllowedForAllTasks;
  private long reservedRssMemory;
  private long maxRssMemoryAllowedForAllTasks;
  // Number of cycles since the RSS limits were last re-read.
  private int maxRssMemoryAllowedUpdateCounter;

  static private boolean doUpdateReservedPhysicalMemory = true;

  // Prefix of the diagnostic message attached to tasks killed/failed because
  // of the physical memory check.
  static public final String HIGH_MEMORY_KEYWORD = "high-memory";

  static public final String TT_MEMORY_MANAGER_MONITORING_INTERVAL =
      "mapred.tasktracker.taskmemorymanager.monitoring-interval";

  // The amount of memory which will not be used for running tasks
  // If this is violated, task with largest memory will be killed.
  static public final String TT_RESERVED_PHYSICAL_MEMORY_MB =
      "mapred.tasktracker.reserved.physicalmemory.mb";

  // The maximum amount of memory that can be used for running task.
  // If this is violated, task with largest memory will be killed.
  static public final String TT_MAX_RSS_MEMORY_MB =
      "mapred.tasktracker.tasks.max.rssmemory.mb";

  // Tasks currently tracked by the monitoring loop.
  private final Map<TaskAttemptID, ProcessTreeInfo> processTreeInfoMap;
  // Pending registrations; guarded by synchronized (tasksToBeAdded).
  private final Map<TaskAttemptID, ProcessTreeInfo> tasksToBeAdded;
  // Pending removals; guarded by synchronized (tasksToBeRemoved).
  private final List<TaskAttemptID> tasksToBeRemoved;

  private volatile boolean running = true;

  /**
   * Creates a memory manager bound to the given TaskTracker. The virtual
   * memory limit, the monitoring interval and the physical memory limits are
   * all read from the TaskTracker and its job configuration.
   *
   * @param taskTracker the TaskTracker whose tasks are monitored
   */
  public TaskMemoryManagerThread(TaskTracker taskTracker) {
    this(taskTracker.getTotalMemoryAllottedForTasksOnTT() * 1024 * 1024L,
        taskTracker.getJobConf().getLong(
            TT_MEMORY_MANAGER_MONITORING_INTERVAL, 5000L));
    this.taskTracker = taskTracker;
    this.resourceCalculator = taskTracker.resourceCalculatorPlugin;
    loadMaxRssMemoryConfig(taskTracker.getJobConf());
  }

  /**
   * Mainly for test purposes. Note that the tasktracker variable is not set
   * here.
   *
   * @param maxMemoryAllowedForAllTasks total virtual memory limit in bytes;
   *          a non-positive value disables the virtual memory check
   * @param monitoringInterval time in ms between two monitoring cycles
   */
  TaskMemoryManagerThread(long maxMemoryAllowedForAllTasks,
                          long monitoringInterval) {
    setName(this.getClass().getName());

    processTreeInfoMap =
        new ConcurrentHashMap<TaskAttemptID, ProcessTreeInfo>();
    tasksToBeAdded = new HashMap<TaskAttemptID, ProcessTreeInfo>();
    tasksToBeRemoved = new ArrayList<TaskAttemptID>();

    this.maxMemoryAllowedForAllTasks = maxMemoryAllowedForAllTasks > 0
        ? maxMemoryAllowedForAllTasks
        : JobConf.DISABLED_MEMORY_LIMIT;
    this.monitoringInterval = monitoringInterval;
    // Start with the configured interval; otherwise the loop would call
    // Thread.sleep(0) and spin until somebody invokes setSleepInterval().
    this.sleepInterval = monitoringInterval;
  }

  /**
   * Queues a task for monitoring. The task is picked up by the monitoring
   * loop at the beginning of its next cycle.
   *
   * @param tid task attempt to monitor
   * @param memLimit virtual memory limit of the task
   */
  public void addTask(TaskAttemptID tid, long memLimit) {
    synchronized (tasksToBeAdded) {
      LOG.debug("Add " + tid);
      ProcessTreeInfo ptInfo = new ProcessTreeInfo(tid, null, null, memLimit);
      tasksToBeAdded.put(tid, ptInfo);
    }
  }

  /**
   * Returns a snapshot of the tasks known to this manager: the tasks already
   * tracked plus the ones queued by {@link #addTask}, minus the ones queued
   * by {@link #removeTask}.
   *
   * <p>The pending queues are only read, never drained: the monitoring loop
   * is their consumer, and clearing them here would make it miss the queued
   * registrations and removals.
   *
   * @return a new list owned by the caller
   */
  public List<ProcessTreeInfo> getTasks() {
    List<ProcessTreeInfo> taskList = new ArrayList<ProcessTreeInfo>();
    synchronized (tasksToBeAdded) {
      taskList.addAll(tasksToBeAdded.values());
    }
    taskList.addAll(processTreeInfoMap.values());

    List<TaskAttemptID> pendingRemovals;
    synchronized (tasksToBeRemoved) {
      pendingRemovals = new ArrayList<TaskAttemptID>(tasksToBeRemoved);
    }
    if (!pendingRemovals.isEmpty()) {
      // The list holds ProcessTreeInfo objects, so entries have to be matched
      // through their task id rather than with List.remove(Object).
      for (Iterator<ProcessTreeInfo> it = taskList.iterator(); it.hasNext();) {
        if (pendingRemovals.contains(it.next().getTID())) {
          it.remove();
        }
      }
    }
    return taskList;
  }

  /**
   * Overrides the time slept between two monitoring cycles.
   *
   * @param sleepInterval new interval in ms
   */
  public void setSleepInterval(long sleepInterval) {
    this.sleepInterval = sleepInterval;
  }

  /** Restores the sleep interval to the configured monitoring interval. */
  public void resetSleepInterval() {
    this.sleepInterval = this.monitoringInterval;
  }

  /**
   * Queues a task for removal from monitoring. The removal is applied by the
   * monitoring loop at the beginning of its next cycle.
   *
   * @param tid task attempt to stop monitoring
   */
  public void removeTask(TaskAttemptID tid) {
    synchronized (tasksToBeRemoved) {
      LOG.debug("Remove " + tid);
      tasksToBeRemoved.add(tid);
    }
  }

  /**
   * Per-task bookkeeping: the task id, its memory limit and, once the pid of
   * the task is known, the pid and the process tree rooted at it.
   */
  public static class ProcessTreeInfo {
    private final TaskAttemptID tid;
    private String pid;
    private ProcfsBasedProcessTree pTree;
    private final long memLimit;
    private String pidFile;

    public ProcessTreeInfo(TaskAttemptID tid, String pid,
        ProcfsBasedProcessTree pTree, long memLimit) {
      this.tid = tid;
      this.pid = pid;
      this.pTree = pTree;
      this.memLimit = memLimit;
    }

    public TaskAttemptID getTID() {
      return tid;
    }

    /** @return the pid of the task, or null if it is not known yet */
    public String getPID() {
      return pid;
    }

    public void setPid(String pid) {
      this.pid = pid;
    }

    /** @return the process tree, or null if it is not initialized yet */
    public ProcfsBasedProcessTree getProcessTree() {
      return pTree;
    }

    public void setProcessTree(ProcfsBasedProcessTree pTree) {
      this.pTree = pTree;
    }

    public long getMemLimit() {
      return memLimit;
    }
  }

  /**
   * Monitoring loop. Until {@link #shutdown()} is called, each cycle:
   * <ol>
   * <li>applies the queued task registrations and removals,</li>
   * <li>refreshes the process tree of every tracked task and kills the tasks
   * that exceed their own virtual memory limit,</li>
   * <li>kills tasks with the least progress if the total virtual memory is
   * over the limit,</li>
   * <li>kills or fails the tasks with the highest RSS if the physical memory
   * limits are violated,</li>
   * <li>sleeps for the current sleep interval.</li>
   * </ol>
   */
  @Override
  public void run() {

    LOG.info("Starting thread: " + this.getClass());

    while (running) {
      try {
        // Print the processTrees for debugging.
        if (LOG.isDebugEnabled()) {
          StringBuilder tmp = new StringBuilder("[ ");
          for (ProcessTreeInfo p : processTreeInfoMap.values()) {
            tmp.append(p.getPID());
            tmp.append(" ");
          }
          LOG.debug("Current ProcessTree list : " + tmp + "]");
        }

        // Add new Tasks
        synchronized (tasksToBeAdded) {
          processTreeInfoMap.putAll(tasksToBeAdded);
          tasksToBeAdded.clear();
        }

        // Remove finished Tasks
        synchronized (tasksToBeRemoved) {
          for (TaskAttemptID tid : tasksToBeRemoved) {
            processTreeInfoMap.remove(tid);
          }
          tasksToBeRemoved.clear();
        }

        long memoryStillInUsage = 0;
        long rssMemoryStillInUsage = 0;
        taskTracker.setTaskTrackerRSSMem(
            resourceCalculator.getProcResourceValues().getPhysicalMemorySize());

        // Now, check memory usage and kill any overflowing tasks
        for (Iterator<Map.Entry<TaskAttemptID, ProcessTreeInfo>> it =
            processTreeInfoMap.entrySet().iterator(); it.hasNext();) {
          Map.Entry<TaskAttemptID, ProcessTreeInfo> entry = it.next();
          TaskAttemptID tid = entry.getKey();
          ProcessTreeInfo ptInfo = entry.getValue();
          try {
            String pId = ptInfo.getPID();

            // Initialize any uninitialized processTrees
            if (pId == null) {
              // get pid from taskAttemptId
              pId = taskTracker.getPid(ptInfo.getTID());
              if (pId != null) {
                // PID will be null, either if the pid file is yet to be created
                // or if the tip is finished and we removed pidFile, but the TIP
                // itself is still retained in runningTasks till successful
                // transmission to JT

                // create process tree object
                long sleeptimeBeforeSigkill = taskTracker.getJobConf().getLong(
                    JvmManager.SLEEPTIME_BEFORE_SIGKILL_KEY,
                    ProcessTree.DEFAULT_SLEEPTIME_BEFORE_SIGKILL);

                ProcfsBasedProcessTree pt = new ProcfsBasedProcessTree(
                    pId, ProcessTree.isSetsidAvailable, sleeptimeBeforeSigkill);
                LOG.debug("Tracking ProcessTree " + pId + " for the first time");

                ptInfo.setPid(pId);
                ptInfo.setProcessTree(pt);
              }
            }
            // End of initializing any uninitialized processTrees

            if (pId == null) {
              continue; // processTree cannot be tracked
            }

            LOG.debug("Constructing ProcessTree for : PID = " + pId
                + " TID = " + tid);
            ProcfsBasedProcessTree pTree = ptInfo.getProcessTree();
            pTree = pTree.getProcessTree(); // get the updated process-tree
            ptInfo.setProcessTree(pTree); // update ptInfo with process-tree of
            // updated state
            long currentMemUsage = pTree.getCumulativeVmem();
            long currentRssMemUsage = pTree.getCumulativeRssmem();
            // as processes begin with an age 1, we want to see if there
            // are processes more than 1 iteration old.
            long curMemUsageOfAgedProcesses = pTree.getCumulativeVmem(1);
            long limit = ptInfo.getMemLimit();
            String user = taskTracker.getUserName(ptInfo.tid);
            if (user == null) {
              // If user is null the task is deleted from the TT memory
              continue;
            }
            // Log RSS and virtual memory usage of all tasks
            if (LOG.isDebugEnabled()) {
              LOG.debug((String.format("Memory usage of ProcessTree %s : "
                  + "[USER,TID,RSS,VMEM,VLimit,TotalRSSLimit]"
                  + "=[%s,%s,%s,%s,%s,%s]",
                  pId, user, ptInfo.tid, currentRssMemUsage,
                  currentMemUsage, limit, maxRssMemoryAllowedForAllTasks)));
            }

            if (doCheckVirtualMemory()
                && isProcessTreeOverLimit(tid.toString(), currentMemUsage,
                    curMemUsageOfAgedProcesses, limit)) {
              // Task (the root process) is still alive and overflowing memory.
              // Dump the process-tree and then clean it up.
              String msg =
                  "TaskTree [pid=" + pId + ",tipID=" + tid
                      + "] is running beyond memory-limits. Current usage : "
                      + currentMemUsage + "bytes. Limit : " + limit
                      + "bytes. Killing task. \nDump of the process-tree for "
                      + tid + " : \n" + pTree.getProcessTreeDump();
              LOG.warn(msg);
              taskTracker.cleanUpOverMemoryTask(tid, true, msg);

              LinuxSystemCall.killProcessGroup(Integer.parseInt(pId));
              it.remove();
              LOG.info("Removed ProcessTree with root " + pId);
            } else {
              // Accounting the total memory in usage for all tasks that are still
              // alive and within limits.
              memoryStillInUsage += currentMemUsage;
              rssMemoryStillInUsage += currentRssMemUsage;
            }
          } catch (Exception e) {
            // Log the exception and proceed to the next task.
            LOG.warn("Uncaught exception in TaskMemoryManager "
                + "while managing memory of " + tid, e);
          }
        }

        long availableRssMemory =
            resourceCalculator.getAvailablePhysicalMemorySize();
        long phyTotal = resourceCalculator.getPhysicalMemorySize();
        long unaccountedMemory =
            phyTotal - availableRssMemory - rssMemoryStillInUsage;
        taskTracker.getTaskTrackerInstrumentation().unaccountedMemory(
            unaccountedMemory);

        LOG.info("phyTotal:" + phyTotal
            + " unaccounted:" + unaccountedMemory
            + " vMemory:" + memoryStillInUsage
            + " rssMemory:" + rssMemoryStillInUsage
            + " rssMemoryLimit:" + maxRssMemoryAllowedForAllTasks
            + " rssMemoryAvailable:" + availableRssMemory
            + " rssMemoryReserved:" + reservedRssMemory
            + " totalTasks:" + processTreeInfoMap.size());

        if (doCheckVirtualMemory()
            && memoryStillInUsage > maxMemoryAllowedForAllTasks) {
          LOG.warn("The total memory in usage " + memoryStillInUsage
              + " is overflowing TTs limits "
              + maxMemoryAllowedForAllTasks
              + ". Trying to kill a few tasks with the least progress.");
          killTasksWithLeastProgress(memoryStillInUsage);
        }

        updateMaxRssMemory();
        if (doCheckPhysicalMemory()
            && (rssMemoryStillInUsage > maxRssMemoryAllowedForAllTasks
                || availableRssMemory < reservedRssMemory)) {
          LOG.warn("The total physical memory in usage " + rssMemoryStillInUsage
              + " is overflowing TTs limits "
              + maxRssMemoryAllowedForAllTasks
              + ". Trying to kill a few tasks with the highest memory.");
          failTasksWithMaxRssMemory(rssMemoryStillInUsage, availableRssMemory);
        }

        // Sleep for some time before beginning next cycle
        LOG.debug(this.getClass() + " : Sleeping for " + sleepInterval + " ms");
        Thread.sleep(sleepInterval);
      } catch (InterruptedException iex) {
        // shutdown() interrupts the thread after clearing 'running'; only an
        // interrupt received while still running is unexpected.
        if (running) {
          LOG.error("Class " + this.getClass() + " was interrupted", iex);
        }
      } catch (Throwable t) {
        // Keep the monitoring loop alive whatever a single cycle throws.
        LOG.error("Class " + this.getClass() + " encountered error", t);
      }
    }
  }

  /**
   * Is the total physical memory check enabled?
   * @return true if total physical memory check is enabled.
   */
  private boolean doCheckPhysicalMemory() {
    return !(maxRssMemoryAllowedForAllTasks == JobConf.DISABLED_MEMORY_LIMIT);
  }

  /**
   * Is the total virtual memory check enabled?
   * @return true if total virtual memory check is enabled.
   */
  private boolean doCheckVirtualMemory() {
    return !(maxMemoryAllowedForAllTasks == JobConf.DISABLED_MEMORY_LIMIT);
  }

  /**
   * Disable updating the reserved physical memory. Used only for tests.
   */
  static public void disableUpdateReservedPhysicalMemory() {
    doUpdateReservedPhysicalMemory = false;
  }

  /**
   * Read the reserved physical memory configuration and update the maximum
   * physical memory allowed periodically. This allows us to change the
   * physical memory limit configuration without restarting TaskTracker.
   *
   * <p>The configuration is re-read once the counter of calls exceeds
   * {@code MEM_CONFIGURATION_READ_PERIOD}; the counter is then reset.
   */
  private void updateMaxRssMemory() {
    if (!doUpdateReservedPhysicalMemory) {
      return;
    }
    final int MEM_CONFIGURATION_READ_PERIOD = 100;
    maxRssMemoryAllowedUpdateCounter++;
    if (maxRssMemoryAllowedUpdateCounter > MEM_CONFIGURATION_READ_PERIOD) {
      maxRssMemoryAllowedUpdateCounter = 0;
      loadMaxRssMemoryConfig(new Configuration());
    }
  }

  /**
   * Loads the physical memory limits from the given configuration.
   *
   * <ul>
   * <li>If {@link #TT_RESERVED_PHYSICAL_MEMORY_MB} is not set, both the
   * reserved memory and the maximum RSS limit are disabled.</li>
   * <li>Otherwise the maximum RSS limit is taken from
   * {@link #TT_MAX_RSS_MEMORY_MB} when set, else it is the total physical
   * memory reported by the TaskTracker minus the reserved memory.</li>
   * </ul>
   *
   * @param conf configuration to read the limits (in MB) from
   */
  private void loadMaxRssMemoryConfig(Configuration conf) {
    long reservedRssMemoryMB =
        conf.getLong(TaskMemoryManagerThread.TT_RESERVED_PHYSICAL_MEMORY_MB,
            JobConf.DISABLED_MEMORY_LIMIT);
    long maxRssMemoryAllowedForAllTasksMB =
        conf.getLong(TaskMemoryManagerThread.TT_MAX_RSS_MEMORY_MB,
            JobConf.DISABLED_MEMORY_LIMIT);
    if (reservedRssMemoryMB == JobConf.DISABLED_MEMORY_LIMIT) {
      reservedRssMemory = JobConf.DISABLED_MEMORY_LIMIT;
      maxRssMemoryAllowedForAllTasks = JobConf.DISABLED_MEMORY_LIMIT;
    } else {
      reservedRssMemory = reservedRssMemoryMB * 1024 * 1024L;
      if (maxRssMemoryAllowedForAllTasksMB == JobConf.DISABLED_MEMORY_LIMIT) {
        maxRssMemoryAllowedForAllTasks =
            taskTracker.getTotalPhysicalMemoryOnTT() - reservedRssMemory;
      } else {
        maxRssMemoryAllowedForAllTasks =
            maxRssMemoryAllowedForAllTasksMB * 1024 * 1024L;
      }
    }
  }

  /**
   * Check whether a task's process tree's current memory usage is over limit.
   *
   * When a java process exec's a program, it could momentarily account for
   * double the size of it's memory, because the JVM does a fork()+exec()
   * which at fork time creates a copy of the parent's memory. If the
   * monitoring thread detects the memory used by the task tree at the same
   * instance, it could assume it is over limit and kill the tree, for no
   * fault of the process itself.
   *
   * We counter this problem by employing a heuristic check:
   * - if a process tree exceeds the memory limit by more than twice,
   * it is killed immediately
   * - if a process tree has processes older than the monitoring interval
   * exceeding the memory limit by even 1 time, it is killed. Else it is given
   * the benefit of doubt to lie around for one more iteration.
   *
   * @param tId Task Id for the task tree
   * @param currentMemUsage Memory usage of a task tree
   * @param curMemUsageOfAgedProcesses Memory usage of processes older than
   *                                    an iteration in a task tree
   * @param limit The limit specified for the task
   * @return true if the memory usage is more than twice the specified limit,
   *              or if processes in the tree, older than this thread's
   *              monitoring interval, exceed the memory limit. False,
   *              otherwise.
   */
  boolean isProcessTreeOverLimit(String tId,
                                 long currentMemUsage,
                                 long curMemUsageOfAgedProcesses,
                                 long limit) {
    boolean isOverLimit = false;

    if (currentMemUsage > (2*limit)) {
      LOG.warn("Process tree for task: " + tId + " running over twice "
          + "the configured limit. Limit=" + limit
          + ", current usage = " + currentMemUsage);
      isOverLimit = true;
    } else if (curMemUsageOfAgedProcesses > limit) {
      LOG.warn("Process tree for task: " + tId
          + " has processes older than 1 "
          + "iteration running over the configured limit. Limit=" + limit
          + ", current usage = " + curMemUsageOfAgedProcesses);
      isOverLimit = true;
    }

    return isOverLimit;
  }

  // method provided just for easy testing purposes
  boolean isProcessTreeOverLimit(ProcfsBasedProcessTree pTree,
                                 String tId, long limit) {
    long currentMemUsage = pTree.getCumulativeVmem();
    // as processes begin with an age 1, we want to see if there are processes
    // more than 1 iteration old.
    long curMemUsageOfAgedProcesses = pTree.getCumulativeVmem(1);
    return isProcessTreeOverLimit(tId, currentMemUsage,
        curMemUsageOfAgedProcesses, limit);
  }

  /**
   * Kills tasks selected by {@code TaskTracker.findTaskToKill} until the
   * total virtual memory is back under the limit or no candidate is left.
   *
   * @param memoryStillInUsage virtual memory currently used by all tasks
   */
  private void killTasksWithLeastProgress(long memoryStillInUsage) {

    List<TaskAttemptID> tasksToKill = new ArrayList<TaskAttemptID>();
    List<TaskAttemptID> tasksToExclude = new ArrayList<TaskAttemptID>();
    // Find tasks to kill so as to get memory usage under limits.
    while (memoryStillInUsage > maxMemoryAllowedForAllTasks) {
      // Exclude tasks that are already marked for
      // killing.
      TaskInProgress task = taskTracker.findTaskToKill(tasksToExclude);
      if (task == null) {
        break; // couldn't find any more tasks to kill.
      }

      TaskAttemptID tid = task.getTask().getTaskID();
      ProcessTreeInfo ptInfo = processTreeInfoMap.get(tid);
      ProcfsBasedProcessTree pTree =
          ptInfo == null ? null : ptInfo.getProcessTree();
      // A tracked task whose pid is not known yet has no process tree; its
      // memory cannot be accounted for, so it is not selected.
      if (pTree != null) {
        memoryStillInUsage -= pTree.getCumulativeVmem();
        tasksToKill.add(tid);
      }
      // Exclude this task from next search because it is already
      // considered.
      tasksToExclude.add(tid);
    }

    // Now kill the tasks.
    if (!tasksToKill.isEmpty()) {
      for (TaskAttemptID tid : tasksToKill) {
        String msg =
            "Killing one of the least progress tasks - " + tid
                + ", as the cumulative memory usage of all the tasks on "
                + "the TaskTracker exceeds virtual memory limit "
                + maxMemoryAllowedForAllTasks + ".";
        LOG.warn(msg);
        killTask(tid, msg, false);
      }
    } else {
      LOG.info("The total memory usage is overflowing TTs limits. "
          + "But found no alive task to kill for freeing memory.");
    }
  }

  /**
   * Return the cumulative rss memory used by a task
   * @param tid the task attempt ID of the task
   * @return rss memory usage in bytes. 0 if the task is not tracked or its
   *         process tree is not available
   */
  private long getTaskCumulativeRssmem(TaskAttemptID tid) {
    ProcessTreeInfo ptInfo = processTreeInfoMap.get(tid);
    if (ptInfo == null) {
      return 0;
    }
    ProcfsBasedProcessTree pTree = ptInfo.getProcessTree();
    return pTree == null ? 0 : pTree.getCumulativeRssmem();
  }

  /**
   * Starting from the tasks use the highest amount of RSS memory,
   * fail the tasks until the RSS memory meets the requirement
   * @param rssMemoryInUsage RSS memory used by all tasks
   * @param availableRssMemory The free and cache memory in the system
   */
  private void failTasksWithMaxRssMemory(
      long rssMemoryInUsage, long availableRssMemory) {

    List<TaskAttemptID> tasksToKill = new ArrayList<TaskAttemptID>();

    // Read the RSS usage of every task once, so that the sort keys are fixed
    // and the same figures are used for ordering, accounting and reporting.
    final Map<TaskAttemptID, Long> rssByTask =
        new HashMap<TaskAttemptID, Long>();
    for (TaskAttemptID tid : processTreeInfoMap.keySet()) {
      rssByTask.put(tid, getTaskCumulativeRssmem(tid));
    }
    List<TaskAttemptID> allTasks =
        new ArrayList<TaskAttemptID>(rssByTask.keySet());

    // Sort the tasks descendingly according to RSS memory usage
    Collections.sort(allTasks, new Comparator<TaskAttemptID>() {
      @Override
      public int compare(TaskAttemptID tid1, TaskAttemptID tid2) {
        long rss1 = rssByTask.get(tid1);
        long rss2 = rssByTask.get(tid2);
        // Equal usage must compare as 0 to honour the Comparator contract.
        return rss1 == rss2 ? 0 : (rss2 > rss1 ? 1 : -1);
      }
    });

    long rssMemoryStillInUsage = rssMemoryInUsage;
    long availableRssMemoryAfterKilling = availableRssMemory;

    // Fail the tasks one by one until the memory requirement is met
    while ((rssMemoryStillInUsage > maxRssMemoryAllowedForAllTasks
            || availableRssMemoryAfterKilling < reservedRssMemory)
        && !allTasks.isEmpty()) {
      TaskAttemptID tid = allTasks.remove(0);
      if (!isKillable(tid)) {
        continue;
      }
      long rssmem = rssByTask.get(tid);
      if (rssmem == 0) {
        // The list is sorted descendingly, so every remaining task also
        // reports 0: killing them would not be credited with any memory.
        break;
      }
      tasksToKill.add(tid);
      rssMemoryStillInUsage -= rssmem;
      availableRssMemoryAfterKilling += rssmem;
    }

    // Now kill the tasks.
    if (!tasksToKill.isEmpty()) {
      for (TaskAttemptID tid : tasksToKill) {
        long taskMemoryLimit = getTaskMemoryLimit(tid);
        long taskMemory = rssByTask.get(tid);
        ProcessTreeInfo ptInfo = processTreeInfoMap.get(tid);
        String pid = ptInfo == null ? null : ptInfo.getPID();
        String msg = HIGH_MEMORY_KEYWORD + " task:" + tid
            + " pid:" + pid
            + " taskMemory:" + taskMemory
            + " taskMemoryLimit:" + taskMemoryLimit
            + " availableMemory:" + availableRssMemory
            + " totalMemory:" + rssMemoryInUsage
            + " totalMemoryLimit:" + maxRssMemoryAllowedForAllTasks;
        // A task over its own limit is failed; a task within its limit is
        // only killed.
        if (taskMemory > taskMemoryLimit) {
          msg = "Failing " + msg;
          LOG.warn(msg);
          killTask(tid, msg, true);
        } else {
          msg = "Killing " + msg;
          LOG.warn(msg);
          killTask(tid, msg, false);
        }
      }
    } else {
      LOG.error("The total physical memory usage is overflowing TTs limits. "
          + "But found no alive task to kill for freeing memory.");
    }
  }

  /**
   * Returns the memory limit configured by the job for a task, read from the
   * map or reduce memory property depending on the task type, falling back to
   * {@link #TASK_MAX_PHYSICAL_MEMORY_MB_DEFAULT}.
   *
   * @param tid task attempt ID
   * @return the memory limit in bytes
   */
  private long getTaskMemoryLimit(TaskAttemptID tid) {
    JobConf conf;
    synchronized (this.taskTracker) {
      conf = this.taskTracker.tasks.get(tid).getJobConf();
    }
    long taskMemoryLimit = tid.isMap()
        ? conf.getInt(JobConf.MAPRED_JOB_MAP_MEMORY_MB_PROPERTY,
            TASK_MAX_PHYSICAL_MEMORY_MB_DEFAULT)
        : conf.getInt(JobConf.MAPRED_JOB_REDUCE_MEMORY_MB_PROPERTY,
            TASK_MAX_PHYSICAL_MEMORY_MB_DEFAULT);
    return taskMemoryLimit * 1024 * 1024L;
  }

  /**
   * Kill the task and clean up ProcessTreeInfo
   * @param tid task attempt ID of the task to be killed.
   * @param msg diagnostic message
   * @param wasFailure if true, fail the task
   */
  private void killTask(TaskAttemptID tid, String msg, boolean wasFailure) {
    // Kill the task and mark it as killed.
    taskTracker.cleanUpOverMemoryTask(tid, wasFailure, msg);

    // Now destroy the ProcessTree, remove it from monitoring map.
    ProcessTreeInfo ptInfo = processTreeInfoMap.remove(tid);
    if (ptInfo == null) {
      LOG.warn("No ProcessTree is tracked for " + tid);
      return;
    }
    String pid = ptInfo.getPID();
    if (pid == null) {
      // The pid was never resolved, so there is no process group to signal.
      LOG.warn("No pid is known for " + tid
          + "; skipping the process group kill");
    } else {
      try {
        LinuxSystemCall.killProcessGroup(Integer.parseInt(pid));
      } catch (java.io.IOException e) {
        LOG.error("Could not kill process group " + pid, e);
      } catch (NumberFormatException e) {
        LOG.error("Could not kill process group " + pid, e);
      }
    }
    LOG.info("Removed ProcessTree with root " + pid);
  }

  /**
   * Check if a task can be killed to increase free memory
   * @param tid task attempt ID
   * @return true if the task can be killed
   */
  private boolean isKillable(TaskAttemptID tid) {
    TaskInProgress tip = taskTracker.runningTasks.get(tid);
    return tip != null && !tip.wasKilled()
        && (tip.getRunState() == TaskStatus.State.RUNNING
            || tip.getRunState() == TaskStatus.State.COMMIT_PENDING);
  }

  /**
   * Stops the monitoring loop: clears the running flag and interrupts the
   * thread so that a pending sleep returns immediately.
   */
  public void shutdown() {
    this.running = false;
    this.interrupt();
  }
}