/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.mapred;

import java.util.Collection;

import org.apache.hadoop.metrics.MetricsContext;
import org.apache.hadoop.metrics.MetricsRecord;
import org.apache.hadoop.metrics.MetricsUtil;
import org.apache.hadoop.metrics.Updater;
import org.apache.hadoop.metrics.jvm.JvmMetrics;
import org.apache.hadoop.metrics.util.MetricsBase;
import org.apache.hadoop.metrics.util.MetricsRegistry;
import org.apache.hadoop.metrics.util.MetricsTimeVaryingRate;
import org.apache.hadoop.util.ProcfsBasedProcessTree;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * Task tracker instrumentation that publishes its numbers through the
 * metrics framework.
 *
 * <p>Task events (completion, timeout, failed ping, disk out of space) are
 * accumulated in per-instance counters guarded by this object's monitor.
 * Each time {@link #doUpdates(MetricsContext)} runs, those counters are added
 * to the "tasktracker" record of the "mapred" metrics context together with
 * values read from the owning task tracker, and the counters are then
 * cleared so that every event is reported once.
 */
class TaskTrackerMetricsInst extends TaskTrackerInstrumentation
    implements Updater {

  /** Configuration variable for extra jvms */
  public static final String EXTRA_JVMS = "mapred.extraJvms";

  private static final Log LOG =
      LogFactory.getLog(TaskTrackerMetricsInst.class);

  /** Registry of a subset of metrics */
  private final MetricsRegistry registry = new MetricsRegistry();

  /** Rate metric fed by {@link #addTaskLaunchMsecs(long)}. */
  private final MetricsTimeVaryingRate taskLaunchMsecs =
      new MetricsTimeVaryingRate(
          "taskLaunchMsecs",
          registry,
          "Msecs to launch a task after getting request.",
          true);

  /** All metrics are put here */
  private final MetricsRecord metricsRecord;

  /**
   * Number of completed map tasks (subset of numCompletedTasks),
   * includes setup/clean up tasks
   */
  private int numCompletedMapTasks;

  /** Number of completed reduce tasks (subset of numCompletedTasks) */
  private int numCompletedReduceTasks;

  /** Completed tasks of either kind since the last update. */
  private int numCompletedTasks;

  /** Timed-out tasks since the last update. */
  private int timedoutTasks;

  /** Tasks that failed to ping since the last update. */
  private int tasksFailedPing;

  /** Most recently reported unaccounted memory; published as-is, not reset. */
  private long unaccountedMemory;

  /** Tasks that hit disk-out-of-space since the last update. */
  private int numDiskOutOfSpaceTasks;

  /** Tree for checking the proc fs */
  private ProcfsBasedProcessTree processTree =
      new ProcfsBasedProcessTree("-1", false, -1);

  /** Extra JVMs allowed beyond the maps + reduces before dumping procs */
  private final int extraJvms;

  /** Whether the proc-fs based JVM check can be performed at all. */
  private final boolean checkJvms = ProcfsBasedProcessTree.isAvailable();

  /**
   * Creates the instrumentation for the given tracker, initialises the JVM
   * metrics, creates the "tasktracker" record in the "mapred" context and
   * registers this object as an updater of that context.
   *
   * @param t the task tracker being instrumented
   */
  public TaskTrackerMetricsInst(TaskTracker t) {
    super(t);

    final JobConf conf = tt.getJobConf();
    this.extraJvms = conf.getInt(EXTRA_JVMS, 16);
    final String sessionId = conf.getSessionId();

    // Initiate Java VM Metrics
    JvmMetrics.init("TaskTracker", sessionId);

    // Create a record for Task Tracker metrics
    final MetricsContext context = MetricsUtil.getContext("mapred");
    // guaranteed never null
    this.metricsRecord = MetricsUtil.createRecord(context, "tasktracker");
    this.metricsRecord.setTag("sessionId", sessionId);
    context.registerUpdater(this);
  }

  /** Counts one task that ran out of disk space. */
  @Override
  public synchronized void diskOutOfSpaceTask(TaskAttemptID t) {
    numDiskOutOfSpaceTasks += 1;
  }

  /**
   * Counts one completed task, both in the map/reduce specific counter and
   * in the overall counter.
   */
  @Override
  public synchronized void completeTask(TaskAttemptID t) {
    if (t.isMap()) {
      numCompletedMapTasks += 1;
    } else {
      numCompletedReduceTasks += 1;
    }
    numCompletedTasks += 1;
  }

  /** Counts one task that timed out. */
  @Override
  public synchronized void timedoutTask(TaskAttemptID t) {
    timedoutTasks += 1;
  }

  /** Counts one task that failed to ping. */
  @Override
  public synchronized void taskFailedPing(TaskAttemptID t) {
    tasksFailedPing += 1;
  }

  /** Records the latest unaccounted memory value, replacing the old one. */
  @Override
  public synchronized void unaccountedMemory(long memory) {
    this.unaccountedMemory = memory;
  }

  /** Feeds one task launch latency sample into the rate metric. */
  @Override
  public synchronized void addTaskLaunchMsecs(long msecs) {
    taskLaunchMsecs.inc(msecs);
  }

  /** @return the task launch latency rate metric */
  @Override
  public MetricsTimeVaryingRate getTaskLaunchMsecs() {
    return taskLaunchMsecs;
  }

  /**
   * Check the number of jvms running on this node and also set the metric.
   * A warning listing every matching process is logged when the count
   * exceeds the tracker's map and reduce slots plus {@link #extraJvms}.
   * For use in doUpdates().
   */
  private void checkAndSetJvms() {
    final Collection<String> javaProcesses =
        processTree.getProcessNameContainsCount("java ");
    final int observed = javaProcesses.size();
    metricsRecord.setMetric("all_node_jvms", observed);

    final int allowed =
        tt.getMaxActualMapTasks() + tt.getMaxActualReduceTasks() + extraJvms;
    if (observed <= allowed) {
      return;
    }

    LOG.warn("checkAndSetJvms: Expected up to " + allowed
        + " jvms, but got " + observed);
    for (String javaProcess : javaProcesses) {
      LOG.warn(javaProcess);
    }
  }

  /**
   * Since this object is a registered updater, this method will be called
   * periodically, e.g. every 5 seconds.
   *
   * <p>The record is filled while holding this object's monitor and is
   * flushed with {@code update()} after the monitor has been released.
   */
  @Override
  public void doUpdates(MetricsContext unused) {
    fillMetricsRecordLocked();
    metricsRecord.update();
  }

  /**
   * Copies every metric into the record and clears the accumulated task
   * counters. Synchronized so that event callbacks cannot interleave.
   */
  private synchronized void fillMetricsRecordLocked() {
    for (MetricsBase registered : registry.getMetricsList()) {
      registered.pushMetric(metricsRecord);
    }

    // Current values read from the tracker.
    metricsRecord.setMetric("aveMapSlotRefillMsecs",
        tt.getAveMapSlotRefillMsecs());
    metricsRecord.setMetric("aveReduceSlotRefillMsecs",
        tt.getAveReduceSlotRefillMsecs());
    if (checkJvms) {
      checkAndSetJvms();
    }
    metricsRecord.setMetric("cgroup_memory_oom", tt.getCGroupOOM());
    metricsRecord.setMetric("maps_running", tt.getRunningMaps());
    metricsRecord.setMetric("reduces_running", tt.getRunningReduces());
    metricsRecord.setMetric("mapTaskSlots",
        (short) tt.getMaxActualMapTasks());
    metricsRecord.setMetric("reduceTaskSlots",
        (short) tt.getMaxActualReduceTasks());

    addAccumulatedTaskCountersToRecord();

    metricsRecord.setMetric("unaccounted_memory", unaccountedMemory);
    metricsRecord.incrMetric("tasks_failed_to_add_cgroup",
        tt.getAndResetNumFailedToAddTaskToCGroup());
    metricsRecord.incrMetric("tasks_saved_by_cgroup",
        tt.getAndResetAliveTaskNumInCGroup());
    metricsRecord.setMetric("rss_memory_usage", tt.getTaskTrackerRSSMem());
    metricsRecord.incrMetric("tasks_cpu_saved_by_cgroup",
        tt.getAndResetAliveTasksCPUMSecs());

    // One metric per RSS bucket of killed tasks.
    for (int bucket = 0; bucket < tt.getKilledTaskRssBucketsNum(); bucket++) {
      metricsRecord.incrMetric("tasks_killed_group_by_rss_" + bucket,
          tt.getAndRestNumOfKilledTasksByRssBucket(bucket));
    }

    clearAccumulatedTaskCounters();
  }

  /** Adds the locally accumulated task event counts to the record. */
  private void addAccumulatedTaskCountersToRecord() {
    metricsRecord.incrMetric("map_tasks_completed", numCompletedMapTasks);
    metricsRecord.incrMetric("reduce_tasks_completed",
        numCompletedReduceTasks);
    metricsRecord.incrMetric("tasks_completed", numCompletedTasks);
    metricsRecord.incrMetric("tasks_failed_timeout", timedoutTasks);
    metricsRecord.incrMetric("tasks_failed_ping", tasksFailedPing);
    metricsRecord.incrMetric("tasks_disk_out_of_space",
        numDiskOutOfSpaceTasks);
  }

  /** Zeroes the task event counters once they have been reported. */
  private void clearAccumulatedTaskCounters() {
    numCompletedMapTasks = 0;
    numCompletedReduceTasks = 0;
    numCompletedTasks = 0;
    timedoutTasks = 0;
    tasksFailedPing = 0;
    numDiskOutOfSpaceTasks = 0;
  }
}