/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.corona;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import org.apache.hadoop.metrics.MetricsContext;
import org.apache.hadoop.metrics.MetricsRecord;
import org.apache.hadoop.metrics.MetricsUtil;
import org.apache.hadoop.metrics.Updater;
import org.apache.hadoop.metrics.util.MetricsBase;
import org.apache.hadoop.metrics.util.MetricsIntValue;
import org.apache.hadoop.metrics.util.MetricsLongValue;
import org.apache.hadoop.metrics.util.MetricsRegistry;
import org.apache.hadoop.metrics.util.MetricsTimeVaryingInt;
import org.apache.hadoop.metrics.util.MetricsTimeVaryingLong;
/**
* Metrics for the Corona Cluster Manager.
*/
class ClusterManagerMetrics implements Updater {
  /** Metrics context name. */
  static final String CONTEXT_NAME = "clustermanager";
  /** The list of possible end states for a session. */
  private static final List<SessionStatus> SESSION_END_STATES = getEndStates();

  /** Metrics context. */
  private final MetricsContext context;
  /** Metrics record. */
  private final MetricsRecord metricsRecord;
  /** Metrics registry. */
  private final MetricsRegistry registry = new MetricsRegistry();
  /** Number of requested resources by resource type. */
  private final Map<ResourceType, MetricsTimeVaryingLong>
    typeToResourceRequested;
  /** Number of granted resources by resource type. */
  private final Map<ResourceType, MetricsTimeVaryingLong> typeToResourceGranted;
  /** Number of revokes issued (preemption) by resource type. */
  private final Map<ResourceType, MetricsTimeVaryingLong> typeToResourceRevoked;
  /** Number of resources released by sessions, by resource type. */
  private final Map<ResourceType, MetricsTimeVaryingLong>
    typeToResourceReleased;
  /** Number of pending resource requests by resource type. */
  private final Map<ResourceType, MetricsIntValue> typeToPendingCount;
  /**
   * Number of running resource requests by resource type. This should be the
   * same as the number of granted resources.
   */
  private final Map<ResourceType, MetricsIntValue> typeToRunningCount;
  /** Number of total slots by resource type. */
  private final Map<ResourceType, MetricsIntValue> typeToTotalSlots;
  /** Number of free slots by resource type. */
  private final Map<ResourceType, MetricsIntValue> typeToFreeSlots;
  /** Scheduler run time by resource type. */
  private final Map<ResourceType, MetricsIntValue> typeToSchedulerRunTime;
  /**
   * The start time of the current run of the scheduler by resource type.
   * Contains a non-zero value if the cycle is in progress and 0 otherwise.
   */
  private final Map<ResourceType, Long> typeToSchedulerCurrentCycleStart;
  /** Number of alive nodes. */
  private final MetricsIntValue aliveNodes;
  /** Number of dead nodes. */
  private final MetricsIntValue deadNodes;
  /** Number of blacklisted nodes. */
  private final MetricsIntValue blacklistedNodes;
  /** Breakdown of sessions by session end status. */
  private final Map<SessionStatus, MetricsTimeVaryingInt>
    sessionStatusToMetrics;
  /** Number of running sessions. */
  private final MetricsIntValue numRunningSessions;
  /** Number of sessions since start of cluster manager. */
  private final MetricsTimeVaryingInt totalSessionCount;
  /** Number of pending calls to sessions. */
  private final MetricsIntValue pendingCallsCount;
  /** Number of CoronaJobTracker failures. */
  private final MetricsTimeVaryingInt numCJTFailures;
  /** Cluster manager scheduler for metrics. */
  private Scheduler scheduler;
  /** Cluster manager session notifier for metrics. */
  private SessionNotifier sessionNotifier;
  /** Number of task trackers that got restarted. */
  private final MetricsIntValue numTaskTrackerRestarted;
  /** Number of remote job trackers that timed out. */
  private final MetricsIntValue numRemoteJTTimedout;

  /**
   * Constructor. Creates one metric per (action, resource type) pair and per
   * session end state, all registered in this object's registry.
   * @param types The available resource types.
   */
  public ClusterManagerMetrics(Collection<ResourceType> types) {
    context = MetricsUtil.getContext(CONTEXT_NAME);
    metricsRecord = MetricsUtil.createRecord(context, CONTEXT_NAME);
    typeToResourceRequested = createTypeToResourceCountMap(types, "requested");
    typeToResourceGranted = createTypeToResourceCountMap(types, "granted");
    typeToResourceRevoked = createTypeToResourceCountMap(types, "revoked");
    typeToResourceReleased = createTypeToResourceCountMap(types, "released");
    typeToPendingCount = createTypeToCountMap(types, "pending");
    typeToRunningCount = createTypeToCountMap(types, "running");
    typeToTotalSlots = createTypeToCountMap(types, "total");
    typeToFreeSlots = createTypeToCountMap(types, "free");
    typeToSchedulerRunTime = createTypeToCountMap(types, "scheduler_runtime");
    typeToSchedulerCurrentCycleStart =
      new ConcurrentHashMap<ResourceType, Long>();
    sessionStatusToMetrics = createSessionStatusToMetricsMap();
    aliveNodes = new MetricsIntValue("alive_nodes", registry);
    deadNodes = new MetricsIntValue("dead_nodes", registry);
    blacklistedNodes = new MetricsIntValue("blacklisted_nodes", registry);
    numRunningSessions = new MetricsIntValue("num_running_sessions", registry);
    totalSessionCount = new MetricsTimeVaryingInt("total_sessions", registry);
    pendingCallsCount = new MetricsIntValue("num_pending_calls", registry);
    numCJTFailures = new MetricsTimeVaryingInt("num_cjt_failures", registry);
    numTaskTrackerRestarted =
      new MetricsIntValue("num_task_tracker_restarted", registry);
    numRemoteJTTimedout =
      new MetricsIntValue("num_remotejt_timedout", registry);
  }

  /**
   * Set the number of pending requests.
   * @param resourceType The resource type.
   * @param pending The number of pending requests.
   */
  public void setPendingRequestCount(ResourceType resourceType, int pending) {
    typeToPendingCount.get(resourceType).set(pending);
  }

  /**
   * Set the number of running resources.
   * @param resourceType The resource type.
   * @param running The number of running resources.
   */
  public void setRunningRequestCount(ResourceType resourceType, int running) {
    typeToRunningCount.get(resourceType).set(running);
  }

  /**
   * Set the number of total slots.
   * @param resourceType The resource type.
   * @param totalSlots The total number of slots.
   */
  public void setTotalSlots(ResourceType resourceType, int totalSlots) {
    typeToTotalSlots.get(resourceType).set(totalSlots);
  }

  /**
   * Set the number of free slots.
   * @param resourceType The resource type.
   * @param freeSlots The number of free slots.
   */
  public void setFreeSlots(ResourceType resourceType, int freeSlots) {
    typeToFreeSlots.get(resourceType).set(freeSlots);
  }

  /**
   * Record the run time of a completed scheduling cycle. Also resets the
   * in-progress cycle start time for the resource type to 0, so that
   * {@link #doUpdates(MetricsContext)} stops overwriting the run time with
   * the elapsed time of the current cycle.
   * @param resourceType The resource type.
   * @param runtime The run time of the completed cycle (presumably in
   *        milliseconds, matching the elapsed time computed in doUpdates;
   *        confirm against the caller).
   */
  public void setSchedulerRunTime(ResourceType resourceType, int runtime) {
    // This is called when the scheduling cycle is complete
    setSchedulerCurrentCycleStartTime(resourceType, 0);
    typeToSchedulerRunTime.get(resourceType).set(runtime);
  }

  /**
   * Set the start time of the scheduling cycle currently in progress.
   * @param resourceType The resource type.
   * @param tstamp The start timestamp; it is compared against
   *        {@link System#currentTimeMillis()} in doUpdates. A value of 0
   *        means no cycle is in progress.
   */
  public void setSchedulerCurrentCycleStartTime(
      ResourceType resourceType, long tstamp) {
    typeToSchedulerCurrentCycleStart.put(resourceType, tstamp);
  }

  /**
   * Record the request of a resource.
   * @param type The resource type.
   */
  public void requestResource(ResourceType type) {
    typeToResourceRequested.get(type).inc();
  }

  /**
   * Record the release of a resource.
   * @param type The resource type.
   */
  public void releaseResource(ResourceType type) {
    typeToResourceReleased.get(type).inc();
  }

  /**
   * Record the grant of a resource.
   * @param type The resource type.
   */
  public void grantResource(ResourceType type) {
    typeToResourceGranted.get(type).inc();
  }

  /**
   * Record the revoke of a resource.
   * @param type The resource type.
   */
  public void revokeResource(ResourceType type) {
    typeToResourceRevoked.get(type).inc();
  }

  /**
   * Set the number of alive nodes.
   * @param numAlive The number of alive nodes.
   */
  public void setAliveNodes(int numAlive) {
    aliveNodes.set(numAlive);
  }

  /**
   * Increase the count of restarted task trackers.
   * @param num The number of task trackers that got restarted.
   */
  public void restartTaskTracker(int num) {
    this.numTaskTrackerRestarted.inc(num);
  }

  /**
   * Increase the count of remote job trackers that timed out.
   * @param num The number of remote job trackers that timed out.
   */
  public void timeoutRemoteJT(int num) {
    this.numRemoteJTTimedout.inc(num);
  }

  /**
   * Set the number of dead nodes.
   * @param numDead The number of dead nodes.
   */
  public void setDeadNodes(int numDead) {
    deadNodes.set(numDead);
  }

  /**
   * Set the number of blacklisted nodes.
   * @param numBlacklisted The number of blacklisted nodes.
   */
  public void setBlacklistedNodes(int numBlacklisted) {
    blacklistedNodes.set(numBlacklisted);
  }

  /**
   * Set the number of running sessions.
   * @param num The number of running sessions.
   */
  public void setNumRunningSessions(int num) {
    numRunningSessions.set(num);
  }

  /**
   * Increment the number of sessions since the start of the cluster manager.
   */
  public void sessionStart() {
    totalSessionCount.inc();
  }

  /**
   * Record the end of a session.
   * @param finishState The state that the session finished in.
   * @throws IllegalArgumentException if finishState is not one of the
   *         tracked end states (for example RUNNING).
   */
  public void sessionEnd(SessionStatus finishState) {
    // Only end states have an entry, so a missing entry means a bad argument.
    MetricsTimeVaryingInt endCounter = sessionStatusToMetrics.get(finishState);
    if (endCounter == null) {
      throw new IllegalArgumentException("Invalid end state " + finishState);
    }
    endCounter.inc();
  }

  /**
   * Update the pending calls metric.
   * @param numPendingCalls the number of calls in the queue
   *        waiting to be sent out
   */
  public void setNumPendingCalls(int numPendingCalls) {
    pendingCallsCount.set(numPendingCalls);
  }

  /**
   * Records CoronaJobTracker failure.
   */
  public void recordCJTFailure() {
    numCJTFailures.inc();
  }

  /**
   * Set the scheduler and start the updating. The metrics won't be reported
   * until this is called.
   *
   * @param scheduler Scheduler for this cluster manager.
   * @param sessionNotifier Session Notifier for this cluster manager.
   */
  public void registerUpdater(Scheduler scheduler,
      SessionNotifier sessionNotifier) {
    this.scheduler = scheduler;
    this.sessionNotifier = sessionNotifier;
    context.registerUpdater(this);
  }

  /**
   * Get all the possible end states (non running) of the session.
   * @return The list of session statuses other than RUNNING.
   */
  private static List<SessionStatus> getEndStates() {
    List<SessionStatus> endStatesRet = new ArrayList<SessionStatus>();
    for (SessionStatus s : SessionStatus.values()) {
      if (s != SessionStatus.RUNNING) {
        endStatesRet.add(s);
      }
    }
    return endStatesRet;
  }

  /**
   * Create a map of session status -> metrics, with one entry per end state.
   * @return the map.
   */
  private Map<SessionStatus, MetricsTimeVaryingInt>
  createSessionStatusToMetricsMap() {
    Map<SessionStatus, MetricsTimeVaryingInt> m =
      new HashMap<SessionStatus, MetricsTimeVaryingInt>();
    for (SessionStatus endState : SESSION_END_STATES) {
      // Fixed locale: metric names must not depend on the JVM default locale.
      String name =
        endState.toString().toLowerCase(Locale.ENGLISH) + "_sessions";
      m.put(endState, new MetricsTimeVaryingInt(name, registry));
    }
    return m;
  }

  /**
   * Create a map of resource type -> current count.
   * @param resourceTypes The resource types.
   * @param actionType A string indicating pending, running etc.
   * @return The map.
   */
  private Map<ResourceType, MetricsIntValue> createTypeToCountMap(
      Collection<ResourceType> resourceTypes, String actionType) {
    Map<ResourceType, MetricsIntValue> m =
      new HashMap<ResourceType, MetricsIntValue>();
    for (ResourceType t : resourceTypes) {
      // Fixed locale: metric names must not depend on the JVM default locale.
      String name = (actionType + "_" + t).toLowerCase(Locale.ENGLISH);
      MetricsIntValue value = new MetricsIntValue(name, registry);
      m.put(t, value);
    }
    return m;
  }

  /**
   * Create a map of resource type -> cumulative counts.
   * @param resourceTypes The resource types.
   * @param actionType A string indicating granted, revoked, etc.
   * @return The map.
   */
  private Map<ResourceType, MetricsTimeVaryingLong>
  createTypeToResourceCountMap(
      Collection<ResourceType> resourceTypes, String actionType) {
    Map<ResourceType, MetricsTimeVaryingLong> m =
      new HashMap<ResourceType, MetricsTimeVaryingLong>();
    for (ResourceType t : resourceTypes) {
      // Fixed locale: metric names must not depend on the JVM default locale.
      String name = (actionType + "_" + t).toLowerCase(Locale.ENGLISH);
      MetricsTimeVaryingLong value = new MetricsTimeVaryingLong(name, registry);
      m.put(t, value);
    }
    return m;
  }

  /**
   * Push the current values of all metrics to the metrics record. Before
   * pushing, this submits the scheduler's own metrics, refreshes the run time
   * of any scheduling cycle still in progress, and refreshes the pending
   * calls count from the session notifier.
   * @param context The metrics context requesting the update (unused here).
   */
  @Override
  public void doUpdates(MetricsContext context) {
    // Get the fair scheduler metrics
    if (scheduler != null) {
      scheduler.submitMetrics(metricsRecord);
    }

    for (Map.Entry<ResourceType, Long> currStart :
        typeToSchedulerCurrentCycleStart.entrySet()) {
      long start = currStart.getValue();
      if (start > 0) {
        // This means that there's a scheduling cycle in progress.
        int currCycleRun = (int) (System.currentTimeMillis() - start);
        typeToSchedulerRunTime.get(currStart.getKey()).set(currCycleRun);
      }
    }

    // Get the number of pending calls. Guarded like the scheduler above so
    // that a missing notifier does not prevent the other metrics from being
    // pushed.
    SessionNotifier notifier = sessionNotifier;
    if (notifier != null) {
      setNumPendingCalls(notifier.getNumPendingCalls());
    }

    // Not synchronized on the ClusterManagerMetrics object.
    // The list of metrics in the registry is modified only in the constructor.
    // And pushMetric() is thread-safe.
    for (MetricsBase m : registry.getMetricsList()) {
      m.pushMetric(metricsRecord);
    }
    metricsRecord.update();
  }

  /**
   * Get the metrics context.
   * @return The metrics context obtained in the constructor.
   */
  public MetricsContext getContext() {
    return context;
  }
}