/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hdfs.server.datanode.metrics; import com.google.common.annotations.VisibleForTesting; import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceStability; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hdfs.DFSConfigKeys; import org.apache.hadoop.metrics2.MetricsJsonBuilder; import org.apache.hadoop.metrics2.lib.RollingAverages; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Map; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; /** * This class maintains DataNode peer metrics (e.g. numOps, AvgTime, etc.) for * various peer operations. */ @InterfaceAudience.Private @InterfaceStability.Unstable public class DataNodePeerMetrics { public static final Logger LOG = LoggerFactory.getLogger( DataNodePeerMetrics.class); private final RollingAverages sendPacketDownstreamRollingAvgerages; private final String name; /** * Threshold in milliseconds below which a DataNode is definitely not slow. */ private static final long LOW_THRESHOLD_MS = 5; private static final long MIN_OUTLIER_DETECTION_NODES = 10; private final OutlierDetector slowNodeDetector; /** * Minimum number of packet send samples which are required to qualify * for outlier detection. If the number of samples is below this then * outlier detection is skipped. */ @VisibleForTesting static final long MIN_OUTLIER_DETECTION_SAMPLES = 1000; public DataNodePeerMetrics( final String name, final long windowSizeMs, final int numWindows) { this.name = name; this.slowNodeDetector = new OutlierDetector(MIN_OUTLIER_DETECTION_NODES, LOW_THRESHOLD_MS); sendPacketDownstreamRollingAvgerages = new RollingAverages( windowSizeMs, numWindows); } public String name() { return name; } /** * Creates an instance of DataNodePeerMetrics, used for registration. */ public static DataNodePeerMetrics create(Configuration conf, String dnName) { final String name = "DataNodePeerActivity-" + (dnName.isEmpty() ? "UndefinedDataNodeName" + ThreadLocalRandom.current().nextInt() : dnName.replace(':', '-')); final long windowSizeMs = conf.getTimeDuration( DFSConfigKeys.DFS_METRICS_ROLLING_AVERAGES_WINDOW_LENGTH_KEY, DFSConfigKeys.DFS_METRICS_ROLLING_AVERAGES_WINDOW_LENGTH_DEFAULT, TimeUnit.MILLISECONDS); final int numWindows = conf.getInt( DFSConfigKeys.DFS_METRICS_ROLLING_AVERAGE_NUM_WINDOWS_KEY, DFSConfigKeys.DFS_METRICS_ROLLING_AVERAGE_NUM_WINDOWS_DEFAULT); return new DataNodePeerMetrics( name, windowSizeMs, numWindows); } /** * Adds invocation and elapsed time of SendPacketDownstream for peer. * <p> * The caller should pass in a well-formatted peerAddr. e.g. * "[192.168.1.110:1010]" is good. This will be translated into a full * qualified metric name, e.g. "[192.168.1.110:1010]AvgTime". * </p> */ public void addSendPacketDownstream( final String peerAddr, final long elapsedMs) { sendPacketDownstreamRollingAvgerages.add(peerAddr, elapsedMs); } /** * Dump SendPacketDownstreamRollingAvgTime metrics as JSON. */ public String dumpSendPacketDownstreamAvgInfoAsJson() { final MetricsJsonBuilder builder = new MetricsJsonBuilder(null); sendPacketDownstreamRollingAvgerages.snapshot(builder, true); return builder.toString(); } /** * Collects states maintained in {@link ThreadLocal}, if any. */ public void collectThreadLocalStates() { sendPacketDownstreamRollingAvgerages.collectThreadLocalStates(); } /** * Retrieve the set of dataNodes that look significantly slower * than their peers. */ public Map<String, Double> getOutliers() { // This maps the metric name to the aggregate latency. // The metric name is the datanode ID. final Map<String, Double> stats = sendPacketDownstreamRollingAvgerages.getStats( MIN_OUTLIER_DETECTION_SAMPLES); LOG.trace("DataNodePeerMetrics: Got stats: {}", stats); return slowNodeDetector.getOutliers(stats); } }