// Copyright 2016 Twitter. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package com.twitter.heron.metricsmgr.sink.metricscache;
import java.io.IOException;
import java.time.Duration;
import java.time.temporal.ChronoUnit;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ThreadFactory;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import com.google.common.annotations.VisibleForTesting;
import com.twitter.heron.common.basics.Communicator;
import com.twitter.heron.common.basics.NIOLooper;
import com.twitter.heron.common.basics.SingletonRegistry;
import com.twitter.heron.common.basics.SysUtils;
import com.twitter.heron.common.basics.TypeUtils;
import com.twitter.heron.common.network.HeronSocketOptions;
import com.twitter.heron.metricsmgr.MetricsManagerServer;
import com.twitter.heron.proto.tmaster.TopologyMaster;
import com.twitter.heron.spi.metricsmgr.metrics.ExceptionInfo;
import com.twitter.heron.spi.metricsmgr.metrics.MetricsFilter;
import com.twitter.heron.spi.metricsmgr.metrics.MetricsInfo;
import com.twitter.heron.spi.metricsmgr.metrics.MetricsRecord;
import com.twitter.heron.spi.metricsmgr.sink.IMetricsSink;
import com.twitter.heron.spi.metricsmgr.sink.SinkContext;
/**
* An IMetricsSink sends Metrics to MetricsCache.
* 1. It gets the MetricsCacheLocation
* <p>
* 2. Then it constructs a long-lived service running metricsCacheClient, which can automatically
* recover from uncaught exceptions, i.e. close the old one and start a new one.
* Also, it provides an API to update the MetricsCacheLocation that metricsCacheClient needs to
* connect to, and to restart the metricsCacheClient.
* There are two scenarios we need to restart a metricsCacheClient in our case:
* <p>
* -- Uncaught exceptions happen within metricsCacheClient; then we would restart metricsCacheClient inside
* the same ExecutorService inside the UncaughtExceptionHandlers.
* Notice that, in Java, an exception thrown inside an UncaughtExceptionHandler does not invoke
* the UncaughtExceptionHandler again; instead, it kills the thread.
* So if an exception is thrown while restarting the metricsCacheClient, this MetricsCacheSink would die, and
* external logic would take care of it.
* <p>
* -- MetricsCacheLocation changes (though in fact, metricsCacheClient might also throw exceptions in this case),
* in this case, we would invoke MetricsCacheService to start from tMasterLocationStarter's thread.
* But the MetricsCacheService and metricsCacheClient still start within the thread they run in.
* <p>
* 3. When a new MetricsRecord comes by invoking processRecord, it would push the MetricsRecord
* to the Communicator Queue to metricsCacheClient
* <p>
* Notice that we would not send all metrics to MetricsCache; we would use MetricsFilter to figure out
* needed metrics.
*/
public class MetricsCacheSink implements IMetricsSink {
private static final Logger LOG = Logger.getLogger(MetricsCacheSink.class.getName());
// These configs would be read from metrics-sink-configs.yaml
private static final String KEY_TMASTER_LOCATION_CHECK_INTERVAL_SEC =
"metricscache-location-check-interval-sec";
private static final String KEY_TMASTER = "metricscache-client";
private static final String KEY_TMASTER_RECONNECT_INTERVAL_SEC = "reconnect-interval-second";
private static final String KEY_NETWORK_WRITE_BATCH_SIZE_BYTES = "network-write-batch-size-bytes";
private static final String KEY_NETWORK_WRITE_BATCH_TIME_MS = "network-write-batch-time-ms";
private static final String KEY_NETWORK_READ_BATCH_SIZE_BYTES = "network-read-batch-size-bytes";
private static final String KEY_NETWORK_READ_BATCH_TIME_MS = "network-read-batch-time-ms";
private static final String KEY_SOCKET_SEND_BUFFER_BYTES = "socket-send-buffer-size-bytes";
private static final String KEY_SOCKET_RECEIVED_BUFFER_BYTES =
"socket-received-buffer-size-bytes";
private static final String KEY_TMASTER_METRICS_TYPE = "metricscache-metrics-type";
// Bean name to fetch the MetricsCacheLocation object from SingletonRegistry
// private static final String TMASTER_LOCATION_BEAN_NAME =
// TopologyMaster.MetricsCacheLocation.newBuilder().getDescriptorForType().getFullName();
// Metrics Counter Name
private static final String METRICS_COUNT = "metrics-count";
private static final String EXCEPTIONS_COUNT = "exceptions-count";
private static final String RECORD_PROCESS_COUNT = "record-process-count";
private static final String TMASTER_RESTART_COUNT = "tmaster-restart-count";
private static final String TMASTER_LOCATION_UPDATE_COUNT = "tmaster-location-update-count";
private final Communicator<TopologyMaster.PublishMetrics> metricsCommunicator =
new Communicator<>();
private final MetricsFilter tMasterMetricsFilter = new MetricsFilter();
private final Map<String, Object> sinkConfig = new HashMap<>();
// A scheduled executor service to check whether the MetricsCacheLocation has changed
// If so, restart the metricsCacheClientService with the new MetricsCacheLocation
// Start of metricsCacheClientService will also be in this thread
private final ScheduledExecutorService tMasterLocationStarter =
Executors.newSingleThreadScheduledExecutor();
private MetricsCacheClientService metricsCacheClientService;
// We need to cache it locally to check whether the MetricsCacheLocation is changed
// This field is changed only in ScheduledExecutorService's thread,
// so no need to make it volatile
private TopologyMaster.MetricsCacheLocation currentMetricsCacheLocation = null;
private SinkContext sinkContext;
@Override
@SuppressWarnings("unchecked")
public void init(Map<String, Object> conf, SinkContext context) {
LOG.info("metricscache sink init");
sinkConfig.putAll(conf);
sinkContext = context;
// Fill the tMasterMetricsFilter according to metrics-sink-configs.yaml
Map<String, String> tmasterMetricsType =
(Map<String, String>) sinkConfig.get(KEY_TMASTER_METRICS_TYPE);
if (tmasterMetricsType != null) {
for (Map.Entry<String, String> metricToType : tmasterMetricsType.entrySet()) {
String value = metricToType.getValue();
MetricsFilter.MetricAggregationType type =
MetricsFilter.MetricAggregationType.valueOf(value);
tMasterMetricsFilter.setPrefixToType(metricToType.getKey(), type);
}
}
// Construct the long-live metricsCacheClientService
metricsCacheClientService =
new MetricsCacheClientService((Map<String, Object>)
sinkConfig.get(KEY_TMASTER), metricsCommunicator);
// Start the tMasterLocationStarter
startMetricsCacheChecker();
}
// Start the MetricsCacheCheck, which would check whether the MetricsCacheLocation is changed
// at an interval.
// If so, restart the metricsCacheClientService with the new MetricsCacheLocation
private void startMetricsCacheChecker() {
final int checkIntervalSec =
TypeUtils.getInteger(sinkConfig.get(KEY_TMASTER_LOCATION_CHECK_INTERVAL_SEC));
Runnable runnable = new Runnable() {
@Override
public void run() {
TopologyMaster.MetricsCacheLocation location =
(TopologyMaster.MetricsCacheLocation) SingletonRegistry.INSTANCE.getSingleton(
MetricsManagerServer.METRICSCACHE_LOCATION_BEAN_NAME);
if (location != null) {
if (currentMetricsCacheLocation == null
|| !location.equals(currentMetricsCacheLocation)) {
LOG.info("Update current MetricsCacheLocation to: " + location);
currentMetricsCacheLocation = location;
metricsCacheClientService.updateMetricsCacheLocation(currentMetricsCacheLocation);
metricsCacheClientService.startNewMasterClient();
// Update Metrics
sinkContext.exportCountMetric(TMASTER_LOCATION_UPDATE_COUNT, 1);
}
}
// Schedule itself in future
tMasterLocationStarter.schedule(this, checkIntervalSec, TimeUnit.SECONDS);
}
};
// First Entry
tMasterLocationStarter.schedule(runnable, checkIntervalSec, TimeUnit.SECONDS);
}
@Override
public void processRecord(MetricsRecord record) {
LOG.info("metricscache sink processRecord");
// Format it into TopologyMaster.PublishMetrics
// The format of source is "host:port/componentName/instanceId"
// So source.split("/") would be an array with 3 elements:
// ["host:port", componentName, instanceId]
String[] sources = record.getSource().split("/");
String hostPort = sources[0];
String componentName = sources[1];
String instanceId = sources[2];
TopologyMaster.PublishMetrics.Builder publishMetrics =
TopologyMaster.PublishMetrics.newBuilder();
for (MetricsInfo metricsInfo : tMasterMetricsFilter.filter(record.getMetrics())) {
// We would filter out unneeded metrics
TopologyMaster.MetricDatum metricDatum = TopologyMaster.MetricDatum.newBuilder().
setComponentName(componentName).setInstanceId(instanceId).setName(metricsInfo.getName()).
setValue(metricsInfo.getValue()).setTimestamp(record.getTimestamp()).build();
publishMetrics.addMetrics(metricDatum);
}
for (ExceptionInfo exceptionInfo : record.getExceptions()) {
TopologyMaster.TmasterExceptionLog exceptionLog =
TopologyMaster.TmasterExceptionLog.newBuilder()
.setComponentName(componentName)
.setHostname(hostPort)
.setInstanceId(instanceId)
.setStacktrace(exceptionInfo.getStackTrace())
.setLasttime(exceptionInfo.getLastTime())
.setFirsttime(exceptionInfo.getFirstTime())
.setCount(exceptionInfo.getCount())
.setLogging(exceptionInfo.getLogging()).build();
publishMetrics.addExceptions(exceptionLog);
}
metricsCommunicator.offer(publishMetrics.build());
// Update metrics
sinkContext.exportCountMetric(RECORD_PROCESS_COUNT, 1);
sinkContext.exportCountMetric(METRICS_COUNT, publishMetrics.getMetricsCount());
sinkContext.exportCountMetric(EXCEPTIONS_COUNT, publishMetrics.getExceptionsCount());
}
@Override
public void flush() {
// We do nothing here but update metrics
sinkContext.exportCountMetric(TMASTER_RESTART_COUNT,
metricsCacheClientService.startedAttempts.longValue());
}
@Override
public void close() {
metricsCacheClientService.close();
metricsCommunicator.clear();
}
@VisibleForTesting
MetricsCacheClientService getMetricsCacheClientService() {
return metricsCacheClientService;
}
@VisibleForTesting
void createSimpleMetricsCacheClientService(Map<String, Object> serviceConfig) {
metricsCacheClientService =
new MetricsCacheClientService(serviceConfig, metricsCommunicator);
}
@VisibleForTesting
MetricsCacheClient getMetricsCacheClient() {
return metricsCacheClientService.getMetricsCacheClient();
}
@VisibleForTesting
void startNewMetricsCacheClient(TopologyMaster.MetricsCacheLocation location) {
metricsCacheClientService.updateMetricsCacheLocation(location);
metricsCacheClientService.startNewMasterClient();
}
@VisibleForTesting
int getMetricsCacheStartedAttempts() {
return metricsCacheClientService.startedAttempts.get();
}
@VisibleForTesting
TopologyMaster.MetricsCacheLocation getCurrentMetricsCacheLocation() {
return currentMetricsCacheLocation;
}
@VisibleForTesting
TopologyMaster.MetricsCacheLocation getCurrentMetricsCacheLocationInService() {
return metricsCacheClientService.getCurrentMetricsCacheLocation();
}
/**
* A long-live Service running metricsCacheClient
* It would automatically restart the metricsCacheClient connecting and communicating to the latest
* MetricsCacheLocation if any uncaught exceptions throw.
* <p>
* It provides startNewMasterClient(TopologyMaster.MetricsCacheLocation location), which would also
* update the currentMetricsCacheLocation to the lastest location.
* <p>
* So a new metricsCacheClient would start in two cases:
* 1. The old one threw exceptions and died.
* 2. startNewMasterClient() is invoked externally with MetricsCacheLocation.
*/
private static final class MetricsCacheClientService {
private final AtomicInteger startedAttempts = new AtomicInteger(0);
private final Map<String, Object> metricsCacheClientConfig;
private final Communicator<TopologyMaster.PublishMetrics> metricsCommunicator;
private final ExecutorService metricsCacheClientExecutor =
Executors.newSingleThreadExecutor(new MetricsCacheClientThreadFactory());
private volatile MetricsCacheClient metricsCacheClient;
// We need to cache MetricsCacheLocation for failover case
// This value is set in ScheduledExecutorService' thread while
// it is used in metricsCacheClientService thread,
// so we need to make it volatile to guarantee the visiability.
private volatile TopologyMaster.MetricsCacheLocation currentMetricsCacheLocation;
private MetricsCacheClientService(
Map<String, Object> metricsCacheClientConfig,
Communicator<TopologyMaster.PublishMetrics> metricsCommunicator) {
this.metricsCacheClientConfig = metricsCacheClientConfig;
this.metricsCommunicator = metricsCommunicator;
}
// Update the MetricsCacheLocation to connect within the metricsCacheClient
// This method is thread-safe, since
// currentMetricsCacheLocation is volatile and we just replace it.
// In our scenario, it is only invoked when MetricsCacheLocation is changed,
// i.e. this method is only invoked in scheduled executor thread.
public void updateMetricsCacheLocation(TopologyMaster.MetricsCacheLocation location) {
currentMetricsCacheLocation = location;
}
// This method could be invoked by different threads
// Make it synchronized to guarantee thread-safe
public synchronized void startNewMasterClient() {
// Exit any running metricsCacheClient if there is any to release
// the thread in metricsCacheClientExecutor
if (metricsCacheClient != null) {
metricsCacheClient.stop();
metricsCacheClient.getNIOLooper().exitLoop();
}
// Construct the new metricsCacheClient
final NIOLooper looper;
try {
looper = new NIOLooper();
} catch (IOException e) {
throw new RuntimeException("Could not create the NIOLooper", e);
}
HeronSocketOptions socketOptions =
new HeronSocketOptions(
TypeUtils.getByteAmount(
metricsCacheClientConfig.get(KEY_NETWORK_WRITE_BATCH_SIZE_BYTES)),
TypeUtils.getDuration(
metricsCacheClientConfig.get(KEY_NETWORK_WRITE_BATCH_TIME_MS), ChronoUnit.MILLIS),
TypeUtils.getByteAmount(
metricsCacheClientConfig.get(KEY_NETWORK_READ_BATCH_SIZE_BYTES)),
TypeUtils.getDuration(
metricsCacheClientConfig.get(KEY_NETWORK_READ_BATCH_TIME_MS), ChronoUnit.MILLIS),
TypeUtils.getByteAmount(
metricsCacheClientConfig.get(KEY_SOCKET_SEND_BUFFER_BYTES)),
TypeUtils.getByteAmount(
metricsCacheClientConfig.get(KEY_SOCKET_RECEIVED_BUFFER_BYTES)));
// Reset the Consumer
metricsCommunicator.setConsumer(looper);
metricsCacheClient = new MetricsCacheClient(looper,
currentMetricsCacheLocation.getHost(),
currentMetricsCacheLocation.getMasterPort(),
socketOptions, metricsCommunicator,
TypeUtils.getDuration(
metricsCacheClientConfig.get(KEY_TMASTER_RECONNECT_INTERVAL_SEC),
ChronoUnit.SECONDS));
LOG.severe(String.format("Starting metricsCacheClient for the %d time.",
startedAttempts.incrementAndGet()));
metricsCacheClientExecutor.execute(metricsCacheClient);
}
// This method could be invoked by different threads
// Make it synchronized to guarantee thread-safe
public synchronized void close() {
metricsCacheClient.getNIOLooper().exitLoop();
metricsCacheClientExecutor.shutdownNow();
}
@VisibleForTesting
MetricsCacheClient getMetricsCacheClient() {
return metricsCacheClient;
}
@VisibleForTesting
int getMetricsCacheStartedAttempts() {
return startedAttempts.get();
}
@VisibleForTesting
TopologyMaster.MetricsCacheLocation getCurrentMetricsCacheLocation() {
return currentMetricsCacheLocation;
}
// An UncaughtExceptionHandler, which would restart MetricsCacheLocation with
// current MetricsCacheLocation.
private class MetricsCacheClientThreadFactory implements ThreadFactory {
@Override
public Thread newThread(Runnable r) {
final Thread thread = new Thread(r);
thread.setUncaughtExceptionHandler(new MetricsCacheClientExceptionHandler());
return thread;
}
private class MetricsCacheClientExceptionHandler implements Thread.UncaughtExceptionHandler {
@Override
public void uncaughtException(Thread t, Throwable e) {
LOG.log(Level.SEVERE, "metricsCacheClient dies in thread: " + t, e);
Duration reconnectInterval = TypeUtils.getDuration(
metricsCacheClientConfig.get(KEY_TMASTER_RECONNECT_INTERVAL_SEC), ChronoUnit.SECONDS);
SysUtils.sleep(reconnectInterval);
LOG.info("Restarting metricsCacheClient");
// We would use the MetricsCacheLocation in cache, since
// the new metricsCacheClient is started due to exception thrown,
// rather than MetricsCacheLocation changes
startNewMasterClient();
}
}
}
}
}