/** * Copyright 2017 LinkedIn Corp. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ package com.github.ambry.server; import com.codahale.metrics.MetricRegistry; import com.github.ambry.clustermap.PartitionId; import com.github.ambry.config.StatsManagerConfig; import com.github.ambry.store.StorageManager; import com.github.ambry.store.Store; import com.github.ambry.store.StoreException; import com.github.ambry.utils.Time; import com.github.ambry.utils.Utils; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; import org.codehaus.jackson.annotate.JsonAutoDetect; import org.codehaus.jackson.map.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * The stats manager is responsible for periodic aggregation of node level stats and expose/publish such stats to * potential consumers. */ class StatsManager { private static final Logger logger = LoggerFactory.getLogger(StatsManager.class); private final StorageManager storageManager; private final File statsOutputFile; private final long publishPeriodInSecs; private final int initialDelayInSecs; private final List<PartitionId> totalPartitionIds; private final StatsManagerMetrics metrics; private final Time time; private final ObjectMapper mapper = new ObjectMapper(); private ScheduledExecutorService scheduler = null; private StatsAggregator statsAggregator = null; /** * Constructs a {@link StatsManager}. * @param storageManager the {@link StorageManager} to be used to fetch the {@link Store}s * @param partitionIds a {@link List} of {@link PartitionId}s that are going to be fetched * @param registry the {@link MetricRegistry} to be used for {@link StatsManagerMetrics} * @param config the {@link StatsManagerConfig} to be used to configure the output file path and publish period * @param time the {@link Time} instance to be used for reporting * @throws IOException */ StatsManager(StorageManager storageManager, List<PartitionId> partitionIds, MetricRegistry registry, StatsManagerConfig config, Time time) throws IOException { this.storageManager = storageManager; totalPartitionIds = partitionIds; statsOutputFile = new File(config.outputFilePath); publishPeriodInSecs = config.publishPeriodInSecs; initialDelayInSecs = config.initialDelayUpperBoundInSecs; metrics = new StatsManagerMetrics(registry); mapper.setVisibilityChecker(mapper.getVisibilityChecker().withFieldVisibility(JsonAutoDetect.Visibility.ANY)); this.time = time; } /** * Start the stats manager by scheduling the periodic task that collect, aggregate and publish stats. */ void start() { scheduler = Utils.newScheduler(1, false); statsAggregator = new StatsAggregator(); int actualDelay = initialDelayInSecs > 0 ? ThreadLocalRandom.current().nextInt(initialDelayInSecs) : 0; logger.info("Scheduling stats aggregation job with an initial delay of {} secs", actualDelay); scheduler.scheduleAtFixedRate(statsAggregator, actualDelay, publishPeriodInSecs, TimeUnit.SECONDS); } /** * Stops the periodic task that is collecting, aggregating and publishing stats. */ void shutdown() throws InterruptedException { if (statsAggregator != null) { statsAggregator.cancel(); } if (scheduler != null) { scheduler.shutdown(); if (!scheduler.awaitTermination(30, TimeUnit.SECONDS)) { logger.error("Could not terminate aggregator tasks after StatsManager shutdown"); } } } /** * Publishes stats to a local file in JSON format. * @param statsWrapper the {@link StatsWrapper} to be published * @throws IOException */ void publish(StatsWrapper statsWrapper) throws IOException { File tempFile = new File(statsOutputFile.getAbsolutePath() + ".tmp"); if (tempFile.createNewFile()) { mapper.defaultPrettyPrintingWriter().writeValue(tempFile, statsWrapper); if (!tempFile.renameTo(statsOutputFile)) { throw new IOException( "Failed to rename " + tempFile.getAbsolutePath() + " to " + statsOutputFile.getAbsolutePath()); } } else { throw new IOException("Temporary file creation failed when publishing stats " + tempFile.getAbsolutePath()); } } /** * Fetch and aggregate stats from a given {@link Store} * @param aggregatedSnapshot the {@link StatsSnapshot} to hold the aggregated result * @param partitionId specifies the {@link Store} to be fetched from * @param unreachableStores a {@link List} containing partition Ids that were unable to successfully fetch from */ void collectAndAggregate(StatsSnapshot aggregatedSnapshot, PartitionId partitionId, List<String> unreachableStores) { Store store = storageManager.getStore(partitionId); if (store == null) { unreachableStores.add(partitionId.toString()); } else { try { long fetchAndAggregatePerStoreStartTimeMs = time.milliseconds(); StatsSnapshot statsSnapshot = store.getStoreStats().getStatsSnapshot(); aggregate(aggregatedSnapshot, statsSnapshot); metrics.fetchAndAggregateTimePerStoreMs.update(time.milliseconds() - fetchAndAggregatePerStoreStartTimeMs); } catch (StoreException e) { unreachableStores.add(partitionId.toString()); } } } /** * Performs recursive aggregation of two {@link StatsSnapshot} and stores the result in the first one. * @param baseSnapshot one of the addends and where the result will be * @param newSnapshot the other addend to be added into the first {@link StatsSnapshot} */ private void aggregate(StatsSnapshot baseSnapshot, StatsSnapshot newSnapshot) { baseSnapshot.setValue(baseSnapshot.getValue() + newSnapshot.getValue()); if (baseSnapshot.getSubMap() == null) { baseSnapshot.setSubMap(newSnapshot.getSubMap()); } else if (newSnapshot.getSubMap() != null) { for (Map.Entry<String, StatsSnapshot> entry : newSnapshot.getSubMap().entrySet()) { if (!baseSnapshot.getSubMap().containsKey(entry.getKey())) { baseSnapshot.getSubMap().put(entry.getKey(), new StatsSnapshot(0L, null)); } aggregate(baseSnapshot.getSubMap().get(entry.getKey()), entry.getValue()); } } } /** * Runnable class that collects, aggregate and publish stats via methods in StatsManager. */ private class StatsAggregator implements Runnable { private volatile boolean cancelled = false; @Override public void run() { logger.info("Aggregating stats"); try { long totalFetchAndAggregateStartTimeMs = time.milliseconds(); StatsSnapshot aggregatedSnapshot = new StatsSnapshot(0L, null); List<String> unreachableStores = new ArrayList<>(); Iterator<PartitionId> iterator = totalPartitionIds.iterator(); while (!cancelled && iterator.hasNext()) { PartitionId partitionId = iterator.next(); collectAndAggregate(aggregatedSnapshot, partitionId, unreachableStores); } if (!cancelled) { metrics.totalFetchAndAggregateTimeMs.update(time.milliseconds() - totalFetchAndAggregateStartTimeMs); StatsHeader statsHeader = new StatsHeader(StatsHeader.StatsDescription.QUOTA, time.milliseconds(), totalPartitionIds.size(), totalPartitionIds.size() - unreachableStores.size(), unreachableStores); publish(new StatsWrapper(statsHeader, aggregatedSnapshot)); logger.info("Stats snapshot published to {}", statsOutputFile.getAbsolutePath()); } } catch (Exception | Error e) { metrics.statsAggregationFailureCount.inc(); logger.error("Exception while aggregating stats. Stats output file path - {}", statsOutputFile.getAbsolutePath(), e); } } void cancel() { cancelled = true; } } }