package ru.yandex.market.graphouse.cacher;

import com.google.common.base.Stopwatch;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.springframework.beans.factory.InitializingBean;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.jdbc.core.StatementCallback;
import ru.yandex.clickhouse.ClickHouseStatementImpl;
import ru.yandex.market.graphouse.Metric;
import ru.yandex.market.graphouse.monitoring.Monitoring;
import ru.yandex.market.graphouse.monitoring.MonitoringUnit;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

/**
 * Buffers incoming metrics in a bounded in-memory queue and flushes them to a
 * ClickHouse table in batches.
 * <p>
 * Batches are cut when the queue reaches {@code maxBatchSize}, or when
 * {@code maxBatchTimeSeconds} have passed since the last batch, or when at
 * least {@code minBatchSize} metrics have accumulated for at least
 * {@code minBatchTimeSeconds}. Batches are written by up to
 * {@code maxOutputThreads} concurrent workers; a failed insert is retried
 * forever with a 1-second back-off.
 * <p>
 * Back-pressure: a {@link Semaphore} with {@code queueSize} permits bounds the
 * total number of metrics that are either queued or being written. A permit is
 * acquired in {@link #submitMetric}/{@link #submitMetrics} and released only
 * after the metric has been successfully saved, so producers block when the
 * pipeline is full.
 * <p>
 * A JVM shutdown hook drains and saves everything still in flight before the
 * process exits.
 *
 * @author Dmitry Andreev <a href="mailto:AndreevDm@yandex-team.ru"></a>
 * @date 06/04/15
 */
public class MetricCacher implements Runnable, InitializingBean {

    private static final Logger log = LogManager.getLogger();

    private final JdbcTemplate clickHouseJdbcTemplate;
    private final Monitoring monitoring;

    @Value("${graphouse.clickhouse.data-table}")
    private String graphiteTable;

    @Value("${graphouse.cacher.queue-size}")
    private int queueSize = 1_000_000;

    @Value("${graphouse.cacher.min-batch-size}")
    private int minBatchSize = 10_000;

    @Value("${graphouse.cacher.max-batch-size}")
    private int maxBatchSize = 500_000;

    @Value("${graphouse.cacher.min-batch-time-seconds}")
    private int minBatchTimeSeconds = 1;

    @Value("${graphouse.cacher.max-batch-time-seconds}")
    private int maxBatchTimeSeconds = 5;

    @Value("${graphouse.cacher.max-output-threads}")
    private int maxOutputThreads = 2;

    /** Timestamp of the most recently created batch; drives the time-based batch triggers. */
    private final AtomicLong lastBatchTimeMillis = new AtomicLong(System.currentTimeMillis());

    /** Monitoring unit reporting queue occupancy (name kept for dashboard compatibility). */
    private final MonitoringUnit metricCacherQueueUnit = new MonitoringUnit("MetricCacherQueue", 2, TimeUnit.MINUTES);

    /** Permits = free capacity across queue + in-flight batches; released after a successful save. */
    private final Semaphore semaphore = new Semaphore(0, false);

    private BlockingQueue<Metric> metricQueue;

    /** Number of ClickhouseWriterWorker tasks currently submitted and not yet finished. */
    private final AtomicInteger activeWriters = new AtomicInteger(0);
    /** Number of metrics handed to workers but not yet saved. */
    private final AtomicInteger activeOutputMetrics = new AtomicInteger(0);

    private ExecutorService executorService;

    public MetricCacher(JdbcTemplate clickHouseJdbcTemplate, Monitoring monitoring) {
        this.clickHouseJdbcTemplate = clickHouseJdbcTemplate;
        this.monitoring = monitoring;
    }

    @Override
    public void afterPropertiesSet() throws Exception {
        metricQueue = new ArrayBlockingQueue<>(queueSize);
        semaphore.release(queueSize);
        executorService = Executors.newFixedThreadPool(maxOutputThreads);
        monitoring.addUnit(metricCacherQueueUnit);
        new Thread(this, "Metric cacher thread").start();

        // Drain and persist everything still queued or in flight before the JVM exits.
        Runtime.getRuntime().addShutdownHook(new Thread(() -> {
            log.info("Shutting down metric cacher. Saving all cached metrics...");
            while (!(metricQueue.isEmpty() && (activeOutputMetrics.get() == 0))) {
                log.info((metricQueue.size() + activeOutputMetrics.get()) + " metrics remaining");
                createBatches(true);
                try {
                    Thread.sleep(100);
                } catch (InterruptedException ignored) {
                    // Deliberately ignored: the shutdown hook must keep draining regardless.
                }
            }
            executorService.shutdown();
            while (!executorService.isTerminated()) {
                log.info("Awaiting save completion");
                try {
                    executorService.awaitTermination(100, TimeUnit.MILLISECONDS);
                } catch (InterruptedException ignored) {
                    // Deliberately ignored: keep waiting for in-flight saves to finish.
                }
            }
            log.info("Metric cacher stopped");
        }));
    }

    /**
     * Enqueues a single metric, blocking while the pipeline is at capacity.
     *
     * @throws RuntimeException wrapping {@link InterruptedException} if the
     *                          calling thread is interrupted while blocked;
     *                          the interrupt status is restored before throwing.
     */
    public void submitMetric(Metric metric) {
        try {
            semaphore.acquire();
            metricQueue.put(metric);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve interrupt status for callers
            throw new RuntimeException(e);
        }
    }

    /**
     * Enqueues a list of metrics, blocking until capacity for all of them is
     * available. The acquired permits guarantee {@code addAll} cannot overflow
     * the bounded queue.
     *
     * @throws RuntimeException wrapping {@link InterruptedException} if the
     *                          calling thread is interrupted while blocked;
     *                          the interrupt status is restored before throwing.
     */
    public void submitMetrics(List<Metric> metrics) {
        if (metrics.isEmpty()) {
            return;
        }
        try {
            semaphore.acquire(metrics.size());
            metricQueue.addAll(metrics);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt(); // preserve interrupt status for callers
            throw new RuntimeException(e);
        }
    }

    /**
     * Main cacher loop: polls roughly every millisecond and cuts batches when
     * {@link #needBatch()} fires. Exits when the thread is interrupted.
     */
    @Override
    public void run() {
        // isInterrupted() does not clear the flag, and the catch below restores it,
        // so an interrupt reliably terminates this loop (the original swallowed it).
        while (!Thread.currentThread().isInterrupted()) {
            try {
                Thread.sleep(1);
                createBatches(false);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // loop condition will now exit
            }
        }
    }

    /**
     * Returns {@code true} when a batch should be cut: the queue reached
     * {@code maxBatchSize}, or {@code maxBatchTimeSeconds} elapsed, or both the
     * {@code minBatchSize} and {@code minBatchTimeSeconds} thresholds are met.
     */
    private boolean needBatch() {
        if (metricQueue.size() >= maxBatchSize) {
            return true;
        }
        long secondsPassed = TimeUnit.MILLISECONDS.toSeconds(System.currentTimeMillis() - lastBatchTimeMillis.get());
        if (secondsPassed >= maxBatchTimeSeconds) {
            return true;
        }
        if (metricQueue.size() >= minBatchSize && secondsPassed >= minBatchTimeSeconds) {
            return true;
        }
        return false;
    }

    /**
     * Drains the queue into batches and submits a writer worker per batch,
     * bounded by {@code maxOutputThreads}.
     *
     * @param force cut batches regardless of {@link #needBatch()} (used by the
     *              shutdown hook to flush everything)
     */
    private void createBatches(boolean force) {
        if (metricQueue.isEmpty()) {
            metricCacherQueueUnit.ok();
            return;
        }
        // Outstanding = queued + in-flight metrics; permits are only returned after a save.
        int outstandingMetrics = this.queueSize - semaphore.availablePermits();
        double queueOccupancyPercent = outstandingMetrics * 100.0 / this.queueSize;
        queueSizeMonitoring(queueOccupancyPercent);
        int createdBatches = 0;
        int metricsInBatches = 0;
        Stopwatch stopwatch = Stopwatch.createStarted();
        while ((needBatch() || force) && activeWriters.get() < maxOutputThreads) {
            List<Metric> metrics = createBatch();
            if (metrics.isEmpty()) {
                continue;
            }
            createdBatches++;
            metricsInBatches += metrics.size();
            executorService.submit(new ClickhouseWriterWorker(metrics));
            activeWriters.incrementAndGet();
            activeOutputMetrics.addAndGet(metrics.size());
            lastBatchTimeMillis.set(System.currentTimeMillis());
        }
        stopwatch.stop();
        if (createdBatches > 0) {
            log.info(
                "Created " + createdBatches + " output worker(s) (" + activeWriters.get() + " total) " +
                    "for " + metricsInBatches + " metrics (" + activeOutputMetrics.get() + " total in processing) " +
                    "in " + stopwatch.toString() + ". " +
                    "Metric queue size: " + outstandingMetrics + " (" + queueOccupancyPercent + "%)");
        }
    }

    /** Translates queue occupancy into monitoring status: >=95% critical, >=80% warning. */
    private void queueSizeMonitoring(double queueOccupancyPercent) {
        if (queueOccupancyPercent >= 95) {
            metricCacherQueueUnit.critical("Metric queue size >= 95%");
        } else if (queueOccupancyPercent >= 80) {
            metricCacherQueueUnit.warning("Metric queue size >= 80%");
        } else {
            metricCacherQueueUnit.ok();
        }
    }

    /** Drains up to {@code maxBatchSize} metrics from the queue into a new list. */
    private List<Metric> createBatch() {
        int batchSize = Math.min(maxBatchSize, metricQueue.size());
        List<Metric> metrics = new ArrayList<>(batchSize);
        metricQueue.drainTo(metrics, batchSize);
        return metrics;
    }

    /**
     * Writes one batch to ClickHouse, retrying forever with a 1-second back-off.
     * Releases the batch's semaphore permits only after a successful save so
     * back-pressure reflects unsaved data.
     */
    private class ClickhouseWriterWorker implements Runnable {

        private final List<Metric> metrics;

        public ClickhouseWriterWorker(List<Metric> metrics) {
            this.metrics = metrics;
        }

        @Override
        public void run() {
            // An interrupt during the back-off sleep must not abandon the batch
            // (metrics would be lost), so it is remembered and restored afterwards.
            boolean interrupted = false;
            boolean ok = false;
            while (!ok) {
                try {
                    long start = System.currentTimeMillis();
                    saveMetrics();
                    long processed = System.currentTimeMillis() - start;
                    log.info("Saved " + metrics.size() + " metrics in " + processed + "ms");
                    semaphore.release(metrics.size());
                    ok = true;
                } catch (Exception e) {
                    log.error("Failed to save metrics. Waiting 1 second before retry", e);
                    monitoring.addTemporaryCritical(
                        "MetricCacherClickhouseOutput", e.getMessage(), 1, TimeUnit.MINUTES
                    );
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException ie) {
                        interrupted = true; // restored after the batch is saved
                    }
                }
            }
            activeOutputMetrics.addAndGet(-metrics.size());
            activeWriters.decrementAndGet();
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }

        /**
         * Streams the batch to ClickHouse in RowBinary format via the native
         * {@link ClickHouseStatementImpl#sendStream} API.
         *
         * @throws IOException on serialization failure
         */
        private void saveMetrics() throws IOException {
            MetricRowBinaryHttpEntity httpEntity = new MetricRowBinaryHttpEntity(metrics);
            clickHouseJdbcTemplate.execute(
                (StatementCallback<Void>) stmt -> {
                    ClickHouseStatementImpl statement = (ClickHouseStatementImpl) stmt;
                    statement.sendStream(
                        httpEntity,
                        "INSERT INTO " + graphiteTable + " (metric, value, timestamp, date, updated)",
                        "RowBinary"
                    );
                    return null;
                }
            );
        }
    }
}