/** * Copyright 2015-2016 The OpenZipkin Authors * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the License * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express * or implied. See the License for the specific language governing permissions and limitations under * the License. */ package zipkin.collector.zookeeper; import com.google.common.io.Closer; import java.io.Closeable; import java.io.IOException; import java.util.UUID; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicReference; import org.apache.curator.framework.CuratorFramework; import org.apache.curator.framework.imps.CuratorFrameworkState; import org.apache.curator.framework.recipes.cache.NodeCache; import org.apache.curator.framework.recipes.nodes.GroupMember; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import zipkin.Span; import zipkin.collector.CollectorSampler; import static com.google.common.base.Preconditions.checkState; import static zipkin.internal.Util.UTF_8; import static zipkin.internal.Util.checkArgument; import static zipkin.internal.Util.checkNotNull; /** * This is an adaptive sampler which can help prevent a surge in traffic from overwhelming the * zipkin storage layer. It works by coordinating a sample rate based on multiple instances vs a * target storage rate in spans/minute. * * <p>This assumes that each instance is storing every span it {@link #isSampled(Span) samples}, and * that the store rate is a useful metric (ex spans have relatively the same size and depth. * * <p>If the storage layer is capable of 10k spans/minute, you'd set the target rate in ZooKeeper to * 10000. With this in mind, 10 balanced collectors writing 10k spans/minute would eventually see a * sample rate of 0.10, slowing them down to match what the storage is capable of. * * <h3>Implementation notes</h3> * * <p>This object spawns a single scheduling thread that reports its rate of {@link #isSampled(Span) * spans sampled}, per the {@link Builder#updateFrequency(int) update frequency}. * * <p>When a leader, this object summarizes recent sample rates and compares them against a target. * * <p>Algorithms and defaults are tuned to favor decreasing the sample rate vs increasing it. For * example, a surge in writes will fire a rate adjustment faster than a drop in writes. */ public final class ZooKeeperCollectorSampler extends CollectorSampler implements Closeable { final static Logger log = LoggerFactory.getLogger(ZooKeeperCollectorSampler.class); public static Builder builder() { return new Builder(); } public static final class Builder { float initialRate = 1.0f; String basePath = "/zipkin/sampler"; String id = UUID.randomUUID().toString(); int updateFrequency = 30; int windowSize = 30 * 60; int sufficientWindowSize = 10 * 60; int outlierThreshold = 5 * 60; /** Rate used until an adaptive one calculated. 0.0001 means 0.01% of traces. Defaults to 1.0 */ public Builder initialRate(float rate) { checkArgument(rate >= 0 && rate <= 1, "rate should be between 0 and 1: was %s", rate); this.initialRate = rate; return this; } /** * Stable name to use for this node in ZooKeeper groups and elections, ex. "cluster@host:port". * Defaults to a UUID. */ public Builder id(String id) { this.id = checkNotNull(id, "id"); return this; } /** Base path in ZooKeeper for the sampler to use. Defaults to "zipkin" */ public Builder basePath(String basePath) { this.basePath = checkNotNull(basePath, "basePath"); return this; } /** Frequency in seconds which to update the store and sample rate. Defaults to 30 */ public Builder updateFrequency(int updateFrequency) { checkArgument(updateFrequency >= 1, "updateFrequency must be at least 1 second"); this.updateFrequency = updateFrequency; return this; } /** Seconds of request rate data to base sample rate on. Defaults to 1800 (30 minutes) */ public Builder windowSize(int windowSize) { this.windowSize = windowSize; return this; } /** * Seconds of request rate data to gather before calculating sample rate. Defaults to 600 (10 * minutes) */ public Builder sufficientWindowSize(int sufficientWindowSize) { this.sufficientWindowSize = sufficientWindowSize; return this; } /** Seconds to see outliers before updating sample rate. Defaults to 300 (5 minutes) */ public Builder outlierThreshold(int outlierThreshold) { this.outlierThreshold = outlierThreshold; return this; } /** * @param client must be started, and will not be closed on {@link #close()} */ public ZooKeeperCollectorSampler build(CuratorFramework client) { checkState(checkNotNull(client, "client").getState() == CuratorFrameworkState.STARTED, "%s is not started", client.getState()); return new ZooKeeperCollectorSampler(this, client); } Builder() { } } final String groupMember; final AtomicLong boundary; final AtomicInteger spanCount; final AtomicInteger storeRate; final Closer closer = Closer.create(); ZooKeeperCollectorSampler(Builder builder, CuratorFramework client) { groupMember = builder.id; boundary = new AtomicLong((long) (Long.MAX_VALUE * builder.initialRate)); // safe cast as less <= 1 spanCount = new AtomicInteger(0); storeRate = new AtomicInteger(); GroupMember storeRateMember = storeRateGroup(client, builder, closer, spanCount, storeRate); AtomicInteger targetStoreRate = targetStoreRate(client, builder, closer); AtomicReference<Float> sampleRate = new AtomicReference(builder.initialRate); String sampleRatePath = builder.basePath + "/sampleRate"; closer.register( new SampleRateListener(client, sampleRatePath, sampleRate, boundary)); closer.register(new SampleRateUpdater( client, storeRateMember, builder.basePath + "/storeRates", sampleRatePath, new SampleRateCalculatorInput(builder, targetStoreRate).andThen( new SampleRateCalculator(targetStoreRate, sampleRate)), closer.register(new SampleRateUpdateGuard(client, builder)))); } static GroupMember storeRateGroup(CuratorFramework client, Builder builder, Closer closer, AtomicInteger spanCount, AtomicInteger storeRate) { String storeRatePath = ensureExists(client, builder.basePath + "/storeRates"); GroupMember storeRateGroup = closer.register(new GroupMember(client, storeRatePath, builder.id)); log.debug("{} is to join the group {}", builder.id, storeRatePath); storeRateGroup.start(); ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor(); closer.register(executor::shutdown); ScheduledFuture<?> future = executor.scheduleAtFixedRate(() -> { int oldValue = storeRate.get(); int newValue = (int) (1.0 * spanCount.getAndSet(0) * 60 / builder.updateFrequency); log.debug("Store rates was: {} now {}", oldValue, newValue); if (oldValue != newValue) { storeRate.set(newValue); storeRateGroup.setThisData(Integer.valueOf(newValue).toString().getBytes(UTF_8)); } }, 0, builder.updateFrequency, TimeUnit.SECONDS); closer.register(() -> future.cancel(true)); return storeRateGroup; } /** read-only */ static AtomicInteger targetStoreRate(CuratorFramework client, Builder builder, Closer closer) { String targetStoreRatePath = ensureExists(client, builder.basePath + "/targetStoreRate"); NodeCache cache = closer.register(new NodeCache(client, targetStoreRatePath)); try { cache.start(); } catch (Exception e) { throw new IllegalStateException("Error starting cache for " + targetStoreRatePath, e); } AtomicInteger targetStoreRate = new AtomicInteger(); cache.getListenable().addListener(() -> { byte[] bytes = cache.getCurrentData().getData(); if (bytes.length == 0) return; try { targetStoreRate.set(Integer.valueOf(new String(bytes, UTF_8))); } catch (NumberFormatException e) { log.warn("Error parsing target store rate {}", e.getMessage()); return; } }); return targetStoreRate; } static String ensureExists(CuratorFramework client, String path) { try { client.checkExists().creatingParentContainersIfNeeded().forPath(path); return path; } catch (Exception e) { throw new IllegalStateException("Error creating " + path, e); } } @Override public void close() throws IOException { closer.close(); } @Override public boolean isSampled(Span span) { boolean result = super.isSampled(span); if (result) spanCount.incrementAndGet(); return result; } @Override protected long boundary() { return boundary.get(); } }