/*
* Copyright 2014-2017 Netflix, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.netflix.spectator.atlas;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.fasterxml.jackson.dataformat.smile.SmileFactory;
import com.netflix.spectator.api.AbstractRegistry;
import com.netflix.spectator.api.Clock;
import com.netflix.spectator.api.Counter;
import com.netflix.spectator.api.DistributionSummary;
import com.netflix.spectator.api.Gauge;
import com.netflix.spectator.api.Id;
import com.netflix.spectator.api.Measurement;
import com.netflix.spectator.api.Tag;
import com.netflix.spectator.api.Timer;
import com.netflix.spectator.atlas.impl.EvalPayload;
import com.netflix.spectator.atlas.impl.Evaluator;
import com.netflix.spectator.atlas.impl.MeasurementSerializer;
import com.netflix.spectator.atlas.impl.PublishPayload;
import com.netflix.spectator.atlas.impl.Subscription;
import com.netflix.spectator.atlas.impl.Subscriptions;
import com.netflix.spectator.atlas.impl.TagsValuePair;
import com.netflix.spectator.impl.AsciiSet;
import com.netflix.spectator.impl.Scheduler;
import com.netflix.spectator.sandbox.HttpClient;
import com.netflix.spectator.sandbox.HttpResponse;
import java.net.URI;
import java.time.Duration;
import java.time.Instant;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
/**
* Registry for reporting metrics to Atlas.
*/
public final class AtlasRegistry extends AbstractRegistry {
private static final String CLOCK_SKEW_TIMER = "spectator.atlas.clockSkew";
private final Clock clock;
private final boolean enabled;
private final Duration step;
private final long stepMillis;
private final URI uri;
private final boolean lwcEnabled;
private final Duration configRefreshFrequency;
private final long configTTL;
private final URI configUri;
private final URI evalUri;
private final int connectTimeout;
private final int readTimeout;
private final int batchSize;
private final int numThreads;
private final Map<String, String> commonTags;
private final AsciiSet charset;
private final Map<String, AsciiSet> overrides;
private final ObjectMapper jsonMapper;
private final ObjectMapper smileMapper;
private Scheduler scheduler;
private final Map<Subscription, Long> subscriptions = new ConcurrentHashMap<>();
/** Create a new instance. */
public AtlasRegistry(Clock clock, AtlasConfig config) {
super(new StepClock(clock, config.step().toMillis()), config);
this.clock = clock;
this.enabled = config.enabled();
this.step = config.step();
this.stepMillis = step.toMillis();
this.uri = URI.create(config.uri());
this.lwcEnabled = config.lwcEnabled();
this.configRefreshFrequency = config.configRefreshFrequency();
this.configTTL = config.configTTL().toMillis();
this.configUri = URI.create(config.configUri());
this.evalUri = URI.create(config.evalUri());
this.connectTimeout = (int) config.connectTimeout().toMillis();
this.readTimeout = (int) config.readTimeout().toMillis();
this.batchSize = config.batchSize();
this.numThreads = config.numThreads();
this.commonTags = new TreeMap<>(config.commonTags());
this.charset = AsciiSet.fromPattern(config.validTagCharacters());
this.overrides = config.validTagValueCharacters()
.keySet().stream()
.collect(Collectors.toMap(k -> k, AsciiSet::fromPattern));
SimpleModule module = new SimpleModule()
.addSerializer(Measurement.class, new MeasurementSerializer(charset, overrides));
this.jsonMapper = new ObjectMapper(new JsonFactory()).registerModule(module);
this.smileMapper = new ObjectMapper(new SmileFactory()).registerModule(module);
}
/**
* Start the scheduler to collect metrics data.
*/
public void start() {
if (scheduler == null) {
// Setup main collection for publishing to Atlas
if (enabled || lwcEnabled) {
Scheduler.Options options = new Scheduler.Options()
.withFrequency(Scheduler.Policy.FIXED_RATE_SKIP_IF_LONG, step)
.withInitialDelay(Duration.ofMillis(getInitialDelay(stepMillis)))
.withStopOnFailure(false);
scheduler = new Scheduler(this, "spectator-reg-atlas", numThreads);
scheduler.schedule(options, this::collectData);
logger.info("started collecting metrics every {} reporting to {}", step, uri);
logger.info("common tags: {}", commonTags);
} else {
logger.info("publishing is not enabled");
}
// Setup collection for subscriptions
if (lwcEnabled) {
Scheduler.Options options = new Scheduler.Options()
.withFrequency(Scheduler.Policy.FIXED_DELAY, configRefreshFrequency)
.withStopOnFailure(false);
scheduler.schedule(options, this::fetchSubscriptions);
} else {
logger.info("subscriptions are not enabled");
}
} else {
logger.warn("registry already started, ignoring duplicate request");
}
}
/**
* Avoid collecting right on boundaries to minimize transitions on step longs
* during a collection. Randomly distribute across the middle of the step interval.
*/
long getInitialDelay(long stepSize) {
long now = clock.wallTime();
long stepBoundary = now / stepSize * stepSize;
// Buffer by 10% of the step interval on either side
long offset = stepSize / 10;
// Check if the current delay is within the acceptable range
long delay = now - stepBoundary;
if (delay < offset) {
return delay + offset;
} else if (delay > stepSize - offset) {
return stepSize - offset;
} else {
return delay;
}
}
/**
* Stop the scheduler reporting Atlas data.
*/
public void stop() {
if (scheduler != null) {
scheduler.shutdown();
scheduler = null;
logger.info("stopped collecting metrics every {}ms reporting to {}", step, uri);
} else {
logger.warn("registry stopped, but was never started");
}
}
private void collectData() {
// Send data for any subscriptions
if (lwcEnabled) {
try {
handleSubscriptions();
} catch (Exception e) {
logger.warn("failed to handle subscriptions", e);
}
}
// Publish to Atlas
if (enabled) {
try {
for (List<Measurement> batch : getBatches()) {
PublishPayload p = new PublishPayload(commonTags, batch);
HttpResponse res = HttpClient.DEFAULT.newRequest("spectator-reg-atlas", uri)
.withMethod("POST")
.withConnectTimeout(connectTimeout)
.withReadTimeout(readTimeout)
.withContent("application/x-jackson-smile", smileMapper.writeValueAsBytes(p))
.send();
Instant date = res.dateHeader("Date");
recordClockSkew((date == null) ? 0L : date.toEpochMilli());
}
} catch (Exception e) {
logger.warn("failed to send metrics", e);
}
}
}
private void handleSubscriptions() {
List<Subscription> subs = new ArrayList<>(subscriptions.keySet());
if (!subs.isEmpty()) {
List<TagsValuePair> ms = getMeasurements().stream()
.map(this::newTagsValuePair)
.collect(Collectors.toList());
Evaluator evaluator = new Evaluator().addGroupSubscriptions("local", subs);
EvalPayload payload = evaluator.eval("local", clock().wallTime(), ms);
try {
String json = jsonMapper.writeValueAsString(payload);
HttpClient.DEFAULT.newRequest("spectator-lwc-eval", evalUri)
.withMethod("POST")
.withConnectTimeout(connectTimeout)
.withReadTimeout(readTimeout)
.withJsonContent(json)
.send()
.decompress();
} catch (Exception e) {
logger.warn("failed to send metrics for subscriptions", e);
}
}
}
private void fetchSubscriptions() {
try {
HttpResponse res = HttpClient.DEFAULT.newRequest("spectator-lwc-subs", configUri)
.withMethod("GET")
.withConnectTimeout(connectTimeout)
.withReadTimeout(readTimeout)
.send()
.decompress();
if (res.status() != 200) {
logger.warn("failed to update subscriptions, received status {}", res.status());
} else {
Subscriptions subs = jsonMapper.readValue(res.entity(), Subscriptions.class);
long now = clock.wallTime();
subs.update(subscriptions, now, now + configTTL);
}
} catch (Exception e) {
logger.warn("failed to send metrics", e);
}
}
/**
* Record the difference between the date response time and the local time on the server.
* This is used to get a rough idea of the amount of skew in the environment. Ideally it
* should be fairly small. The date header will only have seconds so we expect to regularly
* have differences of up to 1 second. Note, that it is a rough estimate and could be
* elevated because of unrelated problems like GC or network delays.
*/
private void recordClockSkew(long responseTimestamp) {
if (responseTimestamp == 0L) {
logger.debug("no date timestamp on response, cannot record skew");
} else {
final long delta = clock.wallTime() - responseTimestamp;
if (delta >= 0L) {
// Local clock is running fast compared to the server. Note this should also be the
// common case for if the clocks are in sync as there will be some delay for the server
// response to reach this node.
timer(CLOCK_SKEW_TIMER, "id", "fast").record(delta, TimeUnit.MILLISECONDS);
} else {
// Local clock is running slow compared to the server. This means the response timestamp
// appears to be after the current time on this node. The timer will ignore negative
// values so we negate and record it with a different id.
timer(CLOCK_SKEW_TIMER, "id", "slow").record(-delta, TimeUnit.MILLISECONDS);
}
logger.debug("clock skew between client and server: {}ms", delta);
}
}
private Map<String, String> toMap(Id id) {
Map<String, String> tags = new HashMap<>();
for (Tag t : id.tags()) {
String k = charset.replaceNonMembers(t.key(), '_');
String v = overrides.getOrDefault(k, charset).replaceNonMembers(t.value(), '_');
tags.put(k, v);
}
String name = overrides.getOrDefault("name", charset).replaceNonMembers(id.name(), '_');
tags.put("name", name);
return tags;
}
private TagsValuePair newTagsValuePair(Measurement m) {
Map<String, String> tags = toMap(m.id());
tags.putAll(commonTags);
return new TagsValuePair(tags, m.value());
}
/** Get a list of all measurements from the registry. */
List<Measurement> getMeasurements() {
return stream()
.flatMap(m -> StreamSupport.stream(m.measure().spliterator(), false))
.collect(Collectors.toList());
}
/** Get a list of all measurements and break them into batches. */
List<List<Measurement>> getBatches() {
List<List<Measurement>> batches = new ArrayList<>();
List<Measurement> ms = getMeasurements();
for (int i = 0; i < ms.size(); i += batchSize) {
List<Measurement> batch = ms.subList(i, Math.min(ms.size(), i + batchSize));
batches.add(batch);
}
return batches;
}
@Override protected Counter newCounter(Id id) {
return new AtlasCounter(id, clock, stepMillis);
}
@Override protected DistributionSummary newDistributionSummary(Id id) {
return new AtlasDistributionSummary(id, clock, stepMillis);
}
@Override protected Timer newTimer(Id id) {
return new AtlasTimer(id, clock, stepMillis);
}
@Override protected Gauge newGauge(Id id) {
// Be sure to get StepClock so the measurements will have step aligned
// timestamps.
return new AtlasGauge(id, clock());
}
}