/**
* Copyright 2015-2017 The OpenZipkin Authors
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
* in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
* or implied. See the License for the specific language governing permissions and limitations under
* the License.
*/
package zipkin.storage.cassandra3;
import com.datastax.driver.core.BoundStatement;
import com.datastax.driver.core.KeyspaceMetadata;
import com.datastax.driver.core.PreparedStatement;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.querybuilder.QueryBuilder;
import com.datastax.driver.core.utils.UUIDs;
import com.google.common.base.Function;
import com.google.common.collect.ContiguousSet;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterators;
import com.google.common.collect.Ordering;
import com.google.common.collect.Range;
import com.google.common.util.concurrent.AsyncFunction;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import zipkin.Codec;
import zipkin.DependencyLink;
import zipkin.Span;
import zipkin.internal.CorrectForClockSkew;
import zipkin.internal.DependencyLinker;
import zipkin.internal.GroupByTraceId;
import zipkin.internal.MergeById;
import zipkin.internal.Nullable;
import zipkin.storage.QueryRequest;
import zipkin.storage.cassandra3.Schema.AnnotationUDT;
import zipkin.storage.cassandra3.Schema.BinaryAnnotationUDT;
import zipkin.storage.cassandra3.Schema.TraceIdUDT;
import zipkin.storage.guava.GuavaSpanStore;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.DiscreteDomain.integers;
import static com.google.common.util.concurrent.Futures.allAsList;
import static com.google.common.util.concurrent.Futures.immediateFailedFuture;
import static com.google.common.util.concurrent.Futures.immediateFuture;
import static com.google.common.util.concurrent.Futures.transform;
import static zipkin.internal.Util.getDays;
import static zipkin.storage.cassandra3.Schema.TABLE_SERVICE_SPANS;
import static zipkin.storage.cassandra3.Schema.TABLE_TRACES;
import static zipkin.storage.cassandra3.Schema.TABLE_TRACE_BY_SERVICE_SPAN;
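/**
 * Read side of the Cassandra 3 storage component: trace queries fan out over the
 * service/span-name and annotation indexes to collect trace ids, then fetch the matching rows
 * from {@link Schema#TABLE_TRACES}.
 */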
final class CassandraSpanStore implements GuavaSpanStore {
private static final Logger LOG = LoggerFactory.getLogger(CassandraSpanStore.class);
static final ListenableFuture<List<String>> EMPTY_LIST =
immediateFuture(Collections.<String>emptyList());
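  // Zipkin's storage contract represents "not found" as null rather than an empty list.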
static final Function<List<Span>, List<Span>> OR_NULL = input -> input.isEmpty() ? null : input;
private final int maxTraceCols;
private final int indexFetchMultiplier;
private final boolean strictTraceId;
private final Session session;
private final PreparedStatement selectTraces;
private final PreparedStatement selectDependencies;
private final PreparedStatement selectServiceNames;
private final PreparedStatement selectSpanNames;
private final PreparedStatement selectTraceIdsByServiceSpanName;
private final PreparedStatement selectTraceIdsByServiceSpanNameAndDuration;
private final PreparedStatement selectTraceIdsByAnnotation;
private final Function<ResultSet, Map<TraceIdUDT, Long>> traceIdToTimestamp;
private final Function<List<Map<TraceIdUDT, Long>>, Map<TraceIdUDT, Long>> collapseTraceIdMaps;
private final int traceTtl;
private final int indexTtl;
CassandraSpanStore(Session session, int maxTraceCols, int indexFetchMultiplier,
boolean strictTraceId) {
this.session = session;
this.maxTraceCols = maxTraceCols;
this.indexFetchMultiplier = indexFetchMultiplier;
this.strictTraceId = strictTraceId;
selectTraces = session.prepare(
QueryBuilder.select("trace_id", "id", "ts", "span_name", "parent_id", "duration",
"annotations", "binary_annotations")
.from(TABLE_TRACES)
.where(QueryBuilder.in("trace_id", QueryBuilder.bindMarker("trace_id")))
.limit(QueryBuilder.bindMarker("limit_")));
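    // Dependency links are pre-aggregated into one row per day (typically by a separate
    // aggregation job), so reading them back is a simple per-day lookup.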
selectDependencies = session.prepare(
QueryBuilder.select("links")
.from(Schema.TABLE_DEPENDENCIES)
.where(QueryBuilder.in("day", QueryBuilder.bindMarker("days"))));
selectServiceNames = session.prepare(
QueryBuilder.select("service_name")
.distinct()
.from(TABLE_SERVICE_SPANS));
selectSpanNames = session.prepare(
QueryBuilder.select("span_name")
.from(TABLE_SERVICE_SPANS)
.where(QueryBuilder.eq("service_name", QueryBuilder.bindMarker("service_name")))
.limit(QueryBuilder.bindMarker("limit_")));
selectTraceIdsByServiceSpanName = session.prepare(
QueryBuilder.select("ts", "trace_id")
.from(TABLE_TRACE_BY_SERVICE_SPAN)
.where(QueryBuilder.eq("service_name", QueryBuilder.bindMarker("service_name")))
.and(QueryBuilder.eq("span_name", QueryBuilder.bindMarker("span_name")))
.and(QueryBuilder.eq("bucket", QueryBuilder.bindMarker("bucket")))
.and(QueryBuilder.gte("ts", QueryBuilder.bindMarker("start_ts")))
.and(QueryBuilder.lte("ts", QueryBuilder.bindMarker("end_ts")))
.limit(QueryBuilder.bindMarker("limit_")));
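    // Same lookup as above, additionally bounded by span duration.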
selectTraceIdsByServiceSpanNameAndDuration = session.prepare(
QueryBuilder.select("ts", "trace_id")
.from(TABLE_TRACE_BY_SERVICE_SPAN)
.where(QueryBuilder.eq("service_name", QueryBuilder.bindMarker("service_name")))
.and(QueryBuilder.eq("span_name", QueryBuilder.bindMarker("span_name")))
.and(QueryBuilder.eq("bucket", QueryBuilder.bindMarker("bucket")))
.and(QueryBuilder.gte("ts", QueryBuilder.bindMarker("start_ts")))
.and(QueryBuilder.lte("ts", QueryBuilder.bindMarker("end_ts")))
.and(QueryBuilder.gte("duration", QueryBuilder.bindMarker("start_duration")))
.and(QueryBuilder.lte("duration", QueryBuilder.bindMarker("end_duration")))
.limit(QueryBuilder.bindMarker("limit_")));
selectTraceIdsByAnnotation = session.prepare(
QueryBuilder.select("ts", "trace_id")
.from(TABLE_TRACES)
.where(QueryBuilder.like("all_annotations", QueryBuilder.bindMarker("annotation")))
.and(QueryBuilder.gte("ts_uuid", QueryBuilder.bindMarker("start_ts")))
.and(QueryBuilder.lte("ts_uuid", QueryBuilder.bindMarker("end_ts")))
.limit(QueryBuilder.bindMarker("limit_"))
.allowFiltering());
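    // Collects (trace id -> epoch millis) pairs, converting the timeuuid "ts" column.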
traceIdToTimestamp = input -> {
Map<TraceIdUDT, Long> result = new LinkedHashMap<>();
for (Row row : input) {
result.put(row.get("trace_id", TraceIdUDT.class), UUIDs.unixTimestamp(row.getUUID("ts")));
}
return result;
};
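    // Flattens the per-request maps into one; a duplicate trace id keeps the last timestamp seen.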
collapseTraceIdMaps = input -> {
Map<TraceIdUDT, Long> result = new LinkedHashMap<>();
for (Map<TraceIdUDT, Long> m : input) {
result.putAll(m);
}
return result;
};
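    // Read TTLs from the table metadata, so queries can avoid windows whose data already expired.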
KeyspaceMetadata md = Schema.getKeyspaceMetadata(session);
this.traceTtl = md.getTable(TABLE_TRACES).getOptions().getDefaultTimeToLive();
this.indexTtl = md.getTable(TABLE_TRACE_BY_SERVICE_SPAN).getOptions().getDefaultTimeToLive();
}
/**
* This fans out into a number of requests. The returned future will fail if any of the
* inputs fail.
*
* <p>When {@link QueryRequest#serviceName service name} is unset, service names will be
* fetched eagerly, implying an additional query.
*
   * <p>The duration query is the most expensive query in Cassandra, as it turns into one request
   * per hour of {@link QueryRequest#lookback lookback}. Since lookback is commonly set to a day,
   * that can mean 24 requests to the backend!
*
* <p>See https://github.com/openzipkin/zipkin-java/issues/200
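   *
   * <p>A minimal sketch of a caller, where {@code store} is an instance of this class (values are
   * illustrative only):
   * <pre>{@code
   * QueryRequest request = QueryRequest.builder()
   *     .serviceName("frontend") // set to avoid the eager service-name fetch
   *     .endTs(System.currentTimeMillis())
   *     .lookback(60 * 60 * 1000L) // a small lookback keeps the fan-out low
   *     .limit(10).build();
   * ListenableFuture<List<List<Span>>> traces = store.getTraces(request);
   * }</pre>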
*/
@Override
public ListenableFuture<List<List<Span>>> getTraces(final QueryRequest request) {
    // Over-fetch on indexes, as they don't return distinct (trace id, timestamp) rows.
final int traceIndexFetchSize = request.limit * indexFetchMultiplier;
ListenableFuture<Map<TraceIdUDT, Long>> traceIdToTimestamp = getTraceIdsByServiceNames(request);
List<String> annotationKeys = CassandraUtil.annotationKeys(request);
ListenableFuture<Collection<TraceIdUDT>> traceIds;
if (annotationKeys.isEmpty()) {
// Simplest case is when there is no annotation query. Limit is valid since there's no AND
// query that could reduce the results returned to less than the limit.
traceIds =
Futures.transform(traceIdToTimestamp, CassandraUtil.traceIdsSortedByDescTimestamp());
} else {
      // While this is a valid port of the Scala cassandra span store (from zipkin 1.35), there is
      // a fault: each annotation key is an intersection, meaning we likely return fewer than
      // traceIndexFetchSize results.
List<ListenableFuture<Map<TraceIdUDT, Long>>> futureKeySetsToIntersect = new ArrayList<>();
if (request.spanName != null) {
futureKeySetsToIntersect.add(traceIdToTimestamp);
}
for (String annotationKey : annotationKeys) {
futureKeySetsToIntersect
.add(getTraceIdsByAnnotation(annotationKey, request.endTs, request.lookback, traceIndexFetchSize));
}
// We achieve the AND goal, by intersecting each of the key sets.
traceIds =
Futures.transform(allAsList(futureKeySetsToIntersect), CassandraUtil.intersectKeySets());
// @xxx the sorting by timestamp desc is broken here^
}
return transform(traceIds, new AsyncFunction<Collection<TraceIdUDT>, List<List<Span>>>() {
@Override public ListenableFuture<List<List<Span>>> apply(Collection<TraceIdUDT> traceIds) {
ImmutableSet<TraceIdUDT> set =
ImmutableSet.copyOf(Iterators.limit(traceIds.iterator(), request.limit));
return transform(getSpansByTraceIds(set, maxTraceCols),
new Function<List<Span>, List<List<Span>>>() {
@Override public List<List<Span>> apply(List<Span> input) {
return GroupByTraceId.apply(input, strictTraceId, true);
}
});
}
@Override public String toString() {
return "getSpansByTraceIds";
}
});
}
@Override public ListenableFuture<List<Span>> getRawTrace(long traceId) {
return getRawTrace(0L, traceId);
}
@Override public ListenableFuture<List<Span>> getRawTrace(long traceIdHigh, long traceIdLow) {
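    // When strict trace ids are disabled, spans are written with zeroed high bits, so zero them
    // here as well: this lets 64-bit lookups match 128-bit traces.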
    TraceIdUDT traceIdUDT = new TraceIdUDT(strictTraceId ? traceIdHigh : 0L, traceIdLow);
return transform(getSpansByTraceIds(Collections.singleton(traceIdUDT), maxTraceCols), OR_NULL);
}
@Override public ListenableFuture<List<Span>> getTrace(long traceId) {
return getTrace(0L, traceId);
}
@Override public ListenableFuture<List<Span>> getTrace(long traceIdHigh, long traceIdLow) {
return transform(getRawTrace(traceIdHigh, traceIdLow), AdjustTrace.INSTANCE);
}
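  /** Merges spans that share an id and corrects clock skew; an empty result maps to null (not found). */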
enum AdjustTrace implements Function<Collection<Span>, List<Span>> {
INSTANCE;
@Override public List<Span> apply(Collection<Span> input) {
List<Span> result = CorrectForClockSkew.apply(MergeById.apply(input));
return result.isEmpty() ? null : result;
}
}
@Override public ListenableFuture<List<String>> getServiceNames() {
try {
BoundStatement bound = CassandraUtil.bindWithName(selectServiceNames, "select-service-names");
return transform(session.executeAsync(bound), new Function<ResultSet, List<String>>() {
@Override public List<String> apply(ResultSet input) {
Set<String> serviceNames = new LinkedHashSet<>();
for (Row row : input) {
serviceNames.add(row.getString("service_name"));
}
return Ordering.natural().sortedCopy(serviceNames);
}
}
);
} catch (RuntimeException ex) {
return immediateFailedFuture(ex);
}
}
@Override public ListenableFuture<List<String>> getSpanNames(String serviceName) {
    if (serviceName == null || serviceName.isEmpty()) return EMPTY_LIST;
    serviceName = serviceName.toLowerCase();
try {
BoundStatement bound = CassandraUtil.bindWithName(selectSpanNames, "select-span-names")
.setString("service_name", serviceName)
// no one is ever going to browse so many span names
.setInt("limit_", 1000);
return transform(session.executeAsync(bound), new Function<ResultSet, List<String>>() {
@Override public List<String> apply(ResultSet input) {
Set<String> spanNames = new LinkedHashSet<>();
for (Row row : input) {
if (!row.getString("span_name").isEmpty()) {
spanNames.add(row.getString("span_name"));
}
}
return Ordering.natural().sortedCopy(spanNames);
}
}
);
} catch (RuntimeException ex) {
return immediateFailedFuture(ex);
}
}
@Override public ListenableFuture<List<DependencyLink>> getDependencies(long endTs,
@Nullable Long lookback) {
List<Date> days = getDays(endTs, lookback);
try {
BoundStatement bound = CassandraUtil.bindWithName(selectDependencies, "select-dependencies")
.setList("days", days);
return transform(session.executeAsync(bound), ConvertDependenciesResponse.INSTANCE);
} catch (RuntimeException ex) {
return immediateFailedFuture(ex);
}
}
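  /** Decodes each day's thrift-encoded links, then merges the links across all requested days. */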
enum ConvertDependenciesResponse implements Function<ResultSet, List<DependencyLink>> {
INSTANCE;
@Override public List<DependencyLink> apply(ResultSet rs) {
ImmutableList.Builder<DependencyLink> unmerged = ImmutableList.builder();
for (Row row : rs) {
ByteBuffer encodedDayOfDependencies = row.getBytes("links");
for (DependencyLink link : Codec.THRIFT.readDependencyLinks(encodedDayOfDependencies)) {
unmerged.add(link);
}
}
return DependencyLinker.merge(unmerged.build());
}
}
  /**
   * Retrieves the spans for the given trace ids from the storage system. Spans in a trace should
   * be sorted by the timestamp of their first annotation, such that the earliest event comes
   * first in the list.
   *
   * <p>The returned list contains only spans that were found; it may therefore not match the
   * provided list of ids one-to-one.
   */
ListenableFuture<List<Span>> getSpansByTraceIds(Set<TraceIdUDT> traceIds, int limit) {
checkNotNull(traceIds, "traceIds");
if (traceIds.isEmpty()) {
return immediateFuture(Collections.<Span>emptyList());
}
try {
BoundStatement bound = CassandraUtil.bindWithName(selectTraces, "select-traces")
.setSet("trace_id", traceIds)
.setInt("limit_", limit);
return transform(session.executeAsync(bound),
new Function<ResultSet, List<Span>>() {
@Override public List<Span> apply(ResultSet input) {
List<Span> result = new ArrayList<>(input.getAvailableWithoutFetching());
for (Row row : input) {
TraceIdUDT traceId = row.get("trace_id", TraceIdUDT.class);
Span.Builder builder = Span.builder()
.traceIdHigh(traceId.getHigh())
.traceId(traceId.getLow())
.id(row.getLong("id"))
                    .name(row.getString("span_name"));
if (!row.isNull("ts")) {
builder = builder.timestamp(row.getLong("ts"));
}
if (!row.isNull("duration")) {
builder = builder.duration(row.getLong("duration"));
}
if (!row.isNull("parent_id")) {
builder = builder.parentId(row.getLong("parent_id"));
}
for (AnnotationUDT udt : row.getList("annotations", AnnotationUDT.class)) {
builder = builder.addAnnotation(udt.toAnnotation());
}
for (BinaryAnnotationUDT udt : row.getList("binary_annotations",
BinaryAnnotationUDT.class)) {
builder = builder.addBinaryAnnotation(udt.toBinaryAnnotation());
}
result.add(builder.build());
}
return result;
}
}
);
} catch (RuntimeException ex) {
return immediateFailedFuture(ex);
}
}
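  /**
   * Looks up trace ids in the service/span-name index, bounded by the request's time window and,
   * when set, its duration range. This issues one query per (service name, time bucket)
   * combination, so an unset service name multiplies the fan-out by the number of services.
   */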
ListenableFuture<Map<TraceIdUDT, Long>> getTraceIdsByServiceNames(QueryRequest request) {
    long oldestData = Math.max(System.currentTimeMillis() - indexTtl * 1000L, 0); // >= 1970; long math avoids int overflow
long startTsMillis = Math.max((request.endTs - request.lookback), oldestData);
long endTsMillis = Math.max(request.endTs, oldestData);
try {
Set<String> serviceNames;
if (null != request.serviceName) {
serviceNames = Collections.singleton(request.serviceName);
} else {
serviceNames = new LinkedHashSet<>(getServiceNames().get());
if (serviceNames.isEmpty()) {
return immediateFuture(Collections.<TraceIdUDT, Long>emptyMap());
}
}
int startBucket = CassandraUtil.durationIndexBucket(startTsMillis * 1000);
int endBucket = CassandraUtil.durationIndexBucket(endTsMillis * 1000);
if (startBucket > endBucket) {
throw new IllegalArgumentException(
"Start bucket (" + startBucket + ") > end bucket (" + endBucket + ")");
}
Set<Integer> buckets = ContiguousSet.create(Range.closed(startBucket, endBucket), integers());
boolean withDuration = null != request.minDuration || null != request.maxDuration;
List<ListenableFuture<Map<TraceIdUDT, Long>>> futures = new ArrayList<>();
if (200 < serviceNames.size() * buckets.size()) {
LOG.warn("read against " + TABLE_TRACE_BY_SERVICE_SPAN
+ " fanning out to " + serviceNames.size() * buckets.size() + " requests");
//@xxx the fan-out of requests here can be improved
}
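      // Fan out: one bounded query per (service name, time bucket) in the lookback window.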
for (String serviceName : serviceNames) {
for (Integer bucket : buckets) {
BoundStatement bound = CassandraUtil
.bindWithName(
withDuration ? selectTraceIdsByServiceSpanNameAndDuration
: selectTraceIdsByServiceSpanName,
"select-trace-ids-by-service-name")
.setString("service_name", serviceName)
.setString("span_name", null != request.spanName ? request.spanName : "")
.setInt("bucket", bucket)
.setUUID("start_ts", UUIDs.startOf(startTsMillis))
.setUUID("end_ts", UUIDs.endOf(endTsMillis))
.setInt("limit_", request.limit);
if (withDuration) {
bound = bound
.setLong("start_duration", null != request.minDuration ? request.minDuration : 0)
.setLong("end_duration",
null != request.maxDuration ? request.maxDuration : Long.MAX_VALUE);
}
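        // Paging is effectively disabled; each query is already bounded by limit_.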
bound.setFetchSize(Integer.MAX_VALUE);
futures.add(transform(session.executeAsync(bound), traceIdToTimestamp));
}
}
return transform(allAsList(futures), collapseTraceIdMaps);
} catch (RuntimeException | InterruptedException | ExecutionException ex) {
return immediateFailedFuture(ex);
}
}
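  /**
   * Looks up trace ids whose annotations contain the given key, via a LIKE query over the
   * all_annotations column (which depends on an index defined in the schema). The time window is
   * clamped to the index TTL, so we never scan data that has already expired.
   */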
ListenableFuture<Map<TraceIdUDT, Long>> getTraceIdsByAnnotation(
String annotationKey,
long endTsMillis,
long lookbackMillis,
int limit) {
    long oldestData = Math.max(System.currentTimeMillis() - indexTtl * 1000L, 0); // >= 1970; long math avoids int overflow
long startTsMillis = Math.max((endTsMillis - lookbackMillis), oldestData);
endTsMillis = Math.max(endTsMillis, oldestData);
try {
BoundStatement bound =
CassandraUtil.bindWithName(selectTraceIdsByAnnotation, "select-trace-ids-by-annotation")
.setString("annotation", "%" + annotationKey + "%")
.setUUID("start_ts", UUIDs.startOf(startTsMillis))
.setUUID("end_ts", UUIDs.endOf(endTsMillis))
.setInt("limit_", limit);
return transform(session.executeAsync(bound),
new Function<ResultSet, Map<TraceIdUDT, Long>>() {
@Override public Map<TraceIdUDT, Long> apply(ResultSet input) {
Map<TraceIdUDT, Long> traceIdsToTimestamps = new LinkedHashMap<>();
for (Row row : input) {
traceIdsToTimestamps.put(
row.get("trace_id", TraceIdUDT.class),
row.getLong("ts"));
}
return traceIdsToTimestamps;
}
}
);
} catch (RuntimeException ex) {
return immediateFailedFuture(ex);
}
}
}