/**
 * Copyright 2015-2017 The OpenZipkin Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */
package zipkin.storage.cassandra;

import com.datastax.driver.core.BoundStatement;
import com.datastax.driver.core.PreparedStatement;
import com.datastax.driver.core.ProtocolVersion;
import com.datastax.driver.core.ResultSet;
import com.datastax.driver.core.Row;
import com.datastax.driver.core.Session;
import com.datastax.driver.core.querybuilder.QueryBuilder;
import com.google.common.base.Function;
import com.google.common.collect.ContiguousSet;
import com.google.common.collect.FluentIterable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterators;
import com.google.common.collect.Ordering;
import com.google.common.collect.Range;
import com.google.common.util.concurrent.AsyncFunction;
import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import java.nio.ByteBuffer;
import java.nio.charset.CharacterCodingException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import zipkin.Codec;
import zipkin.DependencyLink;
import zipkin.Span;
import zipkin.internal.CorrectForClockSkew;
import zipkin.internal.Dependencies;
import zipkin.internal.DependencyLinker;
import zipkin.internal.GroupByTraceId;
import zipkin.internal.MergeById;
import zipkin.internal.Nullable;
import zipkin.storage.QueryRequest;
import zipkin.storage.guava.GuavaSpanStore;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.collect.DiscreteDomain.integers;
import static com.google.common.util.concurrent.Futures.allAsList;
import static com.google.common.util.concurrent.Futures.immediateFailedFuture;
import static com.google.common.util.concurrent.Futures.immediateFuture;
import static com.google.common.util.concurrent.Futures.transform;
import static zipkin.internal.Util.getDays;
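/**
 * CQL span store for the Cassandra 2.x schema. All statements are prepared once in the
 * constructor and executed asynchronously via {@link Session#executeAsync}, with results adapted
 * to the {@link GuavaSpanStore} future-based contract.
 */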
public final class CassandraSpanStore implements GuavaSpanStore {
  private static final Logger LOG = LoggerFactory.getLogger(CassandraSpanStore.class);
  static final ListenableFuture<List<String>> EMPTY_LIST =
      immediateFuture(Collections.<String>emptyList());

  private final int maxTraceCols;
  private final int indexFetchMultiplier;
  private final boolean strictTraceId;
  private final Session session;
  private final TimestampCodec timestampCodec;
  private final Set<Integer> buckets;
  private final PreparedStatement selectTraces;
  private final PreparedStatement selectDependencies;
  private final PreparedStatement selectServiceNames;
  private final PreparedStatement selectSpanNames;
  private final PreparedStatement selectTraceIdsByServiceName;
  private final PreparedStatement selectTraceIdsByServiceNames;
  private final PreparedStatement selectTraceIdsBySpanName;
  private final PreparedStatement selectTraceIdsByAnnotation;
  private final Function<ResultSet, Map<Long, Long>> traceIdToTimestamp;

  CassandraSpanStore(Session session, int bucketCount, int maxTraceCols, int indexFetchMultiplier,
      boolean strictTraceId) {
    this.session = session;
    this.maxTraceCols = maxTraceCols;
    this.indexFetchMultiplier = indexFetchMultiplier;
    this.strictTraceId = strictTraceId;
    ProtocolVersion protocolVersion = session.getCluster()
        .getConfiguration().getProtocolOptions().getProtocolVersion();
    this.timestampCodec = new TimestampCodec(protocolVersion);
    this.buckets = ContiguousSet.create(Range.closedOpen(0, bucketCount), integers());

    selectTraces = session.prepare(
        QueryBuilder.select("trace_id", "span")
            .from("traces")
            .where(QueryBuilder.in("trace_id", QueryBuilder.bindMarker("trace_id")))
            .limit(QueryBuilder.bindMarker("limit_")));

    selectDependencies = session.prepare(
        QueryBuilder.select("dependencies")
            .from("dependencies")
            .where(QueryBuilder.in("day", QueryBuilder.bindMarker("days"))));

    selectServiceNames = session.prepare(
        QueryBuilder.select("service_name")
            .from(Tables.SERVICE_NAMES));

    selectSpanNames = session.prepare(
        QueryBuilder.select("span_name")
            .from(Tables.SPAN_NAMES)
            .where(QueryBuilder.eq("service_name", QueryBuilder.bindMarker("service_name")))
            .and(QueryBuilder.eq("bucket", QueryBuilder.bindMarker("bucket")))
            .limit(QueryBuilder.bindMarker("limit_")));

    selectTraceIdsByServiceName = session.prepare(
        QueryBuilder.select("ts", "trace_id")
            .from(Tables.SERVICE_NAME_INDEX)
            .where(QueryBuilder.eq("service_name", QueryBuilder.bindMarker("service_name")))
            .and(QueryBuilder.in("bucket", QueryBuilder.bindMarker("bucket")))
            .and(QueryBuilder.gte("ts", QueryBuilder.bindMarker("start_ts")))
            .and(QueryBuilder.lte("ts", QueryBuilder.bindMarker("end_ts")))
            .limit(QueryBuilder.bindMarker("limit_"))
            .orderBy(QueryBuilder.desc("ts")));

    selectTraceIdsBySpanName = session.prepare(
        QueryBuilder.select("ts", "trace_id")
            .from(Tables.SERVICE_SPAN_NAME_INDEX)
            .where(
                QueryBuilder.eq("service_span_name", QueryBuilder.bindMarker("service_span_name")))
            .and(QueryBuilder.gte("ts", QueryBuilder.bindMarker("start_ts")))
            .and(QueryBuilder.lte("ts", QueryBuilder.bindMarker("end_ts")))
            .limit(QueryBuilder.bindMarker("limit_"))
            .orderBy(QueryBuilder.desc("ts")));

    selectTraceIdsByAnnotation = session.prepare(
        QueryBuilder.select("ts", "trace_id")
            .from(Tables.ANNOTATIONS_INDEX)
            .where(QueryBuilder.eq("annotation", QueryBuilder.bindMarker("annotation")))
            .and(QueryBuilder.in("bucket", QueryBuilder.bindMarker("bucket")))
            .and(QueryBuilder.gte("ts", QueryBuilder.bindMarker("start_ts")))
            .and(QueryBuilder.lte("ts", QueryBuilder.bindMarker("end_ts")))
            .limit(QueryBuilder.bindMarker("limit_"))
            .orderBy(QueryBuilder.desc("ts")));

    if (protocolVersion.compareTo(ProtocolVersion.V4) < 0) {
      LOG.warn("Please update Cassandra to 2.2 or later, as some features may fail");
      // Log vs failing on "Partition KEY part service_name cannot be restricted by IN relation"
      selectTraceIdsByServiceNames = null;
    } else {
      selectTraceIdsByServiceNames = session.prepare(
          QueryBuilder.select("ts", "trace_id")
              .from(Tables.SERVICE_NAME_INDEX)
              .where(QueryBuilder.in("service_name", QueryBuilder.bindMarker("service_name")))
              .and(QueryBuilder.in("bucket", QueryBuilder.bindMarker("bucket")))
              .and(QueryBuilder.gte("ts", QueryBuilder.bindMarker("start_ts")))
              .and(QueryBuilder.lte("ts", QueryBuilder.bindMarker("end_ts")))
              .limit(QueryBuilder.bindMarker("limit_"))
              .orderBy(QueryBuilder.desc("ts")));
    }
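
    // Index rows arrive as (ts, trace_id) pairs; collapse them into a trace_id -> timestamp map.
    // A LinkedHashMap preserves the ts-descending row order from the queries above.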
    traceIdToTimestamp = input -> {
      Map<Long, Long> result = new LinkedHashMap<>();
      for (Row row : input) {
        result.put(row.getLong("trace_id"), timestampCodec.deserialize(row, "ts"));
      }
      return result;
    };
  }

  /**
   * This fans out into a potentially large number of requests, proportional to the number of
   * annotations queried. The returned future will fail if any of the inputs fail.
   *
   * <p>When {@link QueryRequest#serviceName service name} is unset, service names will be
   * fetched eagerly, implying an additional query.
   */
  @Override public ListenableFuture<List<List<Span>>> getTraces(final QueryRequest request) {
    // Over-fetch on indexes, as they don't return distinct (trace id, timestamp) rows.
    final int traceIndexFetchSize = request.limit * indexFetchMultiplier;
    ListenableFuture<Map<Long, Long>> traceIdToTimestamp;
    if (request.spanName != null) {
      traceIdToTimestamp = getTraceIdsBySpanName(request.serviceName, request.spanName,
          request.endTs * 1000, request.lookback * 1000, traceIndexFetchSize);
    } else if (request.serviceName != null) {
      traceIdToTimestamp = getTraceIdsByServiceNames(Collections.singletonList(request.serviceName),
          request.endTs * 1000, request.lookback * 1000, traceIndexFetchSize);
    } else {
      checkArgument(selectTraceIdsByServiceNames != null,
          "getTraces without serviceName requires Cassandra 2.2 or later");
      traceIdToTimestamp = transform(getServiceNames(),
          new AsyncFunction<List<String>, Map<Long, Long>>() {
            @Override public ListenableFuture<Map<Long, Long>> apply(List<String> serviceNames) {
              return getTraceIdsByServiceNames(serviceNames,
                  request.endTs * 1000, request.lookback * 1000, traceIndexFetchSize);
            }
          });
    }

    List<String> annotationKeys = CassandraUtil.annotationKeys(request);

    ListenableFuture<Set<Long>> traceIds;
    if (annotationKeys.isEmpty()) {
      // The simplest case is when there is no annotation query. The limit is valid since there's
      // no AND query that could reduce the results returned to less than the limit.
      traceIds = Futures.transform(traceIdToTimestamp, CassandraUtil.keyset());
    } else {
      // While this is a valid port of the scala cassandra span store (from zipkin 1.35), it has a
      // fault: each annotation key is intersected, so we likely return < traceIndexFetchSize.
      List<ListenableFuture<Map<Long, Long>>> futureKeySetsToIntersect = new ArrayList<>();
      if (request.spanName != null) {
        futureKeySetsToIntersect.add(traceIdToTimestamp);
      }
      for (String annotationKey : annotationKeys) {
        futureKeySetsToIntersect.add(getTraceIdsByAnnotation(annotationKey,
            request.endTs * 1000, request.lookback * 1000, traceIndexFetchSize));
      }
      // We achieve the AND goal by intersecting each of the key sets.
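      // For example, with hypothetical key sets {1, 2} for one annotation and {2, 3} for another,
      // only trace id 2 satisfies both conditions and survives the intersection.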
      traceIds = Futures.transform(allAsList(futureKeySetsToIntersect),
          CassandraUtil.intersectKeySets());
    }
    return transform(traceIds, new AsyncFunction<Set<Long>, List<List<Span>>>() {
      @Override public ListenableFuture<List<List<Span>>> apply(Set<Long> traceIds) {
        traceIds = ImmutableSet.copyOf(Iterators.limit(traceIds.iterator(), request.limit));
        return transform(getSpansByTraceIds(traceIds, maxTraceCols),
            new Function<List<Span>, List<List<Span>>>() {
              @Override public List<List<Span>> apply(List<Span> input) {
                // Indexes only contain Span.traceId, so our matches are imprecise on Span.traceIdHigh
                return FluentIterable.from(GroupByTraceId.apply(input, strictTraceId, true))
                    .filter(trace -> trace.get(0).traceIdHigh == 0 || request.test(trace))
                    .toList();
              }
            });
      }

      @Override public String toString() {
        return "getSpansByTraceIds";
      }
    });
  }

  @Override public ListenableFuture<List<Span>> getRawTrace(long traceId) {
    return getRawTrace(0L, traceId);
  }

  /**
   * Since the schema doesn't have a unique index on {@link Span#traceIdHigh}, we have to filter
   * client-side.
   */
  @Override public ListenableFuture<List<Span>> getRawTrace(final long traceIdHigh,
      long traceIdLow) {
    return transform(getSpansByTraceIds(Collections.singleton(traceIdLow), maxTraceCols),
        new Function<List<Span>, List<Span>>() {
          @Override public List<Span> apply(List<Span> input) {
            if (strictTraceId) {
              Iterator<Span> spans = input.iterator();
              while (spans.hasNext()) {
                long nextTraceIdHigh = spans.next().traceIdHigh;
                if (nextTraceIdHigh != 0L && nextTraceIdHigh != traceIdHigh) {
                  spans.remove();
                }
              }
            }
            return input.isEmpty() ? null : input;
          }
        });
  }

  @Override public ListenableFuture<List<Span>> getTrace(long traceId) {
    return getTrace(0L, traceId);
  }

  @Override public ListenableFuture<List<Span>> getTrace(long traceIdHigh, long traceIdLow) {
    return transform(getRawTrace(traceIdHigh, traceIdLow), AdjustTrace.INSTANCE);
  }
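
  // Raw spans for a trace id are first merged by span id, then timestamps are adjusted for clock
  // skew between hosts; an empty result maps to null, signaling the trace wasn't found.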
  enum AdjustTrace implements Function<Collection<Span>, List<Span>> {
    INSTANCE;

    @Override public List<Span> apply(Collection<Span> input) {
      List<Span> result = CorrectForClockSkew.apply(MergeById.apply(input));
      return result.isEmpty() ? null : result;
    }
  }

  @Override public ListenableFuture<List<String>> getServiceNames() {
    try {
      BoundStatement bound = CassandraUtil.bindWithName(selectServiceNames, "select-service-names");
      return transform(session.executeAsync(bound),
          new Function<ResultSet, List<String>>() {
            @Override public List<String> apply(ResultSet input) {
              Set<String> serviceNames = new HashSet<>();
              for (Row row : input) {
                serviceNames.add(row.getString("service_name"));
              }
              return Ordering.natural().sortedCopy(serviceNames);
            }
          }
      );
    } catch (RuntimeException ex) {
      return immediateFailedFuture(ex);
    }
  }

  @Override public ListenableFuture<List<String>> getSpanNames(String serviceName) {
    if (serviceName == null || serviceName.isEmpty()) return EMPTY_LIST;
    serviceName = checkNotNull(serviceName, "serviceName").toLowerCase();
    int bucket = 0;
    try {
      BoundStatement bound = CassandraUtil.bindWithName(selectSpanNames, "select-span-names")
          .setString("service_name", serviceName)
          .setInt("bucket", bucket)
          // no one is ever going to browse so many span names
          .setInt("limit_", 1000);

      return transform(session.executeAsync(bound),
          new Function<ResultSet, List<String>>() {
            @Override public List<String> apply(ResultSet input) {
              Set<String> spanNames = new HashSet<>();
              for (Row row : input) {
                spanNames.add(row.getString("span_name"));
              }
              return Ordering.natural().sortedCopy(spanNames);
            }
          }
      );
    } catch (RuntimeException ex) {
      return immediateFailedFuture(ex);
    }
  }

  @Override public ListenableFuture<List<DependencyLink>> getDependencies(long endTs,
      @Nullable Long lookback) {
    List<Date> days = getDays(endTs, lookback);
    try {
      BoundStatement bound = CassandraUtil.bindWithName(selectDependencies, "select-dependencies")
          .setList("days", days);
      return transform(session.executeAsync(bound), ConvertDependenciesResponse.INSTANCE);
    } catch (RuntimeException ex) {
      return immediateFailedFuture(ex);
    }
  }
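
  // Each row holds one day of pre-aggregated links; DependencyLinker.merge combines links for the
  // same parent/child pair across days. For example, (web -> db, callCount 5) on two separate
  // days merges to (web -> db, callCount 10).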
  enum ConvertDependenciesResponse implements Function<ResultSet, List<DependencyLink>> {
    INSTANCE;

    @Override public List<DependencyLink> apply(ResultSet rs) {
      ImmutableList.Builder<DependencyLink> unmerged = ImmutableList.builder();
      for (Row row : rs) {
        ByteBuffer encodedDayOfDependencies = row.getBytes("dependencies");
        for (DependencyLink link : Dependencies.fromThrift(encodedDayOfDependencies).links) {
          unmerged.add(link);
        }
      }
      return DependencyLinker.merge(unmerged.build());
    }
  }

  /**
   * Get the available trace information from the storage system. Spans in a trace should be
   * sorted by the first annotation timestamp in that span: the first event should be first in the
   * spans list.
   *
   * <p>The returned list contains only spans that were found, so it may not match the provided
   * set of ids.
   */
  ListenableFuture<List<Span>> getSpansByTraceIds(Set<Long> traceIds, int limit) {
    checkNotNull(traceIds, "traceIds");
    if (traceIds.isEmpty()) {
      return immediateFuture(Collections.<Span>emptyList());
    }

    try {
      BoundStatement bound = CassandraUtil.bindWithName(selectTraces, "select-traces")
          .setSet("trace_id", traceIds)
          .setInt("limit_", limit);

      bound.setFetchSize(Integer.MAX_VALUE);

      return transform(session.executeAsync(bound),
          new Function<ResultSet, List<Span>>() {
            @Override public List<Span> apply(ResultSet input) {
              List<Span> result = new ArrayList<>(input.getAvailableWithoutFetching());
              for (Row row : input) {
                result.add(Codec.THRIFT.readSpan(row.getBytes("span")));
              }
              return result;
            }
          }
      );
    } catch (RuntimeException ex) {
      return immediateFailedFuture(ex);
    }
  }

  ListenableFuture<Map<Long, Long>> getTraceIdsByServiceNames(List<String> serviceNames,
      long endTs, long lookback, int limit) {
    if (serviceNames.isEmpty()) return immediateFuture(Collections.<Long, Long>emptyMap());

    long startTs = Math.max(endTs - lookback, 0); // >= 1970
    try {
      // This guards use of the "in" query, to give people a little more time to move off
      // Cassandra 2.1. Note that it will still fail when serviceNames.size() > 1.
      BoundStatement bound = serviceNames.size() == 1
          ? CassandraUtil.bindWithName(selectTraceIdsByServiceName,
              "select-trace-ids-by-service-name")
              .setString("service_name", serviceNames.get(0))
              .setSet("bucket", buckets)
              .setBytesUnsafe("start_ts", timestampCodec.serialize(startTs))
              .setBytesUnsafe("end_ts", timestampCodec.serialize(endTs))
              .setInt("limit_", limit)
          : CassandraUtil.bindWithName(selectTraceIdsByServiceNames,
              "select-trace-ids-by-service-names")
              .setList("service_name", serviceNames)
              .setSet("bucket", buckets)
              .setBytesUnsafe("start_ts", timestampCodec.serialize(startTs))
              .setBytesUnsafe("end_ts", timestampCodec.serialize(endTs))
              .setInt("limit_", limit);

      bound.setFetchSize(Integer.MAX_VALUE);

      return transform(session.executeAsync(bound), traceIdToTimestamp);
    } catch (RuntimeException ex) {
      return immediateFailedFuture(ex);
    }
  }
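
  // The span-name index is keyed on a composite "service.span" string; e.g. a hypothetical
  // service "frontend" with span "get" is looked up as "frontend.get".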
  ListenableFuture<Map<Long, Long>> getTraceIdsBySpanName(String serviceName, String spanName,
      long endTs, long lookback, int limit) {
    checkArgument(serviceName != null, "serviceName required on spanName query");
    checkArgument(spanName != null, "spanName required on spanName query");
    String serviceSpanName = serviceName + "." + spanName;

    long startTs = Math.max(endTs - lookback, 0); // >= 1970
    try {
      BoundStatement bound =
          CassandraUtil.bindWithName(selectTraceIdsBySpanName, "select-trace-ids-by-span-name")
              .setString("service_span_name", serviceSpanName)
              .setBytesUnsafe("start_ts", timestampCodec.serialize(startTs))
              .setBytesUnsafe("end_ts", timestampCodec.serialize(endTs))
              .setInt("limit_", limit);

      return transform(session.executeAsync(bound), traceIdToTimestamp);
    } catch (RuntimeException ex) {
      return immediateFailedFuture(ex);
    }
  }

  ListenableFuture<Map<Long, Long>> getTraceIdsByAnnotation(String annotationKey,
      long endTs, long lookback, int limit) {
    long startTs = Math.max(endTs - lookback, 0); // >= 1970
    try {
      BoundStatement bound =
          CassandraUtil.bindWithName(selectTraceIdsByAnnotation, "select-trace-ids-by-annotation")
              .setBytes("annotation", CassandraUtil.toByteBuffer(annotationKey))
              .setSet("bucket", buckets)
              .setBytesUnsafe("start_ts", timestampCodec.serialize(startTs))
              .setBytesUnsafe("end_ts", timestampCodec.serialize(endTs))
              .setInt("limit_", limit);

      bound.setFetchSize(Integer.MAX_VALUE);

      return transform(session.executeAsync(bound),
          new Function<ResultSet, Map<Long, Long>>() {
            @Override public Map<Long, Long> apply(ResultSet input) {
              Map<Long, Long> traceIdsToTimestamps = new LinkedHashMap<>();
              for (Row row : input) {
                traceIdsToTimestamps.put(row.getLong("trace_id"),
                    timestampCodec.deserialize(row, "ts"));
              }
              return traceIdsToTimestamps;
            }
          }
      );
    } catch (CharacterCodingException | RuntimeException ex) {
      return immediateFailedFuture(ex);
    }
  }
}