// This file is part of OpenTSDB.
// Copyright (C) 2010-2012  The OpenTSDB Authors.
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 2.1 of the License, or (at your
// option) any later version.  This program is distributed in the hope that it
// will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
// General Public License for more details.  You should have received a copy
// of the GNU Lesser General Public License along with this program.  If not,
// see <http://www.gnu.org/licenses/>.
package net.opentsdb.core;

import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.hbase.async.Bytes;
import org.hbase.async.DeleteRequest;
import org.hbase.async.HBaseException;
import org.hbase.async.KeyValue;
import org.hbase.async.Scanner;
import org.hbase.async.Bytes.ByteMap;

import com.google.common.annotations.VisibleForTesting;
import com.stumbleupon.async.Callback;
import com.stumbleupon.async.Deferred;
import com.stumbleupon.async.DeferredGroupException;

import net.opentsdb.query.QueryUtil;
import net.opentsdb.query.filter.TagVFilter;
import net.opentsdb.stats.Histogram;
import net.opentsdb.stats.QueryStats;
import net.opentsdb.stats.QueryStats.QueryStat;
import net.opentsdb.uid.NoSuchUniqueId;
import net.opentsdb.uid.NoSuchUniqueName;
import net.opentsdb.uid.UniqueId;
import net.opentsdb.utils.DateTime;

/**
 * Non-synchronized implementation of {@link Query}.
 */
final class TsdbQuery implements Query {

  private static final Logger LOG = LoggerFactory.getLogger(TsdbQuery.class);

  /** Used whenever there are no results. */
  private static final DataPoints[] NO_RESULT = new DataPoints[0];

  /**
   * Keep track of the latency we perceive when doing Scans on HBase.
   * We want buckets up to 16s, with a 2 ms interval between each bucket up
   * to 100 ms, after which we switch to exponential buckets.
   */
  static final Histogram scanlatency = new Histogram(16000, (short) 2, 100);

  /**
   * Charset to use with our server-side row-filter.
   * We use this one because it preserves every possible byte unchanged.
   */
  private static final Charset CHARSET = Charset.forName("ISO-8859-1");

  /** The TSDB we belong to. */
  private final TSDB tsdb;

  /** The time, in ns, when we start scanning for data. */
  private long scan_start_time;

  /** Value used for timestamps that are uninitialized. */
  private static final int UNSET = -1;

  /** Start time (UNIX timestamp in seconds) on 32 bits ("unsigned" int). */
  private long start_time = UNSET;

  /** End time (UNIX timestamp in seconds) on 32 bits ("unsigned" int). */
  private long end_time = UNSET;

  /** Whether or not to delete the queried data. */
  private boolean delete;

  /** ID of the metric being looked up. */
  private byte[] metric;

  /** Row key regex to pass to HBase if we have tags or TSUIDs. */
  private String regex;

  /** Whether or not to enable the fuzzy row filter for HBase. */
  private boolean enable_fuzzy_filter;

  /**
   * Tags by which we must group the results.
   * Each element is a tag ID.
   * Invariant: an element cannot be both in this array and in {@code tags}.
   */
  private ArrayList<byte[]> group_bys;

  /**
   * Tag key and values to use in the row key filter, all pre-sorted.
   */
  private ByteMap<byte[][]> row_key_literals;

  /** If true, use rate of change instead of actual values. */
  private boolean rate;

  /** Specifies the various options for rate calculations. */
  private RateOptions rate_options;

  /** Aggregator function to use. */
  private Aggregator aggregator;

  /** Downsampling specification to use, if any (can be {@code null}). */
  private DownsamplingSpecification downsampler;

  /** Optional list of TSUIDs to fetch and aggregate instead of a metric. */
  private List<String> tsuids;

  /** An index that links this query to the original sub query. */
  private int query_index;

  /** Tag value filters to apply post scan. */
  private List<TagVFilter> filters;

  /** An object for storing stats regarding the query. May be null. */
  private QueryStats query_stats;

  /** Whether or not to match series with ONLY the given tags. */
  private boolean explicit_tags;

  /** Constructor. */
  public TsdbQuery(final TSDB tsdb) {
    this.tsdb = tsdb;
    enable_fuzzy_filter = tsdb.getConfig()
        .getBoolean("tsd.query.enable_fuzzy_filter");
  }

  /**
   * Sets the start time for the query.
   * @param timestamp Unix epoch timestamp in seconds or milliseconds
   * @throws IllegalArgumentException if the timestamp is invalid or greater
   * than the end time (if set)
   */
  @Override
  public void setStartTime(final long timestamp) {
    if (timestamp < 0 || ((timestamp & Const.SECOND_MASK) != 0 &&
        timestamp > 9999999999999L)) {
      throw new IllegalArgumentException("Invalid timestamp: " + timestamp);
    } else if (end_time != UNSET && timestamp >= getEndTime()) {
      throw new IllegalArgumentException("new start time (" + timestamp
          + ") is greater than or equal to end time: " + getEndTime());
    }
    start_time = timestamp;
  }

  /**
   * @return the start time for the query
   * @throws IllegalStateException if the start time hasn't been set yet
   */
  @Override
  public long getStartTime() {
    if (start_time == UNSET) {
      throw new IllegalStateException("setStartTime was never called!");
    }
    return start_time;
  }

  /**
   * Sets the end time for the query. If this isn't set, the system time will
   * be used when the query is executed or {@link #getEndTime} is called.
   * @param timestamp Unix epoch timestamp in seconds or milliseconds
   * @throws IllegalArgumentException if the timestamp is invalid or less
   * than the start time (if set)
   */
  @Override
  public void setEndTime(final long timestamp) {
    if (timestamp < 0 || ((timestamp & Const.SECOND_MASK) != 0 &&
        timestamp > 9999999999999L)) {
      throw new IllegalArgumentException("Invalid timestamp: " + timestamp);
    } else if (start_time != UNSET && timestamp <= getStartTime()) {
      throw new IllegalArgumentException("new end time (" + timestamp
          + ") is less than or equal to start time: " + getStartTime());
    }
    end_time = timestamp;
  }
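  // Editorial usage sketch (not part of the original source; "tsdb" is a
  // hypothetical, already-initialized TSDB instance). Timestamps may be
  // given in seconds or milliseconds; values with bits set above
  // Const.SECOND_MASK are treated as millisecond precision.
  //
  //   final TsdbQuery query = new TsdbQuery(tsdb);
  //   query.setStartTime(1356998400L);     // seconds
  //   query.setEndTime(1357002000500L);    // milliseconds also accepted
  //   // Reversing the bounds throws IllegalArgumentException: the start
  //   // must remain strictly before the end once both are set.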
  /**
   * @return the configured end time. If the end time hasn't been set, the
   * current system time will be stored and returned.
   */
  @Override
  public long getEndTime() {
    if (end_time == UNSET) {
      setEndTime(DateTime.currentTimeMillis());
    }
    return end_time;
  }

  @Override
  public void setDelete(boolean delete) {
    this.delete = delete;
  }

  @Override
  public boolean getDelete() {
    return delete;
  }

  @Override
  public void setTimeSeries(final String metric,
      final Map<String, String> tags,
      final Aggregator function,
      final boolean rate) throws NoSuchUniqueName {
    setTimeSeries(metric, tags, function, rate, new RateOptions());
  }

  @Override
  public void setTimeSeries(final String metric,
      final Map<String, String> tags,
      final Aggregator function,
      final boolean rate,
      final RateOptions rate_options) throws NoSuchUniqueName {
    if (filters == null) {
      filters = new ArrayList<TagVFilter>(tags.size());
    }
    TagVFilter.tagsToFilters(tags, filters);

    try {
      for (final TagVFilter filter : this.filters) {
        filter.resolveTagkName(tsdb).join();
      }
    } catch (final InterruptedException e) {
      LOG.warn("Interrupted", e);
      Thread.currentThread().interrupt();
    } catch (final NoSuchUniqueName e) {
      throw e;
    } catch (final Exception e) {
      if (e instanceof DeferredGroupException) {
        // unwrap to the actual cause; the DGE misdirects otherwise
        Throwable ex = e.getCause();
        while (ex != null && ex instanceof DeferredGroupException) {
          ex = ex.getCause();
        }
        if (ex != null) {
          throw (RuntimeException) ex;
        }
      }
      LOG.error("Unexpected exception processing group bys", e);
      throw new RuntimeException(e);
    }

    findGroupBys();
    this.metric = tsdb.metrics.getId(metric);
    aggregator = function;
    this.rate = rate;
    this.rate_options = rate_options;
  }

  @Override
  public void setTimeSeries(final List<String> tsuids,
      final Aggregator function, final boolean rate) {
    setTimeSeries(tsuids, function, rate, new RateOptions());
  }

  @Override
  public void setTimeSeries(final List<String> tsuids,
      final Aggregator function, final boolean rate,
      final RateOptions rate_options) {
    if (tsuids == null || tsuids.isEmpty()) {
      throw new IllegalArgumentException(
          "Empty or missing TSUID list not allowed");
    }

    String first_metric = "";
    for (final String tsuid : tsuids) {
      if (first_metric.isEmpty()) {
        first_metric = tsuid.substring(0, TSDB.metrics_width() * 2)
            .toUpperCase();
        continue;
      }

      final String metric = tsuid.substring(0, TSDB.metrics_width() * 2)
          .toUpperCase();
      if (!first_metric.equals(metric)) {
        throw new IllegalArgumentException(
            "One or more TSUIDs did not share the same metric");
      }
    }

    // the metric will be set when the scanner is configured
    this.tsuids = tsuids;
    aggregator = function;
    this.rate = rate;
    this.rate_options = rate_options;
  }

  /**
   * @param explicit_tags Whether or not to match only on the given tags
   * @since 2.3
   */
  public void setExplicitTags(final boolean explicit_tags) {
    this.explicit_tags = explicit_tags;
  }
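  // Editorial sketch of the two ways to target series (not in the original
  // source; the metric, tag names and "query" variable are hypothetical):
  //
  //   // By metric + tags, summed, without rate conversion:
  //   final Map<String, String> tags = new HashMap<String, String>();
  //   tags.put("host", "web01");
  //   query.setTimeSeries("sys.cpu.user", tags, Aggregators.SUM, false);
  //
  //   // Or by TSUIDs, which must all share the same metric UID prefix:
  //   query.setTimeSeries(Arrays.asList("000001000001000001"),
  //       Aggregators.SUM, false);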
  @Override
  public Deferred<Object> configureFromQuery(final TSQuery query,
      final int index) {
    if (query.getQueries() == null || query.getQueries().isEmpty()) {
      throw new IllegalArgumentException("Missing sub queries");
    }
    if (index < 0 || index >= query.getQueries().size()) {
      throw new IllegalArgumentException("Query index was out of range");
    }

    final TSSubQuery sub_query = query.getQueries().get(index);
    setStartTime(query.startTime());
    setEndTime(query.endTime());
    setDelete(query.getDelete());
    query_index = index;
    query_stats = query.getQueryStats();

    // set common options
    aggregator = sub_query.aggregator();
    rate = sub_query.getRate();
    rate_options = sub_query.getRateOptions();
    if (rate_options == null) {
      rate_options = new RateOptions();
    }
    downsampler = sub_query.downsamplingSpecification();
    filters = sub_query.getFilters();
    explicit_tags = sub_query.getExplicitTags();

    // if we have tsuids set, that takes precedence
    if (sub_query.getTsuids() != null && !sub_query.getTsuids().isEmpty()) {
      tsuids = new ArrayList<String>(sub_query.getTsuids());
      String first_metric = "";
      for (final String tsuid : tsuids) {
        if (first_metric.isEmpty()) {
          first_metric = tsuid.substring(0, TSDB.metrics_width() * 2)
              .toUpperCase();
          continue;
        }

        final String metric = tsuid.substring(0, TSDB.metrics_width() * 2)
            .toUpperCase();
        if (!first_metric.equals(metric)) {
          throw new IllegalArgumentException(
              "One or more TSUIDs did not share the same metric ["
              + first_metric + "] [" + metric + "]");
        }
      }
      return Deferred.fromResult(null);
    } else {
      /** Triggers the group by resolution if we had filters to resolve */
      class FilterCB implements Callback<Object, ArrayList<byte[]>> {
        @Override
        public Object call(final ArrayList<byte[]> results) throws Exception {
          findGroupBys();
          return null;
        }
      }

      /** Resolve and group by tags after resolving the metric */
      class MetricCB implements Callback<Deferred<Object>, byte[]> {
        @Override
        public Deferred<Object> call(final byte[] uid) throws Exception {
          metric = uid;
          if (filters != null) {
            final List<Deferred<byte[]>> deferreds =
                new ArrayList<Deferred<byte[]>>(filters.size());
            for (final TagVFilter filter : filters) {
              deferreds.add(filter.resolveTagkName(tsdb));
            }
            return Deferred.group(deferreds).addCallback(new FilterCB());
          } else {
            return Deferred.fromResult(null);
          }
        }
      }

      // fire off the callback chain by resolving the metric first
      return tsdb.metrics.getIdAsync(sub_query.getMetric())
          .addCallbackDeferring(new MetricCB());
    }
  }

  @Override
  public void downsample(final long interval, final Aggregator downsampler,
      final FillPolicy fill_policy) {
    this.downsampler = new DownsamplingSpecification(
        interval, downsampler, fill_policy);
  }

  /**
   * Sets an optional downsampling function with interpolation on this query.
   * @param interval The interval, in milliseconds, at which to roll up data
   * points
   * @param downsampler An aggregation function to use when rolling up data
   * points
   * @throws NullPointerException if the aggregation function is null
   * @throws IllegalArgumentException if the interval is not greater than 0
   */
  @Override
  public void downsample(final long interval, final Aggregator downsampler) {
    if (downsampler == Aggregators.NONE) {
      throw new IllegalArgumentException("cannot use the NONE "
          + "aggregator for downsampling");
    }
    downsample(interval, downsampler, FillPolicy.NONE);
  }
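  // Editorial usage sketch (not in the original source): roll raw values up
  // into one-minute buckets. The interval is in milliseconds and
  // Aggregators.AVG is the stock averaging function.
  //
  //   query.downsample(60000L, Aggregators.AVG);
  //   // or with an explicit fill policy for empty intervals:
  //   query.downsample(60000L, Aggregators.AVG, FillPolicy.ZERO);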
  /**
   * Populates {@link #group_bys} and {@link #row_key_literals} with values
   * pulled from the filters.
   */
  private void findGroupBys() {
    if (filters == null || filters.isEmpty()) {
      return;
    }

    row_key_literals = new ByteMap<byte[][]>();

    Collections.sort(filters);
    final Iterator<TagVFilter> current_iterator = filters.iterator();
    final Iterator<TagVFilter> look_ahead = filters.iterator();
    byte[] tagk = null;
    TagVFilter next = look_ahead.hasNext() ? look_ahead.next() : null;
    int row_key_literals_count = 0;
    while (current_iterator.hasNext()) {
      next = look_ahead.hasNext() ? look_ahead.next() : null;
      int gbs = 0;
      // sorted!
      final ByteMap<Void> literals = new ByteMap<Void>();
      final List<TagVFilter> literal_filters = new ArrayList<TagVFilter>();
      TagVFilter current = null;
      do { // yeah, I'm breakin out the do!!!
        current = current_iterator.next();
        if (tagk == null) {
          tagk = new byte[TSDB.tagk_width()];
          System.arraycopy(current.getTagkBytes(), 0, tagk, 0,
              TSDB.tagk_width());
        }

        if (current.isGroupBy()) {
          gbs++;
        }
        if (!current.getTagVUids().isEmpty()) {
          for (final byte[] uid : current.getTagVUids()) {
            literals.put(uid, null);
          }
          literal_filters.add(current);
        }

        if (next != null && Bytes.memcmp(tagk, next.getTagkBytes()) != 0) {
          break;
        }
        next = look_ahead.hasNext() ? look_ahead.next() : null;
      } while (current_iterator.hasNext() &&
          Bytes.memcmp(tagk, current.getTagkBytes()) == 0);

      if (gbs > 0) {
        if (group_bys == null) {
          group_bys = new ArrayList<byte[]>();
        }
        group_bys.add(current.getTagkBytes());
      }

      if (literals.size() > 0) {
        if (literals.size() + row_key_literals_count >
            tsdb.getConfig().getInt("tsd.query.filter.expansion_limit")) {
          LOG.debug("Skipping literals for " + current.getTagk()
              + " as it exceeds the limit");
        } else {
          final byte[][] values = new byte[literals.size()][];
          literals.keySet().toArray(values);
          row_key_literals.put(current.getTagkBytes(), values);
          row_key_literals_count += values.length;

          for (final TagVFilter filter : literal_filters) {
            filter.setPostScan(false);
          }
        }
      } else {
        row_key_literals.put(current.getTagkBytes(), null);
      }
    }
  }

  /**
   * Executes the query.
   * NOTE: Do not run the same query multiple times. Construct a new query
   * with the same parameters again if needed.
   * TODO(cl) There are some strange occurrences when unit testing where the
   * end time, if not set, can change between calls to run().
   * @return An array of data points with one time series per array value
   */
  @Override
  public DataPoints[] run() throws HBaseException {
    try {
      return runAsync().joinUninterruptibly();
    } catch (RuntimeException e) {
      throw e;
    } catch (Exception e) {
      throw new RuntimeException("Should never be here", e);
    }
  }

  @Override
  public Deferred<DataPoints[]> runAsync() throws HBaseException {
    return findSpans().addCallback(new GroupByAndAggregateCB());
  }
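  // Editorial sketch (not in the original source): run() just joins the
  // Deferred from runAsync(), so an asynchronous caller would attach a
  // callback instead of blocking:
  //
  //   query.runAsync().addCallback(
  //       new Callback<Object, DataPoints[]>() {
  //         public Object call(final DataPoints[] results) {
  //           // one DataPoints entry per series (or per group when
  //           // grouping is in effect)
  //           return null;
  //         }
  //       });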
  /**
   * Finds all the {@link Span}s that match this query.
   * This is what actually scans the HBase table and loads the data into
   * {@link Span}s.
   * @return A map from HBase row key to the {@link Span} for that row key.
   * Since a {@link Span} actually contains multiple HBase rows, the row key
   * stored in the map has its timestamp zero'ed out.
   * @throws HBaseException if there was a problem communicating with HBase
   * to perform the search.
   * @throws IllegalArgumentException if bad data was retrieved from HBase.
   */
  private Deferred<TreeMap<byte[], Span>> findSpans() throws HBaseException {
    final short metric_width = tsdb.metrics.width();
    final TreeMap<byte[], Span> spans = // The key is a row key from HBase.
        new TreeMap<byte[], Span>(new SpanCmp(
            (short) (Const.SALT_WIDTH() + metric_width)));

    // Copy only the filters that should trigger a tag resolution. If this
    // list is empty due to literals or a wildcard star, then we'll save a
    // TON of UID lookups.
    final List<TagVFilter> scanner_filters;
    if (filters != null) {
      scanner_filters = new ArrayList<TagVFilter>(filters.size());
      for (final TagVFilter filter : filters) {
        if (filter.postScan()) {
          scanner_filters.add(filter);
        }
      }
    } else {
      scanner_filters = null;
    }

    if (Const.SALT_WIDTH() > 0) {
      final List<Scanner> scanners =
          new ArrayList<Scanner>(Const.SALT_BUCKETS());
      for (int i = 0; i < Const.SALT_BUCKETS(); i++) {
        scanners.add(getScanner(i));
      }
      scan_start_time = DateTime.nanoTime();
      return new SaltScanner(tsdb, metric, scanners, spans, scanner_filters,
          delete, query_stats, query_index).scan();
    }

    scan_start_time = DateTime.nanoTime();
    final Scanner scanner = getScanner();
    if (query_stats != null) {
      query_stats.addScannerId(query_index, 0, scanner.toString());
    }
    final Deferred<TreeMap<byte[], Span>> results =
        new Deferred<TreeMap<byte[], Span>>();

    /**
     * Scanner callback executed recursively each time we get a set of data
     * from storage. This is responsible for determining what columns are
     * returned and issuing requests to load leaf objects.
     * When the scanner returns a null set of rows, the method initiates the
     * final callback.
     */
    final class ScannerCB implements
        Callback<Object, ArrayList<ArrayList<KeyValue>>> {

      int nrows = 0;
      boolean seenAnnotation = false;
      long scanner_start = DateTime.nanoTime();
      long timeout = tsdb.getConfig().getLong("tsd.query.timeout");
      private final Set<String> skips = new HashSet<String>();
      private final Set<String> keepers = new HashSet<String>();
      private final int index = 0; // only used for salted scanners
      /** nanosecond timestamps */
      private long fetch_start = 0; // reset each time we send an RPC to HBase
      private long fetch_time = 0; // accumulated time waiting on HBase
      private long uid_resolve_time = 0; // accumulated time resolving UIDs
      private long uids_resolved = 0;
      private long compaction_time = 0; // accumulated time compacting
      private long dps_pre_filter = 0;
      private long rows_pre_filter = 0;
      private long dps_post_filter = 0;
      private long rows_post_filter = 0;

      /** Error callback that will capture an exception from AsyncHBase and
       * store it so we can bubble it up to the caller. */
      class ErrorCB implements Callback<Object, Exception> {
        @Override
        public Object call(final Exception e) throws Exception {
          LOG.error("Scanner " + scanner + " threw an exception", e);
          close(e);
          return null;
        }
      }
      /**
       * Starts the scanner and is called recursively to fetch the next set
       * of rows from the scanner.
       * @return The map of spans if loaded successfully, null if no data
       * was found
       */
      public Object scan() {
        fetch_start = DateTime.nanoTime();
        return scanner.nextRows().addCallback(this)
            .addErrback(new ErrorCB());
      }

      /**
       * Loops through each row of the scanner results and parses out data
       * points and optional meta data.
       * @return null if no rows were found, otherwise the TreeMap with spans
       */
      @Override
      public Object call(final ArrayList<ArrayList<KeyValue>> rows)
          throws Exception {
        fetch_time += DateTime.nanoTime() - fetch_start;
        try {
          if (rows == null) {
            scanlatency.add((int) DateTime.msFromNano(fetch_time));
            LOG.info(TsdbQuery.this + " matched " + nrows + " rows in "
                + spans.size() + " spans in "
                + DateTime.msFromNano(fetch_time) + "ms");
            close(null);
            return null;
          }

          if (timeout > 0 && DateTime.msFromNanoDiff(
              DateTime.nanoTime(), scanner_start) > timeout) {
            throw new InterruptedException("Query timeout exceeded!");
          }

          rows_pre_filter += rows.size();

          // used for UID resolution if a filter is involved
          final List<Deferred<Object>> lookups =
              filters != null && !filters.isEmpty()
                  ? new ArrayList<Deferred<Object>>(rows.size())
                  : null;

          for (final ArrayList<KeyValue> row : rows) {
            final byte[] key = row.get(0).key();
            if (Bytes.memcmp(metric, key, 0, metric_width) != 0) {
              scanner.close();
              throw new IllegalDataException(
                  "HBase returned a row that doesn't match"
                  + " our scanner (" + scanner + ")! " + row
                  + " does not start with " + Arrays.toString(metric));
            }

            // calculate estimated data point count. We don't want to
            // deserialize the byte arrays so we'll just get a rough
            // estimate of compacted columns.
            for (final KeyValue kv : row) {
              if (kv.qualifier().length % 2 == 0) {
                if (kv.qualifier().length == 2
                    || kv.qualifier().length == 4) {
                  ++dps_pre_filter;
                } else {
                  // for now we'll assume that all compacted columns are of
                  // the same precision. This is likely incorrect.
                  if (Internal.inMilliseconds(kv.qualifier())) {
                    dps_pre_filter += (kv.qualifier().length / 4);
                  } else {
                    dps_pre_filter += (kv.qualifier().length / 2);
                  }
                }
              } else if (kv.qualifier()[0]
                  == AppendDataPoints.APPEND_COLUMN_PREFIX) {
                // with appends we don't have a good rough estimate as the
                // length can vary widely with the value length variability.
                // Therefore we have to iterate.
                int idx = 0;
                int qlength = 0;
                while (idx < kv.value().length) {
                  qlength = Internal.getQualifierLength(kv.value(), idx);
                  idx += qlength
                      + Internal.getValueLengthFromQualifier(kv.value(), idx);
                  ++dps_pre_filter;
                }
              }
            }
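
            // Editorial worked example of the estimate above (not in the
            // original source): a 2-byte qualifier is one second-precision
            // point and a 4-byte qualifier one millisecond-precision point.
            // Compacted columns concatenate qualifiers, so a 12-byte
            // millisecond-precision qualifier counts as 12 / 4 = 3 points,
            // while 12 bytes at second precision counts as 12 / 2 = 6.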

            // If any filters have made it this far then we need to resolve
            // the row key UIDs to their names for string comparison. We'll
            // try to avoid the resolution with some sets but we may dupe
            // resolve a few times.
            // TODO - more efficient resolution
            // TODO - byte set instead of a string for the uid may be faster
            if (scanner_filters != null && !scanner_filters.isEmpty()) {
              lookups.clear();
              final String tsuid = UniqueId.uidToString(
                  UniqueId.getTSUIDFromKey(key, TSDB.metrics_width(),
                      Const.TIMESTAMP_BYTES));
              if (skips.contains(tsuid)) {
                continue;
              }
              if (!keepers.contains(tsuid)) {
                final long uid_start = DateTime.nanoTime();

                /** CB called after all of the UIDs have been resolved */
                class MatchCB implements
                    Callback<Object, ArrayList<Boolean>> {
                  @Override
                  public Object call(final ArrayList<Boolean> matches)
                      throws Exception {
                    for (final boolean matched : matches) {
                      if (!matched) {
                        skips.add(tsuid);
                        return null;
                      }
                    }
                    // matched all, good data
                    keepers.add(tsuid);
                    processRow(key, row);
                    return null;
                  }
                }

                /** Resolves all of the row key UIDs to their strings for
                 * filtering */
                class GetTagsCB implements
                    Callback<Deferred<ArrayList<Boolean>>,
                        Map<String, String>> {
                  @Override
                  public Deferred<ArrayList<Boolean>> call(
                      final Map<String, String> tags) throws Exception {
                    uid_resolve_time += (DateTime.nanoTime() - uid_start);
                    uids_resolved += tags.size();
                    final List<Deferred<Boolean>> matches =
                        new ArrayList<Deferred<Boolean>>(
                            scanner_filters.size());

                    for (final TagVFilter filter : scanner_filters) {
                      matches.add(filter.match(tags));
                    }

                    return Deferred.group(matches);
                  }
                }

                lookups.add(Tags.getTagsAsync(tsdb, key)
                    .addCallbackDeferring(new GetTagsCB())
                    .addBoth(new MatchCB()));
              } else {
                processRow(key, row);
              }
            } else {
              processRow(key, row);
            }
          }

          // either we need to wait on the UID resolutions or we can go
          // ahead if we don't have filters.
          if (lookups != null && lookups.size() > 0) {
            class GroupCB implements Callback<Object, ArrayList<Object>> {
              @Override
              public Object call(final ArrayList<Object> group)
                  throws Exception {
                return scan();
              }
            }
            return Deferred.group(lookups).addCallback(new GroupCB());
          } else {
            return scan();
          }
        } catch (Exception e) {
          close(e);
          return null;
        }
      }

      /**
       * Finds or creates the span for this row, compacts it and stores it.
       * @param key The row key to use for fetching the span
       * @param row The row to add
       */
      void processRow(final byte[] key, final ArrayList<KeyValue> row) {
        ++rows_post_filter;
        if (delete) {
          final DeleteRequest del = new DeleteRequest(tsdb.dataTable(), key);
          tsdb.getClient().delete(del);
        }

        // calculate estimated data point count. We don't want to
        // deserialize the byte arrays so we'll just get a rough estimate
        // of compacted columns.
        for (final KeyValue kv : row) {
          if (kv.qualifier().length % 2 == 0) {
            if (kv.qualifier().length == 2 || kv.qualifier().length == 4) {
              ++dps_post_filter;
            } else {
              // for now we'll assume that all compacted columns are of the
              // same precision. This is likely incorrect.
              if (Internal.inMilliseconds(kv.qualifier())) {
                dps_post_filter += (kv.qualifier().length / 4);
              } else {
                dps_post_filter += (kv.qualifier().length / 2);
              }
            }
          } else if (kv.qualifier()[0]
              == AppendDataPoints.APPEND_COLUMN_PREFIX) {
            // with appends we don't have a good rough estimate as the
            // length can vary widely with the value length variability.
            // Therefore we have to iterate.
            int idx = 0;
            int qlength = 0;
            while (idx < kv.value().length) {
              qlength = Internal.getQualifierLength(kv.value(), idx);
              idx += qlength
                  + Internal.getValueLengthFromQualifier(kv.value(), idx);
              ++dps_post_filter;
            }
          }
        }

        Span datapoints = spans.get(key);
        if (datapoints == null) {
          datapoints = new Span(tsdb);
          spans.put(key, datapoints);
        }
        final long compaction_start = DateTime.nanoTime();
        final KeyValue compacted =
            tsdb.compact(row, datapoints.getAnnotations());
        compaction_time += (DateTime.nanoTime() - compaction_start);
        seenAnnotation |= !datapoints.getAnnotations().isEmpty();
        if (compacted != null) { // Can be null if we ignored all KVs.
          datapoints.addRow(compacted);
          ++nrows;
        }
      }

      void close(final Exception e) {
        scanner.close();
        if (query_stats != null) {
          query_stats.addScannerStat(query_index, index,
              QueryStat.SCANNER_TIME, DateTime.nanoTime() - scan_start_time);

          // Scanner Stats
          /* Uncomment when AsyncHBase has this feature:
          query_stats.addScannerStat(query_index, index,
              QueryStat.ROWS_FROM_STORAGE, scanner.getRowsFetched());
          query_stats.addScannerStat(query_index, index,
              QueryStat.COLUMNS_FROM_STORAGE, scanner.getColumnsFetched());
          query_stats.addScannerStat(query_index, index,
              QueryStat.BYTES_FROM_STORAGE, scanner.getBytesFetched()); */
          query_stats.addScannerStat(query_index, index,
              QueryStat.HBASE_TIME, fetch_time);
          query_stats.addScannerStat(query_index, index,
              QueryStat.SUCCESSFUL_SCAN, e == null ? 1 : 0);

          // Post Scan stats
          query_stats.addScannerStat(query_index, index,
              QueryStat.ROWS_PRE_FILTER, rows_pre_filter);
          query_stats.addScannerStat(query_index, index,
              QueryStat.DPS_PRE_FILTER, dps_pre_filter);
          query_stats.addScannerStat(query_index, index,
              QueryStat.ROWS_POST_FILTER, rows_post_filter);
          query_stats.addScannerStat(query_index, index,
              QueryStat.DPS_POST_FILTER, dps_post_filter);
          query_stats.addScannerStat(query_index, index,
              QueryStat.SCANNER_UID_TO_STRING_TIME, uid_resolve_time);
          query_stats.addScannerStat(query_index, index,
              QueryStat.UID_PAIRS_RESOLVED, uids_resolved);
          query_stats.addScannerStat(query_index, index,
              QueryStat.COMPACTION_TIME, compaction_time);
        }

        if (e != null) {
          results.callback(e);
        } else if (nrows < 1 && !seenAnnotation) {
          results.callback(null);
        } else {
          results.callback(spans);
        }
      }
    }

    new ScannerCB().scan();
    return results;
  }
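  // Editorial sketch of the row key layout the scan logic relies on
  // (standard OpenTSDB schema; the widths shown are the defaults):
  //
  //   [ salt (0+ bytes) | metric UID (3) | base timestamp (4) |
  //     tagk UID (3) | tagv UID (3) | ... more tag pairs ... ]
  //
  // SpanCmp (defined near the end of this class) compares keys while
  // skipping the timestamp bytes, so consecutive hourly rows of the same
  // series collapse into a single Span in the TreeMap above.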

  /**
   * Callback that should be attached to the output of
   * {@link TsdbQuery#findSpans} to group and sort the results.
   */
  private class GroupByAndAggregateCB implements
      Callback<DataPoints[], TreeMap<byte[], Span>> {

    /**
     * Creates the {@link SpanGroup}s to form the final results of this
     * query.
     * @param spans The {@link Span}s found for this query
     * ({@link #findSpans}). Can be {@code null}, in which case the array
     * returned will be empty.
     * @return A possibly empty array of {@link SpanGroup}s built according
     * to any 'GROUP BY' formulated in this query.
     */
    @Override
    public DataPoints[] call(final TreeMap<byte[], Span> spans)
        throws Exception {
      if (query_stats != null) {
        query_stats.addStat(query_index, QueryStat.QUERY_SCAN_TIME,
            (System.nanoTime() - TsdbQuery.this.scan_start_time));
      }

      if (spans == null || spans.size() <= 0) {
        if (query_stats != null) {
          query_stats.addStat(query_index, QueryStat.GROUP_BY_TIME, 0);
        }
        return NO_RESULT;
      }

      // The raw aggregator skips group bys and ignores downsampling
      if (aggregator == Aggregators.NONE) {
        final SpanGroup[] groups = new SpanGroup[spans.size()];
        int i = 0;
        for (final Span span : spans.values()) {
          final SpanGroup group = new SpanGroup(
              tsdb, getScanStartTimeSeconds(), getScanEndTimeSeconds(),
              null, rate, rate_options, aggregator, downsampler,
              getStartTime(), getEndTime(), query_index);
          group.add(span);
          groups[i++] = group;
        }
        return groups;
      }

      if (group_bys == null) {
        // We haven't been asked to find groups, so let's put all the spans
        // together in the same group.
        final SpanGroup group = new SpanGroup(tsdb,
            getScanStartTimeSeconds(), getScanEndTimeSeconds(),
            spans.values(), rate, rate_options, aggregator, downsampler,
            getStartTime(), getEndTime(), query_index);
        if (query_stats != null) {
          query_stats.addStat(query_index, QueryStat.GROUP_BY_TIME, 0);
        }
        return new SpanGroup[] { group };
      }

      // Maps group value IDs to the SpanGroup for those values. Say we've
      // been asked to group by two things: foo=* bar=*. Then the keys in
      // this map will contain all the value ID combinations we've seen.
      // If the name IDs for `foo' and `bar' are respectively [0, 0, 7] and
      // [0, 0, 2] then we'll have group_bys=[[0, 0, 2], [0, 0, 7]] (notice
      // it's sorted by ID, so bar is first) and say we find foo=LOL bar=OMG
      // as well as foo=LOL bar=WTF and that the IDs of the tag values are:
      //   LOL=[0, 0, 1]  OMG=[0, 0, 4]  WTF=[0, 0, 3]
      // then the map will have two keys:
      //   - one for the LOL-OMG combination: [0, 0, 1, 0, 0, 4] and,
      //   - one for the LOL-WTF combination: [0, 0, 1, 0, 0, 3].
      final ByteMap<SpanGroup> groups = new ByteMap<SpanGroup>();
      final short value_width = tsdb.tag_values.width();
      final byte[] group = new byte[group_bys.size() * value_width];
      for (final Map.Entry<byte[], Span> entry : spans.entrySet()) {
        final byte[] row = entry.getKey();
        byte[] value_id = null;
        int i = 0;
        // TODO(tsuna): The following loop has a quadratic behavior. We can
        // make it much better since both the row key and group_bys are
        // sorted.
        for (final byte[] tag_id : group_bys) {
          value_id = Tags.getValueId(tsdb, row, tag_id);
          if (value_id == null) {
            break;
          }
          System.arraycopy(value_id, 0, group, i, value_width);
          i += value_width;
        }
        if (value_id == null) {
          LOG.error("WTF?  Dropping span for row " + Arrays.toString(row)
              + " as it had no matching tag from the requested groups,"
              + " which is unexpected. Query=" + this);
          continue;
        }
        //LOG.info("Span belongs to group " + Arrays.toString(group) + ": "
        //    + Arrays.toString(row));
        SpanGroup thegroup = groups.get(group);
        if (thegroup == null) {
          thegroup = new SpanGroup(tsdb, getScanStartTimeSeconds(),
              getScanEndTimeSeconds(), null, rate, rate_options,
              aggregator, downsampler, getStartTime(), getEndTime(),
              query_index);
          // Copy the array because we're going to keep `group' and
          // overwrite its contents. So we want the collection to have an
          // immutable copy.
          final byte[] group_copy = new byte[group.length];
          System.arraycopy(group, 0, group_copy, 0, group.length);
          groups.put(group_copy, thegroup);
        }
        thegroup.add(entry.getValue());
      }
      //for (final Map.Entry<byte[], SpanGroup> entry : groups) {
      //  LOG.info("group for " + Arrays.toString(entry.getKey()) + ": "
      //      + entry.getValue());
      //}
      if (query_stats != null) {
        query_stats.addStat(query_index, QueryStat.GROUP_BY_TIME, 0);
      }
      return groups.values().toArray(new SpanGroup[groups.size()]);
    }
  }

  /**
   * Returns a scanner set for the given metric (from {@link #metric}) or
   * from the first TSUID in the {@link #tsuids} list. If one or more tags
   * are provided, it calls into {@link #createAndSetFilter} to setup a row
   * key filter. If one or more TSUIDs have been provided, it calls into
   * {@link #createAndSetTSUIDFilter} to setup a row key filter.
   * @return A scanner to use for fetching data points
   */
  protected Scanner getScanner() throws HBaseException {
    return getScanner(0);
  }

  /**
   * Returns a scanner set for the given metric (from {@link #metric}) or
   * from the first TSUID in the {@link #tsuids} list. If one or more tags
   * are provided, it calls into {@link #createAndSetFilter} to setup a row
   * key filter. If one or more TSUIDs have been provided, it calls into
   * {@link #createAndSetTSUIDFilter} to setup a row key filter.
   * @param salt_bucket The salt bucket to scan over when salting is enabled.
   * @return A scanner to use for fetching data points
   */
  protected Scanner getScanner(final int salt_bucket) throws HBaseException {
    final short metric_width = tsdb.metrics.width();

    // set the metric UID based on the TSUIDs if given, or the metric UID
    if (tsuids != null && !tsuids.isEmpty()) {
      final String tsuid = tsuids.get(0);
      final String metric_uid = tsuid.substring(0, metric_width * 2);
      metric = UniqueId.stringToUid(metric_uid);
    }

    // We search at least one row before and one row after the start & end
    // time we've been given as it's quite likely that the exact timestamp
    // we're looking for is in the middle of a row. Plus, a number of things
    // rely on having a few extra data points before & after the exact start
    // & end dates in order to do proper rate calculation or downsampling
    // near the "edges" of the graph.
    final Scanner scanner = QueryUtil.getMetricScanner(tsdb, salt_bucket,
        metric, (int) getScanStartTimeSeconds(),
        end_time == UNSET
            ? -1 // Will scan until the end (0xFFF...).
            : (int) getScanEndTimeSeconds(),
        tsdb.table, TSDB.FAMILY());
    if (tsuids != null && !tsuids.isEmpty()) {
      createAndSetTSUIDFilter(scanner);
    } else if (filters.size() > 0) {
      createAndSetFilter(scanner);
    }
    return scanner;
  }

  /** Returns the UNIX timestamp from which we must start scanning. */
  private long getScanStartTimeSeconds() {
    // Begin with the raw query start time.
    long start = getStartTime();

    // Convert to seconds if we have a query in ms.
    if ((start & Const.SECOND_MASK) != 0L) {
      start /= 1000L;
    }

    // First, we align the start timestamp to its representative value for
    // the interval in which it appears, if downsampling.
    long interval_aligned_ts = start;
    if (downsampler != null && downsampler.getInterval() > 0) {
      // Downsampling enabled.
      // TODO - calendar interval
      final long interval_offset =
          (1000L * start) % downsampler.getInterval();
      interval_aligned_ts -= interval_offset / 1000L;
    }

    // Then snap that timestamp back to its representative value for the
    // timespan in which it appears.
    final long timespan_offset = interval_aligned_ts % Const.MAX_TIMESPAN;
    final long timespan_aligned_ts = interval_aligned_ts - timespan_offset;

    // Don't return negative numbers.
    return timespan_aligned_ts > 0L ? timespan_aligned_ts : 0L;
  }

  /** Returns the UNIX timestamp at which we must stop scanning. */
  private long getScanEndTimeSeconds() {
    // Begin with the raw query end time.
    long end = getEndTime();

    // Convert to seconds if we have a query in ms.
    if ((end & Const.SECOND_MASK) != 0L) {
      end /= 1000L;
      if (end < 1) {
        // handle an edge case where a user may request a ms time between
        // 0 and 1 seconds. Just bump it a second.
        end++;
      }
    }

    // The calculation depends on whether we're downsampling.
    if (downsampler != null && downsampler.getInterval() > 0) {
      // Downsampling enabled.
      //
      // First, we align the end timestamp to its representative value for
      // the interval FOLLOWING the one in which it appears.
      //
      // OpenTSDB's query bounds are inclusive, but HBase scan bounds are
      // half-open. The user may have provided an end bound that is already
      // interval-aligned (i.e., its interval offset is zero). If so, the
      // user wishes for that interval to appear in the output. In that
      // case, we skip forward an entire extra interval.
      //
      // This can be accomplished by simply not testing for zero offset.
      final long interval_offset = (1000L * end) % downsampler.getInterval();
      final long interval_aligned_ts = end +
          (downsampler.getInterval() - interval_offset) / 1000L;

      // Then, if we're now aligned on a timespan boundary, we need no
      // further adjustment: we are guaranteed to have always moved the end
      // time forward, so the scan will find the data we need.
      //
      // Otherwise, we need to align to the NEXT timespan to ensure that we
      // scan the needed data.
      final long timespan_offset = interval_aligned_ts % Const.MAX_TIMESPAN;
      return (0L == timespan_offset)
          ? interval_aligned_ts
          : interval_aligned_ts + (Const.MAX_TIMESPAN - timespan_offset);
    } else {
      // Not downsampling.
      //
      // Regardless of the end timestamp's position within the current
      // timespan, we must always align to the beginning of the next
      // timespan. This is true even if it's already aligned on a timespan
      // boundary. Again, the reason for this is OpenTSDB's closed interval
      // vs. HBase's half-open.
      final long timespan_offset = end % Const.MAX_TIMESPAN;
      return end + (Const.MAX_TIMESPAN - timespan_offset);
    }
  }
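  // Editorial worked example of the alignment above (not in the original
  // source; assumes the standard Const.MAX_TIMESPAN of 3600 seconds per
  // row), without downsampling:
  //
  //   start = 1356998430 -> offset 1356998430 % 3600 = 30, so the scan
  //   begins at 1356998400, the top of the hour containing the start.
  //
  //   end = 1357002000 (exactly on the hour) -> still bumped forward to
  //   1357005600, because OpenTSDB's bounds are inclusive while the
  //   scanner's stop row is exclusive.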
  /**
   * Sets the server-side regexp filter on the scanner.
   * In order to find the rows with the relevant tags, we use a
   * server-side filter that matches a regular expression on the row key.
   * @param scanner The scanner on which to add the filter.
   */
  private void createAndSetFilter(final Scanner scanner) {
    QueryUtil.setDataTableScanFilter(scanner, group_bys, row_key_literals,
        explicit_tags, enable_fuzzy_filter,
        (end_time == UNSET
            ? -1 // Will scan until the end (0xFFF...).
            : (int) getScanEndTimeSeconds()));
  }

  /**
   * Sets the server-side regexp filter on the scanner.
   * This will compile a list of the tagk/v pairs for the TSUIDs to prevent
   * storage from returning irrelevant rows.
   * @param scanner The scanner on which to add the filter.
   * @since 2.0
   */
  private void createAndSetTSUIDFilter(final Scanner scanner) {
    if (regex == null) {
      regex = QueryUtil.getRowKeyTSUIDRegex(tsuids);
    }
    scanner.setKeyRegexp(regex, CHARSET);
  }

  @Override
  public String toString() {
    final StringBuilder buf = new StringBuilder();
    buf.append("TsdbQuery(start_time=")
       .append(getStartTime())
       .append(", end_time=")
       .append(getEndTime());
    if (tsuids != null && !tsuids.isEmpty()) {
      buf.append(", tsuids=");
      for (final String tsuid : tsuids) {
        buf.append(tsuid).append(",");
      }
    } else {
      buf.append(", metric=").append(Arrays.toString(metric));
      buf.append(", filters=[");
      for (final Iterator<TagVFilter> it = filters.iterator();
          it.hasNext(); ) {
        buf.append(it.next());
        if (it.hasNext()) {
          buf.append(',');
        }
      }
      buf.append("], rate=").append(rate)
         .append(", aggregator=").append(aggregator)
         .append(", group_bys=(");
      if (group_bys != null) {
        for (final byte[] tag_id : group_bys) {
          try {
            buf.append(tsdb.tag_names.getName(tag_id));
          } catch (NoSuchUniqueId e) {
            buf.append('<').append(e.getMessage()).append('>');
          }
          buf.append(' ')
             .append(Arrays.toString(tag_id));
          if (row_key_literals != null) {
            final byte[][] value_ids = row_key_literals.get(tag_id);
            if (value_ids == null) {
              continue;
            }
            buf.append("={");
            for (final byte[] value_id : value_ids) {
              try {
                if (value_id != null) {
                  buf.append(tsdb.tag_values.getName(value_id));
                } else {
                  buf.append("null");
                }
              } catch (NoSuchUniqueId e) {
                buf.append('<').append(e.getMessage()).append('>');
              }
              buf.append(' ')
                 .append(Arrays.toString(value_id))
                 .append(", ");
            }
            buf.append('}');
          }
          buf.append(", ");
        }
      }
    }
    buf.append("))");
    return buf.toString();
  }

  /**
   * Comparator that ignores timestamps in row keys.
   */
  private static final class SpanCmp implements Comparator<byte[]> {

    private final short metric_width;

    public SpanCmp(final short metric_width) {
      this.metric_width = metric_width;
    }

    @Override
    public int compare(final byte[] a, final byte[] b) {
      final int length = Math.min(a.length, b.length);
      if (a == b) {  // Do this after accessing a.length and b.length
        return 0;    // in order to NPE if either a or b is null.
      }
      int i;
      // First compare the metric ID.
      for (i = 0; i < metric_width; i++) {
        if (a[i] != b[i]) {
          return (a[i] & 0xFF) - (b[i] & 0xFF);  // "promote" to unsigned.
        }
      }
      // Then skip the timestamp and compare the rest.
      for (i += Const.TIMESTAMP_BYTES; i < length; i++) {
        if (a[i] != b[i]) {
          return (a[i] & 0xFF) - (b[i] & 0xFF);  // "promote" to unsigned.
        }
      }
      return a.length - b.length;
    }

  }

  /** Helps unit tests inspect private methods. */
  @VisibleForTesting
  static class ForTesting {

    /** @return the start time of the HBase scan for unit tests. */
    static long getScanStartTimeSeconds(final TsdbQuery query) {
      return query.getScanStartTimeSeconds();
    }

    /** @return the end time of the HBase scan for unit tests. */
    static long getScanEndTimeSeconds(final TsdbQuery query) {
      return query.getScanEndTimeSeconds();
    }

    /** @return the downsampling interval for unit tests. */
    static long getDownsampleIntervalMs(final TsdbQuery query) {
      return query.downsampler.getInterval();
    }

    static byte[] getMetric(final TsdbQuery query) {
      return query.metric;
    }

    static RateOptions getRateOptions(final TsdbQuery query) {
      return query.rate_options;
    }

    static List<TagVFilter> getFilters(final TsdbQuery query) {
      return query.filters;
    }

    static ArrayList<byte[]> getGroupBys(final TsdbQuery query) {
      return query.group_bys;
    }

    static ByteMap<byte[][]> getRowKeyLiterals(final TsdbQuery query) {
      return query.row_key_literals;
    }

  }

}