// This file is part of OpenTSDB.
// Copyright (C) 2010-2012 The OpenTSDB Authors.
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 2.1 of the License, or (at your
// option) any later version. This program is distributed in the hope that it
// will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
// General Public License for more details. You should have received a copy
// of the GNU Lesser General Public License along with this program. If not,
// see <http://www.gnu.org/licenses/>.
package net.opentsdb.core;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.hbase.async.Bytes;
import org.hbase.async.HBaseException;
import org.hbase.async.KeyValue;
import org.hbase.async.Scanner;
import static org.hbase.async.Bytes.ByteMap;
import net.opentsdb.stats.Histogram;
import net.opentsdb.uid.NoSuchUniqueId;
import net.opentsdb.uid.NoSuchUniqueName;
/**
* Non-synchronized implementation of {@link Query}.
*/
public final class TsdbQuery implements Query {
private static final Logger LOG = LoggerFactory.getLogger(TsdbQuery.class);
/** Used whenever there are no results. */
private static final DataPoints[] NO_RESULT = new DataPoints[0];
/**
* Keep track of the latency we perceive when doing Scans on HBase.
* We want buckets up to 16s, with 2 ms interval between each bucket up to
* 100 ms after we which we switch to exponential buckets.
*/
static final Histogram scanlatency = new Histogram(16000, (short) 2, 100);
/**
* Charset to use with our server-side row-filter.
* We use this one because it preserves every possible byte unchanged.
*/
private static final Charset CHARSET = Charset.forName("ISO-8859-1");
/** The TSDB we belong to. */
private final TSDB tsdb;
/** Value used for timestamps that are uninitialized. */
private static final int UNSET = -1;
/** Start time (UNIX timestamp in seconds) on 32 bits ("unsigned" int). */
private int start_time = UNSET;
/** End time (UNIX timestamp in seconds) on 32 bits ("unsigned" int). */
private int end_time = UNSET;
/** ID of the metric being looked up. */
private byte[] metric;
/**
* Tags of the metrics being looked up.
* Each tag is a byte array holding the ID of both the name and value
* of the tag.
* Invariant: an element cannot be both in this array and in group_bys.
*/
private ArrayList<byte[]> tags;
/**
* Tags by which we must group the results.
* Each element is a tag ID.
* Invariant: an element cannot be both in this array and in {@code tags}.
*/
private ArrayList<byte[]> group_bys;
/**
* Values we may be grouping on.
* For certain elements in {@code group_bys}, we may have a specific list of
* values IDs we're looking for. Those IDs are stored in this map. The key
* is an element of {@code group_bys} (so a tag name ID) and the values are
* tag value IDs (at least two).
*/
private ByteMap<byte[][]> group_by_values;
/** If true, use rate of change instead of actual values. */
private boolean rate;
/** Aggregator function to use. */
private Aggregator aggregator;
/**
* Downsampling function to use, if any (can be {@code null}).
* If this is non-null, {@code sample_interval} must be strictly positive.
*/
private Aggregator downsampler;
/** Minimum time interval (in seconds) wanted between each data point. */
private int sample_interval;
/** Constructor. */
public TsdbQuery(final TSDB tsdb) {
this.tsdb = tsdb;
}
public void setStartTime(final long timestamp) {
if ((timestamp & 0xFFFFFFFF00000000L) != 0) {
throw new IllegalArgumentException("Invalid timestamp: " + timestamp);
} else if (end_time != UNSET && timestamp >= getEndTime()) {
throw new IllegalArgumentException("new start time (" + timestamp
+ ") is greater than or equal to end time: " + getEndTime());
}
// Keep the 32 bits.
start_time = (int) timestamp;
}
public long getStartTime() {
if (start_time == UNSET) {
throw new IllegalStateException("setStartTime was never called!");
}
return start_time & 0x00000000FFFFFFFFL;
}
public void setEndTime(final long timestamp) {
if ((timestamp & 0xFFFFFFFF00000000L) != 0) {
throw new IllegalArgumentException("Invalid timestamp: " + timestamp);
} else if (start_time != UNSET && timestamp <= getStartTime()) {
throw new IllegalArgumentException("new end time (" + timestamp
+ ") is less than or equal to start time: " + getStartTime());
}
// Keep the 32 bits.
end_time = (int) timestamp;
}
public long getEndTime() {
if (end_time == UNSET) {
setEndTime(System.currentTimeMillis() / 1000);
}
return end_time;
}
public void setTimeSeries(final String metric,
final Map<String, String> tags,
final Aggregator function,
final boolean rate) throws NoSuchUniqueName {
findGroupBys(tags);
this.metric = tsdb.metrics.getId(metric);
this.tags = Tags.resolveAll(tsdb, tags);
aggregator = function;
this.rate = rate;
}
public void downsample(final int interval, final Aggregator downsampler) {
if (downsampler == null) {
throw new NullPointerException("downsampler");
} else if (interval <= 0) {
throw new IllegalArgumentException("interval not > 0: " + interval);
}
this.downsampler = downsampler;
this.sample_interval = interval;
}
/**
* Extracts all the tags we must use to group results.
* <ul>
* <li>If a tag has the form {@code name=*} then we'll create one
* group per value we find for that tag.</li>
* <li>If a tag has the form {@code name={v1,v2,..,vN}} then we'll
* create {@code N} groups.</li>
* </ul>
* In the both cases above, {@code name} will be stored in the
* {@code group_bys} attribute. In the second case specifically,
* the {@code N} values would be stored in {@code group_by_values},
* the key in this map being {@code name}.
* @param tags The tags from which to extract the 'GROUP BY's.
* Each tag that represents a 'GROUP BY' will be removed from the map
* passed in argument.
*/
private void findGroupBys(final Map<String, String> tags) {
final Iterator<Map.Entry<String, String>> i = tags.entrySet().iterator();
while (i.hasNext()) {
final Map.Entry<String, String> tag = i.next();
final String tagvalue = tag.getValue();
if (tagvalue.equals("*") // 'GROUP BY' with any value.
|| tagvalue.indexOf('|', 1) >= 0) { // Multiple possible values.
if (group_bys == null) {
group_bys = new ArrayList<byte[]>();
}
group_bys.add(tsdb.tag_names.getId(tag.getKey()));
i.remove();
if (tagvalue.charAt(0) == '*') {
continue; // For a 'GROUP BY' with any value, we're done.
}
// 'GROUP BY' with specific values. Need to split the values
// to group on and store their IDs in group_by_values.
final String[] values = Tags.splitString(tagvalue, '|');
if (group_by_values == null) {
group_by_values = new ByteMap<byte[][]>();
}
final short value_width = tsdb.tag_values.width();
final byte[][] value_ids = new byte[values.length][value_width];
group_by_values.put(tsdb.tag_names.getId(tag.getKey()),
value_ids);
for (int j = 0; j < values.length; j++) {
final byte[] value_id = tsdb.tag_values.getId(values[j]);
System.arraycopy(value_id, 0, value_ids[j], 0, value_width);
}
}
}
}
public DataPoints[] run() throws HBaseException {
return groupByAndAggregate(findSpans());
}
/**
* Finds all the {@link Span}s that match this query.
* This is what actually scans the HBase table and loads the data into
* {@link Span}s.
* @return A map from HBase row key to the {@link Span} for that row key.
* Since a {@link Span} actually contains multiple HBase rows, the row key
* stored in the map has its timestamp zero'ed out.
* @throws HBaseException if there was a problem communicating with HBase to
* perform the search.
* @throws IllegalArgumentException if bad data was retreived from HBase.
*/
private TreeMap<byte[], Span> findSpans() throws HBaseException {
final short metric_width = tsdb.metrics.width();
final TreeMap<byte[], Span> spans = // The key is a row key from HBase.
new TreeMap<byte[], Span>(new SpanCmp(metric_width));
int nrows = 0;
int hbase_time = 0; // milliseconds.
long starttime = System.nanoTime();
final Scanner scanner = getScanner();
try {
ArrayList<ArrayList<KeyValue>> rows;
while ((rows = scanner.nextRows().joinUninterruptibly()) != null) {
hbase_time += (System.nanoTime() - starttime) / 1000000;
for (final ArrayList<KeyValue> row : rows) {
final byte[] key = row.get(0).key();
if (Bytes.memcmp(metric, key, 0, metric_width) != 0) {
throw new IllegalDataException("HBase returned a row that doesn't match"
+ " our scanner (" + scanner + ")! " + row + " does not start"
+ " with " + Arrays.toString(metric));
}
Span datapoints = spans.get(key);
if (datapoints == null) {
datapoints = new Span(tsdb);
spans.put(key, datapoints);
}
datapoints.addRow(tsdb.compact(row));
nrows++;
starttime = System.nanoTime();
}
}
} catch (RuntimeException e) {
throw e;
} catch (Exception e) {
throw new RuntimeException("Should never be here", e);
} finally {
hbase_time += (System.nanoTime() - starttime) / 1000000;
scanlatency.add(hbase_time);
}
LOG.info(this + " matched " + nrows + " rows in " + spans.size() + " spans");
if (nrows == 0) {
return null;
}
return spans;
}
/**
* Creates the {@link SpanGroup}s to form the final results of this query.
* @param spans The {@link Span}s found for this query ({@link #findSpans}).
* Can be {@code null}, in which case the array returned will be empty.
* @return A possibly empty array of {@link SpanGroup}s built according to
* any 'GROUP BY' formulated in this query.
*/
private DataPoints[] groupByAndAggregate(final TreeMap<byte[], Span> spans) {
if (spans == null || spans.size() <= 0) {
return NO_RESULT;
}
if (group_bys == null) {
// We haven't been asked to find groups, so let's put all the spans
// together in the same group.
final SpanGroup group = new SpanGroup(tsdb,
getScanStartTime(),
getScanEndTime(),
spans.values(),
rate,
aggregator,
sample_interval, downsampler);
return new SpanGroup[] { group };
}
// Maps group value IDs to the SpanGroup for those values. Say we've
// been asked to group by two things: foo=* bar=* Then the keys in this
// map will contain all the value IDs combinations we've seen. If the
// name IDs for `foo' and `bar' are respectively [0, 0, 7] and [0, 0, 2]
// then we'll have group_bys=[[0, 0, 2], [0, 0, 7]] (notice it's sorted
// by ID, so bar is first) and say we find foo=LOL bar=OMG as well as
// foo=LOL bar=WTF and that the IDs of the tag values are:
// LOL=[0, 0, 1] OMG=[0, 0, 4] WTF=[0, 0, 3]
// then the map will have two keys:
// - one for the LOL-OMG combination: [0, 0, 1, 0, 0, 4] and,
// - one for the LOL-WTF combination: [0, 0, 1, 0, 0, 3].
final ByteMap<SpanGroup> groups = new ByteMap<SpanGroup>();
final short value_width = tsdb.tag_values.width();
final byte[] group = new byte[group_bys.size() * value_width];
for (final Map.Entry<byte[], Span> entry : spans.entrySet()) {
final byte[] row = entry.getKey();
byte[] value_id = null;
int i = 0;
// TODO(tsuna): The following loop has a quadratic behavior. We can
// make it much better since both the row key and group_bys are sorted.
for (final byte[] tag_id : group_bys) {
value_id = Tags.getValueId(tsdb, row, tag_id);
if (value_id == null) {
break;
}
System.arraycopy(value_id, 0, group, i, value_width);
i += value_width;
}
if (value_id == null) {
LOG.error("WTF? Dropping span for row " + Arrays.toString(row)
+ " as it had no matching tag from the requested groups,"
+ " which is unexpected. Query=" + this);
continue;
}
//LOG.info("Span belongs to group " + Arrays.toString(group) + ": " + Arrays.toString(row));
SpanGroup thegroup = groups.get(group);
if (thegroup == null) {
thegroup = new SpanGroup(tsdb, getScanStartTime(), getScanEndTime(),
null, rate, aggregator,
sample_interval, downsampler);
// Copy the array because we're going to keep `group' and overwrite
// its contents. So we want the collection to have an immutable copy.
final byte[] group_copy = new byte[group.length];
System.arraycopy(group, 0, group_copy, 0, group.length);
groups.put(group_copy, thegroup);
}
thegroup.add(entry.getValue());
}
//for (final Map.Entry<byte[], SpanGroup> entry : groups) {
// LOG.info("group for " + Arrays.toString(entry.getKey()) + ": " + entry.getValue());
//}
return groups.values().toArray(new SpanGroup[groups.size()]);
}
/**
* Creates the {@link Scanner} to use for this query.
*/
public Scanner getScanner() throws HBaseException {
final short metric_width = tsdb.metrics.width();
final byte[] start_row = new byte[metric_width + Const.TIMESTAMP_BYTES];
final byte[] end_row = new byte[metric_width + Const.TIMESTAMP_BYTES];
// We search at least one row before and one row after the start & end
// time we've been given as it's quite likely that the exact timestamp
// we're looking for is in the middle of a row. Plus, a number of things
// rely on having a few extra data points before & after the exact start
// & end dates in order to do proper rate calculation or downsampling near
// the "edges" of the graph.
Bytes.setInt(start_row, (int) getScanStartTime(), metric_width);
Bytes.setInt(end_row, (end_time == UNSET
? -1 // Will scan until the end (0xFFF...).
: (int) getScanEndTime()),
metric_width);
System.arraycopy(metric, 0, start_row, 0, metric_width);
System.arraycopy(metric, 0, end_row, 0, metric_width);
final Scanner scanner = tsdb.client.newScanner(tsdb.table);
scanner.setStartKey(start_row);
scanner.setStopKey(end_row);
if (tags.size() > 0 || group_bys != null) {
createAndSetFilter(scanner);
}
scanner.setFamily(TSDB.FAMILY);
return scanner;
}
/** Returns the UNIX timestamp from which we must start scanning. */
private long getScanStartTime() {
// The reason we look before by `MAX_TIMESPAN * 2' seconds is because of
// the following. Let's assume MAX_TIMESPAN = 600 (10 minutes) and the
// start_time = ... 12:31:00. If we initialize the scanner to look
// only 10 minutes before, we'll start scanning at time=12:21, which will
// give us the row that starts at 12:30 (remember: rows are always aligned
// on MAX_TIMESPAN boundaries -- so in this example, on 10m boundaries).
// But we need to start scanning at least 1 row before, so we actually
// look back by twice MAX_TIMESPAN. Only when start_time is aligned on a
// MAX_TIMESPAN boundary then we'll mistakenly scan back by an extra row,
// but this doesn't really matter.
// Additionally, in case our sample_interval is large, we need to look
// even further before/after, so use that too.
final long ts = getStartTime() - Const.MAX_TIMESPAN * 2 - sample_interval;
return ts > 0 ? ts : 0;
}
/** Returns the UNIX timestamp at which we must stop scanning. */
private long getScanEndTime() {
// For the end_time, we have a different problem. For instance if our
// end_time = ... 12:30:00, we'll stop scanning when we get to 12:40, but
// once again we wanna try to look ahead one more row, so to avoid this
// problem we always add 1 second to the end_time. Only when the end_time
// is of the form HH:59:59 then we will scan ahead an extra row, but once
// again that doesn't really matter.
// Additionally, in case our sample_interval is large, we need to look
// even further before/after, so use that too.
return getEndTime() + Const.MAX_TIMESPAN + 1 + sample_interval;
}
/**
* Sets the server-side regexp filter on the scanner.
* In order to find the rows with the relevant tags, we use a
* server-side filter that matches a regular expression on the row key.
* @param scanner The scanner on which to add the filter.
*/
void createAndSetFilter(final Scanner scanner) {
if (group_bys != null) {
Collections.sort(group_bys, Bytes.MEMCMP);
}
final short name_width = tsdb.tag_names.width();
final short value_width = tsdb.tag_values.width();
final short tagsize = (short) (name_width + value_width);
// Generate a regexp for our tags. Say we have 2 tags: { 0 0 1 0 0 2 }
// and { 4 5 6 9 8 7 }, the regexp will be:
// "^.{7}(?:.{6})*\\Q\000\000\001\000\000\002\\E(?:.{6})*\\Q\004\005\006\011\010\007\\E(?:.{6})*$"
final StringBuilder buf = new StringBuilder(
15 // "^.{N}" + "(?:.{M})*" + "$"
+ ((13 + tagsize) // "(?:.{M})*\\Q" + tagsize bytes + "\\E"
* (tags.size() + (group_bys == null ? 0 : group_bys.size() * 3))));
// In order to avoid re-allocations, reserve a bit more w/ groups ^^^
// Alright, let's build this regexp. From the beginning...
buf.append("(?s)" // Ensure we use the DOTALL flag.
+ "^.{")
// ... start by skipping the metric ID and timestamp.
.append(tsdb.metrics.width() + Const.TIMESTAMP_BYTES)
.append("}");
final Iterator<byte[]> tags = this.tags.iterator();
final Iterator<byte[]> group_bys = (this.group_bys == null
? new ArrayList<byte[]>(0).iterator()
: this.group_bys.iterator());
byte[] tag = tags.hasNext() ? tags.next() : null;
byte[] group_by = group_bys.hasNext() ? group_bys.next() : null;
// Tags and group_bys are already sorted. We need to put them in the
// regexp in order by ID, which means we just merge two sorted lists.
do {
// Skip any number of tags.
buf.append("(?:.{").append(tagsize).append("})*\\Q");
if (isTagNext(name_width, tag, group_by)) {
addId(buf, tag);
tag = tags.hasNext() ? tags.next() : null;
} else { // Add a group_by.
addId(buf, group_by);
final byte[][] value_ids = (group_by_values == null
? null
: group_by_values.get(group_by));
if (value_ids == null) { // We don't want any specific ID...
buf.append(".{").append(value_width).append('}'); // Any value ID.
} else { // We want specific IDs. List them: /(AAA|BBB|CCC|..)/
buf.append("(?:");
for (final byte[] value_id : value_ids) {
buf.append("\\Q");
addId(buf, value_id);
buf.append('|');
}
// Replace the pipe of the last iteration.
buf.setCharAt(buf.length() - 1, ')');
}
group_by = group_bys.hasNext() ? group_bys.next() : null;
}
} while (tag != group_by); // Stop when they both become null.
// Skip any number of tags before the end.
buf.append("(?:.{").append(tagsize).append("})*$");
scanner.setKeyRegexp(buf.toString(), CHARSET);
}
/**
* Helper comparison function to compare tag name IDs.
* @param name_width Number of bytes used by a tag name ID.
* @param tag A tag (array containing a tag name ID and a tag value ID).
* @param group_by A tag name ID.
* @return {@code true} number if {@code tag} should be used next (because
* it contains a smaller ID), {@code false} otherwise.
*/
private boolean isTagNext(final short name_width,
final byte[] tag,
final byte[] group_by) {
if (tag == null) {
return false;
} else if (group_by == null) {
return true;
}
final int cmp = Bytes.memcmp(tag, group_by, 0, name_width);
if (cmp == 0) {
throw new AssertionError("invariant violation: tag ID "
+ Arrays.toString(group_by) + " is both in 'tags' and"
+ " 'group_bys' in " + this);
}
return cmp < 0;
}
/**
* Appends the given ID to the given buffer, followed by "\\E".
*/
private static void addId(final StringBuilder buf, final byte[] id) {
boolean backslash = false;
for (final byte b : id) {
buf.append((char) (b & 0xFF));
if (b == 'E' && backslash) { // If we saw a `\' and now we have a `E'.
// So we just terminated the quoted section because we just added \E
// to `buf'. So let's put a litteral \E now and start quoting again.
buf.append("\\\\E\\Q");
} else {
backslash = b == '\\';
}
}
buf.append("\\E");
}
public String toString() {
final StringBuilder buf = new StringBuilder();
buf.append("TsdbQuery(start_time=")
.append(getStartTime())
.append(", end_time=")
.append(getEndTime())
.append(", metric=").append(Arrays.toString(metric));
try {
buf.append(" (").append(tsdb.metrics.getName(metric));
} catch (NoSuchUniqueId e) {
buf.append(" (<").append(e.getMessage()).append('>');
}
try {
buf.append("), tags=").append(Tags.resolveIds(tsdb, tags));
} catch (NoSuchUniqueId e) {
buf.append("), tags=<").append(e.getMessage()).append('>');
}
buf.append(", rate=").append(rate)
.append(", aggregator=").append(aggregator)
.append(", group_bys=(");
if (group_bys != null) {
for (final byte[] tag_id : group_bys) {
try {
buf.append(tsdb.tag_names.getName(tag_id));
} catch (NoSuchUniqueId e) {
buf.append('<').append(e.getMessage()).append('>');
}
buf.append(' ')
.append(Arrays.toString(tag_id));
if (group_by_values != null) {
final byte[][] value_ids = group_by_values.get(tag_id);
if (value_ids == null) {
continue;
}
buf.append("={");
for (final byte[] value_id : value_ids) {
try {
buf.append(tsdb.tag_values.getName(value_id));
} catch (NoSuchUniqueId e) {
buf.append('<').append(e.getMessage()).append('>');
}
buf.append(' ')
.append(Arrays.toString(value_id))
.append(", ");
}
buf.append('}');
}
buf.append(", ");
}
}
buf.append("))");
return buf.toString();
}
/**
* Comparator that ignores timestamps in row keys.
*/
private static final class SpanCmp implements Comparator<byte[]> {
private final short metric_width;
public SpanCmp(final short metric_width) {
this.metric_width = metric_width;
}
public int compare(final byte[] a, final byte[] b) {
final int length = Math.min(a.length, b.length);
if (a == b) { // Do this after accessing a.length and b.length
return 0; // in order to NPE if either a or b is null.
}
int i;
// First compare the metric ID.
for (i = 0; i < metric_width; i++) {
if (a[i] != b[i]) {
return (a[i] & 0xFF) - (b[i] & 0xFF); // "promote" to unsigned.
}
}
// Then skip the timestamp and compare the rest.
for (i += Const.TIMESTAMP_BYTES; i < length; i++) {
if (a[i] != b[i]) {
return (a[i] & 0xFF) - (b[i] & 0xFF); // "promote" to unsigned.
}
}
return a.length - b.length;
}
}
}