// This file is part of OpenTSDB. // Copyright (C) 2015 The OpenTSDB Authors. // // This program is free software: you can redistribute it and/or modify it // under the terms of the GNU Lesser General Public License as published by // the Free Software Foundation, either version 2.1 of the License, or (at your // option) any later version. This program is distributed in the hope that it // will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty // of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser // General Public License for more details. You should have received a copy // of the GNU Lesser General Public License along with this program. If not, // see <http://www.gnu.org/licenses/>. package net.opentsdb.core; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicInteger; import net.opentsdb.meta.Annotation; import net.opentsdb.query.filter.TagVFilter; import net.opentsdb.stats.QueryStats; import net.opentsdb.stats.QueryStats.QueryStat; import net.opentsdb.uid.UniqueId; import net.opentsdb.utils.DateTime; import org.hbase.async.Bytes.ByteMap; import org.hbase.async.DeleteRequest; import org.hbase.async.KeyValue; import org.hbase.async.Scanner; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.collect.Lists; import com.stumbleupon.async.Callback; import com.stumbleupon.async.Deferred; /** * A class that handles coordinating the various scanners created for each * salt bucket when salting is enabled. Each scanner stores it's results in * local maps and once everyone has reported in, then the maps are parsed and * combined into a proper set of spans to return to the {@link TsdbQuery} class. * * Note that if one or more of the scanners throws an exception, then that * exception will be returned to the caller in the deferred. Unfortunately we * don't have a good way to cancel a scan in progress so the first scanner with * an error will store it, then we wait for all of the other scanners to * complete. * * Concurrency is important in this class as the scanners are executing * asynchronously and can modify variables at any time. */ public class SaltScanner { private static final Logger LOG = LoggerFactory.getLogger(SaltScanner.class); /** This is a map that the caller must supply. We'll fill it with data. * WARNING: The salted row comparator should be applied to this map. */ private final TreeMap<byte[], Span> spans; /** The list of pre-configured scanners. One scanner should be created per * salt bucket. */ private final List<Scanner> scanners; /** Stores the compacted columns from each scanner as it completes. After all * scanners are done, we process this into the span map above. */ private final Map<Integer, List<KeyValue>> kv_map = new ConcurrentHashMap<Integer, List<KeyValue>>(); /** Stores annotations from each scanner as it completes */ private final Map<byte[], List<Annotation>> annotation_map = Collections.synchronizedMap( new TreeMap<byte[], List<Annotation>>(new RowKey.SaltCmp())); /** A deferred to call with the spans on completion */ private final Deferred<TreeMap<byte[], Span>> results = new Deferred<TreeMap<byte[], Span>>(); /** The metric this scanner set is dealing with. If a row comes in with a * different metric we toss an exception. This shouldn't happen though. */ private final byte[] metric; /** The TSDB to which we belong */ private final TSDB tsdb; /** A stats object associated with the sub query used for storing stats * about scanner operations. */ private final QueryStats query_stats; /** Index of the sub query in the main query list */ private final int query_index; /** A counter used to determine how many scanners are still running */ private AtomicInteger completed_tasks = new AtomicInteger(); /** When the scanning started. We store the scan latency once all scanners * are done.*/ private long start_time; // milliseconds. /** Whether or not to delete the queried data */ private final boolean delete; /** A list of filters to iterate over when processing rows */ private final List<TagVFilter> filters; /** A holder for storing the first exception thrown by a scanner if something * goes pear shaped. Make sure to synchronize on this object when checking * for null or assigning from a scanner's callback. */ private volatile Exception exception; /** * Default ctor that performs some validation. Call {@link scan} after * construction to actually start fetching data. * @param tsdb The TSDB to which we belong * @param metric The metric we're expecting to fetch * @param scanners A list of HBase scanners, one for each bucket * @param spans The span map to store results in * @param filters A list of filters for processing * @throws IllegalArgumentException if any required data was missing or * we had invalid parameters. */ public SaltScanner(final TSDB tsdb, final byte[] metric, final List<Scanner> scanners, final TreeMap<byte[], Span> spans, final List<TagVFilter> filters) { this(tsdb, metric, scanners, spans, filters, false, null, 0); } /** * Default ctor that performs some validation. Call {@link scan} after * construction to actually start fetching data. * @param tsdb The TSDB to which we belong * @param metric The metric we're expecting to fetch * @param scanners A list of HBase scanners, one for each bucket * @param spans The span map to store results in * @param delete Whether or not to delete the queried data * @param filters A list of filters for processing * @param query_stats A stats object for tracking timing * @param query_index The index of the sub query in the main query list * @throws IllegalArgumentException if any required data was missing or * we had invalid parameters. */ public SaltScanner(final TSDB tsdb, final byte[] metric, final List<Scanner> scanners, final TreeMap<byte[], Span> spans, final List<TagVFilter> filters, final boolean delete, final QueryStats query_stats, final int query_index) { if (Const.SALT_WIDTH() < 1) { throw new IllegalArgumentException( "Salting is disabled. Use the regular scanner"); } if (tsdb == null) { throw new IllegalArgumentException("The TSDB argument was null."); } if (spans == null) { throw new IllegalArgumentException("Span map cannot be null."); } if (!spans.isEmpty()) { throw new IllegalArgumentException("The span map should be empty."); } if (scanners == null || scanners.isEmpty()) { throw new IllegalArgumentException("Missing or empty scanners list. " + "Please provide a list of scanners for each salt."); } if (scanners.size() != Const.SALT_BUCKETS()) { throw new IllegalArgumentException("Not enough or too many scanners " + scanners.size() + " when the salt bucket count is " + Const.SALT_BUCKETS()); } if (metric == null) { throw new IllegalArgumentException("The metric array was null."); } if (metric.length != TSDB.metrics_width()) { throw new IllegalArgumentException("The metric was too short. It must be " + TSDB.metrics_width() + "bytes wide."); } this.scanners = scanners; this.spans = spans; this.metric = metric; this.tsdb = tsdb; this.filters = filters; this.delete = delete; this.query_stats = query_stats; this.query_index = query_index; } /** * Starts all of the scanners asynchronously and returns the data fetched * once all of the scanners have completed. Note that the result may be an * exception if one or more of the scanners encountered an exception. The * first error will be returned, others will be logged. * @return A deferred to wait on for results. */ public Deferred<TreeMap<byte[], Span>> scan() { start_time = System.currentTimeMillis(); int i = 0; for (final Scanner scanner: scanners) { new ScannerCB(scanner, i++).scan(); } return results; } /** * Called once all of the scanners have reported back in to record our * latency and merge the results into the spans map. If there was an exception * stored then we'll return that instead. */ private void mergeAndReturnResults() { final long hbase_time = System.currentTimeMillis(); TsdbQuery.scanlatency.add((int)(hbase_time - start_time)); long rows = 0; if (exception != null) { LOG.error("After all of the scanners finished, at " + "least one threw an exception", exception); results.callback(exception); return; } // Merge sorted spans together final long merge_start = DateTime.nanoTime(); for (final List<KeyValue> kvs : kv_map.values()) { if (kvs == null || kvs.isEmpty()) { LOG.warn("Found a key value list that was null or empty"); continue; } for (final KeyValue kv : kvs) { if (kv == null) { LOG.warn("Found a key value item that was null"); continue; } if (kv.key() == null) { LOG.warn("A key for a kv was null"); continue; } Span datapoints = spans.get(kv.key()); if (datapoints == null) { datapoints = new Span(tsdb); spans.put(kv.key(), datapoints); } if (annotation_map.containsKey(kv.key())) { for (final Annotation note: annotation_map.get(kv.key())) { datapoints.getAnnotations().add(note); } annotation_map.remove(kv.key()); } try { datapoints.addRow(kv); rows++; } catch (RuntimeException e) { LOG.error("Exception adding row to span", e); throw e; } } } kv_map.clear(); for (final byte[] key : annotation_map.keySet()) { Span datapoints = spans.get(key); if (datapoints == null) { datapoints = new Span(tsdb); spans.put(key, datapoints); } for (final Annotation note: annotation_map.get(key)) { datapoints.getAnnotations().add(note); } } if (query_stats != null) { query_stats.addStat(query_index, QueryStat.SCANNER_MERGE_TIME, (DateTime.nanoTime() - merge_start)); } if (LOG.isDebugEnabled()) { LOG.debug("Scanning completed in " + (hbase_time - start_time) + " ms, " + rows + " rows, and stored in " + spans.size() + " spans"); LOG.debug("It took " + (System.currentTimeMillis() - hbase_time) + " ms, " + " to merge and sort the rows into a tree map"); } results.callback(spans); } /** * Scanner callback executed recursively each time we get a set of data * from storage. This is responsible for determining what columns are * returned and issuing requests to load leaf objects. * When the scanner returns a null set of rows, the method initiates the * final callback. */ final class ScannerCB implements Callback<Object, ArrayList<ArrayList<KeyValue>>> { private final Scanner scanner; private final int index; private final List<KeyValue> kvs = new ArrayList<KeyValue>(); private final ByteMap<List<Annotation>> annotations = new ByteMap<List<Annotation>>(); private final Set<String> skips = Collections.newSetFromMap( new ConcurrentHashMap<String, Boolean>()); private final Set<String> keepers = Collections.newSetFromMap( new ConcurrentHashMap<String, Boolean>()); private long scanner_start = -1; /** nanosecond timestamps */ private long fetch_start = 0; // reset each time we send an RPC to HBase private long fetch_time = 0; // cumulation of time waiting on HBase private long uid_resolve_time = 0; // cumulation of time resolving UIDs private long uids_resolved = 0; private long compaction_time = 0; // cumulation of time compacting private long dps_pre_filter = 0; private long rows_pre_filter = 0; private long dps_post_filter = 0; private long rows_post_filter = 0; public ScannerCB(final Scanner scanner, final int index) { this.scanner = scanner; this.index = index; if (query_stats != null) { query_stats.addScannerId(query_index, index, scanner.toString()); } } /** Error callback that will capture an exception from AsyncHBase and store * it so we can bubble it up to the caller. */ class ErrorCb implements Callback<Object, Exception> { @Override public Object call(final Exception e) throws Exception { LOG.error("Scanner " + scanner + " threw an exception", e); close(false); handleException(e); return null; } } /** * Starts the scanner and is called recursively to fetch the next set of * rows from the scanner. * @return The map of spans if loaded successfully, null if no data was * found */ public Object scan() { if (scanner_start < 0) { scanner_start = DateTime.nanoTime(); } fetch_start = DateTime.nanoTime(); return scanner.nextRows().addCallback(this).addErrback(new ErrorCb()); } /** * Iterate through each row of the scanner results, parses out data * points (and optional meta data). * @return null if no rows were found, otherwise the TreeMap with spans */ @Override public Object call(final ArrayList<ArrayList<KeyValue>> rows) throws Exception { try { fetch_time += DateTime.nanoTime() - fetch_start; if (rows == null) { close(true); return null; } else if (exception != null) { close(false); // don't need to handleException here as it's already taken care of // due to the fact that exception was set. if (LOG.isDebugEnabled()) { LOG.debug("Closing scanner as there was an exception: " + scanner); } return null; } // used for UID resolution if a filter is involved final List<Deferred<Object>> lookups = filters != null && !filters.isEmpty() ? new ArrayList<Deferred<Object>>(rows.size()) : null; rows_pre_filter += rows.size(); for (final ArrayList<KeyValue> row : rows) { final byte[] key = row.get(0).key(); if (RowKey.rowKeyContainsMetric(metric, key) != 0) { close(false); handleException(new IllegalDataException( "HBase returned a row that doesn't match" + " our scanner (" + scanner + ")! " + row + " does not start" + " with " + Arrays.toString(metric) + " on scanner " + this)); return null; } // calculate estimated data point count. We don't want to deserialize // the byte arrays so we'll just get a rough estimate of compacted // columns. for (final KeyValue kv : row) { if (kv.qualifier().length % 2 == 0) { if (kv.qualifier().length == 2 || kv.qualifier().length == 4) { ++dps_pre_filter; } else { // for now we'll assume that all compacted columns are of the // same precision. This is likely incorrect. if (Internal.inMilliseconds(kv.qualifier())) { dps_pre_filter += (kv.qualifier().length / 4); } else { dps_pre_filter += (kv.qualifier().length / 2); } } } else if (kv.qualifier()[0] == AppendDataPoints.APPEND_COLUMN_PREFIX) { // with appends we don't have a good rough estimate as the length // can vary widely with the value length variability. Therefore we // have to iterate. int idx = 0; int qlength = 0; while (idx < kv.value().length) { qlength = Internal.getQualifierLength(kv.value(), idx); idx += qlength + Internal.getValueLengthFromQualifier(kv.value(), idx); ++dps_pre_filter; } } } // If any filters have made it this far then we need to resolve // the row key UIDs to their names for string comparison. We'll // try to avoid the resolution with some sets but we may dupe // resolve a few times. // TODO - more efficient resolution // TODO - byte set instead of a string for the uid may be faster if (filters != null && !filters.isEmpty()) { lookups.clear(); final String tsuid = UniqueId.uidToString(UniqueId.getTSUIDFromKey(key, TSDB.metrics_width(), Const.TIMESTAMP_BYTES)); if (skips.contains(tsuid)) { continue; } if (!keepers.contains(tsuid)) { final long uid_start = DateTime.nanoTime(); /** CB to called after all of the UIDs have been resolved */ class MatchCB implements Callback<Object, ArrayList<Boolean>> { @Override public Object call(final ArrayList<Boolean> matches) throws Exception { for (final boolean matched : matches) { if (!matched) { skips.add(tsuid); return null; } } // matched all, good data keepers.add(tsuid); processRow(key, row); return null; } } /** Resolves all of the row key UIDs to their strings for filtering */ class GetTagsCB implements Callback<Deferred<ArrayList<Boolean>>, Map<String, String>> { @Override public Deferred<ArrayList<Boolean>> call( final Map<String, String> tags) throws Exception { uid_resolve_time += (DateTime.nanoTime() - uid_start); uids_resolved += tags.size(); final List<Deferred<Boolean>> matches = new ArrayList<Deferred<Boolean>>(filters.size()); for (final TagVFilter filter : filters) { matches.add(filter.match(tags)); } return Deferred.group(matches); } } lookups.add(Tags.getTagsAsync(tsdb, key) .addCallbackDeferring(new GetTagsCB()) .addBoth(new MatchCB())); } else { processRow(key, row); } } else { processRow(key, row); } } // either we need to wait on the UID resolutions or we can go ahead // if we don't have filters. if (lookups != null && lookups.size() > 0) { class GroupCB implements Callback<Object, ArrayList<Object>> { @Override public Object call(final ArrayList<Object> group) throws Exception { return scan(); } } return Deferred.group(lookups).addCallback(new GroupCB()); } else { return scan(); } } catch (final RuntimeException e) { LOG.error("Unexpected exception on scanner " + this, e); close(false); handleException(e); return null; } } /** * Finds or creates the span for this row, compacts it and stores it. Also * fires off a delete request for the row if told to. * @param key The row key to use for fetching the span * @param row The row to add */ void processRow(final byte[] key, final ArrayList<KeyValue> row) { ++rows_post_filter; if (delete) { final DeleteRequest del = new DeleteRequest(tsdb.dataTable(), key); tsdb.getClient().delete(del); } // calculate estimated data point count. We don't want to deserialize // the byte arrays so we'll just get a rough estimate of compacted // columns. for (final KeyValue kv : row) { if (kv.qualifier().length % 2 == 0) { if (kv.qualifier().length == 2 || kv.qualifier().length == 4) { ++dps_post_filter; } else { // for now we'll assume that all compacted columns are of the // same precision. This is likely incorrect. if (Internal.inMilliseconds(kv.qualifier())) { dps_post_filter += (kv.qualifier().length / 4); } else { dps_post_filter += (kv.qualifier().length / 2); } } } else if (kv.qualifier()[0] == AppendDataPoints.APPEND_COLUMN_PREFIX) { // with appends we don't have a good rough estimate as the length // can vary widely with the value length variability. Therefore we // have to iterate. int idx = 0; int qlength = 0; while (idx < kv.value().length) { qlength = Internal.getQualifierLength(kv.value(), idx); idx += qlength + Internal.getValueLengthFromQualifier(kv.value(), idx); ++dps_post_filter; } } } final KeyValue compacted; // let IllegalDataExceptions bubble up so the handler above can close // the scanner final long compaction_start = DateTime.nanoTime(); try { final List<Annotation> notes = Lists.newArrayList(); compacted = tsdb.compact(row, notes); if (!notes.isEmpty()) { synchronized (annotations) { List<Annotation> map_notes = annotations.get(key); if (map_notes == null) { annotations.put(key, notes); } else { map_notes.addAll(notes); } } } } catch (IllegalDataException idex) { compaction_time += (DateTime.nanoTime() - compaction_start); close(false); handleException(idex); return; } compaction_time += (DateTime.nanoTime() - compaction_start); if (compacted != null) { // Can be null if we ignored all KVs. kvs.add(compacted); } } /** * Closes the scanner and sets the various stats after filtering * @param ok Whether or not the scanner closed with an exception or * closed due to natural causes (e.g. ran out of data or we wanted to stop * it early) */ void close(final boolean ok) { scanner.close(); if (query_stats != null) { query_stats.addScannerStat(query_index, index, QueryStat.SCANNER_TIME, DateTime.nanoTime() - scanner_start); // Scanner Stats /* Uncomment when AsyncHBase has this feature: query_stats.addScannerStat(query_index, index, QueryStat.ROWS_FROM_STORAGE, scanner.getRowsFetched()); query_stats.addScannerStat(query_index, index, QueryStat.COLUMNS_FROM_STORAGE, scanner.getColumnsFetched()); query_stats.addScannerStat(query_index, index, QueryStat.BYTES_FROM_STORAGE, scanner.getBytesFetched()); */ query_stats.addScannerStat(query_index, index, QueryStat.HBASE_TIME, fetch_time); query_stats.addScannerStat(query_index, index, QueryStat.SUCCESSFUL_SCAN, ok ? 1 : 0); // Post Scan stats query_stats.addScannerStat(query_index, index, QueryStat.ROWS_PRE_FILTER, rows_pre_filter); query_stats.addScannerStat(query_index, index, QueryStat.DPS_PRE_FILTER, dps_pre_filter); query_stats.addScannerStat(query_index, index, QueryStat.ROWS_POST_FILTER, rows_post_filter); query_stats.addScannerStat(query_index, index, QueryStat.DPS_POST_FILTER, dps_post_filter); query_stats.addScannerStat(query_index, index, QueryStat.SCANNER_UID_TO_STRING_TIME, uid_resolve_time); query_stats.addScannerStat(query_index, index, QueryStat.UID_PAIRS_RESOLVED, uids_resolved); query_stats.addScannerStat(query_index, index, QueryStat.COMPACTION_TIME, compaction_time); } if (ok && exception == null) { validateAndTriggerCallback(kvs, annotations); } else { completed_tasks.incrementAndGet(); } } } /** * Called each time a scanner completes with valid or empty data. * @param kvs The compacted columns fetched by the scanner * @param annotations The annotations fetched by the scanners */ private void validateAndTriggerCallback(final List<KeyValue> kvs, final Map<byte[], List<Annotation>> annotations) { final int tasks = completed_tasks.incrementAndGet(); if (kvs.size() > 0) { kv_map.put(tasks, kvs); } for (final byte[] key : annotations.keySet()) { final List<Annotation> notes = annotations.get(key); if (notes.size() > 0) { // Optimistic write, expecting unique row keys annotation_map.put(key, notes); } } if (tasks >= Const.SALT_BUCKETS()) { try { mergeAndReturnResults(); } catch (final Exception ex) { results.callback(ex); } } } /** * If one or more of the scanners throws an exception then we should close it * and pass the exception here so that we can catch and return it to the * caller. If all of the scanners have finished, this will callback to the * caller immediately. * @param e The exception to store. */ private void handleException(final Exception e) { // make sure only one scanner can set the exception completed_tasks.incrementAndGet(); if (exception == null) { synchronized (this) { if (exception == null) { exception = e; // fail once and fast on the first scanner to throw an exception try { mergeAndReturnResults(); } catch (Exception ex) { LOG.error("Failed merging and returning results, " + "calling back with exception", ex); results.callback(ex); } } else { // TODO - it would be nice to close and cancel the other scanners but // for now we have to wait for them to finish and/or throw exceptions. LOG.error("Another scanner threw an exception", e); } } } } }