// This file is part of OpenTSDB.
// Copyright (C) 2011-2012  The OpenTSDB Authors.
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 2.1 of the License, or (at your
// option) any later version.  This program is distributed in the hope that it
// will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
// General Public License for more details.  You should have received a copy
// of the GNU Lesser General Public License along with this program.  If not,
// see <http://www.gnu.org/licenses/>.
package net.opentsdb.core;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import com.stumbleupon.async.Callback;
import com.stumbleupon.async.Deferred;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.hbase.async.Bytes;
import org.hbase.async.HBaseRpc;
import org.hbase.async.KeyValue;
import org.hbase.async.PleaseThrottleException;

import net.opentsdb.stats.StatsCollector;

/**
 * "Queue" of rows to compact.
 * <p>
 * Whenever we write a data point to HBase, the row key we write to is added
 * to this queue, which is effectively a sorted set.  There is a separate
 * thread that periodically goes through the queue and looks for "old rows" to
 * compact.  A row is considered "old" if the timestamp in the row key is
 * older than a certain threshold.
 * <p>
 * The compaction process consists of reading all the cells within a given row
 * and writing them back out as a single big cell.  Once that write succeeds,
 * we delete all the individual little cells.
 * <p>
 * This process is effective because in HBase the row key is repeated for
 * every single cell.  And because there is no way to efficiently append bytes
 * at the end of a cell, we have to do this instead.
 */
final class CompactionQueue extends ConcurrentSkipListMap<byte[], Boolean> {

  private static final Logger LOG =
    LoggerFactory.getLogger(CompactionQueue.class);

  /**
   * How many items are currently in the queue.
   * Because {@link ConcurrentSkipListMap#size} has O(N) complexity.
   */
  private final AtomicInteger size = new AtomicInteger();

  private final AtomicLong trivial_compactions = new AtomicLong();
  private final AtomicLong complex_compactions = new AtomicLong();
  private final AtomicLong written_cells = new AtomicLong();
  private final AtomicLong deleted_cells = new AtomicLong();

  /** The {@code TSDB} instance we belong to. */
  private final TSDB tsdb;

  /** On how many bytes do we encode metrics IDs.  */
  private final short metric_width;

  /**
   * Constructor.
   * @param tsdb The TSDB we belong to.
   */
  public CompactionQueue(final TSDB tsdb) {
    super(new Cmp(tsdb));
    this.tsdb = tsdb;
    metric_width = tsdb.metrics.width();
    if (TSDB.enable_compactions) {
      startCompactionThread();
    }
  }

  @Override
  public int size() {
    return size.get();
  }

  public void add(final byte[] row) {
    if (super.put(row, Boolean.TRUE) == null) {
      size.incrementAndGet();  // We added a new entry, count it.
    }
  }
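  // To make the compaction concrete, here is a sketch with hypothetical
  // values (4 flag bits per Const.FLAG_BITS, 2-byte qualifiers): a row
  // holding two individual data points
  //   qualifier {0x00, 0x07} -> 8-byte long value  (delta=0s, int, 8 bytes)
  //   qualifier {0x00, 0x57} -> 8-byte long value  (delta=5s, int, 8 bytes)
  // compacts into a single cell
  //   qualifier {0x00, 0x07, 0x00, 0x57},
  //   value = both 8-byte values concatenated + 1 trailing meta byte (0).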
  /**
   * Forces a flush of all the old entries in the compaction queue.
   * @return A deferred that will be called back once everything has been
   * flushed (or something failed, in which case the deferred will carry the
   * exception).  In case of success, the kind of object returned is
   * unspecified.
   */
  public Deferred<ArrayList<Object>> flush() {
    final int size = size();
    if (size > 0) {
      LOG.info("Flushing all old outstanding rows out of " + size + " rows");
    }
    final long now = System.currentTimeMillis();
    return flush(now / 1000 - Const.MAX_TIMESPAN - 1, Integer.MAX_VALUE);
  }
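  // A sketch of the cutoff logic above, assuming the usual row key layout
  // (metric ID, then a 4-byte UNIX base_time, then tags) and
  // Const.MAX_TIMESPAN = 3600s: at now = 1,000,000,000,000 ms the cutoff is
  // 1,000,000,000 - 3600 - 1 = 999,996,399, so only rows whose base_time is
  // at most 999,996,399 (i.e. rows that can no longer receive in-order data
  // points) are eligible for compaction.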
  /**
   * Collects the stats and metrics tracked by this instance.
   * @param collector The collector to use.
   */
  void collectStats(final StatsCollector collector) {
    collector.record("compaction.count", trivial_compactions, "type=trivial");
    collector.record("compaction.count", complex_compactions, "type=complex");
    if (!TSDB.enable_compactions) {
      return;
    }
    // The remaining stats only make sense with compactions enabled.
    collector.record("compaction.queue.size", size);
    collector.record("compaction.errors", handle_read_error.errors,
                     "rpc=read");
    collector.record("compaction.errors", handle_write_error.errors,
                     "rpc=put");
    collector.record("compaction.errors", handle_delete_error.errors,
                     "rpc=delete");
    collector.record("compaction.writes", written_cells);
    collector.record("compaction.deletes", deleted_cells);
  }

  /**
   * Flushes all the rows in the compaction queue older than the cutoff time.
   * @param cut_off A UNIX timestamp in seconds (unsigned 32-bit integer).
   * @param maxflushes How many rows to flush off the queue at once.
   * This integer is expected to be strictly positive.
   * @return A deferred that will be called back once everything has been
   * flushed.
   */
  private Deferred<ArrayList<Object>> flush(final long cut_off,
                                            int maxflushes) {
    assert maxflushes > 0 : "maxflushes must be > 0, but I got " + maxflushes;
    // We can't possibly flush more entries than size().
    maxflushes = Math.min(maxflushes, size());
    if (maxflushes == 0) {  // Because size() might be 0.
      return Deferred.fromResult(new ArrayList<Object>(0));
    }
    final ArrayList<Deferred<Object>> ds =
      new ArrayList<Deferred<Object>>(Math.min(maxflushes,
                                               MAX_CONCURRENT_FLUSHES));
    int nflushes = 0;
    for (final byte[] row : this.keySet()) {
      if (maxflushes == 0) {
        break;
      }
      final long base_time = Bytes.getUnsignedInt(row, metric_width);
      if (base_time > cut_off) {
        break;
      } else if (nflushes == MAX_CONCURRENT_FLUSHES) {
        // We kicked off the compaction of too many rows already, let's wait
        // until they're done before kicking off more.
        break;
      }
      // You'd think that it would be faster to grab an iterator on the map
      // and then call remove() on the iterator to "unlink" the element
      // directly from where the iterator is at, but no, the JDK implements
      // it by calling remove(key), so it has to look up the key again anyway.
      if (super.remove(row) == null) {  // We didn't remove anything.
        continue;  // So someone else already took care of this entry.
      }
      nflushes++;
      maxflushes--;
      size.decrementAndGet();
      ds.add(tsdb.get(row).addCallbacks(compactcb, handle_read_error));
    }
    final Deferred<ArrayList<Object>> group = Deferred.group(ds);
    if (nflushes == MAX_CONCURRENT_FLUSHES && maxflushes > 0) {
      // We're not done yet.  Once this group of flushes completes, we need
      // to kick off more.
      tsdb.flush();  // Speed up this batch by telling the client to flush.
      final int maxflushez = maxflushes;  // Make it final for closure.
      final class FlushMoreCB implements Callback<Deferred<ArrayList<Object>>,
                                                  ArrayList<Object>> {
        public Deferred<ArrayList<Object>> call(final ArrayList<Object> arg) {
          return flush(cut_off, maxflushez);
        }
        public String toString() {
          return "Continue flushing with cut_off=" + cut_off
            + ", maxflushes=" + maxflushez;
        }
      }
      group.addCallbackDeferring(new FlushMoreCB());
    }
    return group;
  }

  private final CompactCB compactcb = new CompactCB();

  /**
   * Callback to compact a row once it's been read.
   * <p>
   * This is used once the "get" completes, to actually compact the row and
   * write back the compacted version.
   */
  private final class CompactCB implements Callback<Object,
                                                    ArrayList<KeyValue>> {
    public Object call(final ArrayList<KeyValue> row) {
      return compact(row, null);
    }
    public String toString() {
      return "compact";
    }
  }

  /**
   * Compacts a row into a single {@link KeyValue}.
   * @param row The row containing all the KVs to compact.
   * Must contain at least one element.
   * @return A compacted version of this row.
   */
  KeyValue compact(final ArrayList<KeyValue> row) {
    final KeyValue[] compacted = { null };
    compact(row, compacted);
    return compacted[0];
  }
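  // Usage sketch (hypothetical caller): code that only needs the compacted
  // form of a row it has already read, without writing anything back, can do
  //   final KeyValue kv = compaction_queue.compact(kvs);
  // where the single-element array above serves as an "out" parameter for
  // the private overload below.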
  /**
   * Compacts a row into a single {@link KeyValue}.
   * <p>
   * If the {@code row} is empty, this function does literally nothing.
   * If {@code compacted} is not {@code null}, then the compacted form of this
   * {@code row} will be stored in {@code compacted[0]}.  Obviously, if the
   * {@code row} contains a single cell, then that cell is the compacted form.
   * Otherwise the compaction process takes place.
   * @param row The row containing all the KVs to compact.  Must be non-null.
   * @param compacted If non-null, the first item in the array will be set to
   * a {@link KeyValue} containing the compacted form of this row.
   * If non-null, we will also not write the compacted form back to HBase
   * unless the timestamp in the row key is old enough.
   * @return A {@link Deferred} if the compaction process required a write
   * to HBase, otherwise {@code null}.
   */
  private Deferred<Object> compact(final ArrayList<KeyValue> row,
                                   final KeyValue[] compacted) {
    if (row.size() <= 1) {
      if (row.isEmpty()) {  // Maybe the row got deleted in the mean time?
        LOG.debug("Attempted to compact a row that doesn't exist.");
      } else if (compacted != null) {
        // No need to re-compact rows containing a single value.
        KeyValue kv = row.get(0);
        final byte[] qual = kv.qualifier();
        final byte[] val = kv.value();
        if (floatingPointValueToFix(qual[1], val)) {
          // Fix up old, incorrectly encoded floating point value.
          final byte[] newval = fixFloatingPointValue(qual[1], val);
          final byte[] newqual = new byte[] { qual[0],
            fixQualifierFlags(qual[1], newval.length) };
          kv = new KeyValue(kv.key(), kv.family(), newqual, newval);
        }
        compacted[0] = kv;
      }
      return null;
    }

    // We know we have at least 2 cells.  We need to go through all the cells
    // to determine what kind of compaction we're going to do.  If each cell
    // contains a single individual data point, then we can do a trivial
    // compaction.  Otherwise, we have a partially compacted row, and the
    // logic required to compact it is more complex.
    boolean write = true;  // Do we need to write a compacted cell?
    final KeyValue compact;
    {
      boolean trivial = true;  // Are we doing a trivial compaction?
      int qual_len = 0;  // Pre-compute the size of the qualifier we'll need.
      int val_len = 1;   // Reserve an extra byte for meta-data.
      short last_delta = -1;  // Time delta, extracted from the qualifier.
      KeyValue longest = row.get(0);  // KV with the longest qualifier.
      int longest_idx = 0;            // Index of `longest'.
      final int nkvs = row.size();
      for (int i = 0; i < nkvs; i++) {
        final KeyValue kv = row.get(i);
        final byte[] qual = kv.qualifier();
        // If the qualifier length isn't 2, this row might have already
        // been compacted, potentially partially, so we need to merge the
        // partially compacted set of cells with the rest.
        final int len = qual.length;
        if (len != 2) {
          trivial = false;
          // We only do this here because no qualifier can be < 2 bytes.
          if (len > longest.qualifier().length) {
            longest = kv;
            longest_idx = i;
          }
        } else {
          // In the trivial case, do some sanity checking here.
          // For non-trivial cases, the sanity checking logic is more
          // complicated and is thus pushed down to `complexCompact'.
          final short delta = (short) ((Bytes.getShort(qual) & 0xFFFF)
                                       >>> Const.FLAG_BITS);
          // This data point has a time delta that's less than or equal to
          // the previous one.  This typically means we have 2 data points
          // at the same timestamp but they have different flags.  We're
          // going to abort here because someone needs to fsck the table.
          if (delta <= last_delta) {
            throw new IllegalDataException("Found out of order or duplicate"
              + " data: last_delta=" + last_delta + ", delta=" + delta
              + ", offending KV=" + kv + ", row=" + row
              + " -- run an fsck.");
          }
          last_delta = delta;
          // We don't need it below for complex compactions, so we update it
          // only here in the `else' branch.
          final byte[] v = kv.value();
          val_len += floatingPointValueToFix(qual[1], v) ? 4 : v.length;
        }
        qual_len += len;
      }

      if (trivial) {
        trivial_compactions.incrementAndGet();
        compact = trivialCompact(row, qual_len, val_len);
      } else {
        complex_compactions.incrementAndGet();
        compact = complexCompact(row, qual_len / 2);
        // Now it's vital that we check whether the compact KV has the same
        // qualifier as one of the qualifiers that were already in the row.
        // Otherwise we might do a `put' in this cell, followed by a delete.
        // We don't want to delete what we just wrote.
        // This can happen if this row was already compacted but someone
        // wrote another individual data point at the same timestamp.
        // Optimization: since we kept track of which KV had the longest
        // qualifier, we can opportunistically check here if it happens to
        // have the same qualifier as the one we just created.
        final byte[] qual = compact.qualifier();
        final byte[] longest_qual = longest.qualifier();
        if (qual.length <= longest_qual.length) {
          KeyValue dup = null;
          int dup_idx = -1;
          if (Bytes.equals(longest_qual, qual)) {
            dup = longest;
            dup_idx = longest_idx;
          } else {
            // Worst case: to be safe we have to loop again and check all
            // the qualifiers and make sure we're not going to overwrite
            // anything.
            // TODO(tsuna): Try to write a unit test that triggers this code
            // path.  I'm not even sure it's possible.  Should we replace
            // this code with an `assert false: "should never be here"'?
            for (int i = 0; i < nkvs; i++) {
              final KeyValue kv = row.get(i);
              if (Bytes.equals(kv.qualifier(), qual)) {
                dup = kv;
                dup_idx = i;
                break;
              }
            }
          }
          if (dup != null) {
            // So we did find an existing KV with the same qualifier.
            // Let's check if, by chance, the value is the same too.
            if (Bytes.equals(dup.value(), compact.value())) {
              // Since the values are the same, we don't need to write
              // anything.  There's already a properly compacted version of
              // this row in TSDB.
              write = false;
            }
            // Now let's make sure we don't delete this qualifier.  This
            // re-allocates the entire array, but should be a rare case.
            row.remove(dup_idx);
          }  // else: no dup, we're good.
        }
        // else: most common case: the compact qualifier is longer than
        // the previously longest qualifier, so we cannot possibly
        // overwrite an existing cell we would then delete.
      }
    }
    if (compacted != null) {  // Caller is interested in the compacted form.
      compacted[0] = compact;
      final long base_time = Bytes.getUnsignedInt(compact.key(),
                                                  metric_width);
      final long cut_off = System.currentTimeMillis() / 1000
        - Const.MAX_TIMESPAN - 1;
      if (base_time > cut_off) {  // If row is too recent...
        return null;              // ... don't write back compacted.
      }
    }
    if (!TSDB.enable_compactions) {
      return null;
    }

    final byte[] key = compact.key();
    //LOG.debug("Compacting row " + Arrays.toString(key));
    deleted_cells.addAndGet(row.size());  // We're going to delete this.
    if (write) {
      final byte[] qual = compact.qualifier();
      final byte[] value = compact.value();
      written_cells.incrementAndGet();
      return tsdb.put(key, qual, value)
        .addCallbacks(new DeleteCompactedCB(row), handle_write_error);
    } else {
      // We had nothing to write, because one of the cells is already the
      // correctly compacted version, so we can go ahead and delete the
      // individual cells directly.
      new DeleteCompactedCB(row).call(null);
      return null;
    }
  }

  /**
   * Performs a trivial compaction of a row.
   * <p>
   * This method is to be used only when all the cells in the given row
   * are individual data points (nothing has been compacted yet).  If any of
   * the cells have already been compacted, the caller is expected to call
   * {@link #complexCompact} instead.
   * @param row The row to compact.  Assumed to have 2 elements or more.
   * @param qual_len Exact number of bytes to hold the compacted qualifiers.
   * @param val_len Exact number of bytes to hold the compacted values.
   * @return a {@link KeyValue} containing the result of the merge of all the
   * {@code KeyValue}s given in argument.
   */
  private static KeyValue trivialCompact(final ArrayList<KeyValue> row,
                                         final int qual_len,
                                         final int val_len) {
    // Now let's simply concatenate all the qualifiers and values together.
    final byte[] qualifier = new byte[qual_len];
    final byte[] value = new byte[val_len];
    // Now populate the arrays by copying qualifiers/values over.
    int qual_idx = 0;
    int val_idx = 0;
    for (final KeyValue kv : row) {
      final byte[] q = kv.qualifier();
      // We shouldn't get into this function if this isn't true.
      assert q.length == 2 : "Qualifier length must be 2: " + kv;
      final byte[] v = fixFloatingPointValue(q[1], kv.value());
      qualifier[qual_idx++] = q[0];
      qualifier[qual_idx++] = fixQualifierFlags(q[1], v.length);
      System.arraycopy(v, 0, value, val_idx, v.length);
      val_idx += v.length;
    }

    // Right now we leave the last byte all zeros, this last byte will be
    // used in the future to introduce more formats/encodings.

    final KeyValue first = row.get(0);
    return new KeyValue(first.key(), first.family(), qualifier, value);
  }

  /**
   * Fix the flags inside the last byte of a qualifier.
   * <p>
   * OpenTSDB used to not rely on the size recorded in the flags being
   * correct, and so for a long time it was setting the wrong size for
   * floating point values (pretending they were encoded on 8 bytes when
   * in fact they were on 4).  So overwrite these bits here to make sure
   * they're correct now, because once they're compacted it's going to
   * be quite hard to tell if the flags are right or wrong, and we need
   * them to be correct to easily decode the values.
   * @param flags The least significant byte of a qualifier.
   * @param val_len The number of bytes in the value of this qualifier.
   * @return The least significant byte of the qualifier with correct flags.
   */
  private static byte fixQualifierFlags(byte flags, final int val_len) {
    // Explanation:
    //   (1) Take the last byte of the qualifier.
    //   (2) Zero out all the flag bits but one.
    //       The one we keep is the type (floating point vs integer value).
    //   (3) Set the length properly based on the value we have.
    return (byte) ((flags & ~(Const.FLAGS_MASK >>> 1)) | (val_len - 1));
    //              ^^^^^   ^^^^^^^^^^^^^^^^^^^^^^^^^    ^^^^^^^^^^^^^
    //               (1)               (2)                    (3)
  }
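  // Worked example of the bit twiddling above, assuming the usual 4-bit
  // flags (Const.FLAGS_MASK = 0xF, so FLAGS_MASK >>> 1 = 0x7 masks the
  // length bits): a float whose flags claim 8 bytes (flags = 0xF) but whose
  // value is really 4 bytes wide becomes
  //   (0xF & ~0x7) | (4 - 1)  =  0x8 | 0x3  =  0xB
  // i.e. the float bit is preserved and the length bits now encode 4 bytes.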
  /**
   * Returns whether or not this is a floating value that needs to be fixed.
   * <p>
   * OpenTSDB used to encode all floating point values as `float' (4 bytes)
   * but actually store them on 8 bytes, with 4 leading 0 bytes, and flags
   * correctly stating the value was on 4 bytes.
   * @param flags The least significant byte of a qualifier.
   * @param value The value that may need to be corrected.
   */
  private static boolean floatingPointValueToFix(final byte flags,
                                                 final byte[] value) {
    return (flags & Const.FLAG_FLOAT) != 0   // We need a floating point value.
      && (flags & Const.LENGTH_MASK) == 0x3  // That pretends to be on 4 bytes.
      && value.length == 8;                  // But is actually using 8 bytes.
  }

  /**
   * Returns a corrected value if this is a floating point value to fix.
   * <p>
   * OpenTSDB used to encode all floating point values as `float' (4 bytes)
   * but actually store them on 8 bytes, with 4 leading 0 bytes, and flags
   * correctly stating the value was on 4 bytes.
   * <p>
   * This function detects such values and returns a corrected value, without
   * the 4 leading zeros.  Otherwise it returns the value unchanged.
   * @param flags The least significant byte of a qualifier.
   * @param value The value that may need to be corrected.
   * @throws IllegalDataException if the value is malformed.
   */
  private static byte[] fixFloatingPointValue(final byte flags,
                                              final byte[] value) {
    if (floatingPointValueToFix(flags, value)) {
      // The first 4 bytes should really be zeros.
      if (value[0] == 0 && value[1] == 0 && value[2] == 0 && value[3] == 0) {
        // Just keep the last 4 bytes.
        return new byte[] { value[4], value[5], value[6], value[7] };
      } else {  // Very unlikely.
        throw new IllegalDataException("Corrupted floating point value: "
          + Arrays.toString(value) + " flags=0x" + Integer.toHexString(flags)
          + " -- first 4 bytes are expected to be zeros.");
      }
    }
    return value;
  }

  /**
   * Helper class for complex compaction cases.
   * <p>
   * This is simply a glorified pair of (qualifier, value) that's comparable.
   * Only the qualifier is used to make comparisons.
   * @see #complexCompact
   */
  private static final class Cell implements Comparable<Cell> {
    /** Tombstone used as a helper during the complex compaction.  */
    static final Cell SKIP = new Cell(null, null);

    final byte[] qualifier;
    final byte[] value;

    Cell(final byte[] qualifier, final byte[] value) {
      this.qualifier = qualifier;
      this.value = value;
    }

    public int compareTo(final Cell other) {
      return Bytes.memcmp(qualifier, other.qualifier);
    }

    public boolean equals(final Object o) {
      return o != null && o instanceof Cell && compareTo((Cell) o) == 0;
    }

    public int hashCode() {
      return Arrays.hashCode(qualifier);
    }

    public String toString() {
      return "Cell(" + Arrays.toString(qualifier)
        + ", " + Arrays.toString(value) + ')';
    }
  }
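  // Sketch of a complex case, with hypothetical deltas: a row holding an
  // already-compacted cell plus one late individual data point, e.g.
  //   {0x00, 0x07, 0x00, 0x57} -> v0 . v5 . meta   (deltas 0s and 5s)
  //   {0x00, 0x27}             -> v2               (delta 2s)
  // is broken down into Cells, sorted by qualifier, and merged back into
  //   {0x00, 0x07, 0x00, 0x27, 0x00, 0x57} -> v0 . v2 . v5 . meta byte.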
  /**
   * Compacts a partially compacted row.
   * <p>
   * This method is called in the non-trivial re-compaction cases, where a row
   * already contains one or more partially compacted cells.  This can happen
   * for various reasons, such as TSDs dying in the middle of a compaction or
   * races involved with TSDs trying to compact the same row at the same
   * time, or old data being slowly written to a TSD.
   * @param row The row to compact.  Assumed to have 2 elements or more.
   * @param estimated_nvalues Estimate of the number of values to compact.
   * Used to pre-allocate a collection of the right size, so it's better to
   * overshoot a bit to avoid re-allocations.
   * @return a {@link KeyValue} containing the result of the merge of all the
   * {@code KeyValue}s given in argument.
   * @throws IllegalDataException if one of the cells cannot be read because
   * it's corrupted or in a format we don't understand.
   */
  static KeyValue complexCompact(final ArrayList<KeyValue> row,
                                 final int estimated_nvalues) {
    // We know at least one of the cells contains multiple values, and we need
    // to merge all the cells together in a sorted fashion.  We use a simple
    // strategy: split all the cells into individual objects, sort them,
    // merge the result while ignoring duplicates (same qualifier & value).
    final ArrayList<Cell> cells = breakDownValues(row, estimated_nvalues);
    Collections.sort(cells);

    // Now let's do one pass first to compute the length of the compacted
    // value and to find if we have any bad duplicates (same qualifier,
    // different value).
    int nvalues = 0;
    int val_len = 1;  // Reserve an extra byte for meta-data.
    short last_delta = -1;  // Time delta, extracted from the qualifier.
    int ncells = cells.size();
    for (int i = 0; i < ncells; i++) {
      final Cell cell = cells.get(i);
      final short delta = (short) ((Bytes.getShort(cell.qualifier) & 0xFFFF)
                                   >>> Const.FLAG_BITS);
      // Because we sorted `cells' by qualifier, and because the time delta
      // occupies the most significant bits, this should never trigger.
      assert delta >= last_delta : ("WTF? It's supposed to be sorted: " + cells
                                    + " at " + i + " delta=" + delta
                                    + ", last_delta=" + last_delta);
      // The only troublesome case is where we have two (or more) consecutive
      // cells with the same time delta, but different flags or values.
      if (delta == last_delta) {
        // Find the previous cell.  Because we potentially replace the one
        // right before `i' with a tombstone, we might need to look further
        // back a bit more.
        Cell prev = Cell.SKIP;
        // i > 0 because we can't get here during the first iteration.
        // Also the first Cell cannot be Cell.SKIP, so `j' will never
        // underflow.  And even if it does, we'll get an exception.
        for (int j = i - 1; prev == Cell.SKIP; j--) {
          prev = cells.get(j);
        }
        if (cell.qualifier[1] != prev.qualifier[1]
            || !Bytes.equals(cell.value, prev.value)) {
          throw new IllegalDataException("Found out of order or duplicate"
            + " data: cell=" + cell + ", delta=" + delta + ", prev cell="
            + prev + ", last_delta=" + last_delta + ", in row=" + row
            + " -- run an fsck.");
        }
        // else: we're good, this is a true duplicate (same qualifier & value).
        // Just replace it with a tombstone so we'll skip it.  We don't delete
        // it from the array because that would cause a re-allocation.
        cells.set(i, Cell.SKIP);
        continue;
      }
      last_delta = delta;
      nvalues++;
      val_len += cell.value.length;
    }

    final byte[] qualifier = new byte[nvalues * 2];
    final byte[] value = new byte[val_len];
    // Now populate the arrays by copying qualifiers/values over.
    int qual_idx = 0;
    int val_idx = 0;
    for (final Cell cell : cells) {
      if (cell == Cell.SKIP) {
        continue;
      }
      byte[] b = cell.qualifier;
      System.arraycopy(b, 0, qualifier, qual_idx, b.length);
      qual_idx += b.length;
      b = cell.value;
      System.arraycopy(b, 0, value, val_idx, b.length);
      val_idx += b.length;
    }

    // Right now we leave the last byte all zeros, this last byte will be
    // used in the future to introduce more formats/encodings.

    final KeyValue first = row.get(0);
    final KeyValue kv = new KeyValue(first.key(), first.family(),
                                     qualifier, value);
    return kv;
  }

  /**
   * Breaks down all the values in a row into individual {@link Cell}s.
   * @param row The row to compact.  Assumed to have 2 elements or more.
   * @param estimated_nvalues Estimate of the number of values to compact.
   * Used to pre-allocate a collection of the right size, so it's better to
   * overshoot a bit to avoid re-allocations.
   * @throws IllegalDataException if one of the cells cannot be read because
   * it's corrupted or in a format we don't understand.
   */
  private static ArrayList<Cell> breakDownValues(final ArrayList<KeyValue> row,
                                                 final int estimated_nvalues) {
    final ArrayList<Cell> cells = new ArrayList<Cell>(estimated_nvalues);
    for (final KeyValue kv : row) {
      final byte[] qual = kv.qualifier();
      final int len = qual.length;
      final byte[] val = kv.value();
      if (len == 2) {  // Single-value cell.
        // Maybe we need to fix the flags in the qualifier.
        final byte[] actual_val = fixFloatingPointValue(qual[1], val);
        final byte q = fixQualifierFlags(qual[1], actual_val.length);
        final byte[] actual_qual;
        if (q != qual[1]) {  // We need to fix the qualifier.
          actual_qual = new byte[] { qual[0], q };  // So make a copy.
        } else {
          actual_qual = qual;  // Otherwise use the one we already have.
        }
        final Cell cell = new Cell(actual_qual, actual_val);
        cells.add(cell);
        continue;
      }
      // else: we have a multi-value cell.  We need to break it down into
      // individual Cell objects.
      // First check that the last byte is 0, otherwise it might mean that
      // this compacted cell has been written by a future version of OpenTSDB
      // and we don't know how to decode it, so we shouldn't touch it.
      if (val[val.length - 1] != 0) {
        throw new IllegalDataException("Don't know how to read this value: "
          + Arrays.toString(val) + " found in " + kv
          + " -- this compacted value might have been written by a future"
          + " version of OpenTSDB, or could be corrupt.");
      }
      // Now break it down into Cells.
      int val_idx = 0;
      for (int i = 0; i < len; i += 2) {
        final byte[] q = new byte[] { qual[i], qual[i + 1] };
        final int vlen = (q[1] & Const.LENGTH_MASK) + 1;
        final byte[] v = new byte[vlen];
        System.arraycopy(val, val_idx, v, 0, vlen);
        val_idx += vlen;
        final Cell cell = new Cell(q, v);
        cells.add(cell);
      }
      // Check we consumed all the bytes of the value.  Remember the last byte
      // is metadata, so it's normal that we didn't consume it.
      if (val_idx != val.length - 1) {
        throw new IllegalDataException("Corrupted value: couldn't break down"
          + " into individual values (consumed " + val_idx + " bytes, but was"
          + " expecting to consume " + (val.length - 1) + "): " + kv
          + ", cells so far: " + cells);
      }
    }
    return cells;
  }
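  // Decoding sketch for the loop above (hypothetical cell): a compacted
  // qualifier {0x00, 0x0B, 0x00, 0x57} with a 13-byte value splits into
  //   Cell({0x00, 0x0B}, 4-byte float)  since (0xB & LENGTH_MASK) + 1 = 4
  //   Cell({0x00, 0x57}, 8-byte long)   since (0x7 & LENGTH_MASK) + 1 = 8
  // consuming 12 bytes and leaving the 13th (the meta byte) untouched.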
  /**
   * Callback to delete a row that's been successfully compacted.
   */
  private final class DeleteCompactedCB implements Callback<Object, Object> {

    /** What we're going to delete.  */
    private final byte[] key;
    private final byte[] family;
    private final byte[][] qualifiers;

    public DeleteCompactedCB(final ArrayList<KeyValue> cells) {
      final KeyValue first = cells.get(0);
      key = first.key();
      family = first.family();
      qualifiers = new byte[cells.size()][];
      for (int i = 0; i < qualifiers.length; i++) {
        qualifiers[i] = cells.get(i).qualifier();
      }
    }

    public Object call(final Object arg) {
      return tsdb.delete(key, qualifiers).addErrback(handle_delete_error);
    }

    public String toString() {
      return "delete compacted cells";
    }
  }

  private final HandleErrorCB handle_read_error = new HandleErrorCB("read");
  private final HandleErrorCB handle_write_error = new HandleErrorCB("write");
  private final HandleErrorCB handle_delete_error =
    new HandleErrorCB("delete");

  /**
   * Callback to handle exceptions during the compaction process.
   */
  private final class HandleErrorCB implements Callback<Object, Exception> {

    private volatile int errors;

    private final String what;

    /**
     * Constructor.
     * @param what String describing what kind of operation (e.g. "read").
     */
    public HandleErrorCB(final String what) {
      this.what = what;
    }

    public Object call(final Exception e) {
      if (e instanceof PleaseThrottleException) {  // HBase isn't keeping up.
        final HBaseRpc rpc = ((PleaseThrottleException) e).getFailedRpc();
        if (rpc instanceof HBaseRpc.HasKey) {
          // We failed to compact this row.  Whether it's because of a failed
          // get, put or delete, we should re-schedule this row for a future
          // compaction.
          add(((HBaseRpc.HasKey) rpc).key());
          return Boolean.TRUE;  // We handled it, so don't return an exception.
        } else {  // Should never get in this clause.
          LOG.error("WTF?  Cannot retry this RPC, and this shouldn't happen: "
                    + rpc);
        }
      }
      // `++' is not atomic, but it doesn't matter if we miss some increments.
      if (++errors % 100 == 1) {  // Basic rate-limiting to not flood logs.
        LOG.error("Failed to " + what + " a row to re-compact", e);
      }
      return e;
    }

    public String toString() {
      return "handle " + what + " error";
    }
  }

  static final long serialVersionUID = 1307386642;

  /** Starts a compaction thread.  Only one such thread is needed.  */
  private void startCompactionThread() {
    final Thrd thread = new Thrd();
    thread.setDaemon(true);
    thread.start();
  }

  /** How frequently the compaction thread wakes up to flush stuff.  */
  // TODO(tsuna): Make configurable?
  private static final int FLUSH_INTERVAL = 10;  // seconds

  /** Minimum number of rows we'll attempt to compact at once.  */
  // TODO(tsuna): Make configurable?
  private static final int MIN_FLUSH_THRESHOLD = 100;  // rows

  /** Maximum number of rows we'll compact concurrently.  */
  // TODO(tsuna): Make configurable?
  private static final int MAX_CONCURRENT_FLUSHES = 10000;  // rows

  /** If this is X, then we'll flush X times faster than we really need.  */
  // TODO(tsuna): Make configurable?
  private static final int FLUSH_SPEED = 2;  // multiplicative factor
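  // Worked example of the flush sizing below, with the constants above and
  // Const.MAX_TIMESPAN assumed to be 3600s: a queue of 7,200 rows gives
  //   7200 * FLUSH_INTERVAL * FLUSH_SPEED / 3600 = 7200 * 10 * 2 / 3600 = 40
  // which is below MIN_FLUSH_THRESHOLD, so 100 rows are attempted per wakeup;
  // a queue of 720,000 rows would instead attempt 4,000 rows per wakeup.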
  /**
   * Background thread to trigger periodic compactions.
   */
  final class Thrd extends Thread {
    public Thrd() {
      super("CompactionThread");
    }

    public void run() {
      long last_flush = 0;
      while (true) {
        try {
          final long now = System.currentTimeMillis();
          final int size = size();
          // Let's suppose MAX_TIMESPAN = 1h.  We have `size' rows to compact,
          // and we'd better compact them all in less than 1h, otherwise
          // we're going to "fall behind" when a new hour starts (as we'll be
          // creating a ton of new rows then).  So slice MAX_TIMESPAN using
          // FLUSH_INTERVAL to compute what fraction of `size' we need to
          // flush at each iteration.  Note that `size' will usually account
          // for many rows that can't be flushed yet (not old enough) so we're
          // overshooting a bit (flushing more aggressively than necessary).
          // This isn't a problem at all.  The only thing that matters is that
          // the rate at which we flush stuff is proportional to how much work
          // is sitting in the queue.  The multiplicative factor FLUSH_SPEED
          // is added to make flushing even faster than we need.  For example,
          // if FLUSH_SPEED is 2, then instead of taking 1h to flush what we
          // have for the previous hour, we'll take only 30m.  This is
          // desirable so that we evict old entries from the queue a bit
          // faster.
          final int maxflushes = Math.max(MIN_FLUSH_THRESHOLD,
            size * FLUSH_INTERVAL * FLUSH_SPEED / Const.MAX_TIMESPAN);
          // Flush if either (1) it's been too long since the last flush
          // or (2) we have too many rows to recompact already.
          // Note that in the case (2) we might not be able to flush anything
          // if the rows aren't old enough.
          // Note: `now' is in milliseconds whereas MAX_TIMESPAN is in
          // seconds, hence the conversion below.  We also record when we
          // last flushed, otherwise condition (1) could never trigger.
          if (now - last_flush > Const.MAX_TIMESPAN * 1000  // (1)
              || size > maxflushes) {                       // (2)
            last_flush = now;
            flush(now / 1000 - Const.MAX_TIMESPAN - 1, maxflushes);
            if (LOG.isDebugEnabled()) {
              final int newsize = size();
              LOG.debug("flush() took " + (System.currentTimeMillis() - now)
                        + "ms, new queue size=" + newsize
                        + " (" + (newsize - size) + ')');
            }
          }
        } catch (Exception e) {
          LOG.error("Uncaught exception in compaction thread", e);
        } catch (OutOfMemoryError e) {
          // Let's free up some memory by throwing away the compaction queue.
          final int sz = size.get();
          CompactionQueue.super.clear();
          size.set(0);
          LOG.error("Discarded the compaction queue, size=" + sz, e);
        } catch (Throwable e) {
          LOG.error("Uncaught *Throwable* in compaction thread", e);
          // Catching this kind of error is totally unexpected and is really
          // bad.  If we do nothing and let this thread die, we'll run out of
          // memory as new entries are added to the queue.  We could always
          // commit suicide, but it's kind of drastic and nothing else in the
          // code does this.  If `enable_compactions' wasn't final, we could
          // always set it to false, but that's not an option.  So in order to
          // try to get a fresh start, let this compaction thread terminate
          // and spin off a new one instead.
          try {
            Thread.sleep(1000);  // Avoid busy looping creating new threads.
          } catch (InterruptedException i) {
            LOG.error("Compaction thread interrupted in error handling", i);
            return;  // Don't flush, we're truly hopeless.
          }
          startCompactionThread();
          return;
        }
        try {
          Thread.sleep(FLUSH_INTERVAL * 1000);
        } catch (InterruptedException e) {
          LOG.error("Compaction thread interrupted, doing one last flush", e);
          flush();
          return;
        }
      }
    }
  }

  /**
   * Helper to sort the byte arrays in the compaction queue.
   * <p>
   * This comparator sorts things by timestamp first, so that we can find
   * all rows of the same age at once.
   */
  private static final class Cmp implements Comparator<byte[]> {

    /** On how many bytes do we encode metrics IDs.  */
    private final short metric_width;

    public Cmp(final TSDB tsdb) {
      metric_width = tsdb.metrics.width();
    }

    public int compare(final byte[] a, final byte[] b) {
      final int c = Bytes.memcmp(a, b, metric_width, Const.TIMESTAMP_BYTES);
      // If the timestamps are equal, sort according to the entire row key.
      return c != 0 ? c : Bytes.memcmp(a, b);
    }
  }

}