// This file is part of OpenTSDB.
// Copyright (C) 2011-2012  The OpenTSDB Authors.
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 2.1 of the License, or (at your
// option) any later version.  This program is distributed in the hope that it
// will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
// General Public License for more details.  You should have received a copy
// of the GNU Lesser General Public License along with this program.  If not,
// see <http://www.gnu.org/licenses/>.
package net.opentsdb.core;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import com.stumbleupon.async.Callback;
import com.stumbleupon.async.Deferred;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.hbase.async.Bytes;
import org.hbase.async.HBaseRpc;
import org.hbase.async.KeyValue;
import org.hbase.async.PleaseThrottleException;

import net.opentsdb.stats.StatsCollector;

/**
 * "Queue" of rows to compact.
 * <p>
 * Whenever we write a data point to HBase, the row key we write to is added
 * to this queue, which is effectively a sorted set.  There is a separate
 * thread that periodically goes through the queue and looks for "old rows" to
 * compact.  A row is considered "old" if the timestamp in the row key is
 * older than a certain threshold.
 * <p>
 * The compaction process consists of reading all the cells within a given row
 * and writing them back out as a single big cell.  Once that write succeeds,
 * we delete all the individual little cells.
 * <p>
 * This process is effective because in HBase the row key is repeated for
 * every single cell.  And because there is no way to efficiently append bytes
 * at the end of a cell, we have to do this instead.
 */
final class CompactionQueue extends ConcurrentSkipListMap<byte[], Boolean> {

  private static final Logger LOG =
    LoggerFactory.getLogger(CompactionQueue.class);

  /**
   * How many items are currently in the queue.
   * Because {@link ConcurrentSkipListMap#size} has O(N) complexity.
   */
  private final AtomicInteger size = new AtomicInteger();

  private final AtomicLong trivial_compactions = new AtomicLong();
  private final AtomicLong complex_compactions = new AtomicLong();
  private final AtomicLong written_cells = new AtomicLong();
  private final AtomicLong deleted_cells = new AtomicLong();

  /** The {@code TSDB} instance we belong to. */
  private final TSDB tsdb;

  /** On how many bytes do we encode metrics IDs.  */
  private final short metric_width;

  /**
   * Constructor.
   * @param tsdb The TSDB we belong to.
   */
  public CompactionQueue(final TSDB tsdb) {
    super(new Cmp(tsdb));
    this.tsdb = tsdb;
    metric_width = tsdb.metrics.width();
    if (TSDB.enable_compactions) {
      startCompactionThread();
    }
  }

  @Override
  public int size() {
    return size.get();
  }

  public void add(final byte[] row) {
    if (super.put(row, Boolean.TRUE) == null) {
      size.incrementAndGet();  // We added a new entry, count it.
    }
  }
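  // To make the compaction concrete, here is a sketch with hypothetical
  // values (4 flag bits per Const.FLAG_BITS, 2-byte qualifiers): a row
  // holding two individual data points
  //   qualifier {0x00, 0x07} -> 8-byte long value  (delta=0s, int, 8 bytes)
  //   qualifier {0x00, 0x57} -> 8-byte long value  (delta=5s, int, 8 bytes)
  // compacts into a single cell
  //   qualifier {0x00, 0x07, 0x00, 0x57},
  //   value = both 8-byte values concatenated + 1 trailing meta byte (0).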
  /**
   * Forces a flush of all the old entries in the compaction queue.
   * @return A deferred that will be called back once everything has been
   * flushed (or something failed, in which case the deferred will carry the
   * exception).  In case of success, the kind of object returned is
   * unspecified.
   */
  public Deferred<ArrayList<Object>> flush() {
    final int size = size();
    if (size > 0) {
      LOG.info("Flushing all old outstanding rows out of " + size + " rows");
    }
    final long now = System.currentTimeMillis();
    return flush(now / 1000 - Const.MAX_TIMESPAN - 1, Integer.MAX_VALUE);
  }
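  // A sketch of the cutoff logic above, assuming the usual row key layout
  // (metric ID, then a 4-byte UNIX base_time, then tags) and
  // Const.MAX_TIMESPAN = 3600s: at now = 1,000,000,000,000 ms the cutoff is
  // 1,000,000,000 - 3600 - 1 = 999,996,399, so only rows whose base_time is
  // at most 999,996,399 (i.e. rows that can no longer receive in-order data
  // points) are eligible for compaction.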
  /**
   * Collects the stats and metrics tracked by this instance.
   * @param collector The collector to use.
   */
  void collectStats(final StatsCollector collector) {
    collector.record("compaction.count", trivial_compactions, "type=trivial");
    collector.record("compaction.count", complex_compactions, "type=complex");
    if (!TSDB.enable_compactions) {
      return;
    }
    // The remaining stats only make sense with compactions enabled.
    collector.record("compaction.queue.size", size);
    collector.record("compaction.errors", handle_read_error.errors,
                     "rpc=read");
    collector.record("compaction.errors", handle_write_error.errors,
                     "rpc=put");
    collector.record("compaction.errors", handle_delete_error.errors,
                     "rpc=delete");
    collector.record("compaction.writes", written_cells);
    collector.record("compaction.deletes", deleted_cells);
  }

  /**
   * Flushes all the rows in the compaction queue older than the cutoff time.
   * @param cut_off A UNIX timestamp in seconds (unsigned 32-bit integer).
   * @param maxflushes How many rows to flush off the queue at once.
   * This integer is expected to be strictly positive.
   * @return A deferred that will be called back once everything has been
   * flushed.
   */
  private Deferred<ArrayList<Object>> flush(final long cut_off,
                                            int maxflushes) {
    assert maxflushes > 0 : "maxflushes must be > 0, but I got " + maxflushes;
    // We can't possibly flush more entries than size().
    maxflushes = Math.min(maxflushes, size());
    if (maxflushes == 0) {  // Because size() might be 0.
      return Deferred.fromResult(new ArrayList<Object>(0));
    }
    final ArrayList<Deferred<Object>> ds =
      new ArrayList<Deferred<Object>>(Math.min(maxflushes,
                                               MAX_CONCURRENT_FLUSHES));
    int nflushes = 0;
    for (final byte[] row : this.keySet()) {
      if (maxflushes == 0) {
        break;
      }
      final long base_time = Bytes.getUnsignedInt(row, metric_width);
      if (base_time > cut_off) {
        break;
      } else if (nflushes == MAX_CONCURRENT_FLUSHES) {
        // We kicked off the compaction of too many rows already, let's wait
        // until they're done before kicking off more.
        break;
      }
      // You'd think that it would be faster to grab an iterator on the map
      // and then call remove() on the iterator to "unlink" the element
      // directly from where the iterator is at, but no, the JDK implements
      // it by calling remove(key), so it has to look up the key again anyway.
      if (super.remove(row) == null) {  // We didn't remove anything.
        continue;  // So someone else already took care of this entry.
      }
      nflushes++;
      maxflushes--;
      size.decrementAndGet();
      ds.add(tsdb.get(row).addCallbacks(compactcb, handle_read_error));
    }
    final Deferred<ArrayList<Object>> group = Deferred.group(ds);
    if (nflushes == MAX_CONCURRENT_FLUSHES && maxflushes > 0) {
      // We're not done yet.  Once this group of flushes completes, we need
      // to kick off more.
      tsdb.flush();  // Speed up this batch by telling the client to flush.
      final int maxflushez = maxflushes;  // Make it final for closure.
      final class FlushMoreCB implements Callback<Deferred<ArrayList<Object>>,
                                                  ArrayList<Object>> {
        public Deferred<ArrayList<Object>> call(final ArrayList<Object> arg) {
          return flush(cut_off, maxflushez);
        }
        public String toString() {
          return "Continue flushing with cut_off=" + cut_off
            + ", maxflushes=" + maxflushez;
        }
      }
      group.addCallbackDeferring(new FlushMoreCB());
    }
    return group;
  }

  private final CompactCB compactcb = new CompactCB();

  /**
   * Callback to compact a row once it's been read.
   * <p>
   * This is used once the "get" completes, to actually compact the row and
   * write back the compacted version.
   */
  private final class CompactCB implements Callback<Object,
                                                    ArrayList<KeyValue>> {
    public Object call(final ArrayList<KeyValue> row) {
      return compact(row, null);
    }
    public String toString() {
      return "compact";
    }
  }

  /**
   * Compacts a row into a single {@link KeyValue}.
   * @param row The row containing all the KVs to compact.
   * Must contain at least one element.
   * @return A compacted version of this row.
   */
  KeyValue compact(final ArrayList<KeyValue> row) {
    final KeyValue[] compacted = { null };
    compact(row, compacted);
    return compacted[0];
  }
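  // Usage sketch (hypothetical caller): code that only needs the compacted
  // form of a row it has already read, without writing anything back, can do
  //   final KeyValue kv = compaction_queue.compact(kvs);
  // where the single-element array above serves as an "out" parameter for
  // the private overload below.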
  /**
   * Compacts a row into a single {@link KeyValue}.
   * <p>
   * If the {@code row} is empty, this function does literally nothing.
   * If {@code compacted} is not {@code null}, then the compacted form of this
   * {@code row} will be stored in {@code compacted[0]}.  Obviously, if the
   * {@code row} contains a single cell, then that cell is the compacted form.
   * Otherwise the compaction process takes place.
   * @param row The row containing all the KVs to compact.  Must be non-null.
   * @param compacted If non-null, the first item in the array will be set to
   * a {@link KeyValue} containing the compacted form of this row.
   * If non-null, we will also not write the compacted form back to HBase
   * unless the timestamp in the row key is old enough.
   * @return A {@link Deferred} if the compaction process required a write
   * to HBase, otherwise {@code null}.
   */
  private Deferred<Object> compact(final ArrayList<KeyValue> row,
                                   final KeyValue[] compacted) {
    if (row.size() <= 1) {
      if (row.isEmpty()) {  // Maybe the row got deleted in the mean time?
        LOG.debug("Attempted to compact a row that doesn't exist.");
      } else if (compacted != null) {
        // No need to re-compact rows containing a single value.
        KeyValue kv = row.get(0);
        final byte[] qual = kv.qualifier();
        final byte[] val = kv.value();
        if (floatingPointValueToFix(qual[1], val)) {
          // Fix up old, incorrectly encoded floating point value.
          final byte[] newval = fixFloatingPointValue(qual[1], val);
          final byte[] newqual = new byte[] { qual[0],
            fixQualifierFlags(qual[1], newval.length) };
          kv = new KeyValue(kv.key(), kv.family(), newqual, newval);
        }
        compacted[0] = kv;
      }
      return null;
    }

    // We know we have at least 2 cells.  We need to go through all the cells
    // to determine what kind of compaction we're going to do.  If each cell
    // contains a single individual data point, then we can do a trivial
    // compaction.  Otherwise, we have a partially compacted row, and the
    // logic required to compact it is more complex.
    boolean write = true;  // Do we need to write a compacted cell?
    final KeyValue compact;
    {
      boolean trivial = true;  // Are we doing a trivial compaction?
      int qual_len = 0;  // Pre-compute the size of the qualifier we'll need.
      int val_len = 1;   // Reserve an extra byte for meta-data.
      short last_delta = -1;  // Time delta, extracted from the qualifier.
      KeyValue longest = row.get(0);  // KV with the longest qualifier.
      int longest_idx = 0;            // Index of `longest'.
      final int nkvs = row.size();
      for (int i = 0; i < nkvs; i++) {
        final KeyValue kv = row.get(i);
        final byte[] qual = kv.qualifier();
        // If the qualifier length isn't 2, this row might have already
        // been compacted, potentially partially, so we need to merge the
        // partially compacted set of cells with the rest.
        final int len = qual.length;
        if (len != 2) {
          trivial = false;
          // We only do this here because no qualifier can be < 2 bytes.
          if (len > longest.qualifier().length) {
            longest = kv;
            longest_idx = i;
          }
        } else {
          // In the trivial case, do some sanity checking here.
          // For non-trivial cases, the sanity checking logic is more
          // complicated and is thus pushed down to `complexCompact'.
          final short delta = (short) ((Bytes.getShort(qual) & 0xFFFF)
                                       >>> Const.FLAG_BITS);
          // This data point has a time delta that's less than or equal to
          // the previous one.  This typically means we have 2 data points
          // at the same timestamp but they have different flags.  We're
          // going to abort here because someone needs to fsck the table.
          if (delta <= last_delta) {
            throw new IllegalDataException("Found out of order or duplicate"
              + " data: last_delta=" + last_delta + ", delta=" + delta
              + ", offending KV=" + kv + ", row=" + row
              + " -- run an fsck.");
          }
          last_delta = delta;
          // We don't need it below for complex compactions, so we update it
          // only here in the `else' branch.
          final byte[] v = kv.value();
          val_len += floatingPointValueToFix(qual[1], v) ? 4 : v.length;
        }
        qual_len += len;
      }

      if (trivial) {
        trivial_compactions.incrementAndGet();
        compact = trivialCompact(row, qual_len, val_len);
      } else {
        complex_compactions.incrementAndGet();
        compact = complexCompact(row, qual_len / 2);
        // Now it's vital that we check whether the compact KV has the same
        // qualifier as one of the qualifiers that were already in the row.
        // Otherwise we might do a `put' in this cell, followed by a delete.
        // We don't want to delete what we just wrote.
        // This can happen if this row was already compacted but someone
        // wrote another individual data point at the same timestamp.
        // Optimization: since we kept track of which KV had the longest
        // qualifier, we can opportunistically check here if it happens to
        // have the same qualifier as the one we just created.
        final byte[] qual = compact.qualifier();
        final byte[] longest_qual = longest.qualifier();
        if (qual.length <= longest_qual.length) {
          KeyValue dup = null;
          int dup_idx = -1;
          if (Bytes.equals(longest_qual, qual)) {
            dup = longest;
            dup_idx = longest_idx;
          } else {
            // Worst case: to be safe we have to loop again and check all
            // the qualifiers and make sure we're not going to overwrite
            // anything.
            // TODO(tsuna): Try to write a unit test that triggers this code
            // path.  I'm not even sure it's possible.  Should we replace
            // this code with an `assert false: "should never be here"'?
            for (int i = 0; i < nkvs; i++) {
              final KeyValue kv = row.get(i);
              if (Bytes.equals(kv.qualifier(), qual)) {
                dup = kv;
                dup_idx = i;
                break;
              }
            }
          }
          if (dup != null) {
            // So we did find an existing KV with the same qualifier.
            // Let's check if, by chance, the value is the same too.
            if (Bytes.equals(dup.value(), compact.value())) {
              // Since the values are the same, we don't need to write
              // anything.  There's already a properly compacted version of
              // this row in TSDB.
              write = false;
            }
            // Now let's make sure we don't delete this qualifier.  This
            // re-allocates the entire array, but should be a rare case.
            row.remove(dup_idx);
          }  // else: no dup, we're good.
        }
        // else: most common case: the compact qualifier is longer than
        // the previously longest qualifier, so we cannot possibly
        // overwrite an existing cell we would then delete.
      }
    }
    if (compacted != null) {  // Caller is interested in the compacted form.
      compacted[0] = compact;
      final long base_time = Bytes.getUnsignedInt(compact.key(),
                                                  metric_width);
      final long cut_off = System.currentTimeMillis() / 1000
        - Const.MAX_TIMESPAN - 1;
      if (base_time > cut_off) {  // If row is too recent...
        return null;              // ... don't write back compacted.
      }
    }
    if (!TSDB.enable_compactions) {
      return null;
    }

    final byte[] key = compact.key();
    //LOG.debug("Compacting row " + Arrays.toString(key));
    deleted_cells.addAndGet(row.size());  // We're going to delete this.
    if (write) {
      final byte[] qual = compact.qualifier();
      final byte[] value = compact.value();
      written_cells.incrementAndGet();
      return tsdb.put(key, qual, value)
        .addCallbacks(new DeleteCompactedCB(row), handle_write_error);
    } else {
      // We had nothing to write, because one of the cells is already the
      // correctly compacted version, so we can go ahead and delete the
      // individual cells directly.
      new DeleteCompactedCB(row).call(null);
      return null;
    }
  }

  /**
   * Performs a trivial compaction of a row.
   * <p>
   * This method is to be used only when all the cells in the given row
   * are individual data points (nothing has been compacted yet).  If any of
   * the cells have already been compacted, the caller is expected to call
   * {@link #complexCompact} instead.
   * @param row The row to compact.  Assumed to have 2 elements or more.
   * @param qual_len Exact number of bytes to hold the compacted qualifiers.
   * @param val_len Exact number of bytes to hold the compacted values.
   * @return a {@link KeyValue} containing the result of the merge of all the
   * {@code KeyValue}s given in argument.
   */
  private static KeyValue trivialCompact(final ArrayList<KeyValue> row,
                                         final int qual_len,
                                         final int val_len) {
    // Now let's simply concatenate all the qualifiers and values together.
    final byte[] qualifier = new byte[qual_len];
    final byte[] value = new byte[val_len];
    // Now populate the arrays by copying qualifiers/values over.
    int qual_idx = 0;
    int val_idx = 0;
    for (final KeyValue kv : row) {
      final byte[] q = kv.qualifier();
      // We shouldn't get into this function if this isn't true.
      assert q.length == 2 : "Qualifier length must be 2: " + kv;
      final byte[] v = fixFloatingPointValue(q[1], kv.value());
      qualifier[qual_idx++] = q[0];
      qualifier[qual_idx++] = fixQualifierFlags(q[1], v.length);
      System.arraycopy(v, 0, value, val_idx, v.length);
      val_idx += v.length;
    }

    // Right now we leave the last byte all zeros, this last byte will be
    // used in the future to introduce more formats/encodings.

    final KeyValue first = row.get(0);
    return new KeyValue(first.key(), first.family(), qualifier, value);
  }

  /**
   * Fix the flags inside the last byte of a qualifier.
   * <p>
   * OpenTSDB used to not rely on the size recorded in the flags being
   * correct, and so for a long time it was setting the wrong size for
   * floating point values (pretending they were encoded on 8 bytes when
   * in fact they were on 4).  So overwrite these bits here to make sure
   * they're correct now, because once they're compacted it's going to
   * be quite hard to tell if the flags are right or wrong, and we need
   * them to be correct to easily decode the values.
   * @param flags The least significant byte of a qualifier.
   * @param val_len The number of bytes in the value of this qualifier.
   * @return The least significant byte of the qualifier with correct flags.
   */
  private static byte fixQualifierFlags(byte flags, final int val_len) {
    // Explanation:
    //   (1) Take the last byte of the qualifier.
    //   (2) Zero out all the flag bits but one.
    //       The one we keep is the type (floating point vs integer value).
    //   (3) Set the length properly based on the value we have.
    return (byte) ((flags & ~(Const.FLAGS_MASK >>> 1)) | (val_len - 1));
    //              ^^^^^   ^^^^^^^^^^^^^^^^^^^^^^^^^    ^^^^^^^^^^^^^
    //               (1)               (2)                    (3)
  }
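  // Worked example of the bit twiddling above, assuming the usual 4-bit
  // flags (Const.FLAGS_MASK = 0xF, so FLAGS_MASK >>> 1 = 0x7 masks the
  // length bits): a float whose flags claim 8 bytes (flags = 0xF) but whose
  // value is really 4 bytes wide becomes
  //   (0xF & ~0x7) | (4 - 1)  =  0x8 | 0x3  =  0xB
  // i.e. the float bit is preserved and the length bits now encode 4 bytes.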
  /**
   * Returns whether or not this is a floating value that needs to be fixed.
   * <p>
   * OpenTSDB used to encode all floating point values as `float' (4 bytes)
   * but actually store them on 8 bytes, with 4 leading 0 bytes, and flags
   * correctly stating the value was on 4 bytes.
   * @param flags The least significant byte of a qualifier.
   * @param value The value that may need to be corrected.
   */
  private static boolean floatingPointValueToFix(final byte flags,
                                                 final byte[] value) {
    return (flags & Const.FLAG_FLOAT) != 0   // We need a floating point value.
      && (flags & Const.LENGTH_MASK) == 0x3  // That pretends to be on 4 bytes.
      && value.length == 8;                  // But is actually using 8 bytes.
  }

  /**
   * Returns a corrected value if this is a floating point value to fix.
   * <p>
   * OpenTSDB used to encode all floating point values as `float' (4 bytes)
   * but actually store them on 8 bytes, with 4 leading 0 bytes, and flags
   * correctly stating the value was on 4 bytes.
   * <p>
   * This function detects such values and returns a corrected value, without
   * the 4 leading zeros.  Otherwise it returns the value unchanged.
   * @param flags The least significant byte of a qualifier.
   * @param value The value that may need to be corrected.
   * @throws IllegalDataException if the value is malformed.
   */
  private static byte[] fixFloatingPointValue(final byte flags,
                                              final byte[] value) {
    if (floatingPointValueToFix(flags, value)) {
      // The first 4 bytes should really be zeros.
      if (value[0] == 0 && value[1] == 0 && value[2] == 0 && value[3] == 0) {
        // Just keep the last 4 bytes.
        return new byte[] { value[4], value[5], value[6], value[7] };
      } else {  // Very unlikely.
        throw new IllegalDataException("Corrupted floating point value: "
          + Arrays.toString(value) + " flags=0x" + Integer.toHexString(flags)
          + " -- first 4 bytes are expected to be zeros.");
      }
    }
    return value;
  }

  /**
   * Helper class for complex compaction cases.
   * <p>
   * This is simply a glorified pair of (qualifier, value) that's comparable.
   * Only the qualifier is used to make comparisons.
   * @see #complexCompact
   */
  private static final class Cell implements Comparable<Cell> {
    /** Tombstone used as a helper during the complex compaction.  */
    static final Cell SKIP = new Cell(null, null);

    final byte[] qualifier;
    final byte[] value;

    Cell(final byte[] qualifier, final byte[] value) {
      this.qualifier = qualifier;
      this.value = value;
    }

    public int compareTo(final Cell other) {
      return Bytes.memcmp(qualifier, other.qualifier);
    }

    public boolean equals(final Object o) {
      return o != null && o instanceof Cell && compareTo((Cell) o) == 0;
    }

    public int hashCode() {
      return Arrays.hashCode(qualifier);
    }

    public String toString() {
      return "Cell(" + Arrays.toString(qualifier)
        + ", " + Arrays.toString(value) + ')';
    }
  }
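  // Sketch of a complex case, with hypothetical deltas: a row holding an
  // already-compacted cell plus one late individual data point, e.g.
  //   {0x00, 0x07, 0x00, 0x57} -> v0 . v5 . meta   (deltas 0s and 5s)
  //   {0x00, 0x27}             -> v2               (delta 2s)
  // is broken down into Cells, sorted by qualifier, and merged back into
  //   {0x00, 0x07, 0x00, 0x27, 0x00, 0x57} -> v0 . v2 . v5 . meta byte.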
  /**
   * Compacts a partially compacted row.
   * <p>
   * This method is called in the non-trivial re-compaction cases, where a row
   * already contains one or more partially compacted cells.  This can happen
   * for various reasons, such as TSDs dying in the middle of a compaction or
   * races involved with TSDs trying to compact the same row at the same
   * time, or old data being slowly written to a TSD.
   * @param row The row to compact.  Assumed to have 2 elements or more.
   * @param estimated_nvalues Estimate of the number of values to compact.
   * Used to pre-allocate a collection of the right size, so it's better to
   * overshoot a bit to avoid re-allocations.
   * @return a {@link KeyValue} containing the result of the merge of all the
   * {@code KeyValue}s given in argument.
   * @throws IllegalDataException if one of the cells cannot be read because
   * it's corrupted or in a format we don't understand.
   */
  static KeyValue complexCompact(final ArrayList<KeyValue> row,
                                 final int estimated_nvalues) {
    // We know at least one of the cells contains multiple values, and we need
    // to merge all the cells together in a sorted fashion.  We use a simple
    // strategy: split all the cells into individual objects, sort them,
    // merge the result while ignoring duplicates (same qualifier & value).
    final ArrayList<Cell> cells = breakDownValues(row, estimated_nvalues);
    Collections.sort(cells);

    // Now let's do one pass first to compute the length of the compacted
    // value and to find if we have any bad duplicates (same qualifier,
    // different value).
    int nvalues = 0;
    int val_len = 1;  // Reserve an extra byte for meta-data.
    short last_delta = -1;  // Time delta, extracted from the qualifier.
    int ncells = cells.size();
    for (int i = 0; i < ncells; i++) {
      final Cell cell = cells.get(i);
      final short delta = (short) ((Bytes.getShort(cell.qualifier) & 0xFFFF)
                                   >>> Const.FLAG_BITS);
      // Because we sorted `cells' by qualifier, and because the time delta
      // occupies the most significant bits, this should never trigger.
      assert delta >= last_delta : ("WTF? It's supposed to be sorted: " + cells
                                    + " at " + i + " delta=" + delta
                                    + ", last_delta=" + last_delta);
      // The only troublesome case is where we have two (or more) consecutive
      // cells with the same time delta, but different flags or values.
      if (delta == last_delta) {
        // Find the previous cell.  Because we potentially replace the one
        // right before `i' with a tombstone, we might need to look further
        // back a bit more.
        Cell prev = Cell.SKIP;
        // i > 0 because we can't get here during the first iteration.
        // Also the first Cell cannot be Cell.SKIP, so `j' will never
        // underflow.  And even if it does, we'll get an exception.
        for (int j = i - 1; prev == Cell.SKIP; j--) {
          prev = cells.get(j);
        }
        if (cell.qualifier[1] != prev.qualifier[1]
            || !Bytes.equals(cell.value, prev.value)) {
          throw new IllegalDataException("Found out of order or duplicate"
            + " data: cell=" + cell + ", delta=" + delta + ", prev cell="
            + prev + ", last_delta=" + last_delta + ", in row=" + row
            + " -- run an fsck.");
        }
        // else: we're good, this is a true duplicate (same qualifier & value).
        // Just replace it with a tombstone so we'll skip it.  We don't delete
        // it from the array because that would cause a re-allocation.
        cells.set(i, Cell.SKIP);
        continue;
      }
      last_delta = delta;
      nvalues++;
      val_len += cell.value.length;
    }

    final byte[] qualifier = new byte[nvalues * 2];
    final byte[] value = new byte[val_len];
    // Now populate the arrays by copying qualifiers/values over.
    int qual_idx = 0;
    int val_idx = 0;
    for (final Cell cell : cells) {
      if (cell == Cell.SKIP) {
        continue;
      }
      byte[] b = cell.qualifier;
      System.arraycopy(b, 0, qualifier, qual_idx, b.length);
      qual_idx += b.length;
      b = cell.value;
      System.arraycopy(b, 0, value, val_idx, b.length);
      val_idx += b.length;
    }

    // Right now we leave the last byte all zeros, this last byte will be
    // used in the future to introduce more formats/encodings.

    final KeyValue first = row.get(0);
    final KeyValue kv = new KeyValue(first.key(), first.family(),
                                     qualifier, value);
    return kv;
  }

  /**
   * Breaks down all the values in a row into individual {@link Cell}s.
   * @param row The row to compact.  Assumed to have 2 elements or more.
   * @param estimated_nvalues Estimate of the number of values to compact.
   * Used to pre-allocate a collection of the right size, so it's better to
   * overshoot a bit to avoid re-allocations.
   * @throws IllegalDataException if one of the cells cannot be read because
   * it's corrupted or in a format we don't understand.
   */
  private static ArrayList<Cell> breakDownValues(final ArrayList<KeyValue> row,
                                                 final int estimated_nvalues) {
    final ArrayList<Cell> cells = new ArrayList<Cell>(estimated_nvalues);
    for (final KeyValue kv : row) {
      final byte[] qual = kv.qualifier();
      final int len = qual.length;
      final byte[] val = kv.value();
      if (len == 2) {  // Single-value cell.
        // Maybe we need to fix the flags in the qualifier.
        final byte[] actual_val = fixFloatingPointValue(qual[1], val);
        final byte q = fixQualifierFlags(qual[1], actual_val.length);
        final byte[] actual_qual;
        if (q != qual[1]) {  // We need to fix the qualifier.
          actual_qual = new byte[] { qual[0], q };  // So make a copy.
        } else {
          actual_qual = qual;  // Otherwise use the one we already have.
        }
        final Cell cell = new Cell(actual_qual, actual_val);
        cells.add(cell);
        continue;
      }
      // else: we have a multi-value cell.  We need to break it down into
      // individual Cell objects.
      // First check that the last byte is 0, otherwise it might mean that
      // this compacted cell has been written by a future version of OpenTSDB
      // and we don't know how to decode it, so we shouldn't touch it.
      if (val[val.length - 1] != 0) {
        throw new IllegalDataException("Don't know how to read this value: "
          + Arrays.toString(val) + " found in " + kv
          + " -- this compacted value might have been written by a future"
          + " version of OpenTSDB, or could be corrupt.");
      }
      // Now break it down into Cells.
      int val_idx = 0;
      for (int i = 0; i < len; i += 2) {
        final byte[] q = new byte[] { qual[i], qual[i + 1] };
        final int vlen = (q[1] & Const.LENGTH_MASK) + 1;
        final byte[] v = new byte[vlen];
        System.arraycopy(val, val_idx, v, 0, vlen);
        val_idx += vlen;
        final Cell cell = new Cell(q, v);
        cells.add(cell);
      }
      // Check we consumed all the bytes of the value.  Remember the last byte
      // is metadata, so it's normal that we didn't consume it.
      if (val_idx != val.length - 1) {
        throw new IllegalDataException("Corrupted value: couldn't break down"
          + " into individual values (consumed " + val_idx + " bytes, but was"
          + " expecting to consume " + (val.length - 1) + "): " + kv
          + ", cells so far: " + cells);
      }
    }
    return cells;
  }
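  // Decoding sketch for the loop above (hypothetical cell): a compacted
  // qualifier {0x00, 0x0B, 0x00, 0x57} with a 13-byte value splits into
  //   Cell({0x00, 0x0B}, 4-byte float)  since (0xB & LENGTH_MASK) + 1 = 4
  //   Cell({0x00, 0x57}, 8-byte long)   since (0x7 & LENGTH_MASK) + 1 = 8
  // consuming 12 bytes and leaving the 13th (the meta byte) untouched.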
  /**
   * Callback to delete a row that's been successfully compacted.
   */
  private final class DeleteCompactedCB implements Callback<Object, Object> {

    /** What we're going to delete.  */
    private final byte[] key;
    private final byte[] family;
    private final byte[][] qualifiers;

    public DeleteCompactedCB(final ArrayList<KeyValue> cells) {
      final KeyValue first = cells.get(0);
      key = first.key();
      family = first.family();
      qualifiers = new byte[cells.size()][];
      for (int i = 0; i < qualifiers.length; i++) {
        qualifiers[i] = cells.get(i).qualifier();
      }
    }

    public Object call(final Object arg) {
      return tsdb.delete(key, qualifiers).addErrback(handle_delete_error);
    }

    public String toString() {
      return "delete compacted cells";
    }
  }

  private final HandleErrorCB handle_read_error = new HandleErrorCB("read");
  private final HandleErrorCB handle_write_error = new HandleErrorCB("write");
  private final HandleErrorCB handle_delete_error =
    new HandleErrorCB("delete");

  /**
   * Callback to handle exceptions during the compaction process.
   */
  private final class HandleErrorCB implements Callback<Object, Exception> {

    private volatile int errors;

    private final String what;

    /**
     * Constructor.
     * @param what String describing what kind of operation (e.g. "read").
     */
    public HandleErrorCB(final String what) {
      this.what = what;
    }

    public Object call(final Exception e) {
      if (e instanceof PleaseThrottleException) {  // HBase isn't keeping up.
        final HBaseRpc rpc = ((PleaseThrottleException) e).getFailedRpc();
        if (rpc instanceof HBaseRpc.HasKey) {
          // We failed to compact this row.  Whether it's because of a failed
          // get, put or delete, we should re-schedule this row for a future
          // compaction.
          add(((HBaseRpc.HasKey) rpc).key());
          return Boolean.TRUE;  // We handled it, so don't return an exception.
        } else {  // Should never get in this clause.
          LOG.error("WTF?  Cannot retry this RPC, and this shouldn't happen: "
                    + rpc);
        }
      }
      // `++' is not atomic, but it doesn't matter if we miss some increments.
      if (++errors % 100 == 1) {  // Basic rate-limiting to not flood logs.
        LOG.error("Failed to " + what + " a row to re-compact", e);
      }
      return e;
    }

    public String toString() {
      return "handle " + what + " error";
    }
  }

  static final long serialVersionUID = 1307386642;

  /** Starts a compaction thread.  Only one such thread is needed.  */
  private void startCompactionThread() {
    final Thrd thread = new Thrd();
    thread.setDaemon(true);
    thread.start();
  }

  /** How frequently the compaction thread wakes up to flush stuff.  */
  // TODO(tsuna): Make configurable?
  private static final int FLUSH_INTERVAL = 10;  // seconds

  /** Minimum number of rows we'll attempt to compact at once.  */
  // TODO(tsuna): Make configurable?
  private static final int MIN_FLUSH_THRESHOLD = 100;  // rows

  /** Maximum number of rows we'll compact concurrently.  */
  // TODO(tsuna): Make configurable?
  private static final int MAX_CONCURRENT_FLUSHES = 10000;  // rows

  /** If this is X, then we'll flush X times faster than we really need.  */
  // TODO(tsuna): Make configurable?
  private static final int FLUSH_SPEED = 2;  // multiplicative factor
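  // Worked example of the flush sizing below, with the constants above and
  // Const.MAX_TIMESPAN assumed to be 3600s: a queue of 7,200 rows gives
  //   7200 * FLUSH_INTERVAL * FLUSH_SPEED / 3600 = 7200 * 10 * 2 / 3600 = 40
  // which is below MIN_FLUSH_THRESHOLD, so 100 rows are attempted per wakeup;
  // a queue of 720,000 rows would instead attempt 4,000 rows per wakeup.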
  /**
   * Background thread to trigger periodic compactions.
   */
  final class Thrd extends Thread {
    public Thrd() {
      super("CompactionThread");
    }

    public void run() {
      long last_flush = 0;
      while (true) {
        try {
          final long now = System.currentTimeMillis();
          final int size = size();
          // Let's suppose MAX_TIMESPAN = 1h.  We have `size' rows to compact,
          // and we'd better compact them all in less than 1h, otherwise
          // we're going to "fall behind" when a new hour starts (as we'll be
          // creating a ton of new rows then).  So slice MAX_TIMESPAN using
          // FLUSH_INTERVAL to compute what fraction of `size' we need to
          // flush at each iteration.  Note that `size' will usually account
          // for many rows that can't be flushed yet (not old enough) so we're
          // overshooting a bit (flushing more aggressively than necessary).
          // This isn't a problem at all.  The only thing that matters is that
          // the rate at which we flush stuff is proportional to how much work
          // is sitting in the queue.  The multiplicative factor FLUSH_SPEED
          // is added to make flushing even faster than we need.  For example,
          // if FLUSH_SPEED is 2, then instead of taking 1h to flush what we
          // have for the previous hour, we'll take only 30m.  This is
          // desirable so that we evict old entries from the queue a bit
          // faster.
          final int maxflushes = Math.max(MIN_FLUSH_THRESHOLD,
            size * FLUSH_INTERVAL * FLUSH_SPEED / Const.MAX_TIMESPAN);
          // Flush if either (1) it's been too long since the last flush
          // or (2) we have too many rows to recompact already.
          // Note that in the case (2) we might not be able to flush anything
          // if the rows aren't old enough.
          // Note: `now' is in milliseconds whereas MAX_TIMESPAN is in
          // seconds, hence the conversion below.  We also record when we
          // last flushed, otherwise condition (1) could never trigger.
          if (now - last_flush > Const.MAX_TIMESPAN * 1000  // (1)
              || size > maxflushes) {                       // (2)
            last_flush = now;
            flush(now / 1000 - Const.MAX_TIMESPAN - 1, maxflushes);
            if (LOG.isDebugEnabled()) {
              final int newsize = size();
              LOG.debug("flush() took " + (System.currentTimeMillis() - now)
                        + "ms, new queue size=" + newsize
                        + " (" + (newsize - size) + ')');
            }
          }
        } catch (Exception e) {
          LOG.error("Uncaught exception in compaction thread", e);
        } catch (OutOfMemoryError e) {
          // Let's free up some memory by throwing away the compaction queue.
          final int sz = size.get();
          CompactionQueue.super.clear();
          size.set(0);
          LOG.error("Discarded the compaction queue, size=" + sz, e);
        } catch (Throwable e) {
          LOG.error("Uncaught *Throwable* in compaction thread", e);
          // Catching this kind of error is totally unexpected and is really
          // bad.  If we do nothing and let this thread die, we'll run out of
          // memory as new entries are added to the queue.  We could always
          // commit suicide, but it's kind of drastic and nothing else in the
          // code does this.  If `enable_compactions' wasn't final, we could
          // always set it to false, but that's not an option.  So in order to
          // try to get a fresh start, let this compaction thread terminate
          // and spin off a new one instead.
          try {
            Thread.sleep(1000);  // Avoid busy looping creating new threads.
          } catch (InterruptedException i) {
            LOG.error("Compaction thread interrupted in error handling", i);
            return;  // Don't flush, we're truly hopeless.
          }
          startCompactionThread();
          return;
        }
        try {
          Thread.sleep(FLUSH_INTERVAL * 1000);
        } catch (InterruptedException e) {
          LOG.error("Compaction thread interrupted, doing one last flush", e);
          flush();
          return;
        }
      }
    }
  }

  /**
   * Helper to sort the byte arrays in the compaction queue.
   * <p>
   * This comparator sorts things by timestamp first, so that we can find
   * all rows of the same age at once.
   */
  private static final class Cmp implements Comparator<byte[]> {

    /** On how many bytes do we encode metrics IDs.  */
    private final short metric_width;

    public Cmp(final TSDB tsdb) {
      metric_width = tsdb.metrics.width();
    }

    public int compare(final byte[] a, final byte[] b) {
      final int c = Bytes.memcmp(a, b, metric_width, Const.TIMESTAMP_BYTES);
      // If the timestamps are equal, sort according to the entire row key.
      return c != 0 ? c : Bytes.memcmp(a, b);
    }
  }

}