// This file is part of OpenTSDB.
// Copyright (C) 2011-2012 The OpenTSDB Authors.
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 2.1 of the License, or (at your
// option) any later version. This program is distributed in the hope that it
// will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
// General Public License for more details. You should have received a copy
// of the GNU Lesser General Public License along with this program. If not,
// see <http://www.gnu.org/licenses/>.
package net.opentsdb.core;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import com.stumbleupon.async.Callback;
import com.stumbleupon.async.Deferred;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.hbase.async.Bytes;
import org.hbase.async.HBaseRpc;
import org.hbase.async.KeyValue;
import org.hbase.async.PleaseThrottleException;
import net.opentsdb.meta.Annotation;
import net.opentsdb.stats.StatsCollector;
import net.opentsdb.utils.JSON;
/**
* "Queue" of rows to compact.
* <p>
* Whenever we write a data point to HBase, the row key we write to is added
* to this queue, which is effectively a sorted set. There is a separate
* thread that periodically goes through the queue and looks for "old rows" to
* compact. A row is considered "old" if the timestamp in the row key is
* older than a certain threshold.
* <p>
* The compaction process consists of reading all the cells within a given row
* and writing them back out as a single big cell. Once that write succeeds,
* we delete all the individual little cells.
* <p>
* This process is effective because in HBase the row key is repeated for
* every single cell. And since there is no way to efficiently append bytes
* to the end of an existing cell, we have to rewrite rows this way instead.
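* <p>
* Purely as a schematic illustration (qualifiers and values are shown symbolically,
* not with their real encoding), a row stored as three little cells
* <pre>
*   [q1] = v1,   [q2] = v2,   [q3] = v3
* </pre>
* is rewritten as the single big cell {@code [q1 q2 q3] = v1 v2 v3}, after which the
* three original little cells are deleted.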
*/
final class CompactionQueue extends ConcurrentSkipListMap<byte[], Boolean> {
private static final Logger LOG = LoggerFactory.getLogger(CompactionQueue.class);
/**
* How many items are currently in the queue.
* We track this ourselves because {@link ConcurrentSkipListMap#size} has O(N) complexity.
*/
private final AtomicInteger size = new AtomicInteger();
private final AtomicLong duplicates_different = new AtomicLong();
private final AtomicLong duplicates_same = new AtomicLong();
private final AtomicLong compaction_count = new AtomicLong();
private final AtomicLong written_cells = new AtomicLong();
private final AtomicLong deleted_cells = new AtomicLong();
/** The {@code TSDB} instance we belong to. */
private final TSDB tsdb;
/** How many bytes we use to encode metric IDs. */
private final short metric_width;
/** How frequently the compaction thread wakes up to flush stuff. */
private final int flush_interval; // seconds
/** Minimum number of rows we'll attempt to compact at once. */
private final int min_flush_threshold; // rows
/** Maximum number of rows we'll compact concurrently. */
private final int max_concurrent_flushes; // rows
/** If this is X then we'll flush X times faster than we really need. */
private final int flush_speed; // multiplicative factor
/**
* Constructor.
* @param tsdb The TSDB we belong to.
*/
public CompactionQueue(final TSDB tsdb) {
super(new Cmp(tsdb));
this.tsdb = tsdb;
metric_width = tsdb.metrics.width();
flush_interval = tsdb.config.getInt("tsd.storage.compaction.flush_interval");
min_flush_threshold = tsdb.config.getInt("tsd.storage.compaction.min_flush_threshold");
max_concurrent_flushes = tsdb.config.getInt("tsd.storage.compaction.max_concurrent_flushes");
flush_speed = tsdb.config.getInt("tsd.storage.compaction.flush_speed");
if (tsdb.config.enable_compactions()) {
startCompactionThread();
}
}
@Override
public int size() {
return size.get();
}
public void add(final byte[] row) {
if (super.put(row, Boolean.TRUE) == null) {
size.incrementAndGet(); // We added a new entry, count it.
}
}
/**
* Forces a flush of all the old entries in the compaction queue.
* @return A deferred that will be called back once everything has been
* flushed (or something failed, in which case the deferred will carry the
* exception). In case of success, the kind of object returned is
* unspecified.
*/
public Deferred<ArrayList<Object>> flush() {
final int size = size();
if (size > 0) {
LOG.info("Flushing all old outstanding rows out of " + size + " rows");
}
final long now = System.currentTimeMillis();
return flush(now / 1000 - Const.MAX_TIMESPAN - 1, Integer.MAX_VALUE);
}
/**
* Collects the stats and metrics tracked by this instance.
* @param collector The collector to use.
*/
void collectStats(final StatsCollector collector) {
collector.record("compaction.count", compaction_count);
collector.record("compaction.duplicates", duplicates_same, "type=identical");
collector.record("compaction.duplicates", duplicates_different, "type=variant");
if (!tsdb.config.enable_compactions()) {
return;
}
// The remaining stats only make sense with compactions enabled.
collector.record("compaction.queue.size", size);
collector.record("compaction.errors", handle_read_error.errors, "rpc=read");
collector.record("compaction.errors", handle_write_error.errors, "rpc=put");
collector.record("compaction.errors", handle_delete_error.errors,
"rpc=delete");
collector.record("compaction.writes", written_cells);
collector.record("compaction.deletes", deleted_cells);
}
/**
* Flushes all the rows in the compaction queue older than the cutoff time.
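* <p>
* For reference, the public {@link #flush()} calls this with a cutoff of
* {@code System.currentTimeMillis() / 1000 - Const.MAX_TIMESPAN - 1}, i.e. only
* rows whose base time is more than one full {@code MAX_TIMESPAN} old are flushed.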
* @param cut_off A UNIX timestamp in seconds (unsigned 32-bit integer).
* @param maxflushes How many rows to flush off the queue at once.
* This integer is expected to be strictly positive.
* @return A deferred that will be called back once everything has been
* flushed.
*/
private Deferred<ArrayList<Object>> flush(final long cut_off, int maxflushes) {
assert maxflushes > 0: "maxflushes must be > 0, but I got " + maxflushes;
// We can't possibly flush more entries than size().
maxflushes = Math.min(maxflushes, size());
if (maxflushes == 0) { // Because size() might be 0.
return Deferred.fromResult(new ArrayList<Object>(0));
}
final ArrayList<Deferred<Object>> ds =
new ArrayList<Deferred<Object>>(Math.min(maxflushes, max_concurrent_flushes));
int nflushes = 0;
int seed = (int) (System.nanoTime() % 3);
for (final byte[] row : this.keySet()) {
if (maxflushes == 0) {
break;
}
if (seed == row.hashCode() % 3) {
continue;
}
final long base_time = Bytes.getUnsignedInt(row,
Const.SALT_WIDTH() + metric_width);
if (base_time > cut_off) {
break;
} else if (nflushes == max_concurrent_flushes) {
// We kicked off the compaction of too many rows already, let's wait
// until they're done before kicking off more.
break;
}
// You'd think that it would be faster to grab an iterator on the map
// and then call remove() on the iterator to "unlink" the element
// directly from where the iterator is at, but no, the JDK implements
// it by calling remove(key) so it has to lookup the key again anyway.
if (super.remove(row) == null) { // We didn't remove anything.
continue; // So someone else already took care of this entry.
}
nflushes++;
maxflushes--;
size.decrementAndGet();
ds.add(tsdb.get(row).addCallbacks(compactcb, handle_read_error));
}
final Deferred<ArrayList<Object>> group = Deferred.group(ds);
if (nflushes == max_concurrent_flushes && maxflushes > 0) {
// We're not done yet. Once this group of flushes completes, we need
// to kick off more.
tsdb.getClient().flush(); // Speed up this batch by telling the client to flush.
final int maxflushez = maxflushes; // Make it final for closure.
final class FlushMoreCB implements Callback<Deferred<ArrayList<Object>>,
ArrayList<Object>> {
@Override
public Deferred<ArrayList<Object>> call(final ArrayList<Object> arg) {
return flush(cut_off, maxflushez);
}
@Override
public String toString() {
return "Continue flushing with cut_off=" + cut_off
+ ", maxflushes=" + maxflushez;
}
}
group.addCallbackDeferring(new FlushMoreCB());
}
return group;
}
private final CompactCB compactcb = new CompactCB();
/**
* Callback to compact a row once it's been read.
* <p>
* This is used once the "get" completes, to actually compact the row and
* write back the compacted version.
*/
private final class CompactCB implements Callback<Object, ArrayList<KeyValue>> {
@Override
public Object call(final ArrayList<KeyValue> row) {
return compact(row, null);
}
@Override
public String toString() {
return "compact";
}
}
/**
* Compacts a row into a single {@link KeyValue}.
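* <p>
* An illustrative call (the variable names here are hypothetical):
* <pre>{@code
*   final List<Annotation> notes = new ArrayList<Annotation>();
*   final KeyValue kv = compaction_queue.compact(row_cells, notes);
* }</pre>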
* @param row The row containing all the KVs to compact.
* Must contain at least one element.
* @param annotations supplied list which will have all encountered
* annotations added to it.
* @return A compacted version of this row.
*/
KeyValue compact(final ArrayList<KeyValue> row,
List<Annotation> annotations) {
final KeyValue[] compacted = { null };
compact(row, compacted, annotations);
return compacted[0];
}
/**
* Maintains state for a single compaction; exists to break the steps down into manageable
* pieces without having to worry about returning multiple values and passing many parameters
* around.
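* <p>
* The steps, in the order {@link #compact()} runs them, are
* {@link #buildHeapProcessAnnotations}, {@link #mergeDatapoints},
* {@link #buildCompactedColumn} and {@link #updateDeletesCheckForWrite}.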
*
* @since 2.1
*/
private class Compaction {
// parameters for the compaction
private final ArrayList<KeyValue> row;
private final KeyValue[] compacted;
private final List<Annotation> annotations;
private final int nkvs;
// keeps a list of KeyValues to be deleted
private final List<KeyValue> to_delete;
// heap of columns, ordered by increasing timestamp
private PriorityQueue<ColumnDatapointIterator> heap;
// true if any ms-resolution datapoints have been seen in the row
private boolean ms_in_row;
// true if any s-resolution datapoints have been seen in the row
private boolean s_in_row;
// KeyValue with the longest qualifier among the datapoint columns, used to optimize
// checking if the compacted qualifier already exists.
private KeyValue longest;
// the latest append column. If set then we don't want to re-write the row
// and if we only had a single column with a single value, we return this.
private KeyValue last_append_column;
public Compaction(ArrayList<KeyValue> row, KeyValue[] compacted, List<Annotation> annotations) {
nkvs = row.size();
this.row = row;
this.compacted = compacted;
this.annotations = annotations;
to_delete = new ArrayList<KeyValue>(nkvs);
}
/**
* Check if there are no fixups or merges required. This will be the case when:
* <ul>
* <li>there are no columns in the heap</li>
* <li>there is only one single-valued column needing no fixups</li>
* </ul>
*
* @return true if we know no additional work is required
*/
private boolean noMergesOrFixups() {
switch (heap.size()) {
case 0:
// no data points, nothing to do
return true;
case 1:
// only one column, check to see if it needs fixups
ColumnDatapointIterator col = heap.peek();
// either a 2-byte qualifier or one 4-byte ms qualifier, and no fixups required
return (col.qualifier.length == 2 || (col.qualifier.length == 4
&& Internal.inMilliseconds(col.qualifier))) && !col.needsFixup();
default:
// more than one column, need to merge
return false;
}
}
/**
* Perform the compaction.
*
* @return A {@link Deferred} if the compaction process required a write
* to HBase, otherwise {@code null}.
*/
public Deferred<Object> compact() {
// no columns in row, nothing to do
if (nkvs == 0) {
return null;
}
// go through all the columns, process annotations, and build a heap of datapoint iterators
heap = new PriorityQueue<ColumnDatapointIterator>(nkvs);
int tot_values = buildHeapProcessAnnotations();
// if there are no datapoints or only one that needs no fixup, we are done
if (noMergesOrFixups()) {
// return the single non-annotation entry if requested
if (compacted != null && heap.size() == 1) {
compacted[0] = findFirstDatapointColumn();
}
return null;
}
// merge the datapoints, ordered by timestamp and removing duplicates
final ByteBufferList compacted_qual = new ByteBufferList(tot_values);
final ByteBufferList compacted_val = new ByteBufferList(tot_values);
compaction_count.incrementAndGet();
mergeDatapoints(compacted_qual, compacted_val);
// if we wound up with no data in the compacted column, we are done
if (compacted_qual.segmentCount() == 0) {
return null;
}
// build the compacted column
final KeyValue compact = buildCompactedColumn(compacted_qual, compacted_val);
final boolean write = updateDeletesCheckForWrite(compact);
if (compacted != null) { // Caller is interested in the compacted form.
compacted[0] = compact;
final long base_time = Bytes.getUnsignedInt(compact.key(),
Const.SALT_WIDTH() + metric_width);
final long cut_off = System.currentTimeMillis() / 1000
- Const.MAX_TIMESPAN - 1;
if (base_time > cut_off) { // If row is too recent...
return null; // ... Don't write back compacted.
}
}
// if compactions aren't enabled or there is nothing to write, we're done
if (!tsdb.config.enable_compactions() || (!write && to_delete.isEmpty())) {
return null;
}
final byte[] key = compact.key();
//LOG.debug("Compacting row " + Arrays.toString(key));
deleted_cells.addAndGet(to_delete.size()); // We're going to delete this.
if (write) {
written_cells.incrementAndGet();
Deferred<Object> deferred = tsdb.put(key, compact.qualifier(), compact.value());
if (!to_delete.isEmpty()) {
deferred = deferred.addCallbacks(new DeleteCompactedCB(to_delete), handle_write_error);
}
return deferred;
} else if (last_append_column == null) {
// We had nothing to write, because one of the cells is already the
// correctly compacted version, so we can go ahead and delete the
// individual cells directly.
new DeleteCompactedCB(to_delete).call(null);
return null;
} else {
return null;
}
}
/**
* Find the first datapoint column in a row. It may be an appended column.
*
* @return the first found datapoint column in the row, or null if none
*/
private KeyValue findFirstDatapointColumn() {
if (last_append_column != null) {
return last_append_column;
}
for (final KeyValue kv : row) {
if (isDatapoint(kv)) {
return kv;
}
}
return null;
}
/**
* Build a heap of columns containing datapoints. Assumes that non-datapoint columns are
* never merged. Adds datapoint columns to the list of columns to be deleted.
*
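* <p>
* Columns are dispatched on qualifier length parity, mirroring the code below:
* <pre>
*   even length                                        -> datapoints, pushed on the heap
*   odd length, Annotation.PREFIX()                    -> parsed into the annotations list
*   odd length, AppendDataPoints.APPEND_COLUMN_PREFIX  -> parsed, pushed on the heap
*   odd length, anything else                          -> logged and ignored
* </pre>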
* @return an estimate of the number of total values present, which may be high
*/
private int buildHeapProcessAnnotations() {
int tot_values = 0;
for (final KeyValue kv : row) {
byte[] qual = kv.qualifier();
int len = qual.length;
if ((len & 1) != 0) {
// process annotations and other extended formats
if (qual[0] == Annotation.PREFIX()) {
annotations.add(JSON.parseToObject(kv.value(), Annotation.class));
} else if (qual[0] == AppendDataPoints.APPEND_COLUMN_PREFIX){
final AppendDataPoints adp = new AppendDataPoints();
tot_values += adp.parseKeyValue(tsdb, kv).size();
last_append_column = new KeyValue(kv.key(), kv.family(),
adp.qualifier(), kv.timestamp(), adp.value());
if (longest == null ||
longest.qualifier().length < last_append_column.qualifier().length) {
longest = last_append_column;
}
final ColumnDatapointIterator col =
new ColumnDatapointIterator(last_append_column);
if (col.hasMoreData()) {
heap.add(col);
}
} else {
LOG.warn("Ignoring unexpected extended format type " + qual[0]);
}
continue;
}
// estimate number of points based on the size of the first entry
// in the column; if ms/sec datapoints are mixed, this will be
// incorrect, which will cost a reallocation/copy
final int entry_size = Internal.inMilliseconds(qual) ? 4 : 2;
tot_values += (len + entry_size - 1) / entry_size;
if (longest == null || longest.qualifier().length < kv.qualifier().length) {
longest = kv;
}
ColumnDatapointIterator col = new ColumnDatapointIterator(kv);
if (col.hasMoreData()) {
heap.add(col);
}
to_delete.add(kv);
}
return tot_values;
}
/**
* Process datapoints from the heap in order, merging into a sorted list. Handles duplicates
* by keeping the most recent value (based on HBase column timestamps); a duplicate with a
* different value throws an {@link IllegalDataException} unless
* {@code tsd.storage.fix_duplicates} is enabled, in which case it is discarded with a warning.
*
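* A schematic example (offsets and values are illustrative only):
* <pre>
*   heap pops (offset, value): (0, v1) (10, v2) (10, v2) (10, v3) (20, v4)
*   merged output:             (0, v1) (10, v2)                   (20, v4)
* </pre>
* The identical (10, v2) increments {@code duplicates_same}; the differing (10, v3)
* increments {@code duplicates_different} and is discarded (or triggers the exception).
*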
* @param compacted_qual qualifiers for sorted datapoints
* @param compacted_val values for sorted datapoints
*/
private void mergeDatapoints(ByteBufferList compacted_qual,
ByteBufferList compacted_val) {
int prevTs = -1;
while (!heap.isEmpty()) {
final ColumnDatapointIterator col = heap.remove();
final int ts = col.getTimestampOffsetMs();
if (ts == prevTs) {
// check to see if it is a complete duplicate, or if the value changed
final byte[] existingVal = compacted_val.getLastSegment();
final byte[] discardedVal = col.getCopyOfCurrentValue();
if (!Arrays.equals(existingVal, discardedVal)) {
duplicates_different.incrementAndGet();
if (!tsdb.config.fix_duplicates()) {
throw new IllegalDataException("Duplicate timestamp for key="
+ Arrays.toString(row.get(0).key()) + ", ms_offset=" + ts + ", older="
+ Arrays.toString(existingVal) + ", newer=" + Arrays.toString(discardedVal)
+ "; set tsd.storage.fix_duplicates=true to fix automatically or run Fsck");
}
LOG.warn("Duplicate timestamp for key=" + Arrays.toString(row.get(0).key())
+ ", ms_offset=" + ts + ", kept=" + Arrays.toString(existingVal) + ", discarded="
+ Arrays.toString(discardedVal));
} else {
duplicates_same.incrementAndGet();
}
} else {
prevTs = ts;
col.writeToBuffers(compacted_qual, compacted_val);
ms_in_row |= col.isMilliseconds();
s_in_row |= !col.isMilliseconds();
}
if (col.advance()) {
// there is still more data in this column, so add it back to the heap
heap.add(col);
}
}
}
/**
* Build the compacted column from the list of byte buffers that were
* merged together.
*
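* The resulting value layout (schematic):
* <pre>
*   [value 1][value 2]...[value N][1 metadata byte]    multi-value column
*   [single value]                                     single-value column, no metadata
* </pre>
* where the metadata byte currently only carries {@link Const#MS_MIXED_COMPACT} when
* second- and millisecond-resolution datapoints are mixed in the same row.
*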
* @param compacted_qual list of merged qualifiers
* @param compacted_val list of merged values
*
* @return {@link KeyValue} instance for the compacted column
*/
private KeyValue buildCompactedColumn(ByteBufferList compacted_qual,
ByteBufferList compacted_val) {
// metadata is a single byte for a multi-value column, otherwise nothing
final int metadata_length = compacted_val.segmentCount() > 1 ? 1 : 0;
final byte[] cq = compacted_qual.toBytes(0);
final byte[] cv = compacted_val.toBytes(metadata_length);
// add the metadata flag, which right now only includes whether we mix s/ms datapoints
if (metadata_length > 0) {
byte metadata_flag = 0;
if (ms_in_row && s_in_row) {
metadata_flag |= Const.MS_MIXED_COMPACT;
}
cv[cv.length - 1] = metadata_flag;
}
final KeyValue first = row.get(0);
return new KeyValue(first.key(), first.family(), cq, cv);
}
/**
* Make sure we don't delete the cell that already holds the result of the compaction, so we
* remove the compacted value from the list of cells to delete if it is there.
* Also, if one or more columns were appends then we don't want to mess with
* the row for now.
*
* @param compact the compacted column
* @return true if we need to write the compacted value
*/
private boolean updateDeletesCheckForWrite(KeyValue compact) {
if (last_append_column != null) {
// TODO appends are involved so we may want to squash dps into the
// append or vice-versa.
return false;
}
// if the longest entry isn't as long as the compacted one, obviously the compacted
// one can't have already existed
if (longest != null && longest.qualifier().length >= compact.qualifier().length) {
final Iterator<KeyValue> deleteIterator = to_delete.iterator();
while (deleteIterator.hasNext()) {
final KeyValue cur = deleteIterator.next();
if (Arrays.equals(cur.qualifier(), compact.qualifier())) {
// the compacted column already existed, so remove it from the list to delete
deleteIterator.remove();
// if the key and value are the same, we don't need to write it
return !Arrays.equals(cur.value(), compact.value());
}
}
}
return true;
}
}
/**
* Check if a particular column is a datapoint column (as opposed to annotation or other
* extended formats).
*
* @param kv column to check
* @return true if the column represents one or more datapoints
*/
protected static boolean isDatapoint(KeyValue kv) {
return (kv.qualifier().length & 1) == 0;
}
/**
* Compacts a row into a single {@link KeyValue}.
* <p>
* If the {@code row} is empty, this function does literally nothing.
* If {@code compacted} is not {@code null}, then the compacted form of this
* {@code row} will be stored in {@code compacted[0]}. Obviously, if the
* {@code row} contains a single cell, then that cell is the compacted form.
* Otherwise the compaction process takes place.
* @param row The row containing all the KVs to compact. Must be non-null.
* @param compacted If non-null, the first item in the array will be set to
* a {@link KeyValue} containing the compacted form of this row.
* If non-null, we will also not write the compacted form back to HBase
* unless the timestamp in the row key is old enough.
* @param annotations supplied list which will have all encountered
* annotations added to it.
* @return A {@link Deferred} if the compaction process required a write
* to HBase, otherwise {@code null}.
*/
Deferred<Object> compact(final ArrayList<KeyValue> row,
final KeyValue[] compacted,
List<Annotation> annotations) {
return new Compaction(row, compacted, annotations).compact();
}
/**
* Callback to delete the original cells of a row that's been successfully compacted.
*/
private final class DeleteCompactedCB implements Callback<Object, Object> {
/** What we're going to delete. */
private final byte[] key;
private final byte[][] qualifiers;
public DeleteCompactedCB(final List<KeyValue> cells) {
final KeyValue first = cells.get(0);
key = first.key();
qualifiers = new byte[cells.size()][];
for (int i = 0; i < qualifiers.length; i++) {
qualifiers[i] = cells.get(i).qualifier();
}
}
@Override
public Object call(final Object arg) {
return tsdb.delete(key, qualifiers).addErrback(handle_delete_error);
}
@Override
public String toString() {
return "delete compacted cells";
}
}
private final HandleErrorCB handle_read_error = new HandleErrorCB("read");
private final HandleErrorCB handle_write_error = new HandleErrorCB("write");
private final HandleErrorCB handle_delete_error = new HandleErrorCB("delete");
/**
* Callback to handle exceptions during the compaction process.
*/
private final class HandleErrorCB implements Callback<Object, Exception> {
private volatile int errors;
private final String what;
/**
* Constructor.
* @param what String describing what kind of operation (e.g. "read").
*/
public HandleErrorCB(final String what) {
this.what = what;
}
@Override
public Object call(final Exception e) {
if (e instanceof PleaseThrottleException) { // HBase isn't keeping up.
final HBaseRpc rpc = ((PleaseThrottleException) e).getFailedRpc();
if (rpc instanceof HBaseRpc.HasKey) {
// We failed to compact this row. Whether it's because of a failed
// get, put or delete, we should re-schedule this row for a future
// compaction.
add(((HBaseRpc.HasKey) rpc).key());
return Boolean.TRUE; // We handled it, so don't return an exception.
} else { // Should never get in this clause.
LOG.error("WTF? Cannot retry this RPC, and this shouldn't happen: "
+ rpc);
}
}
// `++' is not atomic but doesn't matter if we miss some increments.
if (++errors % 100 == 1) { // Basic rate-limiting to not flood logs.
LOG.error("Failed to " + what + " a row to re-compact", e);
}
return e;
}
@Override
public String toString() {
return "handle " + what + " error";
}
}
static final long serialVersionUID = 1307386642;
/** Starts a compaction thread. Only one such thread is needed. */
private void startCompactionThread() {
final Thrd thread = new Thrd();
thread.setDaemon(true);
thread.start();
}
/**
* Background thread to trigger periodic compactions.
*/
final class Thrd extends Thread {
public Thrd() {
super("CompactionThread");
}
@Override
public void run() {
while (true) {
try {
final int size = size();
// Flush if we have too many rows to recompact.
// Note that we might not be able to actually
// flush anything if the rows aren't old enough.
if (size > min_flush_threshold) {
// How much should we flush during this iteration? This scheme is
// adaptive and flushes at a rate that is proportional to the size
// of the queue, so we flush more aggressively if the queue is big.
// Let's suppose MAX_TIMESPAN = 1h. We have `size' rows to compact,
// and we better compact them all in less than 1h, otherwise we're
// going to "fall behind" when after a new hour starts (as we'll be
// inserting a ton of new rows then). So slice MAX_TIMESPAN using
// FLUSH_INTERVAL to compute what fraction of `size' we need to
// flush at each iteration. Note that `size' will usually account
// for many rows that can't be flushed yet (not old enough) so we're
// overshooting a bit (flushing more aggressively than necessary).
// This isn't a problem at all. The only thing that matters is that
// the rate at which we flush stuff is proportional to how much work
// is sitting in the queue. The multiplicative factor FLUSH_SPEED
// is added to make flushing even faster than we need. For example, if
// FLUSH_SPEED is 2, then instead of taking 1h to flush what we have
// for the previous hour, we'll take only 30m. This is desirable so
// that we evict old entries from the queue a bit faster.
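// Illustrative numbers only: with size=10000, flush_interval=10s,
// flush_speed=2 and MAX_TIMESPAN=3600s, this works out to
// max(min_flush_threshold, 10000 * 10 * 2 / 3600) = 55 rows per wake-up
// (assuming min_flush_threshold is below that).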
final int maxflushes = Math.max(min_flush_threshold,
size * flush_interval * flush_speed / Const.MAX_TIMESPAN);
final long now = System.currentTimeMillis();
flush(now / 1000 - Const.MAX_TIMESPAN - 1, maxflushes);
if (LOG.isDebugEnabled()) {
final int newsize = size();
LOG.debug("flush() took " + (System.currentTimeMillis() - now)
+ "ms, new queue size=" + newsize
+ " (" + (newsize - size) + ')');
}
}
} catch (Exception e) {
LOG.error("Uncaught exception in compaction thread", e);
} catch (OutOfMemoryError e) {
// Let's free up some memory by throwing away the compaction queue.
final int sz = size.get();
CompactionQueue.super.clear();
size.set(0);
LOG.error("Discarded the compaction queue, size=" + sz, e);
} catch (Throwable e) {
LOG.error("Uncaught *Throwable* in compaction thread", e);
// Catching this kind of error is totally unexpected and is really
// bad. If we do nothing and let this thread die, we'll run out of
// memory as new entries are added to the queue. We could always
// commit suicide, but it's kind of drastic and nothing else in the
// code does this. If `enable_compactions' wasn't final, we could
// always set it to false, but that's not an option. So in order to
// try to get a fresh start, let this compaction thread terminate
// and spin off a new one instead.
try {
Thread.sleep(1000); // Avoid busy looping creating new threads.
} catch (InterruptedException i) {
LOG.error("Compaction thread interrupted in error handling", i);
return; // Don't flush, we're truly hopeless.
}
startCompactionThread();
return;
}
try {
Thread.sleep(flush_interval * 1000);
} catch (InterruptedException e) {
LOG.error("Compaction thread interrupted, doing one last flush", e);
flush();
return;
}
}
}
}
/**
* Helper to sort the byte arrays in the compaction queue.
* <p>
* This comparator sorts things by timestamp first, so that we can find
* all rows of the same age at once.
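* <p>
* Schematically (salt and tag bytes elided), keys {@code [metricA][T][tags]} and
* {@code [metricB][T][tags]} that share the base time {@code T} end up in the same
* timestamp group, even though their metric IDs differ.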
*/
private static final class Cmp implements Comparator<byte[]> {
/** The offset in the row key at which the timestamp starts. */
private final short timestamp_pos;
public Cmp(final TSDB tsdb) {
timestamp_pos = (short) (Const.SALT_WIDTH() + tsdb.metrics.width());
}
@Override
public int compare(final byte[] a, final byte[] b) {
final int c = Bytes.memcmp(a, b, timestamp_pos, Const.TIMESTAMP_BYTES);
// If the timestamps are equal, sort according to the entire row key.
return c != 0 ? c : Bytes.memcmp(a, b);
}
}
}