// This file is part of OpenTSDB.
// Copyright (C) 2014 The OpenTSDB Authors.
//
// This program is free software: you can redistribute it and/or modify it
// under the terms of the GNU Lesser General Public License as published by
// the Free Software Foundation, either version 2.1 of the License, or (at your
// option) any later version. This program is distributed in the hope that it
// will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
// General Public License for more details. You should have received a copy
// of the GNU Lesser General Public License along with this program. If not,
// see <http://www.gnu.org/licenses/>.
package net.opentsdb.tools;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicLong;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.hbase.async.Bytes;
import org.hbase.async.Bytes.ByteMap;
import org.hbase.async.DeleteRequest;
import org.hbase.async.KeyValue;
import org.hbase.async.PutRequest;
import org.hbase.async.Scanner;
import com.stumbleupon.async.Deferred;
import net.opentsdb.core.AppendDataPoints;
import net.opentsdb.core.Const;
import net.opentsdb.core.IllegalDataException;
import net.opentsdb.core.Internal;
import net.opentsdb.core.Internal.Cell;
import net.opentsdb.core.Query;
import net.opentsdb.core.RowKey;
import net.opentsdb.core.TSDB;
import net.opentsdb.core.Tags;
import net.opentsdb.meta.Annotation;
import net.opentsdb.uid.NoSuchUniqueId;
import net.opentsdb.uid.UniqueId;
import net.opentsdb.utils.Config;
/**
* Tool to look for and fix corrupted data in a TSDB. FSCK can be used to
* recover space, resolve duplicate data points, remove orphaned time series and
* remove data errors. If one or more command line queries are provided, only
* rows matching the query will be FSCK'd. Alternatively a full table scan can
* be performed.
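* <p>
* For example, from the command line (the metric and tag in the second
* invocation are illustrative; see {@link FsckOptions} for the full flag list):
* <pre>{@code
* # scan and repair the entire data table
* tsdb fsck --full-scan --fix
*
* # fsck only the rows matching a query (same format as the 'query' command)
* tsdb fsck --fix 1h-ago now sum sys.cpu.user host=web01
* }</pre>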
* <p>
* Scanning is done in three stages:
* 1) Each row key is parsed to make sure it's a valid OpenTSDB row. If it isn't
* then the user can decide to delete it. If one or more UIDs cannot be resolved
* to names (metric or tags) then the user can decide to purge it.
* 2) All key value pairs in a row are parsed to determine the type of object.
* If it's a single data point, it's added to a tree map based on the data point
* timestamp. If it's a compacted column, the data points are exploded and
* added to the data point map. Any other object is either purged, if the user
* asked for it, or simply ignored if it's a known type (e.g. annotations).
* 3) If any data points were found, we iterate over each one looking for
* duplicates, malformed encodings or potential value-length-encoding savings.
* At the end, if told to, FSCK will fix up the values and optionally write a
* new compacted cell, deleting all of the old values.
* <p>
* A number of metrics are tracked during the run and a report will be dumped
* to the log at the end.
* <p>
* When iterating over the datapoints in step 3, the workers will usually compile
* a set of compacted qualifiers and values so that at the end, if necessary, a
* new compacted cell can be written and the old cells purged.
* <p>
* Note: some fields are package private so that we can easily unit test.
*/
final class Fsck {
private static final Logger LOG = LoggerFactory.getLogger(Fsck.class);
/** The TSDB to use for access */
private final TSDB tsdb;
/** Options to use while iterating over rows */
private final FsckOptions options;
/** Counters incremented during processing. They have to be atomic counters
* as we may be running multiple fsck threads. */
final AtomicLong kvs_processed = new AtomicLong();
final AtomicLong rows_processed = new AtomicLong();
final AtomicLong valid_datapoints = new AtomicLong();
final AtomicLong annotations = new AtomicLong();
final AtomicLong append_dps = new AtomicLong();
final AtomicLong append_dps_fixed = new AtomicLong();
final AtomicLong bad_key = new AtomicLong();
final AtomicLong bad_key_fixed = new AtomicLong();
final AtomicLong duplicates = new AtomicLong();
final AtomicLong duplicates_fixed = new AtomicLong();
final AtomicLong duplicates_fixed_comp = new AtomicLong();
final AtomicLong orphans = new AtomicLong();
final AtomicLong orphans_fixed = new AtomicLong();
final AtomicLong future = new AtomicLong();
final AtomicLong unknown = new AtomicLong();
final AtomicLong unknown_fixed = new AtomicLong();
final AtomicLong bad_values = new AtomicLong();
final AtomicLong bad_values_deleted = new AtomicLong();
final AtomicLong value_encoding = new AtomicLong();
final AtomicLong value_encoding_fixed = new AtomicLong();
final AtomicLong fixable_compacted_columns = new AtomicLong();
final AtomicLong bad_compacted_columns = new AtomicLong();
final AtomicLong bad_compacted_columns_deleted = new AtomicLong();
final AtomicLong vle = new AtomicLong();
final AtomicLong vle_bytes = new AtomicLong();
final AtomicLong vle_fixed = new AtomicLong();
/** Length of the salt + metric + timestamp prefix for key validation */
private int key_prefix_length = Const.SALT_WIDTH() +
TSDB.metrics_width() + Const.TIMESTAMP_BYTES;
/** Length of a tagk + tagv pair for key validation */
private int key_tags_length = TSDB.tagk_width() + TSDB.tagv_width();
/** How often to report progress */
private static long report_rows = 10000;
/**
* Default Ctor
* @param tsdb The TSDB to use for access
* @param options Options to use when iterating over rows
*/
public Fsck(final TSDB tsdb, final FsckOptions options) {
this.tsdb = tsdb;
this.options = options;
}
/**
* Fetches the max metric ID and naively splits the data table amongst worker
* threads. By default we execute cores * 2 threads but the user can
* specify more or fewer.
* @throws Exception If something goes pear shaped.
*/
public void runFullTable() throws Exception {
LOG.info("Starting full table scan");
final long start_time = System.currentTimeMillis() / 1000;
final int workers = options.threads() > 0 ? options.threads() :
Runtime.getRuntime().availableProcessors() * 2;
final List<Scanner> scanners = CliUtils.getDataTableScanners(tsdb, workers);
LOG.info("Spooling up [" + scanners.size() + "] worker threads");
final List<Thread> threads = new ArrayList<Thread>(scanners.size());
int i = 0;
for (final Scanner scanner : scanners) {
final FsckWorker worker = new FsckWorker(scanner, i++);
worker.setName("Fsck #" + i);
worker.start();
threads.add(worker);
}
final Thread reporter = new ProgressReporter();
reporter.start();
for (final Thread thread : threads) {
thread.join();
LOG.info("Thread [" + thread + "] Finished");
}
reporter.interrupt();
logResults();
final long duration = (System.currentTimeMillis() / 1000) - start_time;
LOG.info("Completed fsck in [" + duration + "] seconds");
}
/**
* Scans the rows matching one or more standard queries. An aggregator is still
* required though it's ignored.
* @param queries The queries to execute
* @throws Exception If something goes pear shaped.
*/
public void runQueries(final List<Query> queries) throws Exception {
final long start_time = System.currentTimeMillis() / 1000;
// TODO - threadify it. We *could* have hundreds of queries and we don't
// want to create that many threads. For now we'll just execute each one
// serially
final Thread reporter = new ProgressReporter();
reporter.start();
for (final Query query : queries) {
final List<Scanner> scanners = Internal.getScanners(query);
final List<Thread> threads = new ArrayList<Thread>(scanners.size());
int i = 0;
for (final Scanner scanner : scanners) {
final FsckWorker worker = new FsckWorker(scanner, i++);
worker.setName("Fsck #" + i);
worker.start();
threads.add(worker);
}
for (final Thread thread : threads) {
thread.join();
LOG.info("Thread [" + thread + "] Finished");
}
}
reporter.interrupt();
logResults();
final long duration = (System.currentTimeMillis() / 1000) - start_time;
LOG.info("Completed fsck in [" + duration + "] seconds");
}
/** @return The total number of errors detected during the run */
long totalErrors() {
return bad_key.get() + duplicates.get() + orphans.get() + unknown.get() +
bad_values.get() + bad_compacted_columns.get() +
fixable_compacted_columns.get() + value_encoding.get();
}
/** @return The total number of errors fixed during the run */
long totalFixed() {
return bad_key_fixed.get() + duplicates_fixed.get() + orphans_fixed.get() +
unknown_fixed.get() + value_encoding_fixed.get() +
bad_values_deleted.get();
}
/** @return The total number of errors that could be (or may have been) fixed */
long correctable() {
return bad_key.get() + duplicates.get() + orphans.get() + unknown.get() +
bad_values.get() + bad_compacted_columns.get() +
fixable_compacted_columns.get() + value_encoding.get();
}
/**
* A worker thread that takes a query or a chunk of the main data table and
* performs the actual FSCK process.
*/
final class FsckWorker extends Thread {
/** Id of the thread this worker belongs to */
final int thread_id;
/** Optional query to execute instead of a full table scan */
final Query query;
/** The scanner to use for iterating over a chunk of the table */
final Scanner scanner;
/** Set of TSUIDs this worker has seen. Used to avoid UID resolution for
* previously processed row keys */
final Set<String> tsuids = new HashSet<String>();
/** Shared flags and values for compiling a compacted column */
byte[] compact_qualifier = null;
int qualifier_index = 0;
byte[] compact_value = null;
int value_index = 0;
boolean compact_row = false;
int qualifier_bytes = 0;
int value_bytes = 0;
/**
* Ctor for running a worker on a chunk of the data table
* @param scanner The scanner to use for iterating
* @param thread_id Id assigned to this worker, used for logging
*/
FsckWorker(final Scanner scanner, final int thread_id) {
this.scanner = scanner;
this.thread_id = thread_id;
query = null;
}
/**
* Determines the type of scanner to use, i.e. a specific query scanner or
* for a portion of the whole table. It then performs the actual scan,
* compiling a list of data points and fixing/compacting them when
* appropriate.
*/
public void run() {
// store every data point for the row in here
final TreeMap<Long, ArrayList<DP>> datapoints =
new TreeMap<Long, ArrayList<DP>>();
byte[] last_key = null;
ArrayList<ArrayList<KeyValue>> rows;
try {
while ((rows = scanner.nextRows().joinUninterruptibly()) != null) {
// keep in mind that with annotations and millisecond values, a row
// can now have more than 4096 key values, the default limit for a scanner.
// Since we don't know how many values may actually be in a row, we
// don't want to set the KV limit too high. Instead we'll just keep
// working through the sets until we hit a different row key, then
// process all of the data points. It puts more of a burden on fsck
// memory but we should be able to keep ~3M data points in memory
// without a problem.
for (final ArrayList<KeyValue> row : rows) {
if (last_key != null && Bytes.memcmp(row.get(0).key(), last_key) != 0) {
// new row so flush the old one
rows_processed.getAndIncrement();
if (!datapoints.isEmpty()) {
compact_qualifier = new byte[qualifier_bytes];
compact_value = new byte[value_bytes+1];
fsckDataPoints(datapoints);
resetCompaction();
datapoints.clear();
}
}
last_key = row.get(0).key();
fsckRow(row, datapoints);
}
}
// handle the last row
if (!datapoints.isEmpty()) {
rows_processed.getAndIncrement();
compact_qualifier = new byte[qualifier_bytes];
compact_value = new byte[value_bytes+1];
fsckDataPoints(datapoints);
}
} catch (Exception e) {
LOG.error("Shouldn't be here", e);
}
}
/**
* Parses the row of KeyValues. First it validates the row key, then parses
* each KeyValue to determine what kind of object it is. Data points are
* stored in the tree map and non-data point columns are handled per the
* option flags
* @param row The row of data to parse
* @param datapoints The map of datapoints to append to.
* @throws Exception If something goes pear shaped.
*/
private void fsckRow(final ArrayList<KeyValue> row,
final TreeMap<Long, ArrayList<DP>> datapoints) throws Exception {
// The data table should contain only rows with a metric, timestamp and
// one or more tag pairs. Future versions may use different prefixes or
// key formats but for now, we can safely delete any rows with invalid
// keys. This may check the same row key multiple times, which is fine as
// it keeps data points from invalid rows out of the dp map.
if (!fsckKey(row.get(0).key())) {
return;
}
final long base_time = Bytes.getUnsignedInt(row.get(0).key(),
Const.SALT_WIDTH() + TSDB.metrics_width());
for (final KeyValue kv : row) {
kvs_processed.getAndIncrement();
// these are not final as they may be modified when fixing is enabled
byte[] value = kv.value();
byte[] qual = kv.qualifier();
// all qualifiers must be at least 2 bytes long, the length of a single
// second-precision data point
if (qual.length < 2) {
unknown.getAndIncrement();
LOG.error("Invalid qualifier, must be at least 2 bytes.\n\t" + kv);
if (options.fix() && options.deleteUnknownColumns()) {
final DeleteRequest delete = new DeleteRequest(tsdb.dataTable(), kv);
tsdb.getClient().delete(delete);
unknown_fixed.getAndIncrement();
}
continue;
}
// All data point columns have an even number of bytes, so if we find
// one that has an odd length, it could be an OpenTSDB object or it
// could be junk that made it into the table.
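// (For reference: a single second-precision point uses a 2 byte qualifier,
// 12 bits of offset from the row base time plus 4 flag bits, while a
// millisecond point uses 4 bytes starting with an 0xF nibble. Compaction
// concatenates these, so data point qualifiers are always even in length.)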
if (qual.length % 2 != 0) {
// If this test fails, the column is not a TSDB object such as an
// annotation or blob. Future versions may be able to compact TSDB
// objects so that their qualifier would be of a different length, but
// for now we'll consider it an error.
if (qual.length != 3 && qual.length != 5) {
unknown.getAndIncrement();
LOG.error("Unknown qualifier, must be 2, 3, 5 or an even number " +
"of bytes.\n\t" + kv);
if (options.fix() && options.deleteUnknownColumns()) {
final DeleteRequest delete = new DeleteRequest(tsdb.dataTable(), kv);
tsdb.getClient().delete(delete);
unknown_fixed.getAndIncrement();
}
continue;
}
// TODO - create a list of TSDB objects and fsck them. Maybe a plugin
// or interface.
// TODO - perform validation of the annotation
if (qual[0] == Annotation.PREFIX()) {
annotations.getAndIncrement();
continue;
} else if (qual[0] == AppendDataPoints.APPEND_COLUMN_PREFIX) {
append_dps.getAndIncrement();
try {
final AppendDataPoints adps = new AppendDataPoints();
adps.parseKeyValue(tsdb, kv);
if (adps.repairedDeferred() != null) {
append_dps_fixed.incrementAndGet();
}
} catch (RuntimeException e) {
LOG.error("Unexpected exception processing append data point: " + kv, e);
}
continue;
}
LOG.warn("Found an object possibly from a future version of OpenTSDB\n\t"
+ kv);
future.getAndIncrement();
continue;
}
// This is (hopefully) a compacted column with multiple data points. It
// could have two points with second qualifiers or multiple points with
// a mix of second and millisecond qualifiers
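// For example, two second-precision points compact to a 4 byte qualifier
// (their two 2 byte qualifiers concatenated) and a value holding both encoded
// values plus a trailing flag byte: 0, or 1 when seconds and milliseconds mix.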
if (qual.length == 4 && !Internal.inMilliseconds(qual[0])
|| qual.length > 4) {
if (value[value.length - 1] > Const.MS_MIXED_COMPACT) {
// TODO - figure out a way to fix these. Maybe lookup a row before
// or after and try parsing this for values. If the values are
// somewhat close to the others, then we could just set the last
// byte. Otherwise it could be a bad compaction and we'd need to
// toss it.
bad_compacted_columns.getAndIncrement();
LOG.error("The last byte of a compacted column should be 0 or 1. Either"
+ " this value is corrupted or it was written by a"
+ " future version of OpenTSDB.\n\t" + kv);
continue;
}
// add every cell in the compacted column to the data point tree so
// that we can scan for duplicate timestamps
try {
final ArrayList<Cell> cells = Internal.extractDataPoints(kv);
// the extractDataPoints() method will automatically fix up some
// issues such as setting proper lengths on floats and sorting the
// cells to be in order. Rather than reproduce the extraction code or
// add another method, we can just recompile the compacted qualifier
// as we run through it. If the new one is different (indicating a fix)
// then we'll replace it later on.
final byte[] recompacted_qualifier = new byte[kv.qualifier().length];
int qualifier_index = 0;
for (final Cell cell : cells) {
final long ts = cell.timestamp(base_time);
ArrayList<DP> dps = datapoints.get(ts);
if (dps == null) {
dps = new ArrayList<DP>(1);
datapoints.put(ts, dps);
}
dps.add(new DP(kv, cell));
qualifier_bytes += cell.qualifier().length;
value_bytes += cell.value().length;
System.arraycopy(cell.qualifier(), 0, recompacted_qualifier,
qualifier_index, cell.qualifier().length);
qualifier_index += cell.qualifier().length;
}
if (Bytes.memcmp(recompacted_qualifier, kv.qualifier()) != 0) {
LOG.error("Compacted column was out of order or requires a "
+ "fixup: " + kv);
fixable_compacted_columns.getAndIncrement();
}
compact_row = true;
} catch (IllegalDataException e) {
bad_compacted_columns.getAndIncrement();
LOG.error(e.getMessage());
if (options.fix() && options.deleteBadCompacts()) {
final DeleteRequest delete = new DeleteRequest(tsdb.dataTable(), kv);
tsdb.getClient().delete(delete);
bad_compacted_columns_deleted.getAndIncrement();
}
}
continue;
}
// at this point we *should* be dealing with a single data point encoded
// in seconds or milliseconds.
final long timestamp =
Internal.getTimestampFromQualifier(qual, base_time);
ArrayList<DP> dps = datapoints.get(timestamp);
if (dps == null) {
dps = new ArrayList<DP>(1);
datapoints.put(timestamp, dps);
}
dps.add(new DP(kv));
qualifier_bytes += kv.qualifier().length;
value_bytes += kv.value().length;
}
}
/**
* Validates the row key. It must match the format
* {@code <metric><timestamp><tagpair>[...<tagpair>]}. If it doesn't, then
* the row is considered an error. If the UIDs in a row key do not resolve
* to a name, then the row is considered an orphan and the values contained
* therein are NOT fsck'd. Also, if the TSUID in the row key has been seen
* before, then we don't re-resolve the UIDs. Saves a bit of CPU time.
* NOTE: We do not currently validate the timestamp in the row key. This
* would be a good TODO.
* NOTE: Global annotations are of the format {@code <metric=0><timestamp>}
* but fsck will not scan over those rows. Full table scans start at metric
* 1 and queries must match a valid name.
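* <p>
* For example, with the default 3 byte UIDs, a 4 byte base timestamp and no
* salt, a key with a single tag pair is 3 + 4 + 3 + 3 = 13 bytes:
* {@code <metric:3><base_time:4><tagk:3><tagv:3>}. Each additional tag pair
* adds another 6 bytes.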
* @param key The row key to validate
* @return True if the row key is valid, false if it is not
* @throws Exception If something goes pear shaped.
*/
private boolean fsckKey(final byte[] key) throws Exception {
if (key.length < key_prefix_length ||
(key.length - key_prefix_length) % key_tags_length != 0) {
LOG.error("Invalid row key.\n\tKey: " + UniqueId.uidToString(key));
bad_key.getAndIncrement();
if (options.fix() && options.deleteBadRows()) {
final DeleteRequest delete = new DeleteRequest(tsdb.dataTable(), key);
tsdb.getClient().delete(delete);
bad_key_fixed.getAndIncrement();
}
return false;
}
// Process the time series ID by resolving the UIDs to names if we haven't
// already seen this particular TSUID. Note that getTSUID accounts for salt
final byte[] tsuid = UniqueId.getTSUIDFromKey(key, TSDB.metrics_width(),
Const.TIMESTAMP_BYTES);
// a Set<String> can never contain a byte[], so cache the hex string form
final String tsuid_string = UniqueId.uidToString(tsuid);
if (!tsuids.contains(tsuid_string)) {
try {
RowKey.metricNameAsync(tsdb, key).joinUninterruptibly();
} catch (NoSuchUniqueId nsui) {
LOG.error("Unable to resolve the metric from the row key.\n\tKey: "
+ UniqueId.uidToString(key) + "\n\t" + nsui.getMessage());
orphans.getAndIncrement();
if (options.fix() && options.deleteOrphans()) {
final DeleteRequest delete = new DeleteRequest(tsdb.dataTable(), key);
tsdb.getClient().delete(delete);
orphans_fixed.getAndIncrement();
}
return false;
}
try {
Tags.resolveIds(tsdb, (ArrayList<byte[]>)
UniqueId.getTagPairsFromTSUID(tsuid));
} catch (NoSuchUniqueId nsui) {
LOG.error("Unable to resolve a tagk or tagv from the row key.\n\tKey: "
+ UniqueId.uidToString(key) + "\n\t" + nsui.getMessage());
orphans.getAndIncrement();
if (options.fix() && options.deleteOrphans()) {
final DeleteRequest delete = new DeleteRequest(tsdb.dataTable(), key);
tsdb.getClient().delete(delete);
orphans_fixed.getAndIncrement();
}
return false;
}
// remember this TSUID so later rows from the same series skip resolution
tsuids.add(tsuid_string);
}
return true;
}
/**
* Processes each data point parsed from the row. Validates the qualifiers
* and values, fixing what it can and deleting those it can't. At the end
* it may write a new compacted column and remove the others. Also handles
* duplicate data point resolution.
* @param datapoints The list of data points parsed from the row
* @throws Exception If something goes pear shaped.
*/
private void fsckDataPoints(final Map<Long, ArrayList<DP>> datapoints)
throws Exception {
// store a unique set of qualifier/value columns to help us later when
// we need to delete or update the row
final ByteMap<byte[]> unique_columns = new ByteMap<byte[]>();
byte[] key = null;
boolean has_seconds = false;
boolean has_milliseconds = false;
boolean has_duplicates = false;
boolean has_uncorrected_value_error = false;
for (final Map.Entry<Long, ArrayList<DP>> time_map : datapoints.entrySet()) {
if (key == null) {
key = time_map.getValue().get(0).kv.key();
}
if (time_map.getValue().size() < 2) {
// there was only one data point for this timestamp, no conflicts
final DP dp = time_map.getValue().get(0);
valid_datapoints.getAndIncrement();
has_uncorrected_value_error |= Internal.isFloat(dp.qualifier()) ?
fsckFloat(dp) : fsckInteger(dp);
if (Internal.inMilliseconds(dp.qualifier())) {
has_milliseconds = true;
} else {
has_seconds = true;
}
unique_columns.put(dp.kv.qualifier(), dp.kv.value());
continue;
}
// sort so we can figure out which one we're going to keep, i.e. oldest
// or newest
Collections.sort(time_map.getValue());
has_duplicates = true;
// We want to keep either the first or the last data point written
// and delete the rest.
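// e.g. with three duplicate columns, lastWriteWins() keeps index 2 and
// deletes the range [0, 2); otherwise we keep index 0 and delete [1, 3).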
final StringBuilder buf = new StringBuilder();
buf.append("More than one column had a value for the same timestamp: ")
.append("(")
.append(time_map.getKey())
.append(" - ")
.append(new Date(time_map.getKey()))
.append(")\n row key: (")
.append(UniqueId.uidToString(key))
.append(")\n");
int num_dupes = time_map.getValue().size();
final int delete_range_start;
final int delete_range_stop;
final DP dp_to_keep;
if (options.lastWriteWins()) {
// Save the latest datapoint from extinction.
delete_range_start = 0;
delete_range_stop = num_dupes - 1;
dp_to_keep = time_map.getValue().get(num_dupes - 1);
} else {
// Save the oldest datapoint from extinction.
delete_range_start = 1;
delete_range_stop = num_dupes;
dp_to_keep = time_map.getValue().get(0);
appendDatapointInfo(buf, dp_to_keep, " <--- keep oldest").append("\n");
}
unique_columns.put(dp_to_keep.kv.qualifier(), dp_to_keep.kv.value());
valid_datapoints.getAndIncrement();
has_uncorrected_value_error |= Internal.isFloat(dp_to_keep.qualifier()) ?
fsckFloat(dp_to_keep) : fsckInteger(dp_to_keep);
if (Internal.inMilliseconds(dp_to_keep.qualifier())) {
has_milliseconds = true;
} else {
has_seconds = true;
}
for (int dp_index = delete_range_start; dp_index < delete_range_stop;
dp_index++) {
duplicates.getAndIncrement();
DP dp = time_map.getValue().get(dp_index);
try {
final byte flags = (byte)Internal.getFlagsFromQualifier(dp.kv.qualifier());
buf.append(" ")
.append("write time: (")
.append(dp.kv.timestamp())
.append(" - ")
.append(new Date(dp.kv.timestamp()))
.append(") ")
.append(" compacted: (")
.append(dp.compacted)
.append(") qualifier: ")
.append(Arrays.toString(dp.kv.qualifier()))
.append(" value: ")
.append(Internal.isFloat(dp.kv.qualifier()) ?
Internal.extractFloatingPointValue(dp.value(), 0, flags) :
Internal.extractIntegerValue(dp.value(), 0, flags))
.append("\n");
unique_columns.put(dp.kv.qualifier(), dp.kv.value());
if (options.fix() && options.resolveDupes()) {
if (compact_row) {
// Scheduled for deletion by compaction.
duplicates_fixed_comp.getAndIncrement();
} else if (!dp.compacted) {
LOG.debug("Removing duplicate data point: " + dp.kv);
tsdb.getClient().delete(
new DeleteRequest(
tsdb.dataTable(), dp.kv.key(), dp.kv.family(), dp.qualifier()
)
);
duplicates_fixed.getAndIncrement();
}
}
} catch (Exception e) {
LOG.error("Unexpected exception processing DP: " + dp, e);
}
}
if (options.lastWriteWins()) {
appendDatapointInfo(buf, dp_to_keep, " <--- keep latest").append("\n");
}
LOG.info(buf.toString());
}
// if an error was found in this row that was not marked for repair, then
// we should bail at this point and not write a new compacted column.
if ((has_duplicates && !options.resolveDupes()) ||
(has_uncorrected_value_error && !options.deleteBadValues())) {
LOG.warn("One or more errors found in row that were not marked for repair");
return;
}
if ((options.compact() || compact_row) && options.fix()
&& qualifier_index > 0) {
if (qualifier_index == 2 || (qualifier_index == 4 &&
Internal.inMilliseconds(compact_qualifier))) {
// we may have deleted all but one value from the row and that one
// value may have a different qualifier than it originally had. We
// can't write a compacted column with a single data point as the length
// will be off due to the flag at the end. Therefore we just rollback
// the length of the value array.
value_index--;
} else if (has_seconds && has_milliseconds) {
// set mixed compact flag at end of the values array
compact_value[value_index] = 1;
}
value_index++;
final byte[] new_qualifier = Arrays.copyOfRange(compact_qualifier, 0,
qualifier_index);
final byte[] new_value = Arrays.copyOfRange(compact_value, 0,
value_index);
final PutRequest put = new PutRequest(tsdb.dataTable(), key,
TSDB.FAMILY(), new_qualifier, new_value);
// it's *possible* that our new compacted qualifier matches one of the
// existing columns queued for deletion, so double check before we delete
// everything
if (unique_columns.containsKey(new_qualifier)) {
if (Bytes.memcmp(unique_columns.get(new_qualifier), new_value) != 0) {
final StringBuilder buf = new StringBuilder();
buf.append("Overwriting compacted column with new value: ")
.append("\n row key: (")
.append(UniqueId.uidToString(key))
.append(")\n qualifier: ")
.append(Bytes.pretty(new_qualifier))
.append("\n value: ")
.append(Bytes.pretty(new_value));
LOG.info(buf.toString());
// Important: Make sure to wait for the write to complete before
// proceeding with the deletes.
tsdb.getClient().put(put).joinUninterruptibly();
} else if (has_duplicates) {
if (LOG.isDebugEnabled()) {
final StringBuilder buf = new StringBuilder();
buf.append("Re-compacted column is the same as the existing column: ")
.append("\n row key: (")
.append(UniqueId.uidToString(key))
.append(")\n qualifier: ")
.append(Bytes.pretty(new_qualifier))
.append("\n value: ")
.append(Bytes.pretty(new_value));
LOG.debug(buf.toString());
}
}
unique_columns.remove(new_qualifier);
} else {
// Important: Make sure to wait for the write to complete before
// proceeding with the deletes.
tsdb.getClient().put(put).joinUninterruptibly();
}
final List<Deferred<Object>> deletes =
new ArrayList<Deferred<Object>>(unique_columns.size());
for (byte[] qualifier : unique_columns.keySet()) {
final DeleteRequest delete = new DeleteRequest(tsdb.dataTable(), key,
TSDB.FAMILY(), qualifier);
if (LOG.isDebugEnabled()) {
final StringBuilder buf = new StringBuilder();
buf.append("Deleting column: ")
.append("\n row key: (")
.append(UniqueId.uidToString(key))
.append(")\n qualifier: ")
.append(Bytes.pretty(qualifier));
LOG.debug(buf.toString());
}
deletes.add(tsdb.getClient().delete(delete));
}
Deferred.group(deletes).joinUninterruptibly();
duplicates_fixed.getAndAdd(duplicates_fixed_comp.longValue());
duplicates_fixed_comp.set(0);
}
}
/**
* Handles validating a floating point value. Floats must be encoded on 4
* bytes for a Float and 8 bytes for a Double. The qualifier is compared to
* the actual length in the case of single data points. In previous versions
* of OpenTSDB, the qualifier may have flagged the value as 4 bytes while the
* actual value was written on 8. This method fixes those issues as well as an old bug
* where the first 4 bytes of the 8 byte value were sign-extended.
* @param dp The data point to process
* @return True if value was NOT fixed so the caller can avoid compacting.
* If false, then the value was good or it was repaired.
* @throws Exception If something goes pear shaped
*/
private boolean fsckFloat(final DP dp) throws Exception {
byte[] qual = dp.qualifier();
byte[] value = dp.value();
final byte length = Internal.getValueLengthFromQualifier(qual);
// If the qualifier says the value is on 4 bytes but the value is actually
// on 8 bytes, then the 4 most significant bytes must be 0s. Old versions of the
// code were doing this. It's kinda sad. Some versions had a
// bug whereby the value would be sign-extended, so we can
// detect these values and fix them here.
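// For illustration: -1.5f has int bits 0xBFC00000, so the sign extension
// bug would have stored the 8 bytes 0xFFFFFFFFBFC00000. We recover the
// float from the lower 4 bytes and re-write it on 4 bytes.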
if (length == 4 && value.length == 8) {
if (value[0] == -1 && value[1] == -1
&& value[2] == -1 && value[3] == -1 && qual.length == 2) {
value_encoding.getAndIncrement();
LOG.error("Floating point value with 0xFF most significant"
+ " bytes, probably caused by sign extension bug"
+ " present in revisions [96908436..607256fc].\n"
+ "\t" + dp.kv);
if (options.fix()) {
final float value_as_float =
Float.intBitsToFloat(Bytes.getInt(value, 4));
value = Bytes.fromInt(
Float.floatToRawIntBits((float)value_as_float));
if (compact_row || options.compact()) {
appendDP(qual, value, 4);
} else if (!dp.compacted) {
final PutRequest put = new PutRequest(tsdb.dataTable(),
dp.kv.key(), dp.kv.family(), qual, value);
tsdb.getClient().put(put);
} else {
LOG.error("SHOULDN'T be here as we didn't compact or fix a "
+ "single value");
}
value_encoding_fixed.getAndIncrement();
} else {
return true;
}
} else if (value[0] != 0 || value[1] != 0
|| value[2] != 0 || value[3] != 0) {
// can't happen if it was compacted
LOG.error("Floating point value was marked as 4 bytes long but"
+ " was actually 8 bytes long and the first four bytes were"
+ " not zeroed\n\t" + dp);
bad_values.getAndIncrement();
if (options.fix() && options.deleteBadValues() && !dp.compacted) {
final DeleteRequest delete = new DeleteRequest(tsdb.dataTable(),
dp.kv);
tsdb.getClient().delete(delete);
bad_values_deleted.getAndIncrement();
} else if (dp.compacted) {
LOG.error("The value was in a compacted column. This should "
+ "not be possible\n\t" + dp);
bad_compacted_columns.getAndIncrement();
return true;
} else {
return true;
}
} else {
// can't happen if it was compacted
LOG.warn("Floating point value was marked as 4 bytes long but"
+ " was actually 8 bytes long\n\t" + dp.kv);
value_encoding.getAndIncrement();
if (options.fix() && !dp.compacted) {
final float value_as_float =
Float.intBitsToFloat(Bytes.getInt(value, 4));
value = Bytes.fromInt(
Float.floatToRawIntBits((float)value_as_float));
if (compact_row || options.compact()) {
appendDP(qual, value, 4);
} else if (!dp.compacted) {
final PutRequest put = new PutRequest(tsdb.dataTable(),
dp.kv.key(), dp.kv.family(), qual, value);
tsdb.getClient().put(put);
} else {
LOG.error("SHOULDN'T be here as we didn't compact or fix a single value");
}
value_encoding_fixed.getAndIncrement();
} else {
return true;
}
}
} else if (length == 8 && value.length == 4) {
// could be marked as a Double but actually encoded as a Float. BUT we
// don't know that and can't parse it accurately, so toss it
bad_values.getAndIncrement();
LOG.error("This floating point value was marked as 8 bytes long but"
+ " was only " + value.length + " bytes.\n\t" + dp.kv);
if (options.fix() && options.deleteBadValues() && !dp.compacted) {
final DeleteRequest delete = new DeleteRequest(tsdb.dataTable(), dp.kv);
tsdb.getClient().delete(delete);
bad_values_deleted.getAndIncrement();
} else if (dp.compacted) {
LOG.error("The previous value was in a compacted column. This should "
+ "not be possible.");
bad_compacted_columns.getAndIncrement();
} else {
return true;
}
} else if (value.length != 4 && value.length != 8) {
bad_values.getAndIncrement();
LOG.error("This floating point value must be encoded either on"
+ " 4 or 8 bytes, but it's on " + value.length
+ " bytes.\n\t" + dp.kv);
if (options.fix() && options.deleteBadValues() && !dp.compacted) {
final DeleteRequest delete = new DeleteRequest(tsdb.dataTable(), dp.kv);
tsdb.getClient().delete(delete);
bad_values_deleted.getAndIncrement();
} else if (dp.compacted) {
LOG.error("The previous value was in a compacted column. This should "
+ "not be possible.");
bad_compacted_columns.getAndIncrement();
return true;
} else {
return true;
}
} else {
if (compact_row || options.compact()) {
appendDP(qual, value, value.length);
}
}
return false;
}
/**
* Handles validating an integer value. Integers must be encoded on 1, 2, 4
* or 8 bytes. Older versions of OpenTSDB wrote all integers on 8 bytes
* regardless of value. If the --fix flag is specified, this method will
* attempt to re-encode small values to save space (up to 7 bytes!!). It also
* makes sure the value length matches the length specified in the qualifier
* @param dp The data point to process
* @return True if value was NOT fixed so the caller can avoid compacting.
* If false, then the value was good or it was repaired.
* @throws Exception If something goes pear shaped
*/
private boolean fsckInteger(final DP dp) throws Exception {
byte[] qual = dp.qualifier();
byte[] value = dp.value();
// this should be a single integer value. Check the encoding to make
// sure it's the proper length, and if the flag is set to fix encoding
// we can save space with VLE.
final byte length = Internal.getValueLengthFromQualifier(qual);
if (value.length != length) {
// can't happen in a compacted column
bad_values.getAndIncrement();
LOG.error("The integer value is " + value.length + " bytes long but "
+ "should be " + length + " bytes.\n\t" + dp.kv);
if (options.fix() && options.deleteBadValues()) {
final DeleteRequest delete = new DeleteRequest(tsdb.dataTable(), dp.kv);
tsdb.getClient().delete(delete);
bad_values_deleted.getAndIncrement();
} else if (dp.compacted) {
LOG.error("The previous value was in a compacted column. This should "
+ "not be possible.");
bad_compacted_columns.getAndIncrement();
} else {
return true;
}
return false;
}
// OpenTSDB had support for VLE decoding of integers but originally wrote
// all integers on 8 bytes. Let's see how much space we can save.
// We'll assume that a length other than 8 bytes is already VLE'd.
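// e.g. the value 42 stored on 8 bytes re-encodes on a single byte, saving
// 7 bytes; the length bits in the qualifier flags drop from 0x7 (8 bytes)
// to 0x0 (1 byte).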
if (length == 8) {
final long decoded = Bytes.getLong(value);
if (Byte.MIN_VALUE <= decoded && decoded <= Byte.MAX_VALUE) {
vle.getAndIncrement();
vle_bytes.addAndGet(7);
value = new byte[] { (byte) decoded };
} else if (Short.MIN_VALUE <= decoded && decoded <= Short.MAX_VALUE) {
vle.getAndIncrement();
vle_bytes.addAndGet(6);
value = Bytes.fromShort((short) decoded);
} else if (Integer.MIN_VALUE <= decoded &&
decoded <= Integer.MAX_VALUE) {
vle.getAndIncrement();
vle_bytes.addAndGet(4);
value = Bytes.fromInt((int) decoded);
} // else it needs 8 bytes, it's on 8 bytes, yipee
if (length != value.length && options.fix()) {
final byte[] new_qualifier = Arrays.copyOf(qual, qual.length);
new_qualifier[new_qualifier.length - 1] &= 0xF0 | (value.length - 1);
if (compact_row || options.compact()) {
appendDP(new_qualifier, value, value.length);
} else {
// put the new value, THEN delete the old
final PutRequest put = new PutRequest(tsdb.dataTable(),
dp.kv.key(), dp.kv.family(), new_qualifier, value);
tsdb.getClient().put(put).joinUninterruptibly();
final DeleteRequest delete = new DeleteRequest(tsdb.dataTable(),
dp.kv.key(), dp.kv.family(), qual);
tsdb.getClient().delete(delete);
}
vle_fixed.getAndIncrement();
} // don't return true here as we don't consider a VLE an error.
} else {
if (compact_row || options.compact()) {
appendDP(qual, value, value.length);
}
}
return false;
}
/**
* Appends the given value to the running qualifier and value compaction
* byte arrays. It doesn't take a {@code DP} as we may be changing the
* arrays before they're re-written.
* @param new_qual The qualifier to append
* @param new_value The value to append
* @param value_length How much of the value to append
*/
private void appendDP(final byte[] new_qual, final byte[] new_value,
final int value_length) {
System.arraycopy(new_qual, 0, compact_qualifier, qualifier_index, new_qual.length);
qualifier_index += new_qual.length;
System.arraycopy(new_value, 0, compact_value, value_index, value_length);
value_index += value_length;
}
/**
* Appends a representation of a datapoint to a string buffer
* @param buf The buffer to modify
* @param dp The data point to describe
* @param msg An optional message to append
* @return The same buffer, for call chaining
*/
private StringBuilder appendDatapointInfo(final StringBuilder buf,
final DP dp, final String msg) {
buf.append(" ")
.append("write time: (")
.append(dp.kv.timestamp())
.append(") ")
.append(" compacted: (")
.append(dp.compacted)
.append(") qualifier: ")
.append(Arrays.toString(dp.kv.qualifier()))
.append(msg);
return buf;
}
/**
* Resets the running compaction variables. This should be called AFTER a
* {@link #fsckDataPoints} has been run and before the next row of values
* is processed. Note that we may overallocate some memory when creating
* the arrays.
*/
private void resetCompaction() {
compact_qualifier = null;
qualifier_index = 0;
compact_value = null;
value_index = 0;
qualifier_bytes = 0;
value_bytes = 0;
compact_row = false;
}
/**
* Internal class used for storing references to values during row parsing.
* The object will hold onto the key value where the value was found as well
* as the actual qualifier/value if the data point was compacted. It also
* sorts on the actual HBase write timestamp so we can resolve duplicates
* using the earliest or latest value.
*/
final class DP implements Comparable<DP> {
/** The KeyValue where this data point was found. May be a compacted column */
KeyValue kv;
/** Whether or not the value was in a compacted column */
boolean compacted;
/** The specific data point qualifier/value if the data point was compacted */
Cell cell;
/**
* Default Ctor used for a single data point
* @param kv The column where the value appeared.
*/
DP(final KeyValue kv) {
this.kv = kv;
compacted = false;
}
/**
* Overload for a compacted data point
* @param kv The column where the value appeared.
* @param cell The exploded data point
*/
DP(final KeyValue kv, final Cell cell) {
this.kv = kv;
this.cell = cell;
compacted = true;
}
/**
* Compares data points.
* @param dp The data point to compare to
* @return 0 if the HBase write timestamps are the same, -1 if the local
* object was written BEFORE the other data point, 1 if it was written
* later.
*/
public int compareTo(final DP dp) {
if (kv.timestamp() == dp.kv.timestamp()) {
return 0;
}
return kv.timestamp() < dp.kv.timestamp() ? -1 : 1;
}
/** @return The qualifier of the data point (from the compaction or column) */
public byte[] qualifier() {
return compacted ? cell.qualifier() : kv.qualifier();
}
/** @return The value of the data point */
public byte[] value() {
return compacted ? cell.value() : kv.value();
}
/** @return The cell or key value string */
public String toString() {
return compacted ? cell.toString() : kv.toString();
}
}
}
/**
* Silly little class to report the progress while fscking
*/
final class ProgressReporter extends Thread {
ProgressReporter() {
super("Progress");
}
public void run() {
long last_progress = 0;
while(true) {
try {
long processed_rows = rows_processed.get();
processed_rows = (processed_rows - (processed_rows % report_rows));
if (processed_rows - last_progress >= report_rows) {
last_progress = processed_rows;
LOG.info("Processed " + processed_rows + " rows, " +
valid_datapoints.get() + " valid datapoints");
}
Thread.sleep(1000);
} catch (InterruptedException e) {
// the main thread interrupts us once the scan completes
return;
}
}
}
}
/** Prints usage and exits with the given retval. */
private static void usage(final ArgP argp, final String errmsg,
final int retval) {
System.err.println(errmsg);
System.err.println("Usage: fsck"
+ " [flags] [START-DATE [END-DATE] query [queries...]] \n"
+ "Scans the OpenTSDB data table for errors. Use the --full-scan flag\n"
+ "to scan the entire data table or specify a command line query to "
+ "scan a subset.\n"
+ "To see the format in which queries should be written, see the help"
+ " of the 'query' command.\n"
+ "The --fix or --fix-all flags will attempt to fix errors,"
+ " but be careful when using them.\n");
System.err.print(argp.usage());
System.exit(retval);
}
/**
* Helper to dump the atomic counters to the log after a completed FSCK
*/
private void logResults() {
LOG.info("Key Values Processed: " + kvs_processed.get());
LOG.info("Rows Processed: " + rows_processed.get());
LOG.info("Valid Datapoints: " + valid_datapoints.get());
LOG.info("Annotations: " + annotations.get());
LOG.info("Invalid Row Keys Found: " + bad_key.get());
LOG.info("Invalid Rows Deleted: " + bad_key_fixed.get());
LOG.info("Duplicate Datapoints: " + duplicates.get());
LOG.info("Duplicate Datapoints Resolved: " + duplicates_fixed.get());
LOG.info("Orphaned UID Rows: " + orphans.get());
LOG.info("Orphaned UID Rows Deleted: " + orphans_fixed.get());
LOG.info("Possible Future Objects: " + future.get());
LOG.info("Unknown Objects: " + unknown.get());
LOG.info("Unknown Objects Deleted: " + unknown_fixed.get());
LOG.info("Unparseable Datapoint Values: " + bad_values.get());
LOG.info("Unparseable Datapoint Values Deleted: " + bad_values_deleted.get());
LOG.info("Improperly Encoded Floating Point Values: " + value_encoding.get());
LOG.info("Improperly Encoded Floating Point Values Fixed: " +
value_encoding_fixed.get());
LOG.info("Unparseable Compacted Columns: " + bad_compacted_columns.get());
LOG.info("Unparseable Compacted Columns Deleted: " +
bad_compacted_columns_deleted.get());
LOG.info("Datapoints Qualified for VLE: " + vle.get());
LOG.info("Datapoints Compressed with VLE: " + vle_fixed.get());
LOG.info("Bytes Saved with VLE: " + vle_bytes.get());
LOG.info("Total Errors: " + totalErrors());
LOG.info("Total Correctable Errors: " + correctable());
LOG.info("Total Errors Fixed: " + totalFixed());
}
/**
* The main class executed from the "tsdb" script
* @param args Command line arguments to parse
* @throws Exception If something goes pear shaped
*/
public static void main(String[] args) throws Exception {
ArgP argp = new ArgP();
argp.addOption("--help", "Print help information.");
CliOptions.addCommon(argp);
FsckOptions.addDataOptions(argp);
args = CliOptions.parse(argp, args);
if (argp.has("--help")) {
usage(argp, "", 0);
}
Config config = CliOptions.getConfig(argp);
final FsckOptions options = new FsckOptions(argp, config);
final TSDB tsdb = new TSDB(config);
final ArrayList<Query> queries = new ArrayList<Query>();
if (args != null && args.length > 0) {
CliQuery.parseCommandLineQuery(args, tsdb, queries, null, null);
}
if (queries.isEmpty() && !argp.has("--full-scan")) {
usage(argp, "Must supply a query or use the '--full-scan' flag", 1);
}
tsdb.checkNecessaryTablesExist().joinUninterruptibly();
argp = null;
final Fsck fsck = new Fsck(tsdb, options);
try {
if (!queries.isEmpty()) {
fsck.runQueries(queries);
} else {
fsck.runFullTable();
}
} finally {
tsdb.shutdown().joinUninterruptibly();
}
System.exit(fsck.totalErrors() == 0 ? 0 : 1);
}
}