/*
* Copyright © 2014-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.api.dataset.lib;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.batch.RecordScanner;
import co.cask.cdap.api.data.batch.Split;
import co.cask.cdap.api.data.batch.SplitReader;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.dataset.table.Delete;
import co.cask.cdap.api.dataset.table.Get;
import co.cask.cdap.api.dataset.table.Increment;
import co.cask.cdap.api.dataset.table.Put;
import co.cask.cdap.api.dataset.table.Result;
import co.cask.cdap.api.dataset.table.Row;
import co.cask.cdap.api.dataset.table.Scan;
import co.cask.cdap.api.dataset.table.Scanner;
import co.cask.cdap.api.dataset.table.Table;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.reflect.Type;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import javax.annotation.Nullable;
/**
* Implements a table that creates and maintains indexes on values stored within a configured set of column names.
*
* <p>
* This dataset uses two tables:
* <ul>
* <li>the actual data table, which stores the raw, unmodified rows which are written; and</li>
* <li>an index table, with rows keyed by the indexed column and value (plus data row key for uniqueness),
* which contains a reference to the row key in the data table matching the indexed value.</li>
* </ul>
* </p>
*
* <p>The indexed values need not be unique. When reading the data back by index value, a {@link Scanner} will be
* returned, allowing the client to iterate through all matching rows. Exact matches as well as range lookups on
* indexed values are supported.
* </p>
*
* <p>Index entries are created by storing additional rows in a second table. These index rows are keyed by
* column name, column value, and original row key, each field separated by a single null byte delimiter. It is not
* recommended that the name of an index column contains the null byte (this can cause a degradation in performance
* of index reads).
* </p>
*
* <p>The columns to index can be configured in the {@link co.cask.cdap.api.dataset.DatasetProperties} used
* when the dataset instance is created. Multiple column names should be listed as a comma-separated string
* (with no spaces):
*
* <p>
* <pre><code>public class MyApp extends AbstractApplication {
* public void configure() {
* setName("MyApp");
* ...
* createDataset("indexedData", IndexedTable.class,
* DatasetProperties.builder().add(
* IndexedTableDefinition.INDEX_COLUMNS_CONF_KEY, "col1,col2").build());
* ...
* }
* }</code></pre>
* </p>
*
* <p>
* Note that this means that the column names which should be indexed cannot contain the comma character,
* as it would break parsing of the configuration property.
* </p>
*
* @see co.cask.cdap.api.dataset.lib.IndexedTableDefinition#INDEX_COLUMNS_CONF_KEY
*/
public class IndexedTable extends AbstractDataset implements Table {
private static final Logger LOG = LoggerFactory.getLogger(IndexedTable.class);
/**
* Column key used to store the existence of a row in the secondary index.
*/
private static final byte[] IDX_COL = {'r'};
private static final byte DELIMITER_BYTE = 0;
private static final byte[] KEY_DELIMITER = new byte[] { DELIMITER_BYTE };
private final boolean hasColumnWithDelimiter;
// the two underlying tables
private Table table, index;
// the secondary index column
private SortedSet<byte[]> indexedColumns;
/**
* Configuration time constructor.
*
* @param name the name of the table
* @param table table to use as the table
* @param index table to use as the index
* @param columnsToIndex the names of the data columns to index
*/
public IndexedTable(String name, Table table, Table index, byte[][] columnsToIndex) {
super(name, table, index);
this.table = table;
this.index = index;
this.indexedColumns = new TreeSet<>(Bytes.BYTES_COMPARATOR);
this.hasColumnWithDelimiter = hasDelimiterByte(columnsToIndex);
Collections.addAll(this.indexedColumns, columnsToIndex);
}
/**
 * Tells whether any of the given column names contains the DELIMITER_BYTE.
 *
 * <p>Only when a column name contains a null byte can a scan on the index produce false positives
 * that must be verified against the column's value in the data table. For instance, with an index
 * on the following two rows:
 * <ul>
 *   <li>Row #1: column name: a\0b, column value: c</li>
 *   <li>Row #2: column name: a, column value: b\0c</li>
 * </ul>
 * both index entries start with (and are therefore found by) the row key prefix a\0b\0c.
 *
 * @param columns the column names to inspect
 * @return true if at least one of the column names contains the DELIMITER_BYTE
 */
private boolean hasDelimiterByte(byte[][] columns) {
  boolean found = false;
  for (int c = 0; c < columns.length && !found; c++) {
    byte[] name = columns[c];
    for (int i = 0; i < name.length; i++) {
      if (name[i] == DELIMITER_BYTE) {
        found = true;
        break;
      }
    }
  }
  return found;
}
/**
 * Reads a row by row key from the data table; the index table is not consulted.
 *
 * @param get the read operation, as if it were on a non-indexed table
 * @return the result of the read on the underlying data table
 */
@Override
public Row get(Get get) {
  Row dataRow = table.get(get);
  return dataRow;
}
/**
 * Reads the row with the given key from the data table.
 *
 * @param row the row key in the data table
 * @return whatever the underlying data table returns for this key
 */
@Override
public Row get(byte[] row) {
  Row dataRow = table.get(row);
  return dataRow;
}
/**
 * Reads a single column of a row from the data table.
 *
 * @param row the row key in the data table
 * @param column the column to read
 * @return whatever the underlying data table returns for this row and column
 */
@Override
public byte[] get(byte[] row, byte[] column) {
  byte[] cell = table.get(row, column);
  return cell;
}
/**
 * Reads the given columns of a row from the data table.
 *
 * @param row the row key in the data table
 * @param columns the columns to read
 * @return whatever the underlying data table returns for this row and these columns
 */
@Override
public Row get(byte[] row, byte[][] columns) {
  Row dataRow = table.get(row, columns);
  return dataRow;
}
/**
 * Reads a range of columns of a row from the data table. All arguments are handed to the
 * underlying data table unchanged.
 *
 * @param row the row key in the data table
 * @param startColumn start of the column range
 * @param stopColumn end of the column range
 * @param limit limit passed through to the data table
 * @return whatever the underlying data table returns for this request
 */
@Override
public Row get(byte[] row, byte[] startColumn, byte[] stopColumn, int limit) {
  Row dataRow = table.get(row, startColumn, stopColumn, limit);
  return dataRow;
}
/**
 * Executes several reads against the data table in one call.
 *
 * @param gets the read operations, as if they were on a non-indexed table
 * @return whatever the underlying data table returns for these reads
 */
@Override
public List<Row> get(List<Get> gets) {
  List<Row> dataRows = table.get(gets);
  return dataRows;
}
/**
 * Reads data-table rows through the secondary index, by exact value of an indexed column. If no
 * rows are indexed by the given value, a {@link co.cask.cdap.api.dataset.table.Scanner} with no
 * results is returned.
 *
 * @param column the indexed column to look up
 * @param value the value the column must have
 * @return a Scanner returning rows from the data table, whose stored value for the given column
 *         matches the given value
 * @throws java.lang.IllegalArgumentException if the given column is not configured for indexing
 */
public Scanner readByIndex(byte[] column, byte[] value) {
  assertIndexedColumn(column);
  // every index entry for this column/value pair starts with: column, delimiter, value, delimiter
  byte[] prefix = Bytes.concat(column, KEY_DELIMITER, value, KEY_DELIMITER);
  Scanner entries = index.scan(prefix, Bytes.stopKeyForPrefix(prefix));
  return new IndexScanner(entries, column, value);
}
/**
 * Reads data-table rows through the secondary index, by a range of values of an indexed column.
 * If no indexed rows fall within the given range, a {@link co.cask.cdap.api.dataset.table.Scanner}
 * with no results is returned.
 *
 * @param column the column to use for the index lookup
 * @param startValue the inclusive start of the value range; {@code null} means start from the
 *                   first index entry of the column
 * @param endValue the exclusive end of the value range; {@code null} means end with the last
 *                 index entry of the column
 * @return a Scanner returning rows from the data table, whose stored value for the given column
 *         is within the given range
 * @throws java.lang.IllegalArgumentException if the given column is not configured for indexing
 */
public Scanner scanByIndex(byte[] column, @Nullable byte[] startValue, @Nullable byte[] endValue) {
  assertIndexedColumn(column);

  // The scan bounds carry no trailing KEY_DELIMITER: they delimit a range of values rather than
  // pinning one exact value.
  byte[] startRow;
  if (startValue != null) {
    startRow = Bytes.concat(column, KEY_DELIMITER, startValue);
  } else {
    startRow = Bytes.concat(column, KEY_DELIMITER);
  }

  byte[] stopRow;
  if (endValue != null) {
    stopRow = Bytes.concat(column, KEY_DELIMITER, endValue);
  } else {
    stopRow = Bytes.stopKeyForPrefix(Bytes.concat(column, KEY_DELIMITER));
  }

  Scanner entries = index.scan(startRow, stopRow);
  return new IndexRangeScanner(entries, column, startValue, endValue);
}
/**
 * Ensures that the given column is one of the indexed columns.
 *
 * @param column the column name to check
 * @throws IllegalArgumentException if the column is not configured for indexing
 */
private void assertIndexedColumn(byte[] column) {
  if (indexedColumns.contains(column)) {
    return;
  }
  String message = "Column " + Bytes.toStringBinary(column) + " is not configured for indexing";
  throw new IllegalArgumentException(message);
}
/**
 * Writes a put to the data table. For every column of the {@link Put} that is configured to be
 * indexed, the index is brought in line first: the entry for a differing old value is removed and
 * an entry for the new value is added, unless the stored value already equals the new one.
 *
 * @param put The put operation to store
 */
@Override
public void put(Put put) {
  byte[] rowKey = put.getRow();
  Map<byte[], byte[]> newValues = put.getValues();

  // collect the indexed columns touched by this put
  Set<byte[]> pending = new TreeSet<>(Bytes.BYTES_COMPARATOR);
  for (byte[] columnName : newValues.keySet()) {
    if (indexedColumns.contains(columnName)) {
      pending.add(columnName);
    }
  }

  if (pending.isEmpty()) {
    // no indexed column involved: plain write to the data table
    table.put(put);
    return;
  }

  // read the currently stored values of the touched indexed columns to see which ones change
  byte[][] columnsToRead = pending.toArray(new byte[pending.size()][]);
  Row current = table.get(rowKey, columnsToRead);
  for (Map.Entry<byte[], byte[]> stored : current.getColumns().entrySet()) {
    byte[] columnName = stored.getKey();
    byte[] oldValue = stored.getValue();
    if (Arrays.equals(oldValue, newValues.get(columnName))) {
      // unchanged value: its index entry already exists
      pending.remove(columnName);
    } else {
      // changed value: drop the index entry that points at the old value
      index.delete(createIndexKey(rowKey, columnName, oldValue), IDX_COL);
    }
  }

  // whatever is still pending is either new or changed and needs a fresh index entry
  for (byte[] columnName : pending) {
    index.put(createIndexKey(rowKey, columnName, newValues.get(columnName)), IDX_COL, rowKey);
  }

  // finally store the data row itself
  table.put(put);
}
/**
 * Builds the row key of an index entry: column name, column value and data row key, with a
 * KEY_DELIMITER after the column name and after the value.
 *
 * @param row the row key of the data row
 * @param column the indexed column name
 * @param value the value of the indexed column
 * @return the row key to use in the index table
 */
private byte[] createIndexKey(byte[] row, byte[] column, byte[] value) {
  byte[] indexKey = Bytes.concat(column, KEY_DELIMITER, value, KEY_DELIMITER, row);
  return indexKey;
}
/**
 * Writes a single column value; wraps the arguments in a {@link Put} and goes through
 * {@link #put(Put)}, so index maintenance applies.
 *
 * @param row the row key in the data table
 * @param column the column to write
 * @param value the value to write
 */
@Override
public void put(byte[] row, byte[] column, byte[] value) {
  Put single = new Put(row);
  single.add(column, value);
  put(single);
}
/**
 * Writes several column values of one row; wraps the arguments in a {@link Put} and goes through
 * {@link #put(Put)}, so index maintenance applies.
 *
 * @param row the row key in the data table
 * @param columns the columns to write
 * @param values the values to write; {@code values[i]} is written to {@code columns[i]}
 */
@Override
public void put(byte[] row, byte[][] columns, byte[][] values) {
  Put combined = new Put(row);
  int position = 0;
  for (byte[] columnName : columns) {
    combined.add(columnName, values[position++]);
  }
  put(combined);
}
/**
 * Performs a delete on the data table. Index entries referencing the deleted values are removed
 * as well.
 *
 * @param delete The delete operation identifying the row and optional columns to remove
 */
@Override
public void delete(Delete delete) {
  if (delete.getColumns() != null) {
    // only the listed columns are deleted
    byte[][] targeted = delete.getColumns().toArray(new byte[0][]);
    delete(delete.getRow(), targeted);
  } else {
    // no columns given: delete the full row
    delete(delete.getRow());
  }
}
/**
 * Deletes a full row from the data table, together with the index entries of its indexed columns.
 * Does nothing if the data table has no such row.
 *
 * @param row the row key in the data table
 */
@Override
public void delete(byte[] row) {
  Row current = table.get(row);
  if (!current.isEmpty()) {
    // index entries first, then the data row
    deleteIndexEntries(current);
    table.delete(row);
  }
}
/**
 * Deletes a single column of a row; delegates to {@link #delete(byte[], byte[][])}.
 *
 * @param row the row key in the data table
 * @param column the column to delete
 */
@Override
public void delete(byte[] row, byte[] column) {
  byte[][] single = { column };
  delete(row, single);
}
/**
 * Deletes the given columns of a row from the data table, together with the index entries of
 * those columns that are indexed. Does nothing if none of the columns currently has a value.
 *
 * @param row the row key in the data table
 * @param columns the columns to delete
 */
@Override
public void delete(byte[] row, byte[][] columns) {
  Row current = table.get(row, columns);
  if (!current.isEmpty()) {
    // index entries first, then the columns of the data row
    deleteIndexEntries(current);
    table.delete(row, columns);
  }
}
/**
 * Removes the index entries for all indexed columns contained in the given data row.
 *
 * @param existingRow the data row (row key plus current column values) whose index entries
 *                    are to be removed
 */
private void deleteIndexEntries(Row existingRow) {
  byte[] rowKey = existingRow.getRow();
  for (Map.Entry<byte[], byte[]> cell : existingRow.getColumns().entrySet()) {
    byte[] columnName = cell.getKey();
    if (!indexedColumns.contains(columnName)) {
      // not an indexed column, so there is no index entry to remove
      continue;
    }
    index.delete(createIndexKey(rowKey, columnName, cell.getValue()), IDX_COL);
  }
}
/**
 * Performs a swap operation by primary key. Parameters are as if they were on a non-indexed
 * table. If the swap is on an indexed column and actually changes the value, the index is
 * updated after a successful swap; otherwise this is a pass-through to the underlying table.
 *
 * @return true if the swap succeeded on the data table, false otherwise (nothing is changed then)
 */
@Override
public boolean compareAndSwap(byte[] row, byte[] column, byte[] expected, byte[] newValue) {
  // The index is unaffected when the column is not indexed, or when old and new value are equal.
  boolean indexUnaffected = !indexedColumns.contains(column) || Arrays.equals(expected, newValue);
  if (indexUnaffected) {
    return table.compareAndSwap(row, column, expected, newValue);
  }

  // The swap can only succeed if the current value equals the expected one. A non-null expected
  // value therefore has an index entry that must go away ...
  Delete staleEntry = expected == null
    ? null
    : new Delete(createIndexKey(row, column, expected), IDX_COL);
  // ... and a non-null new value needs an index entry pointing back at the row.
  Put freshEntry = newValue == null
    ? null
    : new Put(createIndexKey(row, column, newValue), IDX_COL, row);

  if (!table.compareAndSwap(row, column, expected, newValue)) {
    // swap rejected: leave the index alone
    return false;
  }

  if (staleEntry != null) {
    index.delete(staleEntry);
  }
  if (freshEntry != null) {
    index.put(freshEntry);
  }
  return true;
}
/**
 * Increments the specified row and column by the specified amount, and returns the new value.
 * Delegates to {@link #incrementAndGet(byte[], byte[][], long[])}. Note that performing this
 * operation on an indexed column will generally have a negative impact on performance, since up
 * to three writes are needed for every increment (one removing the index entry for the previous
 * value, one adding the index entry for the incremented value, and one for the value itself).
 *
 * @see Table#incrementAndGet(byte[], byte[], long)
 */
@Override
public long incrementAndGet(byte[] row, byte[] column, long amount) {
  byte[][] columns = { column };
  long[] amounts = { amount };
  Row updated = incrementAndGet(row, columns, amounts);
  return Bytes.toLong(updated.get(column));
}
/**
 * Increments the specified row and columns by the specified amounts, and returns the new values.
 * The current values are read from the data table, the sums are computed here and written back
 * with a put; a missing value counts as zero. Note that performing this operation on an indexed
 * column will generally have a negative impact on performance, since up to three writes are needed
 * for every increment (one removing the index entry for the previous value, one adding the index
 * entry for the incremented value, and one for the value itself).
 *
 * @throws IllegalArgumentException if {@code columns} and {@code amounts} differ in length
 * @throws NumberFormatException if an existing value is not exactly the size of a long
 * @see Table#incrementAndGet(byte[], byte[][], long[])
 */
@Override
public Row incrementAndGet(byte[] row, byte[][] columns, long[] amounts) {
  if (columns.length != amounts.length) {
    throw new IllegalArgumentException("Size of columns and amounts arguments must match");
  }

  Row current = table.get(row, columns);
  byte[][] newValues = new byte[columns.length][];
  NavigableMap<byte[], byte[]> newValuesByColumn = new TreeMap<>(Bytes.BYTES_COMPARATOR);

  for (int i = 0; i < columns.length; i++) {
    byte[] columnName = columns[i];
    byte[] stored = current.get(columnName);

    long base = 0L;
    if (stored != null) {
      if (stored.length != Bytes.SIZEOF_LONG) {
        throw new NumberFormatException("Attempted to increment a value that is not convertible to long," +
                                          " row: " + Bytes.toStringBinary(row) +
                                          " column: " + Bytes.toStringBinary(columnName));
      }
      base = Bytes.toLong(stored);
    }

    boolean indexed = indexedColumns.contains(columnName);
    if (indexed && stored != null) {
      // the pre-increment value is no longer valid in the index
      index.delete(createIndexKey(row, columnName, stored), IDX_COL);
    }

    byte[] incremented = Bytes.toBytes(base + amounts[i]);
    newValues[i] = incremented;
    newValuesByColumn.put(columnName, incremented);

    if (indexed) {
      index.put(createIndexKey(row, columnName, incremented), IDX_COL, row);
    }
  }

  table.put(row, columns, newValues);
  return new Result(row, newValuesByColumn);
}
/**
 * Increments the row and columns described by the given {@link Increment}, and returns the new
 * values. Unpacks the increment into parallel column and amount arrays and delegates to
 * {@link #incrementAndGet(byte[], byte[][], long[])}, so the same performance note for indexed
 * columns applies (up to three writes per incremented column).
 *
 * @see Table#incrementAndGet(Increment)
 */
@Override
public Row incrementAndGet(Increment increment) {
  Map<byte[], Long> deltas = increment.getValues();
  int count = deltas.size();
  byte[][] columns = new byte[count][];
  long[] amounts = new long[count];

  // fill both arrays in the map's iteration order so that columns[i] pairs with amounts[i]
  int position = 0;
  for (Map.Entry<byte[], Long> delta : deltas.entrySet()) {
    columns[position] = delta.getKey();
    amounts[position] = delta.getValue();
    position++;
  }
  return incrementAndGet(increment.getRow(), columns, amounts);
}
/**
 * Increments the specified row and column by the specified amount, without returning the new
 * value. The increment is handed to the data table as is; because the index could not be kept
 * in sync that way, an indexed column is rejected with {@link java.lang.IllegalArgumentException}.
 *
 * @see Table#increment(byte[], byte[], long)
 */
@Override
public void increment(byte[] row, byte[] column, long amount) {
  // read-less increments should not be used on indexed columns
  boolean indexed = indexedColumns.contains(column);
  if (indexed) {
    throw new IllegalArgumentException("Read-less increment is not supported on indexed column '"
                                         + Bytes.toStringBinary(column) + "'");
  }
  table.increment(row, column, amount);
}
/**
 * Increments the specified row and columns by the specified amounts, without returning the new
 * values. If any of the columns is indexed, {@link java.lang.IllegalArgumentException} is thrown
 * and nothing is written.
 *
 * @see Table#increment(byte[], byte[][], long[])
 */
@Override
public void increment(byte[] row, byte[][] columns, long[] amounts) {
  // read-less increments should not be used on indexed columns
  for (int i = 0; i < columns.length; i++) {
    byte[] columnName = columns[i];
    if (indexedColumns.contains(columnName)) {
      throw new IllegalArgumentException("Read-less increment is not supported on indexed column '"
                                           + Bytes.toStringBinary(columnName) + "'");
    }
  }
  table.increment(row, columns, amounts);
}
/**
 * Applies the given {@link Increment} to the data table, without returning the new values. If
 * any of its columns is indexed, {@link java.lang.IllegalArgumentException} is thrown and nothing
 * is written.
 *
 * @see Table#increment(Increment)
 */
@Override
public void increment(Increment increment) {
  // read-less increments should not be used on indexed columns
  for (byte[] columnName : increment.getValues().keySet()) {
    boolean indexed = indexedColumns.contains(columnName);
    if (indexed) {
      throw new IllegalArgumentException("Read-less increment is not supported on indexed column '"
                                           + Bytes.toStringBinary(columnName) + "'");
    }
  }
  table.increment(increment);
}
/**
 * Scans the data table by row key range; the index table is not involved.
 *
 * @param startRow start of the row key range, passed through to the data table
 * @param stopRow end of the row key range, passed through to the data table
 * @return the scanner created by the underlying data table
 */
@Override
public Scanner scan(@Nullable byte[] startRow, @Nullable byte[] stopRow) {
  Scanner dataScanner = table.scan(startRow, stopRow);
  return dataScanner;
}
/**
 * Scans the data table as described by the given {@link Scan}; the index table is not involved.
 *
 * @param scan the scan description, passed through to the data table
 * @return the scanner created by the underlying data table
 */
@Override
public Scanner scan(Scan scan) {
  Scanner dataScanner = table.scan(scan);
  return dataScanner;
}
/* BatchReadable implementation */

/**
 * Returns the splits of the data table for the given arguments; pure delegation to the
 * underlying data table.
 */
@Override
public List<Split> getSplits(int numSplits, byte[] start, byte[] stop) {
  List<Split> dataSplits = table.getSplits(numSplits, start, stop);
  return dataSplits;
}
/**
 * Returns the record type reported by the underlying data table.
 */
@Override
public Type getRecordType() {
  Type recordType = table.getRecordType();
  return recordType;
}
/**
 * Writes a structured record by handing it to the underlying data table.
 *
 * <p>NOTE(review): this call goes straight to the data table and performs no index maintenance
 * in this class, unlike {@link #put(Put)}. Confirm whether records written this way are expected
 * to be found through the index.
 *
 * @param structuredRecord the record to write
 * @throws IOException if the underlying data table fails to write the record
 */
@Override
public void write(StructuredRecord structuredRecord) throws IOException {
  this.table.write(structuredRecord);
}
/**
 * Returns the splits of the data table; pure delegation to the underlying data table.
 */
@Override
public List<Split> getSplits() {
  List<Split> dataSplits = table.getSplits();
  return dataSplits;
}
/**
 * Creates a record scanner for the given split of the data table; pure delegation to the
 * underlying data table.
 */
@Override
public RecordScanner<StructuredRecord> createSplitRecordScanner(Split split) {
  RecordScanner<StructuredRecord> recordScanner = table.createSplitRecordScanner(split);
  return recordScanner;
}
/**
 * Creates a reader for the given split of the data table; pure delegation to the underlying
 * data table.
 */
@Override
public SplitReader<byte[], Row> createSplitReader(Split split) {
  SplitReader<byte[], Row> splitReader = table.createSplitReader(split);
  return splitReader;
}
/* BatchWritable implementation */

/**
 * Writes the given {@link Put} through {@link #put(Put)}, so index maintenance applies. The
 * {@code bytes} key argument is not used; the row comes from the put itself.
 */
@Override
public void write(byte[] bytes, Put put) {
  this.put(put);
}
/**
 * Base class for scanners that walk over entries of the index table and return the data-table
 * rows they refer to. Subclasses decide, through {@link #matches(byte[])}, which indexed values
 * are accepted.
 */
private abstract class AbstractIndexScanner implements Scanner {
  // scanner over index table
  private final Scanner baseScanner;
  // the indexed column this scanner looks up
  private final byte[] column;

  public AbstractIndexScanner(Scanner baseScanner, byte[] column) {
    this.baseScanner = baseScanner;
    this.column = column;
  }

  /**
   * Checks if a particular column value matches a criteria defined by the implementing class.
   *
   * @param columnValue the column value, as extracted from the index row key
   * @return false to indicate to skip the corresponding row
   */
  protected abstract boolean matches(byte[] columnValue);

  /**
   * Advances over the index until an entry with a matching value is found and returns the data
   * row that entry refers to.
   *
   * @return the next matching data row as read from the data table, or {@code null} when the
   *         index scan is exhausted
   */
  @Nullable
  @Override
  public Row next() {
    // TODO: retrieve results in batches to minimize RPC overhead (requires multi-get support in table)
    // keep going until we hit an index entry with a matching value, or we exhaust the index
    for (Row indexRow = baseScanner.next(); indexRow != null; indexRow = baseScanner.next()) {
      byte[] indexKey = indexRow.getRow();
      byte[] rowkey = indexRow.get(IDX_COL);
      if (rowkey == null) {
        // render the key the same way the rest of this class renders binary keys in messages
        LOG.warn("Row of Indexed table '{}' is missing index column. Row key: {}",
                 getName(), Bytes.toStringBinary(indexKey));
        continue;
      }

      // An index key is: column, delimiter, value, delimiter, data row key. The value is what
      // remains once the leading "column + delimiter" and the trailing "delimiter + row key"
      // are cut off.
      int valueStart = column.length + 1;
      int valueEnd = indexKey.length - rowkey.length - 1;
      if (valueEnd < valueStart) {
        // A genuine entry for this column always yields valueEnd >= valueStart (the difference is
        // the value's length). A shorter key only shares the scanned prefix by coincidence, which
        // can happen when column names contain the delimiter byte; skip it instead of letting
        // Arrays.copyOfRange fail on an inverted range.
        continue;
      }
      byte[] columnValue = Arrays.copyOfRange(indexKey, valueStart, valueEnd);

      // Filter on the value taken from the index key. This is a sufficient check as long as
      // column names don't contain the null byte.
      if (matches(columnValue)) {
        Row row = table.get(rowkey);
        // If a column has null byte (the key delimiter) in it, then we need to check against the data row's column
        // to be sure this row isn't a false positive in the scan.
        // For reference, take a look at IndexedTableTest#testIndexKeyDelimiterAmbiguity
        if (hasColumnWithDelimiter && !Bytes.equals(row.get(column), columnValue)) {
          continue;
        }
        return row;
      }
    }
    // end of index
    return null;
  }

  /**
   * Closes the underlying scanner over the index table.
   */
  @Override
  public void close() {
    baseScanner.close();
  }
}
// index scanner that accepts only index entries whose value equals one fixed lookup value
private class IndexScanner extends AbstractIndexScanner {
  // the value an index entry must carry to be accepted
  private final byte[] expectedValue;

  public IndexScanner(Scanner baseScanner, byte[] column, byte[] value) {
    super(baseScanner, column);
    this.expectedValue = value;
  }

  @Override
  protected boolean matches(byte[] columnValue) {
    boolean sameValue = Bytes.equals(columnValue, expectedValue);
    return sameValue;
  }
}
// index scanner that accepts index entries whose value lies in [lowerBound, upperBound);
// a null bound leaves that side of the range open
private class IndexRangeScanner extends AbstractIndexScanner {
  private final byte[] lowerBound;
  private final byte[] upperBound;

  public IndexRangeScanner(Scanner baseScanner, byte[] column, @Nullable byte[] start, @Nullable byte[] end) {
    super(baseScanner, column);
    this.lowerBound = start;
    this.upperBound = end;
  }

  @Override
  protected boolean matches(byte[] columnValue) {
    // below the inclusive lower bound: reject
    if (lowerBound != null && Bytes.compareTo(columnValue, lowerBound) < 0) {
      return false;
    }
    // accept when there is no upper bound, or the value is strictly below it
    return upperBound == null || Bytes.compareTo(columnValue, upperBound) < 0;
  }
}
}