/*
* Copyright © 2014-2016 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.data2.dataset2.lib.table;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.data.batch.Split;
import co.cask.cdap.api.data.schema.Schema;
import co.cask.cdap.api.dataset.DataSetException;
import co.cask.cdap.api.dataset.metrics.MeteredDataset;
import co.cask.cdap.api.dataset.table.ConflictDetection;
import co.cask.cdap.api.dataset.table.Filter;
import co.cask.cdap.api.dataset.table.Get;
import co.cask.cdap.api.dataset.table.Result;
import co.cask.cdap.api.dataset.table.Row;
import co.cask.cdap.api.dataset.table.Scan;
import co.cask.cdap.api.dataset.table.Scanner;
import co.cask.cdap.api.dataset.table.TableSplit;
import co.cask.cdap.api.metrics.MetricsCollector;
import co.cask.cdap.common.conf.Constants;
import co.cask.tephra.Transaction;
import co.cask.tephra.TransactionAware;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import javax.annotation.Nullable;
/**
* An abstract {@link TransactionAware} implementation of {@link co.cask.cdap.api.dataset.table.Table} which
* keeps data in memory buffer until transaction commits.
* <p>
* Subclasses should implement methods which deal with persistent store. This implementation merges data from persistent
* store and in-memory buffer for read/write operations.
* NOTE: this implementation does not allow storing null as a value of a column
* NOTE: data fetched from persisted store should never have nulls in column values: this class doesn't check that and
* they could be exposed to user as nulls. At the same time since null value is not allowed by this implementation
* this could lead to un-expected results
* <p>
* This implementation assumes that the table has name and conflicts are resolved on row level.
* <p>
* NOTE: this implementation doesn't cache any data in-memory besides changes. I.e. if you do get of same data that is
* not in in-memory buffer twice, two times it will try to fetch it from persistent store.
* Given the snapshot isolation tx model, this can be improved in future implementations.
* <p>
* NOTE: current implementation persists changes only at the end of transaction. Beware of OOME. There should be better
* implementation for MapReduce case (YMMV though, for counters/aggregations this implementation looks sweet)
* <p>
* NOTE: Using {@link #get(byte[], byte[], byte[], int)} is generally inefficient since it always hits the
* persisted store even if all needed data is in the in-memory buffer. See the method's javadoc for more info.
*/
// todo: copying passed params to write methods may be done more efficiently: no need to copy when no changes are made
public abstract class BufferingTable extends AbstractTable implements MeteredDataset {
// logger used for diagnostics emitted by this class
private static final Logger LOG = LoggerFactory.getLogger(BufferingTable.class);
// zero-length value that wrapDeleteIfNeeded/unwrapDeleteIfNeeded use to represent a deleted (null) column value
protected static final byte[] DELETE_MARKER = new byte[0];
// name of the table
private final String name;
// conflict detection level; selects which change keys getTxChanges() reports
private final ConflictDetection conflictLevel;
// name length + name of the table: handy to have one cached
private final byte[] nameAsTxChangePrefix;
// Whether read-less increments should be used when increment() is called
private final boolean enableReadlessIncrements;
// In-memory buffer that keeps not yet persisted data. It is a row->(column->update) map. A column delete is
// buffered as a PutValue wrapping a null byte array (see delete(byte[], byte[][]) and putInternal).
// Set to null by close().
private NavigableMap<byte[], NavigableMap<byte[], Update>> buff;
// The buffer that commitTx() handed to persist(); kept so that rollbackTx() can undo it.
// Null when there is nothing to undo.
private NavigableMap<byte[], NavigableMap<byte[], Update>> toUndo;
// Receiver of data ops metrics; while it is null, reportRead/reportWrite do nothing
private MetricsCollector metricsCollector;
/**
* Creates an instance of {@link BufferingTable} with row level conflict detection ({@link ConflictDetection#ROW}),
* without readless increments, and no schema.
*
* @param name the name of the table
*/
public BufferingTable(String name) {
this(name, ConflictDetection.ROW);
}
/**
* Creates an instance of {@link BufferingTable} with readless increments disabled, no schema and no row field name.
*
* @param name the name of the table
* @param level the conflict detection level
*/
public BufferingTable(String name, ConflictDetection level) {
this(name, level, false, null, null);
}
/**
* Creates an instance of {@link BufferingTable}.
*
* @param name the name of the table
* @param level the conflict detection level
* @param enableReadlessIncrements whether or not readless increments are enabled
* @param schema the schema of the table, or null if there is no schema
* @param rowFieldName the name of the schema field that the row key maps to, or null if there is none
*/
public BufferingTable(String name, ConflictDetection level, boolean enableReadlessIncrements,
@Nullable Schema schema, @Nullable String rowFieldName) {
super(schema, rowFieldName);
// for optimization purposes we don't allow table name of length greater than Byte.MAX_VALUE
Preconditions.checkArgument(name.length() < Byte.MAX_VALUE,
"Too big table name: " + name + ", exceeds " + Byte.MAX_VALUE);
this.name = name;
this.conflictLevel = level;
this.enableReadlessIncrements = enableReadlessIncrements;
// TODO: having central dataset management service will allow us to use table ids instead of names, which will
// reduce changeset size transferred to/from server
// we want it to be of format length+value to avoid conflicts like table="ab", row="cd" vs table="abc", row="d"
// Default uses the above scheme. Subclasses can change it by overriding the #getNameAsTxChangePrefix method
this.nameAsTxChangePrefix = Bytes.add(new byte[]{(byte) name.length()}, Bytes.toBytes(name));
this.buff = new ConcurrentSkipListMap<>(Bytes.BYTES_COMPARATOR);
}
/**
* Returns the table name that was passed to the constructor.
*
* @return name of this table
*/
public String getTableName() {
return name;
}
/**
* Builds a display name of the form {@code SimpleClassName(table = tableName)}; it is also used in the log and
* error messages of this class.
*/
@Override
public String getTransactionAwareName() {
return getClass().getSimpleName() + "(table = " + name + ")";
}
/**
* Returns the byte array used as the transaction change prefix. The default is the value cached by the
* constructor: one byte holding the name length followed by the name bytes.
* Allows implementations to override it so the change prefix more closely represents the underlying storage.
*
* @return transaction change prefix
*/
public byte[] getNameAsTxChangePrefix() {
return this.nameAsTxChangePrefix;
}
/**
* Persists in-memory buffer. After this method returns we assume that data can be visible to other table clients
* (of course other clients may choose still not to see it based on transaction isolation logic).
* Called by {@link #commitTx()} when the buffer is not empty.
* @param buff in-memory buffer to persist. Map is described as row->(column->update). A column delete is buffered
*             as a put of a null value.
* @throws Exception if the changes could not be persisted (failure modes are defined by the implementation)
*/
protected abstract void persist(NavigableMap<byte[], NavigableMap<byte[], Update>> buff)
throws Exception;
/**
* Undoes previously persisted changes. After this method returns we assume that data can be visible to other table
* clients (of course other clients may choose still not to see it based on transaction isolation logic).
* Called by {@link #rollbackTx()} with the buffer that was handed to {@link #persist(NavigableMap)}.
* @param persisted previously persisted changes. Map is described as row->(column->update). A column delete is
*                  buffered as a put of a null value.
* @throws Exception if the changes could not be undone (failure modes are defined by the implementation)
*/
protected abstract void undo(NavigableMap<byte[], NavigableMap<byte[], Update>> persisted)
throws Exception;
/**
* Fetches column->value pairs for a set of columns from persistent store.
* NOTE: persisted store can also be in-memory, it is called "persisted" to distinguish from in-memory buffer.
* @param row row key defines the row to fetch columns from
* @param columns set of columns to fetch. null means fetch everything; an empty array means fetch nothing.
* @return map of column->value pairs, never null.
* @throws Exception if the read fails (failure modes are defined by the implementation)
*/
protected abstract NavigableMap<byte[], byte[]> getPersisted(byte[] row, @Nullable byte[][] columns)
throws Exception;
/**
* Fetches column->value pairs for a range of columns from persistent store.
* NOTE: persisted store can also be in-memory, it is called "persisted" to distinguish from in-memory buffer.
* NOTE: Using this method is generally inefficient since it always hits the
* persisted store even if all needed data is in the in-memory buffer. Since the column set is not strictly defined,
* the implementation always looks up more columns in the persistent store.
* @param row row key defines the row to fetch columns from
* @param startColumn first column in a range, inclusive
* @param stopColumn last column in a range, exclusive
* @param limit max number of columns to fetch
* @return map of column->value pairs, never null.
* @throws Exception if the read fails (failure modes are defined by the implementation)
*/
protected abstract NavigableMap<byte[], byte[]> getPersisted(byte[] row,
byte[] startColumn, byte[] stopColumn,
int limit)
throws Exception;
/**
* Scans a range of rows from persistent store for a given {@link Scan}.
* NOTE: persisted store can also be in-memory, it is called "persisted" to distinguish from in-memory buffer.
* The returned scanner is wrapped by {@link #scan(Scan)}, which overlays the buffered changes on top of it.
* @param scan scan configuration
* @return instance of {@link Scanner}, never null
* @throws Exception if the scan cannot be started (failure modes are defined by the implementation)
*/
protected abstract Scanner scanPersisted(Scan scan) throws Exception;
/**
 * Fetches a list of rows from persistent store, one lookup per {@link Get}. This default implementation simply
 * calls {@link #getPersisted(byte[], byte[][])} for every get; subclasses able to batch several gets into a
 * single request should override it.
 * NOTE: persisted store can also be in-memory, it is called "persisted" to distinguish from in-memory buffer.
 *
 * @param gets list of gets to perform
 * @return list of rows, one for each get, in the same order as the gets
 * @throws Exception if any of the underlying reads fails
 */
protected List<Map<byte[], byte[]>> getPersisted(List<Get> gets) throws Exception {
  List<Map<byte[], byte[]>> rows = Lists.newArrayListWithCapacity(gets.size());
  for (Get request : gets) {
    // a get without an explicit column list asks for the whole row, which is expressed as null columns
    byte[][] requestedColumns = null;
    List<byte[]> columnList = request.getColumns();
    if (columnList != null) {
      requestedColumns = columnList.toArray(new byte[columnList.size()][]);
    }
    rows.add(getPersisted(request.getRow(), requestedColumns));
  }
  return rows;
}
/**
* Sets the collector that read and write operation metrics are reported to.
*
* @param metricsCollector the collector to report to
*/
@Override
public void setMetricsCollector(MetricsCollector metricsCollector) {
this.metricsCollector = metricsCollector;
}
/**
* Releases the in-memory buffer and the undo log by dropping the references to them. After this call
* {@link #startTx(Transaction)} fails with an {@link IllegalStateException}.
*/
@Override
public void close() throws IOException {
// releasing resources
buff = null;
toUndo = null;
}
/**
 * Begins a transaction with an empty buffer and no pending undo log.
 *
 * @param tx the transaction being started
 * @throws IllegalStateException if this dataset has already been closed
 */
@Override
public void startTx(Transaction tx) {
  if (buff != null) {
    // starting with fresh buffer when tx starts
    buff.clear();
    toUndo = null;
    return;
  }
  // the buffer is only null after close()
  String msg = "Attempted to use closed dataset " + getTransactionAwareName();
  LOG.error(msg);
  throw new IllegalStateException(msg);
}
/**
* Always fails: transaction checkpoints are not supported by this implementation.
*
* @param transaction the updated transaction (ignored)
* @throws UnsupportedOperationException always
*/
@Override
public void updateTx(Transaction transaction) {
// TODO: transaction checkpoints are not yet supported
// This is safe, since this should only be called by TransactionContext.checkpoint(),
// which is not exposed through the application APIs.
//
// Supporting transaction checkpoints will require:
// 1. providing some application API that interacts with TransactionContext.checkpoint()
// 2. keying the buffered edits by timestamp, so that buffered writes at different
// checkpoint timestamps can be correctly ordered during merge.
// 3. keying toUndo by timestamp, so that persisted changes can be rolled back using the
// correct timestamp
throw new UnsupportedOperationException("Transaction checkpoints are not supported");
}
/**
 * Returns the change keys of the current transaction at the granularity selected by the conflict detection
 * level: nothing for NONE, one key per changed row for ROW, one key per changed column for COLUMN.
 *
 * @return the change keys used for conflict detection
 */
@Override
public Collection<byte[]> getTxChanges() {
  Collection<byte[]> changes;
  switch (conflictLevel) {
    case ROW:
      changes = getRowChanges();
      break;
    case COLUMN:
      changes = getColumnChanges();
      break;
    case NONE:
      changes = Collections.emptyList();
      break;
    default:
      throw new RuntimeException("Unknown conflict detection level: " + conflictLevel);
  }
  return changes;
}
/**
 * Builds one change key per buffered row: the tx change prefix followed by the row key.
 */
private Collection<byte[]> getRowChanges() {
  // conflicts are resolved on the row level of an individual table
  List<byte[]> rowKeys = new ArrayList<>(buff.size());
  for (Map.Entry<byte[], NavigableMap<byte[], Update>> bufferedRow : buff.entrySet()) {
    byte[] changeKey = Bytes.add(getNameAsTxChangePrefix(), bufferedRow.getKey());
    rowKeys.add(changeKey);
  }
  return rowKeys;
}
/**
 * Builds one change key per buffered column: the tx change prefix, then the row key length and the row key,
 * then the column name.
 */
private Collection<byte[]> getColumnChanges() {
  // conflicts are resolved on the column level of an individual table
  List<byte[]> columnKeys = new ArrayList<>(buff.size());
  for (Map.Entry<byte[], NavigableMap<byte[], Update>> bufferedRow : buff.entrySet()) {
    NavigableMap<byte[], Update> columnUpdates = bufferedRow.getValue();
    // NOTE: as of now we cannot detect conflict between delete whole row and row's column value change.
    //       this is not a big problem as of now, as row deletion is now act as deletion of every column, but this
    //       will change in future, so we will have to address the issue.
    if (columnUpdates != null) {
      byte[] rowKey = bufferedRow.getKey();
      // using length + value format to prevent conflicts like row="ab", column="cd" vs row="abc", column="d"
      byte[] rowTxChange = Bytes.add(Bytes.toBytes(rowKey.length), rowKey);
      for (byte[] column : columnUpdates.keySet()) {
        columnKeys.add(Bytes.add(getNameAsTxChangePrefix(), rowTxChange, column));
      }
    }
  }
  return columnKeys;
}
/**
 * Persists the buffered changes, if there are any. The buffer becomes the undo log before persisting starts and
 * is replaced by a fresh empty buffer.
 *
 * @return always true
 * @throws Exception if persisting fails; the undo log then still holds everything that was attempted
 */
@Override
public boolean commitTx() throws Exception {
  if (buff.isEmpty()) {
    // nothing was changed: the same (empty) buffer instance is re-used by the next tx
    return true;
  }
  // We first assume that all data will be persisted. So that if exception happen during persist we try to
  // rollback everything we had in in-memory buffer.
  toUndo = buff;
  // clearing up in-memory buffer by initializing new map.
  // NOTE: we could cache two maps and swap them to avoid creation of map instances, but code would be ugly
  buff = new ConcurrentSkipListMap<>(Bytes.BYTES_COMPARATOR);
  // TODO: tracking of persisted items can be optimized by returning a pair {succeededOrNot, persisted} which
  //       tells if persisting succeeded and what was persisted (i.e. what we will have to undo in case of rollback)
  persist(toUndo);
  return true;
}
/**
* Clears the buffer and drops the undo log once the transaction has been committed.
*/
@Override
public void postTxCommit() {
// don't need buffer anymore: tx has been committed
buff.clear();
toUndo = null;
}
/**
 * Discards the buffered changes and undoes whatever {@link #commitTx()} handed to the persistent store.
 *
 * @return always true
 * @throws Exception if undoing fails; the undo log is kept in that case
 */
@Override
public boolean rollbackTx() throws Exception {
  buff.clear();
  NavigableMap<byte[], NavigableMap<byte[], Update>> persistedChanges = toUndo;
  if (persistedChanges == null) {
    // commitTx() never got to persisting anything
    return true;
  }
  undo(persistedChanges);
  toUndo = null;
  return true;
}
/**
 * Reads all columns of a row, with the buffered changes applied on top of the persisted values.
 * NOTE: Depending on the use-case, calling this method may be much less efficient than calling the same method
 * with columns as parameters, because it always requires a round trip to the persistent store.
 *
 * @param row the row to read
 * @return the row with its current columns
 * @throws DataSetException if the read fails
 */
@Override
public Row get(byte[] row) {
  reportRead(1);
  try {
    Map<byte[], byte[]> rowColumns = getRowMap(row);
    return new Result(row, rowColumns);
  } catch (Exception failure) {
    String details = "get failed for table: " + getTransactionAwareName() + ", row: " + Bytes.toStringBinary(row);
    LOG.debug(details, failure);
    throw new DataSetException("get failed", failure);
  }
}
/**
 * Reads the given columns of a row, with the buffered changes applied on top of the persisted values.
 *
 * @param row the row to read
 * @param columns the columns to read
 * @return the row with the requested columns that currently exist
 * @throws DataSetException if the read fails
 */
@Override
public Row get(byte[] row, byte[][] columns) {
  reportRead(1);
  try {
    Map<byte[], byte[]> rowColumns = getRowMap(row, columns);
    return new Result(row, rowColumns);
  } catch (Exception failure) {
    String details = "get failed for table: " + getTransactionAwareName() + ", row: " + Bytes.toStringBinary(row);
    LOG.debug(details, failure);
    throw new DataSetException("get failed", failure);
  }
}
/**
 * Reads a range of columns of a row, with the buffered changes applied on top of the persisted values.
 * NOTE: since the exact column set is not known up front, this method always goes to the persistent store,
 * even if all needed data is in the in-memory buffer.
 *
 * @param row the row to read
 * @param startColumn first column of the range, inclusive; null means no lower bound
 * @param stopColumn end of the range, exclusive; null means no upper bound
 * @param limit max number of columns to return; values less than or equal to zero mean no limit
 * @return the row with the columns of the range that currently exist; columns deleted inside this transaction
 *         are not included
 * @throws DataSetException if the read fails
 */
@Override
public Row get(byte[] row, byte[] startColumn, byte[] stopColumn, int limit) {
  reportRead(1);
  // changes made to this row inside the current tx, if any
  NavigableMap<byte[], Update> buffCols = buff.get(row);

  // NOTE: since we cannot tell the exact column set, we always have to go to persisted store.
  //       potential improvement: do not fetch columns available in in-mem buffer (we know them at this point)
  try {
    Map<byte[], byte[]> persistedCols = getPersisted(row, startColumn, stopColumn, limit);

    // adding server cols, and then overriding with buffered values
    NavigableMap<byte[], byte[]> result = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    if (persistedCols != null) {
      result.putAll(persistedCols);
    }

    if (buffCols != null) {
      buffCols = getRange(buffCols, startColumn, stopColumn, limit);
      mergeToPersisted(result, buffCols, null);
    }

    // Buffered deletes are merged in as null-valued columns (a delete is a put of null), so they have to be
    // stripped before the result is handed out, exactly like the other get methods do. Stripping happens before
    // the limit is applied so that deleted columns do not take up slots of the limit.
    return new Result(row, head(unwrapDeletes(result), limit));
  } catch (Exception e) {
    LOG.debug("get failed for table: " + getTransactionAwareName() + ", row: " + Bytes.toStringBinary(row), e);
    throw new DataSetException("get failed", e);
  }
}
/**
 * Performs several gets at once: fetches the persisted rows via {@link #getPersisted(List)} and applies the
 * buffered changes of each row on top of them.
 *
 * @param gets the gets to perform
 * @return one row per get, in the same order as the gets
 * @throws DataSetException if the read fails or the persisted store returns a wrong number of rows
 */
@Override
public List<Row> get(List<Get> gets) {
  try {
    // get persisted, then overwrite with whats buffered
    List<Map<byte[], byte[]>> persistedRows = getPersisted(gets);
    // gets and rows lists are always of the same size
    Preconditions.checkArgument(gets.size() == persistedRows.size(),
      "Invalid number of rows fetched when performing multi-get. There must be one row for each get.");

    List<Row> rows = Lists.newArrayListWithCapacity(persistedRows.size());
    Iterator<Get> remainingGets = gets.iterator();
    for (Map<byte[], byte[]> persistedRow : persistedRows) {
      if (!remainingGets.hasNext()) {
        break;
      }
      Get request = remainingGets.next();

      // navigable copy of the persisted data. Implementation may return immutable or unmodifiable maps,
      // so we make a copy here.
      NavigableMap<byte[], byte[]> merged = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
      merged.putAll(persistedRow);

      byte[] rowKey = request.getRow();
      NavigableMap<byte[], Update> bufferedColumns = buff.get(rowKey);
      if (bufferedColumns != null) {
        // merge what was in the buffer and what was persisted; null columns mean "all buffered columns"
        byte[][] requestedColumns = null;
        List<byte[]> columnList = request.getColumns();
        if (columnList != null) {
          requestedColumns = columnList.toArray(new byte[columnList.size()][]);
        }
        mergeToPersisted(merged, bufferedColumns, requestedColumns);
      }
      rows.add(new Result(rowKey, unwrapDeletes(merged)));
    }
    return rows;
  } catch (Exception failure) {
    LOG.debug("multi-get failed for table: " + getTransactionAwareName(), failure);
    throw new DataSetException("multi-get failed", failure);
  }
}
/**
* Buffers a write of the given values to the given columns of a row and reports the write metrics.
* NOTE: if a value is null, the corresponding column is deleted. It will not be in the result set when reading.
*
* Also see {@link co.cask.cdap.api.dataset.table.Table#put(byte[], byte[][], byte[][])}.
*
* @param row the row to write to
* @param columns the columns to write
* @param values the values to write, one per column
*/
@Override
public void put(byte[] row, byte[][] columns, byte[][] values) {
putInternal(row, columns, values);
// report metrics _after_ write was performed
reportWrite(1, getSize(row) + getSize(columns) + getSize(values));
}
/**
 * Buffers a put of every given value without reporting any metrics. Row, column and value arrays are copied
 * so that later changes made to them by the client cannot affect the buffer. A null value is buffered as a
 * delete of the column; an empty value triggers a warning.
 */
private void putInternal(byte[] row, byte[][] columns, byte[][] values) {
  NavigableMap<byte[], Update> rowUpdates = buff.get(row);
  boolean rowIsNew = rowUpdates == null;
  if (rowIsNew) {
    rowUpdates = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
  }

  for (int idx = 0; idx < columns.length; idx++) {
    byte[] value = values[idx];
    if (value != null && value.length == 0) {
      warnAboutEmptyValue(columns[idx]);
    }
    // NOTE: we copy passed column's and value's byte arrays to protect buffer against possible changes of these
    //       arrays on client
    rowUpdates.put(copy(columns[idx]), new PutValue(copy(value)));
  }

  if (rowIsNew) {
    // NOTE: we copy passed row's byte arrays to protect buffer against possible changes of this array on client
    buff.put(copy(row), rowUpdates);
  }
}
/**
 * Deletes all columns of a row that currently exist.
 * NOTE: Depending on the use-case, calling this method may be much less efficient than calling same method
 * with columns as parameters because it will require a round trip to persistent store.
 *
 * @param row the row to delete
 * @throws DataSetException if reading the current columns of the row fails
 */
@Override
public void delete(byte[] row) {
  // this is going to be expensive, but the only we can do as delete implementation act on per-column level
  try {
    Map<byte[], byte[]> rowMap = getRowMap(row);
    if (rowMap.isEmpty()) {
      // Nothing to delete, but the operation itself is still counted.
      // "0" because we don't know what gets deleted
      reportWrite(1, 0);
      return;
    }
    // delete(row, columns) reports the write itself; reporting it here as well would count this single
    // operation twice
    delete(row, rowMap.keySet().toArray(new byte[rowMap.keySet().size()][]));
  } catch (Exception e) {
    LOG.debug("delete failed for table: " + getTransactionAwareName() + ", row: " + Bytes.toStringBinary(row), e);
    throw new DataSetException("delete failed", e);
  }
}
/**
 * Deletes the given columns of a row by buffering a null value for each of them.
 *
 * @param row the row to delete from
 * @param columns the columns to delete; null deletes the whole row, an empty array deletes nothing
 */
@Override
public void delete(byte[] row, byte[][] columns) {
  if (columns == null) {
    delete(row);
  } else if (columns.length > 0) {
    // same as writing null for every column: a freshly allocated array of arrays holds only nulls
    // ANDREAS: shouldn't this be DELETE_MARKER?
    byte[][] nullValues = new byte[columns.length][];
    putInternal(row, columns, nullValues);
    // "0" because we don't know what gets deleted
    reportWrite(1, 0);
  }
  // an empty columns list deletes nothing and reports nothing
}
/**
 * Increments the given columns of a row by the given amounts and returns the new values. The current values are
 * read first (a missing column counts as zero) and the incremented values are then buffered as regular puts.
 *
 * @param row the row to increment
 * @param columns the columns to increment
 * @param amounts the amounts to add, one per column
 * @return the row holding the new value of every incremented column
 * @throws DataSetException if reading the current values fails
 * @throws NumberFormatException if a current value is not exactly as long as a long
 */
@Override
public Row incrementAndGet(byte[] row, byte[][] columns, long[] amounts) {
  // NOTE: there is more efficient way to do it, but for now we want more simple implementation, not over-optimizing
  // step 1: fetch the current values
  Map<byte[], byte[]> currentValues;
  try {
    currentValues = getRowMap(row, columns);
    reportRead(1);
  } catch (Exception failure) {
    LOG.debug("incrementAndGet failed for table: " + getTransactionAwareName() +
                ", row: " + Bytes.toStringBinary(row), failure);
    throw new DataSetException("incrementAndGet failed", failure);
  }

  // step 2: compute the new values
  NavigableMap<byte[], byte[]> incremented = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
  byte[][] newValues = new byte[columns.length][];
  for (int idx = 0; idx < columns.length; idx++) {
    byte[] column = columns[idx];
    byte[] current = currentValues.get(column);
    long base = 0L;
    if (current != null) {
      if (current.length != Bytes.SIZEOF_LONG) {
        throw new NumberFormatException("Attempted to increment a value that is not convertible to long," +
                                          " row: " + Bytes.toStringBinary(row) +
                                          " column: " + Bytes.toStringBinary(column));
      }
      base = Bytes.toLong(current);
    }
    byte[] encoded = Bytes.toBytes(base + amounts[idx]);
    newValues[idx] = encoded;
    incremented.put(column, encoded);
  }

  // step 3: buffer the new values and hand them back
  putInternal(row, columns, newValues);
  reportWrite(1, getSize(row) + getSize(columns) + getSize(amounts));
  return new Result(row, incremented);
}
/**
 * Increments the given columns of a row by the given amounts. With readless increments enabled the increments
 * are only buffered (merged with whatever update is already buffered for the column) and nothing is read;
 * otherwise this delegates to {@link #incrementAndGet(byte[], byte[][], long[])}.
 *
 * @param row the row to increment
 * @param columns the columns to increment
 * @param amounts the amounts to add, one per column
 */
@Override
public void increment(byte[] row, byte[][] columns, long[] amounts) {
  if (!enableReadlessIncrements) {
    incrementAndGet(row, columns, amounts);
    return;
  }

  NavigableMap<byte[], Update> colVals = buff.get(row);
  if (colVals == null) {
    colVals = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
    // NOTE: the row key is copied, same as in putInternal, to protect the sorted buffer against possible
    //       changes of the passed array on client
    buff.put(copy(row), colVals);
  }
  for (int i = 0; i < columns.length; i++) {
    Update merged = Updates.mergeUpdates(colVals.get(columns[i]), new IncrementValue(amounts[i]));
    // NOTE: the column key is copied for the same reason as the row key
    colVals.put(copy(columns[i]), merged);
  }
  reportWrite(1, getSize(row) + getSize(columns) + getSize(amounts));
}
/**
 * Replaces the value of a column with a new value if its current value equals the expected one. The current
 * value includes the changes buffered inside this transaction.
 *
 * @param row the row of the column
 * @param column the column to swap
 * @param expectedValue the value the column must currently have; null means the column must not exist
 * @param newValue the value to write; null deletes the column
 * @return true if the current value matched and the new value was buffered, false otherwise
 * @throws DataSetException if reading the current value or buffering the new one fails
 */
@Override
public boolean compareAndSwap(byte[] row, byte[] column, byte[] expectedValue, byte[] newValue) {
  // TODO: add support for empty values; see https://issues.cask.co/browse/TEPHRA-45 for details.
  boolean emptyNewValue = newValue != null && newValue.length == 0;
  if (emptyNewValue) {
    warnAboutEmptyValue(column);
  }
  // NOTE: there is more efficient way to do it, but for now we want more simple implementation, not over-optimizing
  byte[][] columns = new byte[][]{column};
  try {
    byte[] currentValue = getRowMap(row, columns).get(column);
    reportRead(1);
    if (!Arrays.equals(expectedValue, currentValue)) {
      return false;
    }
    putInternal(row, columns, new byte[][]{newValue});
    reportWrite(1, getSize(row) + getSize(column) + getSize(newValue));
    return true;
  } catch (Exception failure) {
    LOG.debug("compareAndSwap failed for table: " + getTransactionAwareName() +
                ", row: " + Bytes.toStringBinary(row), failure);
    throw new DataSetException("compareAndSwap failed", failure);
  }
}
/**
 * Fallback implementation of getSplits based on {@link SplitsUtil#primitiveGetSplits(int, byte[], byte[])}:
 * every key range it produces is exposed as a {@link TableSplit}. Ideally should be overridden by subclasses.
 *
 * @param numSplits Desired number of splits. If greater than zero, at most this many splits will be returned.
 *                  If less or equal to zero, any number of splits can be returned.
 * @param start If non-null, the returned splits will only cover keys that are greater or equal.
 * @param stop If non-null, the returned splits will only cover keys that are less.
 * @return list of {@link Split}
 */
@Override
public List<Split> getSplits(int numSplits, byte[] start, byte[] stop) {
  List<KeyRange> keyRanges = SplitsUtil.primitiveGetSplits(numSplits, start, stop);
  // converts a key range into a split with the same bounds; a null range becomes an unbounded split
  Function<KeyRange, Split> rangeToSplit = new Function<KeyRange, Split>() {
    @Nullable
    @Override
    public Split apply(@Nullable KeyRange input) {
      return new TableSplit(input == null ? null : input.getStart(),
                            input == null ? null : input.getStop());
    }
  };
  return Lists.transform(keyRanges, rangeToSplit);
}
/**
* Scans the rows between the given row keys by delegating to {@link #scan(Scan)} with a {@link Scan} built from
* them.
*
* @param startRow start of the row range
* @param stopRow end of the row range
* @return a scanner over the buffered and persisted rows of the range
*/
@Override
public Scanner scan(byte[] startRow, byte[] stopRow) {
return scan(new Scan(startRow, stopRow));
}
/**
 * Scans the table for the given scan configuration. The returned scanner overlays the matching rows of the
 * in-memory buffer on top of the rows coming from the persistent store.
 *
 * @param scan scan configuration
 * @return a scanner over the buffered and persisted rows
 * @throws DataSetException if the scan cannot be started
 */
@Override
public Scanner scan(Scan scan) {
  NavigableMap<byte[], NavigableMap<byte[], Update>> bufferedRows = scanBuffer(scan);
  try {
    Scanner persistedRows = scanPersisted(scan);
    return new BufferingScanner(bufferedRows, persistedRows);
  } catch (Exception failure) {
    LOG.debug("scan failed for table: " + getTransactionAwareName() +
                ", scan: " + scan.toString(), failure);
    throw new DataSetException("scan failed", failure);
  }
}
/**
 * Selects the buffered rows covered by a scan: the rows between the start row (inclusive) and the stop row
 * (exclusive) that pass the filter of the scan. A null bound leaves that side of the range open.
 */
private NavigableMap<byte[], NavigableMap<byte[], Update>> scanBuffer(Scan scan) {
  byte[] from = scan.getStartRow();
  byte[] to = scan.getStopRow();

  NavigableMap<byte[], NavigableMap<byte[], Update>> rowsInRange;
  if (from != null && to != null) {
    rowsInRange = buff.subMap(from, true, to, false);
  } else if (from != null) {
    rowsInRange = buff.tailMap(from, true);
  } else if (to != null) {
    rowsInRange = buff.headMap(to, false);
  } else {
    rowsInRange = buff;
  }
  return applyFilter(rowsInRange, scan.getFilter());
}
/**
 * Applies a row filter to buffered rows. Without a filter the given map is returned as is; with a
 * {@link FuzzyRowFilter} a new map holding only the included rows is returned.
 *
 * @throws DataSetException if the filter is of any other type
 */
private NavigableMap<byte[], NavigableMap<byte[], Update>> applyFilter(
  NavigableMap<byte[], NavigableMap<byte[], Update>> bufferMap,
  @Nullable Filter filter) {

  if (filter == null) {
    return bufferMap;
  }
  // todo: currently we support only FuzzyRowFilter as an experimental feature
  if (!(filter instanceof FuzzyRowFilter)) {
    throw new DataSetException("Unknown filter type: " + filter);
  }

  FuzzyRowFilter fuzzyFilter = (FuzzyRowFilter) filter;
  NavigableMap<byte[], NavigableMap<byte[], Update>> included = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
  for (Map.Entry<byte[], NavigableMap<byte[], Update>> bufferedRow : bufferMap.entrySet()) {
    byte[] rowKey = bufferedRow.getKey();
    if (fuzzyFilter.filterRow(rowKey) == FuzzyRowFilter.ReturnCode.INCLUDE) {
      included.put(rowKey, bufferedRow.getValue());
    }
  }
  return included;
}
/**
 * Reads all columns of a row: everything the persistent store has for the row, with the buffered changes applied
 * on top and the deleted columns removed.
 */
private Map<byte[], byte[]> getRowMap(byte[] row) throws Exception {
  NavigableMap<byte[], Update> bufferedColumns = buff.get(row);
  // a row that is present in the buffer without any column map is treated as deleted inside this tx
  if (bufferedColumns == null && buff.containsKey(row)) {
    return Collections.emptyMap();
  }

  NavigableMap<byte[], byte[]> merged = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
  merged.putAll(getPersisted(row, null));
  if (bufferedColumns != null) {
    // buffered should override those returned from persistent store
    mergeToPersisted(merged, bufferedColumns, null);
  }
  return unwrapDeletes(merged);
}
/**
 * Reads the given columns of a row. Columns that have a buffered put or delete are served from the buffer;
 * all other requested columns, including those with a buffered increment, are fetched from the persistent
 * store and the buffered changes are applied on top.
 *
 * @param row the row to read
 * @param columns the columns to read; null means all columns, same as for {@link #getPersisted(byte[], byte[][])}
 * @return map of column->value pairs of the requested columns that currently exist
 * @throws Exception if reading from the persistent store fails
 */
private Map<byte[], byte[]> getRowMap(byte[] row, byte[][] columns) throws Exception {
  NavigableMap<byte[], Update> buffCols = buff.get(row);

  // if nothing locally, return all from server
  if (buffCols == null) {
    return getPersisted(row, columns);
  }

  // Null columns stand for "all columns". Without this check a buffered row would fail with a
  // NullPointerException below, while a non-buffered row is read fine by the branch above.
  if (columns == null) {
    return getRowMap(row);
  }

  // otherwise try to fetch data from in-memory buffer. If not all present - fetch leftover from persisted
  List<byte[]> colsToFetchFromPersisted = Lists.newArrayList();
  for (byte[] column : columns) {
    Update val = buffCols.get(column);
    // A column needs the persisted value if it is not buffered at all, or if it has a buffered increment,
    // which has to be applied on top of the persisted value. Every buffered update is non-null (deletes are
    // puts of a null value), so a null update is checked against the key set to tell the cases apart.
    if (val instanceof IncrementValue || (val == null && !buffCols.containsKey(column))) {
      colsToFetchFromPersisted.add(column);
    }
  }

  NavigableMap<byte[], byte[]> result = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
  // fetching from server those that were not found in in-mem buffer
  if (!colsToFetchFromPersisted.isEmpty()) {
    Map<byte[], byte[]> persistedCols =
      getPersisted(row, colsToFetchFromPersisted.toArray(new byte[colsToFetchFromPersisted.size()][]));
    if (persistedCols != null) {
      result.putAll(persistedCols);
    }
  }

  // overlay buffered values on persisted, applying increments where necessary
  mergeToPersisted(result, buffCols, columns);
  return unwrapDeletes(result);
}
/**
 * Applies the buffered updates on top of the map of persisted values. The persisted map is modified in place
 * with the updated values: a buffered put replaces the value (a put of null leaves a null value in the map),
 * a buffered increment is added to the persisted value (or to zero if there is none), and a key that is
 * buffered with a null update is removed.
 *
 * @param persisted The map to modify with the buffered values.
 * @param buffered The buffered values to overlay on the persisted map.
 * @param columns The columns to apply the buffered values for; null means all buffered columns.
 */
private static void mergeToPersisted(Map<byte[], byte[]> persisted, Map<byte[], Update> buffered, byte[][] columns) {
  List<byte[]> keysToMerge;
  if (columns == null) {
    // NOTE: we want to copy key's byte array because it may be leaked to table's client and we don't want client
    //       to affect the buffer by changing it in place
    keysToMerge = Lists.newArrayListWithExpectedSize(buffered.size());
    for (byte[] bufferedKey : buffered.keySet()) {
      keysToMerge.add(copy(bufferedKey));
    }
  } else {
    keysToMerge = Arrays.asList(columns);
  }

  // overlay buffered values on persisted, applying increments where necessary
  for (byte[] key : keysToMerge) {
    Update update = buffered.get(key);
    if (update instanceof IncrementValue) {
      byte[] persistedBytes = persisted.get(key);
      long base = persistedBytes == null ? 0L : Bytes.toLong(persistedBytes);
      long incremented = base + ((IncrementValue) update).getValue();
      persisted.put(key, Bytes.toBytes(incremented));
    } else if (update instanceof PutValue) {
      // overwrite the current
      // NOTE: we want to copy value's byte array because it may be leaked to table's client and we don't want client
      //       to affect the buffer by changing it in place
      persisted.put(key, copy(((PutValue) update).getValue()));
    } else if (update == null && buffered.containsKey(key)) {
      persisted.remove(key);
    }
    // any other update type is ignored
  }
}
// utilities useful for underlying implementations

/**
 * Returns the part of a row map that lies between the start column (inclusive) and the stop column (exclusive),
 * cut down to at most {@code limit} entries by {@link #head(NavigableMap, int)}. A null bound leaves that side
 * of the range open.
 */
protected static <T> NavigableMap<byte[], T> getRange(NavigableMap<byte[], T> rowMap,
                                                      byte[] startColumn, byte[] stopColumn,
                                                      int limit) {
  NavigableMap<byte[], T> columnsInRange;
  if (startColumn != null && stopColumn != null) {
    columnsInRange = rowMap.subMap(startColumn, true, stopColumn, false);
  } else if (startColumn != null) {
    columnsInRange = rowMap.tailMap(startColumn, true);
  } else if (stopColumn != null) {
    columnsInRange = rowMap.headMap(stopColumn, false);
  } else {
    columnsInRange = rowMap;
  }
  return head(columnsInRange, limit);
}
/**
 * Returns the first {@code count} entries of a map. If {@code count} is not positive, or the map does not have
 * more than {@code count} entries, the given map itself is returned.
 */
protected static <T> NavigableMap<byte[], T> head(NavigableMap<byte[], T> map, int count) {
  if (count <= 0 || map.size() <= count) {
    return map;
  }
  // todo: is there better way to do it?
  // walk to the count-th key and cut the map right after it
  Iterator<byte[]> keys = map.keySet().iterator();
  byte[] lastToInclude = null;
  for (int taken = 0; taken < count && keys.hasNext(); taken++) {
    lastToInclude = keys.next();
  }
  return map.headMap(lastToInclude, true);
}
/**
 * Replaces a null value with {@link #DELETE_MARKER}; any other value is returned as is.
 */
protected static byte[] wrapDeleteIfNeeded(byte[] value) {
  if (value == null) {
    return DELETE_MARKER;
  }
  return value;
}
/**
 * Replaces a value that equals {@link #DELETE_MARKER} (i.e. any empty array) with null; any other value,
 * including null, is returned as is.
 */
protected static byte[] unwrapDeleteIfNeeded(byte[] value) {
  if (Arrays.equals(DELETE_MARKER, value)) {
    return null;
  }
  return value;
}
// todo: it is in-efficient to copy maps a lot, consider merging with getLatest methods
/**
 * Removes the deleted columns from every row via {@link #unwrapDeletes(NavigableMap)} and drops the rows that
 * have no columns left. The result is a new map.
 */
protected static NavigableMap<byte[], NavigableMap<byte[], byte[]>> unwrapDeletesForRows(
  NavigableMap<byte[], NavigableMap<byte[], byte[]>> rows) {

  NavigableMap<byte[], NavigableMap<byte[], byte[]>> liveRows = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
  for (Map.Entry<byte[], NavigableMap<byte[], byte[]>> rowEntry : rows.entrySet()) {
    NavigableMap<byte[], byte[]> liveColumns = unwrapDeletes(rowEntry.getValue());
    if (liveColumns.size() <= 0) {
      continue;
    }
    liveRows.put(rowEntry.getKey(), liveColumns);
  }
  return liveRows;
}
// todo: it is in-efficient to copy maps a lot, consider merging with getLatest methods
/**
 * Returns a copy of a row map without the deleted columns, i.e. without the columns whose value is null or
 * equals {@link #DELETE_MARKER}. For a null or empty row map {@code EMPTY_ROW_MAP} is returned.
 */
protected static NavigableMap<byte[], byte[]> unwrapDeletes(NavigableMap<byte[], byte[]> rowMap) {
  if (rowMap == null || rowMap.isEmpty()) {
    return EMPTY_ROW_MAP;
  }
  NavigableMap<byte[], byte[]> liveColumns = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
  for (Map.Entry<byte[], byte[]> column : rowMap.entrySet()) {
    byte[] value = unwrapDeleteIfNeeded(column.getValue());
    if (value == null) {
      // deleted column
      continue;
    }
    liveColumns.put(column.getKey(), value);
  }
  return liveColumns;
}
/**
 * Reports write operations to the metrics collector, if one is set: the write count, the written bytes and
 * the overall operation count.
 */
private void reportWrite(int numOps, int dataSize) {
  MetricsCollector collector = metricsCollector;
  if (collector == null) {
    return;
  }
  collector.increment(Constants.Metrics.Name.Dataset.WRITE_COUNT, numOps);
  collector.increment(Constants.Metrics.Name.Dataset.WRITE_BYTES, dataSize);
  collector.increment(Constants.Metrics.Name.Dataset.OP_COUNT, numOps);
}
/**
 * Reports read operations to the metrics collector, if one is set: the read count and the overall operation
 * count.
 */
private void reportRead(int numOps) {
  MetricsCollector collector = metricsCollector;
  if (collector == null) {
    return;
  }
  // todo: report amount of data being read
  collector.increment(Constants.Metrics.Name.Dataset.READ_COUNT, numOps);
  collector.increment(Constants.Metrics.Name.Dataset.OP_COUNT, numOps);
}
/**
 * Returns the number of bytes taken by the given long values.
 */
private int getSize(long[] values) {
  int count = values.length;
  return count * Bytes.SIZEOF_LONG;
}

/**
 * Returns the total length of the given arrays; null items count as zero.
 */
private static int getSize(byte[][] data) {
  int total = 0;
  for (int idx = 0; idx < data.length; idx++) {
    total += getSize(data[idx]);
  }
  return total;
}

/**
 * Returns the length of the given array, or zero if it is null.
 */
private static int getSize(byte[] item) {
  if (item == null) {
    return 0;
  }
  return item.length;
}
/**
 * Returns a copy of the given array, or null if the array is null.
 */
private static byte[] copy(byte[] bytes) {
  if (bytes == null) {
    return null;
  }
  return Arrays.copyOf(bytes, bytes.length);
}
/**
* Scanner implementation that overlays buffered data on top of already persisted data.
* It walks the keys of the buffer and the rows of the persisted scanner in parallel, always returning the row
* with the smaller key next; rows present on both sides are merged.
*/
private class BufferingScanner implements Scanner {
// buffered rows covered by the scan
private final NavigableMap<byte[], NavigableMap<byte[], Update>> buffer;
// scanner over the persisted rows covered by the scan
private final Scanner persistedScanner;
// iterator over the keys of the buffered rows
private final Iterator<byte[]> keyIter;
// next buffered row key to return, or null when the buffer is exhausted
private byte[] currentKey;
// next persisted row to return, or null when the persisted scanner is exhausted
private Row currentRow;
/**
* Positions both cursors on their first element. Note that this already reads the first row from the persisted
* scanner.
*/
private BufferingScanner(NavigableMap<byte[], NavigableMap<byte[], Update>> buffer, Scanner persistedScanner) {
this.buffer = buffer;
this.keyIter = this.buffer.keySet().iterator();
if (this.keyIter.hasNext()) {
currentKey = keyIter.next();
}
this.persistedScanner = persistedScanner;
this.currentRow = this.persistedScanner.next();
}
/**
* Returns the next row in row key order, or null when both the buffer and the persisted scanner are exhausted.
* Every returned row is reported as one read.
*/
@Nullable
@Override
public Row next() {
if (currentKey == null && currentRow == null) {
// out of rows
return null;
}
reportRead(1);
// negative: the buffered row comes first; positive: the persisted row comes first; zero: same row key
int order;
if (currentKey == null) {
// exhausted buffer is the same as persisted scan row coming first
order = 1;
} else if (currentRow == null) {
// exhausted persisted scanner is the same as buffer row coming first
order = -1;
} else {
order = Bytes.compareTo(currentKey, currentRow.getRow());
}
Row result;
if (order > 0) {
// persisted row comes first or buffer is empty
result = currentRow;
currentRow = persistedScanner.next();
} else if (order < 0) {
// buffer row comes first or persisted scanner is empty
// the buffered updates are applied on top of an empty map, as there is no persisted data for this row
Map<byte[], byte[]> persistedRow = Maps.newTreeMap(Bytes.BYTES_COMPARATOR);
mergeToPersisted(persistedRow, buffer.get(currentKey), null);
// the key is copied so that the client cannot change the key held by the buffer
result = new Result(copy(currentKey), persistedRow);
currentKey = keyIter.hasNext() ? keyIter.next() : null;
} else {
// if currentKey and currentRow are equal, merge and advance both
// NOTE: the map returned by currentRow.getColumns() is modified in place by the merge
Map<byte[], byte[]> persisted = currentRow.getColumns();
mergeToPersisted(persisted, buffer.get(currentKey), null);
result = new Result(currentRow.getRow(), persisted);
currentRow = persistedScanner.next();
currentKey = keyIter.hasNext() ? keyIter.next() : null;
}
return result;
}
/**
* Closes the persisted scanner.
*/
@Override
public void close() {
this.persistedScanner.close();
}
}
// number of warnings logged since the warn frequency was last changed
private long warnedCount = 0L;
// number of warnings requested since the last one that was actually logged
private long skippedCount = 0L;
// only every warnFrequency-th requested warning is logged
private long warnFrequency = 1L;

/**
 * Logs a warning about an attempt to write an empty value to a column. To keep the log readable the warning is
 * rate limited: only every {@code warnFrequency}-th call logs, and the frequency is doubled after every 10
 * logged warnings until it reaches 4096.
 */
private void warnAboutEmptyValue(byte[] column) {
  skippedCount++;
  if (skippedCount < warnFrequency) {
    // have not skipped often enough, skip logging this time
    return;
  }
  skippedCount = 0;

  // after every 10th of logging, double the frequency but max out at 4096
  warnedCount++;
  String additionalMessage;
  if (warnedCount >= 10 && warnFrequency < 4096) {
    warnedCount = 0;
    warnFrequency *= 2;
    additionalMessage = String.format(
      "To reduce log verbosity, this warning will now only be logged one in %d times", warnFrequency);
  } else {
    additionalMessage = "";
  }
  LOG.warn("Attempt to write an empty value to column '{}' of table '{}'. " +
             "This will result in deleting the column. {}", Bytes.toString(column), name, additionalMessage);
}
}