/*
 * Copyright © 2014 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.data2.dataset2.lib.table.leveldb;

import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.api.dataset.table.Result;
import co.cask.cdap.api.dataset.table.Row;
import co.cask.cdap.api.dataset.table.Scanner;
import co.cask.cdap.common.utils.ImmutablePair;
import co.cask.cdap.data2.dataset2.lib.table.FuzzyRowFilter;
import co.cask.tephra.Transaction;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSortedMap;
import com.google.common.collect.Maps;
import org.iq80.leveldb.DB;
import org.iq80.leveldb.DBIterator;
import org.iq80.leveldb.WriteBatch;
import org.iq80.leveldb.WriteOptions;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.NavigableMap;
import javax.annotation.Nullable;

/**
 * Provides common operations for LevelDB tables and queues.
*/ public class LevelDBTableCore { private static final Logger LOG = LoggerFactory.getLogger(LevelDBTableCore.class); private static final Scanner EMPTY_SCANNER = createEmptyScanner(); // this represents deleted values protected static final byte[] DELETE_MARKER = { }; // we use the empty column family for all data private static final byte[] DATA_COLFAM = { }; // we will never write this, but use it as an upper bound for scans private static final byte[] NEXT_COLFAM = { 0x00 }; // used for obtaining the next row/column for upper bound private static final byte[] ONE_ZERO = { 0x00 }; private static byte[] upperBound(byte[] column) { return Bytes.add(column, ONE_ZERO); } // empty immutable row's column->value map constant // Using ImmutableSortedMap instead of Maps.unmodifiableNavigableMap to avoid conflicts with // Hadoop, which uses an older version of guava without that method. static final NavigableMap<byte[], byte[]> EMPTY_ROW_MAP = ImmutableSortedMap.<byte[], byte[]>orderedBy(Bytes.BYTES_COMPARATOR).build(); private final String tableName; private final LevelDBTableService service; public LevelDBTableCore(String tableName, LevelDBTableService service) throws IOException { this.tableName = tableName; this.service = service; } private DB getDB() throws IOException { return service.getTable(tableName); } private WriteOptions getWriteOptions() { return service.getWriteOptions(); } public synchronized boolean swap(byte[] row, byte[] column, byte[] oldValue, byte[] newValue) throws IOException { byte[] existing = getRow(row, new byte[][] { column }, null, null, -1, null).get(column); // verify if (oldValue == null && existing != null) { return false; } if (oldValue != null && (existing == null || !Bytes.equals(oldValue, existing))) { return false; } // write if (newValue == null) { // to-do deleteColumn(row, column); } else { persist(Collections.singletonMap(row, Collections.singletonMap(column, newValue)), System.currentTimeMillis()); } return true; } public 
synchronized Map<byte[], Long> increment(byte[] row, Map<byte[], Long> increments) throws IOException { Map<byte[], Long> result = getResultMap(row, increments); Map<byte[], byte[]> replacing = Maps.newTreeMap(Bytes.BYTES_COMPARATOR); for (Map.Entry<byte[], Long> entry : result.entrySet()) { replacing.put(entry.getKey(), Bytes.toBytes(entry.getValue())); } persist(ImmutableMap.of(row, replacing), System.currentTimeMillis()); return result; } public synchronized void increment(NavigableMap<byte[], NavigableMap<byte[], Long>> updates) throws IOException { Map<byte[], Map<byte[], byte[]>> resultMap = Maps.newHashMap(); for (NavigableMap.Entry<byte[], NavigableMap<byte[], Long>> row : updates.entrySet()) { NavigableMap<byte[], Long> increments = row.getValue(); Map<byte[], byte[]> replacing = Maps.newTreeMap(Bytes.BYTES_COMPARATOR); Map<byte[], Long> result = getResultMap(row.getKey(), increments); for (Map.Entry<byte[], Long> entry : result.entrySet()) { replacing.put(entry.getKey(), Bytes.toBytes(entry.getValue())); } resultMap.put(row.getKey(), replacing); } persist(resultMap, System.currentTimeMillis()); } private Map<byte[], Long> getResultMap(byte[] row, Map<byte[], Long> increments) throws IOException { NavigableMap<byte[], byte[]> existing = getRow(row, increments.keySet().toArray(new byte[increments.size()][]), null, null, -1, null); Map<byte[], Long> result = Maps.newTreeMap(Bytes.BYTES_COMPARATOR); for (Map.Entry<byte[], Long> increment : increments.entrySet()) { long existingValue = 0L; byte[] existingBytes = existing.get(increment.getKey()); if (existingBytes != null) { if (existingBytes.length != Bytes.SIZEOF_LONG) { throw new NumberFormatException("Attempted to increment a value that is not convertible to long," + " row: " + Bytes.toStringBinary(row) + " column: " + Bytes.toStringBinary(increment.getKey())); } existingValue = Bytes.toLong(existingBytes); } long newValue = existingValue + increment.getValue(); result.put(increment.getKey(), newValue); } 
return result; } public void persist(Map<byte[], ? extends Map<byte[], byte[]>> changes, long version) throws IOException { DB db = getDB(); // todo support writing null when no transaction WriteBatch batch = db.createWriteBatch(); for (Map.Entry<byte[], ? extends Map<byte[], byte[]>> row : changes.entrySet()) { for (Map.Entry<byte[], byte[]> column : row.getValue().entrySet()) { byte[] key = createPutKey(row.getKey(), column.getKey(), version); batch.put(key, column.getValue() == null ? DELETE_MARKER : column.getValue()); } } db.write(batch, service.getWriteOptions()); } public void put(byte[] row, byte[] column, byte[] value, long version) throws IOException { getDB().put(createPutKey(row, column, version), value); } public void undo(Map<byte[], ? extends Map<byte[], ?>> persisted, long version) throws IOException { if (persisted.isEmpty()) { return; } DB db = getDB(); WriteBatch batch = db.createWriteBatch(); for (Map.Entry<byte[], ? extends Map<byte[], ?>> row : persisted.entrySet()) { for (Map.Entry<byte[], ?> column : row.getValue().entrySet()) { byte[] key = createPutKey(row.getKey(), column.getKey(), version); batch.delete(key); } } db.write(batch, service.getWriteOptions()); } public Scanner scan(byte[] startRow, byte[] stopRow, @Nullable FuzzyRowFilter filter, @Nullable byte[][] columns, @Nullable Transaction tx) throws IOException { if (columns != null) { if (columns.length == 0) { return EMPTY_SCANNER; } columns = Arrays.copyOf(columns, columns.length); Arrays.sort(columns, Bytes.BYTES_COMPARATOR); } DBIterator iterator = getDB().iterator(); seekToStart(iterator, startRow); byte[] endKey = stopRow == null ? 
null : createEndKey(stopRow); return new LevelDBScanner(iterator, endKey, filter, columns, tx); } /** * if columns are not null, then limit param is ignored and limit is columns.length */ public NavigableMap<byte[], byte[]> getRow(byte[] row, @Nullable byte[][] columns, byte[] startCol, byte[] stopCol, int limit, Transaction tx) throws IOException { if (columns != null) { if (columns.length == 0) { return EMPTY_ROW_MAP; } columns = Arrays.copyOf(columns, columns.length); Arrays.sort(columns, Bytes.BYTES_COMPARATOR); limit = columns.length; } byte[] startKey = createStartKey(row, columns == null ? startCol : columns[0]); byte[] endKey = createEndKey(row, columns == null ? stopCol : upperBound(columns[columns.length - 1])); try (DBIterator iterator = getDB().iterator()) { iterator.seek(startKey); return getRow(iterator, endKey, tx, false, columns, limit).getSecond(); } } private static Scanner createEmptyScanner() { return new Scanner() { @Override public Row next() { return null; } @Override public void close() { // no-op } }; } /** * Read one row of the table. This is used both by getRow() and by Scanner.next(). * @param iterator An iterator over the database. This is passed in such that the caller can reuse the same * iterator if scanning multiple rows. * @param endKey An upper bound for the (leveldb) keys to read. This method never reads past that key. * @param tx The transaction to use for visibility. * @param multiRow If true indicates that the row may end before the endKey. In that case, * this method will stop reading as soon as it sees more than one row key. The iterator will not be * advanced past the beginning of the next row (so that next time, we still see the entire next row). * @param columns If non-null, only columns contained in this will be returned. The given columns should be sorted. * @param limit If non-negative, at most this many columns will be returned. If multiRow is true, this is ignored. 
* @return a pair consisting of the row key of the next non-empty row and the column map for that row. If multiRow * is false, null is returned for row key because the caller already knows it. */ private static ImmutablePair<byte[], NavigableMap<byte[], byte[]>> getRow(DBIterator iterator, byte[] endKey, Transaction tx, boolean multiRow, byte[][] columns, int limit) throws IOException { byte[] rowBeingRead = null; byte[] previousRow = null; byte[] previousCol = null; NavigableMap<byte[], byte[]> map = Maps.newTreeMap(Bytes.BYTES_COMPARATOR); while (iterator.hasNext()) { Map.Entry<byte[], byte[]> entry = iterator.peekNext(); // if we have reached past the endKey, nothing was found, return null if (endKey != null && KeyValue.KEY_COMPARATOR.compare(entry.getKey(), endKey) >= 0) { break; } // if this is part of a multi-row scan and we reach the next row, stop without advancing iterator KeyValue kv = KeyValue.fromKey(entry.getKey()); if (multiRow) { byte[] rowKey = kv.getRow(); if (rowBeingRead != null) { if (Bytes.compareTo(rowKey, rowBeingRead) > 0) { break; } } } // it is safe to consume this entry, advance the iterator iterator.next(); // Determine if this KV is visible if (tx != null && !tx.isVisible(kv.getTimestamp())) { continue; } // have we seen this row & column before? byte[] row = kv.getRow(); byte[] column = kv.getQualifier(); boolean seenThisColumn = previousRow != null && Bytes.equals(previousRow, row) && previousCol != null && Bytes.equals(previousCol, column); if (seenThisColumn) { continue; } // remember that this is the last column we have seen previousRow = row; previousCol = column; // is it a column we want? if (columns == null || Arrays.binarySearch(columns, column, Bytes.BYTES_COMPARATOR) >= 0) { byte[] value = entry.getValue(); // only add to map if it is not a delete if (tx == null || !Bytes.equals(value, DELETE_MARKER)) { map.put(column, value); // first time we add a column. 
must remember the row key to know when to stop if (multiRow && rowBeingRead == null) { rowBeingRead = kv.getRow(); } if (limit > 0 && map.size() >= limit) { break; } } } } // note this will return null for the row being read if multiRow is false (because the caller knows the row) return new ImmutablePair<>(rowBeingRead, map); } public void deleteRows(byte[] prefix) throws IOException { Preconditions.checkNotNull(prefix, "prefix must not be null"); DB db = getDB(); WriteBatch batch = db.createWriteBatch(); try (DBIterator iterator = db.iterator()) { iterator.seek(createStartKey(prefix)); while (iterator.hasNext()) { Map.Entry<byte[], byte[]> entry = iterator.next(); if (!Bytes.startsWith(KeyValue.fromKey(entry.getKey()).getRow(), prefix)) { // iterator is past prefix break; } batch.delete(entry.getKey()); } db.write(batch); } } /** * Delete a list of rows from the table entirely, disregarding transactions. * @param toDelete the row keys to delete */ public void deleteRows(Collection<byte[]> toDelete) throws IOException { if (toDelete.isEmpty()) { return; } // find first row to delete and first entry in the DB to examine Iterator<byte[]> rows = toDelete.iterator(); byte[] currentRow = rows.next(); byte[] startKey = createStartKey(currentRow); DB db = getDB(); WriteBatch batch = db.createWriteBatch(); try (DBIterator iterator = db.iterator()) { iterator.seek(startKey); if (!iterator.hasNext()) { return; // nothing in the db to delete } Map.Entry<byte[], byte[]> entry = iterator.next(); // iterate over the database and the rows to delete, collecting (raw) keys to delete while (entry != null && currentRow != null) { KeyValue kv = KeyValue.fromKey(entry.getKey()); int comp = Bytes.compareTo(kv.getRow(), currentRow); if (comp == 0) { // same row -> delete batch.delete(entry.getKey()); entry = iterator.hasNext() ? iterator.next() : null; } else if (comp > 0) { // read past current row -> move to next row currentRow = rows.hasNext() ? 
rows.next() : null; } else if (comp < 0) { // iterator must seek to current row iterator.seek(createStartKey(currentRow)); entry = iterator.hasNext() ? iterator.next() : null; } } } // delete all the entries that were found db.write(batch, getWriteOptions()); } public void deleteRange(byte[] startRow, byte[] stopRow, @Nullable FuzzyRowFilter filter, @Nullable byte[][] columns) throws IOException { if (columns != null) { if (columns.length == 0) { return; } columns = Arrays.copyOf(columns, columns.length); Arrays.sort(columns, Bytes.BYTES_COMPARATOR); } DB db = getDB(); DBIterator iterator = db.iterator(); seekToStart(iterator, startRow); byte[] endKey = stopRow == null ? null : createEndKey(stopRow); Scanner scanner = new LevelDBScanner(iterator, endKey, filter, columns, null); DBIterator deleteIterator = db.iterator(); seekToStart(deleteIterator, startRow); final int deletesPerRound = 1024; // todo make configurable try { Row rowValues; WriteBatch batch = db.createWriteBatch(); int deletesInBatch = 0; // go through all matching cells and delete them in batches. while ((rowValues = scanner.next()) != null) { byte[] row = rowValues.getRow(); for (byte[] column : rowValues.getColumns().keySet()) { addToDeleteBatch(batch, deleteIterator, row, column); deletesInBatch++; // perform the deletes when we have built up a batch. if (deletesInBatch >= deletesPerRound) { // delete all the entries that were found db.write(batch, getWriteOptions()); batch = db.createWriteBatch(); deletesInBatch = 0; } } } // perform any outstanding deletes if (deletesInBatch > 0) { db.write(batch, getWriteOptions()); } } finally { scanner.close(); deleteIterator.close(); } } public void deleteColumn(byte[] row, byte[] column) throws IOException { DB db = getDB(); WriteBatch batch = db.createWriteBatch(); try (DBIterator iterator = db.iterator()) { addToDeleteBatch(batch, iterator, row, column); db.write(batch); } } /** * Helper to add deletes to a batch. 
The expected use case is for the caller to be iterating * through leveldb keys in sorted order, collecting key values to delete in batch. */ private void addToDeleteBatch(WriteBatch batch, DBIterator iterator, byte[] row, byte[] column) { byte[] endKey = createStartKey(row, Bytes.add(column, new byte[] { 0 })); iterator.seek(createStartKey(row, column)); while (iterator.hasNext()) { Map.Entry<byte[], byte[]> entry = iterator.next(); if (KeyValue.KEY_COMPARATOR.compare(entry.getKey(), endKey) >= 0) { // iterator is past column break; } batch.delete(entry.getKey()); } } private void seekToStart(DBIterator iterator, byte[] startRow) { try { if (startRow != null) { iterator.seek(createStartKey(startRow)); } else { iterator.seekToFirst(); } } catch (RuntimeException e) { try { iterator.close(); } catch (IOException ioe) { LOG.warn("Error closing LevelDB iterator", ioe); // but what else can we do? nothing... } throw e; } } /** * A scanner for a range of rows. */ private static class LevelDBScanner implements Scanner { private final Transaction tx; private byte[] endKey; private final DBIterator iterator; private final byte[][] columns; private final FuzzyRowFilter filter; public LevelDBScanner(DBIterator iterator, byte[] endKey, @Nullable FuzzyRowFilter filter, @Nullable byte[][] columns, @Nullable Transaction tx) { this.tx = tx; this.endKey = endKey; this.iterator = iterator; this.filter = filter; this.columns = columns; } @Override public Row next() { try { while (true) { ImmutablePair<byte[], NavigableMap<byte[], byte[]>> result = getRow(iterator, endKey, tx, true, columns, -1); if (result.getFirst() == null) { return null; } // apply row filter if any if (filter != null) { FuzzyRowFilter.ReturnCode code = filter.filterRow(result.getFirst()); switch (code) { case DONE: { return null; } case SEEK_NEXT_USING_HINT: { // row does not match but another one could. 
seek to next possible matching row and iterate byte[] seekToRow = filter.getNextRowHint(result.getFirst()); iterator.seek(createStartKey(seekToRow)); continue; } case INCLUDE: { break; } } } return new Result(result.getFirst(), result.getSecond()); } } catch (Exception e) { throw Throwables.propagate(e); } } @Override public void close() { try { iterator.close(); } catch (Exception e) { LOG.warn("Error closing LevelDB iterator", e); // but what else can we do? nothing. } } } // ------- helpers to create the keys for writes and scans ---------- private static byte[] createPutKey(byte[] rowKey, byte[] columnKey, long version) { return new KeyValue(rowKey, DATA_COLFAM, columnKey, version, KeyValue.Type.Put).getKey(); } private static byte[] createStartKey(byte[] row) { // the first possible key of a row return new KeyValue(row, DATA_COLFAM, null, KeyValue.LATEST_TIMESTAMP, KeyValue.Type.Maximum).getKey(); } private static byte[] createEndKey(byte[] row) { return createStartKey(row); // the first key of the stop is the first to be excluded } private static byte[] createStartKey(byte[] row, byte[] column) { return new KeyValue(row, DATA_COLFAM, column, KeyValue.LATEST_TIMESTAMP, KeyValue.Type.Maximum).getKey(); } private static byte[] createEndKey(byte[] row, byte[] column) { if (column != null) { // we have a stop column and can use that as an upper bound return new KeyValue(row, DATA_COLFAM, column, KeyValue.LATEST_TIMESTAMP, KeyValue.Type.Maximum).getKey(); } else { // no stop column - use next column family as upper bound return new KeyValue(row, NEXT_COLFAM, null, KeyValue.LATEST_TIMESTAMP, KeyValue.Type.Maximum).getKey(); } } }