package org.yamcs.yarch.rocksdb;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import org.rocksdb.ReadOptions;
import org.rocksdb.RocksIterator;
import org.rocksdb.Snapshot;
import org.yamcs.utils.ByteArrayUtils;
import org.yamcs.yarch.AbstractTableReaderStream;
import org.yamcs.yarch.ColumnDefinition;
import org.yamcs.yarch.ColumnSerializer;
import org.yamcs.yarch.DataType;
import org.yamcs.yarch.DbReaderStream;
import org.yamcs.yarch.IndexFilter;
import org.yamcs.yarch.Partition;
import org.yamcs.yarch.PartitioningSpec;
import org.yamcs.yarch.RawTuple;
import org.yamcs.yarch.TableDefinition;
import org.yamcs.yarch.YarchDatabase;
/**
 * Reader for tables with PartitionStorage.IN_KEY (the partition value is prepended to the key).
 * @author nm
 *
 */
public class InkeyTableReaderStream extends AbstractTableReaderStream implements Runnable, DbReaderStream {
static AtomicInteger count = new AtomicInteger(0);
final PartitioningSpec partitioningSpec;
final RdbPartitionManager partitionManager;
final TableDefinition tableDefinition;
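// number of records read (and emitted) so far, see getNumRecordsRead()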
private long numRecordsRead = 0;
// size in bytes of the serialized partition value (the prefix prepended to each key)
private final int partitionSize;
protected InkeyTableReaderStream(YarchDatabase ydb, TableDefinition tblDef, RdbPartitionManager partitionManager, boolean ascending, boolean follow) {
super(ydb, tblDef, partitionManager, ascending, follow);
this.tableDefinition = tblDef;
partitioningSpec = tblDef.getPartitioningSpec();
this.partitionManager = partitionManager;
DataType dt = partitioningSpec.getValueColumnType();
if(dt==null) {
throw new IllegalStateException("InkeyTableReaderStream can only be used when the table is partitioned by value");
}
this.partitionSize = ColumnValueSerializer.getSerializedSize(dt);
}
@Override
public void start() {
(new Thread(this, "InkeyRdbTableReader["+getName()+"]")).start();
}
/**
 * Reads the given partitions, emitting only the data that conforms to the start and end filters.
 * Returns true if the stop condition is met.
 *
 * All the partitions are from the same time interval and thus stored in one single RocksDB database.
 *
 */
@Override
protected boolean runPartitions(List<Partition> partitions, IndexFilter range) throws IOException {
byte[] rangeStart=null;
boolean strictStart=false;
byte[] rangeEnd=null;
boolean strictEnd=false;
if(range!=null) {
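// the range filter refers to the first column of the key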
ColumnDefinition cd = tableDefinition.getKeyDefinition().getColumn(0);
ColumnSerializer cs = tableDefinition.getColumnSerializer(cd.getName());
if(range.keyStart!=null) {
strictStart = range.strictStart;
rangeStart=cs.toByteArray(range.keyStart);
}
if(range.keyEnd!=null) {
strictEnd=range.strictEnd;
rangeEnd=cs.toByteArray(range.keyEnd);
}
}
return runValuePartitions(partitions, rangeStart, strictStart, rangeEnd, strictEnd);
}
/*
 * Runs value-based partitions: the partition value is encoded as the first bytes of the key, so we have to create
 * one iterator per partition and merge them in parallel.
 */
private boolean runValuePartitions(List<Partition> partitions, byte[] rangeStart, boolean strictStart, byte[] rangeEnd, boolean strictEnd) {
DbIterator iterator = null;
RDBFactory rdbf = RDBFactory.getInstance(ydb.getName());
RdbPartition p1 = (RdbPartition) partitions.get(0);
String dbDir = tableDefinition.getDataDir()+"/"+p1.dir;
log.debug("opening database {}", dbDir);
YRDB rdb;
try {
rdb = rdbf.getRdb(dbDir, p1.binaryValue.length, false);
} catch (IOException e) {
log.error("Failed to open database", e);
return false;
}
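// in follow mode use a tailing iterator so data written while iterating becomes visible;
// otherwise pin a snapshot to get a consistent view of the database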
ReadOptions readOptions = new ReadOptions();
readOptions.setTailing(follow);
Snapshot snapshot = null;
if(!follow) {
snapshot = rdb.getDb().getSnapshot();
readOptions.setSnapshot(snapshot);
}
try {
List<DbIterator> itList = new ArrayList<>(partitions.size());
// create an iterator for each partition
for(Partition p: partitions) {
p1 = (RdbPartition) p;
RocksIterator rocksIt = rdb.getDb().newIterator(readOptions);
DbIterator it = getPartitionIterator(rocksIt, p1.binaryValue, ascending, rangeStart, strictStart, rangeEnd, strictEnd);
if(it.isValid()) {
itList.add(it);
} else {
it.close();
}
}
if(itList.size()==0) {
return false;
} else if(itList.size()==1) {
iterator = itList.get(0);
} else {
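// multiple partitions: merge the per-partition iterators, ordering keys by their suffix
// (the part following the partition prefix)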
iterator = new MergingIterator(itList, ascending?new SuffixAscendingComparator(partitionSize):new SuffixDescendingComparator(partitionSize) );
}
if(ascending) {
return runAscending(iterator, rangeEnd, strictEnd);
} else {
return runDescending(iterator, rangeStart, strictStart);
}
} finally {
if(iterator!=null) {
iterator.close();
}
if(snapshot!=null) {
snapshot.close();
}
readOptions.close();
rdbf.dispose(rdb);
}
}
boolean runAscending(DbIterator iterator, byte[] rangeEnd, boolean strictEnd) {
while(!quit && iterator.isValid()){
byte[] dbKey = iterator.key();
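// strip the partition prefix so downstream consumers only see the table key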
byte[] key = Arrays.copyOfRange(dbKey, partitionSize, dbKey.length);
if(!emitIfNotPastStop(key, iterator.value(), rangeEnd, strictEnd)) {
return true;
}
numRecordsRead++;
iterator.next();
}
return false;
}
boolean runDescending(DbIterator iterator, byte[] rangeStart, boolean strictStart) {
while(!quit && iterator.isValid()){
byte[] dbKey = iterator.key();
byte[] key = Arrays.copyOfRange(dbKey, partitionSize, dbKey.length);
if(!emitIfNotPastStart(key, iterator.value(), rangeStart, strictStart)) {
return true;
}
numRecordsRead++;
iterator.prev();
}
return false;
}
/*
 * Creates a ranging iterator for the given partition.
 * TODO: check usage of RocksDB prefix iterators
 *
 */
private DbIterator getPartitionIterator(RocksIterator it, byte[] part, boolean ascending, byte[] rangeStart, boolean strictStart, byte[] rangeEnd, boolean strictEnd) {
byte[] dbKeyStart;
byte[] dbKeyEnd;
boolean dbStrictStart, dbStrictEnd;
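// prepend the partition value to the range bounds so the iterator stays within this partition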
if(rangeStart!=null) {
dbKeyStart = Arrays.copyOf(part, part.length+rangeStart.length);
System.arraycopy(rangeStart, 0, dbKeyStart, part.length, rangeStart.length);
dbStrictStart = strictStart;
} else {
dbKeyStart = part;
dbStrictStart = false;
}
if(rangeEnd!=null) {
dbKeyEnd = Arrays.copyOf(part, part.length+rangeEnd.length);
System.arraycopy(rangeEnd, 0, dbKeyEnd, part.length, rangeEnd.length);
dbStrictEnd = strictEnd;
} else {
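// no explicit end: stop strictly before the first key of the next partition (prefix+1)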
dbKeyEnd = ByteArrayUtils.plusOne(part);
dbStrictEnd = true;
}
if(ascending) {
return new AscendingRangeIterator(it, dbKeyStart, dbStrictStart, dbKeyEnd, dbStrictEnd);
} else {
return new DescendingRangeIterator(it, dbKeyStart, dbStrictStart, dbKeyEnd, dbStrictEnd);
}
}
public long getNumRecordsRead() {
return numRecordsRead;
}
class RdbRawTuple extends RawTuple {
// the index (passed to RawTuple) is used for sorting tuples with equal keys
RocksIterator iterator;
byte[] partition;
byte[] key;
byte[] value;
public RdbRawTuple(byte[] partition, byte[] key, byte[] value, RocksIterator iterator, int index) {
super(index);
this.partition = partition;
this.key = key;
this.value = value;
this.iterator = iterator;
}
@Override
protected byte[] getKey() {
return key;
}
@Override
protected byte[] getValue() {
return value;
}
}
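/**
 * Compares keys by their suffix (the bytes following the fixed-size partition prefix) first,
 * then by the prefix, in ascending order; used when merging per-partition iterators.
 */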
static class SuffixAscendingComparator implements Comparator<byte[]> {
int prefixSize;
public SuffixAscendingComparator(int prefixSize) {
this.prefixSize = prefixSize;
}
@Override
public int compare(byte[] b1, byte[] b2) {
int minLength = Math.min(b1.length, b2.length);
for (int i = prefixSize; i < minLength; i++) {
int d=(b1[i]&0xFF)-(b2[i]&0xFF);
if(d!=0){
return d;
}
}
for (int i = 0; i < prefixSize; i++) {
int d=(b1[i]&0xFF)-(b2[i]&0xFF);
if(d!=0){
return d;
}
}
return b1.length - b2.length;
}
}
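/**
 * Same as {@link SuffixAscendingComparator} but in descending order.
 */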
static class SuffixDescendingComparator implements Comparator<byte[]> {
int prefixSize;
public SuffixDescendingComparator(int prefixSize) {
this.prefixSize = prefixSize;
}
@Override
public int compare(byte[] b1, byte[] b2) {
int minLength = Math.min(b1.length, b2.length);
for (int i = prefixSize; i < minLength; i++) {
int d=(b2[i]&0xFF)-(b1[i]&0xFF);
if(d!=0){
return d;
}
}
for (int i = 0; i < prefixSize; i++) {
int d=(b2[i]&0xFF)-(b1[i]&0xFF);
if(d!=0){
return d;
}
}
return b2.length - b1.length;
}
}
}