package org.yamcs.yarch.rocksdb;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.concurrent.atomic.AtomicInteger;

import org.rocksdb.ColumnFamilyHandle;
import org.rocksdb.RocksDBException;
import org.rocksdb.RocksIterator;
import org.yamcs.yarch.AbstractTableReaderStream;
import org.yamcs.yarch.ColumnDefinition;
import org.yamcs.yarch.ColumnSerializer;
import org.yamcs.yarch.DbReaderStream;
import org.yamcs.yarch.IndexFilter;
import org.yamcs.yarch.Partition;
import org.yamcs.yarch.PartitioningSpec;
import org.yamcs.yarch.RawTuple;
import org.yamcs.yarch.TableDefinition;
import org.yamcs.yarch.YarchDatabase;

/**
 * Reader for tables where each partition is a different column family.
 *
 * Also works for the case when there is no partitioning by value.
 *
 * @author nm
 */
public class CfTableReaderStream extends AbstractTableReaderStream implements Runnable, DbReaderStream {
    static AtomicInteger count = new AtomicInteger(0);
    final PartitioningSpec partitioningSpec;
    final RdbPartitionManager partitionManager;
    final TableDefinition tableDefinition;
    private long numRecordsRead = 0;

    protected CfTableReaderStream(YarchDatabase ydb, TableDefinition tblDef, RdbPartitionManager partitionManager,
            boolean ascending, boolean follow) {
        super(ydb, tblDef, partitionManager, ascending, follow);
        this.tableDefinition = tblDef;
        partitioningSpec = tblDef.getPartitioningSpec();
        this.partitionManager = partitionManager;
    }

    @Override
    public void start() {
        (new Thread(this, "CfTableReader[" + getName() + "]")).start();
    }

    /**
     * Reads the given partitions, emitting only the data that conforms to the start and end filters.
     * Returns true if the stop condition is met.
     *
     * All the partitions are from the same time interval and thus from one single RocksDB database.
     */
    @Override
    protected boolean runPartitions(List<Partition> partitions, IndexFilter range) throws IOException {
        byte[] rangeStart = null;
        boolean strictStart = false;
        byte[] rangeEnd = null;
        boolean strictEnd = false;

        if (range != null) {
            ColumnDefinition cd = tableDefinition.getKeyDefinition().getColumn(0);
            ColumnSerializer cs = tableDefinition.getColumnSerializer(cd.getName());
            if (range.keyStart != null) {
                strictStart = range.strictStart;
                rangeStart = cs.toByteArray(range.keyStart);
            }
            if (range.keyEnd != null) {
                strictEnd = range.strictEnd;
                rangeEnd = cs.toByteArray(range.keyEnd);
            }
        }
        try {
            if (ascending) {
                return readAscending(partitions, rangeStart, strictStart, rangeEnd, strictEnd);
            } else {
                return readDescending(partitions, rangeStart, strictStart, rangeEnd, strictEnd);
            }
        } catch (RocksDBException e) {
            throw new IOException(e);
        }
    }

    private boolean readAscending(List<Partition> partitions, byte[] rangeStart, boolean strictStart, byte[] rangeEnd,
            boolean strictEnd) throws IOException, RocksDBException {
        PriorityQueue<RdbRawTuple> orderedQueue = new PriorityQueue<RdbRawTuple>();
        RDBFactory rdbFactory = RDBFactory.getInstance(ydb.getName());
        YRDB rdb = null;
        try {
            RdbPartition p1 = (RdbPartition) partitions.iterator().next();
            String dbDir = p1.dir;
            log.debug("opening database {}", dbDir);
            rdb = rdbFactory.getRdb(tableDefinition.getDataDir() + "/" + p1.dir, false);
            List<ColumnFamilyHandle> cfhList = new ArrayList<ColumnFamilyHandle>();
            for (Partition p : partitions) {
                ColumnFamilyHandle cfh = rdb.getColumnFamilyHandle(((RdbPartition) p).binaryValue);
                if (cfh != null) {
                    cfhList.add(cfh);
                }
            }
            // create a cursor for all partitions
            List<RocksIterator> iteratorList = rdb.newIterators(cfhList, follow);
            int i = 0;
            for (RocksIterator it : iteratorList) {
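                // Position this partition's iterator at the first key that satisfies the start filter:
                // seek to rangeStart (skipping an exact match when the filter is strict), or to the very
                // first record when no start filter is given. Each successfully positioned iterator then
                // contributes its current tuple to the priority queue, so the merge loop below always
                // emits the globally smallest key across all column families.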
                boolean found = true;
                if (rangeStart != null) {
                    it.seek(rangeStart);
                    if (it.isValid()) {
                        if ((strictStart) && (compare(rangeStart, it.key()) == 0)) {
                            // if the filter condition is ">" we skip the first record if it is equal to the key
                            it.next();
                            found = it.isValid();
                        }
                    } else {
                        found = false;
                    }
                    if (!found) {
                        log.debug("no record corresponding to the StartFilter");
                    }
                } else {
                    it.seekToFirst();
                    if (!it.isValid()) {
                        log.debug("rdb contains no record");
                        found = false;
                    }
                }
                if (!found) {
                    it.close();
                } else {
                    numRecordsRead++;
                    orderedQueue.add(new RdbRawTuple(it.key(), it.value(), it, i++));
                }
            }
            log.debug("got one tuple from each partition, starting the merge");
            // keep publishing the smallest element from the priority queue until it becomes empty
            while ((!quit) && orderedQueue.size() > 0) {
                RdbRawTuple rt = orderedQueue.poll();
                if (!emitIfNotPastStop(rt.key, rt.value, rangeEnd, strictEnd)) {
                    return true;
                }
                rt.iterator.next();
                if (rt.iterator.isValid()) {
                    numRecordsRead++;
                    rt.key = rt.iterator.key();
                    rt.value = rt.iterator.value();
                    orderedQueue.add(rt);
                } else {
                    log.debug("{} finished", rt.iterator);
                    rt.iterator.close();
                }
            }
            return false;
        } finally {
            for (RdbRawTuple rt : orderedQueue) {
                rt.iterator.close();
            }
            if (rdb != null) {
                rdbFactory.dispose(rdb);
            }
        }
    }

    private boolean readDescending(List<Partition> partitions, byte[] rangeStart, boolean strictStart, byte[] rangeEnd,
            boolean strictEnd) throws IOException, RocksDBException {
        PriorityQueue<RdbRawTuple> orderedQueue = new PriorityQueue<RdbRawTuple>(RawTuple.reverseComparator);
        RDBFactory rdbFactory = RDBFactory.getInstance(ydb.getName());
        YRDB rdb = null;
        try {
            RdbPartition p1 = (RdbPartition) partitions.get(0);
            String dbDir = p1.dir;
            log.debug("opening database {}", dbDir);
            rdb = rdbFactory.getRdb(tableDefinition.getDataDir() + "/" + p1.dir, false);
            List<ColumnFamilyHandle> cfhList = new ArrayList<>();
            for (Partition p : partitions) {
                ColumnFamilyHandle cfh = rdb.getColumnFamilyHandle(((RdbPartition) p).binaryValue);
                if (cfh != null) {
                    cfhList.add(cfh);
                }
            }
            // create a cursor for all partitions
            List<RocksIterator> iteratorList = rdb.newIterators(cfhList, false);
            int i = 0;
            for (RocksIterator it : iteratorList) {
                boolean found = true;
                if (rangeEnd != null) {
                    // seek moves the cursor at or beyond the match
                    it.seek(rangeEnd);
                    boolean verify = false;
                    if (it.isValid()) {
                        if ((strictEnd) || (compare(rangeEnd, it.key()) != 0)) {
                            it.prev();
                            verify = true;
                        }
                    } else {
                        // at the end of the iterator, check the last entry
                        it.seekToLast();
                        verify = true;
                    }
                    if (verify && it.isValid()) {
                        int c = compare(it.key(), rangeEnd);
                        if (c > 0) { // don't care about non-strict, covered before
                            it.seek(rangeEnd);
                        }
                    }
                    if (it.isValid()) {
                        if ((strictEnd) && (compare(rangeEnd, it.key()) == 0)) {
                            // if the filter condition is "<" we skip the first record if it is equal to the key
                            it.prev();
                            found = it.isValid();
                        }
                    } else {
                        found = false;
                    }
                    if (!found) {
                        log.debug("no record corresponding to the EndFilter");
                    }
                } else {
                    it.seekToLast();
                    if (!it.isValid()) {
                        log.debug("rdb contains no record");
                        found = false;
                    }
                }
                if (!found) {
                    it.close();
                } else {
                    orderedQueue.add(new RdbRawTuple(it.key(), it.value(), it, i++));
                }
            }
            log.debug("got one tuple from each partition, starting the merge");
            // keep publishing the largest element from the priority queue until it becomes empty
            while ((!quit) && orderedQueue.size() > 0) {
                RdbRawTuple rt = orderedQueue.poll();
                if (!emitIfNotPastStart(rt.key, rt.value, rangeStart, strictStart)) {
                    return true;
                }
                rt.iterator.prev();
                if (rt.iterator.isValid()) {
                    rt.key = rt.iterator.key();
                    rt.value = rt.iterator.value();
                    orderedQueue.add(rt);
                } else {
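                    // this iterator has stepped before the first record of its column family:
                    // the partition is exhausted, so close it and drop it from the merge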
log.debug("{} finished", rt.iterator); rt.iterator.close(); } } return false; } finally { for(RdbRawTuple rt:orderedQueue) { rt.iterator.close(); } if(rdb!=null) { rdbFactory.dispose(rdb); } } } public long getNumRecordsRead() { return numRecordsRead; } class RdbRawTuple extends RawTuple { int index;//used for sorting tuples with equals keys RocksIterator iterator; byte[] key; byte[] value; public RdbRawTuple(byte[] key, byte[] value, RocksIterator iterator, int index) { super(index); this.iterator = iterator; this.key = key; this.value = value; } @Override protected byte[] getKey() { return key; } @Override protected byte[] getValue() { return value; } } } /* * simple case when there is no value partitioning */ /* to replace maybe the above private boolean runSimplePartition(RdbPartition partition, byte[] rangeStart, boolean strictStart, byte[] rangeEnd, boolean strictEnd) { DbIterator iterator = null; RDBFactory rdbf = RDBFactory.getInstance(ydb.getName()); String dbDir = partition.dir; log.debug("opening database "+ dbDir); YRDB rdb; try { rdb = rdbf.getRdb(tableDefinition.getDataDir()+"/"+partition.dir, false); } catch (IOException e) { log.error("Failed to open database", e); return false; } ReadOptions readOptions = new ReadOptions(); readOptions.setTailing(follow); Snapshot snapshot = null; if(!follow) { snapshot = rdb.getDb().getSnapshot(); readOptions.setSnapshot(snapshot); } try { RocksIterator it = rdb.getDb().newIterator(readOptions); if(ascending) { iterator = new AscendingRangeIterator(it, rangeStart, strictStart, rangeEnd, strictEnd); while(!quit && iterator.isValid()){ if(!emitIfNotPastStop(iterator.key(), iterator.value(), rangeEnd, strictEnd)) { return true; } iterator.next(); } return false; } else { iterator = new DescendingRangeIterator(it, rangeStart, strictStart, rangeEnd, strictEnd); while(!quit && iterator.isValid()){ if(!emitIfNotPastStart(iterator.key(), iterator.value(), rangeStart, strictStart)) { return true; } iterator.prev(); } return false; } } finally { if(iterator!=null) iterator.close(); if(snapshot!=null) snapshot.close(); readOptions.close(); rdbf.dispose(rdb); } } */