/**
 * Copyright 2013 Oak Ridge National Laboratory
 * Author: James Horey <horeyjl@ornl.gov>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
package gov.ornl.keva.node;

/**
 * Java libs.
 **/
import java.util.Map;
import java.util.HashMap;
import java.util.NavigableMap;
import java.util.TreeMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Comparator;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * SEDA libs.
 **/
import gov.ornl.seda.SEDAFuture;

/**
 * Configuration libs.
 **/
import gov.ornl.config.ConfigFactory;
import gov.ornl.config.Configuration;
import gov.ornl.config.ConfigEntry;

/**
 * Keva libs.
 **/
import gov.ornl.keva.sstable.SSTable;
import gov.ornl.keva.mem.MemTable;
import gov.ornl.keva.mem.MemTableAllocator;
import gov.ornl.keva.table.TableValueFactory;
import gov.ornl.keva.table.TableKey;
import gov.ornl.keva.table.TableValue;
import gov.ornl.keva.core.KevaDBException;
import gov.ornl.keva.core.VectorClock;
import gov.ornl.keva.core.PruneOptions;
import gov.ornl.keva.core.ReadOptions;
import gov.ornl.keva.core.WriteOptions;
import gov.ornl.keva.core.OpenOptions;
import gov.ornl.keva.core.StreamIterator;
import gov.ornl.keva.core.TreeUnionIterator;
import gov.ornl.keva.loader.JobLoader;

/**
 * KevaDB is the primary mechanism to interact with databases.
 * Each database is represented and mediated by a KevaDB instance, and
 * has a simple API to interact with the underlying data. The data model
 * used by KevaDB is based on a history of values defined by vector clocks.
 * This means that values may persist for a long time (even if users write to
 * the same key), and that values may fork (depending on the vector clock
 * assigned to the value).
 *
 * @author James Horey
 */
public class KevaDB {
    /**
     * Identify this database.
     **/
    private String db;

    /**
     * Client ID clock to use for system operations (like delete).
     **/
    private volatile int systemClock = 0;

    /**
     * The pruning options are used to filter results
     * while reading. They also serve as a way to remove
     * unnecessary items from the sstables.
     **/
    private PruneOptions pruneOptions;

    /**
     * Used to sort independent values.
     **/
    private Comparator<TableValue> comparator;

    /**
     * Memtable is where all the data actually resides.
     **/
    private MemTable table;
    private MemTableAllocator memAllocator;
    private long memTableFlushSize;

    /**
     * Durable storage.
     **/
    private SSTableService diskService;
    private WriteAheadLog wal;

    /**
     * Configuration information.
     **/
    private String dataPath;
    private String logPath;
    private String configFile;
    private ConfigFactory configFactory;

    /**
     * This is not a public constructor. To instantiate
     * KevaDB objects, use the factory.
     **/
    protected KevaDB(String db, String configFile) {
        this.db = db;

        // These are where the configuration files live.
        dataPath = null;
        logPath = null;

        // Set up default pruning options.
        // This is overridden by the config options.
        pruneOptions = new PruneOptions();

        // Default comparator sorts values by wall time.
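        // (Independent values within a branch are kept in the order defined by
        // this comparator; a custom Comparator<TableValue> may be supplied via
        // the "keva.sort" configuration entry, which is handled in setStorage().)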
        comparator = new Comparator<TableValue>() {
            public int compare(TableValue v1, TableValue v2) {
                // Compare the wall-clock times without risking integer overflow.
                return Long.compare(v1.getClock().getLocalTime(),
                                    v2.getClock().getLocalTime());
            }
        };

        // Set up the SSTable.
        diskService = SSTableService.newInstance();

        this.configFile = configFile;
        configFactory = new ConfigFactory();
        loadConfig(configFile); // Load configuration.

        // Set up the MemTable.
        memAllocator = new MemTableAllocator();
        table = memAllocator.newMemTable(memTableFlushSize, comparator);
        table.setPruneOptions(pruneOptions);
    }

    /**
     * Start the database. Used by the factory to finish
     * initialization of the DB.
     */
    protected void start() {
        // Configure the write-ahead log.
        wal = new WriteAheadLog(this, configFile);
        wal.createLog();
    }

    /**
     * Load up all the configuration files.
     **/
    private void loadConfig(String c) {
        Configuration conf;

        if(c != null) {
            Path p = Paths.get(c);
            conf = configFactory.getConfig(p.toAbsolutePath().toString());

            if(conf != null) {
                setStorage(conf); // Set the storage directories.
            }
        }
    }

    /**
     * Set the various storage directories.
     **/
    @SuppressWarnings("unchecked")
    private void setStorage(Configuration conf) {
        ConfigEntry entry = null;

        entry = conf.get("keva.data.dir");
        if(entry != null) {
            // Try to use the values set.
            List<String> vv = entry.getEntry("value").getValues();
            if(vv.size() > 0) {
                dataPath = vv.get(0).trim() +
                    System.getProperty("file.separator") + db;
            }
        }

        // Set the WAL directory.
        entry = conf.get("keva.wal.dir");
        if(entry != null) {
            List<String> vv = entry.getEntry("value").getValues();
            if(vv.size() > 0) {
                logPath = vv.get(0).trim() +
                    System.getProperty("file.separator") + db;
            }
        }

        // The sstable implementation.
        entry = conf.get("keva.sstable.impl");
        if(entry != null) {
            List<String> vv = entry.getEntry("value").getValues();
            for(String v : vv) {
                String[] s = v.split(":");

                if(s.length == 1) {
                    diskService.addDB(this, s[0], null);
                }
                else {
                    diskService.addDB(this, s[0], s[1]);
                }
            }
        }

        // Set up the sorting.
        entry = conf.get("keva.sort");
        if(entry != null) {
            List<String> vv = entry.getEntry("value").getValues();

            if(vv.size() == 1) {
                // Just the name of the class. Assume that we can find
                // the class using the system loader.
                String clazz = vv.get(0).trim();

                Object obj = JobLoader.load(clazz, null, null);
                if(obj != null && obj instanceof Comparator) {
                    comparator = (Comparator<TableValue>)obj;
                }
            }
            else if(vv.size() == 2) {
                // Name of the comparator jar & class.
                String jar = vv.get(0).trim();
                String clazz = vv.get(1).trim();

                Path p = Paths.get(jar);
                Object obj = JobLoader.load(clazz, p.toAbsolutePath().toString(), null);
                if(obj != null && obj instanceof Comparator) {
                    comparator = (Comparator<TableValue>)obj;
                }
            }
        }

        // Should we prune the deleted items from the history?
        entry = conf.get("keva.prune.delete");
        if(entry != null) {
            List<String> vv = entry.getEntry("value").getValues();
            if(vv.size() > 0) {
                pruneOptions.delete = Boolean.parseBoolean(vv.get(0).trim());
            }
        }

        // Should we prune older values?
        entry = conf.get("keva.prune.history");
        if(entry != null) {
            List<String> vv = entry.getEntry("value").getValues();
            if(vv.size() > 0) {
                pruneOptions.newest = Integer.parseInt(vv.get(0).trim());
            }
        }

        // How large the memtable can grow before being flushed.
        memTableFlushSize = MemTable.RECOMMENDED_THRESHOLD;
        entry = conf.get("keva.memtable.threshold");
        if(entry != null) {
            List<String> vv = entry.getEntry("value").getValues();
            if(vv.size() > 0) {
                memTableFlushSize = Integer.parseInt(vv.get(0).trim());
            }
        }
    }

    /**
     * Set the sstable implementation class.
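     * <p>
     * For example (the class and jar names below are purely illustrative):
     * <pre>{@code
     * db.setSSTableImplementation("com.example.MySSTable", "/path/to/my-sstable.jar");
     * }</pre>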
     *
     * @param className Name of the sstable class
     * @param jar Name of the jar file containing the implementation (optional)
     */
    public void setSSTableImplementation(String className, String jar) {
        diskService.addDB(this, className, jar);
    }

    /**
     * ID that uniquely identifies this database.
     *
     * @return The string representation of the ID.
     */
    public String getID() {
        return db;
    }

    /**
     * Define the path where the sstables are stored. This value is also
     * defined by the configuration parameter "keva.data.dir".
     *
     * @param path The path where the data is stored.
     */
    public void setDataPath(String path) {
        dataPath = path;
    }

    /**
     * Return the path where the sstables are stored.
     *
     * @return The path where the data is stored.
     */
    public String getDataPath() {
        return dataPath;
    }

    /**
     * Define the log directory. The log directory is where all the WAL logs live.
     *
     * @param path The log directory path.
     */
    public void setLogPath(String path) {
        logPath = path;
    }

    /**
     * Return the log directory path.
     *
     * @return The log path in string representation.
     */
    public String getLogPath() {
        return logPath;
    }

    /**
     * Get the SSTable service.
     **/
    protected SSTableService getDiskService() {
        return diskService;
    }

    /**
     * Format the database by clearing the durable sstables and the WAL.
     **/
    protected void format() {
        diskService.format(this);

        if(wal != null) {
            wal.clear();
        }
    }

    /**
     * Close a database.
     */
    public void close() throws KevaDBException {
        // Set the memtable to null so that nothing
        // else can write to it.
        MemTable oldTable = table;
        table = null;

        // We need to wait for any existing writes to finish.
        // Easiest way to do this is just try locking each key.
        for(Iterator<TableKey> keys = oldTable.getKeys(); keys.hasNext(); ) {
            TableKey key = keys.next();

            oldTable.lock(key);
            oldTable.unlock(key);
        }

        // Flush the table to disk.
        if(oldTable.getNumKeys() > 0) {
            diskService.flush(this, oldTable);
        }

        // Probably a bug, since we don't know when the table is actually free!
        memAllocator.freeMemTable(oldTable);

        // Now get rid of old entries in the WAL, including
        // any in-memory buffers.
        wal.clear();
    }

    /**
     * Recover the database from logs.
     */
    public void recover() throws KevaDBException {
        // Disable the current WAL so that we don't
        // record the playback.
        WriteAheadLog temp = wal;
        wal = null;

        // Replay the old WAL.
        WriteAheadLog oldLog = new WriteAheadLog(this, configFile);
        oldLog.replay();

        // Now set up our wal again.
        wal = temp;
    }

    /**
     * Force all the sstables in the first level to be merged.
     */
    public void forceMerge() {
        SEDAFuture future = diskService.forceMerge(this);
        future.get();
    }

    /**
     * Flush the current memtable to disk and make an sstable.
     * This normally happens when the memtable grows too large, but
     * the user can force the issue if necessary.
     */
    public void flush() {
        MemTable oldTable = table;

        // Do not flush an empty table.
        if(oldTable.getNumKeys() > 0) {
            // Replace with a new memtable and carry over the
            // pruning options (as in the constructor).
            table = memAllocator.newMemTable(memTableFlushSize, comparator);
            table.setPruneOptions(pruneOptions);

            // Wait for all the writers to be complete
            // on the old table.
            oldTable.flush();

            // Flush the table to disk.
            SEDAFuture future = diskService.flush(this, oldTable);

            // Wait for the job to complete, before freeing
            // the old memtable.
            future.get();
            memAllocator.freeMemTable(oldTable);

            // Now get rid of old entries in the WAL.
            if(wal != null) {
                diskService.lockForRead();
                wal.recycle(System.currentTimeMillis());
                diskService.unlockForRead();
            }

            // Finally check if we need to merge any of the
            // tables in any level.
            diskService.mergeIfNecessary(this);
        }
    }

    /**
     * Commit a tentative value to memory.
     * A tentative value is a value that is already in the memtable, but is
     * not visible. Since the value isn't visible it won't be flushed
     * to an sstable until the value is committed. This is useful
     * when we need to atomically commit multiple values.
     *
     * @param key The key of the value to commit.
     * @param value The value to commit. We actually just need the vector clock.
     * @param options Write options associated with this value.
     */
    public void commit(final TableKey key,
                       final TableValue value,
                       final WriteOptions options) {
        table.lock(key);

        if(options != null) {
            table.commit(key, value, options.branch);
        }
        else {
            table.commit(key, value, null);
        }

        table.unlock(key);
    }

    /**
     * Delete the value from the database. This does not actually
     * remove the value from the database, but simply marks it for deletion.
     * The user must define a pruning option to actually get rid of the value.
     * Normally the vector clock associated with a value is sufficient to
     * identify the client, but since the delete operation does not have a
     * value parameter, an internal system clock supplies the client ID.
     *
     * @param key The key of the value to delete.
     */
    public void delete(final TableKey key) {
        // We must construct a new "delete" table value, and then
        // place this value along every single branch. In order
        // to implement this properly, we must first get the collapsed
        // values on every branch, and then perform a write on each
        // branch with the right vector clock. To do this atomically, we
        // must lock this specific key.
        table.lock(key);

        // Get all the value histories.
        Map<String,StreamIterator<TableValue>> memValues = table.getCollapsed(key);
        for(String branch : memValues.keySet()) {
            // Create a new "delete" value. The delete value will
            // also need a vector clock that is new enough.
            TableValue delete = TableValueFactory.newValue(TableValue.DELETE);
            delete.setClock(new VectorClock("sys".getBytes(), systemClock++));

            // Log the write into the WAL.
            if(wal != null) {
                wal.put(key, delete, null);
            }

            // Place the delete operation into the memtable.
            table.put(key, delete, branch, false);
        }
        table.unlock(key);

        // Check if we need to flush.
        if(table.shouldFlush()) {
            flush();
        }
    }

    /**
     * Place a new value into the database. This method assumes that the
     * client has defined the value vector clock already, and does not
     * have any specific writing options.
     *
     * @param key The key of the value.
     * @param value The value to place into the database.
     */
    public void put(final TableKey key, final TableValue value) {
        put(key, value, null);
    }

    /**
     * Place a new value into the database.
     *
     * @param key The key of the value.
     * @param value The value to place into the database.
     * @param options Write options that define how the
     * value is written to the memtable.
     */
    public void put(final TableKey key,
                    final TableValue value,
                    final WriteOptions options) {
        // Log the write into the WAL.
        if(wal != null) {
            wal.put(key, value, options);
        }

        if(options != null) {
            // Check if we need to insert into a specific branch.
            if(options.branch != null) {
                table.put(key, value, options.branch, options.tentative);
            }
            else {
                table.put(key, value, options.tentative);
            }
        }
        else {
            table.put(key, value, false);
        }

        // Check if we need to flush.
        if(table.shouldFlush()) {
            flush();
        }
    }

    /**
     * Apply multiple writes atomically.
     *
     * @param ops The batch write operations.
     */
    public boolean put(final WriteBatch ops) {
        // Lock all the keys associated with this batch.
        // This makes sure that we do not insert other
        // items while inserting the batch.
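        // The batch is applied in three passes: (1) create and lock every key,
        // (2) write each value with the lock-free commit() call, and
        // (3) unlock every key.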
        Iterator<TableKey> iter = ops.iterator();
        while(iter.hasNext()) {
            TableKey key = iter.next();

            // We might have to create the bucket before
            // locking it (otherwise it causes a lock error).
            table.create(key);
            table.lock(key);
        }

        // Perform all the actual writes. The memtable has a special
        // "commit" method that is similar to "put" except that it doesn't
        // perform any special locking.
        iter = ops.iterator();
        while(iter.hasNext()) {
            TableKey key = iter.next();

            for(WriteBatch.TableWrite write : ops.getValues(key)) {
                if(write.options == null) {
                    table.commit(key, write.value, null);
                }
                else {
                    table.commit(key, write.value, write.options.branch);
                }
            }
        }

        // We are all done so unlock all the keys.
        iter = ops.iterator();
        while(iter.hasNext()) {
            TableKey key = iter.next();
            table.unlock(key);
        }

        return true;
    }

    /**
     * Help retrieve the latest data.
     */
    private NavigableMap<String, StreamIterator<TableValue>> getHelper(final TableKey key,
                                                                       final ReadOptions options) {
        Map<String,StreamIterator<TableValue>> memValues = null;
        List<Map<String,StreamIterator<TableValue>>> ssValues;

        // Check if there are any valid options.
        if(options == null ||
           (options.branch == null && options.time == -1)) {
            // Get from all the sstables.
            ssValues = getFromSSTable(key, null, -1);

            // Get all the independent values associated with this key.
            memValues = table.getCollapsed(key);
        }
        else {
            if(options.branch != null) {
                // Get only the value associated with the branch.
                // There is only one value in this iterator.
                ssValues = getFromSSTable(key, options, 0);
                memValues = table.getCollapsed(key, options.branch);
            }
            else {
                // Then find all the values associated with that wall time.
                ssValues = getFromSSTable(key, null, options.time);
                memValues = table.getCollapsed(key, options.time);
            }
        }

        // Collect all the iterators.
        if(memValues != null) {
            ssValues.add(memValues);
        }

        // Now merge the independent branches.
        return SSTableService.collateBranches(ssValues, true, comparator);
    }

    /**
     * Get the latest independent values.
     *
     * @param key The key identifying the value.
     * @return An iterator over the latest independent values.
     */
    public Map<String, StreamIterator<TableValue>> get(final TableKey key) {
        return getHelper(key, null);
    }

    /**
     * Read the latest value along a specific branch.
     *
     * @param key The key identifying the value.
     * @param branch The branch to read from.
     * @return An iterator over the latest values on that branch,
     * or null if the key does not exist.
     **/
    public StreamIterator<TableValue> get(final TableKey key, final String branch) {
        Map<String,StreamIterator<TableValue>> values = null;

        // First try the memtable. If it is found here, then
        // we can stop searching since the memtable always has
        // the latest value.
        values = table.getCollapsed(key, branch);
        if(values != null) {
            return values.get(branch);
        }

        // Now search for the data in the sstables. However
        // we should search in level order.
        values = getLatestByLevel(key, branch);
        if(values != null) {
            return values.get(branch);
        }

        // Couldn't find it in the memtable or sstables. That
        // means it doesn't exist!
        return null;
    }

    /**
     * Read the latest independent values, filtered by wall time.
     *
     * @param key The key identifying the value.
     * @param time The wall time used to select values.
     * @return An iterator over the latest independent values.
     **/
    public Map<String, StreamIterator<TableValue>> get(final TableKey key, final long time) {
        // Create a new read option.
        ReadOptions options = new ReadOptions();
        options.time = time;

        return getHelper(key, options);
    }

    /**
     * Read the values associated with the list of keys. If the
     * user supplies read options, then we use those options to
     * synchronize the reads. Otherwise, we will use the latest values.
     *
     * @param keys List of keys identifying the values.
     * @param options Read options that specify clock constraints, ordering, etc.
     * @return A map associating the latest independent values with each key.
     */
    public Map<TableKey, Map<String, StreamIterator<TableValue>>> get(final List<TableKey> keys,
                                                                      final ReadOptions options) {
        Map<TableKey, Map<String, StreamIterator<TableValue>>> iters = new HashMap<>();

        // Apply the read option in the following manner:
        //
        // (0) If there are no options, then just run over the latest
        //     values from the keys.
        // (1) If the user has specified a branch, then we apply
        //     that branch to every key.
        // (2) If the user has specified a wall time, then we apply
        //     that time to every key.
        if(options == null) {
            for(TableKey k : keys) {
                iters.put(k, getHelper(k, null));
            }
        }
        else if(options.branch != null) {
            for(TableKey k : keys) {
                iters.put(k, getHelper(k, options));
            }
        }
        else if(options.time != -1) {
            for(TableKey k : keys) {
                iters.put(k, getHelper(k, options));
            }
        }

        return iters;
    }

    /**
     * Help retrieve historical values.
     *
     * @param key Table key
     * @param branch The branch whose history is requested, or null for all branches
     */
    private NavigableMap<String, StreamIterator<TableValue>> getHistoryHelper(final TableKey key,
                                                                              final String branch) {
        List<Map<String,StreamIterator<TableValue>>> values = new ArrayList<>();

        // Get the data from the memtable.
        Map<String,StreamIterator<TableValue>> value = null;
        if(branch != null) {
            value = table.getUncollapsed(key, branch);
        }
        else {
            value = table.getAll(key);
        }

        if(value != null) {
            values.add(value);
        }

        // Get the data from the sstables.
        diskService.lockForRead();
        Map<String, Integer> tables =
            diskService.getDataManifests(this, 0, SSTableService.MAX_LEVELS);
        for(String t : tables.keySet()) {
            SSTable ss = diskService.getSSTable(this, t, tables.get(t));

            if(ss != null) {
                value = null;

                if(ss.contains(key) && branch != null) {
                    value = ss.getUncollapsed(key, branch);
                }

                if(value != null) {
                    values.add(value);
                }
            }
        }
        diskService.unlockForRead();

        // Now merge all the histories.
        return SSTableService.collateBranches(values, false, comparator);
    }

    /**
     * Get the history of the values associated with the key across
     * all branches.
     *
     * @param key The key identifying the value.
     * @return Iterators over the history of values, keyed by branch.
     **/
    public Map<String, StreamIterator<TableValue>> getHistory(final TableKey key) {
        return getHistoryHelper(key, null);
    }

    /**
     * Get the history of the values associated with the key. The
     * branch is used to control which specific values are returned.
     *
     * @param key The key identifying the value.
     * @param branch The branch whose history should be returned.
     * @return An iterator over the history of values along the specified branch.
     **/
    public Iterator<TableValue> getHistory(final TableKey key, final String branch) {
        NavigableMap<String,StreamIterator<TableValue>> histories =
            getHistoryHelper(key, branch);

        return histories.get(branch);
    }

    /**
     * Iterate over all the keys in sorted order. Be warned that this
     * is an expensive operation since we need to scan all the sstables
     * in addition to the memtable.
     **/
    public Iterator<TableKey> iterator() {
        List<Iterator<? extends TableKey>> keys = new ArrayList<>();

        // Get the keys from the memtable.
        keys.add(table.getKeys());

        // Get the keys from the sstables.
        Map<String, Integer> tables =
            diskService.getDataManifests(this, 0, SSTableService.MAX_LEVELS);
        for(String t : tables.keySet()) {
            SSTable ss = diskService.getSSTable(this, t, tables.get(t));

            if(ss != null) {
                Iterator<TableKey> k = ss.getKeys();
                if(k != null) {
                    keys.add(k);
                }
            }
        }

        // Specify how to compare table keys.
        Comparator<TableKey> comp = new Comparator<TableKey>() {
            public int compare(TableKey k1, TableKey k2) {
                return k1.compareTo(k2);
            }
        };

        // Create a new merge iterator that will return all
        // the keys in sorted order.
        // return new UnionIterator<TableKey>(keys, comp);
        return new TreeUnionIterator<TableKey>(keys, comp);
    }

    /**
     * Fetch a value from a specific branch from the sstables. Because we want
     * the latest value, we can search in level order: if it is found in level 0,
     * then we don't need to look in level 1, etc. That is because things age in
     * level order.
     */
    private Map<String,StreamIterator<TableValue>> getLatestByLevel(final TableKey key,
                                                                    final String branch) {
        diskService.lockForRead();
        Map<String, Integer> tables =
            diskService.getDataManifests(this, 0, SSTableService.MAX_LEVELS);

        Map<Integer,List<String>> manifestByLevel = new TreeMap<>();
        for(String uuid : tables.keySet()) {
            Integer level = tables.get(uuid);

            List<String> sstables = manifestByLevel.get(level);
            if(sstables == null) {
                sstables = new ArrayList<>();
                manifestByLevel.put(level, sstables);
            }

            sstables.add(uuid);
        }

        List<Map<String,StreamIterator<TableValue>>> values = new ArrayList<>();
        for(Integer level : manifestByLevel.keySet()) {
            values.clear();

            for(String uuid : manifestByLevel.get(level)) {
                SSTable ss = diskService.getSSTable(this, uuid, level);

                if(ss != null && ss.contains(key)) {
                    Map<String, StreamIterator<TableValue>> value =
                        ss.getCollapsed(key, branch);

                    if(value != null && value.size() > 0) {
                        values.add(value);
                    }
                }
            }

            // Now see if we can collate these results.
            if(values.size() > 0) {
                diskService.unlockForRead();
                return SSTableService.collateBranches(values, false, comparator);
            }
        }

        diskService.unlockForRead();
        return null;
    }

    /**
     * Fetch a value from the sstables.
     **/
    private List<Map<String,StreamIterator<TableValue>>> getFromSSTable(final TableKey key,
                                                                        final ReadOptions options,
                                                                        final long time) {
        List<Map<String,StreamIterator<TableValue>>> bucket = new ArrayList<>();

        diskService.lockForRead();
        Map<String, Integer> tables =
            diskService.getDataManifests(this, 0, SSTableService.MAX_LEVELS);
        for(String t : tables.keySet()) {
            SSTable ss = diskService.getSSTable(this, t, tables.get(t));

            if(ss != null) {
                // First check if this sstable has this key.
                // This might result in a false positive, but we
                // check for improper iterators as well.
                if(ss.contains(key)) {
                    Map<String,StreamIterator<TableValue>> ssValues = null;

                    if(options != null && options.branch != null) {
                        ssValues = ss.getCollapsed(key, options.branch);
                    }
                    else if(options == null && time != -1) {
                        ssValues = ss.getCollapsed(key, time);
                    }
                    else {
                        ssValues = ss.getCollapsed(key);
                    }

                    if(ssValues != null) {
                        bucket.add(ssValues);
                    }
                }
            }
        }
        diskService.unlockForRead();

        return bucket;
    }
}