/**
 * Copyright 2013 Oak Ridge National Laboratory
 * Author: James Horey <horeyjl@ornl.gov>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
package gov.ornl.keva.mem;

/**
 * Java libs.
 **/
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.NavigableSet;
import java.util.Map;
import java.util.TreeMap;
import java.util.NavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ConcurrentHashMap;
import java.io.OutputStream;
import java.io.IOException;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

/**
 * Keva libs.
 **/
import gov.ornl.keva.table.TableKey;
import gov.ornl.keva.table.TableValue;
import gov.ornl.keva.table.TableValueHistory;
import gov.ornl.keva.table.TableDeleteOp;
import gov.ornl.keva.table.TableBucket;
import gov.ornl.keva.table.TableBucketFactory;
import gov.ornl.keva.core.VectorClock;
import gov.ornl.keva.core.EmptyIterator;
import gov.ornl.keva.core.StreamIterator;
import gov.ornl.keva.core.PruneOptions;

/**
 * A memtable is the data structure that holds actual key-value data in memory.
 * All data is stored in sorted, navigable order to simplify data flushing.
 * After the memtable reaches a certain threshold, it is flushed onto
 * disk as an sstable.
 *
 * @author James Horey
 */
public class MemTable {
    /**
     * Memory threshold before flushing to disk. Normally
     * set to 1/3 the amount of free memory. However, the caller
     * can override this value during memtable creation.
     */
    public static final long RECOMMENDED_THRESHOLD = Runtime.getRuntime().freeMemory() / 3;

    /**
     * Used to make sure that memtables are completely
     * empty of writers before flushing. All operations
     * normally use a "read" lock so that multiple writers
     * can co-exist. However, when performing a "flush", we
     * use a "write" lock to gain exclusivity.
     */
    private final ReentrantReadWriteLock rwLock = new ReentrantReadWriteLock();
    private final Lock writes = rwLock.readLock();
    private final Lock flushes = rwLock.writeLock();

    private long memThreshold;               // Max. size of table (in bytes).
    private Comparator<TableValue> comp;     // For sorting comparisons.
    private final ConcurrentSkipListMap<TableKey, TableBucket> map; // Our values.
    private volatile long runningKeyTotal;   // Running total memory usage for keys.
    private volatile long runningDataTotal;  // Running total memory usage for data.
    private PruneOptions pruneOptions;

    /**
     * The memtable must know its own threshold value.
     *
     * @param memThreshold How large the memtable should be before flushing to disk
     */
    public MemTable(long memThreshold) {
        // Figure out the memory threshold.
        this.memThreshold = memThreshold == RECOMMENDED_THRESHOLD ?
            RECOMMENDED_THRESHOLD : memThreshold;

        runningKeyTotal = 0;  // Keep track of memory usage.
        runningDataTotal = 0; // Keep track of memory usage.

        comp = null;
        map = new ConcurrentSkipListMap<TableKey, TableBucket>(new Comparator<TableKey>() {
            public int compare(TableKey k1, TableKey k2) {
                return k1.compareTo(k2);
            }
        });
    }
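
    /*
     * Illustrative usage sketch (an assumption for documentation purposes;
     * the construction of TableKey and TableValue instances comes from the
     * table package and is not shown here):
     *
     *   MemTable mem = new MemTable(MemTable.RECOMMENDED_THRESHOLD);
     *   mem.put(key, value, false);          // durable write on the default branch
     *   Map<String, StreamIterator<TableValue>> latest = mem.getCollapsed(key);
     */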

    /**
     * Prune options are used to control which values get
     * read and flushed. Used to prune out old values, etc.
     *
     * @param pruneOptions Prune options
     */
    public void setPruneOptions(PruneOptions pruneOptions) {
        this.pruneOptions = pruneOptions;
    }

    /**
     * Set the independent value comparator. Used to sort the independent values.
     *
     * @param comp The table value comparator
     */
    public void setComparator(Comparator<TableValue> comp) {
        this.comp = comp;
    }

    /**
     * Get the independent value comparator.
     *
     * @return The table value comparator
     */
    public Comparator<TableValue> getComparator() {
        return comp;
    }

    /**
     * The memory threshold refers to the amount of memory the table can
     * use before it is flushed onto disk.
     *
     * @param threshold Memory threshold
     */
    public void setMemThreshold(long threshold) {
        memThreshold = threshold;
    }

    /**
     * Get the current memory threshold.
     *
     * @return Flushing memory threshold
     */
    public long getMemThreshold() {
        return memThreshold;
    }

    /**
     * Estimate current memory usage and check whether we are over the memory threshold.
     *
     * @return True if the table is over the threshold. False otherwise.
     */
    public boolean shouldFlush() {
        return runningKeyTotal + runningDataTotal > memThreshold;
    }

    /**
     * Indicates whether a value for the table key exists.
     *
     * @param key The table key used to identify the value
     * @return True if the key is in the memtable. False otherwise.
     */
    public boolean contains(final TableKey key) {
        return map.containsKey(key);
    }

    /**
     * Lock the row for writing. This is used during commit operations.
     *
     * @param key The table key used to identify the value
     */
    public void lock(final TableKey key) {
        TableBucket bucket = map.get(key);
        if(bucket != null) {
            bucket.lockBucket();
        }
    }

    /**
     * Unlock the row for writing. This is used during commit operations.
     *
     * @param key The table key used to identify the value
     */
    public void unlock(final TableKey key) {
        TableBucket bucket = map.get(key);
        if(bucket != null) {
            bucket.unlockBucket();
        }
    }

    /**
     * Create the bucket if it doesn't exist.
     *
     * @param key The table key used to identify the value
     */
    public void create(final TableKey key) {
        writes.lock();
        map.putIfAbsent(key, TableBucketFactory.newBucket(TableBucket.Constraint.SAFE, comp));
        writes.unlock();
    }

    /**
     * Commit the tentative value to the memtable.
     *
     * @param key The table key used to identify the value
     * @param value The table value to store
     * @param branch The branch to store the value (optional)
     */
    public void commit(final TableKey key, final TableValue value, final String branch) {
        writes.lock();

        // Now add to the bucket.
        TableBucket bucket = map.get(key);
        bucket.commit(value, branch);

        runningDataTotal +=
            (Integer.SIZE / 8) +        // Size of the serialized data buffer.
            value.getClock().memory() + // Size of the vector clock.
            (Integer.SIZE / 8) +        // Size of compressed data length.
            (Integer.SIZE / 8) +        // Size of data length.
            value.memory();             // Size of the actual data.
        runningKeyTotal +=
            (Integer.SIZE / 8) +        // Size of key serialized length.
            key.size();                 // Size of the actual key.

        writes.unlock();
    }
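
    /*
     * Illustrative two-phase write sketch (an assumption drawn from the methods
     * above, not a documented protocol): a value is first staged with a tentative
     * put(), then promoted under the row lock with commit(). The same value
     * instance is assumed to be the one committed.
     *
     *   mem.put(key, value, branch, true);  // stage a tentative value
     *   mem.lock(key);                      // lock the row for the commit
     *   mem.commit(key, value, branch);     // promote the tentative value
     *   mem.unlock(key);
     */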

    /**
     * Add this value to the appropriate table bucket.
     *
     * @param key The table key used to identify the value
     * @param value The table value to store
     * @param tentative Indicates whether this is a tentative value
     */
    public void put(final TableKey key, final TableValue value, final boolean tentative) {
        put(key, value, null, tentative);
    }

    /**
     * Add this value to the appropriate table bucket.
     *
     * @param key The table key used to identify the value
     * @param value The table value to store
     * @param branch The branch to store the value (optional)
     * @param tentative Indicates whether this is a tentative value
     */
    public void put(final TableKey key, final TableValue value, final String branch, final boolean tentative) {
        TableBucket bucket;

        writes.lock();

        // See if we need to encode the "tentative" status.
        if(tentative) {
            value.setFlags(TableValue.TENTATIVE);
        }

        // Create a new bucket if necessary & add the value.
        map.putIfAbsent(key, TableBucketFactory.newBucket(TableBucket.Constraint.SAFE, comp));
        bucket = map.get(key);

        // Is this a delete operation? If so, we need to explicitly
        // tell the bucket so that it knows how to prune later.
        if(value instanceof TableDeleteOp) {
            bucket.setDeleteOp(true);
        }

        // Using explicit branching?
        if(branch != null) {
            bucket.add(value, branch);
        }
        else {
            bucket.add(value);
        }

        // Keep track of the total memory used.
        if(value.getAttributes() != null) {
            runningDataTotal += value.getAttributes().memory();
        }
        runningDataTotal +=
            (Integer.SIZE / 8) +        // Size of the serialized data buffer.
            value.getClock().memory() + // Size of the vector clock.
            (Integer.SIZE / 8) +        // Size of compressed data length.
            (Integer.SIZE / 8) +        // Size of data length.
            value.memory();             // Size of the actual data.
        runningKeyTotal +=
            (Integer.SIZE / 8) +        // Size of key serialized length.
            key.size();                 // Size of the actual key.

        writes.unlock();
    }

    /**
     * Get all the values associated with this key across all branches.
     *
     * @param key The table key used to identify the value
     * @return A map, keyed by branch, of iterators over all the values associated with the key
     */
    public Map<String,StreamIterator<TableValue>> getAll(final TableKey key) {
        TableBucket bucket = map.get(key);
        if(bucket != null) {
            return bucket.getComplete(pruneOptions);
        }

        return null;
    }

    /**
     * Get all the historical table values along a single branch associated with
     * the supplied key. The branch is identified using the branch name.
     *
     * @param key The table key used to identify the value
     * @param branch The branch to read the history from
     * @return A map, keyed by branch, of iterators over all the historical values associated with the key along a specific branch
     */
    public Map<String,StreamIterator<TableValue>> getUncollapsed(final TableKey key, final String branch) {
        TableBucket bucket = map.get(key);
        if(bucket != null) {
            return bucket.getUncollapsed(branch, null);
        }

        return null;
    }

    /**
     * Get all the latest, independent values associated with this key.
     * We will need to go through the bucket to reconstruct the latest values.
     *
     * @param key The table key used to identify the value
     * @return A map, keyed by branch, of iterators over the final, independent values associated with the key
     */
    public Map<String,StreamIterator<TableValue>> getCollapsed(final TableKey key) {
        TableBucket b = map.get(key);
        if(b != null) {
            return b.getCollapsed();
        }

        return null;
    }
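
    /*
     * Illustrative read sketch (an assumption for documentation purposes; the
     * returned map is keyed by branch name, and StreamIterator is assumed to
     * follow the java.util.Iterator contract):
     *
     *   Map<String, StreamIterator<TableValue>> branches = mem.getAll(key);
     *   if(branches != null) {
     *       for(Map.Entry<String, StreamIterator<TableValue>> e : branches.entrySet()) {
     *           StreamIterator<TableValue> values = e.getValue();
     *           while(values.hasNext()) {
     *               TableValue v = values.next(); // one version on branch e.getKey()
     *           }
     *       }
     *   }
     */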

    /**
     * Get all the latest, independent values associated with this key.
     * We will need to go through the bucket to reconstruct the latest values.
     *
     * @param key The table key used to identify the value
     * @param time Prune all values with a wall time less than this time
     * @return A map, keyed by branch, of iterators over the final, independent values associated with the key
     */
    public Map<String,StreamIterator<TableValue>> getCollapsed(final TableKey key, final long time) {
        TableBucket b = map.get(key);
        if(b != null) {
            return b.getCollapsed(time);
        }

        return null;
    }

    /**
     * Get all the latest, independent values associated with this key on
     * the specified branch. Since this is a collapsed value, we should
     * only return a single value.
     *
     * @param key The table key used to identify the value
     * @param branch The branch to read from
     * @return A map, keyed by branch, of iterators over the final, independent values associated with the key
     */
    public Map<String,StreamIterator<TableValue>> getCollapsed(final TableKey key, final String branch) {
        TableBucket bucket = map.get(key);
        if(bucket != null) {
            return bucket.getCollapsed(branch);
        }

        return null;
    }

    /**
     * Get the latest vector clocks associated with this key.
     *
     * @param key The table key used to identify the value
     * @return An iterator over all the latest vector clocks associated with the key
     */
    public Iterator<VectorClock> getAllClocks(final TableKey key) {
        TableBucket b = map.get(key);
        if(b != null) {
            return b.getAllClocks();
        }

        return new EmptyIterator<VectorClock>();
    }

    /**
     * Return the keys in sorted order.
     *
     * @return Iterator over the table keys
     */
    public Iterator<TableKey> getKeys() {
        return map.navigableKeySet().iterator();
    }

    /**
     * Get the number of unique keys in the memtable.
     *
     * @return Number of keys
     */
    public int getNumKeys() {
        return map.size();
    }

    /**
     * Get the total number of entries associated with the key.
     *
     * @param key The table key used to identify the value
     * @return Number of entries
     */
    public long getNumEntries(final TableKey key) {
        TableBucket b = map.get(key);
        if(b != null) {
            return b.size();
        }

        return 0;
    }

    /**
     * Drain the memtable of all active writers. Acquiring the "flush" (write)
     * lock blocks until every in-flight write has released its "read" lock.
     */
    public void flush() {
        flushes.lock();
        flushes.unlock();
    }

    /**
     * Get total amount of memory used by the data (excluding keys).
     *
     * @return Memory used in bytes
     */
    public long getDataSize() {
        return runningDataTotal;
    }

    /**
     * Get total amount of memory used by the keys (excluding data).
     *
     * @return Memory used in bytes
     */
    public long getKeySize() {
        return runningKeyTotal;
    }
}
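
/*
 * Illustrative flush-coordination sketch (an assumption; this class does not
 * itself serialize anything to disk): a hypothetical flusher thread would check
 * the threshold, drain in-flight writers, and then hand the contents to whatever
 * component writes the sstable.
 *
 *   if(mem.shouldFlush()) {
 *       mem.flush();                        // blocks until active put()/commit() calls finish
 *       Iterator<TableKey> keys = mem.getKeys();
 *       // ... walk the keys and write their values out as an sstable ...
 *   }
 */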