/**
 * Copyright 2013 Oak Ridge National Laboratory
 * Author: James Horey <horeyjl@ornl.gov>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
package gov.ornl.keva.mem;

/**
 * Java libs.
 **/
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.NavigableSet;
import java.util.Map;
import java.util.TreeMap;
import java.util.NavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ConcurrentHashMap;
import java.io.OutputStream;
import java.io.IOException;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

/**
 * Keva libs.
 **/
import gov.ornl.keva.table.TableKey;
import gov.ornl.keva.table.TableValue;
import gov.ornl.keva.table.TableValueHistory;
import gov.ornl.keva.table.TableDeleteOp;
import gov.ornl.keva.table.TableBucket;
import gov.ornl.keva.table.TableBucketFactory;
import gov.ornl.keva.core.VectorClock;
import gov.ornl.keva.core.EmptyIterator;
import gov.ornl.keva.core.StreamIterator;
import gov.ornl.keva.core.PruneOptions;

/**
 * A memtable is the data structure that holds actual key-value data in memory.
 * All data is stored in sorted, navigable order to simplify data flushing.
 * After the memtable reaches a certain threshold, it is flushed onto
 * disk as an sstable.
 *
 * @author James Horey
 */
public class MemTable {
    /**
     * Memory threshold before flushing to disk. Normally
     * set to 1/3 the amount of free memory. However, the caller
     * can override this value during memtable creation.
     */
    public static final long RECOMMENDED_THRESHOLD = Runtime.getRuntime().freeMemory() / 3;

    /**
     * Used to make sure that memtables are completely
     * empty of writers before flushing. All operations
     * normally use a "read" lock so that multiple writers
     * can co-exist. However, when performing a "flush", we
     * use a "write" lock to gain exclusivity.
     */
    private final ReentrantReadWriteLock rwLock = new ReentrantReadWriteLock();
    private final Lock writes = rwLock.readLock();
    private final Lock flushes = rwLock.writeLock();

    private long memThreshold;               // Max. size of table (in bytes).
    private Comparator<TableValue> comp;     // For sorting comparisons.
    private final ConcurrentSkipListMap<TableKey, TableBucket> map; // Our values.
    private volatile long runningKeyTotal;   // Running total memory usage for keys.
    private volatile long runningDataTotal;  // Running total memory usage for data.
    private PruneOptions pruneOptions;

    /**
     * The memtable must know its own threshold value.
     *
     * @param memThreshold How large the memtable should be before flushing to disk
     */
    public MemTable(long memThreshold) {
        // Figure out the memory threshold.
        this.memThreshold = memThreshold == RECOMMENDED_THRESHOLD ?
            RECOMMENDED_THRESHOLD : memThreshold;

        runningKeyTotal = 0;  // Keep track of memory usage.
        runningDataTotal = 0; // Keep track of memory usage.

        comp = null;
        map = new ConcurrentSkipListMap<TableKey, TableBucket>(new Comparator<TableKey>() {
            public int compare(TableKey k1, TableKey k2) {
                return k1.compareTo(k2);
            }
        });
    }
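
    /*
     * Illustrative usage sketch (an assumption for documentation purposes;
     * the construction of TableKey and TableValue instances comes from the
     * table package and is not shown here):
     *
     *   MemTable mem = new MemTable(MemTable.RECOMMENDED_THRESHOLD);
     *   mem.put(key, value, false);          // durable write on the default branch
     *   Map<String, StreamIterator<TableValue>> latest = mem.getCollapsed(key);
     */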

    /**
     * Prune options are used to control which values get
     * read and flushed. Used to prune out old values, etc.
     *
     * @param pruneOptions Prune options
     */
    public void setPruneOptions(PruneOptions pruneOptions) {
        this.pruneOptions = pruneOptions;
    }

    /**
     * Set the independent value comparator. Used to sort the independent values.
     *
     * @param comp The table value comparator
     */
    public void setComparator(Comparator<TableValue> comp) {
        this.comp = comp;
    }

    /**
     * Get the independent value comparator.
     *
     * @return The table value comparator
     */
    public Comparator<TableValue> getComparator() {
        return comp;
    }

    /**
     * The memory threshold refers to the amount of memory the table can
     * use before it is flushed onto disk.
     *
     * @param threshold Memory threshold
     */
    public void setMemThreshold(long threshold) {
        memThreshold = threshold;
    }

    /**
     * Get the current memory threshold.
     *
     * @return Flushing memory threshold
     */
    public long getMemThreshold() {
        return memThreshold;
    }

    /**
     * Estimate current memory usage and check whether we are over the memory threshold.
     *
     * @return True if the table is over the threshold. False otherwise.
     */
    public boolean shouldFlush() {
        return runningKeyTotal + runningDataTotal > memThreshold;
    }

    /**
     * Indicates whether a value for the table key exists.
     *
     * @param key The table key used to identify the value
     * @return True if the key is in the memtable. False otherwise.
     */
    public boolean contains(final TableKey key) {
        return map.containsKey(key);
    }

    /**
     * Lock the row for writing. This is used during commit operations.
     *
     * @param key The table key used to identify the value
     */
    public void lock(final TableKey key) {
        TableBucket bucket = map.get(key);
        if(bucket != null) {
            bucket.lockBucket();
        }
    }

    /**
     * Unlock the row for writing. This is used during commit operations.
     *
     * @param key The table key used to identify the value
     */
    public void unlock(final TableKey key) {
        TableBucket bucket = map.get(key);
        if(bucket != null) {
            bucket.unlockBucket();
        }
    }

    /**
     * Create the bucket if it doesn't exist.
     *
     * @param key The table key used to identify the value
     */
    public void create(final TableKey key) {
        writes.lock();
        map.putIfAbsent(key, TableBucketFactory.newBucket(TableBucket.Constraint.SAFE, comp));
        writes.unlock();
    }

    /**
     * Commit the tentative value to the memtable.
     *
     * @param key The table key used to identify the value
     * @param value The table value to store
     * @param branch The branch to store the value (optional)
     */
    public void commit(final TableKey key, final TableValue value, final String branch) {
        writes.lock();

        // Now add to the bucket.
        TableBucket bucket = map.get(key);
        bucket.commit(value, branch);

        runningDataTotal +=
            (Integer.SIZE / 8) +        // Size of the serialized data buffer.
            value.getClock().memory() + // Size of the vector clock.
            (Integer.SIZE / 8) +        // Size of compressed data length.
            (Integer.SIZE / 8) +        // Size of data length.
            value.memory();             // Size of the actual data.
        runningKeyTotal +=
            (Integer.SIZE / 8) +        // Size of key serialized length.
            key.size();                 // Size of the actual key.

        writes.unlock();
    }
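
    /*
     * Illustrative two-phase write sketch (an assumption drawn from the methods
     * above, not a documented protocol): a value is first staged with a tentative
     * put(), then promoted under the row lock with commit(). The same value
     * instance is assumed to be the one committed.
     *
     *   mem.put(key, value, branch, true);  // stage a tentative value
     *   mem.lock(key);                      // lock the row for the commit
     *   mem.commit(key, value, branch);     // promote the tentative value
     *   mem.unlock(key);
     */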

    /**
     * Add this value to the appropriate table bucket.
     *
     * @param key The table key used to identify the value
     * @param value The table value to store
     * @param tentative Indicates whether this is a tentative value
     */
    public void put(final TableKey key, final TableValue value, final boolean tentative) {
        put(key, value, null, tentative);
    }

    /**
     * Add this value to the appropriate table bucket.
     *
     * @param key The table key used to identify the value
     * @param value The table value to store
     * @param branch The branch to store the value (optional)
     * @param tentative Indicates whether this is a tentative value
     */
    public void put(final TableKey key, final TableValue value, final String branch, final boolean tentative) {
        TableBucket bucket;

        writes.lock();

        // See if we need to encode the "tentative" status.
        if(tentative) {
            value.setFlags(TableValue.TENTATIVE);
        }

        // Create a new bucket if necessary & add the value.
        map.putIfAbsent(key, TableBucketFactory.newBucket(TableBucket.Constraint.SAFE, comp));
        bucket = map.get(key);

        // Is this a delete operation? If so, we need to explicitly
        // tell the bucket so that it knows how to prune later.
        if(value instanceof TableDeleteOp) {
            bucket.setDeleteOp(true);
        }

        // Using explicit branching?
        if(branch != null) {
            bucket.add(value, branch);
        }
        else {
            bucket.add(value);
        }

        // Keep track of the total memory used.
        if(value.getAttributes() != null) {
            runningDataTotal += value.getAttributes().memory();
        }
        runningDataTotal +=
            (Integer.SIZE / 8) +        // Size of the serialized data buffer.
            value.getClock().memory() + // Size of the vector clock.
            (Integer.SIZE / 8) +        // Size of compressed data length.
            (Integer.SIZE / 8) +        // Size of data length.
            value.memory();             // Size of the actual data.
        runningKeyTotal +=
            (Integer.SIZE / 8) +        // Size of key serialized length.
            key.size();                 // Size of the actual key.

        writes.unlock();
    }

    /**
     * Get all the values associated with this key across all branches.
     *
     * @param key The table key used to identify the value
     * @return A map, keyed by branch, of iterators over all the values associated with the key
     */
    public Map<String,StreamIterator<TableValue>> getAll(final TableKey key) {
        TableBucket bucket = map.get(key);
        if(bucket != null) {
            return bucket.getComplete(pruneOptions);
        }

        return null;
    }

    /**
     * Get all the historical table values along a single branch associated with
     * the supplied key. The branch is identified using the branch name.
     *
     * @param key The table key used to identify the value
     * @param branch The branch to read the history from
     * @return A map, keyed by branch, of iterators over all the historical values associated with the key along a specific branch
     */
    public Map<String,StreamIterator<TableValue>> getUncollapsed(final TableKey key, final String branch) {
        TableBucket bucket = map.get(key);
        if(bucket != null) {
            return bucket.getUncollapsed(branch, null);
        }

        return null;
    }

    /**
     * Get all the latest, independent values associated with this key.
     * We will need to go through the bucket to reconstruct the latest values.
     *
     * @param key The table key used to identify the value
     * @return A map, keyed by branch, of iterators over the final, independent values associated with the key
     */
    public Map<String,StreamIterator<TableValue>> getCollapsed(final TableKey key) {
        TableBucket b = map.get(key);
        if(b != null) {
            return b.getCollapsed();
        }

        return null;
    }
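
    /*
     * Illustrative read sketch (an assumption for documentation purposes; the
     * returned map is keyed by branch name, and StreamIterator is assumed to
     * follow the java.util.Iterator contract):
     *
     *   Map<String, StreamIterator<TableValue>> branches = mem.getAll(key);
     *   if(branches != null) {
     *       for(Map.Entry<String, StreamIterator<TableValue>> e : branches.entrySet()) {
     *           StreamIterator<TableValue> values = e.getValue();
     *           while(values.hasNext()) {
     *               TableValue v = values.next(); // one version on branch e.getKey()
     *           }
     *       }
     *   }
     */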

    /**
     * Get all the latest, independent values associated with this key.
     * We will need to go through the bucket to reconstruct the latest values.
     *
     * @param key The table key used to identify the value
     * @param time Prune all values with a wall time less than this time
     * @return A map, keyed by branch, of iterators over the final, independent values associated with the key
     */
    public Map<String,StreamIterator<TableValue>> getCollapsed(final TableKey key, final long time) {
        TableBucket b = map.get(key);
        if(b != null) {
            return b.getCollapsed(time);
        }

        return null;
    }

    /**
     * Get all the latest, independent values associated with this key on
     * the specified branch. Since this is a collapsed value, we should
     * only return a single value.
     *
     * @param key The table key used to identify the value
     * @param branch The branch to read from
     * @return A map, keyed by branch, of iterators over the final, independent values associated with the key
     */
    public Map<String,StreamIterator<TableValue>> getCollapsed(final TableKey key, final String branch) {
        TableBucket bucket = map.get(key);
        if(bucket != null) {
            return bucket.getCollapsed(branch);
        }

        return null;
    }

    /**
     * Get the latest vector clocks associated with this key.
     *
     * @param key The table key used to identify the value
     * @return An iterator over all the latest vector clocks associated with the key
     */
    public Iterator<VectorClock> getAllClocks(final TableKey key) {
        TableBucket b = map.get(key);
        if(b != null) {
            return b.getAllClocks();
        }

        return new EmptyIterator<VectorClock>();
    }

    /**
     * Return the keys in sorted order.
     *
     * @return Iterator over the table keys
     */
    public Iterator<TableKey> getKeys() {
        return map.navigableKeySet().iterator();
    }

    /**
     * Get the number of unique keys in the memtable.
     *
     * @return Number of keys
     */
    public int getNumKeys() {
        return map.size();
    }

    /**
     * Get the total number of entries associated with the key.
     *
     * @param key The table key used to identify the value
     * @return Number of entries
     */
    public long getNumEntries(final TableKey key) {
        TableBucket b = map.get(key);
        if(b != null) {
            return b.size();
        }

        return 0;
    }

    /**
     * Drain the memtable of all active writers. Acquiring the "flush" (write)
     * lock blocks until every in-flight write has released its "read" lock.
     */
    public void flush() {
        flushes.lock();
        flushes.unlock();
    }

    /**
     * Get total amount of memory used by the data (excluding keys).
     *
     * @return Memory used in bytes
     */
    public long getDataSize() {
        return runningDataTotal;
    }

    /**
     * Get total amount of memory used by the keys (excluding data).
     *
     * @return Memory used in bytes
     */
    public long getKeySize() {
        return runningKeyTotal;
    }
}
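
/*
 * Illustrative flush-coordination sketch (an assumption; this class does not
 * itself serialize anything to disk): a hypothetical flusher thread would check
 * the threshold, drain in-flight writers, and then hand the contents to whatever
 * component writes the sstable.
 *
 *   if(mem.shouldFlush()) {
 *       mem.flush();                        // blocks until active put()/commit() calls finish
 *       Iterator<TableKey> keys = mem.getKeys();
 *       // ... walk the keys and write their values out as an sstable ...
 *   }
 */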