/**
 * Copyright 2013 Oak Ridge National Laboratory
 * Author: James Horey <horeyjl@ornl.gov>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
package gov.ornl.keva.node;

/**
 * Java libs.
 **/
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.HashMap;
import java.util.NavigableMap;
import java.util.TreeMap;
import java.util.Iterator;
import java.util.UUID;
import java.util.Comparator;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.FileVisitResult;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.channels.FileChannel;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantReadWriteLock;

/**
 * Keva libs.
 **/
import gov.ornl.keva.sstable.SSTable;
import gov.ornl.keva.sstable.DefaultSSTable;
import gov.ornl.keva.mem.MemTable;
import gov.ornl.keva.table.TableKey;
import gov.ornl.keva.table.TableValue;
import gov.ornl.keva.core.MinimumIterator;
import gov.ornl.keva.core.StreamIterator;
import gov.ornl.keva.core.MergeSortedIterator;
import gov.ornl.keva.loader.JobLoader;

/**
 * SEDA libs.
 **/
import gov.ornl.seda.SEDAService;
import gov.ornl.seda.SEDAJob;
import gov.ornl.seda.SEDAFuture;

/**
 * SSTable service performs all the work of creating and maintaining sstables.
 * This class is a singleton: only one service instance is created, and it is
 * shared across all databases.
 *
 * @author James Horey
 */
public class SSTableService extends SEDAService {
    /**
     * Job function names.
     **/
    public static final int FLUSH_JOB = 0;
    public static final int FORCE_MERGE_JOB = 1;
    public static final int MERGE_JOB = 2;

    /**
     * This class is a singleton.
     **/
    private static SSTableService service;

    /**
     * Number of sstables in a level before we start merging.
     **/
    public static int MAX_SSTABLES = 5;

    /**
     * Max number of sstable levels.
     **/
    public static int MAX_LEVELS = 6;

    /**
     * SSTable information.
     **/
    private Map<String, String> sstableClasses = new HashMap<>();
    private Map<String, Path> sstableJars = new HashMap<>();

    /**
     * Used to control access to the manifest file and for merging.
     **/
    private final ReentrantReadWriteLock rwLock = new ReentrantReadWriteLock();
    private final Lock readLock = rwLock.readLock();
    private final Lock writeLock = rwLock.writeLock();

    /**
     * Manifest cache.
     */
    private Map<String, Integer> manifestCache = null;

    /**
     * All block IDs have a fixed length. Makes it easier to parse.
     */
    private final static int UUID_LENGTH;

    static { // Static initializers.
        service = new SSTableService();

        String uuid = UUID.randomUUID().toString();
        UUID_LENGTH = uuid.length(); // Assume that all UUIDs are the same length.
    }

    /**
     * Private constructor makes sure that clients cannot
     * directly instantiate the service.
     **/
    private SSTableService() {
    }

    /**
     * Public facing initializer.
     **/
    public static SSTableService newInstance() {
        return service;
    }
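
    /*
     * A minimal usage sketch of the singleton service. The KevaDB and
     * MemTable instances are assumed to be created elsewhere in Keva, and
     * the custom class/jar names below are purely illustrative:
     *
     *   SSTableService service = SSTableService.newInstance();
     *   service.addDB(db, "com.example.MySSTable", "/opt/keva/my-sstable.jar");
     *   SEDAFuture flushed = service.flush(db, memTable); // enqueue a flush job
     *   service.mergeIfNecessary(db);                     // enqueue a merge check
     */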

    /**
     * Add a new sstable implementation.
     *
     * @param db The database being added
     * @param tableClass Name of the sstable class used by this database
     * @param jar Name of the jar file containing the sstable implementation
     **/
    public void addDB(KevaDB db, String tableClass, String jar) {
        sstableClasses.put(db.getID(), tableClass);

        if(jar != null) {
            sstableJars.put(db.getID(), Paths.get(jar).toAbsolutePath());
        }
    }

    /**
     * Where all the Level SSTables live.
     **/
    private String getLevelPath(String path, int level) {
        return path + System.getProperty("file.separator") + String.format("level%d", level);
    }

    /**
     * Flush the data into the merge buffer.
     **/
    protected void flushMergeBuffer(final ByteBuffer mergeBuffer, final ByteBuffer data) {
        // Just add the data into the merge buffer.
        mergeBuffer.put(data);
    }

    /**
     * Where the sstable data lives.
     **/
    private String getDataPath(String path, int level) throws IOException {
        String p = getLevelPath(path, level) + System.getProperty("file.separator") + "data";
        Files.createDirectories(Paths.get(p).toAbsolutePath());
        return p;
    }

    /**
     * Where the sstable metadata lives.
     **/
    private String getIndexPath(String path, int level) throws IOException {
        String p = getLevelPath(path, level) + System.getProperty("file.separator") + "index";
        Files.createDirectories(Paths.get(p).toAbsolutePath());
        return p;
    }

    /**
     * Where the sstable bloom filter lives.
     **/
    private String getFilterPath(String path, int level) throws IOException {
        String p = getLevelPath(path, level) + System.getProperty("file.separator") + "filter";
        Files.createDirectories(Paths.get(p).toAbsolutePath());
        return p;
    }

    /**
     * Where the manifest file lives.
     **/
    private String getManifestPath(String path, int level) throws IOException {
        String levelPath = getLevelPath(path, level);
        Files.createDirectories(Paths.get(levelPath).toAbsolutePath());
        return levelPath + System.getProperty("file.separator") + "manifest";
    }

    /**
     * Remove these tables from the manifest.
     **/
    private void removeFromManifest(final String path, final Map<String, SSTable> tables, final int level) {
        /**
         * Remove the sstables by:
         * (1) Scanning all the old items while excluding the supplied sstables
         * (2) Deleting the old manifest
         * (3) Writing out a new manifest with the list
         **/
        try {
            // Where is the manifest located?
            String pathName = getManifestPath(path, level);

            // Check if this file exists.
            Path p = Paths.get(pathName).toAbsolutePath();
            if(Files.exists(p)) {
                List<String> manifestList = new ArrayList<>();

                // Create a file channel and map the buffer for reading.
                FileChannel chan = FileChannel.open(p, StandardOpenOption.READ);
                MappedByteBuffer in = chan.map(FileChannel.MapMode.READ_ONLY, 0, (int)chan.size());

                // Read in one SSTable UUID at a time. Assume that all the IDs are
                // the exact same length.
                for(int i = 0; i < (int)chan.size(); i += UUID_LENGTH) {
                    byte[] buffer = new byte[UUID_LENGTH];
                    in.get(buffer, 0, UUID_LENGTH);

                    String tableName = new String(buffer);
                    if(!tables.containsKey(tableName)) {
                        // Add this sstable to our new manifest.
                        manifestList.add(tableName);
                    }
                }
                chan.close();

                // Now delete the old manifest file.
                Files.delete(p);
                Files.createFile(p);

                // Now write out the new manifest file.
                chan = FileChannel.open(p, StandardOpenOption.CREATE, StandardOpenOption.APPEND);
                for(String m : manifestList) {
                    byte[] data = m.getBytes();
                    ByteBuffer buffer = ByteBuffer.allocate(data.length);

                    buffer.put(data, 0, UUID_LENGTH); // Transfer to the buffer.
                    buffer.flip(); // Flip the buffer so we can write.
                    chan.write(buffer); // Write to the channel.
                }
                chan.close(); // Close the channel.
            }
        } catch(Exception e) {
            e.printStackTrace();
        }

        // Invalidate the manifest cache.
        manifestCache = null;
    }
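
    /*
     * On-disk manifest layout, as implied by removeFromManifest() above and
     * updateDataManifest() below: a flat concatenation of fixed-length UUID
     * strings, one per sstable, with no delimiters or terminators. A
     * stand-alone parsing sketch (the manifestPath variable is assumed):
     *
     *   byte[] raw = Files.readAllBytes(manifestPath);
     *   for(int off = 0; off + UUID_LENGTH <= raw.length; off += UUID_LENGTH) {
     *       String blockId = new String(raw, off, UUID_LENGTH);
     *       // blockId names one sstable at this level.
     *   }
     */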

    /**
     * Record the new sstable ID so that we can easily
     * maintain a list of all current sstables.
     **/
    private void updateDataManifest(final String path, final SSTable table, final int level) {
        try {
            // Get the manifest path.
            String pathName = getManifestPath(path, level);

            Path p = Paths.get(pathName).toAbsolutePath();
            if(!Files.exists(p)) {
                // Manifest doesn't exist yet. Most likely we are merging
                // manifests and creating a new upper-level manifest.
                Files.createFile(p);
            }

            // Create a file channel for appending.
            FileChannel chan = FileChannel.open(p, StandardOpenOption.CREATE, StandardOpenOption.APPEND);

            byte[] data = table.getUUID().getBytes();
            ByteBuffer buffer = ByteBuffer.allocate(data.length);

            buffer.put(data, 0, UUID_LENGTH); // Transfer to the buffer.
            buffer.flip(); // Flip the buffer so we can write.
            chan.write(buffer); // Write to the channel.
            chan.close(); // Close the channel.
        } catch(IOException e) {
            e.printStackTrace();
        }

        // Invalidate the manifest cache.
        manifestCache = null;
    }

    /**
     * Lock reading the sstables. There may be multiple concurrent
     * readers, but only a single writer.
     */
    public void lockForRead() {
        readLock.lock();
    }

    /**
     * Unlock reading the sstables. There may be multiple concurrent
     * readers, but only a single writer.
     */
    public void unlockForRead() {
        readLock.unlock();
    }

    /**
     * Lock writing the sstables. There may be multiple concurrent
     * readers, but only a single writer.
     */
    public void lockForWrite() {
        writeLock.lock();
    }

    /**
     * Unlock writing the sstables. There may be multiple concurrent
     * readers, but only a single writer.
     */
    public void unlockForWrite() {
        writeLock.unlock();
    }

    /**
     * Get the list of sstables.
     *
     * @param db The database we are searching
     * @param startLevel Start looking at this level
     * @param maxLevel Stop looking at this level
     * @return Manifest mapping the name of the sstable block to the level at which the sstable is found
     */
    public Map<String, Integer> getDataManifests(final KevaDB db, final int startLevel, final int maxLevel) {
        // We have the full manifest cached, so use it. Callers asking for the
        // full range use MAX_LEVELS as the upper bound.
        if(manifestCache != null && startLevel == 0 && maxLevel == MAX_LEVELS) {
            return manifestCache;
        }

        // Otherwise we need to re-read the manifest.
        Map<String, Integer> manifest = new HashMap<>();
        try {
            for(int i = startLevel; i <= maxLevel; ++i) {
                // Get the manifest path.
                String pathName = getManifestPath(db.getDataPath(), i);

                // Check if this file exists.
                Path path = Paths.get(pathName).toAbsolutePath();
                if(Files.exists(path)) {
                    // Create a file channel and map the buffer for reading.
                    FileChannel chan = FileChannel.open(path, StandardOpenOption.READ);
                    MappedByteBuffer in = chan.map(FileChannel.MapMode.READ_ONLY, 0, (int)chan.size());

                    // Read in one SSTable UUID at a time. Assume that all the IDs are
                    // the exact same length.
                    for(int j = 0; j < (int)chan.size(); j += UUID_LENGTH) {
                        byte[] buffer = new byte[UUID_LENGTH];
                        in.get(buffer, 0, UUID_LENGTH);
                        manifest.put(new String(buffer), i);
                    }

                    // Close the channel.
                    chan.close();
                }
            }
        } catch(Exception e) {
            e.printStackTrace();
        }

        // Should we cache these results?
        if(startLevel == 0 && maxLevel == MAX_LEVELS) {
            manifestCache = manifest;
        }

        return manifest;
    }
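
    /*
     * Intended reader pattern for the lock methods above (a sketch; callers
     * are assumed to pair lock/unlock in try/finally):
     *
     *   service.lockForRead();
     *   try {
     *       Map<String, Integer> manifest =
     *           service.getDataManifests(db, 0, SSTableService.MAX_LEVELS);
     *       // ... resolve manifest entries via getSSTable() ...
     *   } finally {
     *       service.unlockForRead();
     *   }
     */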

    /**
     * Retrieve the SSTable associated with the path.
     *
     * @param db The database we are searching
     * @param uuid The block ID of the sstable
     * @param level The level to search for the sstables
     * @return SSTable identified by the parameters
     */
    public SSTable getSSTable(final KevaDB db, final String uuid, final int level) {
        String dataPath = null;
        String indexPath = null;
        String filterPath = null;

        try {
            // Get the data paths.
            dataPath = getDataPath(db.getDataPath(), level);
            indexPath = getIndexPath(db.getDataPath(), level);
            filterPath = getFilterPath(db.getDataPath(), level);
        } catch(IOException e) {
            return null;
        }

        if(dataPath == null || indexPath == null || filterPath == null) {
            return null;
        }

        // Create both the sstable & metadata table.
        SSTable table = instantiateTable(db, dataPath, indexPath, filterPath);
        table.setUUID(uuid);
        table.init();

        return table;
    }

    /**
     * Indicate whether any sstable is older than or equal to the
     * supplied time. This is useful when replaying the write-ahead
     * log so that we don't replay commands already in some sstables.
     *
     * @param db The database we are searching
     * @param time The modification time
     * @return Whether any sstable matches the parameter
     */
    public boolean hasOlderSSTable(final KevaDB db, final long time) {
        lockForRead();
        Map<String, Integer> manifest = getDataManifests(db, 0, MAX_LEVELS);
        unlockForRead();

        for(String uuid : manifest.keySet()) {
            SSTable table = getSSTable(db, uuid, manifest.get(uuid));
            // getSSTable() may return null if the paths could not be created.
            if(table != null && table.getModificationTime() <= time) {
                return true;
            }
        }

        return false;
    }

    /**
     * Collate the histories from multiple sstables, grouped by branch.
     * (Note: the independentComparator parameter is currently unused;
     * values are always ordered by their local clock time.)
     */
    public static NavigableMap<String, StreamIterator<TableValue>> collateBranches(List<Map<String, StreamIterator<TableValue>>> values,
                                                                                   boolean minimize,
                                                                                   Comparator<TableValue> independentComparator) {
        NavigableMap<String, List<StreamIterator<TableValue>>> histories = new TreeMap<>();
        NavigableMap<String, StreamIterator<TableValue>> collated = new TreeMap<>();

        // We need to take all the related table histories (those that
        // belong to the same branch), and store them in a list.
        for(Map<String, StreamIterator<TableValue>> fm : values) {
            for(String branch : fm.keySet()) {
                List<StreamIterator<TableValue>> iters = histories.get(branch);
                if(iters == null) {
                    iters = new ArrayList<>();
                    histories.put(branch, iters);
                }

                iters.add(fm.get(branch));
            }
        }

        // Specify how values should be sorted. Use Long.compare to avoid
        // the truncation of casting the local clock times to int.
        Comparator<TableValue> tvc = new Comparator<TableValue>() {
            public int compare(TableValue v1, TableValue v2) {
                if(v1 == v2) {
                    return 0;
                }
                return Long.compare(v1.getClock().getLocalTime(), v2.getClock().getLocalTime());
            }
        };

        for(String branch : histories.keySet()) {
            List<StreamIterator<TableValue>> iters = histories.get(branch);

            if(minimize) {
                // We want to collapse these iterators.
                MinimumIterator<TableValue> iter = new MinimumIterator<>(tvc);
                iter.addAll(iters);
                collated.put(branch, iter);
            } else {
                // We want to return all the values, but in
                // sorted order.
                MergeSortedIterator<TableValue> iter = new MergeSortedIterator<>(tvc);
                iter.addAll(iters);
                collated.put(branch, iter);
            }
        }

        return collated;
    }
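
    /*
     * collateBranches() sketch: each input map holds one sstable's
     * branch -> value-iterator view; the result groups the iterators by
     * branch and merges each group. An illustrative call (perTable is
     * assumed to be built from per-sstable reads elsewhere; the comparator
     * argument may be null since it is currently unused):
     *
     *   List<Map<String, StreamIterator<TableValue>>> perTable = buildViews();
     *   NavigableMap<String, StreamIterator<TableValue>> merged =
     *       SSTableService.collateBranches(perTable, false, null);
     *   // merged.get(branch) iterates that branch's values in clock order.
     */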

    /**
     * This initiates an actual job request.
     **/
    @SuppressWarnings("unchecked")
    protected void startJob(SEDAJob job) {
        if(job.getMethod() == FLUSH_JOB) {
            flushHelper((KevaDB)job.getArgs().get(0),
                        (MemTable)job.getArgs().get(1));
            job.setStatus(SEDAJob.DONE); // All done with this job.
        } else if(job.getMethod() == FORCE_MERGE_JOB) {
            forceMergeHelper((KevaDB)job.getArgs().get(0));
            job.setStatus(SEDAJob.DONE); // All done with this job.
        } else if(job.getMethod() == MERGE_JOB) {
            mergeIfNecessaryHelper((KevaDB)job.getArgs().get(0));
            job.setStatus(SEDAJob.DONE); // All done with this job.
        }
    }

    /**
     * Flush the memtable to disk and make it into an sstable.
     *
     * @param db The database we are searching
     * @param mem The memtable to flush
     **/
    public SEDAFuture flush(final KevaDB db, final MemTable mem) {
        List<Object> args = new ArrayList<Object>(2);
        args.add(db);
        args.add(mem);

        return addJob(new SEDAJob(FLUSH_JOB, args));
    }

    /**
     * Force merge the database.
     *
     * @param db The database we are searching
     */
    public SEDAFuture forceMerge(final KevaDB db) {
        List<Object> args = new ArrayList<Object>(1);
        args.add(db);

        return addJob(new SEDAJob(FORCE_MERGE_JOB, args));
    }

    /**
     * Merge tables if the merge limitation is reached.
     *
     * @param db The database we are searching
     **/
    public SEDAFuture mergeIfNecessary(final KevaDB db) {
        List<Object> args = new ArrayList<Object>(1);
        args.add(db);

        return addJob(new SEDAJob(MERGE_JOB, args));
    }

    /**
     * Recursively delete everything.
     **/
    private void deleteRecursive(String p) throws IOException {
        Files.walkFileTree(Paths.get(p).toAbsolutePath(), new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                Files.delete(file);
                return FileVisitResult.CONTINUE;
            }

            @Override
            public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
                if(exc == null) {
                    Files.delete(dir);
                    return FileVisitResult.CONTINUE;
                }
                throw exc;
            }
        });
    }

    /**
     * Format the database by deleting all the logs and sstables.
     *
     * @param db The database to format
     **/
    public void format(KevaDB db) {
        try {
            // Need to delete everything in these paths.
            if(db.getDataPath() != null) {
                deleteRecursive(db.getDataPath());
            }
        } catch(IOException e) {
            System.out.printf("sstable: could not delete directories\n");
        }
    }

    /**
     * Force a merge in the first level.
     */
    private void forceMergeHelper(final KevaDB db) {
        lockForWrite();
        mergeHelper(db, 0, getDataManifests(db, 0, 0));
        unlockForWrite();
    }

    /**
     * Help with the merging process.
     */
    private synchronized void mergeIfNecessaryHelper(final KevaDB db) {
        List<Map<String, Integer>> manifests = new ArrayList<>(MAX_LEVELS);

        // First collect all the manifests. That way we don't
        // block further sstable flushes while we are merging, but
        // still get consistent views into the manifest.
        lockForWrite();
        for(int i = 0; i <= MAX_LEVELS; ++i) {
            manifests.add(getDataManifests(db, i, i));
        }
        unlockForWrite();

        // Now merge all the manifests.
        for(int i = 0; i <= MAX_LEVELS; ++i) {
            Map<String, Integer> manifest = manifests.get(i);
            if(manifest.size() >= MAX_SSTABLES) {
                mergeHelper(db, i, manifest);
            }
        }
    }
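
    /*
     * Merge policy implied by mergeIfNecessaryHelper(): a level is merged
     * once it holds MAX_SSTABLES tables, and the merged output lands one
     * level up. With the defaults (MAX_SSTABLES = 5), five level-0 flushes
     * produce one level-1 table, twenty-five flushes produce one level-2
     * table, and so on.
     */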

    /**
     * Merge the sstables at the specified level into the next level.
     **/
    private void mergeHelper(final KevaDB db, final int level, Map<String, Integer> tables) {
        // long start = System.currentTimeMillis();
        // System.out.printf("merging level %d ", level);

        // Collect all the tables and organize for merging.
        SSTable table1 = null;
        List<SSTable> otherTables = new ArrayList<>();
        Iterator<String> iter = tables.keySet().iterator();
        while(iter.hasNext()) {
            SSTable t = getSSTable(db, iter.next(), level);

            if(table1 == null) {
                table1 = t;
            } else {
                otherTables.add(t);
            }
        }

        // Unlikely, but may happen if we try to schedule two
        // concurrent merges. We read the table names, then the
        // first merge finishes, and now the 2nd merge has no data.
        // (otherTables is never null, so checking table1 suffices.)
        if(table1 == null) {
            return;
        }

        String dataPath = null;
        String indexPath = null;
        String filterPath = null;
        try {
            // Get the paths for the merged table.
            dataPath = getDataPath(db.getDataPath(), level + 1);
            indexPath = getIndexPath(db.getDataPath(), level + 1);
            filterPath = getFilterPath(db.getDataPath(), level + 1);
        } catch(IOException e) {
            e.printStackTrace();
        }

        if(dataPath == null || indexPath == null || filterPath == null) {
            return;
        }

        // Perform the actual merge.
        SSTable mt = table1.merge(otherTables, dataPath, indexPath, filterPath);

        // Update the directory index so that we can find this table again.
        writeLock.lock();
        updateDataManifest(db.getDataPath(), mt, level + 1);

        // Remove these items from the manifest.
        Map<String, SSTable> toRemove = new HashMap<>(otherTables.size() + 1);
        toRemove.put(table1.getUUID(), table1);
        for(SSTable t : otherTables) {
            toRemove.put(t.getUUID(), t);
        }
        removeFromManifest(db.getDataPath(), toRemove, level);
        writeLock.unlock();

        // Now get rid of the old sstables.
        table1.delete();
        for(SSTable t : otherTables) {
            t.delete();
        }

        // long end = System.currentTimeMillis();
        // System.out.printf(" (%.3f sec)\n", (double)(end - start) / 1000.00);
    }

    /**
     * Place the table into the index and write the table to disk.
     * The table knows how to write itself to disk, so the service
     * just needs to supply the directory.
     **/
    private void flushHelper(final KevaDB db, final MemTable mem) {
        // Get the Level 0 paths.
        String dataPath = null;
        String indexPath = null;
        String filterPath = null;

        // long start = System.currentTimeMillis();
        // System.out.printf("flushing sstable ");

        try {
            dataPath = getDataPath(db.getDataPath(), 0);
            indexPath = getIndexPath(db.getDataPath(), 0);
            filterPath = getFilterPath(db.getDataPath(), 0);
        } catch(IOException e) {
            e.printStackTrace();
        }

        if(dataPath == null || indexPath == null || filterPath == null) {
            return;
        }

        // Write out the data. This also flushes the bloom filter and data index.
        SSTable table = instantiateTable(db, dataPath, indexPath, filterPath);
        table.setUUID(table.flush(mem));

        // Update the directory index so that we can find this table again.
        writeLock.lock();
        updateDataManifest(db.getDataPath(), table, 0);
        writeLock.unlock();

        // long end = System.currentTimeMillis();
        // System.out.printf(" (%.3f sec)\n", (double)(end - start) / 1000.00);
    }

    /**
     * Instantiate a new SSTable.
     **/
    @SuppressWarnings("unchecked")
    private SSTable instantiateTable(KevaDB db, String data, String index, String filter) {
        // Try to see if the user has specified a custom
        // sstable implementation for this DB.
        String clazz = sstableClasses.get(db.getID());
        if(clazz != null) {
            Path jar = sstableJars.get(db.getID());
            String jarPath = null;
            Object obj = null;

            if(jar != null) {
                jarPath = jar.toAbsolutePath().toString();
            }

            // Construct the parameters.
            String[] args = new String[3];
            args[0] = data;
            args[1] = index;
            args[2] = filter;

            obj = JobLoader.load(clazz, jarPath, args);
            if(obj instanceof SSTable) {
                return (SSTable)obj;
            }
        }

        // Otherwise, use the default sstable implementation.
        return new DefaultSSTable(data, index, filter);
    }
}
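
/*
 * Custom sstable plugin contract, as inferred from instantiateTable(): the
 * class registered via addDB() must implement SSTable and be loadable by
 * JobLoader from the three path arguments {data, index, filter}. A
 * hypothetical implementation, assuming DefaultSSTable is subclassable:
 *
 *   public class MySSTable extends DefaultSSTable {
 *       public MySSTable(String data, String index, String filter) {
 *           super(data, index, filter);
 *       }
 *   }
 */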