/**
 * Copyright 2013 Oak Ridge National Laboratory
 * Author: James Horey <horeyjl@ornl.gov>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 **/
package gov.ornl.keva.node;

/**
 * Configuration libs.
 **/
import gov.ornl.config.ConfigFactory;
import gov.ornl.config.Configuration;
import gov.ornl.config.ConfigEntry;

/**
 * Java libraries.
 **/
import java.util.Comparator;
import java.util.Collections;
import java.util.List;
import java.util.ArrayList;
import java.util.Map;
import java.util.Iterator;
import java.io.IOException;
import java.io.File;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.Files;
import java.nio.file.DirectoryStream;
import java.nio.file.StandardOpenOption;
import java.nio.file.NoSuchFileException;
import java.nio.file.attribute.BasicFileAttributes;
import java.nio.file.attribute.BasicFileAttributeView;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;

/**
 * Compression libraries.
 **/
import net.jpountz.lz4.LZ4Compressor;
import net.jpountz.lz4.LZ4Decompressor;
import net.jpountz.lz4.LZ4Factory;

/**
 * Keva libs.
 **/
import gov.ornl.keva.table.TableKey;
import gov.ornl.keva.table.TableValue;
import gov.ornl.keva.table.TableValueFactory;
import gov.ornl.keva.sstable.SSTable;
import gov.ornl.keva.core.WriteOptions;
import gov.ornl.keva.core.OptionsSerializer;
import gov.ornl.keva.core.KevaDBException;

/**
 * The write-ahead log persists all "put" operations (the key and value) to
 * disk sequentially. The log ensures that all values are safely persisted,
 * since memtables only store values in memory (and hence may be lost if the
 * machine is rebooted). Our WAL implementation has different flushing policies
 * depending on the safety guarantees the user expects.
 *
 * @author James Horey
 */
public class WriteAheadLog {
    private static final short MAGIC_NUMBER = 79;

    /**
     * Maximum memory-mapped buffer size. Set to a reasonable
     * size so that we don't continually re-map, but not too large.
     *
     * Currently about 4MB. In the future, this should probably be a parameter.
     */
    private static final long MAX_MAP_SIZE = 1024 * 4096;

    /**
     * Flushing policies.
     **/
    public enum Flush {
        SAFE, UNSAFE
    };

    /**
     * The LZ4 compression classes.
     */
    private static LZ4Factory lz4Factory = LZ4Factory.fastestInstance();
    private static LZ4Compressor compressor = lz4Factory.fastCompressor();
    private static LZ4Decompressor decompressor = lz4Factory.decompressor();

    private KevaDB db;                  // Used to replay the log.
    private Flush flushPolicy;          // Flushing policy (SAFE or UNSAFE).
    private long param;                 // Parameter used by a time- or size-based policy.
    private Path logPath;               // Path to the current log file.
    private FileChannel logChannel;     // Open channel to the current log.
    private MappedByteBuffer logBuffer; // Buffer to the log channel.
    private long mapStart;              // Where we are in the WAL.
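    /*
     * Each logged write is laid out on disk by serialize() and read back by
     * unroll() as:
     *
     *   [timestamp:long][MAGIC_NUMBER:short][key length:int][value length:int]
     *   [compressed length:int][options length:int][key bytes]
     *   [LZ4-compressed value bytes][serialized WriteOptions, if present]
     *
     * A minimal usage sketch (the config path below is only a placeholder,
     * and `db` stands for an existing KevaDB instance):
     *
     *   WriteAheadLog wal = new WriteAheadLog(db, "conf/keva.conf");
     *   wal.createLog();                          // Open and map a fresh log file.
     *   wal.put(key, value, null);                // Mirror a memtable write.
     *   wal.replay();                             // Re-apply logged writes after a restart.
     *   wal.recycle(System.currentTimeMillis());  // Drop logs older than the given time.
     */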
    /**
     * @param db The database to associate with this log
     * @param configFile Path to the configuration file
     */
    public WriteAheadLog(KevaDB db, String configFile) {
        this.db = db;

        flushPolicy = Flush.UNSAFE;
        param = 0;
        mapStart = 0;

        // Load the configuration.
        loadConfig(configFile);
    }

    /**
     * Start the logging system. We must define a new
     * log to store everything.
     **/
    public synchronized void createLog() {
        if(logPath == null) {
            String id = String.format("log_%d", System.currentTimeMillis());

            try {
                // Try to close the old channel.
                if(logChannel != null) {
                    logChannel.close();
                }

                logPath = Paths.get(db.getLogPath() +
                                    System.getProperty("file.separator") +
                                    id).toAbsolutePath();
                if(!Files.exists(logPath)) {
                    // Create the necessary parent directories.
                    Files.createDirectories(Paths.get(db.getLogPath()).toAbsolutePath());

                    // Make sure that our WAL is created before we return.
                    logPath = Files.createFile(logPath);

                    // Open up the line of communication. The SAFE policy uses
                    // synchronous ("rwd") mode so every write hits the disk.
                    String flag = "rw";
                    if(flushPolicy == Flush.SAFE) {
                        flag += "d";
                    }
                    logChannel = new RandomAccessFile(logPath.toString(), flag).getChannel();

                    // Map the file.
                    logBuffer = mapWAL();
                }
            } catch(IOException e) {
                e.printStackTrace();
            } catch(Exception e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Remap the WAL buffer.
     */
    private MappedByteBuffer mapWAL() {
        MappedByteBuffer buf = null;

        try {
            buf = logChannel.map(FileChannel.MapMode.READ_WRITE, mapStart, MAX_MAP_SIZE);
        } catch(IOException e) {
            e.printStackTrace();
        }

        if(buf != null) {
            mapStart += MAX_MAP_SIZE;
        }

        return buf;
    }

    /**
     * Collect the log files for the database, ordered oldest first.
     **/
    private List<Path> getLogs() {
        List<Path> logs = new ArrayList<>();

        // Compare the modified times. We want to read the
        // oldest logs first.
        Comparator<Path> comp = new Comparator<Path>() {
            public int compare(Path p1, Path p2) {
                try {
                    BasicFileAttributes v1 = Files.getFileAttributeView(p1, BasicFileAttributeView.class)
                        .readAttributes();
                    BasicFileAttributes v2 = Files.getFileAttributeView(p2, BasicFileAttributeView.class)
                        .readAttributes();

                    return Long.compare(v1.lastModifiedTime().toMillis(),
                                        v2.lastModifiedTime().toMillis());
                } catch(IOException e) {
                    e.printStackTrace();
                }

                return 0;
            }
        };

        try (DirectoryStream<Path> stream =
                 Files.newDirectoryStream(Paths.get(db.getLogPath()).toAbsolutePath(), "log_*")) {
            for (Path entry: stream) {
                logs.add(entry);
            }
        } catch(NoSuchFileException e) {
        } catch (IOException e) {
            e.printStackTrace();
        }

        // Sort the logs.
        Collections.sort(logs, comp);
        return logs;
    }

    /**
     * Configure the log.
     **/
    private void loadConfig(String c) {
        ConfigFactory configFactory;
        Configuration conf;

        configFactory = new ConfigFactory();
        if(c != null) {
            Path p = Paths.get(c);
            conf = configFactory.getConfig(p.toAbsolutePath().toString());

            if(conf != null) {
                ConfigEntry entry = conf.get("keva.wal.flush");

                if(entry != null) {
                    ConfigEntry v = entry.getEntry("value");
                    String flush = v.getValues().get(0);

                    if(flush.equals("immediate")) {
                        setPolicy(Flush.SAFE);
                    }

                    v = entry.getEntry("param");
                    if(v != null) {
                        setPolicyParam(Long.parseLong(v.getValues().get(0)));
                    }
                }
            }
        }
    }

    /**
     * Set the flushing policy. The SAFE policy flushes all values to disk
     * immediately (equivalent to an OS sync operation) by opening the log in
     * synchronous ("rwd") mode; the UNSAFE policy leaves flushing to the
     * operating system. Time- and size-based policies would flush only after
     * a certain period of time has passed, or after the in-memory buffer
     * grows too large.
     *
     * @param policy The flushing policy
     */
    public void setPolicy(Flush policy) {
        flushPolicy = policy;
    }

    /**
     * Get the flushing policy.
     *
     * @return The flushing policy
     */
    public Flush getPolicy() {
        return flushPolicy;
    }

    /**
     * The size- and time-based flushing policies require an additional
     * parameter to control when to flush.
     *
     * @param param The flushing policy parameter
     */
    public void setPolicyParam(long param) {
        this.param = param;
    }

    /**
     * Get the flushing policy parameter.
     *
     * @return The flushing policy parameter
     */
    public long getPolicyParam() {
        return param;
    }

    /**
     * Serialize the write.
     **/
    private ByteBuffer serialize(final TableKey key,
                                 final TableValue value,
                                 final WriteOptions options) {
        int size;
        int optionSize = 0;
        byte[] optionBuf = null;

        byte[] keyData = key.serialize();
        byte[] valueData = value.getBytes();

        // Compress the value with LZ4.
        int maxCompressed = compressor.maxCompressedLength(valueData.length);
        byte[] compressed = new byte[maxCompressed];
        int actualCompressed =
            compressor.compress(valueData, 0, valueData.length, compressed, 0, maxCompressed);

        // Timestamp + magic number + four lengths + key + compressed value.
        size = (Short.SIZE / 8) +
               4 * (Integer.SIZE / 8) +
               (Long.SIZE / 8) +
               keyData.length +
               actualCompressed;

        if(options != null) {
            optionBuf = OptionsSerializer.getBytes(options);

            if(optionBuf != null) {
                optionSize = optionBuf.length;
            } else {
                optionSize = 0;
            }

            size += optionSize;
        }

        ByteBuffer buf = ByteBuffer.allocateDirect(size);
        buf.putLong(System.currentTimeMillis());
        buf.putShort(MAGIC_NUMBER); // Magic number.
        buf.putInt(keyData.length);
        buf.putInt(valueData.length);
        buf.putInt(actualCompressed);
        buf.putInt(optionSize);
        buf.put(keyData);
        buf.put(compressed, 0, actualCompressed);
        if(optionBuf != null) {
            buf.put(optionBuf);
        }

        return buf;
    }

    /**
     * Clear the entire WAL history.
     */
    public void clear() {
        // Delete the current block.
        try {
            if(logPath != null) {
                Files.delete(logPath);
                logPath = null;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        // Recycle all the old blocks.
        recycle(Long.MAX_VALUE);
    }

    /**
     * Delete older logs from disk.
     *
     * @param time Modification-time cutoff; logs last modified before this
     * time are deleted (the current log is always kept).
     */
    public void recycle(long time) {
        List<Path> toRecycle = new ArrayList<>();

        // Check if the log directory exists. If it doesn't,
        // then there isn't anything to recycle.
        Path logDir = Paths.get(db.getLogPath()).toAbsolutePath();
        if(!Files.exists(logDir)) {
            return;
        }

        try (DirectoryStream<Path> stream = Files.newDirectoryStream(logDir, "log_*")) {
            for (Path entry: stream) {
                // System.out.printf("investigating log %s\n", entry);

                // Do not include the current block.
                if(logPath == null || !Files.isSameFile(entry, logPath)) {
                    BasicFileAttributes view =
                        Files.getFileAttributeView(entry, BasicFileAttributeView.class)
                        .readAttributes();

                    if(view.lastModifiedTime().toMillis() < time) {
                        // We no longer need this entry. Add it to our
                        // recycling list.
                        toRecycle.add(entry);
                    }
                }
            }

            // Now delete all the values.
            for(Path p : toRecycle) {
                Files.delete(p);
            }
        } catch(IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Log a series of writes. These writes are committed "atomically".
     * Since all writes in the log are serialized, we just need to make
     * sure that all these writes are contiguous.
     *
     * @param ops Set of write operations
     */
    public void put(final WriteBatch ops) {
        // Lock the log so that we can commit several writes at a time.
        synchronized(this) {
            Iterator<TableKey> iter = ops.iterator();
            while(iter.hasNext()) {
                TableKey key = iter.next();

                for(WriteBatch.TableWrite write : ops.getValues(key)) {
                    ByteBuffer data = serialize(key, write.value, write.options);

                    if(logBuffer.remaining() < data.position()) {
                        logBuffer = mapWAL();
                    }

                    data.flip();
                    logBuffer.put(data);
                }
            }
        }
    }

    /**
     * Log a put operation.
     *
     * @param key The key of the value.
     * @param value The value to place into the database.
     * @param options Write options that define how the
     * value is written to the memtable.
     */
    public void put(final TableKey key,
                    final TableValue value,
                    final WriteOptions options) {
        ByteBuffer data = serialize(key, value, options);
        data.flip();

        synchronized(this) {
            if(logBuffer.remaining() < data.capacity()) {
                logBuffer = mapWAL();
            }

            logBuffer.put(data);
        }
    }

    /**
     * Parse a single logged command from the byte buffer and re-apply it.
     **/
    private boolean unroll(ByteBuffer b, Map<String, Integer> manifest)
        throws KevaDBException {
        long time = b.getLong();

        // Check if this is a valid entry.
        short magic = b.getShort();
        if(magic != MAGIC_NUMBER) {
            return false;
        }

        // Read in the various lengths.
        int keyLength = b.getInt();
        int valueLength = b.getInt();
        int compressedLength = b.getInt();
        int optionLength = b.getInt();

        // Instantiate the various buffers.
        byte[] keyBuffer = new byte[keyLength];
        byte[] valueBuffer = new byte[valueLength];
        byte[] compressedBuffer = new byte[compressedLength];
        b.get(keyBuffer);
        b.get(compressedBuffer);

        // Now get the write options.
        WriteOptions options = null;
        if(optionLength > 0) {
            byte[] optionBuffer = new byte[optionLength];
            b.get(optionBuffer);
            options = OptionsSerializer.writeOptionsFromBytes(optionBuffer);
        }

        // Check if there are any sstables that are older than
        // this WAL entry. If so, we can safely skip this entry since
        // we know the results have already been persisted. The entry's
        // bytes have already been consumed above, so the buffer stays
        // positioned at the start of the next record.
        if(manifest != null) {
            for(String uuid : manifest.keySet()) {
                SSTable table = db.getDiskService().getSSTable(db, uuid, manifest.get(uuid));

                if(table.getModificationTime() <= time) {
                    return true;
                }
            }
        }

        // Now we need to decompress the value buffer.
        decompressor.decompress(compressedBuffer, 0, valueBuffer, 0, valueLength);

        TableKey key = TableKey.fromBytes(ByteBuffer.wrap(keyBuffer));
        TableValue value = TableValueFactory.fromBytes(valueBuffer, valueLength);

        // Now put the value into the DB.
        db.put(key, value, options);
        return true;
    }

    /**
     * Replay a specific WAL.
     **/
    private void replayLog(Path logPath) throws KevaDBException {
        // Now look at the disk.
        if(logPath != null) {
            // Get a list of the sstables already in place.
            Map<String, Integer> manifest =
                db.getDiskService().getDataManifests(db, 0, SSTableService.MAX_LEVELS);

            FileLock fileLock = null;
            try {
                // Must open in "rw" mode so that we can use a file lock.
                FileChannel fc = FileChannel.open(logPath,
                                                  StandardOpenOption.READ,
                                                  StandardOpenOption.WRITE);
                MappedByteBuffer in = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size());

                fileLock = fc.tryLock();
                if(fileLock != null) {
                    for(;;) {
                        if(!unroll(in, manifest)) {
                            break;
                        }
                    }
                }

                if(fileLock != null) {
                    fileLock.release();
                }
                fc.close();
            } catch(IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Replay this log.
     *
     * @throws KevaDBException is used to indicate any errors.
     */
    public void replay() throws KevaDBException {
        List<Path> logs = getLogs();

        for (Path log: logs) {
            replayLog(log);
        }
    }
}