/* * Copyright (c) 2008, Jan Stender, Bjoern Kolbeck, Mikael Hoegqvist, * Felix Hupfeld, Zuse Institute Berlin * * Licensed under the BSD License, see LICENSE file for details. * */ package de.mxro.thrd.babudb05.index; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Map.Entry; import java.util.NoSuchElementException; import de.mxro.thrd.babudb05.api.database.ResultSet; import de.mxro.thrd.babudb05.api.index.ByteRangeComparator; import de.mxro.thrd.babudb05.index.overlay.MultiOverlayBufferTree; import de.mxro.thrd.babudb05.index.reader.DiskIndex; import de.mxro.thrd.babudb05.index.reader.InternalBufferUtil; import de.mxro.thrd.babudb05.index.reader.InternalDiskIndexIterator; import de.mxro.thrd.babudb05.index.reader.InternalMergeIterator; import de.mxro.thrd.babudb05.index.writer.DiskIndexWriter; import de.mxro.thrd.babudb05.snapshots.SnapshotConfig; import de.mxro.thrd.xstreemfs.foundation.logging.Logging; import de.mxro.thrd.xstreemfs.foundation.util.OutputUtils; public class LSMTree { private static long totalOnDiskSize = 0; private static final byte[] NULL_ELEMENT = new byte[0]; private MultiOverlayBufferTree overlay; private DiskIndex index; private final ByteRangeComparator comp; private final Object lock; private boolean compressed; private final int maxEntriesPerBlock; private final int maxBlockFileSize; private final boolean useMMap; private final int mmapLimitBytes; /** * Creates a new LSM tree. * * @param indexFile * the on-disk index file - may be <code>null</code> * @param comp * a comparator for byte ranges * @param compressed * Compression of disk-index * @throws IOException * if an I/O error occurs when accessing the on-disk index file */ public LSMTree(String indexFile, ByteRangeComparator comp, boolean compressed, int maxEntriesPerBlock, int maxBlockFileSize, boolean useMMap, int mmapLimit) throws IOException { this.comp = comp; this.compressed = compressed; this.maxEntriesPerBlock = maxEntriesPerBlock; this.maxBlockFileSize = maxBlockFileSize; this.useMMap = useMMap; this.mmapLimitBytes = mmapLimit * 1024 * 1024; overlay = new MultiOverlayBufferTree(NULL_ELEMENT, comp); totalOnDiskSize += indexFile == null ? 0 : getTotalDirSize(new File(indexFile)); index = indexFile == null ? null : new DiskIndex(indexFile, comp, compressed, useMmap()); lock = new Object(); } /** * Performs a lookup. * * @param key * the key to look up * @return the value associated with the key */ public byte[] lookup(byte[] key) { byte[] result = overlay.lookup(key); if (result == NULL_ELEMENT) return null; if (result != null) return result; return index == null ? null : index.lookup(key); } /** * Performs a lookup in a given snapshot. * * @param key * the key to look up * @param snapId * the snapshot ID * @return the value associated with the key in the snapshot */ public byte[] lookup(byte[] key, int snapId) { byte[] result = overlay.lookup(key, snapId); if (result == NULL_ELEMENT) return null; if (result != null) return result; return index == null ? null : index.lookup(key); } /** * Returns the first entry. * * @return the first entry */ public Entry<byte[], byte[]> firstEntry() { ResultSet<byte[], byte[]> it = prefixLookup(new byte[0]); Entry<byte[], byte[]> result = it.hasNext() ? it.next() : null; it.free(); return result; } /** * Returns the first entry in the given snapshot. * * @param snapId * the snapshot ID * @return the first entry */ public Entry<byte[], byte[]> firstEntry(int snapId) { ResultSet<byte[], byte[]> it = prefixLookup(new byte[0], snapId, true); Entry<byte[], byte[]> result = it.hasNext() ? it.next() : null; it.free(); return result; } /** * Returns the last entry. * * @return the last entry */ public Entry<byte[], byte[]> lastEntry() { ResultSet<byte[], byte[]> it = prefixLookup(new byte[0], false); Entry<byte[], byte[]> result = it.hasNext() ? it.next() : null; it.free(); return result; } /** * Returns the last entry in the given snapshot. * * @param snapId * the snapshot ID * @return the last entry */ public Entry<byte[], byte[]> lastEntry(int snapId) { ResultSet<byte[], byte[]> it = prefixLookup(new byte[0], snapId, false); Entry<byte[], byte[]> result = it.hasNext() ? it.next() : null; it.free(); return result; } /** * Performs a prefix lookup. Key-value paris are returned in an iterator in * ascending key order, where only such keys are returned with a matching * prefix according to the comparator. * * @param prefix * the prefix * @return an iterator with key-value pairs */ public ResultSet<byte[], byte[]> prefixLookup(byte[] prefix) { return prefixLookup(prefix, true); } /** * Performs a prefix lookup. Key-value paris are returned in an iterator in * the given key order, where only such keys are returned with a matching * prefix according to the comparator. * * @param prefix * the prefix * @param ascending * if <code>true</code>, entries will be returned in ascending * order; otherwise, they will be returned in descending order * @return an iterator with key-value pairs */ public ResultSet<byte[], byte[]> prefixLookup(byte[] prefix, boolean ascending) { if (prefix != null && prefix.length == 0) prefix = null; List<Iterator<Entry<byte[], byte[]>>> list = new ArrayList<Iterator<Entry<byte[], byte[]>>>(2); list.add(overlay.prefixLookup(prefix, true, ascending)); if (index != null) { byte[][] rng = comp.prefixToRange(prefix, ascending); list.add(index.rangeLookup(rng[0], rng[1], ascending)); } return new OverlayMergeIterator<byte[], byte[]>(list, comp, NULL_ELEMENT, ascending); } /** * Performs a prefix lookup in a given snapshot. Key-value paris are * returned in an iterator in ascending key order, where only such keys are * returned with a matching prefix according to the comparator. * * @param prefix * the prefix * @param snapId * the snapshot ID * @return an iterator with key-value pairs */ public ResultSet<byte[], byte[]> prefixLookup(byte[] prefix, int snapId) { return prefixLookup(prefix, snapId, true); } /** * Performs a prefix lookup in a given snapshot. Key-value pairs are * returned in an iterator in the given key order, where only such keys are * returned with a matching prefix according to the comparator. * * @param prefix * the prefix * @param snapId * the snapshot ID * @param ascending * if <code>true</code>, entries will be returned in ascending * order; otherwise, they will be returned in descending order * @return an iterator with key-value pairs */ public ResultSet<byte[], byte[]> prefixLookup(byte[] prefix, int snapId, boolean ascending) { if (prefix != null && prefix.length == 0) prefix = null; List<Iterator<Entry<byte[], byte[]>>> list = new ArrayList<Iterator<Entry<byte[], byte[]>>>(2); list.add(overlay.prefixLookup(prefix, snapId, true, ascending)); if (index != null) { byte[][] rng = comp.prefixToRange(prefix, ascending); list.add(index.rangeLookup(rng[0], rng[1], ascending)); } return new OverlayMergeIterator<byte[], byte[]>(list, comp, NULL_ELEMENT, ascending); } /** * Performs a range lookup. Key-value paris are returned in an iterator in * ascending key order, where only such keys are returned between * <code>from</code> (inclusively) and <code>to</code> (inclusively), * according to the comparator. * * @param from * the first key (inclusively) * @param to * the last key (exclusively) * @return an iterator with key-value pairs */ public ResultSet<byte[], byte[]> rangeLookup(byte[] from, byte[] to) { return rangeLookup(from, to, true); } /** * Performs a range lookup. Key-value paris are returned in an iterator in * the given key order, where only such keys are returned between * <code>from</code> (inclusively) and <code>to</code> (inclusively), * according to the comparator. * * @param from * the first key (inclusively) * @param to * the last key (exclusively) * @param ascending * if <code>true</code>, entries will be returned in ascending * order; otherwise, they will be returned in descending order * @return an iterator with key-value pairs */ public ResultSet<byte[], byte[]> rangeLookup(byte[] from, byte[] to, boolean ascending) { if (from.length == 0) from = null; if (to.length == 0) to = null; List<Iterator<Entry<byte[], byte[]>>> list = new ArrayList<Iterator<Entry<byte[], byte[]>>>(2); list.add(overlay.rangeLookup(from, to, true, ascending)); if (index != null) list.add(index.rangeLookup(from, to, ascending)); return new OverlayMergeIterator<byte[], byte[]>(list, comp, NULL_ELEMENT, ascending); } /** * Performs a range lookup in a given snapshot. Key-value paris are returned * in an iterator in ascending key order, where only such keys are returned * between <code>from</code> (inclusively) and <code>to</code> * (inclusively), according to the comparator. * * @param from * the first key (inclusively) * @param to * the last key (exclusively) * @param snapId * the snapshot ID * @return an iterator with key-value pairs */ public ResultSet<byte[], byte[]> rangeLookup(byte[] from, byte[] to, int snapId) { return rangeLookup(from, to, snapId, true); } /** * Performs a range lookup in a given snapshot. Key-value pairs are returned * in an iterator in the given key order, where only such keys are returned * between <code>from</code> (inclusively) and <code>to</code> * (inclusively), according to the comparator. * * @param from * the first key (inclusively) * @param to * the last key (exclusively) * @param snapId * the snapshot ID * @param ascending * if <code>true</code>, entries will be returned in ascending * order; otherwise, they will be returned in descending order * @return an iterator with key-value pairs */ public ResultSet<byte[], byte[]> rangeLookup(byte[] from, byte[] to, int snapId, boolean ascending) { if (from.length == 0) from = null; if (to.length == 0) to = null; List<Iterator<Entry<byte[], byte[]>>> list = new ArrayList<Iterator<Entry<byte[], byte[]>>>(2); list.add(overlay.rangeLookup(from, to, snapId, true, ascending)); if (index != null) list.add(index.rangeLookup(from, to, ascending)); return new OverlayMergeIterator<byte[], byte[]>(list, comp, NULL_ELEMENT, ascending); } /** * Inserts a key-value pair. * * @param key * the key * @param value * the value */ public void insert(byte[] key, byte[] value) { synchronized (lock) { overlay.insert(key, value); } } /** * Deletes a key-value pair. This method is equivalent to * <code>insert(key, null)</code>. * * @param key * the key */ public void delete(byte[] key) { synchronized (lock) { overlay.insert(key, null); } } /** * Creates a new in-memory snapshot. * * @return the snapshot ID */ public int createSnapshot() { return overlay.newOverlay(); } /** * Writes an in-memory snapshot to a file on disk. * * @param targetFile * the file to which to write the snapshot * @param snapId * the snapshot ID * @throws IOException * if an I/O error occurs while writing the snapshot */ public void materializeSnapshot(String targetFile, int snapId) throws IOException { DiskIndexWriter writer = new DiskIndexWriter(targetFile, maxEntriesPerBlock, compressed, maxBlockFileSize); InternalMergeIterator it = internalPrefixLookup(null, snapId, true); writer.writeIndex(it); it.free(); } /** * Writes a certain part of an in-memory snapshot to a file on disk. * * @param targetFile * the file to which to write the snapshot * @param snapId * the snapshot ID * @param indexId * the id used by the database to identify this index * @param snap * the snapshot configuration * @throws IOException * if an I/O error occurs while writing the snapshot */ public void materializeSnapshot(String targetFile, final int snapId, final int indexId, final SnapshotConfig snap) throws IOException { DiskIndexWriter writer = new DiskIndexWriter(targetFile, maxEntriesPerBlock, compressed, maxBlockFileSize); writer.writeIndex(new ResultSet<Object, Object>() { private ResultSet<Object, Object>[] iterators; private Entry<Object, Object> next; private int currentIt; { byte[][] prefixes = snap.getPrefixes(indexId); currentIt = 0; if (prefixes != null) { iterators = new ResultSet[prefixes.length]; for (int i = 0; i < prefixes.length; i++) iterators[i] = internalPrefixLookup(prefixes[i], snapId, true); } else { iterators = new ResultSet[] { prefixLookup(null, snapId, true) }; } getNextElement(); } @Override public boolean hasNext() { return (next != null); } @Override public Entry<Object, Object> next() { if (next == null) throw new NoSuchElementException(); Entry<Object, Object> tmp = next; getNextElement(); return tmp; } @Override public void remove() { throw new UnsupportedOperationException(); } @Override public void free() { for (ResultSet<Object, Object> it : iterators) it.free(); } protected void getNextElement() { for (;;) { // get the next iterator w/ elements while (currentIt < iterators.length && !iterators[currentIt].hasNext()) currentIt++; // if there is no such iterator, set next to null and return if (currentIt >= iterators.length) { next = null; return; } // otherwise, next is the next element from the current // iterator next = iterators[currentIt].next(); // if this element is explicitly excluded, skip it byte[] tmp = InternalBufferUtil.toBuffer(next.getKey()); if (snap.containsKey(indexId, tmp)) break; } } }); } /** * Links the LSM tree to a new snapshot file. The on-disk index is replaced * with the index stored in the given snapshot file, and all in-memory * snapshots are discarded. * * @param snapshotFile * the snapshot file * @throws IOException * if an I/O error occurred while reading the snapshot file */ public void linkToSnapshot(String snapshotFile) throws IOException { final DiskIndex oldIndex = index; synchronized (lock) { totalOnDiskSize -= index == null ? 0 : index.getSize(); index = new DiskIndex(snapshotFile, comp, this.compressed, useMmap()); totalOnDiskSize += index.getSize(); if (oldIndex != null) oldIndex.destroy(); overlay.cleanup(); } } /** * Checks if files containing index data are memory-mapped. * * @return <code>true</code> if index files are memory-mapped, <code>false</code>, otherwise */ public boolean isMMapEnabled() { return useMMap; } /** * Checks if files containing index data are compressed. * * @return <code>true</code> if index files are compressed, <code>false</code>, otherwise */ public boolean isCompressed() { return compressed; } /** * Destroys the LSM tree. Frees all in-memory indices plus the on-disk tree. */ public void destroy() throws IOException { synchronized (lock) { if (index != null) { totalOnDiskSize -= index.getSize(); index.destroy(); } overlay.cleanup(); } } private boolean useMmap() { Logging.logMessage(Logging.LEVEL_DEBUG, this, "DB size: " + OutputUtils.formatBytes(totalOnDiskSize)); return useMMap && (mmapLimitBytes < 0 || totalOnDiskSize < mmapLimitBytes); } /** * Performs a prefix lookup. Key-value paris are returned in an iterator in * the given key order, where only such keys are returned with a matching * prefix according to the comparator. <br/> * * <b>WARNING:</b> This method should only be accessed internally, as it * provides access to internal index buffers that have to remain immutable. * * @param prefix * the prefix * @param snapId * the snapshot ID * @param ascending * if <code>true</code>, entries will be returned in ascending * order; otherwise, they will be returned in descending order * @return an iterator with references to internally used buffers */ protected InternalMergeIterator internalPrefixLookup(byte[] prefix, int snapId, boolean ascending) { if (prefix != null && prefix.length == 0) prefix = null; Iterator<Entry<byte[], byte[]>> overlayIterator = overlay.prefixLookup(prefix, snapId, true, ascending); InternalDiskIndexIterator diskIndexIterator = null; if (index != null) { byte[][] rng = comp.prefixToRange(prefix, ascending); diskIndexIterator = index.internalRangeLookup(rng[0], rng[1], ascending); } return new InternalMergeIterator(overlayIterator, diskIndexIterator, comp, NULL_ELEMENT, ascending); } private static long getTotalDirSize(File dir) { if (!dir.exists()) return 0; long size = 0; if (dir.isDirectory()) { for (File child : dir.listFiles()) size += getTotalDirSize(child); } else size += dir.length(); return size; } }