/*
 * Copyright (c) 2013-2017 Cinchapi Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cinchapi.concourse.server.storage.db;

import java.io.EOFException;
import java.io.File;
import java.io.IOException;
import java.io.StreamCorruptedException;
import java.lang.ref.SoftReference;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import java.util.Comparator;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock;
import java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock;

import javax.annotation.Nullable;
import javax.annotation.concurrent.GuardedBy;
import javax.annotation.concurrent.ThreadSafe;

import com.cinchapi.concourse.annotate.PackagePrivate;
import com.cinchapi.concourse.server.GlobalState;
import com.cinchapi.concourse.server.concurrent.Locks;
import com.cinchapi.concourse.server.io.Byteable;
import com.cinchapi.concourse.server.io.ByteableCollections;
import com.cinchapi.concourse.server.io.Byteables;
import com.cinchapi.concourse.server.io.FileSystem;
import com.cinchapi.concourse.server.io.Syncable;
import com.cinchapi.concourse.server.storage.Action;
import com.cinchapi.concourse.server.storage.cache.BloomFilter;
import com.cinchapi.concourse.util.Logger;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.SortedMultiset;
import com.google.common.collect.TreeMultiset;

/**
 * <p>
 * A Block is a sorted collection of Revisions that is used by the Database to
 * store indexed data. When a Block is initially created, it resides solely in
 * memory and is able to insert new revisions, which are sorted on the fly by a
 * {@link Sorter}. Once the Block is synced to disk it becomes immutable and all
 * lookups are disk based. This means that writing to a block never incurs any
 * random disk I/O. A Block is not durable until the {@link #sync()} method is
 * called, so Block serialization and Buffer.Page deletion happen sequentially.
 * </p>
 * <p>
 * Each Block is stored with a corresponding {@link BloomFilter} and a
 * {@link BlockIndex} to make lookups more efficient. The BloomFilter is used to
 * test whether a Revision involving some locator and possibly key, and possibly
 * value <em>might</em> exist in the Block. The BlockIndex is used to find the
 * exact start and end positions for Revisions involving a locator and possibly
 * some key. This means that reading from a Block never incurs any unnecessary
 * disk I/O.
 * </p>
 * <p>
 * Prior to 0.2, Concourse stored each logical Record in its own file, which had
 * the advantage of simplified deserialization (we only needed to locate one
 * file and read all of its content). The down side to that approach was that a
 * single record couldn't be deserialized if it was larger than the amount of
 * available memory. Storing data in blocks helps to solve that problem,
 * because larger records are broken up and we can do more granular seeking to
 * reduce the amount of data that must come into memory (i.e. we can limit data
 * reading by timestamp). Blocks also make it much easier to nuke old data
 * without reading anything. And since each Block has its own bloom filter in
 * memory, we make best efforts to only look at files when necessary.
 * </p>
 * <p>
 * <strong>Locking note:</strong> several methods only take a lock while the
 * Block is mutable. Those methods read the {@link #mutable} flag exactly once
 * and use that same value for both the lock and the unlock call, so that a
 * concurrent {@link #sync()} (which flips the flag) cannot cause an acquired
 * lock to be left held.
 * </p>
 *
 * @author Jeff Nelson
 */
@ThreadSafe
@PackagePrivate
abstract class Block<L extends Byteable & Comparable<L>, K extends Byteable & Comparable<K>, V extends Byteable & Comparable<V>>
        implements Byteable, Syncable, Iterable<Revision<L, K, V>> {

    /**
     * Return a new PrimaryBlock that will be stored in {@code directory}.
     *
     * @param id the block id
     * @param directory the directory in which the block files are stored
     * @return the PrimaryBlock
     */
    public static PrimaryBlock createPrimaryBlock(String id, String directory) {
        return new PrimaryBlock(id, directory, false);
    }

    /**
     * Return a new SearchBlock that will be stored in {@code directory}.
     *
     * @param id the block id
     * @param directory the directory in which the block files are stored
     * @return the SearchBlock
     */
    public static SearchBlock createSearchBlock(String id, String directory) {
        return new SearchBlock(id, directory, false);
    }

    /**
     * Return a new SecondaryBlock that will be stored in {@code directory}.
     *
     * @param id the block id
     * @param directory the directory in which the block files are stored
     * @return the SecondaryBlock
     */
    public static SecondaryBlock createSecondaryBlock(String id,
            String directory) {
        return new SecondaryBlock(id, directory, false);
    }

    /**
     * Return the block id from the name of the block file.
     *
     * @param filename the name of the block file
     * @return the block id
     */
    public static String getId(String filename) {
        return FileSystem.getSimpleName(filename);
    }

    /**
     * The expected number of Block insertions. This number is used to size the
     * Block's internal data structures. This value should be large enough to
     * reflect the fact that, for each revision, we make 3 inserts into the
     * bloom filter, but no larger than necessary since we must keep all bloom
     * filters in memory.
     */
    private static final int EXPECTED_INSERTIONS = GlobalState.BUFFER_PAGE_SIZE;

    /**
     * The extension for the {@link BloomFilter} file.
     */
    private static final String FILTER_NAME_EXTENSION = ".fltr";

    /**
     * The extension for the {@link BlockIndex} file.
     */
    private static final String INDEX_NAME_EXTENSION = ".indx";

    /**
     * The extension for the block file.
     */
    @PackagePrivate
    static final String BLOCK_NAME_EXTENSION = ".blk";

    /**
     * The location of the block file.
     */
    private final String file;

    /**
     * A fixed size filter that is used to test whether elements are contained
     * in the Block without actually looking through the Block.
     */
    private BloomFilter filter;

    /**
     * The unique id for the block. Each component of the block is named after
     * the id. It is assumed that block ids should be assigned in monotonically
     * increasing order (e.g. a timestamp).
     */
    private final String id;

    /**
     * A flag that indicates whether we should ignore (and not log a warning) if
     * an attempt is made to sync an empty block.
     */
    private final boolean ignoreEmptySync;

    /**
     * The index to determine which bytes in the block pertain to a locator or
     * locator/key pair.
     * <p>
     * Since the index is only used for immutable blocks, it is only populated
     * during the call to {@link #getBytes()}.
     * </p>
     */
    private final BlockIndex index;

    /**
     * The master lock for {@link #write} and {@link #read}. DO NOT use this
     * lock directly.
     */
    private final ReentrantReadWriteLock master = new ReentrantReadWriteLock();

    /**
     * A collection that contains all the Revisions that have been inserted into
     * the Block. This collection is sorted on the fly as elements are inserted.
     * This collection is only maintained for a mutable Block. A Block that is
     * synced and subsequently read from disk does not rely on this collection
     * at all.
     */
    @Nullable
    private SortedMultiset<Revision<L, K, V>> revisions;

    /**
     * The running size of the Block. This number only refers to the size of the
     * Revisions that are stored in the block file. The size for the filter and
     * index are tracked separately.
     */
    private transient int size;

    /**
     * The size counter to use if this Block is {@link #concurrent} and uses the
     * {@link #insertUnsafe(Byteable, Byteable, Byteable, long, Action)} method.
     */
    private transient AtomicInteger atomicSize = new AtomicInteger(0);

    /**
     * A soft reference to the {@link #revisions} that <em>may</em> stay in
     * memory after the Block has been synced. The GC is encouraged to clear
     * this reference in response to memory pressure at which point disk seeks
     * will be performed in the {@link #seek(Record, Byteable...)} method.
     */
    private final SoftReference<SortedMultiset<Revision<L, K, V>>> softRevisions;

    /**
     * A hint that this Block uses the
     * {@link #insertUnsafe(Byteable, Byteable, Byteable, long, Action)} method
     * to add data without grabbing any locks. This is generally safe to do as
     * long as {@link #createBackingStore(Comparator)} returns a concurrent
     * collection that is thread safe.
     */
    protected transient boolean concurrent = false;

    /**
     * The flag that indicates whether the Block is mutable or not. A Block is
     * mutable until a call to {@link #sync()} stores it to disk.
     */
    protected transient boolean mutable;

    /**
     * A shared lock that permits many readers and no writer. Use this lock to
     * ensure that no data insert occurs while a seek is happening within the
     * Block.
     */
    protected final ReadLock read = master.readLock();

    /**
     * An exclusive lock that permits only one writer and no reader. Use this
     * lock to ensure that no seek occurs while data is being inserted into the
     * Block.
     */
    protected final WriteLock write = master.writeLock();

    /**
     * Construct a new instance.
     *
     * @param id the block id
     * @param directory the directory in which the block files are stored; it
     *            is created if necessary
     * @param diskLoad - set to {@code true} to deserialize the block {@code id}
     *            from {@code directory} on disk
     */
    protected Block(String id, String directory, boolean diskLoad) {
        FileSystem.mkdirs(directory);
        this.id = id;
        this.file = directory + File.separator + id + BLOCK_NAME_EXTENSION;
        String filterFile = directory + File.separator + id
                + FILTER_NAME_EXTENSION;
        String indexFile = directory + File.separator + id
                + INDEX_NAME_EXTENSION;
        if(diskLoad) {
            this.mutable = false;
            this.size = (int) FileSystem.getFileSize(this.file);
            try {
                this.filter = BloomFilter.open(filterFile);
            }
            catch (RuntimeException e) {
                // Either rebuilds #filter from the block data or re-throws
                repair(e);
            }
            // An immutable block never mutates its filter, so thread safety is
            // disabled regardless of whether the filter was opened or rebuilt
            // by #repair.
            filter.disableThreadSafety();
            this.index = BlockIndex.open(indexFile);
            this.revisions = null;
        }
        else {
            this.mutable = true;
            this.size = 0;
            this.revisions = createBackingStore(Sorter.INSTANCE);
            this.filter = BloomFilter.create(filterFile, EXPECTED_INSERTIONS);
            this.index = BlockIndex.create(indexFile, EXPECTED_INSERTIONS);
        }
        this.softRevisions = new SoftReference<SortedMultiset<Revision<L, K, V>>>(
                revisions);
        this.ignoreEmptySync = this instanceof SearchBlock;
    }

    /**
     * Write every in-memory revision to {@code buffer} (each one prefixed with
     * its 4 byte size) and, while doing so, populate the {@link #index} with
     * the start and end positions of each locator and each locator/key pair.
     *
     * @param buffer the destination buffer
     */
    @Override
    public void copyTo(ByteBuffer buffer) {
        // Read the flag once so the lock and unlock decisions always agree
        final boolean locked = mutable;
        Locks.lockIfCondition(read, locked);
        try {
            L locator = null;
            K key = null;
            int position = 0;
            boolean populated = false;
            for (Revision<L, K, V> revision : revisions) {
                populated = true;
                buffer.putInt(revision.size());
                revision.copyTo(buffer);
                // The position where the size prefix of this revision starts
                position = buffer.position() - revision.size() - 4;
                /*
                 * States that trigger this condition to be true:
                 * 1. This is the first locator we've seen
                 * 2. This locator is different than the last one we've seen
                 */
                if(locator == null || !locator.equals(revision.getLocator())) {
                    index.putStart(position, revision.getLocator());
                    if(locator != null) {
                        // There was a locator before us (we are not the first!)
                        // and we need to record the end index.
                        index.putEnd(position - 1, locator);
                    }
                }
                /*
                 * NOTE: IF key == null, then it must be the case that locator
                 * == null since they are set at the same time. Therefore we do
                 * not need to explicitly check for that condition below
                 *
                 * States that trigger this condition to be true:
                 * 1. This is the first key we've seen
                 * 2. This key is different than the last one we've seen
                 * (regardless of whether the locator is different or the same!)
                 * 3. This key is the same as the last one we've seen, but the
                 * locator is different.
                 */
                if(key == null || !key.equals(revision.getKey())
                        || !locator.equals(revision.getLocator())) {
                    index.putStart(position, revision.getLocator(),
                            revision.getKey());
                    if(key != null) {
                        // There was a locator, key before us (we are not the
                        // first!) and we need to record the end index.
                        index.putEnd(position - 1, locator, key);
                    }
                }
                locator = revision.getLocator();
                key = revision.getKey();
            }
            if(populated) {
                // Close out the final locator and locator/key ranges
                position = buffer.position() - 1;
                index.putEnd(position, locator);
                index.putEnd(position, locator, key);
            }
        }
        finally {
            Locks.unlockIfCondition(read, locked);
        }
    }

    @Override
    public boolean equals(Object obj) {
        if(obj instanceof Block) {
            // CON-83: I am intentionally making all Blocks with the same #id
            // equal, regardless of subclass type.
            return id.equals(((Block<?, ?, ?>) obj).id);
        }
        else {
            return false;
        }
    }

    /**
     * Return a buffer, rewound to its beginning, that contains the result of
     * {@link #copyTo(ByteBuffer)}.
     *
     * @return the bytes
     */
    @Override
    public ByteBuffer getBytes() {
        read.lock();
        try {
            ByteBuffer bytes = ByteBuffer.allocate(sizeImpl());
            copyTo(bytes);
            bytes.rewind();
            return bytes;
        }
        finally {
            read.unlock();
        }
    }

    /**
     * Return the block id.
     *
     * @return the id
     */
    public String getId() {
        return id;
    }

    @Override
    public int hashCode() {
        return id.hashCode();
    }

    /**
     * Insert a revision for {@code key} as {@code value} in {@code locator} at
     * {@code version} into this Block.
     *
     * @param locator
     * @param key
     * @param value
     * @param version
     * @param type
     * @return the inserted Revision
     * @throws IllegalStateException if the Block is not mutable
     */
    public Revision<L, K, V> insert(L locator, K key, V value, long version,
            Action type) throws IllegalStateException {
        // Read the flag once: if #sync() flips it while we wait for the lock,
        // the precondition below fails and the lock must still be released.
        final boolean locked = mutable;
        Locks.lockIfCondition(write, locked);
        try {
            Preconditions.checkState(mutable,
                    "Cannot modify a block that is not mutable");
            Revision<L, K, V> revision = makeRevision(locator, key, value,
                    version, type);
            revisions.add(revision);
            addToFilter(revision);
            size += revision.size() + 4; // +4 for the size prefix
            return revision;
        }
        finally {
            Locks.unlockIfCondition(write, locked);
        }
    }

    /**
     * {@inheritDoc}
     * <p>
     * <strong>NOTE:</strong> Use this method with extreme caution because it
     * will load all of the revisions from disk, into memory.
     * </p>
     * <p>
     * The returned iterator does not support removal and its {@code next()}
     * method returns {@code null} if the underlying stream yields {@code null}.
     * </p>
     *
     * @throws IllegalStateException if the Block is mutable
     */
    @Override
    public Iterator<Revision<L, K, V>> iterator() {
        Preconditions.checkState(!mutable, "Cannot iterate a mutable block");
        return new Iterator<Revision<L, K, V>>() {

            private final Iterator<ByteBuffer> it = ByteableCollections
                    .streamingIterator(file, GlobalState.BUFFER_PAGE_SIZE);

            @Override
            public boolean hasNext() {
                return it.hasNext();
            }

            @Override
            public Revision<L, K, V> next() {
                ByteBuffer next = it.next();
                if(next != null) {
                    return Byteables.read(next, xRevisionClass());
                }
                else {
                    return null;
                }
            }

            @Override
            public void remove() {
                throw new UnsupportedOperationException();
            }

        };
    }

    /**
     * Return {@code true} if this Block might contain revisions involving
     * {@code key} as {@code value} in {@code locator}. This method <em>may</em>
     * return a false positive, but never a false negative. If this method
     * returns {@code true}, the caller should seek for {@code key} in
     * {@code locator} and check if any of those revisions contain {@code value}
     * as a component.
     *
     * @param locator
     * @param key
     * @param value
     * @return {@code true} if it is possible that relevant revisions exists
     */
    public boolean mightContain(L locator, K key, V value) {
        final boolean locked = mutable;
        Locks.lockIfCondition(read, locked);
        try {
            return filter.mightContain(locator, key, value);
        }
        finally {
            Locks.unlockIfCondition(read, locked);
        }
    }

    /**
     * Seek revisions that contain {@code key} in {@code locator} and append
     * them to {@code record} if it is <em>likely</em> that those revisions
     * exist in this Block.
     *
     * @param locator
     * @param key
     * @param record
     */
    @GuardedBy("seek(Record, Byteable...)")
    public void seek(L locator, K key, Record<L, K, V> record) {
        seek(record, locator, key);
    }

    /**
     * Seek revisions that contain any key in {@code locator} and append them to
     * {@code record} if it is <em>likely</em> that those revisions exist in
     * this Block.
     *
     * @param locator
     * @param record
     */
    @GuardedBy("seek(Record, Byteable...)")
    public void seek(L locator, Record<L, K, V> record) {
        seek(record, locator);
    }

    /**
     * Return the running size of the revisions held by this Block.
     *
     * @return the size
     */
    @Override
    public int size() {
        final boolean locked = mutable;
        Locks.lockIfCondition(read, locked);
        try {
            return sizeImpl();
        }
        finally {
            Locks.unlockIfCondition(read, locked);
        }
    }

    /**
     * Flush the content to disk in a block file, sync the filter and index and
     * finally make the Block immutable.
     * <p>
     * If the Block is already immutable, or is empty, nothing is written and a
     * warning is logged (the empty warning is suppressed when
     * {@link #ignoreEmptySync} is set).
     * </p>
     */
    @Override
    public void sync() {
        write.lock();
        try {
            if(mutable && sizeImpl() > 0) {
                mutable = false;
                FileChannel channel = FileSystem.getFileChannel(file);
                try {
                    ByteBuffer bytes = getBytes();
                    // A single write() call is not guaranteed to drain the
                    // buffer, so keep writing until nothing remains.
                    while (bytes.hasRemaining()) {
                        channel.write(bytes);
                    }
                    channel.force(true);
                    filter.sync();
                    index.sync();
                }
                finally {
                    // Always release the channel, even if the write fails
                    FileSystem.closeFileChannel(channel);
                }
                revisions = null; // Set to NULL so that the Set is eligible for
                                  // GC while the Block stays in memory.
                filter.disableThreadSafety();
            }
            else if(!mutable) {
                Logger.warn("Cannot sync a block that is not mutable: {}", id);
            }
            else if(!ignoreEmptySync) {
                Logger.warn("Cannot sync a block that is empty: {}. "
                        + "Was there an unexpected server shutdown recently?",
                        id);
            }
        }
        catch (IOException e) {
            throw Throwables.propagate(e);
        }
        finally {
            write.unlock();
        }
    }

    @Override
    public String toString() {
        return getClass().getSimpleName() + " " + id;
    }

    /**
     * Record {@code revision} in the {@link #filter} at all three levels of
     * granularity: locator, locator/key and locator/key/value.
     * <p>
     * NOTE: The entire revision is added to the filter so that we can quickly
     * verify that a revision DOES NOT exist using
     * {@link #mightContain(Byteable, Byteable, Byteable)} without seeking.
     * </p>
     *
     * @param revision the revision to record
     */
    private void addToFilter(Revision<L, K, V> revision) {
        filter.put(revision.getLocator());
        filter.put(revision.getLocator(), revision.getKey());
        filter.put(revision.getLocator(), revision.getKey(),
                revision.getValue());
    }

    /**
     * Attempt to repair the Block from the symptoms of the specified exception.
     * Generally speaking, a repair is only possible if the exception pertains
     * to the metadata (e.g. filter or index) and not the actual block data.
     * <p>
     * Currently the only supported repair is rebuilding the bloom filter from
     * the block file when the cause of {@code e} is an {@link EOFException} or
     * a {@link StreamCorruptedException}.
     * </p>
     * <p>
     * If a repair is not possible, then the input exception is re-thrown
     * </p>
     *
     * @param e - the {@link RuntimeException} that was caught indicates what
     *            error needs to be repaired.
     */
    private void repair(RuntimeException e) {
        Throwable cause = e.getCause();
        if(cause instanceof EOFException
                || cause instanceof StreamCorruptedException) {
            String target = file.replace(BLOCK_NAME_EXTENSION,
                    FILTER_NAME_EXTENSION);
            String backup = target + ".bak";
            // Keep a copy of the corrupted filter until the rebuild succeeds
            FileSystem.copyBytes(target, backup);
            FileSystem.deleteFile(target);
            filter = BloomFilter.create(target, EXPECTED_INSERTIONS);
            MappedByteBuffer bytes = FileSystem.map(file, MapMode.READ_ONLY, 0,
                    FileSystem.getFileSize(file));
            try {
                Iterator<ByteBuffer> it = ByteableCollections.iterator(bytes);
                while (it.hasNext()) {
                    Revision<L, K, V> revision = Byteables.read(it.next(),
                            xRevisionClass());
                    addToFilter(revision);
                }
                filter.sync();
                FileSystem.deleteFile(backup);
                Logger.warn("Found and repaired a corrupted bloom "
                        + "filter for {} {}", this.getClass().getSimpleName(),
                        id);
            }
            finally {
                // Release the mapping even if the rebuild fails part way
                FileSystem.unmap(bytes);
            }
        }
        else {
            throw e;
        }
    }

    /**
     * Seek revisions that contain components from {@code byteables} and append
     * them to {@code record}. The seek will be performed in memory iff the
     * {@link #softRevisions} are still available, otherwise, the seek happens
     * on disk using the {@link #index}.
     *
     * @param record the record to which matching revisions are appended
     * @param byteables the locator, optionally followed by the key
     */
    private void seek(Record<L, K, V> record, Byteable... byteables) {
        final boolean locked = mutable;
        Locks.lockIfCondition(read, locked);
        try {
            if(filter.mightContain(byteables)) {
                SortedMultiset<Revision<L, K, V>> revisions = softRevisions
                        .get();
                if(revisions != null) {
                    Iterator<Revision<L, K, V>> it = revisions.iterator();
                    // Since the revisions are sorted, I can toggle this flag
                    // on once I reach a revision that I care about so that I
                    // can break out of the loop once I reach a revision I
                    // don't care about again.
                    boolean processing = false;
                    boolean checkSecond = byteables.length > 1;
                    while (it.hasNext()) {
                        Revision<L, K, V> revision = it.next();
                        if(revision.getLocator().equals(byteables[0])
                                && (!checkSecond || revision.getKey().equals(
                                        byteables[1]))) {
                            processing = true;
                            record.append(revision);
                        }
                        else if(processing) {
                            break;
                        }
                    }
                }
                else {
                    int start = index.getStart(byteables);
                    int length = index.getEnd(byteables) - (start - 1);
                    if(start != BlockIndex.NO_ENTRY && length > 0) {
                        ByteBuffer bytes = FileSystem.map(file,
                                MapMode.READ_ONLY, start, length);
                        Iterator<ByteBuffer> it = ByteableCollections
                                .iterator(bytes);
                        while (it.hasNext()) {
                            Revision<L, K, V> revision = Byteables.read(
                                    it.next(), xRevisionClass());
                            Logger.debug("Attempting to append {} from {} to "
                                    + "{}", revision, this, record);
                            record.append(revision);
                        }
                    }
                }
            }
        }
        finally {
            Locks.unlockIfCondition(read, locked);
        }
    }

    /**
     * Internal implementation to return size of this Block without grabbing any
     * locks.
     *
     * @return the size
     */
    private int sizeImpl() {
        return concurrent ? atomicSize.get() : size;
    }

    /**
     * Return the backing store to hold revisions that are placed in this Block.
     * This is only relevant to use when the Block is {@link #mutable} and not
     * yet persisted to disk.
     * <p>
     * If this Block is to be {@link #concurrent} then override this method and
     * return a Concurrent Multiset.
     * </p>
     *
     * @param comparator
     * @return the backing store
     */
    @SuppressWarnings("rawtypes")
    protected SortedMultiset<Revision<L, K, V>> createBackingStore(
            Comparator<Revision> comparator) {
        return TreeMultiset.create(comparator);
    }

    /**
     * Return a dump of the revisions in the block as a String. This method
     * primarily exists for debugging using the {@link ManageDataCli} tool.
     * <p>
     * NOTE: This method will map an entire immutable block into memory, so
     * please use with caution.
     * </p>
     *
     * @return a string dump
     */
    protected String dump() {
        final boolean locked = mutable;
        Locks.lockIfCondition(read, locked);
        try {
            StringBuilder sb = new StringBuilder();
            sb.append("Dump for " + getClass().getSimpleName() + " " + id);
            sb.append("\n");
            sb.append("------");
            sb.append("\n");
            if(mutable) {
                for (Revision<L, K, V> revision : revisions) {
                    sb.append(revision);
                    sb.append("\n");
                }
            }
            else {
                ByteBuffer bytes = FileSystem.map(file, MapMode.READ_ONLY, 0,
                        FileSystem.getFileSize(file));
                Iterator<ByteBuffer> it = ByteableCollections.iterator(bytes);
                while (it.hasNext()) {
                    Revision<L, K, V> revision = Byteables.read(it.next(),
                            xRevisionClass());
                    sb.append(revision);
                    sb.append("\n");
                }
            }
            sb.append("\n");
            return sb.toString();
        }
        finally {
            Locks.unlockIfCondition(read, locked);
        }
    }

    /**
     * Insert a revision for {@code key} as {@code value} in {@code locator} at
     * {@code version} into this Block <strong>without grabbing any
     * locks</strong>. The size is tracked with {@link #atomicSize} instead of
     * {@link #size}, so this method is meant for Blocks that are
     * {@link #concurrent}.
     *
     * @param locator
     * @param key
     * @param value
     * @param version
     * @param type
     * @return the inserted Revision
     * @throws IllegalStateException if the Block is not mutable
     */
    protected Revision<L, K, V> insertUnsafe(L locator, K key, V value,
            long version, Action type) throws IllegalStateException {
        Preconditions.checkState(mutable,
                "Cannot modify a block that is not mutable");
        Revision<L, K, V> revision = makeRevision(locator, key, value, version,
                type);
        revisions.add(revision);
        addToFilter(revision);
        atomicSize.addAndGet(revision.size() + 4); // +4 for the size prefix
        return revision;
    }

    /**
     * Return a {@link Revision} for {@code key} as {@code value} in
     * {@code locator} at {@code version}.
     *
     * @param locator
     * @param key
     * @param value
     * @param version
     * @param type
     * @return the Revision
     */
    protected abstract Revision<L, K, V> makeRevision(L locator, K key,
            V value, long version, Action type);

    /**
     * Return the class of the {@code revision} type.
     *
     * @return the revision class
     */
    protected abstract Class<? extends Revision<L, K, V>> xRevisionClass();

    /**
     * A Comparator that sorts Revisions in a block. The sort order is
     * {@code locator} followed by {@code key} followed by {@code version},
     * with {@code value} used as the final tie-breaker.
     *
     * @author Jeff Nelson
     */
    @SuppressWarnings("rawtypes")
    private enum Sorter implements Comparator<Revision> {
        INSTANCE;

        /**
         * Sorts by locator followed by key followed by version followed by
         * value.
         */
        @Override
        public int compare(Revision o1, Revision o2) {
            return ComparisonChain.start()
                    .compare(o1.getLocator(), o2.getLocator())
                    .compare(o1.getKey(), o2.getKey())
                    .compare(o1.getVersion(), o2.getVersion())
                    .compare(o1.getValue(), o2.getValue()).result();
        }

    }

}