/*
* Copyright (c) 2013-2017 Cinchapi Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cinchapi.concourse.server.storage.temp;
import java.io.File;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel.MapMode;
import java.util.AbstractList;
import java.util.ConcurrentModificationException;
import java.util.Iterator;
import java.util.List;
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.locks.ReentrantLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.concurrent.locks.StampedLock;
import javax.annotation.Nullable;
import javax.annotation.concurrent.GuardedBy;
import javax.annotation.concurrent.ThreadSafe;
import com.cinchapi.common.base.TernaryTruth;
import com.cinchapi.concourse.Tag;
import com.cinchapi.concourse.annotate.Restricted;
import com.cinchapi.concourse.server.GlobalState;
import com.cinchapi.concourse.server.concurrent.ConcourseExecutors;
import com.cinchapi.concourse.server.concurrent.Locks;
import com.cinchapi.concourse.server.concurrent.PriorityReadWriteLock;
import com.cinchapi.concourse.server.io.ByteableCollections;
import com.cinchapi.concourse.server.io.Byteables;
import com.cinchapi.concourse.server.io.FileSystem;
import com.cinchapi.concourse.server.model.PrimaryKey;
import com.cinchapi.concourse.server.model.Text;
import com.cinchapi.concourse.server.model.Value;
import com.cinchapi.concourse.server.plugin.data.WriteEvent;
import com.cinchapi.concourse.server.storage.Action;
import com.cinchapi.concourse.server.storage.Engine;
import com.cinchapi.concourse.server.storage.Inventory;
import com.cinchapi.concourse.server.storage.InventoryTracker;
import com.cinchapi.concourse.server.storage.PermanentStore;
import com.cinchapi.concourse.server.storage.cache.BloomFilter;
import com.cinchapi.concourse.server.storage.db.Database;
import com.cinchapi.concourse.thrift.Operator;
import com.cinchapi.concourse.thrift.TObject;
import com.cinchapi.concourse.thrift.Type;
import com.cinchapi.concourse.time.Time;
import com.cinchapi.concourse.util.Convert;
import com.cinchapi.concourse.util.Integers;
import com.cinchapi.concourse.util.Logger;
import com.cinchapi.concourse.util.MultimapViews;
import com.cinchapi.concourse.util.NaturalSorter;
import com.cinchapi.concourse.util.ReadOnlyIterator;
import com.cinchapi.concourse.util.TMaps;
import com.cinchapi.concourse.util.ThreadFactories;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import static com.cinchapi.concourse.server.GlobalState.BINARY_QUEUE;
import static com.cinchapi.concourse.server.GlobalState.BUFFER_DIRECTORY;
import static com.cinchapi.concourse.server.GlobalState.BUFFER_PAGE_SIZE;
import static com.google.common.collect.Maps.newLinkedHashMap;
/**
* A {@code Buffer} is a special implementation of {@link Limbo} that aims to
* quickly accumulate writes in memory before performing a batch flush into some
* {@link PermanentStore}.
* <p>
* A Buffer enforces the durability guarantee because all writes are also
* immediately flushed to disk. Even though there is some disk I/O, the overhead
* is minimal and writes are fast because the entire backing store is memory
* mapped and the writes are always appended.
* </p>
*
* @author Jeff Nelson
*/
@ThreadSafe
public final class Buffer extends Limbo implements InventoryTracker {
/**
 * Create an {@link Iterator} that streams the writes stored under
 * {@code location} directly from disk, so that the entire Buffer never has
 * to be loaded into memory. It is assumed that {@code location} is a valid
 * buffer store.
 *
 * @param location the directory that holds the Buffer's files
 * @return the iterator
 */
public static Iterator<Write> onDiskIterator(String location) {
    Iterator<Write> diskIterator = new OnDiskIterator(location);
    return diskIterator;
}
// NOTE: The Buffer does not ever lock itself because it delegates
// concurrency control to each individual page. Furthermore, since each
// Page is append-only, there is no need to ever lock any Page that is not
// equal to #currentPage. The Buffer does grab the transport readLock for
// most methods so that we don't end up in situations where a transport
// happens while we're trying to read.
/**
* The assumed average number of bytes used to store an arbitrary Write. Each
* Page divides its capacity by this value when estimating the upper bound on
* the number of writes it can hold.
*/
private static final int AVG_WRITE_SIZE = 30; /* arbitrary */
/**
* A global {@link ExecutorService} to asynchronously record all the
* {@link WriteEvent write events} that are handled by any Buffer instance.
* The pool is cached and uses daemon threads named with the
* "buffer-global" prefix.
*/
private final static ExecutorService GLOBAL_EXECUTOR = Executors
.newCachedThreadPool(
ThreadFactories.namingDaemonThreadFactory("buffer-global"));
/**
* Don't let the transport rate exceed this value. This is the ceiling for
* {@link #transportRate} as it grows after each transport cycle.
*/
private static int MAX_TRANSPORT_RATE = 8192;
/**
* The maximum number of milliseconds to sleep between transport cycles.
* This is also the initial (and reset) value of
* {@link #transportThreadSleepTimeInMs}.
*/
private static final int MAX_TRANSPORT_THREAD_SLEEP_TIME_IN_MS = 100;
/**
* The minimum number of milliseconds to sleep between transport cycles.
* The sleep time is decremented after each transport but never drops below
* this floor.
*/
private static final int MIN_TRANSPORT_THREAD_SLEEP_TIME_IN_MS = 5;
/**
* The number of slots to put in each Page's bloom filter. We want this
* small enough to have few hash functions, but large enough so that the
* bloom filter does not become saturated. It is derived from the configured
* page size (one tenth of {@link GlobalState#BUFFER_PAGE_SIZE}).
*/
private static int PER_PAGE_BLOOM_FILTER_CAPACITY = GlobalState.BUFFER_PAGE_SIZE
/ 10;
/**
* The multiplier that is used when increasing the rate of transport after
* each transport cycle.
*/
protected int transportRateMultiplier = 2; // visible for testing
/**
* A pointer to the current Page, which is the only Page that accepts
* appended writes.
*/
private Page currentPage;
/**
* The directory where the Buffer pages are stored.
*/
private final String directory;
/**
* The environment that is associated with {@link Engine}. It is assigned
* via {@link #setEnvironment(String)}.
*/
private String environment;
/**
* A pointer to the inventory that is used within the Engine. The
* constructor creates a default instance, which may later be replaced via
* {@link #setInventory(Inventory)}.
*/
private Inventory inventory = null;
/**
* A runnable that flushes the inventory to disk. It is one of the tasks
* executed by {@link #sync()}.
*/
private Runnable inventorySync = new Runnable() {
@Override
public void run() {
inventory.sync();
}
};
/**
* The number of verifies initiated. Incremented on every call to
* {@link #verify(Write, long, boolean)}.
*/
private AtomicLong numVerifyRequests;
/**
* The number of verifies scanning the buffer.
*/
private AtomicLong numVerifyScans;
/**
* The sequence of Pages that make up the Buffer.
* <p>
* This List implementation provides an iterator that has "reloading"
* functionality such that we aren't halted by a
* {@link ConcurrentModificationException} that occurs when one thread adds
* a page to the underlying collection while another thread is using the
* iterator.
* </p>
*/
private final List<Page> pages = new AbstractList<Page>() {
/**
* The wrapped list that actually stores the data.
*/
private final List<Page> delegate = Lists.newArrayList();
@Override
public void add(int index, Page element) {
delegate.add(index, element);
}
@Override
public Page get(int index) {
return delegate.get(index);
}
@Override
public Iterator<Page> iterator() {
return new Iterator<Page>() {
// The position of the element most recently requested from #it. It
// is the point from which iteration resumes if #it must be rebuilt.
int index = 0;
// The live iterator over the delegate; replaced with a fresh one
// whenever a concurrent modification is detected.
ListIterator<Page> it = delegate.listIterator(index);
@Override
public boolean hasNext() {
return it.hasNext();
}
@Override
public Page next() {
try {
// Remember where we are BEFORE advancing so that a failed
// call to it.next() can be retried from the same position.
index = it.nextIndex();
return it.next();
}
catch (ConcurrentModificationException e) {
// CON-75: The exception is thrown because a new page
// was added by another thread while the current thread
// (which owns the iterator) was in the middle of the
// read. We can ignore this exception, because adding a
// new page will not lead to inconsistent results since
// all the data we've read so far is still valid. This
// just means we have more work to do before finishing
// than we originally anticipated.
//
// It is worth noting that each read method grabs the
// transportLock which prevents the case of a page being
// removed in the middle of a read.
it = delegate.listIterator(index);
return next();
}
}
@Override
public void remove() {
it.remove();
}
};
}
@Override
public Page remove(int index) {
return delegate.remove(index);
}
@Override
public int size() {
return delegate.size();
}
};
/**
* A runnable instance that flushes the content of the current buffer page
* to disk by forcing its memory mapped content. It is one of the tasks
* executed by {@link #sync()}.
*/
private Runnable pageSync = new Runnable() {
@Override
public void run() {
currentPage.content.force();
}
};
/**
* A flag to indicate if the Buffer is running or not. It is set by
* {@link #start()} and cleared by {@link #stop()}.
*/
private boolean running = false;
/**
* The structure lock ensures that only a single thread can modify the
* structure of the Buffer, without affecting any readers. It is held while
* inserting writes and while adding or removing pages.
*/
private final ReentrantLock structure = new ReentrantLock();
/**
* The prefix for the threads that are responsible for flushing data to
* disk. This is normally set by the Engine using the
* {@link #setThreadNamePrefix(String)} method. The constructor assigns a
* default that is based on the identity hash code of this Buffer.
*/
private String threadNamePrefix;
/**
* We keep track of the time when the last transport occurred so that the
* Engine can determine if it should avoid busy waiting in the
* BufferTransportThread.
*/
private AtomicLong timeOfLastTransport = new AtomicLong(Time.now());
/**
* A monitor that is used to make a thread block while waiting for the
* Buffer to become transportable. The {@link #waitUntilTransportable()}
* method waits on this monitor and the {@link #insert(Write, boolean)}
* method notifies a thread waiting on this monitor when the first write is
* about to be appended to the second page in the Buffer. The
* {@link #stop()} method notifies all waiting threads so that they can
* terminate.
*/
private final Object transportable = new Object();
/**
* The number of items to transport to the Database per attempt. There is a
* tension between transporting and reading data (e.g. reads cannot happen
* while a transport occurs and vice versa). Transports are most efficient
* when they can batch up the amount of work per cycle, but that would mean
* reads are blocked longer. So this variable indicates how many items
* should be transported in a single cycle. Each time a transport happens,
* this value will increase, but it will be decreased whenever a read
* occurs. This allows us to be more aggressive with transports when there
* are no reads happening, and also allows us to scale back transports when
* reads do occur.
*/
private int transportRate = 1;
/**
* The number of milliseconds to sleep between transport cycles. It starts
* at the maximum and shrinks by one after each transport cycle until it
* reaches the minimum.
*/
private int transportThreadSleepTimeInMs = MAX_TRANSPORT_THREAD_SLEEP_TIME_IN_MS;
/**
 * Construct a Buffer whose pages live in the default location, which is
 * {@link GlobalState#BUFFER_DIRECTORY}.
 */
public Buffer() {
    this(GlobalState.BUFFER_DIRECTORY);
}
/**
*
* Construct a a Buffer that is backed by {@code backingStore}. Existing
* content, if available, will be loaded from the file. Otherwise, a new and
* empty Buffer will be returned.
*
* @param directory - the path to directory where the buffer files should be
* stored. If the directory does not exist, it'll be created
* automatically
*/
public Buffer(String directory) {
FileSystem.mkdirs(directory);
this.directory = directory;
this.inventory = Inventory.create(directory + File.separator + "meta"
+ File.separator + "inventory"); // just incase we are running
// from a unit test and
// there is no call to
// #setInventory
this.threadNamePrefix = "buffer-" + System.identityHashCode(this);
this.numVerifyRequests = new AtomicLong(0);
this.numVerifyScans = new AtomicLong(0);
}
@Override
public Map<Long, String> audit(long record) {
    // Map each write's version to its string form, sorted by version
    Map<Long, String> revisions = Maps.newTreeMap();
    Iterator<Write> writes = iterator(record, Time.NONE);
    while (writes.hasNext()) {
        Write current = writes.next();
        revisions.put(current.getVersion(), current.toString());
    }
    return revisions;
}
@Override
public Map<Long, String> audit(String key, long record) {
    // Map each write's version to its string form, sorted by version, but
    // only consider writes for the given key in the given record
    Map<Long, String> revisions = Maps.newTreeMap();
    Iterator<Write> writes = iterator(key, record, Time.NONE);
    while (writes.hasNext()) {
        Write current = writes.next();
        revisions.put(current.getVersion(), current.toString());
    }
    return revisions;
}
@Override
public Map<TObject, Set<Long>> browse(String key, long timestamp,
        Map<TObject, Set<Long>> context) {
    // Replay every write for the key on top of the prior context: an ADD
    // associates the record with the value and anything else dissociates it
    Iterator<Write> writes = iterator(key, timestamp);
    while (writes.hasNext()) {
        Write current = writes.next();
        TObject value = current.getValue().getTObject();
        Set<Long> records = context.get(value);
        if(records == null) {
            records = Sets.newLinkedHashSet();
            context.put(value, records);
        }
        long record = current.getRecord().longValue();
        if(current.getType() == Action.ADD) {
            records.add(record);
        }
        else {
            records.remove(record);
        }
    }
    // Values that no longer map to any record are dropped from the result
    return Maps.newTreeMap((SortedMap<TObject, Set<Long>>) Maps
            .filterValues(context, emptySetFilter));
}
@Override
public Map<Long, Set<TObject>> chronologize(String key, long record,
        long start, long end, Map<Long, Set<TObject>> context) {
    // Start from the most recent snapshot in the context (if any)
    Set<TObject> latest = Iterables.getLast(context.values(),
            Sets.<TObject> newLinkedHashSet());
    if(latest.isEmpty() && !context.isEmpty()) {
        // CON-474: Empty set is placed in the context if it was the last
        // snapshot known to the database
        context.remove(Time.NONE);
    }
    Iterator<Write> writes = iterator(key, record, end - 1);
    while (writes.hasNext()) {
        Write current = writes.next();
        long version = current.getVersion();
        Text currentKey = current.getKey();
        long currentRecord = current.getRecord().longValue();
        Action type = current.getType();
        if(!currentKey.toString().equals(key) || currentRecord != record) {
            continue;
        }
        // Each relevant write produces a new snapshot that is derived from
        // a copy of the previous one
        latest = Sets.newLinkedHashSet(latest);
        Value written = current.getValue();
        if(type == Action.ADD) {
            latest.add(written.getTObject());
        }
        else if(type == Action.REMOVE) {
            latest.remove(written.getTObject());
        }
        boolean recordable = version >= start && !latest.isEmpty();
        if(recordable) {
            context.put(version, latest);
        }
    }
    return context;
}
@Override
public boolean contains(long record) {
    // Membership is answered by the inventory
    boolean known = inventory.contains(record);
    return known;
}
@Override
public Set<String> describe(long record, long timestamp,
        Map<String, Set<TObject>> context) {
    // Replay every write for the record on top of the prior context: an
    // ADD attaches the value to its key and anything else detaches it
    Iterator<Write> writes = iterator(record, timestamp);
    while (writes.hasNext()) {
        Write current = writes.next();
        String key = current.getKey().toString();
        Set<TObject> values = context.get(key);
        if(values == null) {
            values = Sets.newHashSet();
            context.put(key, values);
        }
        boolean adding = current.getType() == Action.ADD;
        TObject value = current.getValue().getTObject();
        if(adding) {
            values.add(value);
        }
        else {
            values.remove(value);
        }
    }
    // Only keys that still have at least one value are described
    return newLinkedHashMap(Maps.filterValues(context, emptySetFilter))
            .keySet();
}
/**
 * Concatenate the dump of every page in the Buffer, one after the other,
 * with a newline following each page's dump.
 *
 * @return the dump string
 */
public String dump() {
    StringBuilder dumps = new StringBuilder();
    for (Page page : pages) {
        dumps.append(page.dump()).append("\n");
    }
    return dumps.toString();
}
@Override
public Map<Long, Set<TObject>> explore(Map<Long, Set<TObject>> context,
        long timestamp, String key, Operator operator, TObject... values) {
    Iterator<Write> writes = iterator(key, timestamp);
    while (writes.hasNext()) {
        Write current = writes.next();
        long record = current.getRecord().longValue();
        if(!matches(current.getValue(), operator, values)) {
            // The written value does not satisfy the criteria
            continue;
        }
        TObject value = current.getValue().getTObject();
        if(current.getType() == Action.ADD) {
            MultimapViews.put(context, record, value);
        }
        else {
            MultimapViews.remove(context, record, value);
        }
    }
    return TMaps.asSortedMap(context);
}
@Override
public Set<Long> getAllRecords() {
    // The inventory tracks every record
    Set<Long> records = inventory.getAll();
    return records;
}
/**
 * Return the directory in which the Buffer keeps its files.
 *
 * @return {@link GlobalState#BUFFER_DIRECTORY} or the directory that was
 *         passed to the {@link #Buffer(String)} constructor
 */
@Restricted
public String getBackingStore() {
    return this.directory;
}
@Override
public int getDesiredTransportSleepTimeInMs() {
    // The value shrinks after each transport cycle and is reset whenever
    // the transport rate is scaled back
    return this.transportThreadSleepTimeInMs;
}
@Override
public Inventory getInventory() {
    // Either the default inventory created in the constructor or the one
    // supplied via #setInventory
    return this.inventory;
}
/**
 * Return the timestamp that was recorded when data was most recently
 * transported from the Buffer (or when the Buffer was constructed if no
 * transport has happened yet).
 *
 * @return the time of last transport
 */
@Restricted
public long getTimeOfLastTransport() {
    long lastTransport = timeOfLastTransport.get();
    return lastTransport;
}
@Override
public boolean insert(Write write, boolean sync) {
    structure.lock();
    try {
        // Keep trying until the write fits: when the current page is out
        // of capacity, a new page is added and the append is retried there
        while (true) {
            try {
                // A transport becomes possible once the second page
                // receives its first write, so remember whether that is
                // about to happen
                boolean becomesTransportable = pages.size() == 2
                        && currentPage.size == 0;
                currentPage.append(write, sync);
                if(becomesTransportable) {
                    synchronized (transportable) {
                        transportable.notify();
                    }
                }
                return true;
            }
            catch (CapacityException e) {
                addPage();
            }
        }
    }
    finally {
        structure.unlock();
    }
}
@Override
public Iterator<Write> iterator() {
    // No timestamp restriction is supplied
    Iterator<Write> all = new AllSeekingIterator(Time.NONE);
    return all;
}
/**
 * Return an iterator that traverses every write in the buffer that
 * occurred no later than {@code timestamp}.
 *
 * @param timestamp the latest version to include
 * @return the iterator
 */
public Iterator<Write> iterator(long timestamp) {
    Iterator<Write> all = new AllSeekingIterator(timestamp);
    return all;
}
/**
 * Return an iterator that traverses the writes in the buffer that have the
 * specified {@code record} component and that occurred no later than
 * {@code timestamp}.
 *
 * @param record the record to seek
 * @param timestamp the latest version to include
 * @return the iterator
 */
public Iterator<Write> iterator(long record, long timestamp) {
    Iterator<Write> byRecord = new RecordSeekingIterator(record, timestamp);
    return byRecord;
}
/**
 * Return an iterator that traverses the writes in the buffer that have the
 * specified {@code key} component and that occurred no later than
 * {@code timestamp}.
 *
 * @param key the key to seek
 * @param timestamp the latest version to include
 * @return the iterator
 */
public Iterator<Write> iterator(String key, long timestamp) {
    Iterator<Write> byKey = new KeySeekingIterator(key, timestamp);
    return byKey;
}
/**
 * Return an iterator that traverses the writes in the buffer that have both
 * the specified {@code key} and {@code record} components and that occurred
 * no later than {@code timestamp}.
 *
 * @param key the key to seek
 * @param record the record to seek
 * @param timestamp the latest version to include
 * @return the iterator
 */
public Iterator<Write> iterator(String key, long record, long timestamp) {
    Iterator<Write> byKeyInRecord = new KeyInRecordSeekingIterator(key,
            record, timestamp);
    return byKeyInRecord;
}
/**
 * Return an iterator that traverses the writes in the buffer that equal the
 * input {@code write} and that occurred no later than {@code timestamp}.
 *
 * @param write the write to seek
 * @param timestamp the latest version to include
 * @return the iterator
 */
public Iterator<Write> iterator(Write write, long timestamp) {
    Iterator<Write> byWrite = new WriteSeekingIterator(write, timestamp);
    return byWrite;
}
@Override
public Map<String, Set<TObject>> select(long record, long timestamp,
        Map<String, Set<TObject>> context) {
    // Replay every write for the record on top of the prior context: an
    // ADD attaches the value to its key and anything else detaches it
    Iterator<Write> writes = iterator(record, timestamp);
    while (writes.hasNext()) {
        Write current = writes.next();
        String key = current.getKey().toString();
        Set<TObject> values = context.get(key);
        if(values == null) {
            values = Sets.newLinkedHashSet();
            context.put(key, values);
        }
        boolean adding = current.getType() == Action.ADD;
        TObject value = current.getValue().getTObject();
        if(adding) {
            values.add(value);
        }
        else {
            values.remove(value);
        }
    }
    // Keys that no longer have any values are dropped from the result
    return Maps.newTreeMap((SortedMap<String, Set<TObject>>) Maps
            .filterValues(context, emptySetFilter));
}
@Override
public Set<TObject> select(String key, long record, long timestamp,
        Set<TObject> context) {
    // Replay the writes for the key in the record on top of the prior
    // context: an ADD includes the value and anything else excludes it
    Iterator<Write> writes = iterator(key, record, timestamp);
    while (writes.hasNext()) {
        Write current = writes.next();
        boolean adding = current.getType() == Action.ADD;
        TObject value = current.getValue().getTObject();
        if(adding) {
            context.add(value);
        }
        else {
            context.remove(value);
        }
    }
    return context;
}
/**
 * Called by the parent {@link Engine} to set the environment with which
 * the Buffer is associated.
 *
 * @param environment the name of the environment
 */
public void setEnvironment(String environment) {
    this.environment = environment;
}
/**
 * <p>
 * <strong>DO NOT CALL!!!</strong>
 * </p>
 * <p>
 * Called by the parent {@link Engine} to replace the inventory that the
 * Buffer uses to keep track of records.
 * </p>
 *
 * @param inventory the inventory to use from now on
 */
@Restricted
public void setInventory(Inventory inventory) {
    this.inventory = inventory;
}
/**
 * <p>
 * <strong>DO NOT CALL!!!</strong>
 * </p>
 * <p>
 * Called by the parent {@link Engine} to set the prefix for the names of
 * the threads that the Buffer uses when it syncs data to disk.
 * </p>
 *
 * @param threadNamePrefix the prefix to use from now on
 */
@Restricted
public void setThreadNamePrefix(String threadNamePrefix) {
    this.threadNamePrefix = threadNamePrefix;
}
@Override
public void start() {
    if(running) {
        // Already started, so there is nothing to do
        return;
    }
    running = true;
    Logger.info("Buffer configured to store data in {}", directory);
    // Load a Page for every regular file in the directory and use the
    // natural sort order of the files to determine the page sequence
    SortedMap<File, Page> sorted = Maps.newTreeMap(NaturalSorter.INSTANCE);
    File[] candidates = new File(directory).listFiles();
    for (File candidate : candidates) {
        if(candidate.isDirectory()) {
            continue;
        }
        Page loaded = new Page(candidate.getAbsolutePath());
        sorted.put(candidate, loaded);
        Logger.info("Loading Buffer content from {}...", loaded);
    }
    pages.addAll(sorted.values());
    if(!pages.isEmpty()) {
        // The last page in the sequence becomes the current one
        currentPage = pages.get(pages.size() - 1);
    }
    else {
        // Nothing exists on disk, so begin with a fresh page and do not
        // bother syncing
        addPage(false);
    }
}
@Override
public void stop() {
    if(!running) {
        return;
    }
    running = false;
    // Wake up every thread that is blocked waiting for the Buffer to
    // become transportable so that it is allowed to terminate
    synchronized (transportable) {
        transportable.notifyAll();
    }
}
@Override
public void sync() {
    // Flush the current page and the inventory, and wait for both tasks
    // to terminate before returning
    ConcourseExecutors.executeAndAwaitTermination(this.threadNamePrefix,
            this.pageSync, this.inventorySync);
}
/**
 * {@inheritDoc}
 * <p>
 * This method attempts to transport up to {@link #transportRate} writes
 * from the oldest page in the buffer, in chronological order. Nothing is
 * transported if the Buffer only has one page or if the oldest page's
 * transport write lock cannot be acquired immediately (or is already held
 * by the calling thread). When the oldest page has no more writes to
 * transport, a sync is triggered in the destination and the page is
 * removed.
 * </p>
 * <p>
 * After each cycle, the transport rate is multiplied by
 * {@link #transportRateMultiplier} (never exceeding
 * {@link #MAX_TRANSPORT_RATE}) and the desired sleep time between cycles
 * is decremented (never dropping below
 * {@link #MIN_TRANSPORT_THREAD_SLEEP_TIME_IN_MS}).
 * </p>
 *
 * @param destination the store that accepts the transported writes; it
 *            must be a {@link Database} because it is cast to one when
 *            a sync needs to be triggered
 * @param sync ignored
 */
@Override
public void transport(PermanentStore destination, boolean sync) {
    // NOTE: The #sync parameter is ignored because the Database does not
    // support allowing the Buffer to control when syncs happen.
    if(pages.size() > 1) {
        Page page = pages.get(0);
        if(!page.transportLock.writeLock().isHeldByCurrentThread()
                && page.transportLock.writeLock().tryLock()) {
            try {
                for (int i = 0; i < transportRate; ++i) {
                    if(page.hasNext()) {
                        destination.accept(page.next());
                        page.remove();
                    }
                    else {
                        // The page is exhausted, so make the destination
                        // durable before the page is deleted
                        ((Database) destination).triggerSync();
                        removePage();
                        break;
                    }
                }
                timeOfLastTransport.set(Time.now());
                // Clamp the increased rate right away (using long math to
                // guard against overflow) so that it never exceeds the
                // maximum, even when the multiplier is not a power of two
                transportRate = (int) Math.min((long) MAX_TRANSPORT_RATE,
                        (long) transportRate * transportRateMultiplier);
                transportThreadSleepTimeInMs = Math.max(
                        MIN_TRANSPORT_THREAD_SLEEP_TIME_IN_MS,
                        transportThreadSleepTimeInMs - 1);
            }
            finally {
                page.transportLock.writeLock().unlock();
            }
        }
    }
}
@Override
public boolean verify(Write write, long timestamp, boolean exists) {
    numVerifyRequests.incrementAndGet();
    // Every occurrence of the write flips the existence state once
    boolean state = exists;
    Iterator<Write> occurrences = iterator(write, timestamp);
    while (occurrences.hasNext()) {
        occurrences.next();
        state = !state;
    }
    return state;
}
@Override
public TernaryTruth verifyFast(Write write, long timestamp) {
    // When the inventory does not contain the write's record, the answer
    // is definitively FALSE without consulting the parent implementation
    boolean known = inventory.contains(write.getRecord().longValue());
    return known ? super.verifyFast(write, timestamp) : TernaryTruth.FALSE;
}
/**
 * Block the calling thread, if the Buffer has at most one page, until it
 * is notified on the {@link #transportable} monitor (e.g. because a write
 * has been inserted into a second page or because the Buffer has been
 * stopped). If the Buffer already has more than one page, this method
 * returns immediately.
 * <p>
 * If the thread is interrupted while waiting, this method returns quietly.
 * </p>
 */
@Override
public void waitUntilTransportable() {
    // The number of pages must be checked while holding the monitor.
    // Otherwise, a notification that is sent after the check, but before
    // the call to wait(), would be lost and this thread would block even
    // though the Buffer is transportable.
    synchronized (transportable) {
        if(pages.size() <= 1) {
            try {
                transportable.wait();
            }
            catch (InterruptedException e) {/* ignore */}
        }
    }
}
/**
 * Return {@code true} if the Buffer has more than 1 page and the first page
 * has at least one element that can be transported. A return value of
 * {@code false} means that the first page is the only page or that the
 * Buffer would need to trigger a Database sync and remove the first page in
 * order to transport.
 *
 * @return {@code true} if the Buffer can transport a Write.
 */
@VisibleForTesting
protected boolean canTransport() { // visible for testing
    if(pages.size() <= 1) {
        return false;
    }
    return pages.get(0).hasNext();
}
@Nullable
@Override
protected Action getLastWriteAction(Write write, long timestamp) {
    // TODO: use ReverseSeekingIterator to optimize this
    // Walk every matching write and remember the type of the last one;
    // the result stays null when nothing matches
    Action last = null;
    for (Iterator<Write> matching = iterator(write, timestamp); matching
            .hasNext();) {
        last = matching.next().getType();
    }
    return last;
}
@Override
protected long getOldestWriteTimestamp() {
    // The answer is delegated to the first page in the sequence
    Page oldest = pages.get(0);
    return oldest.getOldestWriteTimestamp();
}
@Override
protected Iterator<Write> getSearchIterator(String key) {
    // Search considers the writes for the key without any timestamp
    // restriction
    Iterator<Write> byKey = iterator(key, Time.NONE);
    return byKey;
}
@Override
protected boolean isPossibleSearchMatch(String key, Write write,
        Value value) {
    // Only STRING values are candidates; the key and write are not
    // consulted
    Type type = value.getType();
    return type == Type.STRING;
}
/**
 * Add a new Page to the Buffer after syncing the current one to disk.
 */
private void addPage() {
    final boolean sync = true;
    addPage(sync);
}
/**
 * Append a brand new Page to the Buffer, make it the current page and
 * optionally perform a {@code sync} beforehand.
 *
 * @param sync - a flag that determines whether the {@link #sync()} method
 *            should be called to durably persist the current page to disk
 *            before the new page is created. This should only be false
 *            when called from the {@link #start()} method.
 */
private void addPage(boolean sync) {
    structure.lock();
    try {
        if(sync) {
            sync();
        }
        Page fresh = new Page(GlobalState.BUFFER_PAGE_SIZE);
        currentPage = fresh;
        pages.add(fresh);
        Logger.debug("Added page {} to Buffer", fresh);
    }
    finally {
        structure.unlock();
    }
}
/**
 * Determines the percentage within range [0, 1] of verifies that scan
 * the buffer.
 *
 * @return decimal percentage of verifies initiated that scanned the
 *         buffer, or {@code 0} if no verifies have been initiated yet
 */
@SuppressWarnings("unused")
private float getPercentVerifyScans() { // to be used for CON-236
    long requests = numVerifyRequests.get();
    if(requests == 0) {
        // Avoid dividing 0 by 0, which would yield NaN
        return 0f;
    }
    return ((float) numVerifyScans.get()) / requests;
}
/**
 * Detach the first page from the Buffer and delete it.
 */
private void removePage() {
    structure.lock();
    try {
        Page oldest = pages.remove(0);
        oldest.delete();
    }
    finally {
        structure.unlock();
    }
}
/**
 * Reset the transport throttle: only one item is transported in the next
 * cycle and the sleep time between cycles goes back to the maximum.
 */
private void scaleBackTransportRate() {
    this.transportThreadSleepTimeInMs = MAX_TRANSPORT_THREAD_SLEEP_TIME_IN_MS;
    this.transportRate = 1;
}
/**
 * A {@link SeekingIterator} that does not filter anything: every page and
 * every write is considered relevant.
 *
 * @author Jeff Nelson
 */
private class AllSeekingIterator extends SeekingIterator {

    /**
     * Construct a new instance.
     *
     * @param timestamp the timestamp that is handed to the parent
     */
    protected AllSeekingIterator(long timestamp) {
        super(timestamp);
        init();
    }

    @Override
    protected boolean pageMightContainRelevantWrites(Page page) {
        return true; // no page is ever skipped
    }

    @Override
    protected boolean isRelevantWrite(Write write) {
        return true; // no write is ever skipped
    }
}
/**
 * A {@link SeekingIterator} that only considers writes whose key and record
 * components both match the ones being sought.
 *
 * @author Jeff Nelson
 */
private class KeyInRecordSeekingIterator extends SeekingIterator {

    /**
     * The key being sought.
     */
    private final Text key;

    /**
     * The record being sought.
     */
    private final PrimaryKey record;

    /**
     * Construct a new instance.
     *
     * @param key the key being sought
     * @param record the record being sought
     * @param timestamp the timestamp that is handed to the parent
     */
    protected KeyInRecordSeekingIterator(String key, long record,
            long timestamp) {
        super(timestamp);
        this.key = Text.wrapCached(key);
        this.record = PrimaryKey.wrap(record);
        // NOTE: init() must run after the fields above are assigned
        init();
    }

    @Override
    protected boolean pageMightContainRelevantWrites(Page page) {
        return page.mightContain(this.key, this.record);
    }

    @Override
    protected boolean isRelevantWrite(Write write) {
        if(!write.getRecord().equals(this.record)) {
            return false;
        }
        return write.getKey().equals(this.key);
    }
}
/**
 * A {@link SeekingIterator} that only considers writes whose key component
 * matches the one being sought.
 *
 * @author Jeff Nelson
 */
private class KeySeekingIterator extends SeekingIterator {

    /**
     * The key being sought.
     */
    private final Text key;

    /**
     * Construct a new instance.
     *
     * @param key the key being sought
     * @param timestamp the timestamp that is handed to the parent
     */
    protected KeySeekingIterator(String key, long timestamp) {
        super(timestamp);
        this.key = Text.wrapCached(key);
        // NOTE: init() must run after the field above is assigned
        init();
    }

    @Override
    protected boolean pageMightContainRelevantWrites(Page page) {
        return page.mightContain(this.key);
    }

    @Override
    protected boolean isRelevantWrite(Write write) {
        Text written = write.getKey();
        return written.equals(this.key);
    }
}
/**
 * An {@link Iterator} that can traverse Writes directly from disk for a
 * Buffer that uses {@code location} as a store. Call
 * {@link Buffer#onDiskIterator(String)} to instantiate one of these. This
 * should only be used in cases where it is necessary (and safe) to iterate
 * through a Buffer's writes while the Buffer is offline.
 * <p>
 * Files are read one at a time, so only the content of a single file is
 * held by the iterator at any moment.
 * </p>
 *
 * @author Jeff Nelson
 */
private static class OnDiskIterator extends ReadOnlyIterator<Write> {

    /**
     * An {@link Iterator} over all the files in the input directory.
     */
    private final Iterator<String> fileIt;

    /**
     * An {@link Iterator} over the data chunks in the current file. This
     * is {@code null} if the input directory has no files at all.
     */
    private Iterator<ByteBuffer> it = null;

    /**
     * Construct a new instance.
     *
     * @param location the directory that contains the Buffer's files
     */
    private OnDiskIterator(String location) {
        this.fileIt = FileSystem.fileOnlyIterator(location);
        flip();
    }

    /**
     * Return {@code true} if there is another Write in the current file or
     * in any of the files that have not been read yet.
     */
    @Override
    public boolean hasNext() {
        // Move past exhausted (or empty) files iteratively instead of
        // recursively so that a long run of empty files cannot exhaust
        // the stack
        while (it != null && !it.hasNext() && fileIt.hasNext()) {
            flip();
        }
        return it != null && it.hasNext();
    }

    /**
     * Return the next Write.
     *
     * @throws java.util.NoSuchElementException if there are no more Writes
     */
    @Override
    public Write next() {
        if(!hasNext()) {
            // Honor the Iterator contract instead of returning null
            throw new java.util.NoSuchElementException(
                    "There are no more writes on disk");
        }
        return Byteables.readStatic(it.next(), Write.class);
    }

    /**
     * Flip to the next file, if one exists, by reading its bytes and
     * pointing {@link #it} at the chunks within.
     */
    private void flip() {
        if(fileIt.hasNext()) {
            ByteBuffer bytes = FileSystem.readBytes(fileIt.next());
            it = ByteableCollections.iterator(bytes);
        }
    }
}
/**
* A {@link Page} represents a granular section of the {@link Buffer}. Pages
* are an append-only iterator over a sequence of {@link Write} objects.
* Pages differ from other iterators because they do not advance in the
* sequence until the {@link #remove()} method is called.
*
* @author Jeff Nelson
*/
private class Page implements Iterator<Write>, Iterable<Write> {
// NOTE: This class does not define hashCode() and equals() because the
// defaults are the desired behaviour.
/**
* The filename extension that is appended to the name of each newly
* created Page file.
*/
private static final String ext = ".buf";
/**
* The local lock for read/write access on the page. This is only used
* when this page is equal to the {@link #currentPage}. In that case,
* this lock is grabbed before any access is allowed on the page, so
* subsequent structures that are used need not be thread safe.
*/
private transient StampedLock accessLock = new StampedLock();
/**
* The append-only buffer that contains the content of the backing file.
* Data is never deleted from the buffer, until the entire Page is
* removed. The buffer is a read/write memory mapping of
* {@link #filename}.
*/
private MappedByteBuffer content;
/**
* The file that contains the content of the Page.
*/
private final String filename;
/**
* Indicates the index in {@link #writes} that constitutes the first
* element. When writes are "removed", elements are not actually deleted
* from the list, so it is necessary to keep track of the head element
* so that the correct next() element can be returned.
*/
private transient int head = 0;
/**
* A bloom filter like cache that is used to help determine if it is
* possible that a key exists on the page. It has
* {@link #sizeUpperBound} slots.
*/
private final boolean[] keyCache;
/**
* A bloom filter like cache that is used to help determine if it is
* possible that a key/record exists on the page. It has
* {@link #sizeUpperBound} slots.
*/
private final boolean[] keyRecordCache;
/**
* A bloom filter like cache that is used to help determine if it is
* possible that a record exists on the page. It has
* {@link #sizeUpperBound} slots.
*/
private final boolean[] recordCache;
/**
* The total number of elements in the list of {@link #writes}.
*/
private transient int size = 0;
/**
* The upper bound on the number of writes that this page can hold. This
* is an estimate: the capacity divided by the assumed average write
* size, scaled by 1.2, and never less than 1.
*/
private final transient int sizeUpperBound;
/**
* The transportLock makes it possible to append new Writes and
* transport old writes concurrently while prohibiting reading the Page
* and transporting writes at the same time.
*/
private final transient ReentrantReadWriteLock transportLock = PriorityReadWriteLock
.prioritizeReads();
/**
* A bloom filter like cache that is used to help determine if it is
* possible that a Write exists on the page.
*/
private final BloomFilter writeCache;
/**
* The append-only list of {@link Write} objects on the Page. Elements
* are never deleted from this list, but are marked as "removed"
* depending on the location of the {@link #head} index.
*/
private final Write[] writes;
/**
 * Construct an empty Page with {@code capacity} bytes. The Page is backed
 * by a new file in the Buffer's directory whose name is the current
 * timestamp followed by the {@link #ext} extension.
 *
 * @param capacity the number of bytes to allot for the Page
 */
public Page(int capacity) {
    this(directory + File.separator + Time.now() + ext, capacity);
}
/**
 * Construct a Page that is backed by {@code filename}. Existing content,
 * if available, will be loaded from the file.
 * <p>
 * Please note that this constructor is designed to deserialize a retired
 * page, so the capacity of the returned Object is equal to the current
 * size of the file and there is no room to append additional {@link Write}
 * objects.
 * </p>
 *
 * @param filename the path to the file that backs the Page
 */
public Page(String filename) {
    this(filename, FileSystem.getFileSize(filename));
}
/**
 * Construct a new instance that maps {@code capacity} bytes of
 * {@code filename} into memory and indexes every Write that is already
 * stored there.
 *
 * @param filename the path to the file that backs the Page
 * @param capacity the number of bytes to map
 */
private Page(String filename, long capacity) {
    this.filename = filename;
    this.content = FileSystem.map(filename, MapMode.READ_WRITE, 0,
            capacity);
    // Estimate how many writes fit on the page (with some headroom) and
    // size the in-memory structures accordingly
    long expectedWrites = capacity / AVG_WRITE_SIZE;
    this.sizeUpperBound = Math.max(1, (int) (expectedWrites * 1.2));
    this.writes = new Write[sizeUpperBound];
    this.recordCache = new boolean[sizeUpperBound];
    this.keyCache = new boolean[sizeUpperBound];
    this.keyRecordCache = new boolean[sizeUpperBound];
    this.writeCache = BloomFilter.create(PER_PAGE_BLOOM_FILTER_CAPACITY);
    this.writeCache.disableThreadSafety();
    // Index whatever content already exists in the backing file
    for (Iterator<ByteBuffer> chunks = ByteableCollections
            .iterator(content); chunks.hasNext();) {
        Write existing = Write.fromByteBuffer(chunks.next());
        index(existing);
        Logger.debug("Found existing write '{}' in the Buffer", existing);
    }
}
/**
 * Append {@code write} to the Page if {@link #content} has enough
 * remaining capacity to store {@code write}. Since all inserts are routed
 * to this method, the write lock is held for the duration of the append
 * so that the currentPage is never changed in the middle of a read.
 * <p>
 * If the Page is still empty and {@code write} is too large to fit, the
 * content is remapped so that it is exactly large enough to hold the
 * single {@code write}.
 * </p>
 *
 * @param write the {@link Write} to append
 * @param sync a flag that determines if the page should be fsynced (or
 *            the equivalent) after appending {@code write} so that the
 *            changes are guaranteed to be durably persisted, this flag
 *            should almost always be {@code true} if calling this method
 *            directly. It is set to {@code false} when called from the
 *            context of an atomic operation transporting writes to this
 *            Buffer using GROUP SYNC
 * @throws CapacityException
 *             - if the size of {@code write} is greater than the
 *             remaining capacity of {@link #content}
 */
public void append(Write write, boolean sync) throws CapacityException {
    Preconditions.checkState(this == currentPage, "Illegal attempt to "
            + "append a Write to an inactive Page");
    long stamp = accessLock.writeLock();
    try {
        // A write is stored with a 4 byte size prefix
        boolean fits = content.remaining() >= write.size() + 4;
        if(!fits) {
            if(content.position() != 0) {
                throw CapacityException.INSTANCE;
            }
            // Handle corner case where a Write is larger than
            // BUFFER_PAGE_SIZE by auto expanding the capacity for the
            // page
            content = FileSystem.map(filename, MapMode.READ_WRITE, 0,
                    write.size() + 4);
        }
        appendUnsafe(write, sync); /* (authorized) */
    }
    finally {
        accessLock.unlockWrite(stamp);
    }
}
/**
* Delete the page from disk. The Page object will reside in memory
* until garbage collection.
* <p>
* The backing file is deleted first, then {@link #content} is unmapped
* and finally the deletion is logged.
* </p>
*/
public void delete() {
FileSystem.deleteFile(filename);
FileSystem.unmap(content); // CON-163 (authorized)
Logger.info("Deleting Buffer page {}", filename);
}
/**
 * Return the timestamp of the oldest (e.g. first) write on this page, if
 * it exists.
 *
 * @return the version of the first write on the page or
 *         {@link Long#MAX_VALUE} if the page has no writes
 */
public long getOldestWriteTimestamp() {
    Write first = writes[0];
    if(first != null) {
        return first.getVersion();
    }
    // When there is no data on the page return the max possible
    // timestamp so that no query's timestamp is less than this
    // timestamp
    return Long.MAX_VALUE;
}
/**
* Returns {@code true} if {@link #head} is smaller than {@link #size},
* which means that at least one Write on the Page has not been removed.
* It is possible for calls to this method to initially return
* {@code false} at t0, but eventually return {@code true} at t1 if an
* element is added to the Page between t0 and t1.
* <p>
* The comparison is wrapped in calls to the conditional read locking
* helpers for {@link #accessLock}; presumably the lock is only taken when
* this Page is the {@code currentPage} (verify against {@link Locks}).
* </p>
*
* @return {@code true} if there is a Write at {@link #head}
*/
@Override
public boolean hasNext() {
long stamp = Locks.stampLockReadIfCondition(accessLock,
this == currentPage);
try {
return head < size;
}
finally {
Locks.stampUnlockReadIfCondition(accessLock, stamp,
this == currentPage);
}
}
/**
 * <p>
 * Returns an iterator that is appropriate for the append-only list of
 * {@link Write} objects that backs the Page. The iterator does not support
 * the {@link Iterator#remove()} method and only throws a
 * {@link ConcurrentModificationException} if an element is removed from
 * the Page using the {@link #remove()} method.
 * </p>
 * <p>
 * While the Page is, itself, an iterator (for transporting Writes), the
 * iterator returned from this method is appropriate for cases when it is
 * necessary to iterate through the page for reading.
 * </p>
 * <p>
 * In accordance with the {@link Iterator} contract, calling
 * {@code next()} when there are no more Writes throws a
 * {@code NoSuchElementException}.
 * </p>
 *
 * @return a read-only iterator that starts at the current {@link #head}
 */
/*
 * (non-Javadoc)
 * This iterator is only used for Limbo reads that
 * traverse the collection of Writes. This iterator differs from the
 * Page (which is also an Iterator over Write objects) by virtue of the
 * fact that it does not allow removes and will detect concurrent
 * modification.
 */
@Override
public Iterator<Write> iterator() {
    return new Iterator<Write>() {

        /**
         * The distance between the {@link #head} element and the
         * {@code next} element. This is used to detect for concurrent
         * modifications.
         */
        private int distance = 0;

        /**
         * The index of the "next" element in {@link #writes}.
         */
        private int index = head;

        @Override
        public boolean hasNext() {
            checkForRemoval();
            return index < size;
        }

        @Override
        public Write next() {
            checkForRemoval();
            if(index >= size) {
                // Honor the Iterator contract instead of returning null
                // (or failing with an ArrayIndexOutOfBoundsException)
                // when the caller reads past the last Write
                throw new java.util.NoSuchElementException(
                        "There are no more writes on the Page");
            }
            Write next = writes[index];
            ++index;
            ++distance;
            return next;
        }

        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }

        /**
         * Throw a {@link ConcurrentModificationException} if the Page's
         * head has moved since this iterator was created, which
         * indicates that a Write was removed from the Page.
         */
        private void checkForRemoval() {
            if(index - head != distance) {
                throw new ConcurrentModificationException(
                        "A write has been removed from the Page");
            }
        }
    };
}
/**
 * Return {@code true} if the Page <em>might</em> have a Write with the
 * specified {@code record} component. If this function returns true, the
 * caller should perform a linear scan using the local {@link #iterator()}.
 *
 * @param record the record to look for
 * @return {@code true} if a write within {@code record} possibly exists
 */
public boolean mightContain(PrimaryKey record) {
    int slot = slotify(record.hashCode());
    return recordCache[slot];
}
/**
 * Return {@code true} if the Page <em>might</em> have a Write with the
 * specified {@code key} component. If this function returns true, the
 * caller should perform a linear scan using the local {@link #iterator()}.
 *
 * @param key the key to look for
 * @return {@code true} if a write for {@code key} possibly exists
 */
public boolean mightContain(Text key) {
    int slot = slotify(key.hashCode());
    return keyCache[slot];
}
/**
 * Return {@code true} if the Page <em>might</em> have a Write with the
 * specified {@code key} and {@code record} components. If this function
 * returns true, the caller should perform a linear scan using the local
 * {@link #iterator()}.
 *
 * @param key the key to look for
 * @param record the record to look for
 * @return {@code true} if a write for {@code key} in {@code record}
 *         possibly exists
 */
public boolean mightContain(Text key, PrimaryKey record) {
    // Supply the hash codes in the same (record, key) order that is used
    // when the slot is populated while indexing a Write so the lookup
    // always lands on the same slot, regardless of how the hash codes
    // are combined
    return keyRecordCache[slotify(record.hashCode(), key.hashCode())];
}
/**
 * Return {@code true} if the Page <em>might</em> have a Write equal to
 * {@code write}. If this function returns true, the caller should check
 * with certainty by calling {@link #doesContain(Write, long)}.
 * <p>
 * In addition to the value as given, a STRING value is also checked in its
 * {@link Tag} form and a TAG value is also checked in its String form.
 * </p>
 *
 * @param write the write to look for
 * @return {@code true} if the write possibly exists
 */
public boolean mightContain(Write write) {
    Type valueType = write.getValue().getType();
    if(writeCache.mightContainCached(write.getRecord(), write.getKey(),
            write.getValue())) {
        return true;
    }
    if(valueType == Type.STRING) {
        // Retry the lookup with the string expressed as a Tag
        String text = (String) write.getValue().getObject();
        return writeCache.mightContainCached(write.getRecord(),
                write.getKey(),
                Value.wrap(Convert.javaToThrift(Tag.create(text))));
    }
    if(valueType == Type.TAG) {
        // Retry the lookup with the tag expressed as a String
        String text = write.getValue().getObject().toString();
        return writeCache.mightContainCached(write.getRecord(),
                write.getKey(), Value.wrap(Convert.javaToThrift(text)));
    }
    return false;
}
/**
* Returns the Write at index {@link #head} in {@link #writes}.
* <p>
* <strong>NOTE:</strong>
* <em>This method will return the same element on multiple
* invocations until {@link #remove()} is called.</em>
* </p>
* <p>
* This method does not verify that {@link #head} is smaller than
* {@link #size}, so callers are expected to check {@link #hasNext()}
* first.
* </p>
*
* @return the Write stored at {@link #head}
*/
@Override
public Write next() {
long stamp = Locks.stampLockReadIfCondition(accessLock,
this == currentPage);
try {
return writes[head];
}
finally {
Locks.stampUnlockReadIfCondition(accessLock, stamp,
this == currentPage);
}
}
/**
* Simulates the removal of the head Write from the Page. This method
* only increments the {@link #head} index and does not actually delete
* any data, which is a performance optimization.
* <p>
* Iterators returned from {@link #iterator()} detect the change to
* {@link #head} and throw a {@link ConcurrentModificationException}.
* </p>
*/
@Override
public void remove() {
long stamp = Locks.stampLockWriteIfCondition(accessLock,
this == currentPage);
try {
++head;
}
finally {
Locks.stampUnlockWriteIfCondition(accessLock, stamp,
this == currentPage);
}
}
/**
* Return the name of the file that backs this Page.
*
* @return the filename
*/
@Override
public String toString() {
return filename;
}
/**
 * Dump the contents of this page: a header that names the Page followed
 * by one line for each Write, in the order in which the Writes are stored.
 *
 * @return the dump string
 */
protected String dump() {
    long stamp = Locks.stampLockReadIfCondition(accessLock,
            this == currentPage);
    try {
        StringBuilder sb = new StringBuilder();
        sb.append("Dump for ").append(getClass().getSimpleName())
                .append(" ").append(filename).append("\n");
        sb.append("------").append("\n");
        for (int i = 0; i < writes.length; ++i) {
            Write write = writes[i];
            if(write == null) {
                // The first empty slot marks the end of the stored writes
                break;
            }
            sb.append(write).append("\n");
        }
        sb.append("\n");
        return sb.toString();
    }
    finally {
        Locks.stampUnlockReadIfCondition(accessLock, stamp,
                this == currentPage);
    }
}
/**
 * Do the work to actually index and append {@code write} (while
 * optionally performing a {@code sync}) WITHOUT grabbing any locks; hence
 * this method is UNSAFE for unauthorized usage.
 *
 * @param write the {@link Write} to append
 * @param sync a flag that determines if the page should be fsynced (or
 *            the equivalent) after appending {@code write} so that the
 *            changes are guaranteed to be durably persisted, this flag
 *            should almost always be {@code true} if calling this method
 *            directly. It is set to {@code false} when called from the
 *            context of an atomic operation transporting writes to this
 *            Buffer using GROUP SYNC
 */
@GuardedBy("Buffer.Page#append(Write)")
private void appendUnsafe(final Write write, boolean sync) {
    // Index first so that a full page is detected before any bytes are
    // written to the content
    index(write);

    // Store the size of the write followed by the write itself
    content.putInt(write.size());
    write.copyTo(content);
    inventory.add(write.getRecord().longValue());
    if(sync) {
        sync();
    }

    // Hand the creation and queueing of the corresponding event off to
    // the executor
    Runnable publisher = () -> {
        WriteEvent event = new WriteEvent(write.getKey().toString(),
                write.getValue().getTObject(),
                write.getRecord().longValue(), write.getVersion(),
                WriteEvent.Type.valueOf(write.getType().name()),
                environment);
        BINARY_QUEUE.add(event);
    };
    GLOBAL_EXECUTOR.execute(publisher);
}
/**
 * Insert {@code write} into the list of {@link #writes}, record its
 * components in each of the Page's caches and increment the {@link #size}
 * counter.
 *
 * @param write the {@link Write} to index
 * @throws CapacityException if every slot in {@link #writes} is already
 *             occupied
 */
@GuardedBy("Buffer.Page#append(Write)")
private void index(Write write) throws CapacityException {
    if(size >= writes.length) {
        throw CapacityException.INSTANCE;
    }
    writes[size] = write;
    int recordHash = write.getRecord().hashCode();
    int keyHash = write.getKey().hashCode();
    // The individual Write components are added instead of the
    // entire Write so that version information is not factored into
    // the bloom filter hashing
    writeCache.putCached(write.getRecord(), write.getKey(),
            write.getValue());
    keyRecordCache[slotify(recordHash, keyHash)] = true;
    recordCache[slotify(recordHash)] = true;
    keyCache[slotify(keyHash)] = true;
    ++size;
}
/**
 * Convenience function to return the appropriate slot in one of the Page's
 * filters, between 0 (inclusive) and {@link #sizeUpperBound} (exclusive),
 * for an object with {@code hashCode}.
 *
 * @param hashCode the hash code of the object
 * @return the slot
 */
private int slotify(int hashCode) {
    int remainder = hashCode % sizeUpperBound;
    return Math.abs(remainder);
}
/**
 * Convenience function to return the appropriate slot in one of the Page's
 * filters, between 0 (inclusive) and {@link #sizeUpperBound} (exclusive),
 * for a group of objects with the {@code hashCodes}.
 *
 * @param hashCodes the hash codes of the objects in the group
 * @return the slot
 */
private int slotify(int... hashCodes) {
    int combined = Integers.avg(hashCodes);
    int remainder = combined % sizeUpperBound;
    return Math.abs(remainder);
}
}
/**
 * A {@link SeekingIterator} that looks for writes with a particular record
 * component.
 *
 * @author Jeff Nelson
 */
private class RecordSeekingIterator extends SeekingIterator {

    /**
     * The record whose writes are being sought.
     */
    private final PrimaryKey record;

    /**
     * Construct a new instance.
     *
     * @param record the record whose writes should be returned
     * @param timestamp the max timestamp for which to seek
     */
    protected RecordSeekingIterator(long record, long timestamp) {
        super(timestamp);
        this.record = PrimaryKey.wrap(record);
        // The seek criteria is in place, so move to the first write
        init();
    }

    @Override
    protected boolean isRelevantWrite(Write write) {
        PrimaryKey candidate = write.getRecord();
        return candidate.equals(record);
    }

    @Override
    protected boolean pageMightContainRelevantWrites(Page page) {
        boolean possible = page.mightContain(record);
        return possible;
    }
}
/**
* An {@link Iterator} over the writes in the buffer that has logic to only
* return writes that match a certain {@code seek} criteria. The iterator
* also uses the criteria as a hint to perform more optimal searches over
* the pages in the Buffer.
* <p>
* While the iterator is positioned on a page, it holds the locks that are
* acquired in {@link #grabLocks(Page)}. Those locks are released when the
* iterator flips to another page, runs out of pages or encounters a write
* whose version is greater than the seek timestamp.
* </p>
* <p>
* NOTE(review): the locks are only released from within
* {@link #advance()} and {@link #flip(boolean)}, so an iterator that is
* abandoned before it is exhausted appears to keep its current page
* locked; confirm that callers always drain the iterator.
* </p>
*
* @author Jeff Nelson
*/
private abstract class SeekingIterator implements Iterator<Write> {
/**
* A flag that indicates whether we should not perform a timestamp check
* because we want all the writes up until the present state. This is
* set when the seek timestamp is {@link Long#MAX_VALUE}.
*/
private boolean ignoreTimestamp = false;
/**
* The stamp returned from grabbing the read access lock from
* {@link #myCurrentPage}.
*/
private long myAccessStamp = 0L;
/**
* A reference to the page in which the iterator is currently
* traversing. The reference is not cleared when the page's locks are
* released.
*/
private Page myCurrentPage;
/**
* The next write to return, or {@code null} if there are no more
* relevant writes.
*/
private Write next = null;
/**
* An iterator over all the pages in the Buffer. This is set to
* {@code null} once a write or page that is newer than the seek
* timestamp is encountered.
*/
private Iterator<Page> pageIterator = pages.iterator();
/**
* The max timestamp for which to seek. If a Write's version is greater
* than this timestamp, then the iterator ceases to return elements.
*/
private final long timestamp;
/**
* A flag that indicates whether this iterator has satisfied
* preconditions and is useable. If it is not useable, it won't perform
* any traversals or return any data.
*/
private final boolean useable;
/**
* The iterator over the writes on the page at which the iterator is
* currently traversing, or {@code null} if the iterator is not
* positioned on a page.
*/
private Iterator<Write> writeIterator = null;
/**
* Construct a new instance. The iterator is only useable if
* {@code timestamp} is not less than the oldest write timestamp in the
* Buffer.
* <p>
* The constructor does not position the iterator on a page; subclasses
* must call {@link #init()} after setting up their own state.
* </p>
*
* @param timestamp the max timestamp for which to seek
*/
protected SeekingIterator(long timestamp) {
this.timestamp = timestamp;
if(timestamp >= getOldestWriteTimestamp()) {
scaleBackTransportRate();
this.ignoreTimestamp = timestamp == Long.MAX_VALUE;
// No page has been selected at this point (writeIterator is still
// null), so this call returns null; the first write is actually
// located when the subclass calls init()
this.next = advance();
this.useable = true;
}
else {
this.useable = false;
}
}
@Override
public boolean hasNext() {
return next != null;
}
/**
* Return the write that was previously located and advance to the one
* that follows it.
*
* @return the next relevant write, or {@code null} if
*         {@link #hasNext()} is {@code false}
*/
@Override
public Write next() {
Write next0 = next;
this.next = advance();
return next0;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
/**
* Each subclass should call this method after constructing the initial
* state to turn to the first page and get the first write. This is a
* no-op if the iterator is not useable.
*/
protected void init() {
if(useable) {
// The timestamp check is skipped while selecting the first page
flip(true);
this.next = advance();
}
}
/**
* Return {@code true} if {@code write} is relevant to what this
* iterator is seeking.
*
* @param write the write to check
* @return {@code true} if the write is relevant
*/
protected abstract boolean isRelevantWrite(Write write);
/**
* Call the appropriate function to determine if the {@code page} might
* contain the kinds of writes that this iterator is seeking.
*
* @param page the page to check
* @return {@code true} if the page can possibly contain relevant data
*/
protected abstract boolean pageMightContainRelevantWrites(Page page);
/**
* Advance to the next write that this iterator should return, if it
* exists.
*
* @return the next write or {@code null}
*/
private Write advance() {
for (;;) {
if(writeIterator == null) {
// Not positioned on a page: either the iterator has not been
// initialized or there is nothing left to traverse
return null;
}
while (writeIterator.hasNext()) {
Write write = writeIterator.next();
if(!ignoreTimestamp && write.getVersion() > timestamp) {
// This write is newer than the seek timestamp, so stop the
// traversal altogether
writeIterator = null;
pageIterator = null;
releaseLocks();
return null;
}
else if(isRelevantWrite(write)) {
return write;
}
}
// The current page is exhausted, so try the next one
flip();
}
}
/**
* Flip to the next page in the Buffer.
*/
private void flip() {
flip(false);
}
/**
* Flip to the next page in the Buffer with the option to temporarily
* skip the timestamp check. Pages that cannot contain relevant writes
* are skipped. When this method returns, {@link #writeIterator} is
* either an iterator over the selected page (whose locks are held) or
* {@code null} if no suitable page was found.
*
* @param skipTsCheck if {@code true}, pages are not rejected for having
*            an oldest write that is newer than the seek timestamp
*/
private void flip(boolean skipTsCheck) {
writeIterator = null;
// Let go of the page that was previously being traversed, if any
releaseLocks();
if(pageIterator.hasNext()) {
while (pageIterator.hasNext()) {
Page next = pageIterator.next();
grabLocks(next);
if(!skipTsCheck && !ignoreTimestamp
&& next.getOldestWriteTimestamp() > timestamp) {
// Every write on this page is newer than the seek timestamp, so
// stop the traversal altogether
writeIterator = null;
pageIterator = null;
releaseLocks();
break;
}
if(pageMightContainRelevantWrites(next)) {
// Keep the locks held while this page is being traversed
writeIterator = next.iterator();
break;
}
else {
releaseLocks();
}
}
}
}
/**
* Grab the necessary locks to protect {@code page} while it is used
* in the iterator and remember it as {@link #myCurrentPage}.
*
* @param page the page to lock
*/
private void grabLocks(Page page) {
myCurrentPage = page;
myAccessStamp = Locks.stampLockReadIfCondition(
myCurrentPage.accessLock, myCurrentPage == currentPage);
myCurrentPage.transportLock.readLock().lock();
}
/**
* Release the locks for {@link #myCurrentPage}. This is a no-op if no
* page has been locked yet.
*/
private void releaseLocks() {
if(myCurrentPage != null) {
Locks.stampUnlockReadIfCondition(myCurrentPage.accessLock,
myAccessStamp, myCurrentPage == currentPage);
myCurrentPage.transportLock.readLock().unlock();
}
}
}
/**
 * A {@link SeekingIterator} that looks for writes that are equal to a
 * comparison write.
 *
 * @author Jeff Nelson
 */
private class WriteSeekingIterator extends SeekingIterator {

    /**
     * A flag to check whether the buffer has already been scanned (to
     * mitigate multiple increments given multiple scans to the same
     * buffer). It is set the first time a page that might contain the
     * {@link #write} is encountered.
     */
    private boolean scanned;

    /**
     * The relevant write.
     */
    private final Write write;

    /**
     * Construct a new instance.
     *
     * @param write the write to look for
     * @param timestamp the max timestamp for which to seek
     */
    protected WriteSeekingIterator(Write write, long timestamp) {
        super(timestamp);
        this.write = write;
        init();
    }

    @Override
    protected boolean isRelevantWrite(Write write) {
        return write.equals(this.write);
    }

    /**
     * Check whether {@code page} might contain the relevant write and, the
     * first time that is the case for this iterator, count the scan in
     * {@code numVerifyScans}.
     *
     * @param page the page to check
     * @return {@code true} if the page might contain the relevant write
     */
    @Override
    protected boolean pageMightContainRelevantWrites(Page page) {
        boolean mightContain = page.mightContain(write);
        if(mightContain && !scanned) {
            numVerifyScans.incrementAndGet();
            // Remember that this scan has been counted so that other
            // pages visited by this iterator do not increment the
            // counter again
            scanned = true;
        }
        return mightContain;
    }
}
}