/*
* Copyright (c) 2013-2017 Cinchapi Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cinchapi.concourse.server.plugin.io;
import java.io.IOException;
import java.lang.Thread.State;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.FileChannel.MapMode;
import java.nio.channels.FileLock;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardOpenOption;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import javax.annotation.concurrent.ThreadSafe;
import com.cinchapi.common.base.CheckedExceptions;
import com.cinchapi.concourse.server.plugin.concurrent.FileLocks;
import com.cinchapi.concourse.util.ByteBuffers;
import com.cinchapi.concourse.util.FileOps;
import com.cinchapi.concourse.util.Strings;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Throwables;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
/**
* {@link SharedMemory} is an alternative for local socket communication between
* two separate processes.
* <p>
* Separate processes can communicate via {@link SharedMemory} by passing
* messages. One process simply {@link #write(ByteBuffer) writes} a message and
* another one {@link #read() reads} it.
* </p>
* <p>
* {@link SharedMemory} is not limited to back-and-forth communication. Any
* number of processes can communicate using the same {@link SharedMemory}
 * segment and any one process can write any number of messages or read as
* many messages as available.
* </p>
* <h1>Writing Messages</h1>
* <p>
 * A writer process must first acquire a lock that blocks other writers from
 * the shared memory segment (e.g. writers and readers can use the segment
* shared memory segment (e.g. writers and readers can use the segment
* concurrently). The lock is released after the message is written. The size of
* a {@link SharedMemory} segment is dynamic and written messages are appended
* to the end.
* </p>
* <h1>Reading Messages</h1>
* <p>
 * A reader process must first acquire a lock that blocks other readers from
 * the shared memory segment (e.g. writers and readers can use the segment
 * concurrently). The lock is released after the message is read. A message
* can only be read by one process. Once it is read, it is removed.
* </p>
* <h2>Compaction</h2>
* <p>
* Old messages are removed from the segment during certain read and write
* operations after the {@link #COMPACTION_FREQUENCY_IN_MILLIS} has passed since
* the last compaction. While compaction will reduce the amount of disk space
* used for the segment, each individual process will need to run a compaction
* for the results to be visible in its memory space.
* </p>
* <h1>Latency</h1>
* <p>
 * This class attempts to strike a balance between efficiently handling low
* latency messages and minimizing CPU usage. To that end, the {@link #read()}
* method employs an algorithm that prefers to spin/busy-wait for new messages
* when messages are written in about 2 seconds or less. If the average latency
* between messages grows larger than 2 seconds, the {@link #read()} method will
 * block while waiting for notifications from the underlying file system, which
 * may
* exhibit additional delay depending on the OS.
* </p>
*
* @author Jeff Nelson
*/
@ThreadSafe
public final class SharedMemory implements InterProcessCommunication {

    /**
     * The amount of time before another compaction is done after a read or
     * write operation.
     */
    @VisibleForTesting
    protected static int COMPACTION_FREQUENCY_IN_MILLIS = 60000;

    /**
     * The total number of spin cycles to conduct before moving onto the next
     * round of spins.
     */
    private static final int MAX_SPIN_CYCLES_PER_ROUND = 20;

    /**
     * The total number of rounds to conduct spin cycles in the {@link #read()}
     * method before going into a wait/notify cycle.
     */
    private static final int MAX_SPIN_ROUNDS = 20;

    /**
     * The number of bytes used to store metadata at the beginning of the
     * file: two 4-byte integers ({@link #nextRead} at offset 0 and
     * {@link #nextWrite} at offset 4) followed by the two single-byte lock
     * positions ({@link #READ_LOCK_POSITION} and {@link #WRITE_LOCK_POSITION}).
     */
    private static final int METADATA_SIZE_IN_BYTES = 10;

    /**
     * The max average millisecond latency that is allowable for the
     * {@link #read()} method to {@link #preferBusyWait()} as opposed to relying
     * on notifications from the underlying file system.
     */
    private static final long SPIN_AVG_LATENCY_TOLERANCE_IN_MILLIS = 2000;

    /**
     * The total number of milliseconds to backoff (e.g. sleep) after a round
     * of spins. This value is chosen so that the total amount of time spent
     * spinning is about 2 seconds.
     */
    private static final int SPIN_BACKOFF_IN_MILLIS = 100;

    /**
     * The amount of time to sleep while executing a lazy check for race
     * conditions while waiting for file system notifications.
     */
    private static final int DORMANT_SPIN_SLEEP_TIME_IN_MILLIS = 1500;

    /**
     * The position in the {@link #channel} where the {@link #readLock()} byte
     * is held.
     */
    private static final int READ_LOCK_POSITION = 8;

    /**
     * The position in the {@link #channel} where the {@link #writeLock()} byte
     * is held.
     */
    private static final int WRITE_LOCK_POSITION = 9;

    /**
     * The underlying {@link FileChannel} for the memory's backing store.
     */
    private final FileChannel channel;

    /**
     * An executor service dedicated to running compaction in the background
     * after certain read or write operations.
     */
    private final ExecutorService compactor = Executors.newSingleThreadExecutor(
            new ThreadFactoryBuilder().setNameFormat("shared-memory-compactor")
                    .setDaemon(true).build());

    /**
     * The timestamp (in milliseconds) of the last compaction.
     */
    private long lastCompaction;

    /**
     * The location of the shared memory.
     */
    private final Path location;

    /**
     * A {@link MappedByteBuffer} that tracks the content of the shared memory
     * segment.
     */
    private MappedByteBuffer memory;

    /**
     * The number of messages that have been read. This statistic is tracked for
     * potential optimizations.
     */
    private long readCount;

    /**
     * The total amount of time that this instance has ever waited after trying
     * to read a message.
     */
    private long totalLatency;

    /**
     * An {@link Executor} dedicated to detecting and fixing race conditions.
     */
    private final ExecutorService raceConditionDetector = Executors
            .newSingleThreadExecutor(new ThreadFactoryBuilder()
                    .setNameFormat("shared-memory-race-condition-detector")
                    .setDaemon(true).build());

    /**
     * The relative position in {@link #memory} where a reader should begin
     * consuming the next message. A value of {@code -1} indicates that there
     * are no unread messages.
     */
    private final MappedAtomicInteger nextRead;

    /**
     * The relative position in {@link #memory} where a writer should begin
     * storing the next message.
     */
    private final MappedAtomicInteger nextWrite;

    /**
     * Construct a new {@link SharedMemory} instance backed by a temporary
     * store.
     */
    public SharedMemory() {
        this(FileOps.tempFile("con", ".sm"), 1024);
    }

    /**
     * Construct a new instance.
     *
     * @param path
     */
    public SharedMemory(String path) {
        this(path, 1024);
    }

    /**
     * Construct a new {@link SharedMemory} instance backed by the file at
     * {@code path} with the specified initial {@code capacity}.
     *
     * @param path the path of the backing file for the shared memory
     * @param capacity the initial capacity of the shared memory segment
     */
    public SharedMemory(String path, int capacity) {
        // Reserve extra room for the metadata header in addition to the
        // requested message capacity. (The original expression
        // Math.max(capacity, METADATA_SIZE_IN_BYTES + capacity) always
        // resolved to the second operand, so this is the same behavior,
        // stated plainly.)
        capacity += METADATA_SIZE_IN_BYTES;
        try {
            this.location = Paths.get(path).toAbsolutePath();
            this.channel = FileChannel.open(location, StandardOpenOption.CREATE,
                    StandardOpenOption.READ, StandardOpenOption.WRITE);
            this.nextRead = new MappedAtomicInteger(channel, 0);
            this.nextWrite = new MappedAtomicInteger(channel, 4);
            this.memory = channel.map(MapMode.READ_WRITE,
                    METADATA_SIZE_IN_BYTES, capacity);
            if(nextWrite.get() == 0) {
                // A brand new segment: mark that there is nothing to read.
                nextRead.setAndSync(-1);
            }
            // Close the channel on JVM exit so pending mapped writes are
            // flushed to the backing file.
            Runtime.getRuntime().addShutdownHook(new Thread() {

                @Override
                public void run() {
                    try {
                        channel.close();
                    }
                    catch (IOException e) {
                        throw CheckedExceptions.throwAsRuntimeException(e);
                    }
                }

            });
        }
        catch (IOException e) {
            throw CheckedExceptions.throwAsRuntimeException(e);
        }
        this.lastCompaction = System.currentTimeMillis();
    }

    /**
     * Run compact on the {@link SharedMemory} to optimize how much space is
     * utilized by removing garbage.
     */
    @Override
    public void compact() {
        FileLock lock = lock();
        try {
            int start = nextRead.get();
            int end = start < 0 ? 0 : nextWrite.get(); // If start < 0, there
                                                       // are no unread writes,
                                                       // so we can truncate the
                                                       // entire file
            int length;
            if(start >= 0) {
                // Slide the unread region [start, end) to the front of the
                // segment.
                length = end - start;
                memory.position(start);
                byte[] data = new byte[length];
                if(length > memory.remaining()) {
                    // There is more data in the underlying file than is
                    // represented in memory, so first grow to capture all of
                    // it.
                    growUnsafe();
                }
                memory.get(data);
                memory.flip();
                memory.put(data);
                memory.flip();
                nextRead.set(0);
                nextWrite.set(end - start);
            }
            else {
                // No unread messages; reset the segment to empty.
                length = 0;
                memory.position(0);
                memory.limit(0);
                nextRead.set(-1);
                nextWrite.set(0);
            }
            channel.truncate(METADATA_SIZE_IN_BYTES + length);
            memory = channel.map(MapMode.READ_WRITE, METADATA_SIZE_IN_BYTES,
                    length);
            nextRead.sync();
            nextWrite.sync();
            memory.force();
        }
        catch (IOException e) {
            throw CheckedExceptions.throwAsRuntimeException(e);
        }
        finally {
            lastCompaction = System.currentTimeMillis();
            FileLocks.release(lock);
        }
    }

    /**
     * Read the most recent message from the memory segment, blocking until a
     * message is available.
     *
     * @return a {@link ByteBuffer} that contains the most recent message
     */
    @Override
    public ByteBuffer read() {
        long start = System.currentTimeMillis();
        if(preferBusyWait()) {
            // Recent messages have, on average, arrived quickly, so spin for
            // a bounded number of rounds before falling back to file system
            // notifications.
            for (int i = 0; i < MAX_SPIN_ROUNDS; ++i) {
                int spins = 0;
                while (nextRead.get() < 0
                        && spins < MAX_SPIN_CYCLES_PER_ROUND) {
                    spins++;
                }
                if(spins < MAX_SPIN_CYCLES_PER_ROUND) {
                    // The spin loop exited early, which means a message has
                    // arrived.
                    break;
                }
                else {
                    try {
                        Thread.sleep(SPIN_BACKOFF_IN_MILLIS);
                    }
                    catch (InterruptedException e) {
                        throw CheckedExceptions.throwAsRuntimeException(e);
                    }
                }
            }
        }
        while (nextRead.get() < 0) {
            Thread parentThread = Thread.currentThread();
            raceConditionDetector.execute(() -> {
                // NOTE: There is a subtle race condition that may occur if a
                // write comes in between the #nextRead check above and when
                // FileOps#awaitChange registers the #location with the watch
                // service. To get around that, we have a separate thread check
                // #nextRead and interrupt the parent thread if a write did
                // come in while the race condition happened.
                // NOTE(review): the parent is only interrupted while its state
                // is RUNNABLE — presumably because a thread parked in native
                // watch-service code reports RUNNABLE; confirm against
                // FileOps#awaitChangeInterruptibly.
                while (parentThread.getState() == State.RUNNABLE) {
                    // Wait until the parent thread is blocking before detecting
                    // a race condition...
                    Thread.yield();
                }
                while (nextRead.get() < 0) {
                    try {
                        Thread.sleep(DORMANT_SPIN_SLEEP_TIME_IN_MILLIS);
                    }
                    catch (InterruptedException e) {
                        throw CheckedExceptions.throwAsRuntimeException(e);
                    }
                }
                if(parentThread.getState() == State.RUNNABLE) {
                    parentThread.interrupt();
                }
            });
            try {
                FileOps.awaitChangeInterruptibly(location.toString());
            }
            catch (InterruptedException e) {
                // The race condition detector signaled that a message may be
                // available, so re-check #nextRead.
                continue;
            }
        }
        FileLock lock = readLock();
        try {
            int position = nextRead.get();
            if(position >= 0) {
                long elapsed = System.currentTimeMillis() - start;
                totalLatency += elapsed;
                memory.position(position);
                if(memory.remaining() < 4) {
                    // Not enough mapped bytes to read the length prefix.
                    growUnsafe();
                }
                int length = memory.getInt();
                while (length > memory.remaining()) {
                    // The message extends beyond the current mapping, so grow
                    // until all of it is visible.
                    growUnsafe();
                }
                ByteBuffer message = ByteBuffers.get(memory, length);
                int mark = memory.position();
                int next = -1;
                boolean retry = true;
                while (retry) {
                    retry = false;
                    try { // Peek at the next 4 bytes to see if it is > 0, which
                          // indicates that there is a next message to read.
                        int peek = memory.getInt();
                        if(peek > 0) {
                            next = mark;
                        }
                    }
                    catch (BufferUnderflowException e) {
                        growUnsafe();
                        retry = true;
                    }
                }
                memory.position(mark);
                nextRead.setAndSync(next);
                return message;
            }
            else { // race condition, someone else read the message before we
                   // did.
                return read();
            }
        }
        finally {
            FileLocks.release(lock);
            if(System.currentTimeMillis()
                    - lastCompaction > COMPACTION_FREQUENCY_IN_MILLIS) {
                compactor.execute(this::compact);
            }
            ++readCount;
        }
    }

    @Override
    public String toString() {
        return Strings.format(
                "SharedMemory[path={}, nextRead={}, nextWrite={}]", location,
                nextRead.get(), nextWrite.get());
    }

    /**
     * Write {@code data} to the shared memory segment.
     * <p>
     * This method grabs an exclusive lock on the shared memory so that no other
     * readers or writers may access while the message is being written. As
     * such, this method also <strong>blocks</strong> while waiting for the
     * memory segment to become available for writing.
     * </p>
     * <p>
     * <strong>CAUTION:</strong> This method does not check to make sure that
     * the most recent message was read before writing.
     * </p>
     *
     * @param data the message to write to the memory segment; assumed to be
     *            positioned at 0 with {@code limit == capacity} since the
     *            length prefix is taken from {@link ByteBuffer#capacity()} —
     *            TODO confirm callers
     * @return {@link SharedMemory this}
     */
    @Override
    public SharedMemory write(ByteBuffer data) {
        FileLock lock = writeLock();
        try {
            // Must check to see if the underlying file has been truncated by
            // compaction from another process or else manipulation of the
            // current #memory segment won't actually be preserved. Not sure if
            // this is a Java bug or not...
            if(channel.size() < memory.capacity()) {
                memory = channel.map(MapMode.READ_WRITE, METADATA_SIZE_IN_BYTES,
                        channel.size() - METADATA_SIZE_IN_BYTES);
            }
            int position = nextWrite.get();
            while ((position > memory.limit()) || data.capacity() + 4 > memory
                    .position(position).remaining()) {
                // Ensure there is room for the 4-byte length prefix plus the
                // message payload.
                growUnsafe();
            }
            int mark = memory.position();
            memory.putInt(data.capacity());
            memory.put(data);
            if(nextRead.get() < 0) {
                // There were no unread messages, so this message becomes the
                // next one to read.
                nextRead.setAndSync(mark);
            }
            nextWrite.setAndSync(memory.position());
            return this;
        }
        catch (IOException e) {
            throw CheckedExceptions.throwAsRuntimeException(e);
        }
        finally {
            FileLocks.release(lock);
            if(System.currentTimeMillis()
                    - lastCompaction > COMPACTION_FREQUENCY_IN_MILLIS) {
                compactor.execute(this::compact);
            }
        }
    }

    /**
     * Increase the capacity of the {@link #memory} segment without grabbing the
     * lock.
     */
    private void growUnsafe() {
        try {
            int position = memory.position();
            int capacity = Math.max(memory.capacity(), 1);
            // Quadruple the mapped region and restore the logical position.
            // NOTE(review): capacity * 4 can overflow int for segments larger
            // than ~512MB — confirm upstream limits.
            memory = channel.map(MapMode.READ_WRITE, METADATA_SIZE_IN_BYTES,
                    capacity * 4);
            memory.position(position);
        }
        catch (IOException e) {
            throw CheckedExceptions.throwAsRuntimeException(e);
        }
    }

    /**
     * Return an exclusive {@link FileLock} over the entire {@link #channel}.
     *
     * <p>
     * Release the lock using {@link FileLocks#release(FileLock)}.
     * </p>
     *
     * @return a {@link FileLock} over the entire channel
     */
    private FileLock lock() {
        // Acquire the read lock before the write lock; every process uses the
        // same order, which avoids deadlock between competing #lock() calls.
        FileLock read = readLock();
        FileLock write = writeLock();
        return new FileLock(channel, READ_LOCK_POSITION, 2, false) {

            boolean valid = true;

            @Override
            public boolean isValid() {
                return valid;
            }

            @Override
            public void release() throws IOException {
                FileLocks.release(read);
                FileLocks.release(write);
                valid = false;
            }

        };
    }

    /**
     * Return {@code true} if the {@link #read()} method should try busy waiting
     * before giving up its timeslice and relying on a notification from the
     * underlying filesystem.
     * <p>
     * This is done in an attempt to strike a balance between low latency reads
     * and high CPU usage.
     * </p>
     *
     * @return {@code true} if busy waiting is preferable
     */
    private boolean preferBusyWait() {
        // Before the first read there is no latency data, so default to
        // spinning.
        return readCount > 0
                ? totalLatency
                        / readCount <= SPIN_AVG_LATENCY_TOLERANCE_IN_MILLIS
                : true;
    }

    /**
     * Return an exclusive {@link FileLock} that blocks other readers.
     *
     * <p>
     * Release the lock using {@link FileLocks#release(FileLock)}.
     * </p>
     *
     * @return a {@link FileLock} that blocks readers
     */
    private FileLock readLock() {
        return FileLocks.lock(channel, READ_LOCK_POSITION, 1, false);
    }

    /**
     * Return an exclusive {@link FileLock} that blocks other writers.
     *
     * <p>
     * Release the lock using {@link FileLocks#release(FileLock)}.
     * </p>
     *
     * @return a {@link FileLock} that blocks writers
     */
    private FileLock writeLock() {
        return FileLocks.lock(channel, WRITE_LOCK_POSITION, 1, false);
    }

}