/*
* Copyright 2012 David Tinker
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.qdb.buffer;
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.channels.ReadableByteChannel;
import java.nio.charset.Charset;
/**
* <p>A bunch of messages all in the same file. Supports fast seek to a message by timestamp and detection and
* recovery from corruption due to server crash. New messages are always appended to the end of the file for
* performance. Thread safe.</p>
*
* <p>The file header is 4096 bytes long. The fixed part is 16 bytes and has the following format
* (all BIG_ENDIAN):</p>
* <pre>
* magic: 2 bytes (currently 0xBE01)
* reserved: 2 bytes (currently 0x0000)
* max file size: 4 bytes
* length of file at last checkpoint: 4 bytes
* reserved: 4 bytes (currently 0x00000000)
* </pre>
*
* <p>Recovery from a crash is simply a matter of truncating the file to its last checkpoint length. That might
* discard some good messages but has the advantage of being very fast (compared to calculating and checking
* message CRC values for example). The assumption is that if the messages are very important they will be
* written to multiple machines.</p>
*
* <p>The rest of the file header consists of up to 255 histogram buckets for fast message lookup by timestamp
* and id:</p>
* <pre>
* first message id (relative to this file): 4 bytes
* timestamp: 8 bytes
* message count: 4 bytes
* </pre>
*
* <p>The histogram is updated at each checkpoint. Checkpoints are done manually or automatically every max file
* size / 255 bytes.</p>
*
* <p>The remainder of the file consists of records in the following format (all BIG_ENDIAN):</p>
*
* <pre>
* record type: 1 byte (value always 0xA1 currently)
* timestamp: 8 bytes
* routing key size in bytes (m): 2 bytes
* payload size (n): 4 bytes
* routing key UTF8 encoded: m bytes
* payload: n bytes
* </pre>
*/
class MessageFile implements Closeable {
// File this instance reads from and appends to (returned by getFile()).
private final File file;
// ID of the first message in this file; file-relative ids are offset by this value to form message ids.
private final long firstMessageId;
// Maximum size of the file in bytes: supplied when the file is created, otherwise read from the file header.
private final int maxFileSize;
private final RandomAccessFile raf;
// Channel of raf. Also used as the monitor for every synchronized block in this class.
private final FileChannel channel;
// In-memory copy of the FILE_HEADER_SIZE byte file header; written back to the file by checkpoint().
private final ByteBuffer fileHeader;
// Scratch buffer used by append() for the record header and the encoded routing key.
private final ByteBuffer header;
// Starts at 1; incremented by use() and decremented by closeIfUnused() and close().
private int usageCounter = 1;
// Current length of the file in bytes including the file header (position where the next record is written).
private int length;
// Value of length as of the last checkpoint written (or read from the header when an existing file is opened).
private int lastCheckpointLength;
// Timestamp of the most recently appended message; 0 until a message is appended or
// getMostRecentTimestamp() recovers it by scanning the file.
private long mostRecentTimestamp;
// (maxFileSize - FILE_HEADER_SIZE) / MAX_BUCKETS: append() starts a new histogram bucket once the current
// one spans at least this many bytes.
private final int bytesPerBucket;
// Index of the current (last) histogram bucket or -1 if there are no buckets yet.
private int bucketIndex;
// Timestamp of the first message in the current bucket.
private long bucketTimestamp;
// File-relative id (offset from the end of the file header) of the first message in the current bucket.
private int bucketMessageId;
// Number of messages in the current bucket.
private int bucketCount;
// Total size of the file header in bytes (fixed part plus histogram buckets).
public static final int FILE_HEADER_SIZE = 4096;
// Size in bytes of the fixed part of the file header (magic, reserved, max file size, checkpoint, reserved).
private static final int FILE_HEADER_FIXED_SIZE = 16;
// Size in bytes of one histogram bucket record (4 byte id + 8 byte timestamp + 4 byte count).
private static final int BUCKET_RECORD_SIZE = 16;
// Number of bucket records that fit in the header after the fixed part.
private static final int MAX_BUCKETS = (FILE_HEADER_SIZE - FILE_HEADER_FIXED_SIZE) / BUCKET_RECORD_SIZE;
// Magic number stored in the first 2 bytes of the file.
private static final short FILE_MAGIC = (short)0xBE01;
// Record type byte written at the start of every message record.
private static final byte TYPE_MESSAGE = (byte)0xA1;
// Fixed record header: type (1) + timestamp (8) + routing key size (2) + payload size (4).
private static final int MESSAGE_HEADER_SIZE = 1 + 8 + 2 + 4;
private static final Charset UTF8 = Charset.forName("UTF8");
/**
 * Opens a file that already exists. Delegates to
 * {@link #MessageFile(java.io.File, long, int)} with a max file size of -1, which makes that
 * constructor reject anything that is not an existing regular file.
 *
 * @param file the message file to open
 * @param firstMessageId id of the first message in the file
 * @throws IOException if the file cannot be opened or its header is invalid
 */
public MessageFile(File file, long firstMessageId) throws IOException {
    this(file, firstMessageId, -1);
}
/**
 * Open a new or existing file. The maxFileSize parameter is only used when creating a new file (i.e. when
 * the file is empty); for an existing file the value stored in the file header is used instead.
 *
 * <p>When an existing file is opened it is truncated to the length recorded at its last checkpoint and the
 * most recent histogram bucket (if any) is loaded from the header. If anything goes wrong after the file
 * has been opened the underlying file handle is closed again before the exception propagates.</p>
 *
 * @param file the message file
 * @param firstMessageId id of the first message in the file
 * @param maxFileSize maximum size for a new file in bytes (at least {@link #FILE_HEADER_SIZE}); pass a
 *                    negative value to insist that the file already exists
 * @throws IllegalArgumentException if maxFileSize is negative and file is not an existing regular file, or
 *         if the file is empty and maxFileSize is smaller than the file header
 * @throws IOException if the file cannot be read or written or its header is invalid
 */
public MessageFile(File file, long firstMessageId, int maxFileSize) throws IOException {
    this.file = file;
    this.firstMessageId = firstMessageId;

    if (maxFileSize < 0 && !file.isFile()) {
        throw new IllegalArgumentException("File does not exist, is not readable or is not a file [" + file + "]");
    }

    raf = new RandomAccessFile(file, "rw");
    channel = raf.getChannel();
    fileHeader = ByteBuffer.allocateDirect(FILE_HEADER_SIZE);
    header = ByteBuffer.allocateDirect(1024);

    boolean opened = false;
    try {
        long fileSize = channel.size();
        if (fileSize > Integer.MAX_VALUE) {
            // all lengths in the file format are 4 byte ints; casting would silently wrap
            throw new IOException("File size " + fileSize + " is too big [" + file + "]");
        }
        int size = (int) fileSize;

        if (size == 0) {
            if (maxFileSize < FILE_HEADER_SIZE) {
                throw new IllegalArgumentException("Invalid max file size " + maxFileSize);
            }
            fileHeader.putShort(FILE_MAGIC);
            fileHeader.putShort((short)0);
            fileHeader.putInt(this.maxFileSize = maxFileSize);
            fileHeader.putInt(length = FILE_HEADER_SIZE);
            // mark every bucket slot as unused
            for (int i = bucketPosition(0); i < FILE_HEADER_SIZE; i += BUCKET_RECORD_SIZE) {
                fileHeader.putInt(i, -1);
            }
            fileHeader.position(0);
            channel.write(fileHeader);
            channel.force(false);   // make sure file always has a valid header
            bucketIndex = -1;
        } else {
            // a single read is allowed to return fewer bytes than requested so keep going until EOF
            while (fileHeader.hasRemaining()) {
                if (channel.read(fileHeader) < 0) break;
            }
            if (fileHeader.hasRemaining()) {
                throw new IOException("File header too short [" + file + "]");
            }
            fileHeader.flip();

            short magic = fileHeader.getShort();
            if (magic != FILE_MAGIC) {
                throw new IOException("Invalid file magic 0x" + Integer.toHexString(magic & 0xFFFF) + " [" + file + "]");
            }
            fileHeader.position(fileHeader.position() + 2);     // skip reserved bytes

            this.maxFileSize = fileHeader.getInt();
            if (this.maxFileSize < FILE_HEADER_SIZE) {
                throw new IOException("Invalid max file size " + this.maxFileSize + " [" + file + "]");
            }

            length = fileHeader.getInt();
            if (length < FILE_HEADER_SIZE) {
                // truncating to this would chop off part of the header itself
                throw new IOException("Invalid checkpoint " + length + " [" + file + "]");
            }
            if (length > size) {
                throw new IOException("Checkpoint " + length + " exceeds file size " + size + " [" + file + "]");
            } else if (length < size) {
                channel.truncate(length);   // discard possibly corrupt portion
            }
            lastCheckpointLength = length;

            // the last used bucket is the one before the first slot still marked unused (-1)
            int used = 0;
            while (used < MAX_BUCKETS && fileHeader.getInt(bucketPosition(used)) != -1) {
                ++used;
            }
            bucketIndex = used - 1;
            if (bucketIndex >= 0) {
                fileHeader.position(bucketPosition(bucketIndex));
                bucketMessageId = fileHeader.getInt();
                bucketTimestamp = fileHeader.getLong();
                bucketCount = fileHeader.getInt();
            }
            // else: no buckets, leave the bucket fields at 0 exactly like a freshly created file instead of
            // reading them from the fixed part of the header
        }

        bytesPerBucket = (this.maxFileSize - FILE_HEADER_SIZE) / MAX_BUCKETS;
        opened = true;
    } finally {
        if (!opened) {
            try {
                raf.close();
            } catch (IOException ignored) {
                // the original failure is more useful to the caller than a secondary close failure
            }
        }
    }
}
/**
 * Returns the byte offset within the file header of histogram bucket record i.
 */
private int bucketPosition(int i) {
    int offsetIntoBuckets = i * BUCKET_RECORD_SIZE;
    return FILE_HEADER_FIXED_SIZE + offsetIntoBuckets;
}
/**
 * Returns the file that was passed to the constructor.
 */
public File getFile() {
    return this.file;
}
/**
 * Returns the id of the first message in this file as passed to the constructor.
 */
public long getFirstMessageId() {
    return this.firstMessageId;
}
/**
 * Returns the id the next appended message will get, assuming there is room for it: the first message id
 * of the file plus the number of bytes of message data currently in the file.
 */
public long getNextMessageId() {
    synchronized (channel) {
        long nextId = firstMessageId + length;
        nextId -= FILE_HEADER_SIZE;
        return nextId;
    }
}
/**
 * Append a message and return its id (position in the file plus the firstMessageId of the file). Returns
 * -1 if this file is too full for the message.
 *
 * <p>The record header and routing key are written first, then payloadSize bytes are transferred from the
 * payload channel. The in-memory length is only advanced once everything has been written so a failed
 * append does not become visible to readers. The histogram bucket data is updated in memory; it reaches
 * the file at the next checkpoint.</p>
 *
 * @param timestamp timestamp to store with the message
 * @param routingKey routing key, at most 255 characters (must not be null)
 * @param payload channel to read the payload from
 * @param payloadSize number of bytes to read from payload, must not be negative
 * @return the id of the new message or -1 if it does not fit in this file
 * @throws IllegalArgumentException if the routing key is too long or payloadSize is negative
 * @throws IOException if writing fails or the payload channel supplies fewer than payloadSize bytes
 */
public long append(long timestamp, String routingKey, ReadableByteChannel payload, int payloadSize) throws IOException {
    int n = routingKey.length();
    if (n > 255) throw new IllegalArgumentException("Routing key length " + n + " > 255 characters");
    if (payloadSize < 0) {
        // reject up front instead of failing in transferFrom after the header has already been written
        throw new IllegalArgumentException("Negative payload size " + payloadSize);
    }
    byte[] routingKeyBytes = routingKey.getBytes(UTF8);

    synchronized (channel) {
        // long arithmetic: with int a very large payloadSize wraps negative and slips past the check
        long required = (long) length + MESSAGE_HEADER_SIZE + routingKeyBytes.length + payloadSize;
        if (required > maxFileSize) return -1;

        header.clear();
        channel.position(length);
        header.put(TYPE_MESSAGE);
        header.putLong(timestamp);
        header.putShort((short)routingKeyBytes.length);
        header.putInt(payloadSize);
        header.put(routingKeyBytes);
        header.flip();

        int id = length - FILE_HEADER_SIZE;
        channel.write(header);
        // transferFrom does not move the channel position so it still points at the start of the payload
        long sz = channel.transferFrom(payload, channel.position(), payloadSize);
        if (sz != payloadSize) {
            throw new IOException("Only read " + sz + " bytes from payload channel instead of " + payloadSize);
        }
        length = (int) (channel.position() + payloadSize); // update after write so a partial write won't corrupt file

        // see if we need to start a new histogram bucket
        boolean noBucketYet = bucketIndex < 0;
        boolean bucketFull = id - bucketMessageId >= bytesPerBucket && bucketIndex < MAX_BUCKETS - 1;
        if (noBucketYet || bucketFull) {
            if (noBucketYet) {
                bucketIndex = 0;
            } else {
                putBucketDataInFileHeader();    // finished bucket goes into the header copy
                ++bucketIndex;
            }
            bucketMessageId = id;
            bucketTimestamp = timestamp;
            bucketCount = 1;
        } else {
            ++bucketCount;
        }

        mostRecentTimestamp = timestamp;
        return firstMessageId + id;
    }
}
/**
 * Copies the current bucket (first message id, timestamp, count) into its slot in the in-memory file
 * header. Nothing is written to the file here; that happens at the next checkpoint.
 */
private void putBucketDataInFileHeader() {
    int slot = bucketPosition(bucketIndex);
    fileHeader.position(slot);
    fileHeader.putInt(bucketMessageId)
            .putLong(bucketTimestamp)
            .putInt(bucketCount);
}
/**
 * How many bytes will the message take up in the file including headers?
 *
 * @param routingKey routing key of the message; null counts as an empty routing key
 * @param payloadSize size of the payload in bytes
 * @return length of the UTF-8 encoded routing key plus payloadSize plus the fixed record header size
 */
public static int getMessageSize(String routingKey, int payloadSize) {
    // use the shared Charset constant (as append does): no lookup by name and no impossible
    // UnsupportedEncodingException to handle
    int routingKeyBytes = routingKey == null ? 0 : routingKey.getBytes(UTF8).length;
    return routingKeyBytes + payloadSize + MESSAGE_HEADER_SIZE;
}
/**
 * Returns the current size of this file in bytes. This is the total length including the file header.
 */
public int length() {
    final int current;
    synchronized (channel) {
        current = length;
    }
    return current;
}
/**
 * Forces all message data to disk and, if the file has grown since the last checkpoint, writes the file
 * header (new checkpoint length plus histogram data) to the start of the file. The header write itself
 * is only forced to disk when force is true.
 *
 * @param force whether to sync the updated header to disk as well
 * @throws IOException if forcing or writing fails
 */
public void checkpoint(boolean force) throws IOException {
    synchronized (channel) {
        // force all writes to disk before updating checkpoint length so we know all data up to length is good
        channel.force(true);

        if (length == lastCheckpointLength) {
            return;     // nothing new since the last checkpoint
        }

        fileHeader.putInt(8, length);
        if (bucketIndex >= 0) {
            putBucketDataInFileHeader();
        }
        fileHeader.position(0);
        channel.position(0);
        channel.write(fileHeader);
        lastCheckpointLength = length;

        if (force) {
            channel.force(true);
        }
    }
}
/**
 * Registers one more user of this file by incrementing the usage counter. Every call to
 * {@link #closeIfUnused()} decrements the counter again and the file is closed for real once the counter
 * reaches zero.
 */
public void use() {
    synchronized (channel) {
        usageCounter += 1;
    }
}
/**
 * Decrements the usage counter (see {@link #use()}) and closes this file, after a forced checkpoint, when
 * nobody is using it any more. Does nothing if the file is already closed.
 *
 * @throws IOException if the checkpoint or the close fails
 */
public void closeIfUnused() throws IOException {
    synchronized (channel) {
        if (!isOpen()) {
            return;
        }
        usageCounter--;
        if (usageCounter > 0) {
            return;     // still in use by someone else
        }
        checkpoint(true);
        raf.close();
    }
}
/**
 * Closes this file regardless of the usage counter: writes a forced checkpoint and closes the underlying
 * file. Does nothing if the file is already closed.
 *
 * @throws IOException if the checkpoint or the close fails
 */
@Override
public void close() throws IOException {
    synchronized (channel) {
        if (!isOpen()) {
            return;
        }
        usageCounter -= 1;
        checkpoint(true);
        raf.close();
    }
}
/**
 * Reports whether the underlying channel is still open.
 */
public boolean isOpen() {
    final boolean open;
    synchronized (channel) {
        open = channel.isOpen();
    }
    return open;
}
/**
 * Describes this file by its path, first message id and current length.
 */
@Override
public String toString() {
    StringBuilder s = new StringBuilder();
    s.append("MessageFile[").append(file).append("] firstMessageId ").append(firstMessageId);
    s.append(" length ").append(length);
    return s.toString();
}
/**
 * Returns the timestamp of the message most recently appended to this file or 0 if it is empty. When the
 * timestamp is not known yet (still 0) but the file contains messages, it is recovered by reading every
 * message from the start of the last histogram bucket onwards and remembering the last timestamp seen.
 *
 * @throws IOException if reading the messages fails
 */
public long getMostRecentTimestamp() throws IOException {
    synchronized (channel) {
        boolean unknown = mostRecentTimestamp == 0;
        boolean hasMessages = length > FILE_HEADER_SIZE;
        if (unknown && hasMessages) {
            int lastBucket = getBucketCount() - 1;
            MessageCursor c = cursor(getBucket(lastBucket).getFirstMessageId());
            try {
                while (c.next()) {
                    mostRecentTimestamp = c.getTimestamp();
                }
            } finally {
                c.close();
            }
        }
        return mostRecentTimestamp;
    }
}
/**
 * Returns the number of messages in this file: the counts stored in the header for every completed
 * histogram bucket plus the count of the current bucket.
 */
public int getMessageCount() throws IOException {
    synchronized (channel) {
        int total = bucketCount;
        for (int i = bucketIndex - 1; i >= 0; i--) {
            // the message count is the last 4 bytes of the bucket record
            total += fileHeader.getInt(bucketPosition(i) + 12);
        }
        return total;
    }
}
/**
 * Builds a snapshot of the histogram of this file. The timeline has one entry for each bucket (completed
 * buckets come from the file header, the last one from the current in-memory bucket) followed by a
 * closing entry holding the next message id and the most recent timestamp. For a file without any
 * buckets the timeline has size 0 and only contains the closing entry.
 *
 * @throws IOException if the most recent timestamp has to be read from the file and that fails
 */
public Timeline getTimeline() throws IOException {
    synchronized (channel) {
        TimelineImpl ans = new TimelineImpl(firstMessageId, bucketIndex);
        fileHeader.position(bucketPosition(0));
        for (int i = 0; i < bucketIndex; i++) {
            ans.ids[i] = fileHeader.getInt();
            ans.timestamps[i] = fileHeader.getLong();
            ans.counts[i] = fileHeader.getInt();
        }
        // bucketIndex is -1 when the file has no buckets; indexing with it used to throw
        // ArrayIndexOutOfBoundsException
        if (bucketIndex >= 0) {
            ans.ids[bucketIndex] = bucketMessageId;
            ans.timestamps[bucketIndex] = bucketTimestamp;
            ans.counts[bucketIndex] = bucketCount;
        }
        int end = bucketIndex + 1;
        ans.ids[end] = (int)(getNextMessageId() - firstMessageId);
        ans.timestamps[end] = getMostRecentTimestamp();
        return ans;
    }
}
/**
 * Timeline backed by plain arrays. The ids and timestamps arrays have one more entry than counts: the
 * extra last entry marks the end of the final bucket so that sizes and durations can be computed as the
 * difference between neighbouring entries.
 */
static class TimelineImpl implements Timeline {

    private final long firstMessageId;
    private final int[] ids;
    private final int[] counts;
    private final long[] timestamps;

    /**
     * Allocates arrays for bucketIndex + 1 buckets plus the closing entry.
     */
    TimelineImpl(long firstMessageId, int bucketIndex) {
        int buckets = bucketIndex + 1;
        this.firstMessageId = firstMessageId;
        this.ids = new int[buckets + 1];
        this.timestamps = new long[buckets + 1];
        this.counts = new int[buckets];
    }

    /** Number of entries excluding the closing entry. */
    public int size() {
        return ids.length - 1;
    }

    /** File-relative id of entry i offset by the first message id of the file. */
    public long getMessageId(int i) {
        return firstMessageId + ids[i];
    }

    /** Timestamp of entry i. */
    public long getTimestamp(int i) {
        return timestamps[i];
    }

    /** Difference between the id of the next entry and the id of entry i. */
    public int getBytes(int i) {
        int from = ids[i];
        int to = ids[i + 1];
        return to - from;
    }

    /** Difference between the timestamp of the next entry and the timestamp of entry i. */
    public long getMillis(int i) {
        long from = timestamps[i];
        long to = timestamps[i + 1];
        return to - from;
    }

    /** Message count of entry i. */
    public int getCount(int i) {
        return counts[i];
    }
}
/**
 * Returns the number of histogram buckets (the index of the current bucket plus one, so 0 when there are
 * no buckets).
 */
public int getBucketCount() {
    synchronized (channel) {
        int lastIndex = bucketIndex;
        return lastIndex + 1;
    }
}
/**
 * Returns a copy of the data for the histogram bucket at index i. The current (last) bucket is taken from
 * the in-memory bucket fields, earlier buckets are read from the file header. The size of a bucket is the
 * distance from its first message id to the first message id of the following bucket (or to the end of
 * the message data for the last bucket).
 *
 * @throws IllegalArgumentException if i is not between 0 and the index of the last bucket
 */
public Bucket getBucket(int i) {
    synchronized (channel) {
        boolean inRange = i >= 0 && i <= bucketIndex;
        if (!inRange) {
            throw new IllegalArgumentException("index " + i + " out of range (0 to " + bucketIndex + ")");
        }

        if (i == bucketIndex) {
            int dataEnd = length - FILE_HEADER_SIZE;
            return new Bucket(firstMessageId + bucketMessageId, bucketTimestamp, bucketCount,
                    dataEnd - bucketMessageId);
        }

        // relative reads: id, timestamp and count of bucket i, which leaves the buffer positioned at the
        // id of bucket i + 1
        fileHeader.position(bucketPosition(i));
        int id = fileHeader.getInt();
        long bucketTs = fileHeader.getLong();
        int count = fileHeader.getInt();
        int nextId;
        if (i == bucketIndex - 1) {
            nextId = bucketMessageId;   // following bucket is the current one
        } else {
            nextId = fileHeader.getInt();
        }
        return new Bucket(firstMessageId + id, bucketTs, count, nextId - id);
    }
}
/**
 * Returns the index of the histogram bucket containing messageId or -1 if messageId is before the first
 * message id or this file is empty. If messageId is after the last message the last bucket index is
 * returned. Buckets before the current one are located with a binary search over the first message ids
 * stored in the file header.
 */
public int findBucket(long messageId) throws IOException {
    synchronized (channel) {
        int key = (int)(messageId - firstMessageId);
        if (key >= bucketMessageId) {
            return bucketIndex; // last bucket
        }

        int lo = 0;
        int hi = bucketIndex - 1;
        while (lo <= hi) {
            int mid = (lo + hi) >>> 1;
            int firstId = fileHeader.getInt(bucketPosition(mid));
            if (firstId == key) {
                return mid;
            }
            if (firstId < key) {
                lo = mid + 1;
            } else {
                hi = mid - 1;
            }
        }
        // lo is the first bucket starting after key so the bucket before it contains key
        return lo - 1;
    }
}
/**
 * Returns the index of the histogram bucket containing timestamp or -1 if timestamp is before the first
 * message or this file is empty. If timestamp is after the last message the last bucket index is returned.
 * Buckets before the current one are located with a binary search over the timestamps stored in the
 * file header.
 */
public int findBucketByTimestamp(long timestamp) throws IOException {
    synchronized (channel) {
        if (timestamp >= bucketTimestamp) {
            return bucketIndex; // last bucket
        }

        int lo = 0;
        int hi = bucketIndex - 1;
        while (lo <= hi) {
            int mid = (lo + hi) >>> 1;
            // the timestamp follows the 4 byte first message id in the bucket record
            long bucketTs = fileHeader.getLong(bucketPosition(mid) + 4);
            if (bucketTs == timestamp) {
                return mid;
            }
            if (bucketTs < timestamp) {
                lo = mid + 1;
            } else {
                hi = mid - 1;
            }
        }
        return lo - 1;
    }
}
/**
 * Immutable copy of the data of one histogram bucket.
 */
public static class Bucket {

    private final long firstMessageId;
    private final long timestamp;
    private final int count;
    private final int size;

    public Bucket(long firstMessageId, long timestamp, int count, int size) {
        this.firstMessageId = firstMessageId;
        this.timestamp = timestamp;
        this.count = count;
        this.size = size;
    }

    /**
     * Get the ID of the first message in the bucket.
     */
    public long getFirstMessageId() {
        return this.firstMessageId;
    }

    /**
     * Get the timestamp of the first message in the bucket.
     */
    public long getTimestamp() {
        return this.timestamp;
    }

    /**
     * Get the number of messages in the bucket.
     */
    public int getCount() {
        return this.count;
    }

    /**
     * Get the number of bytes of messages in the bucket.
     */
    public int getSize() {
        return this.size;
    }

    @Override
    public String toString() {
        StringBuilder s = new StringBuilder("Bucket{");
        s.append("firstMessageId=").append(firstMessageId);
        s.append(", timestamp=").append(timestamp);
        s.append(", count=").append(count);
        s.append(", size=").append(size);
        return s.append('}').toString();
    }
}
/**
 * Create a cursor reading data from messageId onwards. To read the oldest message appearing in the file
 * use {@link #getFirstMessageId()} as the message ID. To read the newest use {@link #getNextMessageId()}.
 * If messageId is 'between' messages it is advanced to the next message. The cursor is not thread safe.
 *
 * <p>The cursor starts at the first message of the histogram bucket containing messageId and messages
 * before messageId are skipped.</p>
 *
 * @throws IllegalArgumentException if messageId is before the first message id or after the next message id
 */
public MessageCursor cursor(long messageId) throws IOException {
    long nextMessageId = getNextMessageId();
    if (messageId < firstMessageId || messageId > nextMessageId) {
        throw new IllegalArgumentException("messageId " + messageId + " not in " + this);
    }
    if (messageId == nextMessageId) {
        return new Cursor(messageId); // at EOF
    }

    long bucketStart = getBucket(findBucket(messageId)).getFirstMessageId();
    Cursor c = new Cursor(bucketStart);
    if (bucketStart < messageId) {
        // skip messages until we get to the one we want
        while (c.next()) {
            if (c.getNextId() >= messageId) {
                break;
            }
        }
    }
    return c;
}
/**
 * Create a cursor reading data from timestamp onwards. If timestamp is before the first message then the
 * cursor reads starting at the first message. If timestamp is past the last message then the cursor will
 * return false until more messages appear in the file.
 */
public MessageCursor cursorByTimestamp(long timestamp) throws IOException {
    int i = findBucketByTimestamp(timestamp);
    if (i < 0) {
        return new Cursor(firstMessageId);
    }

    // the first message with timestamp >= the time we are looking for may be in a previous bucket because
    // the bucket timestamp resolution is only ms so go back until we get a change in time .. that way we
    // are sure to find it
    Bucket b = getBucket(i);
    while (b.getTimestamp() == timestamp && i > 0) {
        b = getBucket(--i);
    }

    Cursor c = new Cursor(getBucket(i).getFirstMessageId());
    // skip messages until we get one >= timestamp, then step back so the caller's next() returns it
    while (c.next()) {
        if (c.getTimestamp() >= timestamp) {
            c.unget();
            break;
        }
    }
    return c;
}
/**
 * Iterates over messages in the file. Not thread safe.
 *
 * <p>{@link #next()} reads the fixed record header of a message. The routing key and payload are only read
 * when asked for via {@link #getRoutingKey()} and {@link #getPayload()}; whatever was not read is skipped
 * by the following call to next().</p>
 */
private class Cursor implements MessageCursor {

    private final ChannelInput input;
    private final byte[] routingKeyBuf = new byte[1024];

    private long id;
    private long timestamp;
    // routing key / payload bytes of the current message not consumed yet; set to -1 by the getters once read
    private int routingKeySize;
    private int payloadSize;
    // file position just past the current message
    private int nextPosition;

    /**
     * Creates a cursor positioned "before" the message with messageId.
     */
    public Cursor(long messageId) throws IOException {
        input = new ChannelInput(channel, messageIdToPosition(messageId), 8192);
    }

    private int messageIdToPosition(long messageId) {
        return (int)(messageId - firstMessageId) + FILE_HEADER_SIZE;
    }

    /**
     * Moves back to the start of the current message so that the following call to next() reads it again.
     */
    private void unget() {
        routingKeySize = payloadSize = -1;
        input.position(messageIdToPosition(id));
    }

    /**
     * Advance to the next message or return false if there are no more messages. The cursor initially starts
     * "before" the next message.
     *
     * @throws IOException if the record at the current position is not a valid message
     */
    public boolean next() throws IOException {
        if (routingKeySize > 0) {
            input.skip(routingKeySize); // routing key was never read
            routingKeySize = -1;
        }
        if (payloadSize > 0) {
            input.skip(payloadSize);    // payload was never read
            payloadSize = -1;
        }

        int len = length();
        if (input.position() >= len) return false;

        id = firstMessageId + input.position() - FILE_HEADER_SIZE;

        byte type = input.readByte();
        if (type != TYPE_MESSAGE) {
            throw new IOException("Unexpected message type 0x" + Integer.toHexString(type & 0xFF) + " at " +
                    (input.position() - 1) + " in " + MessageFile.this);
        }

        timestamp = input.readLong();

        routingKeySize = input.readShort();
        if (routingKeySize < 0 || routingKeySize >= routingKeyBuf.length) {
            throw new IOException("Invalid routing key size " + routingKeySize + " at " +
                    (input.position() - 2) + " in " + MessageFile.this);
        }

        payloadSize = input.readInt();
        if (payloadSize < 0) {
            throw new IOException("Negative payload size " + payloadSize + " at " + (input.position() - 4) +
                    " in " + MessageFile.this);
        }

        // long arithmetic: with int a huge (corrupt) payload size wraps negative and passes the EOF check
        long end = (long) input.position() + routingKeySize + payloadSize;
        if (end > len) {
            throw new IOException("Payload size " + payloadSize + " at " + (input.position() - 4) +
                    " extends beyond EOF " + len + " in " + MessageFile.this);
        }
        nextPosition = (int) end;

        return true;
    }

    /**
     * Not supported by this cursor.
     */
    @Override
    public boolean next(int timeoutMs) throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Id of the current message.
     */
    public long getId() {
        return id;
    }

    /**
     * Timestamp of the current message.
     */
    public long getTimestamp() {
        return timestamp;
    }

    /**
     * Reads and returns the routing key of the current message. Must be called before
     * {@link #getPayload()} and at most once per message.
     *
     * @throws IllegalStateException if the routing key has already been read or skipped
     */
    public String getRoutingKey() throws IOException {
        if (routingKeySize < 0) throw new IllegalStateException("Routing key already read");
        input.read(routingKeyBuf, 0, routingKeySize);
        String ans = new String(routingKeyBuf, 0, routingKeySize, UTF8);
        routingKeySize = -1;
        return ans;
    }

    /**
     * Payload size of the current message in bytes, or -1 once the payload has been read.
     */
    public int getPayloadSize() {
        return payloadSize;
    }

    /**
     * Reads and returns the payload of the current message, skipping the routing key first if it has not
     * been read. Can be called at most once per message.
     *
     * @throws IllegalStateException if the payload has already been read
     */
    public byte[] getPayload() throws IOException {
        if (payloadSize < 0) throw new IllegalStateException("Payload already read");
        if (routingKeySize > 0) {
            input.skip(routingKeySize); // routing key was never read
            routingKeySize = -1;
        }
        byte[] buf = new byte[payloadSize];
        input.read(buf, 0, payloadSize);
        payloadSize = -1;
        return buf;
    }

    /**
     * Id of the message following the current one.
     */
    public long getNextId() {
        return firstMessageId + nextPosition - FILE_HEADER_SIZE;
    }

    @Override
    public void close() throws IOException {
        // nothing to do
    }
}
}