/*
* Copyright 2015 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.atomix.copycat.server.storage;
import java.util.zip.CRC32;
import java.util.zip.Checksum;
import io.atomix.catalyst.buffer.*;
import io.atomix.catalyst.serializer.Serializer;
import io.atomix.catalyst.util.Assert;
import io.atomix.copycat.server.storage.entry.Entry;
import io.atomix.copycat.server.storage.index.OffsetIndex;
import io.atomix.copycat.server.storage.util.OffsetPredicate;
import io.atomix.copycat.server.storage.util.TermIndex;
import static io.atomix.catalyst.buffer.Bytes.BOOLEAN;
import static io.atomix.catalyst.buffer.Bytes.INTEGER;
import static io.atomix.catalyst.buffer.Bytes.LONG;
/**
* Stores a sequence of entries with monotonically increasing indexes in a {@link Buffer}.
* <p>
* Segments are individual file or memory based groups of sequential entries. Each segment has a fixed capacity
* in terms of either number of entries or size in bytes.
* <p>
* The {@link SegmentDescriptor} describes the metadata for the segment, including the starting {@code index} of
* the segment and various configuration options. The descriptor is persisted in 48 bytes at the head of the segment.
* <p>
* Internally, each segment maintains an in-memory index of entries. The index stores the offset and position of
* each entry within the segment's internal {@link io.atomix.catalyst.buffer.Buffer}. For entries that are appended
* to the log sequentially, the index has an O(1) lookup time. For instances where entries in a segment have been
* skipped (due to log compaction), the lookup time is O(log n) due to binary search. However, due to the nature of
* the Raft consensus algorithm, readers should typically benefit from O(1) lookups.
* <p>
* When a segment is constructed, the segment will attempt to rebuild its index from the underlying segment
* {@link Buffer}. This is done by reading a 32-bit length and 64-bit offset for each entry. Once the segment
* has been built, new entries will be {@link #append(Entry) appended} at the end of the segment.
* <p>
* Additionally, segments are responsible for keeping track of entries that have been {@link #release(long) released}.
* Entry liveness is tracked in an internal {@link io.atomix.catalyst.buffer.util.BitArray} with a size equal
* to the segment's entry {@link #count()}.
* <p>
* An entry in the log is written in binary format. The binary format of an entry is as follows:
* <ul>
* <li>Required 32-bit signed entry length (the number of bytes that follow the length field)</li>
* <li>Required 32-bit unsigned CRC32 checksum of the serialized entry bytes</li>
* <li>Required 64-bit signed offset</li>
* <li>Required 8-bit term flag</li>
* <li>Optional 64-bit term (present only when the term flag is set)</li>
* <li>Serialized entry bytes</li>
* </ul>
*
* @author <a href="http://github.com/kuujo">Jordan Halterman</a>
*/
public class Segment implements AutoCloseable {
  private final SegmentFile file;
  private final SegmentDescriptor descriptor;
  private final Serializer serializer;
  private final Buffer buffer;
  // Scratch buffer used by buildIndex(), append() and get() to stage the bytes of a single entry.
  // NOTE(review): only get() is synchronized; append() uses this buffer without holding the lock.
  private final HeapBuffer memory = HeapBuffer.allocate();
  private final OffsetIndex offsetIndex;
  private final OffsetPredicate offsetPredicate;
  private final TermIndex termIndex = new TermIndex();
  private final SegmentManager manager;
  // Number of indexes skipped after the last written entry; reset to zero by append().
  private long skip = 0;
  private boolean open = true;

  /**
   * Creates a segment over the given buffer and rebuilds the offset and term indexes from the
   * entries already present in the buffer.
   *
   * @throws NullPointerException if any argument is null
   */
  Segment(SegmentFile file, Buffer buffer, SegmentDescriptor descriptor, OffsetIndex offsetIndex, OffsetPredicate offsetPredicate, Serializer serializer, SegmentManager manager) {
    this.serializer = Assert.notNull(serializer, "serializer");
    this.file = Assert.notNull(file, "file");
    this.buffer = Assert.notNull(buffer, "buffer");
    this.descriptor = Assert.notNull(descriptor, "descriptor");
    this.offsetIndex = Assert.notNull(offsetIndex, "offsetIndex");
    this.offsetPredicate = Assert.notNull(offsetPredicate, "offsetPredicate");
    this.manager = Assert.notNull(manager, "manager");
    buildIndex();
  }

  /**
   * Builds the index from the segment bytes.
   * <p>
   * Entries are read sequentially from the current buffer position until a zero length is read or an
   * entry fails its checksum. The buffer is left positioned at the start of the first entry that was
   * not indexed, so subsequent appends overwrite from there.
   */
  private void buildIndex() {
    // Mark the current buffer position and remember it as the start of the first entry.
    long position = buffer.mark().position();

    // A single checksum instance is reused for every entry and reset before each use.
    Checksum crc32 = new CRC32();

    // Read the first entry length.
    int length = buffer.readInt();

    // While the length is non-zero...
    while (length != 0) {
      // Read the full entry into memory and flip the in-memory buffer for reading.
      buffer.read(memory.clear().limit(length));
      memory.flip();

      // Read the unsigned 32-bit entry checksum.
      long checksum = memory.readUnsignedInt();

      // Read the 64-bit entry offset.
      long offset = memory.readLong();

      // If the term flag is set on the entry, read the term.
      boolean hasTerm = memory.readBoolean();
      long term = hasTerm ? memory.readLong() : 0;

      // The serialized entry bytes start after the header and run to the end of the entry.
      int entryPosition = (int) memory.position();
      int entryLength = length - entryPosition;

      // Compute the checksum for the entry bytes.
      crc32.reset();
      crc32.update(memory.array(), entryPosition, entryLength);

      // Stop at the first entry whose stored checksum does not match the computed checksum.
      // The mark still points at the length field of this entry.
      if (checksum != crc32.getValue()) {
        break;
      }

      // If the entry contained a term, index the term.
      if (hasTerm) {
        termIndex.index(offset, term);
      }

      // Index the entry offset.
      offsetIndex.index(offset, position);

      // Store the next entry start position.
      position = buffer.position();

      // Mark the start of the next entry and read its length.
      length = buffer.mark().readInt();
    }

    // Reset the buffer back to the start of the next entry.
    buffer.reset();
  }

  /**
   * Returns the segment file.
   *
   * @return The segment file.
   */
  public SegmentFile file() {
    return file;
  }

  /**
   * Returns the {@link SegmentDescriptor} for the segment.
   * <p>
   * The segment descriptor is stored in {@link SegmentDescriptor#BYTES} bytes at the head of the segment. The descriptor
   * defines essential information about the segment, including its position in the complete {@link Log} and its {@code index}.
   *
   * @return The segment descriptor stored at the head of the segment.
   */
  public SegmentDescriptor descriptor() {
    return descriptor;
  }

  /**
   * Returns a boolean value indicating whether the segment is open.
   *
   * @return Indicates whether the segment is open.
   */
  public boolean isOpen() {
    return open;
  }

  /**
   * Returns a boolean value indicating whether the segment is empty.
   * <p>
   * The segment is considered empty if no entries have been written to the segment and no indexes in the
   * segment have been {@link #skip(long) skipped}.
   *
   * @return Indicates whether the segment is empty.
   */
  public boolean isEmpty() {
    return offsetIndex.size() > 0 ? offsetIndex.lastOffset() + 1 + skip == 0 : skip == 0;
  }

  /**
   * Returns a boolean value indicating whether the segment has been compacted.
   * <p>
   * The segment is considered compacted if its {@link SegmentDescriptor#version()} is greater than {@code 1}.
   *
   * @return Indicates whether the segment has been compacted.
   */
  public boolean isCompacted() {
    return descriptor.version() > 1;
  }

  /**
   * Returns a boolean value indicating whether the segment is full.
   * <p>
   * The segment is considered full if one of the following conditions is met:
   * <ul>
   * <li>{@link #size()} is greater than or equal to {@link SegmentDescriptor#maxSegmentSize()}</li>
   * <li>{@link #count()} is greater than or equal to {@link SegmentDescriptor#maxEntries()}</li>
   * </ul>
   *
   * @return Indicates whether the segment is full.
   */
  public boolean isFull() {
    return size() >= descriptor.maxSegmentSize()
      || offsetIndex.size() >= descriptor.maxEntries();
  }

  /**
   * Returns the total size of the segment in bytes.
   * <p>
   * The size is computed as the underlying buffer's offset plus its current position.
   *
   * @return The size of the segment in bytes.
   */
  public long size() {
    return buffer.offset() + buffer.position();
  }

  /**
   * Returns the current range of the segment.
   * <p>
   * The length includes entries that may have been {@link #skip(long) skipped} at the end of the segment.
   *
   * @return The current range of the segment, or {@code 0} if the segment is empty.
   */
  public long length() {
    return !isEmpty() ? offsetIndex.lastOffset() + 1 + skip : 0;
  }

  /**
   * Returns the count of all entries in the segment.
   * <p>
   * The count includes only entries that are physically present in the segment. Entries that have been compacted
   * out of the segment are not counted towards the count, nor are {@link #skip(long) skipped} entries.
   *
   * @return The count of all entries in the segment.
   */
  public int count() {
    return offsetIndex.size();
  }

  /**
   * Returns the base index of the segment.
   * <p>
   * The base index is equivalent to the segment's {@link #firstIndex()} if the segment is not {@link #isEmpty() empty}.
   *
   * @return The base index of the segment.
   */
  long index() {
    return descriptor.index();
  }

  /**
   * Returns the index of the first entry in the segment.
   * <p>
   * If the segment is empty, {@code 0} will be returned regardless of the segment's base index.
   *
   * @return The index of the first entry in the segment or {@code 0} if the segment is empty.
   * @throws IllegalStateException if the segment is not open
   */
  public long firstIndex() {
    assertSegmentOpen();
    return !isEmpty() ? descriptor.index() : 0;
  }

  /**
   * Returns the index of the last entry in the segment.
   * <p>
   * The last index includes any indexes {@link #skip(long) skipped} after the last written entry.
   *
   * @return The index of the last entry in the segment, or the segment's base index minus one if the
   *         segment is empty.
   * @throws IllegalStateException if the segment is not open
   */
  public long lastIndex() {
    assertSegmentOpen();
    return !isEmpty() ? offsetIndex.lastOffset() + descriptor.index() + skip : descriptor.index() - 1;
  }

  /**
   * Returns the next index in the segment.
   *
   * @return The next index in the segment.
   * @throws IllegalStateException if the segment is not empty and is not open
   */
  public long nextIndex() {
    return !isEmpty() ? lastIndex() + 1 : descriptor.index() + skip;
  }

  /**
   * Returns the offset of the given index within the segment.
   * <p>
   * The offset reflects the zero-based offset of the given {@code index} in the segment when missing/compacted
   * entries are taken into account. For instance, if a segment contains entries at indexes {@code {1, 3}}, the
   * {@code offset} of index {@code 1} will be {@code 0} and index {@code 3} will be {@code 1}.
   *
   * @param index The index to check.
   * @return The offset of the given index.
   */
  public long offset(long index) {
    return offsetIndex.find(relativeOffset(index));
  }

  /**
   * Returns the offset for the given index, relative to the segment's base index.
   */
  private long relativeOffset(long index) {
    return index - descriptor.index();
  }

  /**
   * Checks the range of the given index.
   *
   * @throws IndexOutOfBoundsException if the {@code index} is invalid for the segment
   */
  private void checkRange(long index) {
    Assert.indexNot(isEmpty(), "segment is empty");
    Assert.indexNot(index < firstIndex(), index + " is less than the first index in the segment");
    Assert.indexNot(index > lastIndex(), index + " is greater than the last index in the segment");
  }

  /**
   * Appends an entry to the segment.
   * <p>
   * The entry is serialized into the in-memory scratch buffer behind a header consisting of the checksum,
   * offset, term flag and (only when the term differs from the last indexed term) the term. The total length
   * is then written to the segment buffer followed by the staged bytes.
   *
   * @param entry The entry to append; its index must equal {@link #nextIndex()}.
   * @return The index at which the entry was written.
   * @throws NullPointerException if {@code entry} is null
   * @throws IllegalStateException if the segment is full
   * @throws IndexOutOfBoundsException if the {@code entry} index does not match the next index
   * @throws IllegalArgumentException if the entry term is not positive or is less than the last term in the
   *         segment (raised via {@code Assert.arg})
   */
  public long append(Entry entry) {
    Assert.notNull(entry, "entry");
    Assert.stateNot(isFull(), "segment is full");

    long index = nextIndex();
    Assert.index(index == entry.getIndex(), "inconsistent index: %s", entry.getIndex());

    // Calculate the offset of the entry.
    long offset = relativeOffset(index);

    // Get the term from the entry.
    long term = entry.getTerm();

    // Get the highest term in the index.
    long lastTerm = termIndex.term();

    // The entry term must be positive and >= the last term in the segment.
    Assert.arg(term > 0 && term >= lastTerm, "term must be monotonically increasing");

    // Record the starting position of the new entry.
    long position = buffer.position();

    // Determine whether to skip writing the term to the segment.
    boolean skipTerm = term == lastTerm;

    // Calculate the length of the entry header bytes: checksum + offset + term flag (+ optional term).
    int headerLength = INTEGER + LONG + BOOLEAN + (skipTerm ? 0 : LONG);

    // Clear the memory and skip the header so the entry is serialized directly behind it.
    memory.clear().skip(headerLength);

    // Serialize the object into the in-memory buffer.
    serializer.writeObject(entry, memory);

    // Flip the in-memory buffer indexes.
    memory.flip();

    // The total length of the entry is the in-memory buffer limit.
    int totalLength = (int) memory.limit();

    // Calculate the length of the serialized bytes based on the in-memory buffer limit and header length.
    int entryLength = totalLength - headerLength;

    // Set the entry size.
    entry.setSize(totalLength);

    // Compute the checksum over the serialized entry bytes only (the header is excluded).
    Checksum crc32 = new CRC32();
    crc32.update(memory.array(), headerLength, entryLength);
    long checksum = crc32.getValue();

    // Rewind the in-memory buffer and write the checksum and offset into the header.
    memory.rewind()
      .writeUnsignedInt(checksum)
      .writeLong(offset);

    // Write the term flag, followed by the term itself if it differs from the last indexed term.
    if (skipTerm) {
      memory.writeBoolean(false);
    } else {
      memory.writeBoolean(true).writeLong(term);
    }

    // Write the entry length and entry to the segment.
    buffer.writeInt(totalLength)
      .write(memory.rewind());

    // Index the offset and position.
    offsetIndex.index(offset, position);

    // If the entry term is greater than the last indexed term, index the term.
    if (term > lastTerm) {
      termIndex.index(offset, term);
    }

    // Reset skip to zero since we wrote a new entry.
    skip = 0;

    return index;
  }

  /**
   * Reads the term for the entry at the given index.
   *
   * @param index The index for which to read the term.
   * @return The term for the given index, as looked up in the segment's term index.
   * @throws IllegalStateException if the segment is not open
   * @throws IndexOutOfBoundsException if the segment is empty or {@code index} is outside the segment's range
   */
  public long term(long index) {
    assertSegmentOpen();
    checkRange(index);

    // Get the offset of the index within this segment.
    long offset = relativeOffset(index);

    // Look up the term for the offset in the term index.
    return termIndex.lookup(offset);
  }

  /**
   * Reads the entry at the given index.
   *
   * @param index The index from which to read the entry.
   * @return The entry at the given index, or {@code null} if the offset index holds no position for the
   *         entry or the stored checksum does not match the entry bytes.
   * @throws IllegalStateException if the segment is not open or {@code index} is inconsistent with the entry
   * @throws IndexOutOfBoundsException if the segment is empty or {@code index} is outside the segment's range
   */
  public synchronized <T extends Entry> T get(long index) {
    assertSegmentOpen();
    checkRange(index);

    // Get the offset of the index within this segment.
    long offset = relativeOffset(index);

    // Get the start position of the entry from the memory index.
    long position = offsetIndex.position(offset);

    // If the index contained the entry, read the entry from the buffer.
    if (position != -1) {

      // Read the length of the entry.
      int length = buffer.readInt(position);

      // Read the entry (everything after the length field) into memory.
      try (Buffer slice = buffer.slice(position + INTEGER, length)) {
        slice.read(memory.clear().limit(length));
        memory.flip();
      }

      // Read the unsigned 32-bit checksum of the entry.
      long checksum = memory.readUnsignedInt();

      // Verify that the entry at the given offset matches.
      long entryOffset = memory.readLong();
      Assert.state(entryOffset == offset, "inconsistent index: %s", index);

      // Skip the term if necessary; the term is resolved through the term index below.
      if (memory.readBoolean()) {
        memory.skip(LONG);
      }

      // Calculate the entry position and length.
      int entryPosition = (int) memory.position();
      int entryLength = length - entryPosition;

      // Compute the checksum for the entry bytes.
      Checksum crc32 = new CRC32();
      crc32.update(memory.array(), entryPosition, entryLength);

      // If the stored checksum equals the computed checksum, return the entry.
      if (checksum == crc32.getValue()) {
        T entry = serializer.readObject(memory);
        entry.setIndex(index).setTerm(termIndex.lookup(offset)).setSize(length);
        return entry;
      }
    }
    return null;
  }

  /**
   * Returns a boolean value indicating whether the given index is within the range of the segment.
   *
   * @param index The index to check.
   * @return Indicates whether the given index is within the range of the segment.
   * @throws IllegalStateException if the segment is not open
   */
  boolean validIndex(long index) {
    assertSegmentOpen();
    return !isEmpty() && index >= firstIndex() && index <= lastIndex();
  }

  /**
   * Returns a boolean value indicating whether the segment's offset index contains the given index.
   *
   * @param index The index to check.
   * @return {@code true} if the index is within the segment's range and present in the offset index.
   * @throws IllegalStateException if the segment is not open
   */
  public boolean contains(long index) {
    assertSegmentOpen();

    if (!validIndex(index))
      return false;

    // Check the memory index first for performance reasons.
    long offset = relativeOffset(index);
    return offsetIndex.contains(offset);
  }

  /**
   * Releases an entry from the segment.
   *
   * @param index The index of the entry to release.
   * @return Indicates whether the entry was newly released from the segment; {@code false} if the index
   *         could not be found in the offset index.
   * @throws IllegalStateException if the segment is not open
   */
  public boolean release(long index) {
    assertSegmentOpen();
    long offset = offsetIndex.find(relativeOffset(index));
    return offset != -1 && offsetPredicate.release(offset);
  }

  /**
   * Returns a boolean value indicating whether the entry at the given index is live, as decided by the
   * segment's {@link OffsetPredicate}.
   * <p>
   * NOTE(review): unlike {@link #release(long)}, this method does not check for a {@code -1} result from
   * the offset index before consulting the predicate - confirm the predicate tolerates that value.
   *
   * @param index The index of the entry to check.
   * @return The result of testing the entry's offset against the segment's offset predicate.
   * @throws IllegalStateException if the segment is not open
   */
  public boolean isLive(long index) {
    assertSegmentOpen();
    return offsetPredicate.test(offsetIndex.find(relativeOffset(index)));
  }

  /**
   * Returns the number of entries in the segment that have been released.
   *
   * @return The number of entries in the segment that have been released.
   * @throws IllegalStateException if the segment is not open
   */
  public long releaseCount() {
    assertSegmentOpen();
    return offsetPredicate.count();
  }

  /**
   * Returns a predicate for live offsets in the segment.
   *
   * @return A predicate for live offsets in the segment.
   */
  public OffsetPredicate offsetPredicate() {
    return offsetPredicate;
  }

  /**
   * Skips a number of entries in the segment.
   * <p>
   * Skipped entries advance {@link #lastIndex()} and {@link #nextIndex()} without writing any bytes.
   *
   * @param entries The number of entries to skip.
   * @return The segment.
   * @throws IllegalStateException if the segment is not open
   */
  public Segment skip(long entries) {
    assertSegmentOpen();
    this.skip += entries;
    return this;
  }

  /**
   * Truncates entries after the given index.
   *
   * @param index The index after which to remove entries.
   * @return The segment.
   * @throws IllegalStateException if the segment is not open
   * @throws IndexOutOfBoundsException if {@code index} is less than the manager's commit index
   */
  public Segment truncate(long index) {
    assertSegmentOpen();
    Assert.index(index >= manager.commitIndex(), "cannot truncate committed index");

    long offset = relativeOffset(index);
    long lastOffset = offsetIndex.lastOffset();

    // Reduce the skipped count by the distance between the last written offset and the truncation point.
    long diff = Math.abs(lastOffset - offset);
    skip = Math.max(skip - diff, 0);

    // Only written entries beyond the truncation point need to be removed from the buffer and indexes.
    if (offset < lastOffset) {
      long position = offsetIndex.truncate(offset);
      buffer.position(position)
        .zero(position)
        .flush();
      termIndex.truncate(offset);
    }
    return this;
  }

  /**
   * Flushes the segment buffers to disk.
   *
   * @return The segment.
   */
  public Segment flush() {
    buffer.flush();
    offsetIndex.flush();
    return this;
  }

  /**
   * Closes the segment's buffer, offset index, offset predicate and descriptor and marks the segment closed.
   * <p>
   * Every resource is closed even if closing an earlier one throws, and the segment is always marked as
   * no longer open.
   */
  @Override
  public void close() {
    try {
      buffer.close();
    } finally {
      try {
        offsetIndex.close();
      } finally {
        try {
          offsetPredicate.close();
        } finally {
          try {
            descriptor.close();
          } finally {
            open = false;
          }
        }
      }
    }
  }

  /**
   * Deletes the segment.
   * <p>
   * If the segment buffer is a {@link SlicedBuffer}, its root buffer is used. File and mapped buffers are
   * deleted; other buffer types are left untouched. The offset index is always deleted.
   */
  public void delete() {
    Buffer buffer = this.buffer instanceof SlicedBuffer ? ((SlicedBuffer) this.buffer).root() : this.buffer;
    if (buffer instanceof FileBuffer) {
      ((FileBuffer) buffer).delete();
    } else if (buffer instanceof MappedBuffer) {
      ((MappedBuffer) buffer).delete();
    }

    offsetIndex.delete();
  }

  @Override
  public String toString() {
    // Derive the first index inline instead of calling firstIndex(): that method asserts the segment is
    // open, which would make toString() throw once the segment has been closed (e.g. while logging).
    long first = !isEmpty() ? descriptor.index() : 0;
    return String.format("Segment[id=%d, version=%d, index=%d, length=%d]", descriptor.id(), descriptor.version(), first, length());
  }

  /**
   * Asserts that the segment is open.
   *
   * @throws IllegalStateException if the segment is not open
   */
  private void assertSegmentOpen() {
    Assert.state(isOpen(), "segment not open");
  }
}