/**
* Copyright 2016 LinkedIn Corp. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/
package com.github.ambry.store;
import com.github.ambry.config.StoreConfig;
import com.github.ambry.utils.ByteBufferInputStream;
import com.github.ambry.utils.CrcInputStream;
import com.github.ambry.utils.CrcOutputStream;
import com.github.ambry.utils.FilterFactory;
import com.github.ambry.utils.IFilter;
import com.github.ambry.utils.Pair;
import com.github.ambry.utils.Time;
import com.github.ambry.utils.Utils;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Represents a segment of an index. The segment is represented by a
* start offset and an end offset and has the keys sorted. The segment
* can either be read only and memory mapped or writable and in memory.
* The segment uses a bloom filter to optimize reads from disk. If the
* index is read only, a key is searched by doing a binary search on
* the memory mapped file. If the index is in memory, a normal map
* lookup is performed to find key.
*/
class IndexSegment {
  static final String INDEX_SEGMENT_FILE_NAME_SUFFIX = "index";
  static final String BLOOM_FILE_NAME_SUFFIX = "bloom";

  // Sentinels meaning "size not known yet"; addEntry() replaces them with the size of the entry being added.
  private final static int KEY_SIZE_INVALID_VALUE = -1;
  private final static int VALUE_SIZE_INVALID_VALUE = -1;

  // Lengths (in bytes) of the fixed size fields of a persisted index segment file.
  private static final int VERSION_FIELD_LENGTH = 2;
  private static final int KEY_SIZE_FIELD_LENGTH = 4;
  private static final int VALUE_SIZE_FIELD_LENGTH = 4;
  private static final int CRC_FIELD_LENGTH = 8;
  private static final int LOG_END_OFFSET_FIELD_LENGTH = 8;
  private static final int LAST_MODIFIED_TIME_FIELD_LENGTH = 8;
  private static final int RESET_KEY_TYPE_FIELD_LENGTH = 2;

  // number of bytes in the index file that are not key/value entries (header fields + trailing crc)
  private int indexSizeExcludingEntries;
  // position in the index file at which the first key starts
  private int firstKeyRelativeOffset;
  private final String indexSegmentFilenamePrefix;
  private final Offset startOffset;
  private final AtomicReference<Offset> endOffset;
  private final File indexFile;
  private final ReadWriteLock rwLock;
  private final AtomicBoolean mapped;
  private final Logger logger = LoggerFactory.getLogger(getClass());
  private final AtomicLong sizeWritten;
  private final StoreKeyFactory factory;
  private final File bloomFile;
  private final StoreMetrics metrics;
  private final AtomicInteger numberOfItems;
  private final Time time;
  // an approximation of the last modified time.
  private final AtomicLong lastModifiedTimeSec;
  private MappedByteBuffer mmap = null;
  private IFilter bloomFilter;
  private int keySize;
  private int valueSize;
  private short version;
  // the safe end point used by the last successful writeIndexSegmentToFile() call
  private Offset prevSafeEndPoint = null;
  // reset key refers to the first StoreKey that is added to the index segment
  private Pair<StoreKey, PersistentIndex.IndexEntryType> resetKey = null;
  // the in memory representation of the segment; set to null once the segment is memory mapped
  protected ConcurrentSkipListMap<StoreKey, IndexValue> index = null;

  /**
   * Creates a new segment
   * @param dataDir The data directory to use for this segment
   * @param startOffset The start {@link Offset} in the {@link Log} that this segment represents.
   * @param factory The store key factory used to create new store keys
   * @param keySize The key size that this segment supports
   * @param valueSize The value size that this segment supports
   * @param config The store config used to initialize the index segment
   * @param metrics The store metrics used to track metrics
   * @param time the {@link Time} instance to use
   */
  IndexSegment(String dataDir, Offset startOffset, StoreKeyFactory factory, int keySize, int valueSize,
      StoreConfig config, StoreMetrics metrics, Time time) {
    this.rwLock = new ReentrantReadWriteLock();
    this.startOffset = startOffset;
    this.endOffset = new AtomicReference<>(startOffset);
    index = new ConcurrentSkipListMap<>();
    mapped = new AtomicBoolean(false);
    sizeWritten = new AtomicLong(0);
    this.factory = factory;
    this.keySize = keySize;
    this.valueSize = valueSize;
    this.version = PersistentIndex.CURRENT_VERSION;
    bloomFilter = FilterFactory.getFilter(config.storeIndexMaxNumberOfInmemElements,
        config.storeIndexBloomMaxFalsePositiveProbability);
    numberOfItems = new AtomicInteger(0);
    this.metrics = metrics;
    this.time = time;
    lastModifiedTimeSec = new AtomicLong(time.seconds());
    indexSegmentFilenamePrefix = generateIndexSegmentFilenamePrefix();
    indexFile = new File(dataDir, indexSegmentFilenamePrefix + INDEX_SEGMENT_FILE_NAME_SUFFIX);
    bloomFile = new File(dataDir, indexSegmentFilenamePrefix + BLOOM_FILE_NAME_SUFFIX);
  }

  /**
   * Initializes an existing segment. Memory maps the segment or reads the segment into memory. Also reads the
   * persisted bloom filter from disk (mapped segments only; for in memory segments the filter is rebuilt while
   * the entries are read).
   * @param indexFile The index file that the segment needs to be initialized from
   * @param shouldMap Indicates if the segment needs to be memory mapped
   * @param factory The store key factory used to create new store keys
   * @param config The store config used to initialize the index segment
   * @param metrics The store metrics used to track metrics
   * @param journal The journal to use
   * @param time the {@link Time} instance to use
   * @throws StoreException with {@link StoreErrorCodes#Index_Creation_Failure} if the segment could not be loaded.
   */
  IndexSegment(File indexFile, boolean shouldMap, StoreKeyFactory factory, StoreConfig config, StoreMetrics metrics,
      Journal journal, Time time) throws StoreException {
    try {
      startOffset = getIndexSegmentStartOffset(indexFile.getName());
      endOffset = new AtomicReference<>(startOffset);
      indexSegmentFilenamePrefix = generateIndexSegmentFilenamePrefix();
      this.indexFile = indexFile;
      this.rwLock = new ReentrantReadWriteLock();
      this.factory = factory;
      this.time = time;
      sizeWritten = new AtomicLong(0);
      numberOfItems = new AtomicInteger(0);
      mapped = new AtomicBoolean(false);
      lastModifiedTimeSec = new AtomicLong(0);
      bloomFile = new File(indexFile.getParent(), indexSegmentFilenamePrefix + BLOOM_FILE_NAME_SUFFIX);
      if (shouldMap) {
        map(false);
        // Load the bloom filter for this index
        // We need to load the bloom filter only for mapped indexes
        CrcInputStream crcBloom = new CrcInputStream(new FileInputStream(bloomFile));
        // try-with-resources so that the file handle is released even if deserialization fails
        try (DataInputStream stream = new DataInputStream(crcBloom)) {
          bloomFilter = FilterFactory.deserialize(stream);
          long crcValue = crcBloom.getValue();
          if (crcValue != stream.readLong()) {
            // TODO metrics
            // we don't recover the filter. we just by pass the filter. Crc corrections will be done
            // by the scrubber
            bloomFilter = null;
            logger.error("IndexSegment : {} error validating crc for bloom filter for {}", indexFile.getAbsolutePath(),
                bloomFile.getAbsolutePath());
          }
        }
      } else {
        index = new ConcurrentSkipListMap<>();
        bloomFilter = FilterFactory.getFilter(config.storeIndexMaxNumberOfInmemElements,
            config.storeIndexBloomMaxFalsePositiveProbability);
        try {
          readFromFile(indexFile, journal);
        } catch (StoreException e) {
          if (e.getErrorCode() == StoreErrorCodes.Index_Creation_Failure
              || e.getErrorCode() == StoreErrorCodes.Index_Version_Error) {
            // we just log the error here and retain the index so far created.
            // subsequent recovery process will add the missed out entries
            logger.error("Index Segment : {} error while reading from index {}", indexFile.getAbsolutePath(),
                e.getMessage());
          } else {
            throw e;
          }
        }
      }
    } catch (Exception e) {
      throw new StoreException(
          "Index Segment : " + indexFile.getAbsolutePath() + " error while loading index from file", e,
          StoreErrorCodes.Index_Creation_Failure);
    }
    this.metrics = metrics;
  }

  /**
   * @return the name of the log segment that this index segment refers to;
   */
  String getLogSegmentName() {
    return startOffset.getName();
  }

  /**
   * The start offset that this segment represents
   * @return The start offset that this segment represents
   */
  Offset getStartOffset() {
    return startOffset;
  }

  /**
   * The end offset that this segment represents
   * @return The end offset that this segment represents
   */
  Offset getEndOffset() {
    return endOffset.get();
  }

  /**
   * Returns if this segment is mapped or not
   * @return True, if the segment is readonly and mapped. False, otherwise
   */
  boolean isMapped() {
    return mapped.get();
  }

  /**
   * The underlying file that this segment represents
   * @return The file that this segment represents
   */
  File getFile() {
    return indexFile;
  }

  /**
   * The key size in this segment
   * @return The key size in this segment
   */
  int getKeySize() {
    return keySize;
  }

  /**
   * The value size in this segment
   * @return The value size in this segment
   */
  int getValueSize() {
    return valueSize;
  }

  /**
   * The time of last modification of this segment in ms
   * @return The time in ms of the last modification of this segment.
   */
  long getLastModifiedTimeMs() {
    return TimeUnit.SECONDS.toMillis(lastModifiedTimeSec.get());
  }

  /**
   * The time of last modification of this segment in secs
   * @return The time in secs of the last modification of this segment.
   */
  long getLastModifiedTimeSecs() {
    return lastModifiedTimeSec.get();
  }

  /**
   * Sets the last modified time (secs) of this segment.
   * @param lastModifiedTimeSec the value to set to (secs).
   */
  void setLastModifiedTimeSecs(long lastModifiedTimeSec) {
    this.lastModifiedTimeSec.set(lastModifiedTimeSec);
  }

  /**
   * The version of the {@link PersistentIndex} that this {@link IndexSegment} is based on
   * @return the version of the {@link PersistentIndex} that this {@link IndexSegment} is based on
   */
  short getVersion() {
    return version;
  }

  /**
   * The resetKey for the index segment.
   * @return the reset key for the index segment which is a {@link Pair} of StoreKey and
   * {@link PersistentIndex.IndexEntryType}. {@code null} until the first entry is added or read from file.
   */
  Pair<StoreKey, PersistentIndex.IndexEntryType> getResetKey() {
    return resetKey;
  }

  /**
   * Finds an entry given a key. It finds from the in memory map or
   * does a binary search on the mapped persistent segment
   * @param keyToFind The key to find
   * @return The blob index value that represents the key or null if not found
   * @throws StoreException with {@link StoreErrorCodes#IOError} if the key could not be read from the mapped file
   */
  IndexValue find(StoreKey keyToFind) throws StoreException {
    IndexValue toReturn = null;
    // acquire outside the try so that the finally block never unlocks a lock that was not taken
    rwLock.readLock().lock();
    try {
      if (!mapped.get()) {
        IndexValue value = index.get(keyToFind);
        if (value != null) {
          metrics.blobFoundInActiveSegmentCount.inc();
        }
        toReturn = value;
      } else {
        if (bloomFilter != null) {
          metrics.bloomAccessedCount.inc();
        }
        if (bloomFilter == null || bloomFilter.isPresent(ByteBuffer.wrap(keyToFind.toBytes()))) {
          if (bloomFilter == null) {
            logger.trace("IndexSegment {} bloom filter empty. Searching file with start offset {} and for key {}",
                indexFile.getAbsolutePath(), startOffset, keyToFind);
          } else {
            metrics.bloomPositiveCount.inc();
            logger.trace("IndexSegment {} found in bloom filter for index with start offset {} and for key {} ",
                indexFile.getAbsolutePath(), startOffset, keyToFind);
          }
          // binary search on the mapped file
          ByteBuffer duplicate = mmap.duplicate();
          int low = 0;
          int high = numberOfEntries(duplicate) - 1;
          logger.trace("binary search low : {} high : {}", low, high);
          while (low <= high) {
            // upper midpoint, i.e. ceil((low + high) / 2), computed without floating point
            int mid = low + (high - low + 1) / 2;
            StoreKey found = getKeyAt(duplicate, mid);
            logger.trace("Index Segment {} binary search - key found on iteration {}", indexFile.getAbsolutePath(),
                found);
            int result = found.compareTo(keyToFind);
            if (result == 0) {
              // reading the key left the buffer positioned at the start of the value
              byte[] buf = new byte[valueSize];
              duplicate.get(buf);
              toReturn = new IndexValue(startOffset.getName(), ByteBuffer.wrap(buf), getVersion());
              break;
            } else if (result < 0) {
              low = mid + 1;
            } else {
              high = mid - 1;
            }
          }
          if (bloomFilter != null && toReturn == null) {
            metrics.bloomFalsePositiveCount.inc();
          }
        }
      }
    } catch (IOException e) {
      throw new StoreException("IndexSegment : " + indexFile.getAbsolutePath() + " IO error while searching", e,
          StoreErrorCodes.IOError);
    } finally {
      rwLock.readLock().unlock();
    }
    return toReturn;
  }

  /**
   * @param buffer a view over the mapped index file.
   * @return the number of key/value entries contained in {@code buffer}.
   */
  private int numberOfEntries(ByteBuffer buffer) {
    return (buffer.capacity() - indexSizeExcludingEntries) / (keySize + valueSize);
  }

  /**
   * Reads the key of the entry at position {@code entryIndex}. On return {@code buffer} is positioned wherever the
   * key factory stopped reading (callers in this class read the value from that position).
   * @param buffer a view over the mapped index file.
   * @param entryIndex the zero based position of the entry.
   * @return the {@link StoreKey} of the entry.
   * @throws IOException if the key could not be deserialized.
   */
  private StoreKey getKeyAt(ByteBuffer buffer, int entryIndex) throws IOException {
    buffer.position(firstKeyRelativeOffset + (entryIndex * (keySize + valueSize)));
    return factory.getStoreKey(new DataInputStream(new ByteBufferInputStream(buffer)));
  }

  /**
   * Binary searches the mapped file for {@code keyToFind}.
   * @param keyToFind the key to look for.
   * @param buffer a view over the mapped index file.
   * @return the zero based position of the entry for {@code keyToFind}, or -1 if it is not present.
   * @throws IOException if a key could not be deserialized.
   */
  private int findIndex(StoreKey keyToFind, ByteBuffer buffer) throws IOException {
    // binary search on the mapped file
    int low = 0;
    int high = numberOfEntries(buffer) - 1;
    logger.trace("IndexSegment {} binary search low : {} high : {}", indexFile.getAbsolutePath(), low, high);
    while (low <= high) {
      // upper midpoint, i.e. ceil((low + high) / 2), computed without floating point
      int mid = low + (high - low + 1) / 2;
      StoreKey found = getKeyAt(buffer, mid);
      logger.trace("IndexSegment {} binary search - key found on iteration {}", indexFile.getAbsolutePath(), found);
      int result = found.compareTo(keyToFind);
      if (result == 0) {
        return mid;
      } else if (result < 0) {
        low = mid + 1;
      } else {
        high = mid - 1;
      }
    }
    return -1;
  }

  /**
   * Adds an entry into the segment. The operation works only if the segment is read/write
   * @param entry The entry that needs to be added to the segment.
   * @param fileEndOffset The file end offset that this entry represents.
   * @throws StoreException with {@link StoreErrorCodes#Illegal_Index_Operation} if the segment is mapped.
   */
  void addEntry(IndexEntry entry, Offset fileEndOffset) throws StoreException {
    rwLock.readLock().lock();
    try {
      if (mapped.get()) {
        throw new StoreException("IndexSegment : " + indexFile.getAbsolutePath() + " cannot add to a mapped index ",
            StoreErrorCodes.Illegal_Index_Operation);
      }
      logger.trace("IndexSegment {} inserting key - {} value - offset {} size {} ttl {} "
              + "originalMessageOffset {} fileEndOffset {}", indexFile.getAbsolutePath(), entry.getKey(),
          entry.getValue().getOffset(), entry.getValue().getSize(), entry.getValue().getExpiresAtMs(),
          entry.getValue().getOriginalMessageOffset(), fileEndOffset);
      // counters, bloom filter and reset key are only touched when the key was not already in the segment
      if (index.put(entry.getKey(), entry.getValue()) == null) {
        numberOfItems.incrementAndGet();
        sizeWritten.addAndGet(entry.getKey().sizeInBytes() + entry.getValue().getBytes().capacity());
        bloomFilter.add(ByteBuffer.wrap(entry.getKey().toBytes()));
        if (resetKey == null) {
          resetKey = new Pair<>(entry.getKey(),
              entry.getValue().isFlagSet(IndexValue.Flags.Delete_Index) ? PersistentIndex.IndexEntryType.DELETE
                  : PersistentIndex.IndexEntryType.PUT);
        }
      }
      endOffset.set(fileEndOffset);
      long operationTimeInMs = entry.getValue().getOperationTimeInMs();
      if (operationTimeInMs == Utils.Infinite_Time) {
        lastModifiedTimeSec.set(time.seconds());
      } else if ((operationTimeInMs / Time.MsPerSec) > lastModifiedTimeSec.get()) {
        lastModifiedTimeSec.set(operationTimeInMs / Time.MsPerSec);
      }
      if (keySize == KEY_SIZE_INVALID_VALUE) {
        StoreKey key = entry.getKey();
        keySize = key.sizeInBytes();
        logger.info("IndexSegment : {} setting key size to {} of key {} for index with start offset {}",
            indexFile.getAbsolutePath(), key.sizeInBytes(), key.getLongForm(), startOffset);
      }
      if (valueSize == VALUE_SIZE_INVALID_VALUE) {
        valueSize = entry.getValue().getBytes().capacity();
        logger.info("IndexSegment : {} setting value size to {} for index with start offset {}",
            indexFile.getAbsolutePath(), valueSize, startOffset);
      }
    } finally {
      rwLock.readLock().unlock();
    }
  }

  /**
   * The total size in bytes written to this segment so far
   * @return The total size in bytes written to this segment so far
   * @throws UnsupportedOperationException if the segment is mapped.
   */
  long getSizeWritten() {
    rwLock.readLock().lock();
    try {
      if (mapped.get()) {
        throw new UnsupportedOperationException("Operation supported only on unmapped indexes");
      }
      return sizeWritten.get();
    } finally {
      rwLock.readLock().unlock();
    }
  }

  /**
   * The number of items contained in this segment
   * @return The number of items contained in this segment
   * @throws UnsupportedOperationException if the segment is mapped.
   */
  int getNumberOfItems() {
    rwLock.readLock().lock();
    try {
      if (mapped.get()) {
        throw new UnsupportedOperationException("Operation supported only on unmapped indexes");
      }
      return numberOfItems.get();
    } finally {
      rwLock.readLock().unlock();
    }
  }

  /**
   * Writes the index to a persistent file. Nothing is written if {@code safeEndPoint} is not beyond the start offset
   * of this segment or if it equals the end point of the last successful write.
   *
   * Writes the data in the following format in version 1
   * <pre>
   * | version   | keysize   | valuesize | fileendpointer | last modified time (in secs) | reset key | reset key type |
   * | (2 bytes) | (4 bytes) | (4 bytes) | (8 bytes)      | (8 bytes)                    | (n bytes) | (2 bytes)      |
   *
   * | key 1     | value 1   | ... | key n     | value n   | crc       |
   * | (n bytes) | (n bytes) |     | (n bytes) | (n bytes) | (8 bytes) |
   * </pre>
   *  version            - the index format version
   *  keysize            - the size of the key in this index segment
   *  valuesize          - the size of the value in this index segment
   *  fileendpointer     - the log end pointer that pertains to the index being persisted
   *  last modified time - the last modified time of the index segment in secs
   *  reset key          - the reset key(StoreKey) of the index segment
   *  reset key type     - the reset key index entry type(PUT/DELETE)
   *  key n / value n    - the key and value entries contained in this index segment
   *  crc                - the crc of the index segment content
   *
   * Those that were written in version 0 has the following format
   * <pre>
   * | version   | keysize   | valuesize | fileendpointer | key 1     | value 1   | ... | key n     | value n   | crc       |
   * | (2 bytes) | (4 bytes) | (4 bytes) | (8 bytes)      | (n bytes) | (n bytes) |     | (n bytes) | (n bytes) | (8 bytes) |
   * </pre>
   *  version         - the index format version
   *  keysize         - the size of the key in this index segment
   *  valuesize       - the size of the value in this index segment
   *  fileendpointer  - the log end pointer that pertains to the index being persisted
   *  key n / value n - the key and value entries contained in this index segment
   *  crc             - the crc of the index segment content
   *
   * @param safeEndPoint the end point (that is relevant to this segment) until which the log has been flushed.
   * @throws IOException if the temporary file could not be opened or closed.
   * @throws StoreException if {@code safeEndPoint} is beyond the end offset of this segment, if there was an IO error
   * while writing or if the temporary file could not be renamed to the index file.
   */
  void writeIndexSegmentToFile(Offset safeEndPoint) throws IOException, StoreException {
    if (safeEndPoint.compareTo(startOffset) <= 0) {
      return;
    }
    if (!safeEndPoint.equals(prevSafeEndPoint)) {
      if (safeEndPoint.compareTo(getEndOffset()) > 0) {
        throw new StoreException(
            "SafeEndOffSet " + safeEndPoint + " is greater than current end offset for current " + "index segment "
                + getEndOffset(), StoreErrorCodes.Illegal_Index_Operation);
      }
      File temp = new File(getFile().getAbsolutePath() + ".tmp");
      FileOutputStream fileStream = new FileOutputStream(temp);
      CrcOutputStream crc = new CrcOutputStream(fileStream);
      DataOutputStream writer = new DataOutputStream(crc);
      rwLock.readLock().lock();
      try {
        // write the current version
        writer.writeShort(getVersion());
        // write key, value size, file end pointer
        writer.writeInt(this.keySize);
        writer.writeInt(this.valueSize);
        writer.writeLong(safeEndPoint.getOffset());
        if (getVersion() == PersistentIndex.VERSION_1) {
          // write last modified time and reset key incase of version 1
          writer.writeLong(lastModifiedTimeSec.get());
          writer.write(resetKey.getFirst().toBytes());
          writer.writeShort(resetKey.getSecond().ordinal());
        }
        // NOTE: In the event of a crash, it is possible that there is a part of the log that is not covered by the
        // index. This happens due to the fact that a DELETE that occurs in the same segment as a PUT overwrites the
        // PUT entry. Consider the following case:-
        // (entries are of the form ID:TYPE:START_OFFSET-END_OFFSET)
        // This is the order of operations
        // A:PUT:0-100
        // B:PUT:101-200
        // A:DELETE:201-250
        // These are the entries in the index segment
        // A:DELETE:201-250
        // B:PUT:101-200
        // If safeEndPoint < 250, then B:PUT:101-200 will be written but not A:DELETE:201-250. If the process were to
        // crash at this point, the index end offset would be 200 and the span 0-100 would not be represented in the
        // index.

        // write the entries
        for (Map.Entry<StoreKey, IndexValue> entry : index.entrySet()) {
          if (entry.getValue().getOffset().getOffset() + entry.getValue().getSize() <= safeEndPoint.getOffset()) {
            writer.write(entry.getKey().toBytes());
            writer.write(entry.getValue().getBytes().array());
            logger.trace("IndexSegment : {} writing key - {} value - offset {} size {} fileEndOffset {}",
                getFile().getAbsolutePath(), entry.getKey(), entry.getValue().getOffset(), entry.getValue().getSize(),
                safeEndPoint);
          }
        }
        long crcValue = crc.getValue();
        writer.writeLong(crcValue);

        // flush and overwrite old file
        fileStream.getChannel().force(true);
        // swap temp file with the original file. renameTo() reports failure through its return value only, so it has
        // to be checked - otherwise a failed swap would silently leave the stale index file in place.
        if (!temp.renameTo(getFile())) {
          throw new StoreException(
              "IndexSegment : " + indexFile.getAbsolutePath() + " could not be replaced by temporary file "
                  + temp.getAbsolutePath(), StoreErrorCodes.IOError);
        }
        // remember the end point only once the new file is in place so that a failed attempt is retried
        prevSafeEndPoint = safeEndPoint;
      } catch (IOException e) {
        throw new StoreException(
            "IndexSegment : " + indexFile.getAbsolutePath() + " IO error while persisting index to disk", e,
            StoreErrorCodes.IOError);
      } finally {
        try {
          writer.close();
        } finally {
          // release the lock even if closing the stream fails
          rwLock.readLock().unlock();
        }
      }
      logger.trace("IndexSegment : {} completed writing index to file", indexFile.getAbsolutePath());
    }
  }

  /**
   * Memory maps the segment of index. Optionally, it also persist the bloom filter to disk
   * @param persistBloom True, if the bloom filter needs to be persisted. False otherwise.
   * @throws IOException if the index file could not be mapped or the bloom filter could not be written.
   * @throws StoreException with {@link StoreErrorCodes#Index_Version_Error} if the version in the file is unknown.
   */
  void map(boolean persistBloom) throws IOException, StoreException {
    RandomAccessFile raf = new RandomAccessFile(indexFile, "r");
    rwLock.writeLock().lock();
    try {
      mmap = raf.getChannel().map(FileChannel.MapMode.READ_ONLY, 0, indexFile.length());
      mmap.position(0);
      version = mmap.getShort();
      indexSizeExcludingEntries =
          VERSION_FIELD_LENGTH + KEY_SIZE_FIELD_LENGTH + VALUE_SIZE_FIELD_LENGTH + LOG_END_OFFSET_FIELD_LENGTH
              + CRC_FIELD_LENGTH;
      switch (version) {
        case PersistentIndex.VERSION_0:
          keySize = mmap.getInt();
          valueSize = mmap.getInt();
          endOffset.set(new Offset(startOffset.getName(), mmap.getLong()));
          // version 0 does not persist the last modified time; fall back to the file's modification time
          lastModifiedTimeSec.set(indexFile.lastModified() / 1000);
          firstKeyRelativeOffset = indexSizeExcludingEntries - CRC_FIELD_LENGTH;
          break;
        case PersistentIndex.VERSION_1:
          keySize = mmap.getInt();
          valueSize = mmap.getInt();
          endOffset.set(new Offset(startOffset.getName(), mmap.getLong()));
          lastModifiedTimeSec.set(mmap.getLong());
          StoreKey storeKey = factory.getStoreKey(new DataInputStream(new ByteBufferInputStream(mmap)));
          short resetKeyType = mmap.getShort();
          resetKey = new Pair<>(storeKey, PersistentIndex.IndexEntryType.values()[resetKeyType]);
          indexSizeExcludingEntries +=
              (LAST_MODIFIED_TIME_FIELD_LENGTH + resetKey.getFirst().sizeInBytes() + RESET_KEY_TYPE_FIELD_LENGTH);
          firstKeyRelativeOffset = indexSizeExcludingEntries - CRC_FIELD_LENGTH;
          break;
        default:
          throw new StoreException("IndexSegment : " + indexFile.getAbsolutePath() + " unknown version in index file",
              StoreErrorCodes.Index_Version_Error);
      }
      mapped.set(true);
      index = null;
    } finally {
      try {
        raf.close();
      } finally {
        // release the lock even if closing the file fails
        rwLock.writeLock().unlock();
      }
    }
    // we should be fine reading bloom filter here without synchronization as the index is read only
    // we only persist the bloom filter once during its entire lifetime
    if (persistBloom) {
      CrcOutputStream crcStream = new CrcOutputStream(new FileOutputStream(bloomFile));
      // try-with-resources so that the bloom file is closed whether or not serialization succeeds
      try (DataOutputStream stream = new DataOutputStream(crcStream)) {
        FilterFactory.serialize(bloomFilter, stream);
        long crcValue = crcStream.getValue();
        stream.writeLong(crcValue);
      }
    }
  }

  /**
   * Reads the index segment from file into an in memory representation. Entries that lie beyond the log end offset
   * recorded in the file are ignored. Every accepted entry is also added to the bloom filter and to {@code journal}.
   * If the crc of the file does not match, all in memory structures built by this call are reset.
   * @param fileToRead The file to read the index segment from
   * @param journal The journal to use.
   * @throws StoreException if the version is not recognized, the crc does not match or there was an IO error.
   * @throws IOException if the file could not be opened or closed.
   */
  private void readFromFile(File fileToRead, Journal journal) throws StoreException, IOException {
    logger.info("IndexSegment : {} reading index from file", indexFile.getAbsolutePath());
    index.clear();
    CrcInputStream crcStream = new CrcInputStream(new FileInputStream(fileToRead));
    DataInputStream stream = new DataInputStream(crcStream);
    try {
      version = stream.readShort();
      switch (version) {
        case PersistentIndex.VERSION_0:
        case PersistentIndex.VERSION_1:
          keySize = stream.readInt();
          valueSize = stream.readInt();
          long logEndOffset = stream.readLong();
          indexSizeExcludingEntries =
              VERSION_FIELD_LENGTH + KEY_SIZE_FIELD_LENGTH + VALUE_SIZE_FIELD_LENGTH + LOG_END_OFFSET_FIELD_LENGTH
                  + CRC_FIELD_LENGTH;
          if (version == PersistentIndex.VERSION_0) {
            // version 0 does not persist the last modified time; fall back to the file's modification time
            lastModifiedTimeSec.set(indexFile.lastModified() / 1000);
          } else if (version == PersistentIndex.VERSION_1) {
            lastModifiedTimeSec.set(stream.readLong());
            StoreKey storeKey = factory.getStoreKey(stream);
            short resetKeyType = stream.readShort();
            resetKey = new Pair<>(storeKey, PersistentIndex.IndexEntryType.values()[resetKeyType]);
            indexSizeExcludingEntries +=
                (LAST_MODIFIED_TIME_FIELD_LENGTH + resetKey.getFirst().sizeInBytes() + RESET_KEY_TYPE_FIELD_LENGTH);
          }
          firstKeyRelativeOffset = indexSizeExcludingEntries - CRC_FIELD_LENGTH;
          logger.trace("IndexSegment : {} reading log end offset {} from file", indexFile.getAbsolutePath(),
              logEndOffset);
          long maxEndOffset = Long.MIN_VALUE;
          while (stream.available() > CRC_FIELD_LENGTH) {
            StoreKey key = factory.getStoreKey(stream);
            byte[] value = new byte[valueSize];
            // read() may return fewer bytes than requested; readFully() either fills the buffer or throws
            stream.readFully(value);
            IndexValue blobValue = new IndexValue(startOffset.getName(), ByteBuffer.wrap(value), version);
            long offsetInLogSegment = blobValue.getOffset().getOffset();
            // ignore entries that have offsets outside the log end offset that this index represents
            if (offsetInLogSegment + blobValue.getSize() <= logEndOffset) {
              index.put(key, blobValue);
              logger.trace("IndexSegment : {} putting key {} in index offset {} size {}", indexFile.getAbsolutePath(),
                  key, blobValue.getOffset(), blobValue.getSize());
              // regenerate the bloom filter for in memory indexes
              bloomFilter.add(ByteBuffer.wrap(key.toBytes()));
              // add to the journal
              if (blobValue.getOriginalMessageOffset() != IndexValue.UNKNOWN_ORIGINAL_MESSAGE_OFFSET
                  && offsetInLogSegment != blobValue.getOriginalMessageOffset()
                  && blobValue.getOriginalMessageOffset() >= startOffset.getOffset()) {
                // we add an entry for the original message offset if it is within the same index segment
                journal.addEntry(new Offset(startOffset.getName(), blobValue.getOriginalMessageOffset()), key);
              }
              journal.addEntry(blobValue.getOffset(), key);
              sizeWritten.addAndGet(key.sizeInBytes() + valueSize);
              numberOfItems.incrementAndGet();
              if (offsetInLogSegment + blobValue.getSize() > maxEndOffset) {
                maxEndOffset = offsetInLogSegment + blobValue.getSize();
              }
            } else {
              logger.info(
                  "IndexSegment : {} ignoring index entry outside the log end offset that was not synced logEndOffset "
                      + "{} key {} entryOffset {} entrySize {} entryDeleteState {}", indexFile.getAbsolutePath(),
                  logEndOffset, key, blobValue.getOffset(), blobValue.getSize(),
                  blobValue.isFlagSet(IndexValue.Flags.Delete_Index));
            }
          }
          // if no entry was accepted there is no meaningful max end offset; leave the end offset untouched
          // (the constructor initialized it to the start offset) instead of setting it to Long.MIN_VALUE
          if (maxEndOffset != Long.MIN_VALUE) {
            endOffset.set(new Offset(startOffset.getName(), maxEndOffset));
            logger.trace("IndexSegment : {} setting end offset for index {}", indexFile.getAbsolutePath(),
                maxEndOffset);
          }
          long crc = crcStream.getValue();
          if (crc != stream.readLong()) {
            // reset structures
            keySize = KEY_SIZE_INVALID_VALUE;
            valueSize = VALUE_SIZE_INVALID_VALUE;
            endOffset.set(startOffset);
            index.clear();
            bloomFilter.clear();
            // the counters describe the (now empty) index, so they have to be reset along with it
            sizeWritten.set(0);
            numberOfItems.set(0);
            throw new StoreException("IndexSegment : " + indexFile.getAbsolutePath() + " crc check does not match",
                StoreErrorCodes.Index_Creation_Failure);
          }
          break;
        default:
          throw new StoreException("IndexSegment : " + indexFile.getAbsolutePath() + " invalid version in index file ",
              StoreErrorCodes.Index_Version_Error);
      }
    } catch (IOException e) {
      throw new StoreException("IndexSegment : " + indexFile.getAbsolutePath() + " IO error while reading from file ",
          e, StoreErrorCodes.IOError);
    } finally {
      stream.close();
    }
  }

  /**
   * Gets all the entries upto maxEntries from the start of a given key (exclusive) or all entries if key is null,
   * till maxTotalSizeOfEntriesInBytes
   * @param key The key from where to start retrieving entries.
   *            If the key is null, all entries are retrieved upto maxentries
   * @param findEntriesCondition The condition that determines when to stop fetching entries.
   * @param entries The input entries list that needs to be filled. The entries list can have existing entries
   * @param currentTotalSizeOfEntriesInBytes The current total size in bytes of the entries
   * @return true if any entries were added.
   * @throws IOException if a key could not be read from the mapped file.
   */
  boolean getEntriesSince(StoreKey key, FindEntriesCondition findEntriesCondition, List<MessageInfo> entries,
      AtomicLong currentTotalSizeOfEntriesInBytes) throws IOException {
    List<IndexEntry> indexEntries = new ArrayList<>();
    boolean isNewEntriesAdded =
        getIndexEntriesSince(key, findEntriesCondition, indexEntries, currentTotalSizeOfEntriesInBytes);
    for (IndexEntry indexEntry : indexEntries) {
      IndexValue value = indexEntry.getValue();
      MessageInfo info =
          new MessageInfo(indexEntry.getKey(), value.getSize(), value.isFlagSet(IndexValue.Flags.Delete_Index),
              value.getExpiresAtMs());
      entries.add(info);
    }
    return isNewEntriesAdded;
  }

  /**
   * Gets all the index entries upto maxEntries from the start of a given key (exclusive) or all entries if key is null,
   * till maxTotalSizeOfEntriesInBytes. If {@code key} is not null and is not present in this segment, nothing is added
   * and an error is logged.
   * @param key The key from where to start retrieving entries.
   *            If the key is null, all entries are retrieved upto maxentries
   * @param findEntriesCondition The condition that determines when to stop fetching entries.
   * @param entries The input entries list that needs to be filled. The entries list can have existing entries
   * @param currentTotalSizeOfEntriesInBytes The current total size in bytes of the entries
   * @return true if any entries were added.
   * @throws IOException if a key could not be read from the mapped file.
   */
  boolean getIndexEntriesSince(StoreKey key, FindEntriesCondition findEntriesCondition, List<IndexEntry> entries,
      AtomicLong currentTotalSizeOfEntriesInBytes) throws IOException {
    if (!findEntriesCondition.proceed(currentTotalSizeOfEntriesInBytes.get(), getLastModifiedTimeSecs())) {
      return false;
    }
    int entriesSizeAtStart = entries.size();
    if (mapped.get()) {
      // position of the entry to start from: the first entry, or the entry of the given key (which is skipped below)
      int entryIndex = 0;
      if (key != null) {
        entryIndex = findIndex(key, mmap.duplicate());
      }
      if (entryIndex != -1) {
        ByteBuffer readBuf = mmap.duplicate();
        int totalEntries = numberOfEntries(readBuf);
        // the condition is evaluated before each entry is read
        while (findEntriesCondition.proceed(currentTotalSizeOfEntriesInBytes.get(), getLastModifiedTimeSecs())
            && entryIndex < totalEntries) {
          StoreKey newKey = getKeyAt(readBuf, entryIndex);
          byte[] buf = new byte[valueSize];
          readBuf.get(buf);
          // we include the key in the final list if it is not the initial key or if the initial key was null
          if (key == null || newKey.compareTo(key) != 0) {
            IndexValue newValue = new IndexValue(startOffset.getName(), ByteBuffer.wrap(buf), getVersion());
            entries.add(new IndexEntry(newKey, newValue));
            currentTotalSizeOfEntriesInBytes.addAndGet(newValue.getSize());
          }
          entryIndex++;
        }
      } else {
        logger.error("IndexSegment : {} index not found for key {}", indexFile.getAbsolutePath(), key);
      }
    } else if (key == null || index.containsKey(key)) {
      ConcurrentNavigableMap<StoreKey, IndexValue> tempMap = index;
      if (key != null) {
        tempMap = index.tailMap(key, true);
      }
      for (Map.Entry<StoreKey, IndexValue> entry : tempMap.entrySet()) {
        if (key == null || entry.getKey().compareTo(key) != 0) {
          IndexValue newValue = new IndexValue(startOffset.getName(), entry.getValue().getBytes(), getVersion());
          entries.add(new IndexEntry(entry.getKey(), newValue));
          currentTotalSizeOfEntriesInBytes.addAndGet(entry.getValue().getSize());
          // the condition is evaluated after each entry is added
          if (!findEntriesCondition.proceed(currentTotalSizeOfEntriesInBytes.get(), getLastModifiedTimeSecs())) {
            break;
          }
        }
      }
    } else {
      logger.error("IndexSegment : {} key not found: {}", indexFile.getAbsolutePath(), key);
    }
    return entries.size() > entriesSizeAtStart;
  }

  /**
   * @return the prefix for the index segment file name (also used for bloom filter file name).
   */
  private String generateIndexSegmentFilenamePrefix() {
    String logSegmentName = startOffset.getName();
    StringBuilder filenamePrefix = new StringBuilder(logSegmentName);
    if (!logSegmentName.isEmpty()) {
      filenamePrefix.append(BlobStore.SEPARATOR);
    }
    return filenamePrefix.append(startOffset.getOffset()).append(BlobStore.SEPARATOR).toString();
  }

  /**
   * Gets the start {@link Offset} in the {@link Log} that the index with file name {@code filename} represents.
   * @param filename the name of the index file.
   * @return the start {@link Offset} in the {@link Log} that the index with file name {@code filename} represents.
   * @throws IllegalArgumentException if {@code filename} does not contain {@link BlobStore#SEPARATOR} or if the offset
   * part of the name is not a number.
   */
  static Offset getIndexSegmentStartOffset(String filename) {
    // file name pattern for index is {logSegmentName}_{offset}_index.
    // If the logSegment name is empty, then the file name pattern is {offset}_index.
    String logSegmentName;
    String startOffsetValue;
    int firstSepIdx = filename.indexOf(BlobStore.SEPARATOR);
    int lastSepIdx = filename.lastIndexOf(BlobStore.SEPARATOR);
    if (firstSepIdx < 0) {
      // fail with a descriptive message instead of an index-out-of-bounds error from substring()
      throw new IllegalArgumentException("[" + filename + "] is not a valid index segment file name");
    }
    if (firstSepIdx == lastSepIdx) {
      // pattern is offset_index.
      logSegmentName = "";
      startOffsetValue = filename.substring(0, firstSepIdx);
    } else {
      // pattern is logSegmentName_offset_index.
      int lastButOneSepIdx = filename.substring(0, lastSepIdx).lastIndexOf(BlobStore.SEPARATOR);
      logSegmentName = filename.substring(0, lastButOneSepIdx);
      startOffsetValue = filename.substring(lastButOneSepIdx + 1, lastSepIdx);
    }
    return new Offset(logSegmentName, Long.parseLong(startOffsetValue));
  }
}