package com.github.hoffart.dmap;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel.MapMode;
import java.util.*;
import com.github.hoffart.dmap.util.CompressionUtils;
import com.github.hoffart.dmap.util.ExtendedFileChannel;
import gnu.trove.impl.Constants;
import gnu.trove.map.hash.TObjectIntHashMap;
import org.iq80.snappy.Snappy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.github.hoffart.dmap.util.ByteArray;
import com.github.hoffart.dmap.util.ByteArrayUtils;
import com.github.hoffart.dmap.util.map.CachingHashMap;
/**
 * Disk-backed implementation of a very simple read-only Map that supports only
 * byte[] key lookup ({@link #get(byte[])}) and entry iteration — there is no put.
 *
 * <p>On-disk layout (written by the corresponding builder): a fixed header, then a
 * sequence of data blocks, each immediately followed by a "block trailer" that maps
 * every key in the block to the offset of its value within that block, and finally a
 * global trailer listing, for each block, its start offset, trailer offset and first key.
 *
 * <p>Instances are created only through {@link Builder}.
 */
public class DMap {

  public static final int VERSION = 4;

  private static final int DEFAULT_BLOCK_CACHE_COUNT = 250;

  /** Current Map file generated by Builder has Global trailer offset at 13. */
  protected static final int DEFAULT_LOC_FOR_TRAILER_OFFSET = 13;

  /** Map file with data. */
  private final File mapFile_;

  /** Read-only channel over mapFile_; all offsets below are positions in this file. */
  private final ExtendedFileChannel raf_;

  /** Number of entries in the map. */
  private final int size;

  /** The block size. */
  private final int blockSize;

  /** Indicates if the values are Snappy-compressed. */
  private final boolean valuesCompressed;

  /** Maximum number of blocks held in memory when value blocks are kept on disk. */
  private final int cacheBlockCount_;

  /** First key of a block -> mapped buffer covering that block's values. */
  private final Map<ByteArray, MappedByteBuffer> cachedByteBuffers_;

  /** Mapping of first key of block to block's start offset (sorted by key). */
  private final Map<ByteArray, Long> firstKeyInBlock_;

  /** Mapping of block start offset to block trailer offset. */
  private final Map<Long, Long> blockOffsetInfo_;

  /** Flag to enable/disable preloading of key-offset pairs. */
  private final boolean preloadAllKeyOffsets;

  /** Flag to enable/disable preloading of all the values. */
  private final boolean preloadAllValues;

  /** Trove Map load factor (default: 0.5). */
  private final float troveLoadFactor = Constants.DEFAULT_LOAD_FACTOR;

  /** Trove Map no-entry sentinel (-1); also used as the "key not found" result. */
  private final int troveNoEntryValue = -1;

  /** Block trailer start offset -> raw mapped trailer bytes (used when offsets are NOT preloaded). */
  private final Map<Long, MappedByteBuffer> blockTrailerBuffer_;

  /** Block trailer start offset -> parsed key -> value-offset map (used when offsets ARE preloaded). */
  private final Map<Long, TObjectIntHashMap<ByteArray>> blockTrailerKeys;

  /** First keys of all blocks, sorted, loaded once for binary search in get(). */
  private ByteArray[] firstKeys;

  private static final Logger logger_ = LoggerFactory.getLogger(DMap.class);

  private DMap(Builder loader) throws IOException {
    mapFile_ = loader.mapFile_;
    preloadAllKeyOffsets = loader.preloadOffsets_;
    preloadAllValues = loader.preloadValues_;
    cacheBlockCount_ = loader.cacheBlockSize_;
    raf_ = new ExtendedFileChannel(new RandomAccessFile(mapFile_, "r").getChannel());
    int version = raf_.readInt();
    if (version != VERSION) {
      throw new IOException("Invalid version of DMap file encountered. Please fix.");
    }
    // Sorted first keys: firstKey -> blockStart.
    firstKeyInBlock_ = new TreeMap<>();
    // blockStartOffset -> blockTrailerOffset.
    blockOffsetInfo_ = new HashMap<>();
    // Without key-offset preloading: blockTrailerStartOffset -> whole raw BlockTrailer.
    blockTrailerBuffer_ = new HashMap<>();
    // blockTrailerStartOffset -> <key, offset of value in that block>.
    blockTrailerKeys = new HashMap<>();
    size = raf_.readInt();
    blockSize = raf_.readInt();
    valuesCompressed = raf_.readBool();
    if (size == 0) {
      cachedByteBuffers_ = new CachingHashMap<>(0);
      return;
    }
    // Load the key -> offset pairs into memory if preloadAllKeyOffsets is set;
    // otherwise just memory-map the raw block trailers, which are then parsed
    // on every get(key).
    loadKeyDetails();
    // Optionally also load the values: cachedByteBuffers_ maps each block's first
    // key to a buffer covering the whole data block.
    if (preloadAllValues) {
      int numBlocks = getBlockCount();
      // override the cacheBlockCount_
      cachedByteBuffers_ = new CachingHashMap<>(numBlocks);
      for (ByteArray firstKey : firstKeyInBlock_.keySet()) {
        // firstKey --(firstKeyInBlock_)--> blockStart --(blockOffsetInfo_)--> blockTrailerStart
        //
        // Block-A|Block-A-Trailer|Block-B|Block-B-Trailer
        // |blockStart
        //         |blockTrailerStart
        // |<---->|
        //  mappedBuffer
        long blockStart = firstKeyInBlock_.get(firstKey);
        long blockTrailerStart = blockOffsetInfo_.get(blockStart);
        // Map the whole data block [blockStart, blockTrailerStart) and force it into memory.
        MappedByteBuffer mappedBuffer_ = raf_.map(MapMode.READ_ONLY, blockStart, blockTrailerStart - blockStart);
        mappedBuffer_.load();
        // The cache holds each block's first key mapped to the whole block content.
        cachedByteBuffers_.put(firstKey, mappedBuffer_);
      }
      logger_.debug("Preloaded all {} blocks.", numBlocks);
    } else {
      // No preloading: blocks are mapped and cached lazily on first access in get().
      cachedByteBuffers_ = new CachingHashMap<>(cacheBlockCount_);
    }
  }

  /*
   * This public Builder class allows creation of a customized DMap instance.
   * This is the ONLY way to create a DMap instance.
   */
  public static class Builder {

    private boolean preloadOffsets_;
    private boolean preloadValues_;
    private int cacheBlockSize_;
    private final File mapFile_;

    /**
     * A Builder constructor that takes the File to be loaded into DMap.
     *
     * @param mapFile A File instance to be loaded.
     */
    public Builder(File mapFile) {
      mapFile_ = mapFile;
      cacheBlockSize_ = DEFAULT_BLOCK_CACHE_COUNT;
      // by default, both key-offset loading and value loading are disabled
      preloadOffsets_ = false;
      preloadValues_ = false;
    }

    /**
     * Enables key-offset preloading during DMap instantiation.
     *
     * @return The current Builder instance.
     */
    public Builder preloadOffsets() {
      this.preloadOffsets_ = true;
      return this;
    }

    /**
     * Enables value preloading during DMap instantiation.
     *
     * @return The current Builder instance.
     */
    public Builder preloadValues() {
      this.preloadValues_ = true;
      return this;
    }

    /**
     * Sets the block cache limit (number of blocks kept in memory) to the given value.
     *
     * @param value Maximum number of blocks to cache.
     * @return The current Builder instance.
     */
    public Builder setMaxBlockLimit(int value) {
      this.cacheBlockSize_ = value;
      return this;
    }

    /**
     * The parameter-less build method creates an instance of DMap.
     * This method needs to be called once all DMap customizations are done.
     *
     * @return A DMap instance.
     * @throws IOException if the map file cannot be opened or has an invalid version.
     */
    public DMap build() throws IOException {
      return new DMap(this);
    }
  }

  /**
   * Get the number of entries in the map.
   *
   * @return Number of entries in the map.
   */
  public int size() {
    return size;
  }

  /**
   * Get the size of a block in bytes.
   *
   * @return Size of the blocks.
   */
  public int getBlockSize() {
    return blockSize;
  }

  /**
   * Get the number of data blocks in the map.
   *
   * @return Number of blocks in the map.
   * @throws IOException if the underlying file cannot be read.
   */
  public synchronized int getBlockCount() throws IOException {
    // Seek to the global trailer; the builder writes the block count as the
    // first field of the global trailer.
    long trailerOffset = getGlobalTrailerOffset();
    raf_.position(trailerOffset);
    return raf_.readVInt();
  }

  /**
   * Get byte[] value for key.
   *
   * @param key Key to retrieve the value for.
   * @return byte[] associated with key, or null if the key is not present.
   * @throws IOException if the underlying file cannot be read.
   */
  public byte[] get(byte[] key) throws IOException {
    if (size == 0) return null;
    ByteArray keyBytes = new ByteArray(key);
    logger_.debug("get({}) - hash: {}", keyBytes, keyBytes.hashCode());
    // Identify the block containing the given key using the first-key information.
    // firstKeys is set up in loadKeyDetails().
    ByteArray firstKeyBytes = ByteArrayUtils.findMaxElementLessThanTarget(firstKeys, keyBytes);
    // key not in range (less than the first key of the first block)
    if (firstKeyBytes == null) return null;
    // firstKey -> start offset of its block -> offset of the block's trailer.
    // The offset of the value within the block is recorded in the block trailer.
    long blockStart = firstKeyInBlock_.get(firstKeyBytes);
    long blockTrailerStart = blockOffsetInfo_.get(blockStart);
    // load the value offset
    int valueOffset = getValueOffset(keyBytes, blockTrailerStart);
    if (valueOffset == troveNoEntryValue) return null;
    // Whole blocks are cached: once any key of a block has been read, the other keys
    // of the same block are served from memory instead of the file — this favors
    // sequential reads.
    MappedByteBuffer blockMapBuffer = cachedByteBuffers_.get(firstKeyBytes);
    if (blockMapBuffer == null) {
      synchronized (cachedByteBuffers_) {
        // Double-checked under the cache lock so the block is mapped only once.
        blockMapBuffer = cachedByteBuffers_.get(firstKeyBytes);
        if (blockMapBuffer == null) {
          // Map the whole data block: start at blockStart, length is
          // blockTrailerStart - blockStart, i.e. all values of this block:
          // |val1,val2,...|
          // |<blockStart  |<blockTrailerStart
          blockMapBuffer = raf_.map(MapMode.READ_ONLY, blockStart, blockTrailerStart - blockStart);
          // Cache key is the block's first key, cached content is the whole block —
          // the same work the constructor does when preloadAllValues is set.
          cachedByteBuffers_.put(firstKeyBytes, blockMapBuffer);
        }
      }
    }
    // The block is in memory now; slice() so concurrent readers do not share a position.
    ByteBuffer slice = blockMapBuffer.slice();
    // Seek to the value's offset inside the block.
    slice.position(valueOffset);
    // A value is stored as |vInt valueLength|value bytes|, so read in that order.
    int valueLength = CompressionUtils.readVInt(slice);
    byte[] value = new byte[valueLength];
    slice.get(value);
    // Values written compressed must be uncompressed on read.
    if (valuesCompressed)
      value = Snappy.uncompress(value, 0, value.length);
    return value;
  }

  /* NOTE:
   * When offset preloading is disabled, this method does a linear search over all the keys
   * in the given block trailer to find the matching key and retrieve its value offset.
   * Searching a single-block DMap containing N keys will be slower than searching an
   * M-block DMap with each block containing a subset of the keys.
   */
  private int getValueOffset(ByteArray keyBytes, Long blockTrailerStartOffset) throws IOException {
    int valueOffset = troveNoEntryValue;
    if (!preloadAllKeyOffsets) {
      // Key offsets were not preloaded: parse the raw block trailer on the fly.
      // The trailer layout is |numKeys|keyLen|key|offset|keyLen|key|offset|...,
      // matching what the builder wrote.
      ByteBuffer trailerBuffer = blockTrailerBuffer_.get(blockTrailerStartOffset).slice();
      // load key count - vInt
      int numKeysInBlock = CompressionUtils.readVInt(trailerBuffer);
      // Linear search over all keys recorded in this trailer.
      for (int count = 0; count < numKeysInBlock; count++) {
        int keyLen = CompressionUtils.readVInt(trailerBuffer);
        byte[] currentkey = new byte[keyLen];
        trailerBuffer.get(currentkey);
        ByteArray currentKeyBytes = new ByteArray(currentkey);
        // The offset must be read even on a mismatch to stay aligned on record boundaries.
        int offset = CompressionUtils.readVInt(trailerBuffer);
        logger_.debug("Comparing {} and {} : {}", keyBytes, currentKeyBytes, keyBytes.compareTo(currentKeyBytes));
        if (keyBytes.compareTo(currentKeyBytes) == 0) {
          valueOffset = offset;
          break;
        }
      }
    } else {
      // Key offsets were preloaded: just look up the parsed map for this trailer.
      // The trailer records, for every key of the block, the offset of its value
      // inside the block:
      // value1,value2,... | key1,value1'offset
      //   ^------------------------|
      TObjectIntHashMap<ByteArray> tmpMap = blockTrailerKeys.get(blockTrailerStartOffset);
      if (tmpMap != null) {
        valueOffset = tmpMap.get(keyBytes);
      }
    }
    return valueOffset;
  }

  // Seek to header position 13 and read the file position of the global trailer.
  private long getGlobalTrailerOffset() throws IOException {
    raf_.position(DEFAULT_LOC_FOR_TRAILER_OFFSET);
    return raf_.readLong();
  }

  /**
   * Parses (or memory-maps) one block trailer.
   *
   * @param trailerStartOffset Start offset of the block trailer in the file.
   * @param trailerSize Trailer length, computed as the next block's start minus this trailer's start.
   * @throws IOException if the underlying file cannot be mapped.
   */
  private void processBlockTrailer(long trailerStartOffset, long trailerSize) throws IOException {
    // Map the whole block trailer. The trailer associates each key with the offset
    // of its value inside the data block, so a key can be resolved to the position
    // where its value bytes start.
    MappedByteBuffer trailerBuffer = raf_.map(MapMode.READ_ONLY, trailerStartOffset, trailerSize);
    if (!preloadAllKeyOffsets) {
      // Do not parse now: keep the raw mapped trailer. get(key) will then have to
      // parse the trailer on every lookup; preloading would instead parse it once
      // here so lookups become a simple in-memory map access.
      blockTrailerBuffer_.put(trailerStartOffset, trailerBuffer);
    } else {
      // The trailer starts with the key count, written with writeVInt by the builder,
      // so it is read back with readVInt.
      int numKeysInBlock = CompressionUtils.readVInt(trailerBuffer);
      // Size the map so numKeysInBlock entries fit at the configured load factor.
      TObjectIntHashMap<ByteArray> tmpKeyOffsetMap =
          new TObjectIntHashMap<>((int) (numKeysInBlock / troveLoadFactor + 0.5), troveLoadFactor, troveNoEntryValue);
      // Read in the same order the builder wrote: keyLen, key, value offset.
      for (int count = 0; count < numKeysInBlock; count++) {
        int keyLen = CompressionUtils.readVInt(trailerBuffer);
        byte[] currentkey = new byte[keyLen];
        trailerBuffer.get(currentkey);
        ByteArray currentKeyBytes = new ByteArray(currentkey);
        int offset = CompressionUtils.readVInt(trailerBuffer);
        // key -> offset of its value in this block.
        tmpKeyOffsetMap.put(currentKeyBytes, offset);
      }
      // One data block holds many values, so one trailer holds many keys:
      // record trailerStartOffset -> all of this trailer's key/offset pairs.
      blockTrailerKeys.put(trailerStartOffset, tmpKeyOffsetMap);
    }
  }

  private void loadKeyDetails() throws IOException {
    // Positions the file at the global trailer and reads the block count.
    int numBlocks = getBlockCount();
    logger_.debug("Number of blocks in file : {}", numBlocks);
    long blockStart;
    long blockTrailerStart;
    long prevBlockTrailerStart = -1;
    for (int blockCount = 0; blockCount < numBlocks; ++blockCount) {
      // Read order matches the builder's global-trailer record:
      // blockStart, blockTrailerStart, firstKeyLen, firstKey.
      blockStart = raf_.readVLong();
      blockTrailerStart = raf_.readVLong();
      int firstKeySize = raf_.readVInt();
      byte[] firstKeyBytes = new byte[firstKeySize];
      // NOTE(review): the return value of read() is ignored — assumes
      // ExtendedFileChannel.read fills the array completely; verify.
      raf_.read(firstKeyBytes);
      firstKeyInBlock_.put(new ByteArray(firstKeyBytes), blockStart);
      blockOffsetInfo_.put(blockStart, blockTrailerStart);
      // A trailer's size is only known once the NEXT block's start is known:
      //
      // Block-1|Block-1-Trailer|Block-2|Block-2-Trailer
      //        |prevBlockTrailerStart
      //                        |blockStart
      //        |<------------->|  trailer to process
      //
      // So iteration i processes the trailer of block i-1 (nothing on i == 0),
      // and the LAST trailer is processed after the loop, bounded by the
      // global trailer offset.
      if (blockCount > 0) {
        // compute the previous block's trailer size and store the information
        processBlockTrailer(prevBlockTrailerStart, blockStart - prevBlockTrailerStart);
      }
      prevBlockTrailerStart = blockTrailerStart;
    }
    processBlockTrailer(prevBlockTrailerStart, getGlobalTrailerOffset() - prevBlockTrailerStart);
    // load all the first keys for binary search during get()
    firstKeys = new ByteArray[firstKeyInBlock_.size()];
    firstKeyInBlock_.keySet().toArray(firstKeys);
  }

  /**
   * Returns an iterator for this DMap, with different implementations for different
   * preload settings. These iterators are NOT thread-safe.
   *
   * @return an iterator for the current dmap
   */
  public EntryIterator entryIterator() {
    if (preloadAllKeyOffsets)
      return new EntryIteratorForPreloadedKeys();
    else
      return new EntryIteratorWithoutPreloading();
  }

  /**
   * Iterator used when key offsets were NOT preloaded: it walks the raw mapped block
   * trailers in blockTrailerBuffer_ and parses key records on the fly.
   */
  private class EntryIteratorWithoutPreloading implements EntryIterator {

    /** Iterates over the raw trailers, one per data block. */
    Iterator<MappedByteBuffer> blockIterator_;
    /** Slice of the trailer currently being consumed. */
    ByteBuffer curBuffer_;
    /** Number of keys in the current trailer. */
    int curBlockKeyNum_;
    /** Index of the next key to read within the current trailer. */
    int curKey_;

    private EntryIteratorWithoutPreloading() {
      // Advancing from one trailer to the next happens inside next().
      blockIterator_ = blockTrailerBuffer_.values().iterator();
      if (blockIterator_.hasNext()) {
        // current block trailer
        curBuffer_ = blockIterator_.next().slice();
        // the first field is the number of keys in this block/trailer
        curBlockKeyNum_ = CompressionUtils.readVInt(curBuffer_);
      } else {
        curBuffer_ = null;
        curBlockKeyNum_ = 0;
      }
      curKey_ = 0;
    }

    @Override
    public boolean hasNext() {
      return curKey_ < curBlockKeyNum_ || blockIterator_.hasNext();
    }

    @Override
    public Entry next() throws IOException {
      // TODO: make it more efficient if necessary
      if (curKey_++ < curBlockKeyNum_) {
        // record layout: keyLen, key, offset
        int keyLen = CompressionUtils.readVInt(curBuffer_);
        byte[] key = new byte[keyLen];
        curBuffer_.get(key);
        // Skip the offset so the buffer stays aligned on |keyLen,key,offset| boundaries.
        CompressionUtils.readVInt(curBuffer_);
        // Entry holds the key and its value.
        return new Entry(key, get(key));
      } else if (blockIterator_.hasNext()) {
        // Current trailer exhausted: move to the next one and reset the per-trailer
        // state (the constructor did this for the first trailer).
        curBuffer_ = blockIterator_.next().slice();
        curBlockKeyNum_ = CompressionUtils.readVInt(curBuffer_);
        curKey_ = 0;
        // Recurse to read the first record of the new trailer.
        return next();
      } else
        return null;
    }
  }

  /**
   * Iterator used when key offsets WERE preloaded: it walks the parsed
   * key -> offset maps in blockTrailerKeys.
   */
  private class EntryIteratorForPreloadedKeys implements EntryIterator {

    /** Iterates over the parsed trailers, one key->offset map per block. */
    Iterator<TObjectIntHashMap<ByteArray>> blockIterator_;
    /** Iterates over the keys of the current block. */
    Iterator<ByteArray> keyIterator_;
    /** Lookahead entry shared by hasNext()/next(). */
    Entry nextEntry;

    private EntryIteratorForPreloadedKeys() {
      blockIterator_ = blockTrailerKeys.values().iterator();
      if (blockIterator_.hasNext()) {
        keyIterator_ = blockIterator_.next().keySet().iterator();
      } else
        keyIterator_ = null;
      nextEntry = null;
    }

    @Override
    public boolean hasNext() throws IOException {
      if (nextEntry == null)
        nextEntry = getNextEntry();
      return nextEntry != null;
    }

    @Override
    public Entry next() throws IOException {
      if (nextEntry == null)
        nextEntry = getNextEntry();
      Entry tmpNextEntry = nextEntry;
      nextEntry = null;
      return tmpNextEntry;
    }

    private Entry getNextEntry() throws IOException {
      // TODO: make it more efficient if necessary
      // Keys remaining in the current block's key->offset map.
      if (keyIterator_ != null && keyIterator_.hasNext()) {
        ByteArray key = keyIterator_.next();
        return new Entry(key.getBytes(), get(key.getBytes()));
      }
      // Current block exhausted: advance to the next non-empty block.
      while (blockIterator_.hasNext()) {
        keyIterator_ = blockIterator_.next().keySet().iterator();
        if (keyIterator_.hasNext())
          // Recurse to emit the first key of the new block.
          return getNextEntry();
      }
      return null;
    }
  }

  /**
   * A not thread-safe iterator for DMap entries (byte[], byte[]).
   */
  public interface EntryIterator {
    boolean hasNext() throws IOException;
    Entry next() throws IOException;
  }

  /** An immutable (key, value) pair returned by the entry iterators. */
  public static class Entry {

    private final byte[] key;
    private final byte[] value;

    private Entry(byte[] key, byte[] value) {
      this.key = key;
      this.value = value;
    }

    public byte[] getKey() {
      return key;
    }

    public byte[] getValue() {
      return value;
    }
  }
}