package com.github.hoffart.dmap;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import com.github.hoffart.dmap.util.ByteArray;
import com.github.hoffart.dmap.util.CompressionUtils;
import com.github.hoffart.dmap.util.ExtendedFileChannel;
import org.iq80.snappy.Snappy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Builder for the DMap. The DMapBuilder is a write-once builder, as DMap is read-only.
 *
 * Improve:
 * - Spill keys to disk every K bytes.
 * - Make appendable.
 * - Make iterable.
 * - Compress using varint or delta-encoding.
 */
public class DMapBuilder {

  /** Default key-value block size (in bytes) - set to 1 MB. */
  private static final int DEFAULT_BLOCK_SIZE = 1048576;

  /** Current block size for the file. */
  private int blockSize_;

  /** Compress values. */
  private boolean compressValues_;

  /** Map file to write to. */
  private File mapFile_;

  /** Writer to the map file. */
  private ExtendedFileChannel output_;

  /** Temporary map file. */
  private File tmpMapFile_;

  /** Writer to the temp file. */
  private ExtendedFileChannel tmpOutput_;

  /** Tracks the number of entries written to (and later read back from) the temporary file. */
  private int entriesCount_;

  private final Logger logger_ = LoggerFactory.getLogger(DMapBuilder.class);

  public DMapBuilder(File mapFile) throws IOException {
    this(mapFile, DEFAULT_BLOCK_SIZE);
  }

  public DMapBuilder(File mapFile, int blockSize) throws IOException {
    this(mapFile, blockSize, true);
  }

  /**
   * Creates a file-backed DMap instance. The map data is stored on disk,
   * hence the name DMap = DiskMap.
   * @param mapFile Map file instance.
   * @param blockSize Size of a block (in bytes).
   * @param compressValues Whether values should be Snappy-compressed.
   * @throws IOException
   */
  public DMapBuilder(File mapFile, int blockSize, boolean compressValues) throws IOException {
    boolean success = mapFile.createNewFile();
    if (success) {
      blockSize_ = blockSize;
      compressValues_ = compressValues;
      mapFile_ = mapFile;
      try {
        // Create the temporary file that buffers raw key-value pairs until build().
        tmpMapFile_ = File.createTempFile("tmpDMap_", "_" + mapFile.getName());
      } catch (Exception e) {
        throw new IOException("Error creating intermediate file for: " + mapFile + ", cannot write.", e);
      }
      // Output stream for the temporary file.
      tmpOutput_ = new ExtendedFileChannel(new RandomAccessFile(tmpMapFile_, "rw").getChannel());
      // Output stream for the DMap itself, backed by mapFile.
      output_ = new ExtendedFileChannel(new RandomAccessFile(mapFile_, "rw").getChannel());
    } else {
      throw new IOException("Output map file already exists at: " + mapFile + ", cannot write.");
    }
  }

  // Adds one <key, value> pair to the map.
  // The pair is written to the temporary output stream first, i.e. appended to the
  // temporary file, in the order: key length, value length, key, value.
  public void add(byte[] key, byte[] value) throws IOException {
    tmpOutput_.writeInt(key.length);
    tmpOutput_.writeInt(value.length);
    tmpOutput_.write(key);
    tmpOutput_.write(value);
    // One more entry in the map.
    entriesCount_++;
  }

  // Builds the map file. add() only appends data to the temporary file;
  // build() writes the actual file. Simply appending the data is not enough -
  // the layout must support efficient get() reads.
  public void build() throws IOException {
    // Mapping from each key to its record offset in the temporary file.
    Map<ByteArray, Long> tmpKeyOffsetMap = new HashMap<>();
    // Flush first: without it the data is still in memory (in the output stream
    // object) and only reaches the temporary file after the flush.
    tmpOutput_.flush();
    tmpOutput_.close();
    // Reader for the temporary file.
    ExtendedFileChannel raf = new ExtendedFileChannel(new RandomAccessFile(tmpMapFile_, "r").getChannel());
    // key and value hold each pair previously passed to add(); the loops below read them back.
    byte[] key;
    byte[] value;
    long currentOffset = 0;
    try {
      // 1. Load the key->offset mapping from the temporary file into tmpKeyOffsetMap.
      //    key = the key of each written pair; offset = the position in the temporary file
      //    where that pair's record starts, i.e. the position of its keyLen field.
      //    This map exists so that, given a key, the offset can be looked up and the whole
      //    record read from that position in the temporary file.
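      // Illustration of the temp-file layout scanned by this pass (exactly what add() wrote):
      //
      //   record: [keyLen:int][valLen:int][key bytes][value bytes]
      //
      // currentOffset always points at the keyLen field of the record about to be read.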
logger_.debug("Keys to process: " + entriesCount_); for(int i=0;i<entriesCount_;i++) { //首先读取key的长度和value的长度,这也是<Key,Value>数据通过add写到临时文件中的顺序 int keyLen = raf.readInt(); int valLen = raf.readInt(); //读取key内容 key = new byte[keyLen]; raf.read(key); //key不能重复! if(tmpKeyOffsetMap.containsKey(new ByteArray(key))) { throw new IOException("Duplicate key encountered: " + key); } //key在临时文件中的offset.初始值为0,因为第一个key在文件的第0个位置 tmpKeyOffsetMap.put(new ByteArray(key), currentOffset); // ignore the value byte sequence (for the time being) and move to next record start pos //忽略value的值.先把所有的key->offset的对应关系放到map中 //读取完key之后,raf.position()现在在key的末尾=value的开始,加上valLen,即到了value的末尾,下一个key的开始 //定位到下一个key的offset. firstKeyLen|firstValLen|firestKey|firstVal|secKeyLen // keyLen valLen key value |<-currentOffset currentOffset = raf.position() + valLen; raf.position(currentOffset); } logger_.debug("Loaded " + tmpKeyOffsetMap.size() + " keys from temporary file"); // global header - version, entries count, block size, trailer offset // 2. 写入全局头信息: 版本,条目数,块大小,跟踪信息的offset(现在只是个占位符,在最后会进行更新) output_.writeInt(DMap.VERSION); output_.writeInt(tmpKeyOffsetMap.size()); output_.writeInt(blockSize_); output_.writeBool(compressValues_); // insert placeholder for trailer offset output_.writeLong(0); // 3. 所有的key,对key进行排序. 排序的目的是为了查找时快速定位key的位置 List<ByteArray> allKeys = new ArrayList<>(tmpKeyOffsetMap.keySet()); Collections.sort(allKeys); logger_.info("Writing map for " + allKeys.size() + " keys."); // 在全局头信息后面开始写入Map数据.当前的位置即Map的开始位置=第一个Block在文件中的offset // * Block在文件中的offset, 用于定位每个Block long globalOffset = output_.position(); // 数据块的开始信息,还没开始写数据,块的开始=0 // * 数据在每个Block中的offset, 用于定位每条数据 int currentBlockOffset = 0; // 当前数据块还可以写入多少字节,初始时整个数据块可写 int remainingBytes = blockSize_; // 当前数据块的第一个键 ByteArray firstKey = null; //Block级别的key->offset, 一个文件会有多个Block! //File = Block1 | Block2 | Block3 | ... // Map to store block-level key-offset pairs (to be written to each block trailer) Map<ByteArray, Integer> blockKeyOffset_ = new HashMap<>(); //数据块的起始和结束位置(全局) // Map to store blockStart-blockTrailerStart pair (to be written to global trailer) Map<Long, Long> blockTrailerOffsets = new HashMap<>(); //每个Block的第一个key:<CurrentBlockOffset, theFirstKeyOfThisBlock> // Map to store blockStart-firstKey pair (to be written to global trailer) Map<Long, ByteArray> blockFirstKey = new HashMap<>(); // 4. 开始循环处理每对Key,Value for (ByteArray keyBytes : allKeys) { long offset = tmpKeyOffsetMap.get(keyBytes); raf.position(offset); int keyLen = raf.readInt(); int valLen = raf.readInt(); value = new byte[valLen]; // position pointer at the starting of value data raf.position(raf.position() + keyLen); raf.read(value); if (compressValues_) { value = Snappy.compress(value); } //数据的长度,即每一个<Key,Value>键值对的值的长度 int dataLength = CompressionUtils.getVNumSize(value.length) + value.length; //一个KeyValue的value长度比BlockSize还大? if(dataLength > blockSize_) { throw new IOException("Data size ("+ dataLength +" bytes) greater than specified block size(" + blockSize_ + " bytes)"); } // 6. 当数据块写满后,更新BlockTrailer // write block trailer & reset variables //dataLength是每一个KeyValue, remainingBytes的初始值是BlockSize,在处理完一个KV后值会减少这个KV的长度 //如果当前写入的KV比剩余空间大,说明剩余的空间不够当前KV的写入, 则要重新创建一个文件,并重置变量 if(dataLength > remainingBytes) { logger_.debug("Key : " + keyBytes + " with value doesnt fit in remaining "+ remainingBytes + " bytes."); // globalOffset记录了每个Block+BlockTrailer在文件中的offset. // 一个完整的数据块包括Block+BlockTrailer. 
          globalOffset = updateBlockTrailer(blockKeyOffset_, blockTrailerOffsets, blockFirstKey, firstKey, globalOffset);
          logger_.debug("Creating new block @ " + globalOffset);
          currentBlockOffset = 0;
          remainingBytes = blockSize_;
          firstKey = null;
        }

        // First key of each block: firstKey is reset to null whenever a new block is
        // created, so this assignment only runs at the beginning of a block.
        if (firstKey == null) firstKey = keyBytes;

        // 5. Write the value into the block.
        // Only the value is written here; the key is written later, when
        // updateBlockTrailer is called (and if the data never fills a whole block,
        // that call happens once after the loop).
        logger_.debug("write@ " + globalOffset + " key: " + keyBytes + " (hash: " + keyBytes.hashCode() + ")");
        output_.writeVInt(value.length);
        // Write the value (the key can be retrieved from the block trailer).
        output_.write(value);
        // Store the key-offset pair (needed for the block trailer).
        // This is the key->offset mapping *within* the current block, whereas
        // tmpKeyOffsetMap held each key's offset within the whole temporary file.
        // Note: the mapped value is the offset of this key's value inside the block;
        // given the key, the value's position in the block is known and the value can
        // be read. The temporary file stored each whole pair contiguously (appended);
        // the final file instead gathers the values into the block and records the
        // key -> in-block value offset mapping in the block trailer.
        blockKeyOffset_.put(keyBytes, currentBlockOffset);
        // The block size is the total length of the values (the arithmetic here uses
        // dataLength), excluding keys. The block offset grows as remainingBytes shrinks.
        currentBlockOffset += dataLength;
        remainingBytes -= dataLength;
      }

      // 7. Write the last block's trailer. The last block's data was written in the loop
      //    above, but the block may not have reached blockSize_, so updateBlockTrailer
      //    must be called once more here.
      globalOffset = updateBlockTrailer(blockKeyOffset_, blockTrailerOffsets, blockFirstKey, firstKey, globalOffset);

      // 8. Write the GlobalBlockTrailer.
      // The overall file format:
      //   Header | Block1's Data | Block1's Trailer | Block2's Data | Block2's Trailer | ... | GlobalBlockTrailer
      // A complete data block (BlockData + BlockTrailer) is laid out as:
      //   | Block1                | Block1's Trailer
      //   | v1Len,v1,v2Len,v2,... | numKeys,k1Len,k1,v1'off,k2Len,k2,v2'off
      // The GlobalTrailer is laid out as:
      //   numBlocks,block1'off,block1'sTrailer'off,key1Len,key1,...
      // Write the global trailer (block start offset -> block trailer offset pairs & first key in the block).
      // blockTrailerOffsets maps each block's offset to that block's trailer offset.
      // Each block trailer's offset can be read as the block's end position, because the
      // end of the block data is the start of the block trailer.
      output_.writeVInt(blockTrailerOffsets.size()); // number of blocks
      List<Long> allBlockKeys = new ArrayList<>(blockTrailerOffsets.keySet());
      // The map is unordered, but block positions are fixed: a block written earlier
      // always has a smaller offset, so sorting ascending restores the block order.
      Collections.sort(allBlockKeys);
      for (long blockStart : allBlockKeys) {
        // Write each block's start and end position in the file
        // (the end position being the block trailer's offset).
        output_.writeVLong(blockStart);
        output_.writeVLong(blockTrailerOffsets.get(blockStart));
        // blockFirstKey maps each block's start offset to the first key of that block.
        byte[] tmpFirstKeyByte = blockFirstKey.get(blockStart).getBytes();
        // Write the first-key info to the global trailer.
        output_.writeVInt(tmpFirstKeyByte.length);
        output_.write(tmpFirstKeyByte);
      }
      raf.close();
      output_.flush();
      output_.close();

      // Fill in the previously created placeholder for the trailer offset.
      raf = new ExtendedFileChannel(new RandomAccessFile(mapFile_, "rw").getChannel());
      raf.position(DMap.DEFAULT_LOC_FOR_TRAILER_OFFSET);
      logger_.info("DMap Trailer start at " + globalOffset + ".");
      // Patch the GlobalBlockTrailer's offset into the placeholder position written in
      // step 2. Once every Block + BlockTrailer (including the last block's trailer) has
      // been written, globalOffset is the start of the GlobalBlockTrailer.
      raf.writeLong(globalOffset);
    } finally {
      // Delete the intermediate temp file.
      tmpMapFile_.delete();
      raf.close();
    }
  }

  /**
   * Returns the new global offset after updating the block trailer.
   * @param keyOffsets key -> offset of that key's value within the current block
   * @param blockTrailerOffsets block start offset -> block trailer offset
   * @param blockFirstKey block start offset (global file offset) -> first key of that block
   * @param firstKey first key of the current block
   * @param globalOffset file offset of the current block
   * @return the position after the trailer, i.e. the start of the next block
   * @throws IOException
   */
  private long updateBlockTrailer(Map<ByteArray, Integer> keyOffsets,
                                  Map<Long, Long> blockTrailerOffsets,
                                  Map<Long, ByteArray> blockFirstKey,
                                  ByteArray firstKey,
                                  long globalOffset) throws IOException {
    // Current position in the file = end of the current block. Note that a block holds
    // only values, no keys; the end of the block data is where the keys start. A block
    // is simply the data section:
    //   |value1|value2|....|key1|v1-off|key2|v2-off... |val10|val11|....|key10|val10-off|.|...|
    //   |<-----Block A---->|<----Block A Trailer------>|<--Block B----->|<--Block B Trailer-->|
    // blockTrailerOffsets: value1'off->key1'off, value10'off->key10'off
    // blockFirstKey:       value1'off->key1,     value10'off->key10
    long trailerOffset = output_.size();
    // Write the number of entries (keys) in the current block.
    output_.writeVInt(keyOffsets.size());
    for (Entry<ByteArray, Integer> e : keyOffsets.entrySet()) {
      ByteArray byteArray = e.getKey();
      // Write keyLen and key,
      output_.writeVInt(byteArray.getBytes().length);
      output_.write(byteArray.getBytes());
      // then the offset of this key's value within the current block.
      output_.writeVInt(e.getValue());
    }
    keyOffsets.clear();
    // Track each block's trailer offset.
    blockTrailerOffsets.put(globalOffset, trailerOffset);
    // Track the first key of each block.
    blockFirstKey.put(globalOffset, firstKey);
    // Both maps above are keyed by globalOffset, the file offset of this data block:
    // blockTrailerOffsets maps it to the trailer's start position in the file, and
    // blockFirstKey maps it to the block's first key.
    // After updating the block trailer, return the current position: the end of the
    // BlockTrailer, which is also the start of the next block, since a complete data
    // block consists of Block + BlockTrailer.
    return output_.position();
  }
}
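
// Minimal usage sketch (illustrative, not part of the original source). The file
// name and the key/value strings below are hypothetical; keys must be unique, and
// the builder is write-once, so build() is called exactly once at the end.
class DMapBuilderUsageExample {
  public static void main(String[] args) throws IOException {
    File mapFile = new File("example.dmap"); // hypothetical path; the file must not exist yet
    DMapBuilder builder = new DMapBuilder(mapFile); // default 1 MB blocks, Snappy compression
    builder.add("alpha".getBytes(java.nio.charset.StandardCharsets.UTF_8),
                "first value".getBytes(java.nio.charset.StandardCharsets.UTF_8));
    builder.add("beta".getBytes(java.nio.charset.StandardCharsets.UTF_8),
                "second value".getBytes(java.nio.charset.StandardCharsets.UTF_8));
    builder.build(); // sorts keys, writes blocks, block trailers, and the global trailer
  }
}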