DMapBuilder.java example

Explorer
zava-master
- src
package com.github.hoffart.dmap;

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import com.github.hoffart.dmap.util.ByteArray;
import com.github.hoffart.dmap.util.CompressionUtils;
import com.github.hoffart.dmap.util.ExtendedFileChannel;
import org.iq80.snappy.Snappy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Builder for the DMap. The DMapBuilder is a write-once builder, as DMap is read-only.
 * 
 * Improve:
 *  - Spill keys to disk every K bytes.
 *  - Make appendable.
 *  - Make iterable.
 *  - Compress using varint or delta-encoding
 */
public class DMapBuilder {

  /** Default Key-Value Block size (in bytes) - set to 1 MB. */
  private static final int DEFAULT_BLOCK_SIZE = 1048576;

  /** Current block size for the file*/
  private int blockSize_;
  
  /** Compress values */
  private boolean compressValues_;

  /** Map file to write to. */
  private File mapFile_;

  /** Writer to the map file. */
  private ExtendedFileChannel output_;

  /** Temporary Map file. */
  private File tmpMapFile_;

  /** Writer to the temp file. */
  private ExtendedFileChannel tmpOutput_;

  /** Keep track of number of entries retrieved from original file */
  private int entriesCount_;

  private final Logger logger_ = LoggerFactory.getLogger(DMapBuilder.class);

  public DMapBuilder(File mapFile) throws IOException {
    this(mapFile, DEFAULT_BLOCK_SIZE);
  }
  
  public DMapBuilder(File mapFile, int blockSize) throws IOException {
    this(mapFile, blockSize, true);
  }

  /**
   * DMap的文件实例. 将Map数据以磁盘Disk形式存储. 所以叫做DMap=DiskMap
   * @param mapFile Map File instance.
   * @param blockSize Size of a block (in bytes).
   * @throws IOException
   */
  public DMapBuilder(File mapFile, int blockSize, boolean compressValues) throws IOException {
    boolean success = mapFile.createNewFile();
    if (success) {
      blockSize_ = blockSize;
      compressValues_ = compressValues;
      mapFile_ = mapFile;
      try {
        //创建临时文件
        tmpMapFile_ = File.createTempFile("tmpDMap_", "_" + mapFile.getName());
      } catch (Exception e) {
        throw new IOException("Error creating intermediate file: " + tmpMapFile_ + ", cannot write.");
      }
      //临时文件输出流
      tmpOutput_ = new ExtendedFileChannel(new RandomAccessFile(tmpMapFile_, "rw").getChannel());
      
      //DMap的输出流, DMap用的文件是mapFile.
      output_ = new ExtendedFileChannel(new RandomAccessFile(mapFile_, "rw").getChannel());
    } else {
      throw new IOException("Output map file already exists at: " + mapFile
          + ", cannot write.");
    }
  }

  //往Map中添加一对<Key,Value>
  public void add(byte[] key, byte[] value) throws IOException {
    //首先写到临时文件输出流, 即写到临时文件中
    //分别是Key的长度,Value的长度,然后才是Key,Value
    tmpOutput_.writeInt(key.length);
    tmpOutput_.writeInt(value.length);
    tmpOutput_.write(key);
    tmpOutput_.write(value);
    //Map的条目数量+1
    entriesCount_++;
  }

  //构建MapFile, 添加Map数据时,数据只是追加到临时文件中.
  //build才是正式写到文件中,不能简单地追加数据到文件中.要考虑到get操作的高效读取.
  public void build() throws IOException {
    //临时文件中key和offset的映射
    Map<ByteArray, Long> tmpKeyOffsetMap =  new HashMap<>();
    //要先flush,如果没有flush,则数据还是在内存中:输出流对象中.flush之后,才写到临时文件中
    tmpOutput_.flush();
    tmpOutput_.close();

    //读取临时文件
    ExtendedFileChannel raf = new ExtendedFileChannel(new RandomAccessFile(tmpMapFile_, "r").getChannel());
    //key和value是之前add到输出流中的每对数据,下面会循环获取
    byte[] key;
    byte[] value;
    long currentOffset = 0;

    try {
      // 1. 从临时文件中加载key-->offset的映射关系到tmpKeyOffsetMap中
      // key=写入键值对的key, offset:在临时文件中每一对键值对的开始写入位置.即keyLen的开始位置.
      // 保存这个Map的目的是,当知道key后,根据map获取offset,并定位到临时文件的这个位置,开始读取整个键值对.
      logger_.debug("Keys to process: " + entriesCount_);
      for(int i=0;i<entriesCount_;i++) {
        //首先读取key的长度和value的长度,这也是<Key,Value>数据通过add写到临时文件中的顺序
        int keyLen = raf.readInt();
        int valLen = raf.readInt();

        //读取key内容
        key = new byte[keyLen];
        raf.read(key);
        //key不能重复!
        if(tmpKeyOffsetMap.containsKey(new ByteArray(key))) {
          throw new IOException("Duplicate key encountered: " + key);
        }

        //key在临时文件中的offset.初始值为0,因为第一个key在文件的第0个位置
        tmpKeyOffsetMap.put(new ByteArray(key), currentOffset);
        // ignore the value byte sequence (for the time being) and move to next record start pos
        //忽略value的值.先把所有的key->offset的对应关系放到map中

        //读取完key之后,raf.position()现在在key的末尾=value的开始,加上valLen,即到了value的末尾,下一个key的开始
        //定位到下一个key的offset.  firstKeyLen|firstValLen|firestKey|firstVal|secKeyLen
        //                           keyLen      valLen      key      value |<-currentOffset
        currentOffset = raf.position() + valLen;
        raf.position(currentOffset);
      }
      logger_.debug("Loaded " + tmpKeyOffsetMap.size() + " keys from temporary file");

      // global header - version, entries count, block size, trailer offset
      // 2. 写入全局头信息: 版本,条目数,块大小,跟踪信息的offset(现在只是个占位符,在最后会进行更新)
      output_.writeInt(DMap.VERSION);
      output_.writeInt(tmpKeyOffsetMap.size());
      output_.writeInt(blockSize_);
      output_.writeBool(compressValues_);
      // insert placeholder for trailer offset
      output_.writeLong(0);

      // 3. 所有的key,对key进行排序. 排序的目的是为了查找时快速定位key的位置
      List<ByteArray> allKeys = new ArrayList<>(tmpKeyOffsetMap.keySet());
      Collections.sort(allKeys);
      logger_.info("Writing map for " + allKeys.size() + " keys.");

      // 在全局头信息后面开始写入Map数据.当前的位置即Map的开始位置=第一个Block在文件中的offset
      // * Block在文件中的offset, 用于定位每个Block
      long globalOffset = output_.position();
      // 数据块的开始信息,还没开始写数据,块的开始=0
      // * 数据在每个Block中的offset, 用于定位每条数据
      int currentBlockOffset = 0;
      // 当前数据块还可以写入多少字节,初始时整个数据块可写
      int remainingBytes = blockSize_;
      // 当前数据块的第一个键
      ByteArray firstKey = null;

      //Block级别的key->offset, 一个文件会有多个Block!
      //File = Block1 | Block2 | Block3 | ...
      // Map to store block-level key-offset pairs (to be written to each block trailer)
      Map<ByteArray, Integer> blockKeyOffset_ = new HashMap<>();
      //数据块的起始和结束位置(全局)
      // Map to store blockStart-blockTrailerStart pair (to be written to global trailer)
      Map<Long, Long> blockTrailerOffsets = new HashMap<>();
      //每个Block的第一个key:<CurrentBlockOffset, theFirstKeyOfThisBlock>
      // Map to store blockStart-firstKey pair (to be written to global trailer)
      Map<Long, ByteArray> blockFirstKey = new HashMap<>();

      // 4. 开始循环处理每对Key,Value
      for (ByteArray keyBytes : allKeys) {
        long offset = tmpKeyOffsetMap.get(keyBytes);
        raf.position(offset);
        int keyLen = raf.readInt();
        int valLen = raf.readInt();
        value = new byte[valLen];
        // position pointer at the starting of value data
        raf.position(raf.position() + keyLen);
        raf.read(value);

        if (compressValues_) {
          value = Snappy.compress(value);
        }

        //数据的长度,即每一个<Key,Value>键值对的值的长度
        int dataLength = CompressionUtils.getVNumSize(value.length) + value.length;
        //一个KeyValue的value长度比BlockSize还大?
        if(dataLength > blockSize_) {
          throw new IOException("Data size ("+ dataLength +" bytes) greater than specified block size(" + blockSize_ + " bytes)");
        }

        // 6. 当数据块写满后,更新BlockTrailer
        // write block trailer & reset variables
        //dataLength是每一个KeyValue, remainingBytes的初始值是BlockSize,在处理完一个KV后值会减少这个KV的长度
        //如果当前写入的KV比剩余空间大,说明剩余的空间不够当前KV的写入, 则要重新创建一个文件,并重置变量
        if(dataLength > remainingBytes) {
          logger_.debug("Key : " + keyBytes + " with value doesnt fit in remaining "+ remainingBytes + " bytes.");
          // globalOffset记录了每个Block+BlockTrailer在文件中的offset.
          // 一个完整的数据块包括Block+BlockTrailer.
          globalOffset = updateBlockTrailer(blockKeyOffset_, blockTrailerOffsets, blockFirstKey, firstKey, globalOffset);
          logger_.debug("Creating new block @ " + globalOffset);
          currentBlockOffset = 0;
          remainingBytes = blockSize_;
          firstKey = null;
        }
        //每个Block的firstKey. 在新建Block的时候,重置了firstKey=null,这里会再次执行. 只有在每个Block的开始才会执行!
        if(firstKey == null) firstKey = keyBytes;

        // 5. 先写入value到Block
        //写入Value. 还没有写入key?? 什么时候写入key? 调用updateBlockTrailer的时候. 如果数据没有满一个Block,则不会调用
        logger_.debug("write@ " + globalOffset + " key: " + keyBytes + "" + " (hash: " + keyBytes.hashCode() + ")");
        output_.writeVInt(value.length);
        // write value (key can be retrieved from block trailer)
        output_.write(value);
        // store key-offset pair (needed for block trailer)
        //在每一个Block里的key->offset的映射[offset针对的是当前Block]. 之前有一个tmpKeyOffsetMap是key在整个临时文件中的offset
        //注意:blockKeyOffset的value是Map的key对应的value在Block中的offset. 通过key得到value在Block中的位置即可读取出value
        //临时文件是整个键值对一起写入(追加),而写到文件中是先把value都抽取出来放在Block里,然后在BlockTrailer里记录key和value在block里的offset的映射
        blockKeyOffset_.put(keyBytes, currentBlockOffset);
        //Block的大小是Value的长度总和(因为这里的加减是dataLength),不包括Key. blockOffset增加,则remainingBytes减少
        currentBlockOffset += dataLength;
        remainingBytes -= dataLength;
      }

      // 7. 写入最后一个数据块的Trailer信息. 最后一个数据块已经在上面的循环中写入了.但是可能没有满足blockSize_,需要在这里调用一次更新BlockTrailer
      // write the last block trailer information
      globalOffset = updateBlockTrailer(blockKeyOffset_, blockTrailerOffsets, blockFirstKey, firstKey, globalOffset);

      // 8. 写入GlobalBlockTrailer信息.
      //总的文件格式如下:
      // Header | Block1's Data | Block1's Trailer | Block2's Data | Block2's Trailer | ... | GlobalBlockTrailer
      // 一个完整的数据块信息(BlockData+BlockTrailer)的格式如下:
      //|Block1                 |Block1's Trailer
      // v1Len,v1,v2Len,v2,....,|numKeys,k1Len,k1,v1'off,k2Len,k2,v2'off
      // GlobalTrailer的格式如下:
      // numBlocks,block1'off,block1's trailer'off,key1Len,key1

      // write global trailer (block start offset->block trailer offset pair & first key in the block)
      // blockTrailerOffsets记录的是每个Block的offset->这个Block的Trailer的offset的映射关系
      // 每个BlockTrailer的offset可以理解为每个Block的结束位置.因为Block的结束是BlockTrailer的开始
      output_.writeVInt(blockTrailerOffsets.size()); //有多少个数据块
      List<Long> allBlockKeys = new ArrayList<>(blockTrailerOffsets.keySet());
      // Map是无序的.但是Block的位置是固定的.由于先写入的Block的offset肯定最小.按照升序排序,还原BlockOffset的有序性
      Collections.sort(allBlockKeys);
      for(long blockStart : allBlockKeys) {
        //写入每个Block在文件中的开始位置和结束位置(Block的结束位置理解为BlockTrailer的offset)
        output_.writeVLong(blockStart);
        output_.writeVLong(blockTrailerOffsets.get(blockStart));
        //blockFirstKey放的是BlockStartOffset和这个Block的第一个Key
        byte[] tmpFirstKeyByte = blockFirstKey.get(blockStart).getBytes();
        // write the first key info to global trailer
        output_.writeVInt(tmpFirstKeyByte.length);
        output_.write(tmpFirstKeyByte);
      }
      raf.close();
      output_.flush();
      output_.close();

      // fill in the previously created placeholder for trailer offset
      raf = new ExtendedFileChannel(new RandomAccessFile(mapFile_, "rw").getChannel());
      raf.position(DMap.DEFAULT_LOC_FOR_TRAILER_OFFSET);
      logger_.info("DMap Trailer start at " + globalOffset + ".");
      //在指定位置填充GlobalBlockTrailer的Offset, 这个位置首先写入了BlockCount.见前面!
      //所有Block+BlockTrailer(包括最后一个数据块对应的Trailer)都写完后,写入GlobalBlockTrailer.
      raf.writeLong(globalOffset);
    } finally {
      // delete the intermediate temp file
      tmpMapFile_.delete();
      raf.close();
    }
  }

    /**
     * Returns new global offset after updating the block trailer
     * @param keyOffsets key-->key对应的value在当前Block的offset
     * @param blockTrailerOffsets
     * @param blockFirstKey 当前Block的Offset(Block在文件的全局offset)-->当前Block的第一个Key
     * @param firstKey 当前Block的firstKey
     * @param globalOffset 当前Block在文件中的offset位置
     * @return
     * @throws IOException
     */
  private long updateBlockTrailer(Map<ByteArray, Integer> keyOffsets,
      Map<Long, Long> blockTrailerOffsets,
      Map<Long, ByteArray> blockFirstKey,
      ByteArray firstKey, long globalOffset) throws IOException {
    //文件当前的位置,即当前Block的末尾.注意Block不包括Key,只包含Value. Block的末尾是准备写key的开始. Block可以理解为数据块
    //|value1|value2|....|key1|v1-off|key2|v2-off... |val10|val11|....|key10|val10-off|.|...|
    //|<-----Block A---->|<----Block A Trailer------>|<--Block B----->|<--Block B Trailer-->|
    //blockTrailerOffset: value1'off->key1'off, value10'off->key10'off
    //blockFirstKey: value1'off->key1, value10'off->key10
    long trailerOffset = output_.size();

    // write number of entries in the current block 当前Block的key数量
    output_.writeVInt(keyOffsets.size());
    for(Entry<ByteArray, Integer> e : keyOffsets.entrySet()) {
      ByteArray byteArray = e.getKey();
      //写入keyLen, Key,
      output_.writeVInt(byteArray.getBytes().length);
      output_.write(byteArray.getBytes());
      //key对应的value在当前Block的offset
      output_.writeVInt(e.getValue());
    }
    keyOffsets.clear();
    // track block offset info 跟踪每个块的trailer的offset
    blockTrailerOffsets.put(globalOffset, trailerOffset);
    // track first keys in each block 跟踪每个Block的firstKey
    blockFirstKey.put(globalOffset, firstKey);
    // 上面两个Map的key都是globalOffset, 即每个数据块在文件中的offset.
    // blockTrailerOffsets记录的映射值是Trailer在文件中的开始位置
    // blockFirstKey记录的是这个数据块的第一个键

    // 更新完BlockTrailer后,返回当前位置.即BlockTrailer的结束位置. 也是下一个Block的开始位置.
    // 因为一个完整的数据块信息包括了Block+BlockTrailer.
    return output_.position();
  }
}