/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.infrastructure.io;
import com.facebook.infrastructure.config.DatabaseDescriptor;
import com.facebook.infrastructure.utils.BasicUtilities;
import com.facebook.infrastructure.utils.BloomFilter;
import com.facebook.infrastructure.utils.LogUtil;
import com.facebook.infrastructure.db.RowMutation;
import org.apache.log4j.Logger;
import org.apache.commons.lang.ArrayUtils;
import java.io.File;
import java.io.IOException;
import java.util.*;
/**
* This class is built on top of the SequenceFile. It stores
 * data on disk in sorted fashion. However, the sorting is up to
* the application. This class expects keys to be handed to it
* in sorted order. SSTable is broken up into blocks where each
* block contains 128 keys. At the end of every block the block
* index is written which contains the offsets to the keys in the
* block. SSTable also maintains an index file to which every 128th
* key is written with a pointer to the block index which is the block
* that actually contains the key. This index file is then read and
* maintained in memory. SSTable is append only and immutable. SSTable
* on disk looks as follows:
*
* -------------------------
* |------------------------|<-------|
* | | | BLOCK-INDEX PTR
* | | |
* |------------------------|--------
* |------------------------|<-------|
* | | |
* | | | BLOCK-INDEX PTR
* | | |
* |------------------------|---------
* |------------------------|<--------|
* | | |
* | | |
* | | | BLOCK-INDEX PTR
* | | |
* |------------------------| |
* |------------------------|----------
* |------------------------|-----------------> BLOOM-FILTER
* version-info <--|----------|-------------|-------> relative offset to last block index.
*
* Author : Avinash Lakshman ( alakshman@facebook.com) & Prashant Malik ( pmalik@facebook.com )
*/
public class SSTable
{
/* Class-wide logger shared by all SSTable instances. */
private static Logger logger_ = Logger.getLogger(SSTable.class);
/* use this as a monitor to lock when loading index. */
private static Object indexLoadLock_ = new Object();
/* Every 128th key is an index. */
private static final int indexInterval_ = 128;
/* Key associated with block index written to disk */
public static final String blockIndexKey_ = "BLOCK-INDEX";
/* Position in SSTable after the first Block Index.
   NOTE(review): static, but assigned per writer instance in the write ctor;
   concurrent writers would clobber each other's value — confirm single-writer use. */
private static long positionAfterFirstBlockIndex_ = 0L;
/* Required extension for temporary files created during compactions. */
public static final String temporaryFile_ = "tmp";
/* Use this long as a 64 bit entity to turn on some bits for various settings */
private static final long version_ = 0L;
/*
 * This map has the SSTable as key and a BloomFilter as value. This
 * BloomFilter will tell us if a key/column pair is in the SSTable.
 * If not we can avoid scanning it.
 * Hashtable is used so individual get/put operations are synchronized.
 */
private static Map<String, BloomFilter> bfs_ = new Hashtable<String, BloomFilter>();
/* Maintains a touched set of keys (LRU cache of key -> data file offset). */
private static LinkedHashMap<String, Long> touchCache_ = new TouchedKeyCache(DatabaseDescriptor.getTouchKeyCacheSize());
/**
 * This class holds the position of a key in a block
 * and the size of the data associated with this key.
 */
static class BlockMetadata
{
/* Sentinel used when a key is not present in a block. */
static final BlockMetadata NULL = new BlockMetadata(-1L, -1L);
/* Absolute position of the key's data in the data file (-1 if absent). */
long position_;
/* Number of bytes of data associated with the key (-1 if absent). */
long size_;
BlockMetadata(long position, long size)
{
position_ = position;
size_ = size;
}
}
/*
 * This abstraction provides LRU semantics for the keys that are
* "touched". Currently it holds the offset of the key in a data
* file. May change to hold a reference to a IFileReader which
* memory maps the key and its associated data on a touch.
*/
private static class TouchedKeyCache extends LinkedHashMap<String, Long>
{
    /* LinkedHashMap is Serializable; give the subclass an explicit version id. */
    private static final long serialVersionUID = 1L;
    /* Maximum number of entries retained before LRU eviction kicks in. */
    private final int capacity_;
    TouchedKeyCache(int capacity)
    {
        /* capacity + 1 at load factor 1.1 avoids a rehash before eviction;
           accessOrder=true makes iteration order least-recently-accessed first. */
        super(capacity + 1, 1.1f, true);
        capacity_ = capacity;
    }
    /* Evict the eldest (least recently accessed) entry once past capacity. */
    @Override
    protected boolean removeEldestEntry(Map.Entry<String, Long> entry)
    {
        return ( size() > capacity_ );
    }
}
/**
 * This is a simple container for the index Key and its corresponding position
 * in the data file. Binary search is performed on a list of these objects
 * to lookup keys within the SSTable data file.
 */
public static class KeyPositionInfo implements Comparable<KeyPositionInfo>
{
    /* Key stored in the SSTable index. */
    public final String key;
    /* Absolute position associated with the key in the data file. */
    public final long position;
    public KeyPositionInfo(String key)
    {
        this(key, 0);
    }
    public KeyPositionInfo(String key, long position)
    {
        this.key = key;
        this.position = position;
    }
    /* Order by key only, so binarySearch on an index list compares keys. */
    public int compareTo(KeyPositionInfo kPosInfo)
    {
        return key.compareTo(kPosInfo.key);
    }
    /* equals/hashCode are defined on the key alone so they stay consistent
       with compareTo, as the Comparable contract recommends. */
    @Override
    public boolean equals(Object o)
    {
        if ( this == o )
            return true;
        if ( !(o instanceof KeyPositionInfo) )
            return false;
        return key.equals(((KeyPositionInfo) o).key);
    }
    @Override
    public int hashCode()
    {
        return key.hashCode();
    }
    @Override
    public String toString()
    {
        return key + ":" + position;
    }
}
/** Returns the number of keys written between successive index entries. */
public static int indexInterval()
{
    return SSTable.indexInterval_;
}
/*
 * Maintains a list of KeyPositionInfo objects per SSTable file loaded.
 * We do this so that we don't read the index file into memory multiple
 * times.
 * Hashtable is used so individual get/put/remove calls are synchronized.
 */
private static Map<String, List<KeyPositionInfo>> indexMetadataMap_ = new Hashtable<String, List<KeyPositionInfo>>();
/**
 * This method deletes both the specified data file
 * and the associated index file
 *
 * @param dataFile - data file associated with the SSTable
 */
public static void delete(String dataFile)
{
    /* remove the cached index table from memory */
    indexMetadataMap_.remove(dataFile);
    File file = new File(dataFile);
    /* Fully braced: the original nested an if/else under an unbraced outer
       if — a dangling-else hazard. Behavior is unchanged. */
    if ( file.exists() )
    {
        /* delete the data file */
        if ( file.delete() )
        {
            logger_.info("** Deleted " + file.getName() + " **");
        }
        else
        {
            logger_.error("Failed to delete " + file.getName());
        }
    }
}
/**
 * Estimate the total number of keys across the given data files, based on
 * the in-memory index entry counts (one index entry per block of keys).
 */
public static int getApproximateKeyCount( List<String> dataFiles)
{
    int indexEntries = 0;
    for ( String dataFile : dataFiles )
    {
        List<KeyPositionInfo> keyPositionInfos = indexMetadataMap_.get(dataFile);
        if ( keyPositionInfos != null )
        {
            indexEntries += keyPositionInfos.size() + 1;
        }
    }
    return indexEntries * indexInterval_;
}
/**
 * Get all indexed keys in the SSTable.
 */
public static List<String> getSortedKeys()
{
    List<String> sortedKeys = new ArrayList<String>();
    /* Collect the key of every index entry across all loaded files. */
    for ( List<KeyPositionInfo> keyPositionInfos : indexMetadataMap_.values() )
    {
        for ( KeyPositionInfo keyPositionInfo : keyPositionInfos )
        {
            sortedKeys.add(keyPositionInfo.key);
        }
    }
    Collections.sort(sortedKeys);
    return sortedKeys;
}
/** Load the in-memory index (and bloom filter) for every given data file. */
public static void onStart(List<String> filenames) throws IOException
{
    for ( String dataFile : filenames )
    {
        maybeLoadIndexFile(dataFile);
    }
}
/*
 * Caches the Bloom Filter associated with the given file, replacing
 * any previously stored filter for that file.
 */
public static void storeBloomFilter(String filename, BloomFilter bf)
{
    bfs_.put(filename, bf);
}
/*
 * Drops the cached bloom filter for the specified file, if any.
 */
public static void removeAssociatedBloomFilter(String filename)
{
    bfs_.remove(filename);
}
/*
 * Determines if the given key may be in the specified file according to
 * its bloom filter. A file with no cached bloom filter is reported as
 * not containing the key, so it will be skipped.
 */
public static boolean isKeyInFile(String key, String filename)
{
    BloomFilter bf = bfs_.get(filename);
    return ( bf != null ) && bf.isPresent(key);
}
/**
 * Linearly scan the given data file for the specified key.
 *
 * @param key key whose record offset is wanted
 * @param file data file to scan
 * @return absolute offset of the key's record, or -1L if the key is absent
 * @throws IOException on any underlying read failure
 */
public static long fetchOffset(String key, String file) throws IOException
{
    DataOutputBuffer bufOut = new DataOutputBuffer();
    DataInputBuffer bufIn = new DataInputBuffer();
    IFileReader dataReader = SequenceFile.bufferedReader(file, 1024*1024);
    try
    {
        while ( !dataReader.isEOF() )
        {
            bufOut.reset();
            /* Record the position of the record we are about to read. */
            long currentPosition = dataReader.getCurrentPosition();
            dataReader.next(bufOut);
            bufIn.reset(bufOut.getData(), bufOut.getLength());
            /* Key just read */
            String keyOnDisk = bufIn.readUTF();
            if ( keyOnDisk.equals(key) )
            {
                return currentPosition;
            }
        }
    }
    finally
    {
        /* The original leaked the reader; always close it. */
        dataReader.close();
    }
    /* Key not found. The original fell through returning the offset of the
       last record read, which is misleading; -1L signals absence. */
    return -1L;
}
/* Full path of the data file backing this SSTable. */
private String dataFile_;
/* Writer for the data file; null when constructed via the read-only ctor. */
private IFileWriter dataWriter_;
/* Last key appended; used to enforce ascending key order. */
private String lastWrittenKey_;
/* Position of the previously written block index within the data file. */
private long prevBlockPosition_ = 0L;
/* Number of keys appended since the last block index was dumped. */
private int indexKeysWritten_ = 0;
/* Holds the keys and their respective positions in a block.
   Reverse-ordered, so iteration yields the largest key first (loadIndex relies on this). */
private SortedMap<String, BlockMetadata> blockIndex_ = new TreeMap<String, BlockMetadata>(Collections.reverseOrder());
/*
 * This ctor basically gets passed in the full path name
 * of the data file associated with this SSTable. Use this
 * ctor to read the data in this file. Note that dataWriter_
 * remains null for instances created this way.
 */
public SSTable(String dataFileName) throws IOException
{
dataFile_ = dataFileName;
/* Make sure the index and bloom filter for this file are cached in memory. */
SSTable.maybeLoadIndexFile(dataFile_);
}
/*
 * Initialize the index files and also cache the Bloom Filters
* associated with these files.
*/
public static void maybeLoadIndexFile(String filename) throws IOException
{
    /* Serialize index loads so two threads never parse the same file twice. */
    synchronized( indexLoadLock_ )
    {
        if ( !indexMetadataMap_.containsKey(filename) )
        {
            long startTime = System.currentTimeMillis();
            loadIndex(filename);
            long elapsed = System.currentTimeMillis() - startTime;
            logger_.debug("INDEX LOAD TIME: " + elapsed + " ms.");
        }
    }
}
/**
 * Read the bloom filter stored near the tail of the data file and cache it
 * in bfs_ keyed by file name, unless one is already cached. The last 8
 * bytes of the file hold the bloom filter's position, encoded as an offset
 * relative to that footer field itself.
 */
private static void loadBloomFilter(IFileReader reader, long size) throws IOException
{
/* read the position of the bloom filter */
reader.seek(size - 8);
byte[] bytes = new byte[8];
long currentPosition = reader.getCurrentPosition();
reader.readDirect(bytes);
long position = BasicUtilities.byteArrayToLong(bytes);
/* seek to the position of the bloom filter; "position" is relative to the
   footer field located above. */
reader.seek(currentPosition - position);
DataOutputBuffer bufOut = new DataOutputBuffer();
DataInputBuffer bufIn = new DataInputBuffer();
/* read the bloom filter from disk */
reader.next(bufOut);
bufIn.reset(bufOut.getData(), bufOut.getLength());
String key = bufIn.readUTF();
if ( key.equals(SequenceFile.marker_) )
{
/*
 * We are now reading the serialized Bloom Filter. We read
 * the length and then pass the bufIn to the serializer of
 * the BloomFilter. We then store the Bloom filter in the
 * map. However if the Bloom Filter already exists then we
 * need not read the rest of the file.
 */
bufIn.readInt();
if ( bfs_.get(reader.getFileName()) == null )
bfs_.put(reader.getFileName(), BloomFilter.serializer().deserialize(bufIn));
}
}
/**
 * Build the in-memory index for the given data file: cache its bloom
 * filter, then walk the chain of on-disk block indexes backwards from the
 * end of the file, recording each block's largest key together with the
 * absolute position of that block's index.
 */
private static void loadIndex(String filename) throws IOException
{
IFileReader indexReader = SequenceFile.reader(filename);
File file = new File(filename);
long size = file.length();
/* load the bloom filter into memory */
loadBloomFilter(indexReader, size);
/* read the position of the last block index */
byte[] bytes = new byte[8];
/* seek to the position to read the relative position of the last block index */
indexReader.seek(size - 16L);
/* the beginning of the last block index */
long currentPosition = indexReader.getCurrentPosition();
indexReader.readDirect(bytes);
long lastBlockIndexPosition = BasicUtilities.byteArrayToLong(bytes);
/* Register the (initially empty) index list before parsing begins. */
List<KeyPositionInfo> keyPositionInfos = new ArrayList<KeyPositionInfo>();
indexMetadataMap_.put(filename, keyPositionInfos);
DataOutputBuffer bufOut = new DataOutputBuffer();
DataInputBuffer bufIn = new DataInputBuffer();
/* Read all block indexes to maintain an index in memory */
try
{
long nextPosition = currentPosition - lastBlockIndexPosition;
/* read the block indexes from the end of the file till we hit the first one. */
while ( nextPosition > 0 )
{
indexReader.seek(nextPosition);
bufOut.reset();
/* position @ the current block index being processed */
currentPosition = indexReader.getCurrentPosition();
long bytesRead = indexReader.next(bufOut);
if ( bytesRead != -1 )
{
bufIn.reset(bufOut.getData(), bufOut.getLength());
/* read the block key. */
String blockIndexKey = bufIn.readUTF();
if ( !blockIndexKey.equals(SSTable.blockIndexKey_) )
throw new IOException("Unexpected position to be reading the block index from.");
/* read the size of the block index */
int sizeOfBlockIndex = bufIn.readInt();
/* Number of keys in the block. */
int keys = bufIn.readInt();
String largestKeyInBlock = null;
for ( int i = 0; i < keys; ++i )
{
String keyInBlock = bufIn.readUTF();
/* Block indexes are written in descending key order (see blockIndex_),
   so the first entry is the block's largest key; only it is indexed. */
if ( i == 0 )
{
largestKeyInBlock = keyInBlock;
/* relative offset in the block for the key*/
long position = bufIn.readLong();
/* size of data associated with the key */
bufIn.readLong();
/* load the actual position of the block index into the index map */
keyPositionInfos.add( new KeyPositionInfo(largestKeyInBlock, currentPosition) );
}
else
{
/*
 * This is not the key we are looking for. So read its position
 * and the size of the data associated with it. This was stored
 * as the BlockMetadata.
 */
long position = bufIn.readLong();
bufIn.readLong();
}
}
/* The trailing long is the relative offset back to the previous block index. */
lastBlockIndexPosition = bufIn.readLong();
nextPosition = currentPosition - lastBlockIndexPosition;
}
}
Collections.sort(keyPositionInfos);
}
catch( IOException ex )
{
/* NOTE(review): the exception is logged and swallowed even though the
   method declares IOException, and a possibly partial index list is
   already registered in indexMetadataMap_ — confirm callers tolerate this. */
logger_.error(LogUtil.throwableToString(ex));
}
finally
{
indexReader.close();
}
}
/**
 * Section of a file that needs to be scanned
 */
public static class Range
{
/* Start offset of the scan within the file. */
public long start;
/* End offset of the scan; equals start when the exact position is known
   from the index (see getRange). */
public long end;
Range(long start, long end)
{
this.start = start;
this.end = end;
}
}
//
//
// BEGIN ACTUAL SSTABLE CODE
//
//
/*
 * This ctor is used for writing data into the SSTable. Use this
 * version to write to the SSTable. The data file is named
 * "<filename>-Data.db" under the given directory, and an empty
 * block index record is written first so the footer chain has a root.
 */
public SSTable(String directory, String filename) throws IOException
{
dataFile_ = directory + System.getProperty("file.separator") + filename + "-Data.db";
dataWriter_ = SequenceFile.bufferedWriter(dataFile_, 32*1024*1024);
// dataWriter_ = SequenceFile.checksumWriter(dataFile_);
/* Write the block index first. This is an empty one */
dataWriter_.append(SSTable.blockIndexKey_, ArrayUtils.EMPTY_BYTE_ARRAY);
/* NOTE(review): assigning a static field per writer instance; concurrent
   writers would overwrite each other's value — confirm single-writer use. */
SSTable.positionAfterFirstBlockIndex_ = dataWriter_.getCurrentPosition();
}
/**
 * Returns the absolute path of this SSTable's data file.
 *
 * @throws IOException if the data file no longer exists on disk
 */
public String getDataFileLocation() throws IOException
{
    File dataFile = new File(dataFile_);
    if ( !dataFile.exists() )
    {
        throw new IOException("File " + dataFile_ + " was not found on disk.");
    }
    return dataFile.getAbsolutePath();
}
/* Last-modified timestamp as reported by the data file writer.
   NOTE(review): dataWriter_ is only set by the write ctor, so this throws
   NullPointerException on an SSTable opened for reading — confirm callers. */
public long lastModified()
{
return dataWriter_.lastModified();
}
/*
 * Seeks to the specified key on disk, records its data-file offset in the
 * touch cache, and (when fData is true) reads the associated data to pull
 * it into the OS buffer cache.
 */
public void touch(String key, boolean fData) throws IOException
{
    /* Entries are cached under "<dataFile>:<key>". The original probed the
       cache with the bare key but inserted under the composite key, so the
       cache never produced a hit; probe with the same key we store. */
    String cacheKey = dataFile_ + ":" + key;
    if ( touchCache_.containsKey(cacheKey) )
        return;
    IFileReader dataReader = SequenceFile.reader(dataFile_);
    try
    {
        Range fileCoordinate = getRange(key, dataReader);
        /* Get offset of key from block Index */
        dataReader.seek(fileCoordinate.end);
        BlockMetadata blockMetadata = dataReader.getBlockMetadata(key);
        if ( blockMetadata.position_ != -1L )
        {
            touchCache_.put(cacheKey, blockMetadata.position_);
        }
        if ( fData )
        {
            /* Read the data associated with this key and pull it into the Buffer Cache */
            if ( blockMetadata.position_ != -1L )
            {
                dataReader.seek(blockMetadata.position_);
                DataOutputBuffer bufOut = new DataOutputBuffer();
                dataReader.next(bufOut);
                logger_.debug("Finished the touch of the key to pull it into buffer cache.");
            }
        }
    }
    finally
    {
        dataReader.close();
    }
}
/**
 * Serialize the current in-memory block index (key -> position/size, in
 * descending key order) to the data file, chain it to the previous block
 * index via a relative offset, and record the block's last written key in
 * the in-memory index map.
 */
private void dumpBlockIndex() throws IOException
{
DataOutputBuffer bufOut = new DataOutputBuffer();
/*
 * Record the position where we start writing the block index. This will be
 * used as the position of the lastWrittenKey in the block in the index file
 */
long position = dataWriter_.getCurrentPosition();
/* blockIndex_ uses a reverse-order comparator, so iteration yields the
   largest key first (loadIndex() relies on this ordering). */
Set<String> keys = blockIndex_.keySet();
/* Number of keys in this block */
bufOut.writeInt(keys.size());
for ( String key : keys )
{
bufOut.writeUTF(key);
BlockMetadata blockMetadata = blockIndex_.get(key);
/* position of the key as a relative offset */
bufOut.writeLong(position - blockMetadata.position_);
bufOut.writeLong(blockMetadata.size_);
}
/* Write the relative offset to the previous block index. */
bufOut.writeLong(position - prevBlockPosition_);
prevBlockPosition_ = position;
/* Write out the block index. */
dataWriter_.append(SSTable.blockIndexKey_, bufOut);
blockIndex_.clear();
/* Load this index into the in memory index map */
List<KeyPositionInfo> keyPositionInfos = SSTable.indexMetadataMap_.get(dataFile_);
if ( keyPositionInfos == null )
{
keyPositionInfos = new ArrayList<KeyPositionInfo>();
SSTable.indexMetadataMap_.put(dataFile_, keyPositionInfos);
}
keyPositionInfos.add(new KeyPositionInfo(lastWrittenKey_, position));
}
/* Append a key with the contents of the given buffer.
   NOTE(review): passes buffer.getData() — the raw backing array — rather
   than only the first getLength() bytes; confirm callers always hand in a
   buffer whose backing array contains exactly the valid data. */
public void append(String key, DataOutputBuffer buffer) throws IOException
{
append(key, buffer.getData());
}
/**
 * Append a key/value pair to the data file. Keys MUST arrive in strictly
 * ascending order; every indexInterval_ (128) keys the accumulated block
 * index is flushed to disk.
 *
 * @throws IOException if the key is not strictly greater than the last written key
 */
public void append(String key, byte[] value) throws IOException
{
assert key != null;
if ( lastWrittenKey_ != null && key.compareTo(lastWrittenKey_) <= 0 )
{
logger_.info("Last written key : " + lastWrittenKey_);
logger_.info("Current key : " + key);
logger_.info("Writing into file " + dataFile_);
throw new IOException("Keys must be written in ascending order.");
}
/* The very first key starts right after the empty block index written by
   the write ctor; afterwards use the writer's current position. */
long currentPosition = (lastWrittenKey_ == null) ? SSTable.positionAfterFirstBlockIndex_ : dataWriter_.getCurrentPosition();
dataWriter_.append(key, value);
++indexKeysWritten_;
lastWrittenKey_ = key;
blockIndex_.put(key, new BlockMetadata(currentPosition, (long) value.length));
if ( indexKeysWritten_ == indexInterval_ )
{
dumpBlockIndex();
indexKeysWritten_ = 0;
}
}
/**
 * Compute the byte range of the data file that must be scanned to locate
 * the given key, using the in-memory index when one is loaded.
 *
 * A returned Range with start == end means the key's exact index position
 * is known (or the search bottomed out at the trailing block index);
 * otherwise [start, end) bounds the linear scan.
 */
public static Range getRange(String key, IFileReader dataReader) throws IOException
{
List<KeyPositionInfo> indexInfo = indexMetadataMap_.get(dataReader.getFileName());
int size = (indexInfo == null) ? 0 : indexInfo.size();
long start = 0L;
long end = dataReader.getEOF();
if ( size > 0 )
{
final int index = Collections.binarySearch(indexInfo, new KeyPositionInfo(key));
if ( index < 0 )
{
// key is not itself an indexed key; scan between neighboring index entries
int insertIndex = (index + 1) * -1;
/* Scan from the closest indexed entry below the target (or file start). */
start = (insertIndex == 0) ? 0 : indexInfo.get(insertIndex - 1).position;
if ( insertIndex < size )
{
end = indexInfo.get(insertIndex).position;
}
else
{
/* This is the Block Index in the file. */
end = start;
}
}
else
{
/* If we are here that means the key is in the index file
 * and we can retrieve it w/o a scan.
 * TODO we would
 * like to have a retreive(key, fromPosition) but for now
 * we use scan(start, start + 1) - a hack. */
start = indexInfo.get(index).position;
end = start;
}
}
else
{
/*
 * We are here which means there are less than
 * 128 keys in the system and hence our only recourse
 * is a linear scan from start to finish. Automatically
 * use memory mapping since we have a huge file and very
 * few keys.
 */
end = dataReader.getEOF();
}
return new Range(start, end);
}
/**
 * Read the named column family (optionally restricted to the given column
 * names) for the key from this SSTable's data file.
 *
 * @return a buffer positioned just past the key and size header; empty if
 *         the key/column family was not found (e.g. bloom filter false positive)
 */
public DataInputBuffer next(String key, String columnFamilyName, List<String> cNames) throws IOException
{
/* Caller must pass a bare column family name, not a "cf:column" path. */
assert columnFamilyName.split(":").length == 1;
IFileReader dataReader = SequenceFile.reader(dataFile_);
try
{
Range range = getRange(key, dataReader);
/*
 * we have the position we have to read from in order to get the
 * column family, get the column family and column(s) needed.
 */
DataOutputBuffer bufOut = new DataOutputBuffer();
DataInputBuffer bufIn = new DataInputBuffer();
try
{
dataReader.next(key, bufOut, columnFamilyName, cNames, range);
if ( bufOut.getLength() > 0 )
{
bufIn.reset(bufOut.getData(), bufOut.getLength());
/* read the key even though we do not use it */
bufIn.readUTF();
/* skip the size header as well */
bufIn.readInt();
}
}
catch( IOException ex )
{
/* Treated as a bloom filter false positive: fall through and return
   the (empty) buffer rather than propagating. */
logger_.info("Bloom filter false positive", ex);
}
return bufIn;
}
finally
{
dataReader.close();
}
}
/**
 * Convenience overload: "cf" may be either a bare column family name or a
 * "columnFamily:column" path.
 */
public DataInputBuffer next(String key, String cf) throws IOException
{
    String[] values = RowMutation.getColumnAndColumnFamily(cf);
    List<String> columnNames = null;
    if ( values.length > 1 )
    {
        columnNames = Arrays.asList(values[1]);
    }
    return next(key, values[0], columnNames);
}
/**
 * Flush any pending block index, then write the file footer: the
 * serialized bloom filter, an 8-byte version field, the relative offset
 * of the last block index, and the relative offset of the bloom filter.
 * Closes the underlying writer.
 */
public void close(BloomFilter bf) throws IOException
{
/* Any remnants in the blockIndex should be dumped */
dumpBlockIndex();
/* reset the buffer and serialize the Bloom Filter. */
DataOutputBuffer bufOut = new DataOutputBuffer();
BloomFilter.serializer().serialize(bf, bufOut);
byte[] bytes = new byte[bufOut.getLength()];
System.arraycopy(bufOut.getData(), 0, bytes, 0, bytes.length);
/*
 * Write the bloom filter for this SSTable.
 * Then write two longs one which is a version
 * and one which is a pointer to the last written
 * block index.
 */
long bloomFilterPosition = dataWriter_.getCurrentPosition();
dataWriter_.close(bytes, bytes.length);
/* write the version field into the SSTable */
dataWriter_.writeDirect(BasicUtilities.longToByteArray(version_));
/* write the relative position of the last block index from current position */
long blockPosition = dataWriter_.getCurrentPosition() - prevBlockPosition_;
dataWriter_.writeDirect(BasicUtilities.longToByteArray(blockPosition));
/* write the position of the bloom filter, relative to the current position */
long bloomFilterRelativePosition = dataWriter_.getCurrentPosition() - bloomFilterPosition;
dataWriter_.writeDirect(BasicUtilities.longToByteArray(bloomFilterRelativePosition));
dataWriter_.close();
bufOut.close();
}
/*
 * Renames a temporary SSTable data file to its final name.
 */
public void closeRename(BloomFilter bf) throws IOException
{
    close( bf);
    String tmpDataFile = dataFile_;
    String dataFileName = dataFile_.replace("-" + temporaryFile_,"");
    File dataFile = new File(dataFile_);
    /* The original ignored the result of renameTo(); a silent failure would
       leave a stale tmp file on disk and the index registered under the
       wrong name, so fail loudly instead. */
    if ( !dataFile.renameTo(new File(dataFileName)) )
    {
        throw new IOException("Failed to rename " + dataFile_ + " to " + dataFileName);
    }
    dataFile_ = dataFileName;
    /* Now repair the in memory index associated with the old name */
    List<KeyPositionInfo> keyPositionInfos = SSTable.indexMetadataMap_.remove(tmpDataFile);
    if ( keyPositionInfos != null )
    {
        SSTable.indexMetadataMap_.put(dataFile_, keyPositionInfos);
    }
}
/* Same as closeRename(bf), additionally appending the final (renamed)
   data file name to the supplied list. */
public void closeRename(BloomFilter bf, List<String> files) throws IOException
{
closeRename(bf);
files.add(dataFile_);
}
}