package com.facebook.infrastructure.db;
import com.facebook.infrastructure.config.DatabaseDescriptor;
import com.facebook.infrastructure.dht.Range;
import com.facebook.infrastructure.io.*;
import com.facebook.infrastructure.net.EndPoint;
import com.facebook.infrastructure.service.StorageService;
import com.facebook.infrastructure.utils.BloomFilter;
import com.facebook.infrastructure.utils.CountingBloomFilter;
import com.facebook.infrastructure.utils.Filter;
import com.facebook.infrastructure.utils.LogUtil;
import org.apache.log4j.Logger;
import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.util.*;
public class ColumnFamilyCompactor {
private static Logger logger_ = Logger.getLogger(ColumnFamilyCompactor.class);
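/* Minimum number of similarly sized sstables in a bucket before a normal compaction
 * is triggered; it also caps how many files from a bucket are compacted in one pass. */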
static final int THRESHOLD = 4;
/*
 * Break the files into buckets of similar size and compact each bucket
 * that has accumulated enough files (or every bucket when ranges are given).
 */
static CountingBloomFilter doCompaction(ColumnFamilyStore cfs, List<Range> ranges) throws IOException
{
List<String> files = cfs.getAllSSTablesOnDisk();
CountingBloomFilter result = null;
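// Bucket the sstables by size: files under 50MB are grouped together regardless of
// exact size, while files over 200GB get singleton buckets and are only rewritten
// by range compactions.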
for(List<String> fileList : ColumnFamilyCompactor.getCompactionBuckets(files, 50L*1024L*1024L, 200L*1024L*1024L*1024L))
{
CountingBloomFilter tempResult = null;
// If ranges != null we should split the files irrespective of the threshold.
if(fileList.size() >= ColumnFamilyCompactor.THRESHOLD || ranges != null)
{
files.clear();
int count = 0;
for(String file : fileList)
{
files.add(file);
count++;
if( count == ColumnFamilyCompactor.THRESHOLD && ranges == null )
break;
}
try
{
// For each bucket that has crossed the threshold, do the compaction.
// For range compactions the per-bucket counting bloom filters are merged below.
tempResult = ColumnFamilyCompactor.doRangeCompaction(cfs, files, ranges, ColumnFamilyStore.BUFFER_SIZE);
if(result == null)
{
result = tempResult;
}
else
{
result.merge(tempResult);
}
}
catch ( Exception ex)
{
logger_.error(ex);
}
}
}
return result;
}
/*
 * Stage the compactions: group similarly sized files into buckets.
 * A file joins an existing bucket if its size is within 50% of the bucket's
 * running average size, or if both the file and the bucket average are smaller
 * than `min`. Files larger than `max` are placed in singleton buckets.
 */
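/*
 * For example (hypothetical sizes): 10MB, 20MB and 40MB files all fall below `min`
 * and share one bucket; a 100MB file starts a new bucket and a 120MB file joins it
 * (within 50% of the running average); a 300GB file exceeds `max` and is isolated.
 */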
static Set<List<String>> getCompactionBuckets(List<String> files, long min, long max)
{
Map<List<String>, Long> buckets = new HashMap<List<String>, Long>();
List<String> largeFileList = new ArrayList<String>();
for(String fname : files)
{
File f = new File(fname);
long size = f.length();
if ( size > max)
{
largeFileList.add(fname);
continue;
}
boolean bFound = false;
for (List<String> bucket : new ArrayList<List<String>>(buckets.keySet()))
{
long averageSize = buckets.get(bucket);
// group in the same bucket if it's w/in 50% of the average for this bucket,
// or this file and the bucket are all considered "small" (less than `min`)
if ((size > averageSize/2 && size < 3*averageSize/2)
|| ( size < min && averageSize < min))
{
// remove and re-add because adding changes the hash
buckets.remove(bucket);
averageSize = (averageSize + size) / 2 ;
bucket.add(fname);
buckets.put(bucket, averageSize);
bFound = true;
break;
}
}
if(!bFound)
{
ArrayList<String> bucket = new ArrayList<String>();
bucket.add(fname);
buckets.put(bucket, size);
}
}
// Put files greater than the max in separate buckets so that they are never compacted
// (but we need them in the buckets since for range compactions we need to split these files)
Set<List<String>> bucketSet = new HashSet<List<String>>(buckets.keySet());
for (String file : largeFileList)
{
bucketSet.add(Arrays.asList(new String[] { file }));
}
return bucketSet;
}
/**
 * Performs the anti-compaction process: writes out a file containing only the keys
 * that belong to the given ranges. If no target is specified, the output is simply
 * a compacted file with the unnecessary ranges wiped out.
 *
 * @param cfs column family store whose sstables are being anti-compacted
 * @param files sstable files to read from
 * @param ranges ranges whose keys should be retained
 * @param target endpoint the extracted data is destined for (may be null)
 * @param minBufferSize minimum read buffer size per file
 * @param fileList populated with the names of the files written (may be null)
 * @param compactedBloomFilters populated with the bloom filters of the written files (may be null)
 * @return a counting bloom filter over the keys written for the target, or null if nothing was written
 * @throws java.io.IOException
 */
static CountingBloomFilter doRangeOnlyAntiCompaction(
ColumnFamilyStore cfs, List<String> files, List<Range> ranges, EndPoint target, int minBufferSize, List<String> fileList, List<BloomFilter> compactedBloomFilters) throws IOException {
CountingBloomFilter rangeCountingBloomFilter = null;
long startTime = System.currentTimeMillis();
long totalBytesRead = 0;
long totalBytesWritten = 0;
long totalkeysRead = 0;
long totalkeysWritten = 0;
String rangeFileLocation = null;
String mergedFileName = null;
try {
// Calculate the expected compacted filesize
long expectedRangeFileSize = getExpectedCompactedFileSize(files);
/* in the worst case a node will be giving out half of its data, so we take a chance */
expectedRangeFileSize = expectedRangeFileSize / 2;
rangeFileLocation = DatabaseDescriptor.getCompactionFileLocation(expectedRangeFileSize);
// boolean isLoop = isLoopAround( ranges );
// Range maxRange = getMaxRange( ranges );
// If the compaction file path is null that means we have no space left for this compaction.
if (rangeFileLocation == null) {
logger_.warn("Total bytes to be written for range compaction ..."
+ expectedRangeFileSize + " is greater than the safe limit of the disk space available.");
return null;
}
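// One FileStruct per input sstable, each positioned at its first key; the head of
// the queue is always the smallest key still to be processed.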
PriorityQueue<FileStruct> pq = initializePriorityQueue(files, ranges, minBufferSize);
if (pq.size() > 0) {
mergedFileName = cfs.getTempFileName();
SSTable ssTableRange = null;
String lastkey = null;
List<FileStruct> lfs = new ArrayList<FileStruct>();
DataOutputBuffer bufOut = new DataOutputBuffer();
int expectedBloomFilterSize = SSTable.getApproximateKeyCount(files);
expectedBloomFilterSize = (expectedBloomFilterSize > 0) ? expectedBloomFilterSize : SSTable.indexInterval();
logger_.debug("Expected bloom filter size : " + expectedBloomFilterSize);
/* Create the bloom filter for the compacted file. */
BloomFilter compactedRangeBloomFilter = new BloomFilter(expectedBloomFilterSize, 8);
List<ColumnFamily> columnFamilies = new ArrayList<ColumnFamily>();
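// k-way merge: gather every FileStruct positioned at the current smallest key into
// lfs, resolve them into a single row, write it out if it falls in the requested
// ranges, then advance each file and re-queue it (unless exhausted).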
while (pq.size() > 0 || lfs.size() > 0) {
FileStruct fs = null;
if (pq.size() > 0) {
fs = pq.poll();
}
if (fs != null
&& (lastkey == null || lastkey.compareTo(fs.getKey()) == 0)) {
// The keys are the same so we need to add this to the
// lfs list
lastkey = fs.getKey();
lfs.add(fs);
} else {
Collections.sort(lfs, new ColumnFamilyStore.FileStructComparator());
bufOut.reset();
if (lfs.size() > 1) {
for (FileStruct filestruct : lfs) {
try {
/* read the length although we don't need it */
filestruct.getBufIn().readInt();
// Skip the Index
if (DatabaseDescriptor.isNameIndexEnabled(cfs.getColumnFamilyName())) {
IndexHelper.skip(filestruct.getBufIn());
}
// Keep at most two column families buffered: resolve as soon as we have two
// in order to save on memory footprint
if (columnFamilies.size() > 1) {
ColumnFamilyStore.merge(columnFamilies);
}
// deserialize into column families
columnFamilies.add(ColumnFamily.serializer().deserialize(filestruct.getBufIn()));
}
catch (Exception ex) {
logger_.error(LogUtil.throwableToString(ex));
}
}
// After merging, append the resolved row to the sstable
ColumnFamily columnFamily = ColumnFamilyStore.resolveAndRemoveDeleted(columnFamilies);
columnFamilies.clear();
if (columnFamily != null) {
/* serialize the cf with column indexes */
ColumnFamily.serializerWithIndexes().serialize(columnFamily, bufOut);
}
} else {
FileStruct filestruct = lfs.get(0);
try {
/* read the length although we don't need it */
int size = filestruct.getBufIn().readInt();
bufOut.write(filestruct.getBufIn(), size);
}
catch (Exception ex) {
logger_.warn(LogUtil.throwableToString(ex));
filestruct.close();
continue;
}
}
if (Range.isKeyInRanges(ranges, lastkey)) {
if (ssTableRange == null) {
if (target != null)
rangeFileLocation = rangeFileLocation + System.getProperty("file.separator") + "bootstrap";
FileUtils.createDirectory(rangeFileLocation);
ssTableRange = new SSTable(rangeFileLocation, mergedFileName);
}
if (rangeCountingBloomFilter == null) {
rangeCountingBloomFilter = new CountingBloomFilter(expectedBloomFilterSize, 8);
}
try {
ssTableRange.append(lastkey, bufOut);
compactedRangeBloomFilter.add(lastkey);
if (target != null && StorageService.instance().isPrimary(lastkey, target)) {
rangeCountingBloomFilter.add(lastkey);
}
}
catch (Exception ex) {
logger_.warn(LogUtil.throwableToString(ex));
}
}
totalkeysWritten++;
for (FileStruct filestruct : lfs) {
try {
filestruct.getNextKey();
if (filestruct.isExhausted()) {
continue;
}
/* keep on looping until we find a key in the range */
while (!Range.isKeyInRanges(ranges, filestruct.getKey())) {
filestruct.getNextKey();
if (filestruct.isExhausted()) {
break;
}
/* check if we need to continue , if we are done with ranges empty the queue and close all file handles and exit */
//if( !isLoop && StorageService.hash(filestruct.key).compareTo(maxRange.right()) > 0 && !filestruct.key.equals(""))
//{
//filestruct.reader.close();
//filestruct = null;
//break;
//}
}
if (filestruct != null) {
pq.add(filestruct);
}
totalkeysRead++;
}
catch (Exception ex) {
// Ignore the exception as it might be a corrupted file
// in any case we have read as far as possible from it
// and it will be deleted after compaction.
logger_.error(LogUtil.throwableToString(ex));
filestruct.close();
}
}
lfs.clear();
lastkey = null;
if (fs != null) {
// Add back the fs since we processed the rest of
// filestructs
pq.add(fs);
}
}
}
if (ssTableRange != null) {
if (fileList == null)
fileList = new ArrayList<String>();
ssTableRange.closeRename(compactedRangeBloomFilter, fileList);
if (compactedBloomFilters != null)
compactedBloomFilters.add(compactedRangeBloomFilter);
}
}
}
catch (Exception ex) {
logger_.warn(LogUtil.throwableToString(ex));
}
logger_.debug("Total time taken for range split ..."
+ (System.currentTimeMillis() - startTime));
logger_.debug("Total bytes Read for range split ..." + totalBytesRead);
logger_.debug("Total bytes written for range split ..."
+ totalBytesWritten + " Total keys read ..." + totalkeysRead);
return rangeCountingBloomFilter;
}
/*
 * This function does the actual compaction of files.
 * It maintains a priority queue with the first key from each file,
 * removes the head of the queue, appends it to the SSTable and
 * repeats this process, reading the next key from each file, until all
 * the files are exhausted. The SSTable the keys are written to
 * represents the new compacted file. Before writing, keys that occur
 * in multiple files are resolved so that only the latest data is kept.
 */
static CountingBloomFilter doRangeCompaction(ColumnFamilyStore cfs, List<String> files, List<Range> ranges, int minBufferSize) throws IOException {
CountingBloomFilter rangeCountingBloomFilter = null;
String newfile = null;
long startTime = System.currentTimeMillis();
long totalBytesRead = 0;
long totalBytesWritten = 0;
long totalkeysRead = 0;
long totalkeysWritten = 0;
try {
// Calculate the expected compacted filesize
long expectedCompactedFileSize = getExpectedCompactedFileSize(files);
String compactionFileLocation = DatabaseDescriptor.getCompactionFileLocation(expectedCompactedFileSize);
// If the compaction file path is null that means we have no space left for this compaction.
if (compactionFileLocation == null) {
if (ranges == null || ranges.size() == 0) {
String maxFile = getMaxSizeFile(files);
files.remove(maxFile);
return doRangeCompaction(cfs, files, ranges, minBufferSize);
}
logger_.warn("Total bytes to be written for compaction ..."
+ expectedCompactedFileSize + " is greater than the safe limit of the disk space available.");
return null;
}
PriorityQueue<FileStruct> pq = initializePriorityQueue(files, ranges, minBufferSize);
if (pq.size() > 0) {
String mergedFileName = cfs.getTempFileName();
SSTable ssTable = null;
SSTable ssTableRange = null;
String lastkey = null;
List<FileStruct> mergeNeeded = new ArrayList<FileStruct>();
DataOutputBuffer bufOut = new DataOutputBuffer();
int expectedBloomFilterSize = SSTable.getApproximateKeyCount(files);
expectedBloomFilterSize = (expectedBloomFilterSize > 0) ? expectedBloomFilterSize : SSTable.indexInterval();
logger_.debug("Expected bloom filter size : " + expectedBloomFilterSize);
/* Create the bloom filter for the compacted file. */
BloomFilter compactedBloomFilter = new BloomFilter(expectedBloomFilterSize, 8);
BloomFilter compactedRangeBloomFilter = new BloomFilter(expectedBloomFilterSize, 8);
List<ColumnFamily> columnFamilies = new ArrayList<ColumnFamily>();
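// Same k-way merge as the anti-compaction above: rows whose key falls in `ranges`
// (if any) go to the bootstrap sstable, everything else goes to the regular
// compacted sstable.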
while (pq.size() > 0 || mergeNeeded.size() > 0) {
// pop off the queue until we get a different key
FileStruct fs = pq.peek();
if (fs != null
&& (lastkey == null || lastkey.equals(fs.getKey()))) {
// The keys are the same so we need to add this to the
// merge list
lastkey = fs.getKey();
mergeNeeded.add(pq.poll());
continue;
}
// merge the keys in the merge list
Collections.sort(mergeNeeded, new ColumnFamilyStore.FileStructComparator());
bufOut.reset();
if (mergeNeeded.size() > 1) {
for (FileStruct filestruct : mergeNeeded) {
try {
/* read the length although we don't need it */
filestruct.getBufIn().readInt();
// Skip the Index
if (DatabaseDescriptor.isNameIndexEnabled(cfs.getColumnFamilyName())) {
IndexHelper.skip(filestruct.getBufIn());
}
// Keep at most two column families buffered: resolve as soon as we have two
// in order to save on memory footprint
if (columnFamilies.size() > 1) {
ColumnFamilyStore.merge(columnFamilies);
}
// deserialize into column families
columnFamilies.add(ColumnFamily.serializer().deserialize(filestruct.getBufIn()));
}
catch (Exception ex) {
logger_.error(ex);
continue;
}
}
// After merging, append the resolved row to the sstable
ColumnFamily columnFamily = ColumnFamilyStore.resolveAndRemoveDeleted(columnFamilies);
columnFamilies.clear();
if (columnFamily != null) {
/* serialize the cf with column indexes */
ColumnFamily.serializerWithIndexes().serialize(columnFamily, bufOut);
}
} else {
FileStruct filestruct = mergeNeeded.get(0);
try {
/* read the length although we don't need it */
int size = filestruct.getBufIn().readInt();
bufOut.write(filestruct.getBufIn(), size);
}
catch (Exception ex) {
logger_.warn(LogUtil.throwableToString(ex));
filestruct.close();
continue;
}
}
if (Range.isKeyInRanges(ranges, lastkey)) {
if (ssTableRange == null) {
String mergedRangeFileName = cfs.getTempFileName();
ssTableRange = new SSTable(DatabaseDescriptor.getBootstrapFileLocation(), mergedRangeFileName);
}
if (rangeCountingBloomFilter == null) {
rangeCountingBloomFilter = new CountingBloomFilter(expectedBloomFilterSize, 8);
}
ssTableRange.append(lastkey, bufOut);
compactedRangeBloomFilter.add(lastkey);
rangeCountingBloomFilter.add(lastkey);
} else {
if (ssTable == null) {
ssTable = new SSTable(compactionFileLocation, mergedFileName);
}
try {
ssTable.append(lastkey, bufOut);
}
catch (Exception ex) {
logger_.warn(LogUtil.throwableToString(ex));
}
/* Fill the bloom filter with the key */
compactedBloomFilter.add(lastkey);
}
totalkeysWritten++;
for (FileStruct filestruct : mergeNeeded) {
try {
filestruct.getNextKey();
if (filestruct.isExhausted()) {
continue;
}
pq.add(filestruct);
totalkeysRead++;
}
catch (Exception ex) {
// Ignore the exception as it might be a corrupted file
// in any case we have read as far as possible from it
// and it will be deleted after compaction.
logger_.error(ex);
filestruct.close();
continue;
}
}
mergeNeeded.clear();
lastkey = null;
}
if (ssTable != null) {
ssTable.closeRename(compactedBloomFilter);
newfile = ssTable.getDataFileLocation();
}
if (ssTableRange != null) {
ssTableRange.closeRename(compactedRangeBloomFilter);
}
totalBytesWritten = cfs.completeCompaction(files, newfile, totalBytesWritten, compactedBloomFilter);
}
}
catch (Exception ex) {
logger_.warn(LogUtil.throwableToString(ex));
}
logger_.debug("Total time taken for compaction ..."
+ (System.currentTimeMillis() - startTime));
logger_.debug("Total bytes Read for compaction ..." + totalBytesRead);
logger_.debug("Total bytes written for compaction ..."
+ totalBytesWritten + " Total keys read ..." + totalkeysRead);
return rangeCountingBloomFilter;
}
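/* Memory budget (1GB) shared by the per-file read buffers during a compaction:
 * each file's buffer is the smaller of its share of this budget and the requested
 * minimum buffer size. */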
static int compactionMemoryThreshold_ = 1 << 30;
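/*
 * Open a buffered reader for each sstable, advance it to its first key and seed the
 * priority queue with it; files that cannot be read are logged and skipped.
 */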
static PriorityQueue<FileStruct> initializePriorityQueue(List<String> files, List<Range> ranges, int minBufferSize) throws IOException
{
PriorityQueue<FileStruct> pq = new PriorityQueue<FileStruct>();
if (files.size() > 1 || (ranges != null && files.size() > 0))
{
int bufferSize = Math.min( (compactionMemoryThreshold_ / files.size()), minBufferSize ) ;
FileStruct fs = null;
for (String file : files)
{
try
{
fs = new FileStruct(SequenceFile.bufferedReader(file, bufferSize));
fs.getNextKey();
if(fs.isExhausted())
continue;
pq.add(fs);
}
catch ( Exception ex)
{
logger_.error("corrupt sstable", ex);
try
{
if(fs != null)
{
fs.close();
}
}
catch(Exception e)
{
logger_.error("Unable to close file :" + file, e);
}
}
}
}
return pq;
}
/*
 * Stage the compactions: group similarly sized files into buckets, using the same
 * size heuristics as getCompactionBuckets but with fixed min/max thresholds.
 *
 * Unused.
 */
static Map<Integer, List<String>> stageOrderedCompaction(List<String> files)
{
Map<Integer, List<String>> buckets = new HashMap<Integer, List<String>>();
long[] averages = new long[100];
int count = 0 ;
long max = 200L*1024L*1024L*1024L;
long min = 50L*1024L*1024L;
List<String> largeFileList = new ArrayList<String>();
for(String file : files)
{
File f = new File(file);
long size = f.length();
if ( size > max)
{
largeFileList.add(file);
continue;
}
boolean bFound = false;
for ( int i = 0 ; i < count ; i++ )
{
if ( (size > averages[i]/2 && size < 3*averages[i]/2) || ( size < min && averages[i] < min ))
{
averages[i] = (averages[i] + size) / 2 ;
List<String> fileList = buckets.get(i);
if(fileList == null)
{
fileList = new ArrayList<String>();
buckets.put(i, fileList);
}
fileList.add(file);
bFound = true;
break;
}
}
if(!bFound)
{
List<String> fileList = buckets.get(count);
if(fileList == null)
{
fileList = new ArrayList<String>();
buckets.put(count, fileList);
}
fileList.add(file);
averages[count] = size;
count++;
}
}
// Put files greater than the max in a separate bucket so that they are never compacted,
// but we need them in the buckets since for range compactions we need to split these files.
count++;
for(String file : largeFileList)
{
List<String> tempLargeFileList = new ArrayList<String>();
tempLargeFileList.add(file);
buckets.put(count, tempLargeFileList);
count++;
}
return buckets;
}
/*
 * Add up all the file sizes; this is the worst-case output size
 * for compacting the given list of files.
 */
static long getExpectedCompactedFileSize(List<String> files)
{
long expectedFileSize = 0;
for(String file : files)
{
File f = new File(file);
long size = f.length();
expectedFileSize = expectedFileSize + size;
}
return expectedFileSize;
}
/*
 * Find the largest file in the list.
 */
static String getMaxSizeFile( List<String> files )
{
long maxSize = 0L;
String maxFile = null;
for ( String file : files )
{
File f = new File(file);
if(f.length() > maxSize )
{
maxSize = f.length();
maxFile = file;
}
}
return maxFile;
}
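/*
 * Find the range with the largest left endpoint; returns a (ZERO, ZERO) range
 * if the list is empty.
 */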
Range getMaxRange( List<Range> ranges )
{
Range maxRange = new Range( BigInteger.ZERO, BigInteger.ZERO );
for( Range range : ranges)
{
if( range.left().compareTo(maxRange.left()) > 0 )
{
maxRange = range;
}
}
return maxRange;
}
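/*
 * A range "loops around" the ring when its left endpoint is greater than its right;
 * return true if any of the given ranges does.
 */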
boolean isLoopAround ( List<Range> ranges )
{
boolean isLoop = false;
for( Range range : ranges)
{
if( range.left().compareTo(range.right()) > 0 )
{
isLoop = true;
break;
}
}
return isLoop;
}
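/*
 * Simple callback abstraction: implementations run a unit of work (typically a
 * compaction) and return the resulting Filter.
 */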
public interface Wrapper {
public Filter run() throws IOException;
}
}