package com.facebook.infrastructure.db;

import com.facebook.infrastructure.config.DatabaseDescriptor;
import com.facebook.infrastructure.dht.Range;
import com.facebook.infrastructure.io.*;
import com.facebook.infrastructure.net.EndPoint;
import com.facebook.infrastructure.service.StorageService;
import com.facebook.infrastructure.utils.BloomFilter;
import com.facebook.infrastructure.utils.CountingBloomFilter;
import com.facebook.infrastructure.utils.Filter;
import com.facebook.infrastructure.utils.LogUtil;

import org.apache.log4j.Logger;

import java.io.File;
import java.io.IOException;
import java.math.BigInteger;
import java.util.*;

public class ColumnFamilyCompactor
{
    private static Logger logger_ = Logger.getLogger(ColumnFamilyCompactor.class);

    /* A bucket must contain at least this many files to be compacted, and at most this many are compacted per pass (unless splitting by range). */
    static final int THRESHOLD = 4;

    /*
     * Break the files into buckets of similar size and then compact each bucket.
     * For range compactions the per-bucket counting bloom filters are merged and returned.
     */
    static CountingBloomFilter doCompaction(ColumnFamilyStore cfs, List<Range> ranges) throws IOException
    {
        List<String> files = cfs.getAllSSTablesOnDisk();
        CountingBloomFilter result = null;
        for (List<String> fileList : ColumnFamilyCompactor.getCompactionBuckets(files, 50L*1024L*1024L, 200L*1024L*1024L*1024L))
        {
            CountingBloomFilter tempResult = null;
            // If ranges != null we should split the files irrespective of the threshold.
            if (fileList.size() >= ColumnFamilyCompactor.THRESHOLD || ranges != null)
            {
                files.clear();
                int count = 0;
                for (String file : fileList)
                {
                    files.add(file);
                    count++;
                    if (count == ColumnFamilyCompactor.THRESHOLD && ranges == null)
                        break;
                }
                try
                {
                    // Compact each bucket that has crossed the threshold. For range
                    // compactions, also merge the counting bloom filters into the result.
                    tempResult = ColumnFamilyCompactor.doRangeCompaction(cfs, files, ranges, ColumnFamilyStore.BUFFER_SIZE);
                    if (result == null)
                    {
                        result = tempResult;
                    }
                    else
                    {
                        result.merge(tempResult);
                    }
                }
                catch (Exception ex)
                {
                    logger_.error(ex);
                }
            }
        }
        return result;
    }
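    /*
     * Worked example of the batching rule above (file names are hypothetical): given a
     * bucket of six similarly sized files {f1 .. f6} and ranges == null, only the first
     * THRESHOLD (4) files are compacted in this pass; f5 and f6 are simply left for a
     * later pass. When ranges != null the whole bucket is split regardless of the
     * threshold, and the counting bloom filters returned per bucket are merged into the
     * single result.
     */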
    /*
     * Stage the compactions: group files that are close in size into buckets. Buckets
     * that grow past the compaction threshold are compacted by doCompaction().
     */
    static Set<List<String>> getCompactionBuckets(List<String> files, long min, long max)
    {
        Map<List<String>, Long> buckets = new HashMap<List<String>, Long>();
        List<String> largeFileList = new ArrayList<String>();
        for (String fname : files)
        {
            File f = new File(fname);
            long size = f.length();
            if (size > max)
            {
                largeFileList.add(fname);
                continue;
            }
            boolean bFound = false;
            for (List<String> bucket : new ArrayList<List<String>>(buckets.keySet()))
            {
                long averageSize = buckets.get(bucket);
                // Group into the same bucket if the file is within 50% of the average size
                // for this bucket, or if both the file and the bucket are considered
                // "small" (less than `min`).
                if ((size > averageSize/2 && size < 3*averageSize/2) || (size < min && averageSize < min))
                {
                    // remove and re-add because adding to the bucket changes its hash
                    buckets.remove(bucket);
                    averageSize = (averageSize + size) / 2;
                    bucket.add(fname);
                    buckets.put(bucket, averageSize);
                    bFound = true;
                    break;
                }
            }
            if (!bFound)
            {
                ArrayList<String> bucket = new ArrayList<String>();
                bucket.add(fname);
                buckets.put(bucket, size);
            }
        }
        // Put files greater than the max in separate singleton buckets so that they are never
        // compacted (but we need them in the buckets since range compactions must split them).
        Set<List<String>> bucketSet = new HashSet<List<String>>(buckets.keySet());
        for (String file : largeFileList)
        {
            bucketSet.add(Arrays.asList(new String[] { file }));
        }
        return bucketSet;
    }
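    /*
     * Worked example of the bucketing rule above (all names and sizes are hypothetical),
     * with min = 50 MB and max = 200 GB as passed in by doCompaction():
     *
     *   a = 60 MB  -> starts bucket #1 (running average 60 MB)
     *   b = 80 MB  -> joins bucket #1: within 50% of the 60 MB average; new average 70 MB
     *   c = 10 MB  -> too far from the 70 MB average and bucket #1 is not "small", so it starts bucket #2
     *   d = 20 MB  -> joins bucket #2: both the file and the bucket average are under the 50 MB floor
     *   e = 300 GB -> larger than max, so it becomes a singleton bucket that is never size-compacted
     *                 but can still be split by a range compaction
     */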
    /**
     * Performs the anti-compaction process: writes out a file containing only the keys that
     * belong to the given ranges. If no target is specified, the output is written as a
     * regular compacted file with the unneeded ranges stripped out.
     *
     * @param cfs the column family store being compacted
     * @param files sstable data files to split
     * @param ranges ranges whose keys should be retained
     * @param target endpoint that will receive the split data (may be null)
     * @param minBufferSize minimum read buffer size per input file
     * @param fileList populated with the paths of the files written out
     * @param compactedBloomFilters populated with the bloom filters of the files written out
     * @return a counting bloom filter of the keys written that are primary on the target,
     *         or null if nothing was written
     * @throws java.io.IOException
     */
    static CountingBloomFilter doRangeOnlyAntiCompaction(ColumnFamilyStore cfs,
                                                         List<String> files,
                                                         List<Range> ranges,
                                                         EndPoint target,
                                                         int minBufferSize,
                                                         List<String> fileList,
                                                         List<BloomFilter> compactedBloomFilters) throws IOException
    {
        CountingBloomFilter rangeCountingBloomFilter = null;
        long startTime = System.currentTimeMillis();
        long totalBytesRead = 0;
        long totalBytesWritten = 0;
        long totalkeysRead = 0;
        long totalkeysWritten = 0;
        String rangeFileLocation = null;
        String mergedFileName = null;
        try
        {
            // Calculate the expected compacted file size.
            long expectedRangeFileSize = getExpectedCompactedFileSize(files);
            /* in the worst case a node will be giving away half of its data, so halve the estimate */
            expectedRangeFileSize = expectedRangeFileSize / 2;
            rangeFileLocation = DatabaseDescriptor.getCompactionFileLocation(expectedRangeFileSize);
            // boolean isLoop = isLoopAround( ranges );
            // Range maxRange = getMaxRange( ranges );
            // If the compaction file path is null it means we have no space left for this compaction.
            if (rangeFileLocation == null)
            {
                logger_.warn("Total bytes to be written for range compaction ..."
                             + expectedRangeFileSize
                             + " is greater than the safe limit of the disk space available.");
                return null;
            }
            PriorityQueue<FileStruct> pq = initializePriorityQueue(files, ranges, minBufferSize);
            if (pq.size() > 0)
            {
                mergedFileName = cfs.getTempFileName();
                SSTable ssTableRange = null;
                String lastkey = null;
                List<FileStruct> lfs = new ArrayList<FileStruct>();
                DataOutputBuffer bufOut = new DataOutputBuffer();
                int expectedBloomFilterSize = SSTable.getApproximateKeyCount(files);
                expectedBloomFilterSize = (expectedBloomFilterSize > 0) ? expectedBloomFilterSize : SSTable.indexInterval();
                logger_.debug("Expected bloom filter size : " + expectedBloomFilterSize);
                /* Create the bloom filter for the compacted file. */
                BloomFilter compactedRangeBloomFilter = new BloomFilter(expectedBloomFilterSize, 8);
                List<ColumnFamily> columnFamilies = new ArrayList<ColumnFamily>();
                while (pq.size() > 0 || lfs.size() > 0)
                {
                    FileStruct fs = null;
                    if (pq.size() > 0)
                    {
                        fs = pq.poll();
                    }
                    if (fs != null && (lastkey == null || lastkey.compareTo(fs.getKey()) == 0))
                    {
                        // The keys are the same, so add this to the lfs list for merging.
                        lastkey = fs.getKey();
                        lfs.add(fs);
                    }
                    else
                    {
                        Collections.sort(lfs, new ColumnFamilyStore.FileStructComparator());
                        bufOut.reset();
                        if (lfs.size() > 1)
                        {
                            for (FileStruct filestruct : lfs)
                            {
                                try
                                {
                                    /* read the length although we don't need it */
                                    filestruct.getBufIn().readInt();
                                    // Skip the Index
                                    if (DatabaseDescriptor.isNameIndexEnabled(cfs.getColumnFamilyName()))
                                    {
                                        IndexHelper.skip(filestruct.getBufIn());
                                    }
                                    // Keep at most two column families in memory and resolve them
                                    // right away to limit the memory footprint.
                                    if (columnFamilies.size() > 1)
                                    {
                                        ColumnFamilyStore.merge(columnFamilies);
                                    }
                                    // deserialize into column families
                                    columnFamilies.add(ColumnFamily.serializer().deserialize(filestruct.getBufIn()));
                                }
                                catch (Exception ex)
                                {
                                    logger_.error(LogUtil.throwableToString(ex));
                                }
                            }
                            // Now that everything is merged, append the result to the sstable.
                            ColumnFamily columnFamily = ColumnFamilyStore.resolveAndRemoveDeleted(columnFamilies);
                            columnFamilies.clear();
                            if (columnFamily != null)
                            {
                                /* serialize the cf with column indexes */
                                ColumnFamily.serializerWithIndexes().serialize(columnFamily, bufOut);
                            }
                        }
                        else
                        {
                            FileStruct filestruct = lfs.get(0);
                            try
                            {
                                /* read the length although we don't need it */
                                int size = filestruct.getBufIn().readInt();
                                bufOut.write(filestruct.getBufIn(), size);
                            }
                            catch (Exception ex)
                            {
                                logger_.warn(LogUtil.throwableToString(ex));
                                filestruct.close();
                                continue;
                            }
                        }
                        if (Range.isKeyInRanges(ranges, lastkey))
                        {
                            if (ssTableRange == null)
                            {
                                if (target != null)
                                    rangeFileLocation = rangeFileLocation + System.getProperty("file.separator") + "bootstrap";
                                FileUtils.createDirectory(rangeFileLocation);
                                ssTableRange = new SSTable(rangeFileLocation, mergedFileName);
                            }
                            if (rangeCountingBloomFilter == null)
                            {
                                rangeCountingBloomFilter = new CountingBloomFilter(expectedBloomFilterSize, 8);
                            }
                            try
                            {
                                ssTableRange.append(lastkey, bufOut);
                                compactedRangeBloomFilter.add(lastkey);
                                if (target != null && StorageService.instance().isPrimary(lastkey, target))
                                {
                                    rangeCountingBloomFilter.add(lastkey);
                                }
                            }
                            catch (Exception ex)
                            {
                                logger_.warn(LogUtil.throwableToString(ex));
                            }
                        }
                        totalkeysWritten++;
                        for (FileStruct filestruct : lfs)
                        {
                            try
                            {
                                filestruct.getNextKey();
                                if (filestruct.isExhausted())
                                {
                                    continue;
                                }
                                /* keep looping until we find a key in the range */
                                while (!Range.isKeyInRanges(ranges, filestruct.getKey()))
                                {
                                    filestruct.getNextKey();
                                    if (filestruct.isExhausted())
                                    {
                                        break;
                                    }
                                    /* Check if we need to continue: if we are done with all ranges,
                                       empty the queue, close all file handles and exit. */
                                    //if( !isLoop && StorageService.hash(filestruct.key).compareTo(maxRange.right()) > 0 && !filestruct.key.equals(""))
                                    //{
                                    //    filestruct.reader.close();
                                    //    filestruct = null;
                                    //    break;
                                    //}
                                }
                                if (filestruct != null)
                                {
                                    pq.add(filestruct);
                                }
                                totalkeysRead++;
                            }
                            catch (Exception ex)
                            {
                                // Ignore the exception: it might be a corrupted file. In any case
                                // we have read as far as possible from it and it will be deleted
                                // after compaction.
                                logger_.error(LogUtil.throwableToString(ex));
                                filestruct.close();
                            }
                        }
                        lfs.clear();
                        lastkey = null;
                        if (fs != null)
                        {
                            // Add back fs since we have processed the rest of the filestructs.
                            pq.add(fs);
                        }
                    }
                }
                if (ssTableRange != null)
                {
                    if (fileList == null)
                        fileList = new ArrayList<String>();
                    ssTableRange.closeRename(compactedRangeBloomFilter, fileList);
                    if (compactedBloomFilters != null)
                        compactedBloomFilters.add(compactedRangeBloomFilter);
                }
            }
        }
        catch (Exception ex)
        {
            logger_.warn(LogUtil.throwableToString(ex));
        }
        logger_.debug("Total time taken for range split ..." + (System.currentTimeMillis() - startTime));
        logger_.debug("Total bytes Read for range split ..." + totalBytesRead);
        logger_.debug("Total bytes written for range split ..." + totalBytesWritten + " Total keys read ..." + totalkeysRead);
        return rangeCountingBloomFilter;
    }

    /*
     * This function does the actual compaction of files. It maintains a priority queue
     * seeded with the first key from each file, repeatedly removes the smallest key from
     * the queue, appends it to the SSTable, and reads the next key from that file, until
     * all files are exhausted. The SSTable to which the keys are written represents the
     * new compacted file. Before writing, if the same key occurs in multiple files, the
     * versions are resolved to pick up the latest data.
     */
    static CountingBloomFilter doRangeCompaction(ColumnFamilyStore cfs, List<String> files, List<Range> ranges, int minBufferSize) throws IOException
    {
        CountingBloomFilter rangeCountingBloomFilter = null;
        String newfile = null;
        long startTime = System.currentTimeMillis();
        long totalBytesRead = 0;
        long totalBytesWritten = 0;
        long totalkeysRead = 0;
        long totalkeysWritten = 0;
        try
        {
            // Calculate the expected compacted file size.
            long expectedCompactedFileSize = getExpectedCompactedFileSize(files);
            String compactionFileLocation = DatabaseDescriptor.getCompactionFileLocation(expectedCompactedFileSize);
            // If the compaction file path is null it means we have no space left for this compaction.
            if (compactionFileLocation == null)
            {
                if (ranges == null || ranges.size() == 0)
                {
                    // Drop the largest file from the set and retry with what remains.
                    String maxFile = getMaxSizeFile(files);
                    files.remove(maxFile);
                    return doRangeCompaction(cfs, files, ranges, minBufferSize);
                }
                logger_.warn("Total bytes to be written for compaction ..."
                             + expectedCompactedFileSize
                             + " is greater than the safe limit of the disk space available.");
                return null;
            }
            PriorityQueue<FileStruct> pq = initializePriorityQueue(files, ranges, minBufferSize);
            if (pq.size() > 0)
            {
                String mergedFileName = cfs.getTempFileName();
                SSTable ssTable = null;
                SSTable ssTableRange = null;
                String lastkey = null;
                List<FileStruct> mergeNeeded = new ArrayList<FileStruct>();
                DataOutputBuffer bufOut = new DataOutputBuffer();
                int expectedBloomFilterSize = SSTable.getApproximateKeyCount(files);
                expectedBloomFilterSize = (expectedBloomFilterSize > 0) ? expectedBloomFilterSize : SSTable.indexInterval();
                logger_.debug("Expected bloom filter size : " + expectedBloomFilterSize);
                /* Create the bloom filters for the compacted file. */
                BloomFilter compactedBloomFilter = new BloomFilter(expectedBloomFilterSize, 8);
                BloomFilter compactedRangeBloomFilter = new BloomFilter(expectedBloomFilterSize, 8);
                List<ColumnFamily> columnFamilies = new ArrayList<ColumnFamily>();
                while (pq.size() > 0 || mergeNeeded.size() > 0)
                {
                    // pop off the queue until we get a different key
                    FileStruct fs = pq.peek();
                    if (fs != null && (lastkey == null || lastkey.equals(fs.getKey())))
                    {
                        // The keys are the same, so add this to the merge list.
                        lastkey = fs.getKey();
                        mergeNeeded.add(pq.poll());
                        continue;
                    }
                    // merge the keys in the merge list
                    Collections.sort(mergeNeeded, new ColumnFamilyStore.FileStructComparator());
                    bufOut.reset();
                    if (mergeNeeded.size() > 1)
                    {
                        for (FileStruct filestruct : mergeNeeded)
                        {
                            try
                            {
                                /* read the length although we don't need it */
                                filestruct.getBufIn().readInt();
                                // Skip the Index
                                if (DatabaseDescriptor.isNameIndexEnabled(cfs.getColumnFamilyName()))
                                {
                                    IndexHelper.skip(filestruct.getBufIn());
                                }
                                // Keep at most two column families in memory and resolve them
                                // right away to limit the memory footprint.
                                if (columnFamilies.size() > 1)
                                {
                                    ColumnFamilyStore.merge(columnFamilies);
                                }
                                // deserialize into column families
                                columnFamilies.add(ColumnFamily.serializer().deserialize(filestruct.getBufIn()));
                            }
                            catch (Exception ex)
                            {
                                logger_.error(ex);
                                continue;
                            }
                        }
                        // Now that everything is merged, append the result to the sstable.
                        ColumnFamily columnFamily = ColumnFamilyStore.resolveAndRemoveDeleted(columnFamilies);
                        columnFamilies.clear();
                        if (columnFamily != null)
                        {
                            /* serialize the cf with column indexes */
                            ColumnFamily.serializerWithIndexes().serialize(columnFamily, bufOut);
                        }
                    }
                    else
                    {
                        FileStruct filestruct = mergeNeeded.get(0);
                        try
                        {
                            /* read the length although we don't need it */
                            int size = filestruct.getBufIn().readInt();
                            bufOut.write(filestruct.getBufIn(), size);
                        }
                        catch (Exception ex)
                        {
                            logger_.warn(LogUtil.throwableToString(ex));
                            filestruct.close();
                            continue;
                        }
                    }
                    if (Range.isKeyInRanges(ranges, lastkey))
                    {
                        if (ssTableRange == null)
                        {
                            String mergedRangeFileName = cfs.getTempFileName();
                            ssTableRange = new SSTable(DatabaseDescriptor.getBootstrapFileLocation(), mergedRangeFileName);
                        }
                        if (rangeCountingBloomFilter == null)
                        {
                            rangeCountingBloomFilter = new CountingBloomFilter(expectedBloomFilterSize, 8);
                        }
                        ssTableRange.append(lastkey, bufOut);
                        compactedRangeBloomFilter.add(lastkey);
                        rangeCountingBloomFilter.add(lastkey);
                    }
                    else
                    {
                        if (ssTable == null)
                        {
                            ssTable = new SSTable(compactionFileLocation, mergedFileName);
                        }
                        try
                        {
                            ssTable.append(lastkey, bufOut);
                        }
                        catch (Exception ex)
                        {
                            logger_.warn(LogUtil.throwableToString(ex));
                        }
                        /* Fill the bloom filter with the key */
                        compactedBloomFilter.add(lastkey);
                    }
                    totalkeysWritten++;
                    for (FileStruct filestruct : mergeNeeded)
                    {
                        try
                        {
                            filestruct.getNextKey();
                            if (filestruct.isExhausted())
                            {
                                continue;
                            }
                            pq.add(filestruct);
                            totalkeysRead++;
                        }
                        catch (Exception ex)
                        {
                            // Ignore the exception: it might be a corrupted file. In any case
                            // we have read as far as possible from it and it will be deleted
                            // after compaction.
                            logger_.error(ex);
                            filestruct.close();
                            continue;
                        }
                    }
                    mergeNeeded.clear();
                    lastkey = null;
                }
                if (ssTable != null)
                {
                    ssTable.closeRename(compactedBloomFilter);
                    newfile = ssTable.getDataFileLocation();
                }
                if (ssTableRange != null)
                {
                    ssTableRange.closeRename(compactedRangeBloomFilter);
                }
                totalBytesWritten = cfs.completeCompaction(files, newfile, totalBytesWritten, compactedBloomFilter);
            }
        }
        catch (Exception ex)
        {
            logger_.warn(LogUtil.throwableToString(ex));
        }
        logger_.debug("Total time taken for compaction ..." + (System.currentTimeMillis() - startTime));
        logger_.debug("Total bytes Read for compaction ..." + totalBytesRead);
        logger_.debug("Total bytes written for compaction ..." + totalBytesWritten + " Total keys read ..." + totalkeysRead);
        return rangeCountingBloomFilter;
    }
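    /*
     * Illustrative sketch, not called from the compaction path: it shows, over plain sorted
     * string lists, the same k-way merge idea that doRangeOnlyAntiCompaction() and
     * doRangeCompaction() above implement over FileStructs: seed a priority queue with the
     * first key of each input, repeatedly pull the smallest key, collapse duplicates, and
     * refill the queue from the input the key came from. Everything in this helper
     * (its name, inputs and return value) is hypothetical.
     */
    static List<String> mergeSortedKeysSketch(List<List<String>> sortedInputs)
    {
        final int n = sortedInputs.size();
        final String[] heads = new String[n];            // current key of each input, or null when exhausted
        final List<Iterator<String>> iterators = new ArrayList<Iterator<String>>();
        // The queue holds input indexes, ordered by each input's current head key.
        PriorityQueue<Integer> pq = new PriorityQueue<Integer>(Math.max(1, n), new Comparator<Integer>()
        {
            public int compare(Integer a, Integer b)
            {
                return heads[a].compareTo(heads[b]);
            }
        });
        for (int i = 0; i < n; i++)
        {
            Iterator<String> it = sortedInputs.get(i).iterator();
            iterators.add(it);
            if (it.hasNext())
            {
                heads[i] = it.next();
                pq.add(i);
            }
        }
        List<String> merged = new ArrayList<String>();
        String lastKey = null;
        while (pq.size() > 0)
        {
            int i = pq.poll();
            // Equal keys from different inputs collapse into one output entry, mirroring how
            // duplicate row keys are resolved before being appended to the compacted sstable.
            if (lastKey == null || !lastKey.equals(heads[i]))
                merged.add(heads[i]);
            lastKey = heads[i];
            Iterator<String> it = iterators.get(i);
            heads[i] = it.hasNext() ? it.next() : null;
            if (heads[i] != null)
                pq.add(i);    // re-queue with the advanced head, just as FileStructs are re-queued above
        }
        return merged;
    }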
    /* Upper bound on the total read-buffer memory used across all files in a compaction. */
    static int compactionMemoryThreshold_ = 1 << 30;

    static PriorityQueue<FileStruct> initializePriorityQueue(List<String> files, List<Range> ranges, int minBufferSize) throws IOException
    {
        PriorityQueue<FileStruct> pq = new PriorityQueue<FileStruct>();
        if (files.size() > 1 || (ranges != null && files.size() > 0))
        {
            int bufferSize = Math.min((compactionMemoryThreshold_ / files.size()), minBufferSize);
            FileStruct fs = null;
            for (String file : files)
            {
                try
                {
                    fs = new FileStruct(SequenceFile.bufferedReader(file, bufferSize));
                    fs.getNextKey();
                    if (fs.isExhausted())
                        continue;
                    pq.add(fs);
                }
                catch (Exception ex)
                {
                    logger_.error("corrupt sstable", ex);
                    try
                    {
                        if (fs != null)
                        {
                            fs.close();
                        }
                    }
                    catch (Exception e)
                    {
                        logger_.error("Unable to close file :" + file, e);
                    }
                }
            }
        }
        return pq;
    }

    /*
     * Stage the compactions: group files that are close in size into buckets; buckets that
     * grow past the compaction threshold are then compacted.
     *
     * Unused.
     */
    static Map<Integer, List<String>> stageOrderedCompaction(List<String> files)
    {
        Map<Integer, List<String>> buckets = new HashMap<Integer, List<String>>();
        long averages[] = new long[100];
        int count = 0;
        long max = 200L*1024L*1024L*1024L;
        long min = 50L*1024L*1024L;
        List<String> largeFileList = new ArrayList<String>();
        for (String file : files)
        {
            File f = new File(file);
            long size = f.length();
            if (size > max)
            {
                largeFileList.add(file);
                continue;
            }
            boolean bFound = false;
            for (int i = 0; i < count; i++)
            {
                if ((size > averages[i]/2 && size < 3*averages[i]/2) || (size < min && averages[i] < min))
                {
                    averages[i] = (averages[i] + size) / 2;
                    List<String> fileList = buckets.get(i);
                    if (fileList == null)
                    {
                        fileList = new ArrayList<String>();
                        buckets.put(i, fileList);
                    }
                    fileList.add(file);
                    bFound = true;
                    break;
                }
            }
            if (!bFound)
            {
                List<String> fileList = buckets.get(count);
                if (fileList == null)
                {
                    fileList = new ArrayList<String>();
                    buckets.put(count, fileList);
                }
                fileList.add(file);
                averages[count] = size;
                count++;
            }
        }
        // Put files greater than the max in separate buckets so that they are never compacted,
        // but keep them in the buckets since range compactions need to split these files.
        count++;
        for (String file : largeFileList)
        {
            List<String> tempLargeFileList = new ArrayList<String>();
            tempLargeFileList.add(file);
            buckets.put(count, tempLargeFileList);
            count++;
        }
        return buckets;
    }

    /*
     * Add up all the file sizes; this is the worst-case output size
     * for compacting the given list of files.
     */
    static long getExpectedCompactedFileSize(List<String> files)
    {
        long expectedFileSize = 0;
        for (String file : files)
        {
            File f = new File(file);
            long size = f.length();
            expectedFileSize = expectedFileSize + size;
        }
        return expectedFileSize;
    }
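    /*
     * Worked example (hypothetical sizes): compacting four sstables of 60 MB, 70 MB, 65 MB and
     * 80 MB reserves 275 MB via DatabaseDescriptor.getCompactionFileLocation(), since in the
     * worst case nothing is purged and every key survives into the new file.
     * doRangeOnlyAntiCompaction() halves this estimate, since at most roughly half of a node's
     * data is expected to be handed off for a range.
     */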
    /*
     * Find the maximum-size file in the list.
     */
    static String getMaxSizeFile(List<String> files)
    {
        long maxSize = 0L;
        String maxFile = null;
        for (String file : files)
        {
            File f = new File(file);
            if (f.length() > maxSize)
            {
                maxSize = f.length();
                maxFile = file;
            }
        }
        return maxFile;
    }

    /*
     * Return the range with the largest left token.
     */
    Range getMaxRange(List<Range> ranges)
    {
        Range maxRange = new Range(BigInteger.ZERO, BigInteger.ZERO);
        for (Range range : ranges)
        {
            if (range.left().compareTo(maxRange.left()) > 0)
            {
                maxRange = range;
            }
        }
        return maxRange;
    }

    /*
     * Return true if any of the ranges wraps around (its left token is greater than its right token).
     */
    boolean isLoopAround(List<Range> ranges)
    {
        boolean isLoop = false;
        for (Range range : ranges)
        {
            if (range.left().compareTo(range.right()) > 0)
            {
                isLoop = true;
                break;
            }
        }
        return isLoop;
    }

    public interface Wrapper
    {
        public Filter run() throws IOException;
    }
}