/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.infrastructure.db;
import com.facebook.infrastructure.config.DatabaseDescriptor;
import com.facebook.infrastructure.dht.Range;
import com.facebook.infrastructure.io.DataInputBuffer;
import com.facebook.infrastructure.io.SSTable;
import com.facebook.infrastructure.net.EndPoint;
import com.facebook.infrastructure.service.StorageService;
import com.facebook.infrastructure.utils.BloomFilter;
import com.facebook.infrastructure.utils.CountingBloomFilter;
import com.facebook.infrastructure.utils.Filter;
import com.facebook.infrastructure.utils.FileUtils;
import com.facebook.infrastructure.utils.LogUtil;
import org.apache.log4j.Logger;
import java.io.File;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.concurrent.locks.ReentrantReadWriteLock;
/**
* Author : Avinash Lakshman ( alakshman@facebook.com) & Prashant Malik ( pmalik@facebook.com )
*/
public class ColumnFamilyStore
{
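/** Orders FileStructs lexicographically by file name. */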
static class FileStructComparator implements Comparator<FileStruct>
{
public int compare(FileStruct f, FileStruct f2)
{
return f.getFileName().compareTo(f2.getFileName());
}
public boolean equals(Object o)
{
return (o instanceof FileStructComparator);
}
}
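/** Orders SSTable file names by index in descending order, so that more recently flushed files are examined first. */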
static class FileNameComparator implements Comparator<String>
{
public int compare(String f, String f2)
{
return getIndexFromFileName(f2) - getIndexFromFileName(f);
}
public boolean equals(Object o)
{
return (o instanceof FileNameComparator);
}
}
public static final int BUFFER_SIZE = 128*1024*1024;
private static Logger logger_ = Logger.getLogger(ColumnFamilyStore.class);
private String table_;
public String cfName;
/* This is used to generate the next index for a SSTable */
private AtomicInteger fileIndexGenerator_ = new AtomicInteger(0);
/* memtable associated with this ColumnFamilyStore. */
private AtomicReference<Memtable> memtable_;
private AtomicReference<BinaryMemtable> binaryMemtable_;
/* SSTables on disk for this column family */
private Set<String> ssTables_ = new HashSet<String>();
/* Modification lock used for protecting reads from compactions. */
private ReentrantReadWriteLock lock_ = new ReentrantReadWriteLock(true);
/* Flag indicates if a compaction is in process */
public AtomicBoolean isCompacting_ = new AtomicBoolean(false);
ColumnFamilyStore(String table, String columnFamily) throws IOException
{
table_ = table;
cfName = columnFamily;
/*
* Get all data files associated with old Memtables for this table.
* These files are named <Table>-<ColumnFamily>-<index>-Data.db. Find the
* maximum index among them and seed the index generator with it, so the
* next flushed file gets the next index.
*/
List<Integer> indices = new ArrayList<Integer>();
String[] dataFileDirectories = DatabaseDescriptor.getAllDataFileLocations();
for ( String directory : dataFileDirectories )
{
File fileDir = new File(directory);
File[] files = fileDir.listFiles();
if (files == null)
continue;
for (File file : files)
{
String filename = file.getName();
String[] tblCfName = getTableAndColumnFamilyName(filename);
if (tblCfName[0].equals(table_)
&& tblCfName[1].equals(columnFamily))
{
int index = getIndexFromFileName(filename);
indices.add(index);
}
}
}
Collections.sort(indices);
int value = (indices.size() > 0) ? (indices.get(indices.size() - 1)) : 0;
fileIndexGenerator_.set(value);
memtable_ = new AtomicReference<Memtable>( new Memtable(table_, cfName) );
binaryMemtable_ = new AtomicReference<BinaryMemtable>( new BinaryMemtable(table_, cfName) );
}
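/*
* Called on start-up: scans the data file directories, deletes zero-length
* and leftover temporary files for this column family, registers the
* surviving SSTables (and their index and bloom filter data) and submits
* the initial compaction work.
*/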
void onStart() throws IOException
{
/* Load the SSTables on disk and schedule a major compaction. */
List<File> ssTables = new ArrayList<File>();
String[] dataFileDirectories = DatabaseDescriptor.getAllDataFileLocations();
for ( String directory : dataFileDirectories )
{
File fileDir = new File(directory);
File[] files = fileDir.listFiles();
if (files == null)
continue;
for (File file : files)
{
String filename = file.getName();
if(((file.length() == 0) || (filename.indexOf("-" + SSTable.temporaryFile_) != -1) ) && (filename.indexOf(cfName) != -1))
{
file.delete();
continue;
}
String[] tblCfName = getTableAndColumnFamilyName(filename);
if (tblCfName[0].equals(table_)
&& tblCfName[1].equals(cfName)
&& filename.indexOf("-Data.db") != -1)
{
ssTables.add(file.getAbsoluteFile());
}
}
}
Collections.sort(ssTables, new FileUtils.FileComparator());
List<String> filenames = new ArrayList<String>();
for (File ssTable : ssTables)
{
filenames.add(ssTable.getAbsolutePath());
}
/* Register all the data files with the list of SSTables this store tracks. */
ssTables_.addAll(filenames);
/* Load the index files and the Bloom Filters associated with them. */
SSTable.onStart(filenames);
logger_.debug("Submitting a major compaction task ...");
CompactionManager.instance().submit(this);
if(cfName.equals(Table.hints_))
{
HintedHandOffManager.instance().submit(this);
}
CompactionManager.instance().submitPeriodicCompaction(this);
}
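/* Returns a snapshot of the SSTable file names currently tracked by this store. */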
List<String> getAllSSTablesOnDisk()
{
return new ArrayList<String>(ssTables_);
}
/*
* This method is called to obtain statistics about the Column Family
* represented by this Column Family Store. It reports the total number
* of files on disk and the total space occupied by the data files
* associated with this Column Family.
*/
public String cfStats(String newLineSeparator, java.text.DecimalFormat df)
{
StringBuilder sb = new StringBuilder();
/*
* If there are no files on disk, return an empty string so that
* nothing ugly is displayed on the admin page.
*/
if ( ssTables_.size() == 0 )
{
return sb.toString();
}
sb.append(cfName + " statistics :");
sb.append(newLineSeparator);
sb.append("Number of files on disk : " + ssTables_.size());
sb.append(newLineSeparator);
double totalSpace = 0d;
for ( String file : ssTables_ )
{
File f = new File(file);
totalSpace += f.length();
}
String diskSpace = FileUtils.stringifyFileSize(totalSpace);
sb.append("Total disk space : " + diskSpace);
sb.append(newLineSeparator);
sb.append("--------------------------------------");
sb.append(newLineSeparator);
return sb.toString();
}
public boolean isSuper() {
return DatabaseDescriptor.getColumnType(cfName).equals("Super");
}
/*
* This is called after bootstrap to add the files
* to the list of files maintained.
*/
void addToList(String file)
{
lock_.writeLock().lock();
try
{
ssTables_.add(file);
}
finally
{
lock_.writeLock().unlock();
}
}
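/*
* Touches the given key in every SSTable whose bloom filter reports the key
* as present. The fData flag is passed through to SSTable.touch(); it
* presumably selects whether the data itself is read or only the index.
*/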
void touch(String key, boolean fData) throws IOException
{
/* Scan the SSTables on disk first */
lock_.readLock().lock();
try
{
List<String> files = new ArrayList<String>(ssTables_);
for (String file : files)
{
/*
* Get the BloomFilter associated with this file. Check if the key
* is present in the BloomFilter. If not continue to the next file.
*/
boolean bVal = SSTable.isKeyInFile(key, file);
if ( !bVal )
continue;
SSTable ssTable = new SSTable(file);
ssTable.touch(key, fData);
}
}
finally
{
lock_.readLock().unlock();
}
}
/*
* This method forces a compaction of the SSTables on disk. We wait
* for the process to complete by waiting on a future pointer.
*/
CountingBloomFilter forceCompaction(List<Range> ranges, EndPoint target, long skip, List<String> fileList) throws ExecutionException, InterruptedException {
CountingBloomFilter cbf = null;
Future<CountingBloomFilter> futurePtr = null;
if( ranges != null)
futurePtr = CompactionManager.instance().submit(this, ranges, target, fileList);
else
futurePtr = CompactionManager.instance().submitMajor(this, ranges, skip);
/* Waiting for the compaction to complete. */
cbf = futurePtr.get();
logger_.debug("Done forcing compaction ...");
return cbf;
}
String getColumnFamilyName()
{
return cfName;
}
private String[] getTableAndColumnFamilyName(String filename)
{
StringTokenizer st = new StringTokenizer(filename, "-");
String[] values = new String[2];
int i = 0;
while (st.hasMoreElements())
{
if (i == 0)
values[i] = (String) st.nextElement();
else if (i == 1)
{
values[i] = (String) st.nextElement();
break;
}
++i;
}
return values;
}
protected static int getIndexFromFileName(String filename)
{
/*
* File names are of the form <table>-<column family>-<index>-Data.db.
* Tokenizing on "-" leaves the index as the second-to-last token.
*/
StringTokenizer st = new StringTokenizer(filename, "-");
/*
* Walk the tokens and stop at the second-to-last one, which is the
* index portion of the filename.
*/
int count = st.countTokens();
int i = 0;
String index = null;
while (st.hasMoreElements())
{
index = (String) st.nextElement();
if (i == (count - 2))
break;
++i;
}
return Integer.parseInt(index);
}
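/*
* Generates the next SSTable file name of the form
* <table>-<column family>-<index>, using the index generator seeded in the
* constructor.
*/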
String getNextFileName()
{
String name = table_ + "-" + cfName + "-" + fileIndexGenerator_.incrementAndGet();
return name;
}
/*
* Return a temporary file name.
*/
public String getTempFileName()
{
String name = table_ + "-" + cfName + "-" + SSTable.temporaryFile_ + "-" + fileIndexGenerator_.incrementAndGet() ;
return name;
}
/*
* This version is used only on start up when we are recovering from logs.
* In the future we may want to parallelize the log processing for a table
* by having a thread per log file present for recovery. Re-visit at that
* time.
*/
void switchMemtable(String key, ColumnFamily columnFamily, CommitLog.CommitLogContext cLogCtx) throws IOException
{
memtable_.set( new Memtable(table_, cfName) );
if(!key.equals(Memtable.FLUSH_KEY))
memtable_.get().put(key, columnFamily, cLogCtx);
}
/*
* This version is used when we forceflush.
*/
void switchMemtable() throws IOException
{
memtable_.set( new Memtable(table_, cfName) );
}
/*
* This version is used only on start up when we are recovering from logs.
* In the future we may want to parallelize the log processing for a table
* by having a thread per log file present for recovery. Re-visit at that
* time.
*/
void switchBinaryMemtable(String key, byte[] buffer) throws IOException
{
binaryMemtable_.set( new BinaryMemtable(table_, cfName) );
binaryMemtable_.get().put(key, buffer);
}
void forceFlush() throws IOException
{
//MemtableManager.instance().submit(getColumnFamilyName(), memtable_.get() , CommitLog.CommitLogContext.NULL);
//memtable_.get().flush(true, CommitLog.CommitLogContext.NULL);
memtable_.get().forceflush(this);
}
public void flushMemtable() throws IOException {
memtable_.get().flushInPlace();
}
void forceFlushBinary() throws IOException
{
BinaryMemtableManager.instance().submit(getColumnFamilyName(), binaryMemtable_.get());
//binaryMemtable_.get().flush(true);
}
/*
* Insert/Update the column family for this key.
* param @ key - key for update/insert
* param @ columnFamily - columnFamily changes
* param @ cLogCtx - commit log context for this write
*/
void apply(String key, ColumnFamily columnFamily, CommitLog.CommitLogContext cLogCtx)
throws IOException
{
memtable_.get().put(key, columnFamily, cLogCtx);
}
/*
* Insert/Update the binary buffer for this key.
* param @ key - key for update/insert
* param @ buffer - serialized columnFamily changes
*/
void applyBinary(String key, byte[] buffer)
throws IOException
{
binaryMemtable_.get().put(key, buffer);
}
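/**
* Reads the column family for the given key, merging the current Memtable,
* any flushed-but-not-compacted memtables and the SSTables on disk, and
* stripping columns shadowed by tombstones before returning.
*
* A minimal usage sketch (the Table.open()/getColumnFamilyStore() calls and
* the IdentityFilter name are assumptions about the surrounding codebase):
*
*   ColumnFamilyStore cfs = Table.open("Table1").getColumnFamilyStore("Standard1");
*   ColumnFamily cf = cfs.getColumnFamily("key1", "Standard1:column1", new IdentityFilter());
*/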
public ColumnFamily getColumnFamily(String key, String columnFamilyColumn, IFilter filter) throws IOException
{
List<ColumnFamily> columnFamilies = getColumnFamiliesForKey(key, columnFamilyColumn, filter);
return resolveAndRemoveDeleted(columnFamilies);
}
/**
* Get the column family in the most efficient order:
* 1. Current Memtable
* 2. Historical Memtables held by the MemtableFlushManager
* 3. Sorted list of SSTable files on disk
*/
List<ColumnFamily> getColumnFamiliesForKey(String key, String columnFamilyColumn, IFilter filter) throws IOException {
List<ColumnFamily> columnFamilies = new ArrayList<ColumnFamily>();
long start = System.currentTimeMillis();
/* Get the ColumnFamily from Memtable */
getColumnFamilyFromCurrentMemtable(key, columnFamilyColumn, filter, columnFamilies);
if (columnFamilies.size() == 0 || !filter.isDone()) {
/* Check if MemtableManager has any historical information */
MemtableFlushManager.instance().getColumnFamily(key, cfName, columnFamilyColumn, filter, columnFamilies);
}
if (columnFamilies.size() == 0 || !filter.isDone()) {
getColumnFamilyFromDisk(key, columnFamilyColumn, columnFamilies, filter);
logger_.trace("DISK TIME: " + (System.currentTimeMillis() - start) + " ms.");
}
return columnFamilies;
}
/**
* Fetch from the disk files, going in sorted order to be efficient.
* This function exits as soon as the required data is found.
* @param key
* @param columnFamilyColumn
* @param columnFamilies
* @param filter
* @throws IOException
*/
private void getColumnFamilyFromDisk(String key, String columnFamilyColumn, List<ColumnFamily> columnFamilies, IFilter filter) throws IOException
{
/* Scan the SSTables on disk first */
lock_.readLock().lock();
try
{
List<String> files = new ArrayList<String>(ssTables_);
Collections.sort(files, new FileNameComparator());
for (String file : files)
{
/*
* Get the BloomFilter associated with this file. Check if the key
* is present in the BloomFilter. If not continue to the next file.
*/
boolean bVal = SSTable.isKeyInFile(key, file);
if ( !bVal )
continue;
ColumnFamily columnFamily = fetchColumnFamily(key, columnFamilyColumn, filter, file);
long start = System.currentTimeMillis();
if (columnFamily != null)
{
/*
* TODO
* By using the filter before removing deleted columns (which is done by resolve())
* we have an efficient implementation of the time filter,
* but for the count filter this can return wrong results,
* so we need to take care of that later.
*/
columnFamilies.add(columnFamily);
if(filter.isDone())
{
break;
}
}
logger_.info("DISK Data structure population TIME: " + (System.currentTimeMillis() - start)
+ " ms.");
}
files.clear();
}
finally
{
lock_.readLock().unlock();
}
}
private ColumnFamily fetchColumnFamily(String key, String columnFamilyColumn, IFilter filter, String ssTableFile) throws IOException
{
SSTable ssTable = new SSTable(ssTableFile);
long start = System.currentTimeMillis();
DataInputBuffer bufIn = null;
bufIn = filter.next(key, columnFamilyColumn, ssTable);
logger_.info("DISK ssTable.next TIME: " + (System.currentTimeMillis() - start) + " ms.");
if (bufIn.getLength() == 0)
return null;
start = System.currentTimeMillis();
ColumnFamily columnFamily = ColumnFamily.serializer().deserialize(bufIn, columnFamilyColumn, filter);
logger_.info("DISK Deserialize TIME: " + (System.currentTimeMillis() - start) + " ms.");
return columnFamily;
}
private void getColumnFamilyFromCurrentMemtable(String key, String cf, IFilter filter, List<ColumnFamily> columnFamilies)
{
/* Get the ColumnFamily from Memtable */
ColumnFamily columnFamily = memtable_.get().get(key, cf, filter);
if (columnFamily != null)
{
columnFamilies.add(columnFamily);
}
}
/** merge all columnFamilies into a single instance, with only the newest versions of columns preserved. */
static ColumnFamily resolve(List<ColumnFamily> columnFamilies)
{
int size = columnFamilies.size();
if (size == 0)
return null;
// start from nothing so that we don't include potential deleted columns from the first instance
String cfname = columnFamilies.get(0).name();
ColumnFamily cf = new ColumnFamily(cfname);
// merge
for (ColumnFamily cf2 : columnFamilies)
{
assert cf.name().equals(cf2.name());
logger_.trace(cf + " merging " + cf2);
cf.addColumns(cf2);
cf.delete(Math.max(cf.getMarkedForDeleteAt(), cf2.getMarkedForDeleteAt()));
logger_.trace("merged as " + cf);
}
return cf;
}
/** like resolve, but leaves the resolved CF as the only item in the list */
static void merge(List<ColumnFamily> columnFamilies)
{
ColumnFamily cf = resolve(columnFamilies);
columnFamilies.clear();
columnFamilies.add(cf);
}
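/** Resolves the column families into a single instance and strips columns shadowed by tombstones. */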
public static ColumnFamily resolveAndRemoveDeleted(List<ColumnFamily> columnFamilies) {
ColumnFamily cf = resolve(columnFamilies);
return removeDeleted(cf);
}
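/*
* Removes deleted columns from the resolved column family: a regular column
* survives only if it is not marked for delete and its timestamp is at least
* the column family's deletion timestamp; super columns are rebuilt from the
* sub-columns that pass the same test against the larger of the two deletion
* timestamps.
*/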
static ColumnFamily removeDeleted(ColumnFamily cf) {
if (cf == null) {
return cf;
}
for (String cname : new ArrayList<String>(cf.getColumns().keySet())) {
IColumn c = cf.getColumns().get(cname);
if (c instanceof SuperColumn) {
long min_timestamp = Math.max(c.getMarkedForDeleteAt(), cf.getMarkedForDeleteAt());
// don't operate directly on the supercolumn, it could be the one in the memtable
cf.remove(cname);
IColumn sc = new SuperColumn(cname);
for (IColumn subColumn : c.getSubColumns()) {
if (!subColumn.isMarkedForDelete() && subColumn.timestamp() >= min_timestamp) {
sc.addColumn(subColumn.name(), subColumn);
}
}
if (sc.getSubColumns().size() > 0) {
cf.addColumn(sc);
logger_.trace("adding sc " + sc.name() + " to CF with " + sc.getSubColumns().size() + " columns: " + sc);
}
} else if (c.isMarkedForDelete() || c.timestamp() < cf.getMarkedForDeleteAt()) {
cf.remove(cname);
}
}
logger_.trace("after removeDeleted: " + cf);
return cf;
}
/*
* This version is used only on start up when we are recovering from logs.
* Hence no locking is required since we process logs on the main thread. In
* the future we may want to parallelize the log processing for a table by
* having a thread per log file present for recovery. Re-visit at that time.
*/
void applyNow(String key, ColumnFamily columnFamily) throws IOException
{
memtable_.get().putOnRecovery(key, columnFamily);
}
/*
* This method is called when the Memtable is frozen and ready to be flushed
* to disk. This method informs the CommitLog that a particular ColumnFamily
* is being flushed to disk.
*/
void onMemtableFlush(CommitLog.CommitLogContext cLogCtx) throws IOException
{
if ( cLogCtx.isValidContext() )
CommitLog.open(table_).onMemtableFlush(cfName, cLogCtx);
}
/*
* Called after the Memtable flushes its in-memory data. This information is
* cached in the ColumnFamilyStore. This is useful for reads because the
* ColumnFamilyStore first looks in the in-memory store and then into the
* disk to find the key. If invoked during recoveryMode the
* onMemtableFlush() need not be invoked.
*
* param @ filename - filename just flushed to disk
* param @ bf - bloom filter which indicates the keys that are in this file.
*/
void storeLocation(String filename, BloomFilter bf) throws IOException
{
int ssTableSize = 0;
lock_.writeLock().lock();
try
{
ssTables_.add(filename);
SSTable.storeBloomFilter(filename, bf);
ssTableSize = ssTables_.size();
}
finally
{
lock_.writeLock().unlock();
}
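/*
* Kick off a compaction when the SSTable count reaches the threshold and no
* compaction is running, or, while one is already running, every time
* another THRESHOLD files accumulate.
*/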
if ((ssTableSize >= ColumnFamilyCompactor.THRESHOLD && !isCompacting_.get())
|| (isCompacting_.get() && ssTableSize % ColumnFamilyCompactor.THRESHOLD == 0))
{
logger_.debug("Submitting for compaction ...");
CompactionManager.instance().submit(this);
}
}
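/*
* Runs the supplied compaction task with the isCompacting_ flag set for its
* duration, so other code can tell that a compaction is in progress.
*/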
public Filter compact(ColumnFamilyCompactor.Wrapper r) {
logger_.debug("Started compaction ..." + cfName);
assert !isCompacting_.get();
isCompacting_.set(true);
try
{
return r.run();
}
catch (Throwable t)
{
throw new RuntimeException(t);
}
finally
{
isCompacting_.set(false);
logger_.debug("Finished compaction ..." + cfName);
}
}
void forceCleanup()
{
CompactionManager.instance().submitCleanup(this);
}
/**
* cleans up one particular file by removing keys that this node is not responsible for.
* @param file
* @throws IOException
*/
/* TODO: Take care of the comments later. */
void doCleanup(String file) throws IOException
{
if(file == null )
return;
List<Range> myRanges = null;
List<String> files = new ArrayList<String>();
files.add(file);
List<String> newFiles = new ArrayList<String>();
Map<EndPoint, List<Range>> endPointtoRangeMap = StorageService.instance().constructEndPointToRangesMap();
myRanges = endPointtoRangeMap.get(StorageService.getLocalStorageEndPoint());
List<BloomFilter> compactedBloomFilters = new ArrayList<BloomFilter>();
ColumnFamilyCompactor.doRangeOnlyAntiCompaction(this, files, myRanges, null, BUFFER_SIZE, newFiles, compactedBloomFilters);
logger_.info("Original file : " + file + " of size " + new File(file).length());
lock_.writeLock().lock();
try
{
ssTables_.remove(file);
SSTable.removeAssociatedBloomFilter(file);
for (String newfile : newFiles)
{
logger_.info("New file : " + newfile + " of size " + new File(newfile).length());
if ( newfile != null )
{
ssTables_.add(newfile);
logger_.info("Inserting bloom filter for file " + newfile);
SSTable.storeBloomFilter(newfile, compactedBloomFilters.get(0));
}
}
SSTable.delete(file);
}
finally
{
lock_.writeLock().unlock();
}
}
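/*
* Swaps the compacted input files for the newly written SSTable under the
* write lock, then deletes the originals outside the lock. Returns the size
* of the new file, or the value passed in when no new file was produced.
*/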
long completeCompaction(List<String> files, String newfile, long totalBytesWritten, BloomFilter compactedBloomFilter) {
lock_.writeLock().lock();
try
{
for (String file : files)
{
ssTables_.remove(file);
SSTable.removeAssociatedBloomFilter(file);
}
if ( newfile != null )
{
ssTables_.add(newfile);
logger_.info("Inserting bloom filter for file " + newfile);
SSTable.storeBloomFilter(newfile, compactedBloomFilter);
totalBytesWritten = (new File(newfile)).length();
}
}
finally
{
lock_.writeLock().unlock();
}
for (String file : files)
{
SSTable.delete(file);
}
return totalBytesWritten;
}
public Memtable getMemtable() {
return memtable_.get();
}
public Set<String> getSSTables() {
return Collections.unmodifiableSet(ssTables_);
}
}