/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.facebook.infrastructure.db;

import com.facebook.infrastructure.concurrent.DebuggableThreadPoolExecutor;
import com.facebook.infrastructure.concurrent.ThreadFactoryImpl;
import com.facebook.infrastructure.config.DatabaseDescriptor;
import com.facebook.infrastructure.io.DataOutputBuffer;
import com.facebook.infrastructure.io.SSTable;
import com.facebook.infrastructure.utils.BloomFilter;
import com.facebook.infrastructure.utils.DestructivePQIterator;
import com.facebook.infrastructure.utils.LogUtil;
import org.apache.log4j.Logger;

import java.io.IOException;
import java.util.*;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;

/**
 * Author : Avinash Lakshman ( alakshman@facebook.com) & Prashant Malik ( pmalik@facebook.com )
 */
public class Memtable implements MemtableMBean, Comparable<Memtable>
{
    private static final Logger logger_ = Logger.getLogger(Memtable.class);
    private static final Map<String, ExecutorService> apartments_ = new HashMap<String, ExecutorService>();
    public static final String FLUSH_KEY = "__FlushKey__";

    public static void shutdown()
    {
        Set<String> names = apartments_.keySet();
        for (String name : names)
        {
            apartments_.get(name).shutdownNow();
        }
    }

    private int threshold_ = 128 * 1024 * 1024;
    private int thresholdCount_ = 1024 * 1024;
    private AtomicInteger currentSize_ = new AtomicInteger(0);
    private AtomicInteger currentObjectCount_ = new AtomicInteger(0);

    /* Table and ColumnFamily name are used to determine the ColumnFamilyStore */
    private String table_;
    private String cfName_;
    /* Creation time of this Memtable */
    private long creationTime_;
    private boolean isFrozen_ = false;
    private Map<String, ColumnFamily> columnFamilies_ = new HashMap<String, ColumnFamily>();
    /* Lock and Condition for notifying new clients about Memtable switches */
    Lock lock_ = new ReentrantLock();
    Condition condition_;

    Memtable(String table, String cfName) throws IOException
    {
        if (apartments_.get(cfName) == null)
        {
            apartments_.put(cfName, new DebuggableThreadPoolExecutor(1, 1, Integer.MAX_VALUE, TimeUnit.SECONDS,
                    new LinkedBlockingQueue<Runnable>(), new ThreadFactoryImpl("FAST-MEMTABLE-POOL")));
        }
        condition_ = lock_.newCondition();
        table_ = table;
        cfName_ = cfName;
        creationTime_ = System.currentTimeMillis();
    }

    public Iterator<String> sortedKeyIterator()
    {
        return new DestructivePQIterator<String>(new PriorityQueue<String>(columnFamilies_.keySet()));
    }
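    /*
     * Note on threading (inferred from the executor set up in the constructor): each column
     * family gets its own single-threaded "apartment" executor in apartments_. The Putter and
     * Getter tasks below are submitted to that executor (Remover follows the same pattern), so
     * reads and writes routed through put()/get() for a given column family are serialized on
     * one thread rather than taking a lock around the non-thread-safe columnFamilies_ map.
     */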
    class Putter implements Runnable
    {
        private String key_;
        private ColumnFamily columnFamily_;

        Putter(String key, ColumnFamily cf)
        {
            key_ = key;
            columnFamily_ = cf;
        }

        public void run()
        {
            resolve(key_, columnFamily_);
        }
    }

    class Getter implements Callable<ColumnFamily>
    {
        private String key_;
        private String columnFamilyName_;
        private IFilter filter_;

        Getter(String key, String cfName)
        {
            key_ = key;
            columnFamilyName_ = cfName;
        }

        Getter(String key, String cfName, IFilter filter)
        {
            this(key, cfName);
            filter_ = filter;
        }

        public ColumnFamily call()
        {
            ColumnFamily cf = getLocalCopy(key_, columnFamilyName_, filter_);
            return cf;
        }
    }

    class Remover implements Runnable
    {
        private String key_;
        private ColumnFamily columnFamily_;

        Remover(String key, ColumnFamily columnFamily)
        {
            key_ = key;
            columnFamily_ = columnFamily;
        }

        public void run()
        {
            resolve(key_, columnFamily_);
        }
    }

    /**
     * Compares two Memtables based on creation time.
     * @param rhs the Memtable to compare against
     * @return 1 if this Memtable was created after rhs, -1 if before, 0 otherwise
     */
    public int compareTo(Memtable rhs)
    {
        long diff = creationTime_ - rhs.creationTime_;
        if (diff > 0)
            return 1;
        else if (diff < 0)
            return -1;
        else
            return 0;
    }

    public int getMemtableThreshold()
    {
        return currentSize_.get();
    }

    void resolveSize(int oldSize, int newSize)
    {
        currentSize_.addAndGet(newSize - oldSize);
    }

    void resolveCount(int oldCount, int newCount)
    {
        currentObjectCount_.addAndGet(newCount - oldCount);
    }

    private boolean isLifetimeViolated()
    {
        /* Memtable lifetime in milliseconds (the configured value is in hours) */
        long lifetimeInMillis = DatabaseDescriptor.getMemtableLifetime() * 3600 * 1000;
        return ((System.currentTimeMillis() - creationTime_) >= lifetimeInMillis);
    }

    boolean isThresholdViolated(String key)
    {
        boolean bVal = false; // isLifetimeViolated();
        if (currentSize_.get() >= threshold_ || currentObjectCount_.get() >= thresholdCount_ || bVal || key.equals(FLUSH_KEY))
        {
            if (bVal)
                logger_.info("Memtable's lifetime for " + cfName_ + " has been violated.");
            return true;
        }
        return false;
    }

    String getColumnFamily()
    {
        return cfName_;
    }

    /*
     * This version is used by the external clients to put data into
     * the memtable. This version will respect the threshold and flush
     * the memtable to disk when the size exceeds the threshold: the
     * memtable is frozen, handed to the MemtableFlushManager, and the
     * ColumnFamilyStore switches to a fresh memtable for subsequent writes.
     */
    void put(String key, ColumnFamily columnFamily, CommitLog.CommitLogContext cLogCtx) throws IOException
    {
        if (isThresholdViolated(key))
        {
            lock_.lock();
            try
            {
                ColumnFamilyStore cfStore = Table.open(table_).getColumnFamilyStore(cfName_);
                if (!isFrozen_)
                {
                    isFrozen_ = true;
                    MemtableFlushManager.instance().submit(cfStore.getColumnFamilyName(), this, cLogCtx);
                    cfStore.switchMemtable(key, columnFamily, cLogCtx);
                }
                else
                {
                    cfStore.apply(key, columnFamily, cLogCtx);
                }
            }
            finally
            {
                lock_.unlock();
            }
        }
        else
        {
            Runnable putter = new Putter(key, columnFamily);
            apartments_.get(cfName_).submit(putter);
        }
    }
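    /*
     * How a forced flush works (inferred from isThresholdViolated() and forceflush() below):
     * writes keyed with FLUSH_KEY always trip the threshold check, so forceflush() applies a
     * dummy RowMutation under FLUSH_KEY; when that mutation makes its way back into put(),
     * the memtable is frozen and scheduled for flushing just as if it had filled up.
     */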
    /*
     * This version is used to switch memtable and force flush.
     */
    public void forceflush(ColumnFamilyStore cfStore) throws IOException
    {
        RowMutation rm = new RowMutation(DatabaseDescriptor.getTables().get(0), FLUSH_KEY);
        try
        {
            if (cfStore.isSuper())
            {
                rm.add(cfStore.cfName + ":SC1:Column", "0".getBytes(), 0);
            }
            else
            {
                rm.add(cfStore.cfName + ":Column", "0".getBytes(), 0);
            }
            rm.apply();
        }
        catch (ColumnFamilyNotDefinedException ex)
        {
            logger_.debug(LogUtil.throwableToString(ex));
        }
    }

    void flushInPlace() throws IOException
    {
        flushInPlace(CommitLog.CommitLogContext.NULL);
    }

    private void resolve(String key, ColumnFamily columnFamily)
    {
        ColumnFamily oldCf = columnFamilies_.get(key);
        if (oldCf != null)
        {
            int oldSize = oldCf.size();
            int oldObjectCount = oldCf.getColumnCount();
            oldCf.addColumns(columnFamily);
            int newSize = oldCf.size();
            int newObjectCount = oldCf.getColumnCount();
            resolveSize(oldSize, newSize);
            resolveCount(oldObjectCount, newObjectCount);
            // TODO we could save compaction some work by removing all known-to-be-deleted columns from memory
            // (but this is not high priority because, if you're deleting a CF or supercolumn, having most of
            // its data still unflushed in the memtable seems like it would be uncommon.)
            oldCf.delete(Math.max(oldCf.getMarkedForDeleteAt(), columnFamily.getMarkedForDeleteAt()));
        }
        else
        {
            columnFamilies_.put(key, columnFamily);
            currentSize_.addAndGet(columnFamily.size() + key.length());
            currentObjectCount_.addAndGet(columnFamily.getColumnCount());
        }
    }

    /*
     * This version is called on commit log recovery. The threshold
     * is not respected and a forceFlush() needs to be invoked to flush
     * the contents to disk.
     */
    void putOnRecovery(String key, ColumnFamily columnFamily) throws IOException
    {
        if (!key.equals(Memtable.FLUSH_KEY))
            resolve(key, columnFamily);
    }
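    /*
     * The columnFamilyColumn argument below is a ':'-separated path, as parsed by
     * RowMutation.getColumnAndColumnFamily(): "cf" returns the whole column family,
     * "cf:column" a single (super or normal) column, and "cf:superColumn:subColumn"
     * a single subcolumn of a super column. (Summary inferred from the branches in
     * getLocalCopy().)
     */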
    ColumnFamily getLocalCopy(String key, String columnFamilyColumn, IFilter filter)
    {
        String[] values = RowMutation.getColumnAndColumnFamily(columnFamilyColumn);
        ColumnFamily columnFamily = null;
        if (values.length == 1)
        {
            columnFamily = columnFamilies_.get(key);
        }
        else
        {
            ColumnFamily cFamily = columnFamilies_.get(key);
            if (cFamily == null)
            {
                return null;
            }
            if (values.length == 2)
            {
                IColumn column = cFamily.getColumn(values[1]); // super or normal column
                if (column != null)
                {
                    columnFamily = new ColumnFamily(cfName_);
                    columnFamily.addColumn(column);
                }
            }
            else
            {
                assert values.length == 3;
                SuperColumn superColumn = (SuperColumn) cFamily.getColumn(values[1]);
                if (superColumn != null)
                {
                    IColumn subColumn = superColumn.getSubColumn(values[2]);
                    if (subColumn != null)
                    {
                        columnFamily = new ColumnFamily(cfName_);
                        columnFamily.addColumn(values[1] + ":" + values[2], subColumn.value(), subColumn.timestamp(), subColumn.isMarkedForDelete());
                    }
                }
            }
        }
        if (columnFamily == null)
        {
            return null;
        }
        /* Filter unnecessary data from the column based on the provided filter */
        return filter.filter(columnFamilyColumn, columnFamily);
    }

    ColumnFamily get(String key, String cfName)
    {
        Callable<ColumnFamily> call = new Getter(key, cfName);
        ColumnFamily cf = null;
        try
        {
            cf = apartments_.get(cfName_).submit(call).get();
        }
        catch (ExecutionException ex)
        {
            logger_.debug(LogUtil.throwableToString(ex));
        }
        catch (InterruptedException ex2)
        {
            logger_.debug(LogUtil.throwableToString(ex2));
        }
        return cf;
    }

    ColumnFamily get(String key, String cfName, IFilter filter)
    {
        Callable<ColumnFamily> call = new Getter(key, cfName, filter);
        ColumnFamily cf = null;
        try
        {
            cf = apartments_.get(cfName_).submit(call).get();
        }
        catch (ExecutionException ex)
        {
            logger_.debug(LogUtil.throwableToString(ex));
        }
        catch (InterruptedException ex2)
        {
            logger_.debug(LogUtil.throwableToString(ex2));
        }
        return cf;
    }

    /*
     * Flushes the current contents to an SSTable on disk.
     * param cLogCtx - commit log context recorded when this flush was triggered,
     * passed through to ColumnFamilyStore.onMemtableFlush().
     */
    void flushInPlace(CommitLog.CommitLogContext cLogCtx) throws IOException
    {
        ColumnFamilyStore cfStore = Table.open(table_).getColumnFamilyStore(cfName_);
        if (columnFamilies_.size() == 0)
        {
            // This should be called even if there is nothing to flush for a given
            // column family (like Hints etc.), because we should still try to delete
            // the useless commit logs.
            cfStore.onMemtableFlush(cLogCtx);
            return;
        }

        String directory = DatabaseDescriptor.getDataFileLocation();
        String filename = cfStore.getNextFileName();

        /*
         * Use the SSTable to write the contents of the memtable
         * to disk.
         */
        SSTable ssTable = new SSTable(directory, filename);
        List<String> keys = new ArrayList<String>(columnFamilies_.keySet());
        Collections.sort(keys);
        DataOutputBuffer buffer = new DataOutputBuffer();
        /* Use this BloomFilter to decide if a key exists in a SSTable */
        BloomFilter bf = new BloomFilter(keys.size(), 8);
        for (String key : keys)
        {
            buffer.reset();
            ColumnFamily columnFamily = columnFamilies_.get(key);
            if (columnFamily != null)
            {
                /* serialize the cf with column indexes */
                ColumnFamily.serializerWithIndexes().serialize(columnFamily, buffer);
                /* Now write the key and value to disk */
                ssTable.append(key, buffer);
                bf.add(key);
                columnFamily.clear();
            }
        }
        ssTable.close(bf);
        cfStore.onMemtableFlush(cLogCtx);
        cfStore.storeLocation(ssTable.getDataFileLocation(), bf);
        columnFamilies_.clear();
        buffer.close();
    }
}