/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cassandra.db; import java.io.File; import java.io.IOException; import java.util.Arrays; import java.util.Comparator; import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentNavigableMap; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.ExecutorService; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.locks.Condition; import org.apache.log4j.Logger; import org.apache.commons.lang.ArrayUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.filter.*; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.proc.IRowProcessor; import org.apache.cassandra.db.proc.RowProcessorChain; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.io.BloomFilterWriter; import org.apache.cassandra.io.SSTableReader; import org.apache.cassandra.io.SSTableWriter; import org.apache.cassandra.io.util.DataOutputBuffer; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.WrappedRunnable; public class Memtable implements Comparable<Memtable>, IFlushable { private static final Logger logger = Logger.getLogger(Memtable.class); private volatile boolean isFrozen; private final int THRESHOLD = DatabaseDescriptor.getMemtableThroughput() * 1024*1024; // not static since we might want to change at runtime private final int THRESHOLD_COUNT = (int)(DatabaseDescriptor.getMemtableOperations() * 1024*1024); private final AtomicInteger currentThroughput = new AtomicInteger(0); private final AtomicInteger currentOperations = new AtomicInteger(0); private final long creationTime; private final ConcurrentNavigableMap<DecoratedKey, ColumnFamily> columnFamilies = new ConcurrentSkipListMap<DecoratedKey, ColumnFamily>(); private final IPartitioner partitioner = StorageService.getPartitioner(); private final ColumnFamilyStore cfs; public Memtable(ColumnFamilyStore cfs) { this.cfs = cfs; creationTime = System.currentTimeMillis(); } /** * Compares two Memtable based on creation time. * @param rhs Memtable to compare to. * @return a negative integer, zero, or a positive integer as this object * is less than, equal to, or greater than the specified object. */ public int compareTo(Memtable rhs) { long diff = creationTime - rhs.creationTime; if ( diff > 0 ) return 1; else if ( diff < 0 ) return -1; else return 0; } public int getCurrentThroughput() { return currentThroughput.get(); } public int getCurrentOperations() { return currentOperations.get(); } boolean isThresholdViolated() { return currentThroughput.get() >= this.THRESHOLD || currentOperations.get() >= this.THRESHOLD_COUNT; } boolean isFrozen() { return isFrozen; } void freeze() { isFrozen = true; } /** * Should only be called by ColumnFamilyStore.apply. NOT a public API. * (CFS handles locking to avoid submitting an op * to a flushing memtable. Any other way is unsafe.) */ void put(String key, ColumnFamily columnFamily) { assert !isFrozen; // not 100% foolproof but hell, it's an assert resolve(key, columnFamily); } private void resolve(String key, ColumnFamily cf) { currentThroughput.addAndGet(cf.size()); currentOperations.addAndGet((cf.getColumnCount() == 0) ? cf.isMarkedForDelete() ? 1 : 0 : cf.getColumnCount()); DecoratedKey decoratedKey = partitioner.decorateKey(key); ColumnFamily oldCf = columnFamilies.putIfAbsent(decoratedKey, cf); if (oldCf == null) return; oldCf.resolve(cf); } // for debugging public String contents() { StringBuilder builder = new StringBuilder(); builder.append("{"); for (Map.Entry<DecoratedKey, ColumnFamily> entry : columnFamilies.entrySet()) { builder.append(entry.getKey()).append(": ").append(entry.getValue()).append(", "); } builder.append("}"); return builder.toString(); } private SSTableReader writeSortedContents() { BloomFilterWriter bloomFilterWriter = null; try { logger.info("Writing " + this); SSTableWriter writer = new SSTableWriter(cfs.getFlushPath(), columnFamilies.size(),getCurrentOperations(), StorageService.getPartitioner()); boolean bloomColumns = writer.getBloomFilterWriter().isBloomColumns(); bloomFilterWriter = writer.getBloomFilterWriter(); IRowProcessor rowProc = null; if (cfs.metadata.rowProcessors!=null) { rowProc = new RowProcessorChain().addAll(cfs.metadata.rowProcessors).build(); rowProc.setColumnFamilyStore(cfs); if (!rowProc.shouldProcessIncomplete()) rowProc = null; } DataOutputBuffer buffer = new DataOutputBuffer(); for (Map.Entry<DecoratedKey, ColumnFamily> entry : columnFamilies.entrySet()) { buffer.reset(); DecoratedKey key = entry.getKey(); ColumnFamily cf = entry.getValue(); if (rowProc!=null) { cf=rowProc.process(key, cf, true); if (cf==null) continue; } /* serialize the cf with column indexes */ ColumnFamily.serializer().serializeWithIndexes(cf, buffer, bloomColumns); /* Now write the key and value to disk */ writer.append(key, buffer); if (bloomColumns) bloomFilterWriter.add(key, cf); } SSTableReader ssTable = writer.closeAndOpenReader(); logger.info(String.format("Completed flushing %s (%d bytes)", ssTable.getFilename(), new File(ssTable.getFilename()).length())); return ssTable; } catch (IOException e) { if ( bloomFilterWriter != null ) bloomFilterWriter.getFilter().close(); throw new FSWriteError(e); } } public void flushAndSignal(final Condition condition, ExecutorService sorter, final ExecutorService writer) { cfs.getMemtablesPendingFlush().add(this); // it's ok for the MT to briefly be both active and pendingFlush writer.submit(new WrappedRunnable() { public void runMayThrow() throws IOException { cfs.addSSTable(writeSortedContents()); cfs.getMemtablesPendingFlush().remove(Memtable.this); condition.signalAll(); } }); } public String toString() { return String.format("Memtable-%s@%s(%s bytes, %s operations)", cfs.getColumnFamilyName(), hashCode(), currentThroughput, currentOperations); } public Iterator<DecoratedKey> getKeyIterator(DecoratedKey startWith) { return columnFamilies.navigableKeySet().tailSet(startWith).iterator(); } public boolean isClean() { return columnFamilies.isEmpty(); } public String getTableName() { return cfs.getTable().name; } /** * obtain an iterator of columns in this memtable in the specified order starting from a given column. */ public ColumnIterator getSliceIterator(ColumnFamily cf, SliceQueryFilter filter, AbstractType typeComparator) { final ColumnFamily columnFamily = cf == null ? ColumnFamily.create(getTableName(), filter.getColumnFamilyName()) : cf.cloneMeShallow(); final IColumn columns[] = (cf == null ? columnFamily : cf).getSortedColumns().toArray(new IColumn[columnFamily.getSortedColumns().size()]); // TODO if we are dealing with supercolumns, we need to clone them while we have the read lock since they can be modified later if (filter.reversed) ArrayUtils.reverse(columns); IColumn startIColumn; final boolean isStandard = DatabaseDescriptor.getColumnFamilyType(getTableName(), filter.getColumnFamilyName()).equals("Standard"); if (isStandard) startIColumn = new Column(filter.start); else startIColumn = new SuperColumn(filter.start, null); // ok to not have subcolumnComparator since we won't be adding columns to this object // can't use a ColumnComparatorFactory comparator since those compare on both name and time (and thus will fail to match // our dummy column, since the time there is arbitrary). Comparator<IColumn> comparator = filter.getColumnComparator(typeComparator); int index; if (filter.start.length == 0 && filter.reversed) { /* scan from the largest column in descending order */ index = 0; } else { index = Arrays.binarySearch(columns, startIColumn, comparator); } final int startIndex = index < 0 ? -(index + 1) : index; return new AbstractColumnIterator() { private int curIndex_ = startIndex; public ColumnFamily getColumnFamily() { return columnFamily; } public boolean hasNext() { return curIndex_ < columns.length; } public IColumn next() { // clone supercolumns so caller can freely removeDeleted or otherwise mutate it return isStandard ? columns[curIndex_++] : ((SuperColumn)columns[curIndex_++]).cloneMe(); } }; } public ColumnIterator getNamesIterator(final ColumnFamily cf, final NamesQueryFilter filter) { final ColumnFamily columnFamily = cf == null ? ColumnFamily.create(getTableName(), filter.getColumnFamilyName()) : cf.cloneMeShallow(); final boolean isStandard = DatabaseDescriptor.getColumnFamilyType(getTableName(), filter.getColumnFamilyName()).equals("Standard"); return new SimpleAbstractColumnIterator() { private Iterator<byte[]> iter = filter.columns.iterator(); private byte[] current; public ColumnFamily getColumnFamily() { return columnFamily; } protected IColumn computeNext() { if (cf == null) { return endOfData(); } while (iter.hasNext()) { current = iter.next(); IColumn column = cf.getColumn(current); if (column != null) // clone supercolumns so caller can freely removeDeleted or otherwise mutate it return isStandard ? column : ((SuperColumn)column).cloneMe(); } return endOfData(); } }; } public ColumnFamily getColumnFamily(String key) { return columnFamilies.get(partitioner.decorateKey(key)); } void clearUnsafe() { columnFamilies.clear(); } public boolean isExpired() { return System.currentTimeMillis() > creationTime + DatabaseDescriptor.getMemtableLifetimeMS(); } public Iterator<Map.Entry<DecoratedKey, ColumnFamily>> getEntryIterator() { return columnFamilies.entrySet().iterator(); } }