/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.cassandra.io.sstable;
import java.io.*;
import java.lang.ref.Reference;
import java.lang.ref.ReferenceQueue;
import java.nio.ByteBuffer;
import java.util.*;
import com.google.common.base.Function;
import com.google.common.collect.Collections2;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.cassandra.cache.InstrumentedCache;
import org.apache.cassandra.config.CFMetaData;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.filter.QueryFilter;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.dht.AbstractBounds;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.dht.Range;
import org.apache.cassandra.io.IColumnSerializer;
import org.apache.cassandra.io.util.BufferedRandomAccessFile;
import org.apache.cassandra.io.util.FileDataInput;
import org.apache.cassandra.io.util.FileUtils;
import org.apache.cassandra.io.util.SegmentedFile;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.utils.*;
/**
* SSTableReaders are open()ed by Table.onStart; after that they are created by SSTableWriter.renameAndOpen.
* Do not re-call open() on existing SSTable files; use the references kept by ColumnFamilyStore post-start instead.
*/
public class SSTableReader extends SSTable implements Comparable<SSTableReader>
{
// class-wide SLF4J logger
private static final Logger logger = LoggerFactory.getLogger(SSTableReader.class);
// guesstimated size of INDEX_INTERVAL index entries (16 bytes per entry times the configured index interval);
// used as the buffer size when getPosition() iterates over index-file segments
private static final int INDEX_FILE_BUFFER_BYTES = 16 * DatabaseDescriptor.getIndexInterval();
// `finalizers` is required to keep the PhantomReferences alive after the enclosing SSTR is itself
// unreferenced. otherwise they will never get enqueued.
// The set is added to by setTrackedBy() on whichever thread opens or tracks an sstable, and removed from
// by the SSTABLE-DELETER thread, so it must tolerate concurrent mutation: a bare HashSet does not.
private static final Set<Reference<SSTableReader>> finalizers = Collections.synchronizedSet(new HashSet<Reference<SSTableReader>>());
// Reference queue handed to every SSTableDeletingReference created in setTrackedBy().
// The anonymous subclass exists only for its instance initializer, which starts the
// "SSTABLE-DELETER" thread as a side effect of creating the queue.
private static final ReferenceQueue<SSTableReader> finalizerQueue = new ReferenceQueue<SSTableReader>()
{{
Runnable runnable = new Runnable()
{
public void run()
{
// loop forever: wait for the next enqueued reference, then run its cleanup
while (true)
{
SSTableDeletingReference r;
try
{
// remove() blocks until a reference has been enqueued
r = (SSTableDeletingReference) finalizerQueue.remove();
// the reference no longer needs to be kept alive once it has been dequeued
finalizers.remove(r);
}
catch (InterruptedException e)
{
// an interrupt propagates out of run() as an unchecked exception, ending this thread
throw new RuntimeException(e);
}
try
{
r.cleanup();
}
catch (IOException e)
{
// a failed cleanup is logged and the loop continues with the next reference
logger.error("Error deleting " + r.desc, e);
}
}
}
};
new Thread(runnable, "SSTABLE-DELETER").start();
}};
/**
* maxDataAge is a timestamp in local server time (e.g. System.currentTimeMillis) which represents an upper bound
* on the newest piece of data stored in the sstable. In other words, this sstable does not contain items created
* later than maxDataAge.
*
* The field is not serialized to disk, so relying on it for more than what truncate does is not advised.
*
* When a new sstable is flushed, maxDataAge is set to the time of creation.
* When an sstable is created from compaction, maxDataAge is set to the max of all merged tables.
*
* The age is in milliseconds since the epoch and is local to this host.
*/
public final long maxDataAge;
// indexfile and datafile: might be null before a call to load()
private SegmentedFile ifile;
private SegmentedFile dfile;
// sampled index entries; either supplied to the constructor or built by load()
private IndexSummary indexSummary;
// bloom filter consulted by getPosition() for EQ lookups; supplied to the constructor,
// rebuilt by load(), or read from disk by loadBloomFilter()
private Filter bf;
// (descriptor, key) -> data position cache taken from the tracker in setTrackedBy(); null if never tracked
private InstrumentedCache<Pair<Descriptor,DecoratedKey>, Long> keyCache;
// counts bloom filter true/false positives observed by getPosition()
private BloomFilterTracker bloomFilterTracker = new BloomFilterTracker();
// created in setTrackedBy(); null if this reader was never given a tracker
private volatile SSTableDeletingReference phantomReference;
/**
 * Estimates the total number of keys held by the given sstables from their index samples.
 * Each sstable contributes (number of key samples + 1) * index interval to the total.
 *
 * @param sstables the sstables to estimate over
 * @return the summed estimate; 0 when no sstables are given
 */
public static long getApproximateKeyCount(Iterable<SSTableReader> sstables)
{
    long count = 0;
    for (SSTableReader sstable : sstables)
    {
        int indexKeyCount = sstable.getKeySamples().size();
        // widen to long before multiplying: an int * int product would silently wrap for very large sstables
        count = count + (indexKeyCount + 1L) * DatabaseDescriptor.getIndexInterval();
        if (logger.isDebugEnabled())
            logger.debug("index size for bloom filter calc for file : " + sstable.getFilename() + " : " + count);
    }
    return count;
}
/**
 * Opens the sstable identified by the descriptor, using the components found by
 * SSTable.componentsFor, the metadata registered for the descriptor's keyspace and
 * column family, and the StorageService partitioner.
 *
 * @param desc descriptor of the sstable to open
 * @return the opened reader
 * @throws IOException propagated from component discovery or from the delegated open
 */
public static SSTableReader open(Descriptor desc) throws IOException
{
    Set<Component> discovered = SSTable.componentsFor(desc);
    CFMetaData cfMetadata = DatabaseDescriptor.getCFMetaData(desc.ksname, desc.cfname);
    IPartitioner partitioner = StorageService.getPartitioner();
    return open(desc, discovered, cfMetadata, partitioner);
}
/**
 * Opens an sstable with no saved keys to preload and no tracker.
 *
 * @param descriptor descriptor of the sstable to open
 * @param components components belonging to the sstable
 * @param metadata column family metadata
 * @param partitioner partitioner used to decode keys
 * @return the opened reader
 * @throws IOException propagated from the delegated open
 */
public static SSTableReader open(Descriptor descriptor, Set<Component> components, CFMetaData metadata, IPartitioner partitioner) throws IOException
{
    Set<DecoratedKey> noSavedKeys = Collections.<DecoratedKey>emptySet();
    return open(descriptor, components, noSavedKeys, null, metadata, partitioner);
}
/**
 * Opens an sstable: reads its statistics component if one exists (falling back to default
 * histograms otherwise), constructs the reader, registers the tracker, and loads the index
 * and bloom filter.
 *
 * @param descriptor descriptor of the sstable to open
 * @param components components belonging to the sstable
 * @param savedKeys keys whose positions should be loaded into the key cache during the index scan
 * @param tracker tracker supplying the key cache; may be null
 * @param metadata column family metadata
 * @param partitioner partitioner used to decode keys; must not be null
 * @return the opened reader
 * @throws IOException if the statistics, index or filter components cannot be read
 */
public static SSTableReader open(Descriptor descriptor, Set<Component> components, Set<DecoratedKey> savedKeys, SSTableTracker tracker, CFMetaData metadata, IPartitioner partitioner) throws IOException
{
    assert partitioner != null;

    long openStarted = System.currentTimeMillis();
    logger.info("Opening " + descriptor);

    // histograms come from the stats component when present, defaults otherwise
    EstimatedHistogram rowSizes;
    EstimatedHistogram columnCounts;
    File stats = new File(descriptor.filenameFor(SSTable.COMPONENT_STATS));
    if (!stats.exists())
    {
        logger.debug("No statistics for {}", descriptor);
        rowSizes = SSTable.defaultRowHistogram();
        columnCounts = SSTable.defaultColumnHistogram();
    }
    else
    {
        DataInputStream statsIn = null;
        try
        {
            logger.debug("Load statistics for {}", descriptor);
            statsIn = new DataInputStream(new BufferedInputStream(new FileInputStream(stats)));
            rowSizes = EstimatedHistogram.serializer.deserialize(statsIn);
            columnCounts = EstimatedHistogram.serializer.deserialize(statsIn);
        }
        finally
        {
            FileUtils.closeQuietly(statsIn);
        }
    }

    SSTableReader reader = new SSTableReader(descriptor, components, metadata, partitioner, null, null, null, null, System.currentTimeMillis(), rowSizes, columnCounts);
    reader.setTrackedBy(tracker);

    // versions before 'c' encoded keys as utf-16 before hashing to the filter:
    // for those the filter is rebuilt from the index instead of being read from disk
    boolean rebuildFilter = descriptor.hasStringsInBloomFilter;
    reader.load(rebuildFilter, savedKeys);
    if (!rebuildFilter)
        reader.loadBloomFilter();

    if (logger.isDebugEnabled())
    {
        logger.debug("INDEX LOAD TIME for " + descriptor + ": " + (System.currentTimeMillis() - openStarted) + " ms.");
        if (reader.getKeyCache() != null)
            logger.debug(String.format("key cache contains %s/%s keys", reader.getKeyCache().getSize(), reader.getKeyCache().getCapacity()));
    }
    return reader;
}
/**
 * Open a RowIndexedReader which already has its state initialized (by SSTableWriter).
 * Every piece of state passed in, apart from the metadata, histograms and component set,
 * is asserted to be non-null.
 */
static SSTableReader internalOpen(Descriptor desc,
                                  Set<Component> components,
                                  CFMetaData metadata,
                                  IPartitioner partitioner,
                                  SegmentedFile ifile,
                                  SegmentedFile dfile,
                                  IndexSummary isummary,
                                  Filter bf,
                                  long maxDataAge,
                                  EstimatedHistogram rowsize,
                                  EstimatedHistogram columncount) throws IOException
{
    assert desc != null;
    assert partitioner != null;
    assert ifile != null;
    assert dfile != null;
    assert isummary != null;
    assert bf != null;
    return new SSTableReader(desc, components, metadata, partitioner, ifile, dfile, isummary, bf, maxDataAge, rowsize, columncount);
}
/**
 * Creates a reader from the given state. The index file, data file, index summary and
 * bloom filter may be null here; open() passes nulls and fills them in afterwards.
 */
private SSTableReader(Descriptor desc,
                      Set<Component> components,
                      CFMetaData metadata,
                      IPartitioner partitioner,
                      SegmentedFile ifile,
                      SegmentedFile dfile,
                      IndexSummary indexSummary,
                      Filter bloomFilter,
                      long maxDataAge,
                      EstimatedHistogram rowSizes,
                      EstimatedHistogram columnCounts)
throws IOException
{
    super(desc, components, metadata, partitioner, rowSizes, columnCounts);
    // on-disk segments
    this.ifile = ifile;
    this.dfile = dfile;
    // in-memory lookup structures
    this.indexSummary = indexSummary;
    this.bf = bloomFilter;
    this.maxDataAge = maxDataAge;
}
/**
 * Associates this reader with a tracker: creates the deleting reference registered on the
 * finalizer queue, keeps that reference alive in `finalizers`, and adopts the tracker's key cache.
 * Does nothing when the tracker is null.
 *
 * @param tracker the tracker to register with; may be null
 */
public void setTrackedBy(SSTableTracker tracker)
{
    if (tracker == null)
        return;

    SSTableDeletingReference deletingRef = new SSTableDeletingReference(tracker, this, finalizerQueue);
    phantomReference = deletingRef;
    finalizers.add(deletingRef);
    keyCache = tracker.getKeyCache();
}
/**
 * Reads the bloom filter from the FILTER component, using the legacy serializer when the
 * descriptor says the sstable uses the old bloom filter format.
 *
 * @throws IOException if the filter component cannot be opened or deserialized
 */
void loadBloomFilter() throws IOException
{
    DataInputStream filterIn = null;
    try
    {
        String filterPath = descriptor.filenameFor(Component.FILTER);
        filterIn = new DataInputStream(new BufferedInputStream(new FileInputStream(filterPath)));
        if (!descriptor.usesOldBloomFilter)
            bf = BloomFilter.serializer().deserialize(filterIn);
        else
            bf = LegacyBloomFilter.serializer().deserialize(filterIn);
    }
    finally
    {
        FileUtils.closeQuietly(filterIn);
    }
}
/**
* Loads ifile, dfile and indexSummary, and optionally recreates the bloom filter.
*
* Makes one sequential pass over the primary index file, reading a (key, data position) pair per entry.
* Along the way it samples entries into the index summary, records potential segment boundaries for
* both the index and data files, and, when requested, populates the key cache and rebuilds the filter.
*
* @param recreatebloom if true, a LegacyBloomFilter is built from every key in the index
* @param keysToLoadInCache keys whose data positions are put into the key cache when they are encountered;
*                          has no effect when no key cache is set
* @throws IOException if the index file cannot be read
*/
private void load(boolean recreatebloom, Set<DecoratedKey> keysToLoadInCache) throws IOException
{
// only pre-populate the cache when a cache exists and there is something to load
boolean cacheLoading = keyCache != null && !keysToLoadInCache.isEmpty();
SegmentedFile.Builder ibuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getIndexAccessMode());
SegmentedFile.Builder dbuilder = SegmentedFile.getBuilder(DatabaseDescriptor.getDiskAccessMode());
// we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary.
BufferedRandomAccessFile input = new BufferedRandomAccessFile(new File(descriptor.filenameFor(Component.PRIMARY_INDEX)),
"r",
BufferedRandomAccessFile.DEFAULT_BUFFER_SIZE,
true);
try
{
// grow the key cache if its free space is smaller than the number of keys to be loaded
if (keyCache != null && keyCache.getCapacity() - keyCache.getSize() < keysToLoadInCache.size())
keyCache.updateCapacity(keyCache.getSize() + keysToLoadInCache.size());
long indexSize = input.length();
long estimatedKeys = SSTable.estimateRowsFromIndex(input);
indexSummary = new IndexSummary(estimatedKeys);
if (recreatebloom)
// estimate key count based on index length
bf = LegacyBloomFilter.getFilter(estimatedKeys, 15);
while (true)
{
// position of the entry about to be read; reaching the file length means the whole index was consumed
long indexPosition = input.getFilePointer();
if (indexPosition == indexSize)
break;
boolean shouldAddEntry = indexSummary.shouldAddEntry();
// the key bytes are only read when something below needs them; otherwise they are skipped
// (the null check below relies on the skip path yielding null)
ByteBuffer key = (shouldAddEntry || cacheLoading || recreatebloom)
? ByteBufferUtil.readWithShortLength(input)
: ByteBufferUtil.skipShortLength(input);
long dataPosition = input.readLong();
if (key != null)
{
DecoratedKey decoratedKey = decodeKey(partitioner, descriptor, key);
if (recreatebloom)
bf.add(decoratedKey.key);
if (shouldAddEntry)
indexSummary.addEntry(decoratedKey, indexPosition);
if (cacheLoading && keysToLoadInCache.contains(decoratedKey))
cacheKey(decoratedKey, dataPosition);
}
// per-entry bookkeeping happens for every entry, whether or not its key was read
indexSummary.incrementRowid();
ibuilder.addPotentialBoundary(indexPosition);
dbuilder.addPotentialBoundary(dataPosition);
}
indexSummary.complete();
}
finally
{
FileUtils.closeQuietly(input);
}
// finalize the state of the reader
ifile = ibuilder.complete(descriptor.filenameFor(Component.PRIMARY_INDEX));
dfile = dbuilder.complete(descriptor.filenameFor(Component.DATA));
}
/**
 * Finds the sampled index entry from which to start scanning the index file for the given key
 * (at most indexInterval keys away).
 *
 * @return the sample equal to the key, or else the last sample before it; null when the key
 *         sorts before the first sample
 */
private IndexSummary.KeyPosition getIndexScanPosition(DecoratedKey decoratedKey)
{
    assert indexSummary.getIndexPositions() != null && indexSummary.getIndexPositions().size() > 0;

    List<IndexSummary.KeyPosition> samples = indexSummary.getIndexPositions();
    int found = Collections.binarySearch(samples, new IndexSummary.KeyPosition(decoratedKey, -1));
    if (found >= 0)
        return samples.get(found);

    // a negative result encodes the insertion point, i.e. the index of the first sample
    // greater than the key, as -(insertionPoint) - 1
    int insertionPoint = -(found + 1);
    return insertionPoint == 0 ? null : samples.get(insertionPoint - 1);
}
/**
 * For testing purposes only: swaps in the filter returned by
 * LegacyBloomFilter.alwaysMatchingBloomFilter().
 */
public void forceFilterFailures()
{
    this.bf = LegacyBloomFilter.alwaysMatchingBloomFilter();
}
/**
 * @return the bloom filter currently held by this reader
 */
public Filter getBloomFilter()
{
    return this.bf;
}
/**
 * @return The key cache, for monitoring purposes; null if this reader was never given a tracker.
 */
public InstrumentedCache getKeyCache()
{
    return this.keyCache;
}
/**
 * @return An estimate of the number of keys in this SSTable: the number of index samples
 *         multiplied by the index interval.
 */
public long estimatedKeys()
{
    // widen to long before multiplying so the product cannot wrap as an int
    return (long) indexSummary.getIndexPositions().size() * DatabaseDescriptor.getIndexInterval();
}
/**
 * @return Approximately 1/INDEX_INTERVALth of the keys in this SSTable, as a transformed
 *         view over the index summary's sampled positions.
 */
public Collection<DecoratedKey> getKeySamples()
{
    Function<IndexSummary.KeyPosition, DecoratedKey> keyOfSample = new Function<IndexSummary.KeyPosition, DecoratedKey>()
    {
        public DecoratedKey apply(IndexSummary.KeyPosition sample)
        {
            return sample.key;
        }
    };
    return Collections2.transform(indexSummary.getIndexPositions(), keyOfSample);
}
/**
 * Determine the minimal set of sections that can be extracted from this SSTable to cover the given ranges.
 * @return A sorted list of (offset,end) pairs that cover the given ranges in the datafile for this SSTable.
 */
public List<Pair<Long,Long>> getPositionsForRanges(Collection<Range> ranges)
{
    List<Pair<Long,Long>> sections = new ArrayList<Pair<Long,Long>>();
    // one candidate section per normalized range, located through the index
    for (AbstractBounds bounds : AbstractBounds.normalize(ranges))
    {
        long start = getPosition(new DecoratedKey(bounds.left, null), Operator.GT);
        if (start == -1)
            continue; // the range begins past the end of the file

        long end = getPosition(new DecoratedKey(bounds.right, null), Operator.GT);
        // a right bound past the end of the file, or a wrapping range, extends to the end of the data file
        boolean runsToEndOfFile = end == -1 || Range.isWrapAround(bounds.left, bounds.right);
        if (runsToEndOfFile)
            end = length();

        // skip empty sections
        if (start != end)
            sections.add(new Pair<Long,Long>(Long.valueOf(start), Long.valueOf(end)));
    }
    return sections;
}
/**
 * Stores the data position for the given key in the key cache, under a copy of the key.
 *
 * @param key the key to cache; its key bytes must not be null
 * @param info the data file position to associate with the key
 */
public void cacheKey(DecoratedKey key, Long info)
{
    assert key.key != null;
    // cache a clone of the key bytes so that the cache does not keep
    // a permanent reference to the original key buffer
    DecoratedKey detachedKey = new DecoratedKey(key.token, ByteBufferUtil.clone(key.key));
    Pair<Descriptor, DecoratedKey> cacheEntryKey = new Pair<Descriptor, DecoratedKey>(descriptor, detachedKey);
    keyCache.put(cacheEntryKey, info);
}
/**
 * Looks up the cached data position for the given key in this sstable.
 *
 * @param key the key to look up
 * @return the cached position, or null if there is no usable cache or no entry
 */
public Long getCachedPosition(DecoratedKey key)
{
    Pair<Descriptor, DecoratedKey> unifiedKey = new Pair<Descriptor, DecoratedKey>(descriptor, key);
    return getCachedPosition(unifiedKey);
}
/**
 * Looks up a (descriptor, key) pair in the key cache.
 *
 * @return the cached position, or null when there is no cache, the cache has no capacity,
 *         or the cache returns no entry
 */
private Long getCachedPosition(Pair<Descriptor, DecoratedKey> unifiedKey)
{
    boolean cacheUnusable = keyCache == null || keyCache.getCapacity() <= 0;
    if (cacheUnusable)
        return null;
    return keyCache.get(unifiedKey);
}
/**
 * Finds the data file position of the row selected by the given key and operator.
 *
 * The lookup proceeds in stages: the bloom filter (EQ only), the key cache (EQ and GE only),
 * the sampled index summary, and finally a scan of the on-disk index starting at the nearest sample.
 * For EQ lookups that get past the bloom filter, the outcome is recorded as a bloom filter
 * true or false positive, except when the key cache answers the lookup.
 *
 * @param decoratedKey The key to apply as the rhs to the given Operator.
 * @param op The Operator defining matching keys: the nearest key to the target matching the operator wins.
 * @return The position in the data file to find the key, or -1 if the key is not present
 * @throws IOError if reading the index file fails
 */
public long getPosition(DecoratedKey decoratedKey, Operator op)
{
    // first, check bloom filter
    if (op == Operator.EQ)
    {
        assert decoratedKey.key != null; // null is ok for GE scans
        if (!bf.isPresent(decoratedKey.key))
            return -1;
    }

    // next, the key cache. A cached entry is the position of the key itself, which is a correct
    // answer for EQ and GE but not for GT (which must land on the following key), so the cache
    // is bypassed for any other operator.
    if (op == Operator.EQ || op == Operator.GE)
    {
        Long cachedPosition = getCachedPosition(new Pair<Descriptor, DecoratedKey>(descriptor, decoratedKey));
        if (cachedPosition != null)
            return cachedPosition;
    }

    // next, see if the sampled index says it's impossible for the key to be present
    IndexSummary.KeyPosition sampledPosition = getIndexScanPosition(decoratedKey);
    if (sampledPosition == null)
    {
        if (op == Operator.EQ)
            bloomFilterTracker.addFalsePositive();
        // we matched the -1th position: if the operator might match forward, return the 0th position
        return op.apply(1) >= 0 ? 0 : -1;
    }

    // scan the on-disk index, starting at the nearest sampled position
    Iterator<FileDataInput> segments = ifile.iterator(sampledPosition.indexPosition, INDEX_FILE_BUFFER_BYTES);
    while (segments.hasNext())
    {
        FileDataInput input = segments.next();
        try
        {
            while (!input.isEOF())
            {
                // read key & data position from index entry
                DecoratedKey indexDecoratedKey = decodeKey(partitioner, descriptor, ByteBufferUtil.readWithShortLength(input));
                long dataPosition = input.readLong();

                int comparison = indexDecoratedKey.compareTo(decoratedKey);
                int v = op.apply(comparison);
                if (v == 0)
                {
                    if (comparison == 0)
                    {
                        // an exact hit confirms the bloom filter's answer, whether or not a key cache exists
                        if (op == Operator.EQ)
                            bloomFilterTracker.addTruePositive();
                        // store exact match for the key
                        if (decoratedKey.key != null && keyCache != null && keyCache.getCapacity() > 0)
                            cacheKey(decoratedKey, dataPosition);
                    }
                    return dataPosition;
                }
                if (v < 0)
                {
                    // the operator can no longer match further along the index
                    if (op == Operator.EQ)
                        bloomFilterTracker.addFalsePositive();
                    return -1;
                }
            }
        }
        catch (IOException e)
        {
            throw new IOError(e);
        }
        finally
        {
            FileUtils.closeQuietly(input);
        }
    }

    // ran off the end of the index without a match
    if (op == Operator.EQ)
        bloomFilterTracker.addFalsePositive();
    return -1;
}
/**
 * @return The length in bytes of the data file for this SSTable, as recorded by its SegmentedFile.
 */
public long length()
{
    return this.dfile.length;
}
/**
 * Marks this sstable as compacted: creates the COMPACTED_MARKER component file and then
 * calls deleteOnCleanup() on this reader's deleting reference.
 *
 * @throws IOError if the marker file cannot be created (including when it already exists)
 */
public void markCompacted()
{
    if (logger.isDebugEnabled())
        logger.debug("Marking " + getFilename() + " compacted");

    File marker = new File(descriptor.filenameFor(Component.COMPACTED_MARKER));
    try
    {
        boolean created = marker.createNewFile();
        if (!created)
            throw new IOException("Unable to create compaction marker");
    }
    catch (IOException e)
    {
        throw new IOError(e);
    }

    phantomReference.deleteOnCleanup();
}
/**
 * Creates a scanner over this sstable that applies the given filter.
 *
 * @param bufferSize Buffer size in bytes for this Scanner.
 * @param filter filter to use when reading the columns
 * @return A Scanner for seeking over the rows of the SSTable.
 */
public SSTableScanner getScanner(int bufferSize, QueryFilter filter)
{
    SSTableScanner scanner = new SSTableScanner(this, filter, bufferSize);
    return scanner;
}
/**
 * Direct I/O SSTableScanner: constructs the scanner with its boolean flag set to true.
 *
 * @param bufferSize Buffer size in bytes for this Scanner.
 * @return A Scanner for seeking over the rows of the SSTable.
 */
public SSTableScanner getDirectScanner(int bufferSize)
{
    SSTableScanner directScanner = new SSTableScanner(this, bufferSize, true);
    return directScanner;
}
/**
 * Returns a data-file segment positioned at the row for the given key.
 *
 * @param decoratedKey the key to look up with an exact (EQ) match
 * @param bufferSize buffer size passed through to the data file segment
 * @return the segment starting at the key's position, or null when the key is not found
 */
public FileDataInput getFileDataInput(DecoratedKey decoratedKey, int bufferSize)
{
    long position = getPosition(decoratedKey, Operator.EQ);
    return position < 0 ? null : dfile.getSegment(position, bufferSize);
}
/**
 * Orders readers by the generation number of their descriptors.
 *
 * @param o the reader to compare against
 * @return a negative value, zero or a positive value when this reader's generation is
 *         lower than, equal to or higher than the other's
 */
public int compareTo(SSTableReader o)
{
    // compare explicitly rather than subtracting, which can overflow and flip the sign
    int mine = descriptor.generation;
    int theirs = o.descriptor.generation;
    return mine < theirs ? -1 : (mine == theirs ? 0 : 1);
}
/**
 * @return the comparator held by this sstable's column family metadata
 */
public AbstractType getColumnComparator()
{
    AbstractType comparator = metadata.comparator;
    return comparator;
}
/**
 * @return a ColumnFamily created from this sstable's metadata
 */
public ColumnFamily createColumnFamily()
{
    ColumnFamily created = ColumnFamily.create(metadata);
    return created;
}
/**
 * @return the Column serializer for Standard column families, otherwise a SuperColumn
 *         serializer built with the metadata's subcolumn comparator
 */
public IColumnSerializer getColumnSerializer()
{
    if (metadata.cfType == ColumnFamilyType.Standard)
        return Column.serializer();
    return SuperColumn.serializer(metadata.subcolumnComparator);
}
/**
 * Tests if the sstable contains data newer than the given age param (in local currentTimeMillis time).
 * This works in conjunction with maxDataAge, which is an upper bound on the creation time of data in this sstable.
 * @param age The age to compare against the maxDataAge of this sstable, in milliseconds since the epoch on this host
 * @return True iff this sstable's maxDataAge is strictly greater than the given age parameter.
 */
public boolean newSince(long age)
{
    return age < maxDataAge;
}
/**
 * Reads a row size from the input, as an int or a long depending on the sstable version.
 *
 * @param in the input to read from
 * @param d descriptor whose hasIntRowSize flag selects the width
 * @return the row size, widened to long when it was stored as an int
 * @throws IOException if the read fails
 */
public static long readRowSize(DataInput in, Descriptor d) throws IOException
{
    return d.hasIntRowSize ? in.readInt() : in.readLong();
}
/**
 * Hard-links every component file of this sstable into the given directory, keeping file names.
 *
 * @param snapshotDirectoryPath directory in which the links are created
 * @throws IOException if creating a link fails
 */
public void createLinks(String snapshotDirectoryPath) throws IOException
{
    for (Component part : components)
    {
        File source = new File(descriptor.filenameFor(part));
        File link = new File(snapshotDirectoryPath, source.getName());
        CLibrary.createHardLink(source, link);
    }
}
/**
 * Turns raw key bytes read from disk into a DecoratedKey. Sstables whose descriptor reports
 * encoded keys go through the deprecated 'IPartitioner.convertFromDiskFormat' method; all
 * others are decorated directly.
 */
public static DecoratedKey decodeKey(IPartitioner p, Descriptor d, ByteBuffer bytes)
{
    return d.hasEncodedKeys ? p.convertFromDiskFormat(bytes) : p.decorateKey(bytes);
}
/**
 * Maps the result of comparing an index key against a search key onto a scan decision:
 * match here, keep scanning forward, or give up.
 *
 * TODO: Move someplace reusable
 */
public abstract static class Operator
{
    public static final Operator EQ = new Equals();
    public static final Operator GE = new GreaterThanOrEqualTo();
    public static final Operator GT = new GreaterThan();

    /**
     * @param comparison The result of a call to compare/compareTo, with the desired field on the rhs.
     * @return less than 0 if the operator cannot match forward, 0 if it matches, greater than 0 if it might match forward.
     */
    public abstract int apply(int comparison);

    /**
     * Negates a comparison result without overflowing: -Integer.MIN_VALUE wraps back to
     * Integer.MIN_VALUE, which would turn "might match forward" into "cannot match forward".
     */
    private static int negate(int comparison)
    {
        return comparison == Integer.MIN_VALUE ? Integer.MAX_VALUE : -comparison;
    }

    /** Matches only an equal key; a smaller key means keep scanning, a larger key means stop. */
    final static class Equals extends Operator
    {
        public int apply(int comparison) { return negate(comparison); }
    }

    /** Matches the first key that is equal or greater; a smaller key means keep scanning. */
    final static class GreaterThanOrEqualTo extends Operator
    {
        public int apply(int comparison) { return comparison >= 0 ? 0 : negate(comparison); }
    }

    /** Matches the first strictly greater key; anything else means keep scanning. */
    final static class GreaterThan extends Operator
    {
        public int apply(int comparison) { return comparison > 0 ? 0 : 1; }
    }
}
/**
 * @return the false positive count reported by this reader's bloom filter tracker
 */
public long getBloomFilterFalsePositiveCount()
{
    long falsePositives = bloomFilterTracker.getFalsePositiveCount();
    return falsePositives;
}
/**
 * @return the value of the bloom filter tracker's getRecentFalsePositiveCount()
 */
public long getRecentBloomFilterFalsePositiveCount()
{
    long recentFalsePositives = bloomFilterTracker.getRecentFalsePositiveCount();
    return recentFalsePositives;
}
/**
 * @return the true positive count reported by this reader's bloom filter tracker
 */
public long getBloomFilterTruePositiveCount()
{
    long truePositives = bloomFilterTracker.getTruePositiveCount();
    return truePositives;
}
/**
 * @return the value of the bloom filter tracker's getRecentTruePositiveCount()
 */
public long getRecentBloomFilterTruePositiveCount()
{
    long recentTruePositives = bloomFilterTracker.getRecentTruePositiveCount();
    return recentTruePositives;
}
}