/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.cassandra.io;
import java.io.*;
import java.util.*;
import java.lang.ref.ReferenceQueue;
import java.lang.ref.Reference;
import java.nio.channels.FileChannel;
import java.nio.MappedByteBuffer;
import org.apache.log4j.Logger;
import org.apache.cassandra.cache.InstrumentedCache;
import org.apache.cassandra.dht.IPartitioner;
import org.apache.cassandra.utils.BloomFilter;
import org.apache.cassandra.utils.FBUtilities;
import org.apache.cassandra.utils.Pair;
import org.apache.cassandra.service.StorageService;
import org.apache.cassandra.config.DatabaseDescriptor;
import org.apache.cassandra.db.*;
import org.apache.cassandra.db.marshal.AbstractType;
import org.apache.cassandra.io.util.BufferedRandomAccessFile;
import org.apache.cassandra.io.util.FileDataInput;
import org.apache.cassandra.io.util.MappedFileDataInput;
/**
* SSTableReaders are open()ed by Table.onStart; after that they are created by SSTableWriter.renameAndOpen.
* Do not re-call open() on existing SSTable files; use the references kept by ColumnFamilyStore post-start instead.
*/
public class SSTableReader extends SSTable implements Comparable<SSTableReader>
{
private static final Logger logger = Logger.getLogger(SSTableReader.class);
// `finalizers` is required to keep the PhantomReferences alive after the enclosing SSTR is itself
// unreferenced. otherwise they will never get enqueued.
// The set is added to by setTrackedBy (on whichever thread registers the sstable) and removed from by
// the SSTABLE-DELETER thread, so it is wrapped in a synchronized set: HashSet itself is not safe
// for concurrent add/remove.
private static final Set<Reference<SSTableReader>> finalizers = Collections.synchronizedSet(new HashSet<Reference<SSTableReader>>());
// Queue that receives the SSTableDeletingReferences registered in setTrackedBy.
// The anonymous subclass exists only for its instance initializer, which starts a thread named
// "SSTABLE-DELETER" as soon as the queue object is constructed. That thread loops forever:
// it blocks in remove() until a reference is enqueued, drops it from `finalizers`, and then
// calls cleanup() on it.
private static final ReferenceQueue<SSTableReader> finalizerQueue = new ReferenceQueue<SSTableReader>()
{{
Runnable runnable = new Runnable()
{
public void run()
{
while (true)
{
SSTableDeletingReference r = null;
try
{
// blocks until a reference becomes available on the queue
r = (SSTableDeletingReference) finalizerQueue.remove();
// the reference no longer needs to be kept alive once it has been dequeued
finalizers.remove(r);
}
catch (InterruptedException e)
{
// rethrown unchecked, which ends this loop (and therefore the deleter thread)
throw new RuntimeException(e);
}
try
{
r.cleanup();
}
catch (IOException e)
{
// a failed cleanup is logged and the loop continues with the next reference
logger.error("Error deleting " + r.path, e);
}
}
}
};
new Thread(runnable, "SSTABLE-DELETER").start();
}};
// in a perfect world, BUFFER_SIZE would be final, but we need to test with a smaller size to stay sane.
// Size in bytes of one mmap segment: the constructor maps index/data files in chunks of at most
// BUFFER_SIZE bytes, and bufferIndex() divides a file position by it to pick the segment.
static long BUFFER_SIZE = Integer.MAX_VALUE;
/**
 * @return {@code IndexSummary.INDEX_INTERVAL}, the interval used when sampling the on-disk index
 */
public static int indexInterval()
{
    final int interval = IndexSummary.INDEX_INTERVAL;
    return interval;
}
/**
 * Estimates the total number of keys across the given sstables from the size of each
 * sstable's in-memory index sample: every sstable contributes
 * {@code (sampleCount + 1) * IndexSummary.INDEX_INTERVAL}.
 *
 * @param sstables the sstables to include in the estimate
 * @return the summed estimate; 0 if {@code sstables} is empty
 */
public static long getApproximateKeyCount(Iterable<SSTableReader> sstables)
{
    long count = 0;
    for (SSTableReader sstable : sstables)
    {
        int indexKeyCount = sstable.getIndexPositions().size();
        // widen to long *before* multiplying: in int arithmetic the product can exceed
        // Integer.MAX_VALUE for a very large sstable and silently wrap negative.
        count += (indexKeyCount + 1L) * IndexSummary.INDEX_INTERVAL;
        if (logger.isDebugEnabled())
            logger.debug("index size for bloom filter calc for file : " + sstable.getFilename() + " : " + count);
    }
    return count;
}
/**
 * Opens the sstable whose data file is {@code dataFileName}, using the partitioner
 * returned by {@code StorageService.getPartitioner()}.
 */
public static SSTableReader open(String dataFileName) throws IOException
{
    IPartitioner defaultPartitioner = StorageService.getPartitioner();
    return open(dataFileName, defaultPartitioner);
}
/**
 * public, but only for tests.
 * Creates a reader for {@code dataFileName}, then loads its index sample and its bloom filter.
 *
 * @param dataFileName path of the sstable data file
 * @param partitioner  partitioner to decode keys with; must not be null
 * @return the fully loaded reader
 */
public static SSTableReader open(String dataFileName, IPartitioner partitioner) throws IOException
{
    assert partitioner != null;
    final long startedAt = System.currentTimeMillis();
    SSTableReader reader = new SSTableReader(dataFileName, partitioner);
    logger.info("Sampling index for " + dataFileName);
    reader.loadIndexFile();
    reader.loadBloomFilter();
    if (logger.isDebugEnabled())
    {
        long elapsedMillis = System.currentTimeMillis() - startedAt;
        logger.debug("INDEX LOAD TIME for " + dataFileName + ": " + elapsedMillis + " ms.");
    }
    return reader;
}
// reference enqueued on finalizerQueue for this reader; assigned only by setTrackedBy, so it is
// null until that method has been called
private volatile SSTableDeletingReference phantomReference;
// jvm can only map up to 2GB at a time, so we split index/data into segments of that size when using mmap i/o
// segments of the index file; null when the index access mode is `standard`
private final MappedByteBuffer[] indexBuffers;
// segments of the data file; null when the disk access mode is `standard`
private final MappedByteBuffer[] buffers;
// cache from (data file path, key) to row position/size; assigned only by setTrackedBy, so callers
// must tolerate null
private InstrumentedCache<Pair<String, DecoratedKey>, PositionSize> keyCache;
// tallies bloom filter true/false positives recorded by getPosition
private BloomFilterTracker bloomFilterTracker = new BloomFilterTracker();
/**
 * Creates a reader over an existing sstable, mmap'ing the index and/or data file when the
 * corresponding access mode in DatabaseDescriptor is {@code mmap}.
 *
 * @param filename     path of the sstable data file
 * @param partitioner  partitioner used to encode/decode keys
 * @param indexSummary in-memory index sample, or null if it will be loaded later
 * @param bloomFilter  bloom filter for the sstable, or null if it will be loaded later
 * @throws IOException if a file cannot be mapped
 */
SSTableReader(String filename, IPartitioner partitioner, IndexSummary indexSummary, BloomFilter bloomFilter)
throws IOException
{
    super(filename, partitioner);
    if (DatabaseDescriptor.getIndexAccessMode() == DatabaseDescriptor.DiskAccessMode.mmap)
    {
        indexBuffers = mmapSegments(indexFilename());
    }
    else
    {
        assert DatabaseDescriptor.getIndexAccessMode() == DatabaseDescriptor.DiskAccessMode.standard;
        indexBuffers = null;
    }
    if (DatabaseDescriptor.getDiskAccessMode() == DatabaseDescriptor.DiskAccessMode.mmap)
    {
        buffers = mmapSegments(path);
    }
    else
    {
        assert DatabaseDescriptor.getDiskAccessMode() == DatabaseDescriptor.DiskAccessMode.standard;
        buffers = null;
    }
    this.indexSummary = indexSummary;
    this.bf = bloomFilter;
}

/**
 * Maps the whole of {@code filename} read-only as consecutive segments of at most BUFFER_SIZE bytes.
 * The file length is read once, so the segment count and the segment sizes are always derived
 * from the same value. Segment i starts at file offset {@code i * BUFFER_SIZE}.
 */
private static MappedByteBuffer[] mmapSegments(String filename) throws IOException
{
    long fileLength = new File(filename).length();
    int segmentCount = 1 + (int) (fileLength / BUFFER_SIZE);
    MappedByteBuffer[] segments = new MappedByteBuffer[segmentCount];
    long remaining = fileLength;
    for (int i = 0; i < segmentCount; i++)
    {
        // every segment is BUFFER_SIZE long except the last, which takes whatever is left
        segments[i] = mmap(filename, i * BUFFER_SIZE, (int) Math.min(remaining, BUFFER_SIZE));
        remaining -= BUFFER_SIZE;
    }
    return segments;
}
/**
 * Associates this reader with {@code tracker}: registers a deleting reference for the reader
 * on the finalizer queue and adopts the tracker's key cache.
 */
public void setTrackedBy(SSTableTracker tracker)
{
    SSTableDeletingReference deletingReference = new SSTableDeletingReference(tracker, this, finalizerQueue);
    phantomReference = deletingReference;
    // keep the reference strongly reachable until the deleter thread dequeues it
    finalizers.add(deletingReference);
    // TODO keyCache should never be null in live Cassandra, but only setting it here
    // means it can be during tests, so we have to do otherwise-unnecessary != null checks
    keyCache = tracker.getKeyCache();
}
/**
 * Maps {@code size} bytes of {@code filename}, starting at offset {@code start}, read-only.
 * The file handle is closed before returning.
 *
 * @throws IOError     if the file does not exist
 * @throws IOException if mapping or closing fails
 */
private static MappedByteBuffer mmap(String filename, long start, int size) throws IOException
{
    final RandomAccessFile file;
    try
    {
        file = new RandomAccessFile(filename, "r");
    }
    catch (FileNotFoundException missing)
    {
        throw new IOError(missing);
    }

    try
    {
        FileChannel channel = file.getChannel();
        return channel.map(FileChannel.MapMode.READ_ONLY, start, size);
    }
    finally
    {
        file.close();
    }
}
/**
 * Creates a reader with neither an index summary nor a bloom filter set.
 */
private SSTableReader(String filename, IPartitioner partitioner) throws IOException
{
    this(filename, partitioner, (IndexSummary) null, (BloomFilter) null);
}
/**
 * @return the key positions held by this reader's index summary
 */
public List<IndexSummary.KeyPosition> getIndexPositions()
{
    final List<IndexSummary.KeyPosition> sampled = indexSummary.getIndexPositions();
    return sampled;
}
/**
 * Estimates the number of keys in this sstable as
 * {@code sampleCount * IndexSummary.INDEX_INTERVAL}.
 *
 * @return the estimate, computed in long arithmetic
 */
public long estimatedKeys()
{
    // cast first: an int * int product would overflow before being widened to the long return type
    return (long) indexSummary.getIndexPositions().size() * IndexSummary.INDEX_INTERVAL;
}
/**
 * Reads this sstable's bloom filter from the filter file and stores it in {@code bf}.
 * The stream is closed whether or not deserialization succeeds.
 */
void loadBloomFilter() throws IOException
{
    FileInputStream rawStream = new FileInputStream(filterFilename());
    DataInputStream in = new DataInputStream(rawStream);
    try
    {
        bf = BloomFilter.serializer().deserialize(in);
    }
    finally
    {
        in.close();
    }
}
/**
 * Rebuilds {@code indexSummary} by walking every entry of the on-disk index file.
 * For each entry the following entry is peeked at (and the file pointer restored) to learn where
 * the next row starts, so that the row size can be offered to the summary along with the entry.
 */
void loadIndexFile() throws IOException
{
    // we read the positions in a BRAF so we don't have to worry about an entry spanning a mmap boundary.
    // any entries that do, we force into the in-memory sample so key lookup can always bsearch within
    // a single mmapped segment.
    indexSummary = new IndexSummary();
    BufferedRandomAccessFile index = new BufferedRandomAccessFile(indexFilename(), "r");
    try
    {
        final long indexSize = index.length();
        for (long entryStart = index.getFilePointer(); entryStart != indexSize; entryStart = index.getFilePointer())
        {
            DecoratedKey key = partitioner.convertFromDiskFormat(index.readUTF());
            long rowStart = index.readLong();
            long entryEnd = index.getFilePointer();

            // the row ends where the next one begins, or at the end of the data file for the last entry
            long nextRowStart;
            if (!index.isEOF())
            {
                index.readUTF();
                nextRowStart = index.readLong();
                index.seek(entryEnd);
            }
            else
            {
                nextRowStart = length();
            }
            indexSummary.maybeAddEntry(key, rowStart, nextRowStart - rowStart, entryStart, entryEnd);
        }
        indexSummary.complete();
    }
    finally
    {
        index.close();
    }
}
/**
 * get the position in the index file to start scanning to find the given key (at most indexInterval keys away)
 *
 * @return the sampled entry equal to the key, otherwise the closest sampled entry before it;
 *         null when the key sorts before every sampled entry
 */
private IndexSummary.KeyPosition getIndexScanPosition(DecoratedKey decoratedKey)
{
    assert indexSummary.getIndexPositions() != null && indexSummary.getIndexPositions().size() > 0;
    List<IndexSummary.KeyPosition> samples = indexSummary.getIndexPositions();
    int found = Collections.binarySearch(samples, new IndexSummary.KeyPosition(decoratedKey, -1));
    if (found >= 0)
        return samples.get(found);

    // a negative result encodes the insertion point, i.e. the index of the first sample
    // _greater_ than the key searched for
    int insertionPoint = -(found + 1);
    return insertionPoint == 0 ? null : samples.get(insertionPoint - 1);
}
/**
 * Finds where the row for the given key lives in the data file.
 * Checks, in order: the bloom filter, the key cache, the in-memory index sample, and finally
 * scans up to IndexSummary.INDEX_INTERVAL entries of the on-disk index starting at the nearest
 * sampled position. Bloom filter true/false positives are recorded along the way.
 *
 * @param decoratedKey the key to look up
 * @return the position and size of the row, or null if the key is not present
 * @throws IOException if the index file cannot be read
 */
public PositionSize getPosition(DecoratedKey decoratedKey) throws IOException
{
    // first, check bloom filter
    if (!bf.isPresent(partitioner.convertToDiskFormat(decoratedKey)))
        return null;
    // next, the key cache
    Pair<String, DecoratedKey> unifiedKey = new Pair<String, DecoratedKey>(path, decoratedKey);
    if (keyCache != null && keyCache.getCapacity() > 0)
    {
        PositionSize cachedPosition = keyCache.get(unifiedKey);
        if (cachedPosition != null)
        {
            return cachedPosition;
        }
    }
    // next, see if the sampled index says it's impossible for the key to be present
    IndexSummary.KeyPosition sampledPosition = getIndexScanPosition(decoratedKey);
    if (sampledPosition == null)
    {
        bloomFilterTracker.addFalsePositive();
        return null;
    }
    long p = sampledPosition.indexPosition;
    FileDataInput input = null;
    try
    {
        // get either a buffered or a mmap'd input for the on-disk index.
        // this happens inside the try so that a failing seek cannot leak the open file.
        if (indexBuffers == null)
        {
            BufferedRandomAccessFile file = new BufferedRandomAccessFile(indexFilename(), "r");
            input = file;
            file.seek(p);
        }
        else
        {
            input = indexInputAt(p);
        }
        // scan the on-disk index, starting at the nearest sampled position
        int i = 0;
        do
        {
            // handle exact sampled index hit
            IndexSummary.KeyPosition kp = indexSummary.getSpannedIndexPosition(input.getAbsolutePosition());
            if (kp != null && kp.key.equals(decoratedKey))
            {
                bloomFilterTracker.addTruePositive();
                return indexSummary.getSpannedDataPosition(kp);
            }
            // if using mmapped i/o, skip to the next mmap buffer if necessary
            if (input.isEOF() || kp != null)
            {
                if (indexBuffers == null) // not mmap-ing, just one index input
                    break;
                FileDataInput oldInput = input;
                if (kp == null)
                {
                    input = indexInputAt(input.getAbsolutePosition());
                }
                else
                {
                    // skip the spanned entry: 2-byte UTF length + encoded key + 8-byte data position.
                    // the key is encoded with this reader's own partitioner, the same one used to
                    // decode every other index entry in this method.
                    long nextUnspannedPosition = input.getAbsolutePosition()
                                                 + 2 + FBUtilities.encodedUTF8Length(partitioner.convertToDiskFormat(kp.key))
                                                 + 8;
                    input = indexInputAt(nextUnspannedPosition);
                }
                oldInput.close();
                if (input == null)
                    break;
                continue;
            }
            // read key & data position from index entry
            DecoratedKey indexDecoratedKey = partitioner.convertFromDiskFormat(input.readUTF());
            long dataPosition = input.readLong();
            int v = indexDecoratedKey.compareTo(decoratedKey);
            if (v == 0)
            {
                PositionSize info = getDataPositionSize(input, dataPosition);
                if (keyCache != null && keyCache.getCapacity() > 0)
                    keyCache.put(unifiedKey, info);
                bloomFilterTracker.addTruePositive();
                return info;
            }
            if (v > 0)
            {
                // index entries are sorted, so once we pass the key it cannot be present
                bloomFilterTracker.addFalsePositive();
                return null;
            }
        } while (++i < IndexSummary.INDEX_INTERVAL);
    }
    finally
    {
        if (input != null)
            input.close();
    }
    bloomFilterTracker.addFalsePositive();
    return null;
}
/**
 * Builds an input over the mmap'd index segment containing {@code indexPosition},
 * positioned at that offset.
 *
 * @return the input, or null if {@code indexPosition} is past the last index position
 *         known to the index summary
 */
private FileDataInput indexInputAt(long indexPosition)
{
    if (indexPosition > indexSummary.getLastIndexPosition())
        return null;

    int segment = bufferIndex(indexPosition);
    long segmentStart = BUFFER_SIZE * segment;
    int offsetInSegment = (int) (indexPosition % BUFFER_SIZE);
    return new MappedFileDataInput(indexBuffers[segment], indexFilename(), segmentStart, offsetInSegment);
}
/**
 * Computes the size of the row starting at {@code dataPosition}, given an index input positioned
 * immediately after that row's index entry.
 *
 * @param input        index input positioned at the start of the next index entry (or at EOF)
 * @param dataPosition start of the row in the data file
 * @return the row's position together with its size
 * @throws EOFException if the next entry's key cannot be skipped in full
 */
private PositionSize getDataPositionSize(FileDataInput input, long dataPosition) throws IOException
{
    // if we've reached the end of the index, then the row size is "the rest of the data file"
    if (input.isEOF())
        return new PositionSize(dataPosition, length() - dataPosition);

    // otherwise, row size is the start of the next row (in next index entry), minus the start of this one.
    long nextEntryStart = input.getAbsolutePosition();
    PositionSize spannedNext = indexSummary.getSpannedDataPosition(nextEntryStart);
    long nextRowStart;
    if (spannedNext != null)
    {
        // next index entry would span mmap boundary: take the next row position from the summary
        nextRowStart = spannedNext.position;
    }
    else
    {
        // read next entry directly: skip its length-prefixed key, then read its data position
        int keyLength = input.readUnsignedShort();
        if (keyLength != input.skipBytes(keyLength))
            throw new EOFException();
        nextRowStart = input.readLong();
    }
    return new PositionSize(dataPosition, nextRowStart - dataPosition);
}
/**
 * like getPosition, but if key is not found will return the location of the first key _greater_ than the desired one.
 *
 * @param decoratedKey the key to look for
 * @return the data file position of the first index entry whose key is greater than or equal to
 *         {@code decoratedKey}; 0 if the key sorts before every sampled index entry; -1 if the
 *         end of the index is reached without finding such an entry
 * @throws IOException if the index file cannot be read
 */
public long getNearestPosition(DecoratedKey decoratedKey) throws IOException
{
    IndexSummary.KeyPosition sampledPosition = getIndexScanPosition(decoratedKey);
    if (sampledPosition == null)
    {
        return 0;
    }
    // can't use a MappedFileDataInput here, since we might cross a segment boundary while scanning
    BufferedRandomAccessFile input = new BufferedRandomAccessFile(indexFilename(path), "r");
    try
    {
        // seek inside the try so the file is closed even if positioning fails
        input.seek(sampledPosition.indexPosition);
        while (true)
        {
            DecoratedKey indexDecoratedKey;
            try
            {
                indexDecoratedKey = partitioner.convertFromDiskFormat(input.readUTF());
            }
            catch (EOFException e)
            {
                // ran off the end of the index: no key >= the requested one exists
                return -1;
            }
            long position = input.readLong();
            int v = indexDecoratedKey.compareTo(decoratedKey);
            if (v >= 0)
                return position;
        }
    }
    finally
    {
        input.close();
    }
}
/**
 * @return the current length in bytes of the data file at {@code path}
 */
public long length()
{
    File dataFile = new File(path);
    return dataFile.length();
}
/**
 * Orders readers by the generation number parsed from their data file names.
 *
 * @return a negative value, zero, or a positive value as this reader's generation is less than,
 *         equal to, or greater than {@code o}'s
 */
public int compareTo(SSTableReader o)
{
    int thisGeneration = ColumnFamilyStore.getGenerationFromFileName(path);
    int otherGeneration = ColumnFamilyStore.getGenerationFromFileName(o.path);
    // explicit three-way comparison: subtracting the two ints could overflow and flip the sign
    if (thisGeneration < otherGeneration)
        return -1;
    return thisGeneration == otherGeneration ? 0 : 1;
}
/**
 * Creates the compaction marker file for this sstable and flags its deleting reference
 * to delete on cleanup.
 *
 * @throws IOException if the marker file could not be created
 */
public void markCompacted() throws IOException
{
    if (logger.isDebugEnabled())
        logger.debug("Marking " + path + " compacted");

    File marker = new File(compactedFilename());
    boolean created = marker.createNewFile();
    if (!created)
        throw new IOException("Unable to create compaction marker");

    phantomReference.deleteOnCleanup();
}
/**
 * obviously only for testing: swaps in {@code BloomFilter.alwaysMatchingBloomFilter()}
 * as this reader's bloom filter.
 */
public void forceBloomFilterFailures()
{
    this.bf = BloomFilter.alwaysMatchingBloomFilter();
}
/**
 * @return the partitioner this reader was created with
 */
public IPartitioner getPartitioner()
{
    return this.partitioner;
}
/**
 * @param bufferSize buffer size handed to the scanner
 * @return a new scanner over this sstable
 */
public SSTableScanner getScanner(int bufferSize) throws IOException
{
    SSTableScanner scanner = new SSTableScanner(this, bufferSize);
    return scanner;
}
/**
 * Opens an input positioned at the start of the row for {@code decoratedKey}.
 * A mmap'd input is used only when the data file is mapped and the row starts and ends in the
 * same segment; otherwise a buffered file is opened and sought to the row.
 *
 * @param decoratedKey the key whose row is wanted
 * @param bufferSize   buffer size for the buffered file, when one is used
 * @return the input, or null if getPosition finds no row for the key
 */
public FileDataInput getFileDataInput(DecoratedKey decoratedKey, int bufferSize) throws IOException
{
    PositionSize info = getPosition(decoratedKey);
    if (info == null)
        return null;

    boolean rowWithinOneSegment = buffers != null
                                  && bufferIndex(info.position) == bufferIndex(info.position + info.size);
    if (rowWithinOneSegment)
    {
        return new MappedFileDataInput(buffers[bufferIndex(info.position)], path, BUFFER_SIZE * (info.position / BUFFER_SIZE), (int) (info.position % BUFFER_SIZE));
    }

    BufferedRandomAccessFile file = new BufferedRandomAccessFile(path, "r", bufferSize);
    file.seek(info.position);
    return file;
}
/**
 * @return the index of the BUFFER_SIZE-sized segment that contains the given file position
 */
static int bufferIndex(long position)
{
    long segment = position / BUFFER_SIZE;
    return (int) segment;
}
/**
 * @return the comparator DatabaseDescriptor reports for this sstable's table and column family
 */
public AbstractType getColumnComparator()
{
    AbstractType comparator = DatabaseDescriptor.getComparator(getTableName(), getColumnFamilyName());
    return comparator;
}
/**
 * @return a ColumnFamily created for this sstable's table and column family names
 */
public ColumnFamily makeColumnFamily()
{
    ColumnFamily created = ColumnFamily.create(getTableName(), getColumnFamilyName());
    return created;
}
/**
 * @return {@code Column.serializer()} when the column family type is "Standard", otherwise a
 *         SuperColumn serializer built with this sstable's column comparator
 */
public ICompactSerializer2<IColumn> getColumnSerializer()
{
    if (DatabaseDescriptor.getColumnFamilyType(getTableName(), getColumnFamilyName()).equals("Standard"))
    {
        return Column.serializer();
    }
    return SuperColumn.serializer(getColumnComparator());
}
/**
 * @return the false-positive count reported by this reader's bloom filter tracker
 */
public long getBloomFilterFalsePositiveCount()
{
    long falsePositives = bloomFilterTracker.getFalsePositiveCount();
    return falsePositives;
}
/**
 * @return the recent false-positive count reported by this reader's bloom filter tracker
 */
public long getRecentBloomFilterFalsePositiveCount()
{
    long recentFalsePositives = bloomFilterTracker.getRecentFalsePositiveCount();
    return recentFalsePositives;
}
/**
 * @return the true-positive count reported by this reader's bloom filter tracker
 */
public long getBloomFilterTruePositiveCount()
{
    long truePositives = bloomFilterTracker.getTruePositiveCount();
    return truePositives;
}
/**
 * @return the recent true-positive count reported by this reader's bloom filter tracker
 */
public long getRecentBloomFilterTruePositiveCount()
{
    long recentTruePositives = bloomFilterTracker.getRecentTruePositiveCount();
    return recentTruePositives;
}
}