/*
 * Copyright (c) 2013-2017 Cinchapi Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.cinchapi.concourse.server.storage.cache;

import java.nio.ByteBuffer;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel.MapMode;
import java.util.concurrent.locks.ReentrantReadWriteLock;

import javax.annotation.concurrent.ThreadSafe;

import com.cinchapi.concourse.server.GlobalState;
import com.cinchapi.concourse.server.io.Byteable;
import com.cinchapi.concourse.server.io.Composite;
import com.cinchapi.concourse.server.io.FileSystem;
import com.cinchapi.concourse.util.ByteBuffers;
import com.cinchapi.concourse.util.LongBitSet;
import com.google.common.hash.Hashing;

/**
 * A {@link LoggingBloomFilter} is one that uses append-only logging for
 * serialization.
 * <p>
 * In this Bloom Filter, all changes to the internal bit set are periodically
 * synced to a file on disk. This allows the filter to serialize its state in a
 * dynamic fashion that affords optimal performance and high throughput. Disk
 * writes only happen when an external caller invokes the {@link #diskSync()}
 * method and there are relevant changes to record.
 * </p>
 * <p>
 * A LoggingBloomFilter should be used instead of a regular {@link BloomFilter}
 * when writes will continue to occur after the filter has been synced to disk.
 * </p>
 * <p>
 * Reads take the shared side of an internal read/write lock; writes and disk
 * syncs take the exclusive side.
 * </p>
 *
 * @author Jeff Nelson
 */
@ThreadSafe
public class LoggingBloomFilter {

    /**
     * Create a new {@link LoggingBloomFilter} that is backed by {@code file}
     * and has enough capacity to handle the number of
     * {@code expectedInsertions} with the specified false positive probability
     * ({@code fpp}).
     *
     * @param file the path of the backing file
     * @param expectedInsertions the number of items the filter is sized for
     * @param fpp the acceptable false positive probability
     * @return the LoggingBloomFilter
     */
    public static LoggingBloomFilter create(String file,
            int expectedInsertions, double fpp) {
        int numBits = getNumBits(expectedInsertions, fpp);
        return new LoggingBloomFilter(file, numBits,
                getNumHashFunctions(expectedInsertions, numBits));
    }

    /**
     * Given the number of {@code expectedInsertions} and the acceptable false
     * positive probability ({@code fpp}), return the number of bits necessary
     * to include in the bloom filter.
     * <p>
     * <em>Math courtesy of http://hur.st/bloomfilter</em>
     * </p>
     *
     * @param expectedInsertions
     * @param fpp
     * @return the number of bits to use
     */
    private static int getNumBits(int expectedInsertions, double fpp) {
        if(fpp == 0) {
            // log(0) is -Infinity, so substitute the smallest positive double
            fpp = Double.MIN_VALUE;
        }
        return (int) Math.ceil((expectedInsertions * Math.log(fpp))
                / Math.log(1 / Math.pow(2, Math.log(2))));
    }

    /**
     * Given the number of {@code expectedInsertions} and the {@code numBits} in
     * the bloom filter, return the number of ideal hash functions.
     * <p>
     * Use {@link #getNumBits(int, double)} to determine the number of bits that
     * are necessary to use in the bloom filter.
     * </p>
     * <p>
     * <em>Math courtesy of http://hur.st/bloomfilter</em>
     * </p>
     *
     * @param expectedInsertions
     * @param numBits
     * @return the ideal number of hash function
     */
    private static int getNumHashFunctions(int expectedInsertions, int numBits) {
        // NOTE(review): the ratio below is an integer division, so the
        // fractional part of bits-per-item is dropped before scaling by
        // ln(2). This is kept as-is on purpose: the hash count is not stored
        // in the backing file, so changing the result for a given set of
        // parameters would make bits logged by earlier runs unreachable.
        return (int) Math.round(Math.log(2) * (numBits / expectedInsertions));
    }

    /**
     * The number of bytes each logged bit index occupies in the
     * {@link #buffer} and in the backing {@link #file}.
     */
    private static final int HASH_SIZE_IN_BYTES = 4;

    /**
     * The internal bit set that holds the bloom filter's state.
     */
    private final LongBitSet bits;

    /**
     * The buffer that records recent changes to the state of the {@link #bits}
     * set.
     */
    private ByteBuffer buffer;

    /**
     * The backing file. We write to this periodically in an append-only
     * fashion.
     */
    private final String file;

    /**
     * The ideal number of hash functions to use in the bloom filter.
     */
    private final int numHashFunctions;

    /**
     * A lock for concurrency control.
     */
    private final ReentrantReadWriteLock masterLock = new ReentrantReadWriteLock();

    /**
     * The position where we should begin appending data in the backing
     * {@link #file}.
     */
    private int position;

    /**
     * Since {@link #buffer} is intentionally oversized, we record the actual
     * length of the recent changes so we know how much data to append to disk
     * when {@link #diskSync()} is invoked.
     */
    private int lengthOfRecentChanges;

    /**
     * The number of bits in the bloom filter. Every hash is reduced modulo
     * this value before it is used as an index into {@link #bits}.
     */
    private final int numBits;

    /**
     * Construct a new instance. If the backing {@code file} already contains
     * data, every bit index recorded there is replayed into the in-memory bit
     * set.
     *
     * @param file
     * @param numBits
     * @param numHashFunctions
     */
    private LoggingBloomFilter(String file, int numBits, int numHashFunctions) {
        this.bits = LongBitSet.create();
        this.numBits = numBits;
        this.numHashFunctions = numHashFunctions;
        this.file = file;
        diskSyncCleanup();
        if(position > 0) {
            // load the existing changes into memory
            ByteBuffer bytes = FileSystem.readBytes(file);
            while (bytes.position() < position) {
                bits.set(bytes.getInt(), true);
            }
        }
    }

    /**
     * Force a sync of the recent changes to this bloom filter to disk. This
     * method should be called in conjunction with the page turning
     * functionality in the {@link Buffer}.
     * <p>
     * If no bits have changed since the last sync, this method does nothing.
     * </p>
     */
    public void diskSync() {
        // The write lock keeps a concurrent #put from appending to the buffer
        // while it is being flushed and replaced.
        masterLock.writeLock().lock();
        try {
            if(lengthOfRecentChanges == 0) {
                return; // nothing to record
            }
            MappedByteBuffer data = FileSystem.map(file, MapMode.READ_WRITE,
                    position, lengthOfRecentChanges);
            data.put(ByteBuffers.slice(buffer, 0, lengthOfRecentChanges));
            data.force();
            diskSyncCleanup();
        }
        finally {
            masterLock.writeLock().unlock();
        }
    }

    /**
     * Return {@code true} if it is possible that the {@code byteables} have
     * been placed in this bloom filter. Return {@code false} if they have
     * definitely not been placed in this bloom filter.
     *
     * @param byteables
     * @return {@code true} if the filter <em>might</em> contain the
     *         {@code byteables}
     */
    public boolean mightContain(Byteable... byteables) {
        masterLock.readLock().lock();
        try {
            int[] hashes = hash(Composite.create(byteables));
            for (int hash : hashes) {
                if(!bits.get(hash)) {
                    return false;
                }
            }
            return true;
        }
        finally {
            masterLock.readLock().unlock();
        }
    }

    /**
     * Put the {@code byteables} into this bloom filter and ensure subsequent
     * invocations of {@link #mightContain(Byteable...)} with the same items
     * will always return {@code true}.
     *
     * @param byteables
     * @return {@code true} if the bloom filter's bits have changed as a result
     *         of this operation. If the bits changed, this is definitely the
     *         first time the {@code byteables} have been added to this filter.
     *         If the bits haven't changed, this might be the first time the
     *         items have been added. Please note that this method always
     *         returns the opposite of what {@link #mightContain(Byteable...)}
     *         would have returned at the same time
     */
    public boolean put(Byteable... byteables) {
        masterLock.writeLock().lock();
        try {
            int[] hashes = hash(Composite.create(byteables));
            // Starts as false and flips only when a bit actually changes
            boolean bitsChanged = false;
            for (int hash : hashes) {
                if(!bits.get(hash)) {
                    bits.set(hash);
                    bitsChanged = true;
                    // Log the newly set index so #diskSync can append it
                    buffer.putInt(hash);
                    lengthOfRecentChanges += HASH_SIZE_IN_BYTES;
                }
            }
            return bitsChanged;
        }
        finally {
            masterLock.writeLock().unlock();
        }
    }

    /**
     * Cleanup the in-memory meta data during initialization or after a call to
     * {@link #diskSync()}.
     */
    private void diskSyncCleanup() {
        // We allocate a ByteBuffer that is equal to the BUFFER_PAGE_SIZE so
        // that we can be sure that we'll have enough space to store all the
        // possible changes before the Buffer calls #diskSync() in conjunction
        // with adding a new page.
        buffer = ByteBuffer.allocate(GlobalState.BUFFER_PAGE_SIZE);
        lengthOfRecentChanges = 0;
        position = (int) FileSystem.getFileSize(file);
    }

    /**
     * Return the hashes for the {@code composite} object based on the ideal
     * {@link #numHashFunctions} and the size of the underlying {@link #bits}
     * set.
     * <p>
     * A single murmur3 hash of the composite's bytes is split into two 32-bit
     * halves, and the i-th hash is derived as {@code hash1 + i * hash2}
     * reduced modulo {@link #numBits}.
     * </p>
     *
     * @param composite
     * @return the hashes for {@code composite}.
     */
    private int[] hash(Composite composite) {
        long hash64 = Hashing.murmur3_128()
                .hashBytes(ByteBuffers.toByteArray(composite.getBytes()))
                .asLong();
        int hash1 = (int) hash64;
        int hash2 = (int) (hash64 >>> 32);
        int[] hashes = new int[numHashFunctions];
        for (int i = 1; i <= numHashFunctions; ++i) {
            // NOTE(review): the index mapping must stay exactly as-is; bit
            // indexes already logged to the backing file were produced by it.
            hashes[i - 1] = Math.abs((hash1 + i * hash2) % numBits);
        }
        return hashes;
    }

}