/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.hbase.util; import java.text.NumberFormat; import java.util.Random; import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.nio.ByteBuff; import org.apache.hadoop.hbase.regionserver.BloomType; /** * Utility methods related to BloomFilters */ @InterfaceAudience.Private public final class BloomFilterUtil { /** Record separator for the Bloom filter statistics human-readable string */ public static final String STATS_RECORD_SEP = "; "; /** * Used in computing the optimal Bloom filter size. This approximately equals * 0.480453. */ public static final double LOG2_SQUARED = Math.log(2) * Math.log(2); /** * A random number generator to use for "fake lookups" when testing to * estimate the ideal false positive rate. */ private static Random randomGeneratorForTest; /** Bit-value lookup array to prevent doing the same work over and over */ public static final byte [] bitvals = { (byte) 0x01, (byte) 0x02, (byte) 0x04, (byte) 0x08, (byte) 0x10, (byte) 0x20, (byte) 0x40, (byte) 0x80 }; /** * Private constructor to keep this class from being instantiated. */ private BloomFilterUtil() { } /** * @param maxKeys * @param errorRate * @return the number of bits for a Bloom filter than can hold the given * number of keys and provide the given error rate, assuming that the * optimal number of hash functions is used and it does not have to * be an integer. */ public static long computeBitSize(long maxKeys, double errorRate) { return (long) Math.ceil(maxKeys * (-Math.log(errorRate) / LOG2_SQUARED)); } public static void setFakeLookupMode(boolean enabled) { if (enabled) { randomGeneratorForTest = new Random(283742987L); } else { randomGeneratorForTest = null; } } /** * The maximum number of keys we can put into a Bloom filter of a certain * size to maintain the given error rate, assuming the number of hash * functions is chosen optimally and does not even have to be an integer * (hence the "ideal" in the function name). * * @param bitSize * @param errorRate * @return maximum number of keys that can be inserted into the Bloom filter * @see #computeMaxKeys(long, double, int) for a more precise estimate */ public static long idealMaxKeys(long bitSize, double errorRate) { // The reason we need to use floor here is that otherwise we might put // more keys in a Bloom filter than is allowed by the target error rate. return (long) (bitSize * (LOG2_SQUARED / -Math.log(errorRate))); } /** * The maximum number of keys we can put into a Bloom filter of a certain * size to get the given error rate, with the given number of hash functions. * * @param bitSize * @param errorRate * @param hashCount * @return the maximum number of keys that can be inserted in a Bloom filter * to maintain the target error rate, if the number of hash functions * is provided. */ public static long computeMaxKeys(long bitSize, double errorRate, int hashCount) { return (long) (-bitSize * 1.0 / hashCount * Math.log(1 - Math.exp(Math.log(errorRate) / hashCount))); } /** * Computes the actual error rate for the given number of elements, number * of bits, and number of hash functions. Taken directly from the * <a href= * "http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives" * > Wikipedia Bloom filter article</a>. * * @param maxKeys * @param bitSize * @param functionCount * @return the actual error rate */ public static double actualErrorRate(long maxKeys, long bitSize, int functionCount) { return Math.exp(Math.log(1 - Math.exp(-functionCount * maxKeys * 1.0 / bitSize)) * functionCount); } /** * Increases the given byte size of a Bloom filter until it can be folded by * the given factor. * * @param bitSize * @param foldFactor * @return Foldable byte size */ public static int computeFoldableByteSize(long bitSize, int foldFactor) { long byteSizeLong = (bitSize + 7) / 8; int mask = (1 << foldFactor) - 1; if ((mask & byteSizeLong) != 0) { byteSizeLong >>= foldFactor; ++byteSizeLong; byteSizeLong <<= foldFactor; } if (byteSizeLong > Integer.MAX_VALUE) { throw new IllegalArgumentException("byteSize=" + byteSizeLong + " too " + "large for bitSize=" + bitSize + ", foldFactor=" + foldFactor); } return (int) byteSizeLong; } public static int optimalFunctionCount(int maxKeys, long bitSize) { long i = bitSize / maxKeys; double result = Math.ceil(Math.log(2) * i); if (result > Integer.MAX_VALUE){ throw new IllegalArgumentException("result too large for integer value."); } return (int)result; } /** * Creates a Bloom filter chunk of the given size. * * @param byteSizeHint the desired number of bytes for the Bloom filter bit * array. Will be increased so that folding is possible. * @param errorRate target false positive rate of the Bloom filter * @param hashType Bloom filter hash function type * @param foldFactor * @param bloomType * @return the new Bloom filter of the desired size */ public static BloomFilterChunk createBySize(int byteSizeHint, double errorRate, int hashType, int foldFactor, BloomType bloomType) { BloomFilterChunk bbf = new BloomFilterChunk(hashType, bloomType); bbf.byteSize = computeFoldableByteSize(byteSizeHint * 8L, foldFactor); long bitSize = bbf.byteSize * 8; bbf.maxKeys = (int) idealMaxKeys(bitSize, errorRate); bbf.hashCount = optimalFunctionCount(bbf.maxKeys, bitSize); // Adjust max keys to bring error rate closer to what was requested, // because byteSize was adjusted to allow for folding, and hashCount was // rounded. bbf.maxKeys = (int) computeMaxKeys(bitSize, errorRate, bbf.hashCount); return bbf; } public static boolean contains(byte[] buf, int offset, int length, ByteBuff bloomBuf, int bloomOffset, int bloomSize, Hash hash, int hashCount) { HashKey<byte[]> hashKey = new ByteArrayHashKey(buf, offset, length); return contains(bloomBuf, bloomOffset, bloomSize, hash, hashCount, hashKey); } private static <T> boolean contains(ByteBuff bloomBuf, int bloomOffset, int bloomSize, Hash hash, int hashCount, HashKey<T> hashKey) { int hash1 = hash.hash(hashKey, 0); int hash2 = hash.hash(hashKey, hash1); int bloomBitSize = bloomSize << 3; if (randomGeneratorForTest == null) { // Production mode. int compositeHash = hash1; for (int i = 0; i < hashCount; i++) { int hashLoc = Math.abs(compositeHash % bloomBitSize); compositeHash += hash2; if (!checkBit(hashLoc, bloomBuf, bloomOffset)) { return false; } } } else { // Test mode with "fake lookups" to estimate "ideal false positive rate". for (int i = 0; i < hashCount; i++) { int hashLoc = randomGeneratorForTest.nextInt(bloomBitSize); if (!checkBit(hashLoc, bloomBuf, bloomOffset)){ return false; } } } return true; } public static boolean contains(Cell cell, ByteBuff bloomBuf, int bloomOffset, int bloomSize, Hash hash, int hashCount, BloomType type) { HashKey<Cell> hashKey = type == BloomType.ROW ? new RowBloomHashKey(cell) : new RowColBloomHashKey(cell); return contains(bloomBuf, bloomOffset, bloomSize, hash, hashCount, hashKey); } /** * Check if bit at specified index is 1. * * @param pos index of bit * @return true if bit at specified index is 1, false if 0. */ static boolean checkBit(int pos, ByteBuff bloomBuf, int bloomOffset) { int bytePos = pos >> 3; //pos / 8 int bitPos = pos & 0x7; //pos % 8 byte curByte = bloomBuf.get(bloomOffset + bytePos); curByte &= bitvals[bitPos]; return (curByte != 0); } /** * A human-readable string with statistics for the given Bloom filter. * * @param bloomFilter the Bloom filter to output statistics for; * @return a string consisting of "<key>: <value>" parts * separated by {@link #STATS_RECORD_SEP}. */ public static String formatStats(BloomFilterBase bloomFilter) { StringBuilder sb = new StringBuilder(); long k = bloomFilter.getKeyCount(); long m = bloomFilter.getMaxKeys(); sb.append("BloomSize: " + bloomFilter.getByteSize() + STATS_RECORD_SEP); sb.append("No of Keys in bloom: " + k + STATS_RECORD_SEP); sb.append("Max Keys for bloom: " + m); if (m > 0) { sb.append(STATS_RECORD_SEP + "Percentage filled: " + NumberFormat.getPercentInstance().format(k * 1.0 / m)); } return sb.toString(); } public static String toString(BloomFilterChunk bloomFilter) { return formatStats(bloomFilter) + STATS_RECORD_SEP + "Actual error rate: " + String.format("%.8f", bloomFilter.actualErrorRate()); } }