/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * <p> * http://www.apache.org/licenses/LICENSE-2.0 * <p> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.flink.runtime.operators.util; import org.apache.flink.core.memory.MemorySegment; import static org.apache.flink.util.Preconditions.checkArgument; /** * BloomFilter is a probabilistic data structure for set membership check. BloomFilters are * highly space efficient when compared to using a HashSet. Because of the probabilistic nature of * bloom filter false positive (element not present in bloom filter but test() says true) are * possible but false negatives are not possible (if element is present then test() will never * say false). The false positive probability is configurable depending on which storage requirement * may increase or decrease. Lower the false positive probability greater is the space requirement. * Bloom filters are sensitive to number of elements that will be inserted in the bloom filter. * During the creation of bloom filter expected number of entries must be specified. If the number * of insertions exceed the specified initial number of entries then false positive probability will * increase accordingly. * <p> * Internally, this implementation of bloom filter uses MemorySegment to store BitSet, BloomFilter and * BitSet are designed to be able to switch between different MemorySegments, so that Flink can share * the same BloomFilter/BitSet object instance for different bloom filters. * <p> * Part of this class refers to the implementation from Apache Hive project * https://github.com/apache/hive/blob/master/common/src/java/org/apache/hive/common/util/BloomFilter.java */ public class BloomFilter { protected BitSet bitSet; protected int expectedEntries; protected int numHashFunctions; public BloomFilter(int expectedEntries, int byteSize) { checkArgument(expectedEntries > 0, "expectedEntries should be > 0"); this.expectedEntries = expectedEntries; this.numHashFunctions = optimalNumOfHashFunctions(expectedEntries, byteSize << 3); this.bitSet = new BitSet(byteSize); } public void setBitsLocation(MemorySegment memorySegment, int offset) { this.bitSet.setMemorySegment(memorySegment, offset); } /** * Compute optimal bits number with given input entries and expected false positive probability. * * @param inputEntries * @param fpp * @return optimal bits number */ public static int optimalNumOfBits(long inputEntries, double fpp) { int numBits = (int) (-inputEntries * Math.log(fpp) / (Math.log(2) * Math.log(2))); return numBits; } /** * Compute the false positive probability based on given input entries and bits size. * Note: this is just the math expected value, you should not expect the fpp in real case would under the return value for certain. * * @param inputEntries * @param bitSize * @return */ public static double estimateFalsePositiveProbability(long inputEntries, int bitSize) { int numFunction = optimalNumOfHashFunctions(inputEntries, bitSize); double p = Math.pow(Math.E, -(double) numFunction * inputEntries / bitSize); double estimatedFPP = Math.pow(1 - p, numFunction); return estimatedFPP; } /** * compute the optimal hash function number with given input entries and bits size, which would * make the false positive probability lowest. * * @param expectEntries * @param bitSize * @return hash function number */ static int optimalNumOfHashFunctions(long expectEntries, long bitSize) { return Math.max(1, (int) Math.round((double) bitSize / expectEntries * Math.log(2))); } public void addHash(int hash32) { int hash1 = hash32; int hash2 = hash32 >>> 16; for (int i = 1; i <= numHashFunctions; i++) { int combinedHash = hash1 + (i * hash2); // hashcode should be positive, flip all the bits if it's negative if (combinedHash < 0) { combinedHash = ~combinedHash; } int pos = combinedHash % bitSet.bitSize(); bitSet.set(pos); } } public boolean testHash(int hash32) { int hash1 = hash32; int hash2 = hash32 >>> 16; for (int i = 1; i <= numHashFunctions; i++) { int combinedHash = hash1 + (i * hash2); // hashcode should be positive, flip all the bits if it's negative if (combinedHash < 0) { combinedHash = ~combinedHash; } int pos = combinedHash % bitSet.bitSize(); if (!bitSet.get(pos)) { return false; } } return true; } public void reset() { this.bitSet.clear(); } @Override public String toString() { StringBuilder output = new StringBuilder(); output.append("BloomFilter:\n"); output.append("\thash function number:").append(numHashFunctions).append("\n"); output.append(bitSet); return output.toString(); } /** * Bare metal bit set implementation. For performance reasons, this implementation does not check * for index bounds nor expand the bit set size if the specified index is greater than the size. */ public class BitSet { private MemorySegment memorySegment; // MemorySegment byte array offset. private int offset; // MemorySegment byte size. private int length; private final int LONG_POSITION_MASK = 0xffffffc0; public BitSet(int byteSize) { checkArgument(byteSize > 0, "bits size should be greater than 0."); checkArgument(byteSize << 29 == 0, "bytes size should be integral multiple of long size(8 Bytes)."); this.length = byteSize; } public void setMemorySegment(MemorySegment memorySegment, int offset) { this.memorySegment = memorySegment; this.offset = offset; } /** * Sets the bit at specified index. * * @param index - position */ public void set(int index) { int longIndex = (index & LONG_POSITION_MASK) >>> 3; long current = memorySegment.getLong(offset + longIndex); current |= (1L << index); memorySegment.putLong(offset + longIndex, current); } /** * Returns true if the bit is set in the specified index. * * @param index - position * @return - value at the bit position */ public boolean get(int index) { int longIndex = (index & LONG_POSITION_MASK) >>> 3; long current = memorySegment.getLong(offset + longIndex); return (current & (1L << index)) != 0; } /** * Number of bits */ public int bitSize() { return length << 3; } public MemorySegment getMemorySegment() { return this.memorySegment; } /** * Clear the bit set. */ public void clear() { long zeroValue = 0L; for (int i = 0; i < (length / 8); i++) { memorySegment.putLong(offset + i * 8, zeroValue); } } @Override public String toString() { StringBuilder output = new StringBuilder(); output.append("BitSet:\n"); output.append("\tMemorySegment:").append(memorySegment.size()).append("\n"); output.append("\tOffset:").append(offset).append("\n"); output.append("\tLength:").append(length).append("\n"); return output.toString(); } } }