/** * Licensed to Cloudera, Inc. under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. Cloudera, Inc. licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.cloudera.util.bloom; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; import java.nio.ByteBuffer; import java.util.Arrays; import org.apache.hadoop.util.bloom.BloomFilter; import org.apache.hadoop.util.bloom.Key; import org.apache.hadoop.util.hash.Hash; import com.google.common.base.Preconditions; /** * This is an interface for a bloom set. Currently this shim wraps the * o.a.h.util.bloom.BloomFilter with a more generic API but uses inefficient * calls. * * This is only for test code so for now. Operations of this are woefully * inefficient computationally and this should be eventually replaced with a * version that does not rely on hadoop's Writeable-tied version. */ public class BloomSet { final BloomFilter bloom; final static int hashType = Hash.JENKINS_HASH; // just pick a default for now. /** * Create empty BloomSet. * * Wikipedia uses the following size and # of hashes to achieve particular * false positives rates. * * m bits needed, n inserted elements, p false positive rate, k hash * functions, e is max false positive error rate. * * k = (m/n) ln 2 * * m = - n ln p / (ln 2)^2 * * 1.44 log_2 (1/e) = # of bits per inserted element **/ public BloomSet(int nbits, int hashes) { bloom = new BloomFilter(nbits, hashes, hashType); } /** * Copy constructor */ public BloomSet(BloomSet src) { Preconditions.checkArgument(src != null); byte[] bytes = src.getBytes(); bloom = deserialize(bytes); } /** * Instantiate a serialized BloomSet. */ public BloomSet(byte[] serialized) { bloom = deserialize(serialized); } /** * Takes an array of bytes and deserializes it into the current BloomSet. */ protected BloomFilter deserialize(byte[] serialized) { try { Preconditions.checkArgument(serialized != null); DataInputStream in = new DataInputStream(new ByteArrayInputStream( serialized)); BloomFilter bloom = new BloomFilter(); // empty constructor bloom.readFields(in); return bloom; } catch (IOException e) { throw new IllegalArgumentException(e.getMessage()); } } /** * Adds an int to the bloom filter. */ public void addInt(int i) { ByteBuffer buf = ByteBuffer.allocate(4); buf.putInt(i); Key k = new Key(buf.array()); bloom.add(k); } /** * Get a serialized version of the BloomSet */ public byte[] getBytes() { try { // serialize ByteArrayOutputStream bits = new ByteArrayOutputStream(); DataOutputStream out = new DataOutputStream(bits); bloom.write(out); out.flush(); return bits.toByteArray(); } catch (IOException e) { // should never happen. e.printStackTrace(); return null; } } @Override public int hashCode() { // TODO likely inefficient return Arrays.hashCode(getBytes()); } @Override public boolean equals(Object b) { if (!(b instanceof BloomSet)) { return false; } byte[] as = getBytes(); byte[] bs = ((BloomSet) b).getBytes(); return Arrays.equals(as, bs); } /** * Applies a bitwise 'and', modifying this bloom sets. 'and'ing these actually * creates a new physical representation that is equivalent to anding the two * sets. */ public void and(BloomSet b) { bloom.and(b.bloom); } /** * Rhetorically asks, does the current set contain the specified subset with * high probability? * * This essentially does a 'and' and then verifies if the resulting set is * equal to the original. */ public boolean contains(BloomSet subset) { // examples: // super sub => (sub & super) ^ sub == 0 // 1111 0011 => true // 1100 1100 => true // 1100 1000 => true // 1100 0000 => true // 1100 0010 => false // serialize the bloom filter, and send it on close. BloomSet subClone = new BloomSet(subset); // if subset subClone.and(this); return subClone.equals(subset); } }