/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.store.util;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import com.addthis.basis.util.LessBytes;
import com.addthis.codec.annotations.FieldConfig;
import com.addthis.codec.codables.SuperCodable;
import com.addthis.hydra.common.hash.PluggableHashFunction;
import com.fasterxml.jackson.annotation.JsonAutoDetect;
/**
* A simple, codable Bloom Filter adhering to the SeenFilter interface.
* <p>A Bloom filter is a space-efficient probabilistic data structure that is used
* to test whether an element is a member of a set. False positive matches are
* possible, but false negatives are not; i.e. a query returns either
* "inside set (may be wrong)" or "definitely not in set". Elements can be added to
* the set, but not removed. The more elements that are added to the set,
* the larger the probability of false positives.
*
* @user-reference
*/
@JsonAutoDetect(getterVisibility = JsonAutoDetect.Visibility.NONE,
                isGetterVisibility = JsonAutoDetect.Visibility.NONE,
                setterVisibility = JsonAutoDetect.Visibility.NONE)
public class SeenFilterBasic<K> implements SeenFilter<K>, SuperCodable {

    public static final int HASH_HASHCODE = 0; /* mostly bad */
    public static final int HASH_HASHCODE_SHIFT_REV = 1; /* mostly bad */
    public static final int HASH_HASHCODE_LONG_REV = 2; /* mostly bad */
    public static final int HASH_MD5 = 3; /* marginally better accuracy, much slower */
    public static final int HASH_PLUGGABLE_SHIFT = 4; /* default, best blend of speed and accuracy */

    /**
     * Per-thread MD5 digest used by the {@link #HASH_MD5} hash type.
     * A {@link MessageDigest} carries internal state, so each thread gets its
     * own instance instead of sharing a single static one. The digest is
     * created lazily on first use; if MD5 is unavailable the failure is
     * reported to the caller of {@link #customHash(Object, int, int)}.
     */
    private static final ThreadLocal<MessageDigest> MD5 = new ThreadLocal<MessageDigest>() {
        @Override
        protected MessageDigest initialValue() {
            try {
                return MessageDigest.getInstance("MD5");
            } catch (NoSuchAlgorithmException e) {
                throw new IllegalStateException("MD5 digest is not available", e);
            }
        }
    };

    /**
     * Optionally specify the initial state of the bloom filter.
     * If this field is specified then the {@link #bitsfree} field
     * must also be specified.
     */
    @FieldConfig(codable = true)
    private int[] bitset;

    /**
     * Cardinality of the bloom filter
     * (total number of bits allocated to the filter).
     * This field must be 32 or greater. This field is required.
     */
    @FieldConfig(codable = true, required = true)
    private int bits;

    /**
     * Number of hash function evaluations for each insertion
     * operation. This parameter is usually referred to as
     * the "k" parameter in the literature. This field is required.
     */
    @FieldConfig(codable = true, required = true)
    private int bitsper;

    /**
     * Type of hash function that is used. The following types are available.
     * <p>0 - HASH_HASHCODE : mostly bad
     * <p>1 - HASH_HASHCODE_SHIFT_REV : mostly bad
     * <p>2 - HASH_HASHCODE_LONG_REV : mostly bad
     * <p>3 - HASH_MD5 : marginally better accuracy, much slower
     * <p>4 - HASH_PLUGGABLE_SHIFT : best blend of speed and accuracy
     * <p>This field is required. It is strongly recommended that you use "4".
     */
    @FieldConfig(codable = true, required = true)
    private int hash;

    /**
     * If {@link #bitset} is specified then you must populate
     * this field with the number of 0 bits in the initial bloom filter.
     */
    @FieldConfig(codable = true)
    private int bitsfree;

    /**
     * No-argument constructor for decoding; fields are populated afterwards
     * and finalized by {@link #postDecode()}.
     */
    public SeenFilterBasic() {
    }

    /**
     * Creates an empty filter using the default {@link #HASH_PLUGGABLE_SHIFT} hash.
     *
     * @param bits    requested number of bits (rounded down to a multiple of 32)
     * @param bitsper number of hash probes per element
     */
    public SeenFilterBasic(int bits, int bitsper) {
        this(bits, bitsper, HASH_PLUGGABLE_SHIFT);
    }

    /**
     * Creates an empty filter.
     *
     * @param bits    requested number of bits; rounded down to a multiple of 32
     * @param bitsper number of hash probes per element
     * @param hash    one of the {@code HASH_*} constants
     * @throws IllegalArgumentException if {@code bits} is less than 32
     */
    public SeenFilterBasic(int bits, int bitsper, int hash) {
        if (bits < 32) {
            throw new IllegalArgumentException("invalid bits @ " + bits);
        }
        int words = bits / 32;
        this.hash = hash;
        this.bits = words * 32;
        // free-bit count must match the rounded size actually backed by the
        // array, otherwise saturation is computed against bits that do not exist
        this.bitsfree = this.bits;
        this.bitsper = bitsper;
        this.bitset = new int[words];
    }

    /**
     * @return a new, empty filter with the same size, probe count and hash type
     */
    public SeenFilterBasic<K> newInstance() {
        return new SeenFilterBasic<>(bits, bitsper, hash);
    }

    /**
     * @param bits size of the new filter; values {@code <= 0} reuse this filter's size
     * @return a new, empty filter with this filter's probe count and hash type
     */
    public SeenFilterBasic<K> newInstance(int bits) {
        return new SeenFilterBasic<>(bits > 0 ? bits : this.bits, bitsper, hash);
    }

    @Override
    public String toString() {
        // bitset may still be null on an instance that has not been through postDecode()
        int words = (bitset == null) ? 0 : bitset.length;
        return "SeenFilterBasic[" + bits + "," + words + "," + hash + "]";
    }

    /**
     * first stage byte set generator to feed hash algorithms
     */
    private static byte[] generatePreHash(Object o) {
        Class<?> clazz = o.getClass();
        if (clazz == Raw.class) {
            return ((Raw) o).toBytes();
        }
        if (clazz == Long.class) {
            return LessBytes.toBytes((Long) o);
        }
        return Raw.get(o.toString()).toBytes();
    }

    /**
     * Builds the second hash input: the source bytes in reverse order, each
     * XOR-ed with the probe index.
     */
    private static byte[] reverseAndMix(byte[] source, int index) {
        byte[] mixed = new byte[source.length];
        for (int i = 0; i < source.length; i++) {
            mixed[mixed.length - i - 1] = (byte) (source[i] ^ index);
        }
        return mixed;
    }

    /**
     * Packs the hashes of two byte arrays into one long, first in the high half.
     * NOTE(review): if {@code PluggableHashFunction.hash} returns an int, a
     * negative hash of {@code low} sign-extends and overwrites the high half.
     * The expression is kept exactly as it was because changing it would move
     * bit positions and invalidate filters that were already persisted.
     */
    private static long packHashes(byte[] high, byte[] low) {
        return (((long) PluggableHashFunction.hash(high)) << 32) | ((long) PluggableHashFunction.hash(low));
    }

    /**
     * call a given hash method for generating a single entry in a hash bit set.
     * <p>The three {@code HASH_HASHCODE*} types ignore {@code index}, so every
     * probe for a given object yields the same value. Unknown hash types fall
     * back to {@link #HASH_HASHCODE}.
     *
     * @param o     object to hash (must not be null)
     * @param index probe index
     * @param hash  one of the {@code HASH_*} constants
     * @return the raw (possibly negative) hash value
     * @throws IllegalStateException if {@link #HASH_MD5} is requested and MD5 is unavailable
     */
    public static long customHash(Object o, int index, int hash) {
        switch (hash) {
            case HASH_HASHCODE_SHIFT_REV: {
                int hc = o.hashCode();
                // Shifting an int by 32 is a no-op in Java (only the low five
                // bits of the distance are used). Kept as is so that existing
                // filters built with this hash type stay readable.
                return (hc << 32) | LessBytes.reverseBits(hc);
            }
            case HASH_HASHCODE_LONG_REV: {
                long lhc = (long) o.hashCode();
                return lhc | LessBytes.reverseBits(lhc);
            }
            case HASH_MD5: {
                byte[] r1 = generatePreHash(o);
                byte[] r2 = reverseAndMix(r1, index);
                MessageDigest digest = MD5.get();
                digest.reset();
                r1 = digest.digest(r1);
                r2 = digest.digest(r2);
                return packHashes(r1, r2);
            }
            case HASH_PLUGGABLE_SHIFT: {
                byte[] r1 = generatePreHash(o);
                byte[] r2 = reverseAndMix(r1, index);
                return packHashes(r1, r2);
            }
            case HASH_HASHCODE:
            default:
                return o.hashCode();
        }
    }

    /**
     * return number of bits backing this filter
     */
    public int getBits() {
        return bitset.length * 32;
    }

    /**
     * @return the live backing array (not a copy)
     */
    public int[] getBitStore() {
        return bitset;
    }

    /**
     * @return the configured bit count used as the hash modulus
     */
    public int getBitCount() {
        return bits;
    }

    /**
     * @return number of hash probes per element
     */
    public int getBitsPerEntry() {
        return bitsper;
    }

    /**
     * @return the configured {@code HASH_*} type
     */
    public int getHashMethod() {
        return hash;
    }

    /**
     * return used bit saturation (0-100) as a percent.
     * Returns 0 when the filter has no backing bits yet.
     */
    public int getSaturation() {
        if (bitset == null || bitset.length == 0) {
            return 0;
        }
        return 100 - (int) ((bitsfree * 100L) / (bitset.length * 32L));
    }

    /**
     * generate a single entry in hash offset set.
     * will be called with an index from 0-bitsper
     * to generate each bit in the hash set.
     * override this in subclasses and hash type will
     * be ignored.
     */
    public long generateHash(K o, int index) {
        return customHash(o, index, hash);
    }

    /**
     * generate a bit hash offset set
     */
    public long[] getHashSet(K o) {
        long[] bs = new long[bitsper];
        for (int i = 0; i < bitsper; i++) {
            bs[i] = Math.abs(generateHash(o, i));
        }
        return bs;
    }

    /**
     * Maps a hash value to a bit offset in {@code [0, bits)}.
     * The absolute value is taken after the remainder so that the result is
     * non-negative for every input, including {@code Long.MIN_VALUE}, for
     * which {@code Math.abs} on the hash itself stays negative. For all other
     * values this equals {@code Math.abs(hash) % bits}.
     */
    private int bitOffset(long hashValue) {
        return Math.abs((int) (hashValue % bits));
    }

    /**
     * return true (seen) if all bits set
     */
    public boolean checkHashSet(long[] bs) {
        for (long l : bs) {
            if (!getBit(bitOffset(l))) {
                return false;
            }
        }
        return true;
    }

    /**
     * warning: like setHashSet but does not update bitsfree
     */
    public void updateHashSet(long[] bs) {
        for (int i = 0; i < bitsper; i++) {
            int offset = bitOffset(bs[i]);
            bitset[offset / 32] |= 1 << (offset % 32);
        }
    }

    /**
     * set all bits from this hash offset set
     */
    public void setHashSet(long[] bs) {
        for (long l : bs) {
            setBit(bitOffset(l));
        }
    }

    /**
     * sets this offset bit and, if it was previously clear,
     * decrements the free-bit count (never below zero)
     */
    public void setBit(int offset) {
        int word = offset / 32;
        int mask = 1 << (offset % 32);
        if ((bitset[word] & mask) == 0 && bitsfree > 0) {
            bitsfree--;
        }
        bitset[word] |= mask;
    }

    /**
     * returns true if this offset bit is set
     */
    public boolean getBit(int offset) {
        int word = offset / 32;
        int mask = 1 << (offset % 32);
        return (bitset[word] & mask) != 0;
    }

    /**
     * Sets every probe bit for the object and returns true if seen before,
     * i.e. if all of those bits were already set.
     * Like {@link #updateHashSet(long[])} this does not update bitsfree.
     */
    public boolean updateSeen(K o) {
        boolean allset = true;
        for (int i = 0; i < bitsper; i++) {
            int offset = bitOffset(generateHash(o, i));
            int word = offset / 32;
            int mask = 1 << (offset % 32);
            // "seen" requires the bit to have been set already
            allset &= (bitset[word] & mask) != 0;
            bitset[word] |= mask;
        }
        return allset;
    }

    /**
     * Convenience wrapper around {@link #merge(SeenFilter)} for a filter
     * whose key type is not statically known.
     */
    @SuppressWarnings("unchecked") // the key type is never inspected by merge
    public SeenFilterBasic<K> mergeSeen(SeenFilterBasic<?> merge) {
        return merge((SeenFilter<K>) merge);
    }

    /**
     * Returns a new filter whose bits are the union of this filter and the
     * given one. Neither input is modified.
     *
     * @throws IllegalArgumentException if the other filter is not a
     *         {@code SeenFilterBasic} or differs in hash type, bit count,
     *         probe count or backing array length
     */
    @Override
    public SeenFilterBasic<K> merge(SeenFilter<K> merge) {
        if (!(merge instanceof SeenFilterBasic)) {
            throw new IllegalArgumentException(merge + " incompatible filter with " + this);
        }
        SeenFilterBasic<K> other = (SeenFilterBasic<K>) merge;
        if (other.hash != hash || other.bits != bits) {
            throw new IllegalArgumentException(merge + " settings differ from " + this);
        }
        if (other.bitsper != bitsper || other.bitset.length != bitset.length) {
            throw new IllegalArgumentException("cannot merge dissimilar blooms");
        }
        SeenFilterBasic<K> merged = new SeenFilterBasic<>();
        merged.hash = hash;
        merged.bits = bits;
        merged.bitsper = bitsper;
        merged.bitset = new int[bitset.length];
        int used = 0;
        for (int i = 0; i < bitset.length; i++) {
            int word = bitset[i] | other.bitset[i];
            merged.bitset[i] = word;
            used += Integer.bitCount(word);
        }
        merged.bitsfree = Math.max(0, bits - used);
        return merged;
    }

    /**
     * Clears every bit and resets the free-bit count accordingly.
     */
    @Override
    public void clear() {
        bitset = new int[bitset.length];
        bitsfree = bits;
    }

    @Override
    public void setSeen(K o) {
        for (int i = 0; i < bitsper; i++) {
            setBit(bitOffset(generateHash(o, i)));
        }
    }

    @Override
    public boolean getSeen(K o) {
        for (int i = 0; i < bitsper; i++) {
            if (!getBit(bitOffset(generateHash(o, i)))) {
                return false;
            }
        }
        return true;
    }

    @Override
    public boolean getSetSeen(K o) {
        boolean seen = true;
        for (int i = 0; i < bitsper; i++) {
            int bit = bitOffset(generateHash(o, i));
            if (getBit(bit)) {
                continue;
            }
            setBit(bit);
            seen = false;
        }
        return seen;
    }

    /**
     * Validates the decoded configuration and allocates the bit array when
     * no initial {@link #bitset} was supplied. In that case {@code bits} is
     * rounded down to a multiple of 32, as the constructor does, so that the
     * hash modulus never exceeds the allocated array.
     *
     * @throws IllegalArgumentException if {@code bits} is less than 32, or a
     *         supplied {@code bitset} is too small to hold {@code bits} bits
     */
    @Override
    public void postDecode() {
        if (bits < 32) {
            throw new IllegalArgumentException("invalid bits @ " + bits);
        }
        if (bitset == null) {
            int words = bits / 32;
            this.bits = words * 32;
            this.bitset = new int[words];
            this.bitsfree = this.bits;
        } else if (bitset.length * 32L < bits) {
            throw new IllegalArgumentException(
                    "bitset of " + bitset.length + " ints cannot hold " + bits + " bits");
        }
    }

    @Override
    public void preEncode() {
    }
}