/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.commoncrawl.util;

import java.io.UnsupportedEncodingException;
import java.lang.reflect.Method;

/**
 * Base class for BloomFilter implementations.
 *
 * @author rana
 */
public abstract class Filter {

  int hashCount;

  int getHashCount() {
    return hashCount;
  }

  public int[] getHashBuckets(String key) {
    return Filter.getHashBuckets(key, hashCount, buckets());
  }

  public int[] getHashBuckets(long key) {
    return Filter.getHashBuckets(key, hashCount, buckets());
  }

  abstract int buckets();

  public abstract void add(String key);

  public abstract boolean isPresent(String key);

  // for testing
  abstract int emptyBuckets();

  @SuppressWarnings("unchecked")
  ICompactSerializer<Filter> getSerializer() {
    Method method = null;
    try {
      // Concrete subclasses are expected to expose a static serializer() factory.
      method = getClass().getMethod("serializer");
      return (ICompactSerializer<Filter>) method.invoke(null);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  // Murmur is faster than a SHA-based approach and provides as-good collision
  // resistance. The combinatorial generation approach described in
  // http://www.eecs.harvard.edu/~kirsch/pubs/bbbf/esa06.pdf
  // does prove to work in actual tests, and is obviously faster
  // than performing further iterations of Murmur.
  static int[] getHashBuckets(String key, int hashCount, int max) {
    byte[] b;
    try {
      b = key.getBytes("UTF-16");
    } catch (UnsupportedEncodingException e) {
      throw new RuntimeException(e);
    }
    int[] result = new int[hashCount];
    // Double hashing: derive hashCount bucket indices from two Murmur hashes
    // as (hash1 + i * hash2) mod max.
    int hash1 = MurmurHash.hash(b, b.length, 0);
    int hash2 = MurmurHash.hash(b, b.length, hash1);
    for (int i = 0; i < hashCount; i++) {
      result[i] = Math.abs((hash1 + i * hash2) % max);
    }
    return result;
  }

  static int[] getHashBuckets(long key, int hashCount, int max) {
    // Serialize the long into 8 big-endian bytes before hashing.
    byte[] b = new byte[8];
    b[0] = (byte) ((key >>> 56) & 0xFF);
    b[1] = (byte) ((key >>> 48) & 0xFF);
    b[2] = (byte) ((key >>> 40) & 0xFF);
    b[3] = (byte) ((key >>> 32) & 0xFF);
    b[4] = (byte) ((key >>> 24) & 0xFF);
    b[5] = (byte) ((key >>> 16) & 0xFF);
    b[6] = (byte) ((key >>> 8) & 0xFF);
    b[7] = (byte) ((key) & 0xFF);
    int[] result = new int[hashCount];
    int hash1 = MurmurHash.hash(b, b.length, 0);
    int hash2 = MurmurHash.hash(b, b.length, hash1);
    for (int i = 0; i < hashCount; i++) {
      result[i] = Math.abs((hash1 + i * hash2) % max);
    }
    return result;
  }
}
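
/*
 * Usage sketch (illustrative only, not part of the original source). The
 * static double-hashing helper can be exercised directly from within this
 * package; the concrete BloomFilter subclass mentioned in the class Javadoc
 * lives elsewhere in org.commoncrawl.util:
 *
 *   // Four bucket indices in [0, 1 << 20), derived as (hash1 + i * hash2) % max.
 *   int[] buckets = Filter.getHashBuckets("http://example.com/", 4, 1 << 20);
 */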