/*
 * Copyright 2004-2014 H2 Group. Multiple-Licensed under the MPL 2.0,
 * and the EPL 1.0 (http://h2database.com/html/license.html).
 * Initial Developer: H2 Group
 */
package org.h2.dev.hash;

import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.zip.Deflater;
import java.util.zip.Inflater;

/**
 * A perfect hash function tool. It needs about 1.4 bits per key, and the
 * resulting hash table is about 79% full. The minimal perfect hash function
 * needs about 2.3 bits per key.
 * <p>
 * Generating the hash function takes about 1 second per million keys
 * for both the perfect hash and the minimal perfect hash.
 * <p>
 * The algorithm is recursive: sets that contain zero or one entry are not
 * processed, as no conflicts are possible. For sets that contain between 2 and
 * 16 entries, up to 16 hash functions are tested to check if they can store
 * the data without conflict. If no such function is found, the same is tried
 * with a larger bucket size (except for the minimal perfect hash). If still no
 * hash function is found, and for larger sets, the set is split into a number
 * of smaller buckets (up to 32).
 * <p>
 * At the end of the generation process, the data is compressed using a general
 * purpose compression tool (Deflate / Huffman coding). The uncompressed data is
 * around 1.52 bits per key (perfect hash) and 3.72 (minimal perfect hash).
 * <p>
 * Please also note the MinimalPerfectHash class, which uses less space per key.
 */
public class PerfectHash {

    /**
     * The maximum size of a bucket.
     */
    private static final int MAX_SIZE = 16;

    /**
     * The maximum number of hash functions to test.
     */
    private static final int OFFSETS = 16;

    /**
     * The maximum number of buckets to split the set into.
     */
    private static final int MAX_SPLIT = 32;

    /**
     * The description of the hash function. Used for calculating the hash of a
     * key.
     */
    private final byte[] data;

    /**
     * The offset of the result of the hash function at the given offset within
     * the data array. Used for calculating the hash of a key.
     */
    private final int[] plus;

    /**
     * The position of the next bucket in the data array (in case this bucket
     * needs to be skipped). Used for calculating the hash of a key.
     */
    private final int[] next;

    /**
     * Create a hash object to convert keys to hashes.
     *
     * @param data the data returned by the generate method
     */
    public PerfectHash(byte[] data) {
        this.data = data = expand(data);
        plus = new int[data.length];
        next = new int[data.length];
        for (int i = 0, p = 0; i < data.length; i++) {
            plus[i] = p;
            int n = data[i] & 255;
            p += n < 2 ? n : n >= MAX_SPLIT ? (n / OFFSETS) : 0;
        }
    }

    /**
     * Calculate the hash from the key.
     *
     * @param x the key
     * @return the hash
     */
    public int get(int x) {
        return get(0, x, 0);
    }

    private int get(int pos, int x, int level) {
        int n = data[pos] & 255;
        if (n < 2) {
            return plus[pos];
        } else if (n >= MAX_SPLIT) {
            return plus[pos] + hash(x, level, n % OFFSETS, n / OFFSETS);
        }
        pos++;
        int h = hash(x, level, 0, n);
        for (int i = 0; i < h; i++) {
            pos = read(pos);
        }
        return get(pos, x, level + 1);
    }

    private int read(int pos) {
        int p = next[pos];
        if (p == 0) {
            int n = data[pos] & 255;
            if (n < 2 || n >= MAX_SPLIT) {
                return pos + 1;
            }
            int start = pos++;
            for (int i = 0; i < n; i++) {
                pos = read(pos);
            }
            next[start] = p = pos;
        }
        return p;
    }
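    /**
     * An illustrative usage sketch (this demo method and its key values are
     * arbitrary examples, not part of the original class): generate the hash
     * description for a set of keys, build a PerfectHash from it, and look up
     * each key. For the minimal variant, the keys map to distinct values in
     * the range [0, number of keys).
     */
    public static void main(String... args) {
        Set<Integer> keys = new java.util.HashSet<Integer>(
                java.util.Arrays.asList(10, 200, 3000, 40000));
        // true = minimal perfect hash
        byte[] description = generate(keys, true);
        PerfectHash hash = new PerfectHash(description);
        for (int x : keys) {
            System.out.println(x + " -> " + hash.get(x));
        }
    }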
    /**
     * Generate the perfect hash function data from the given set of integers.
     *
     * @param list the set
     * @param minimal whether the perfect hash function needs to be minimal
     * @return the data
     */
    public static byte[] generate(Set<Integer> list, boolean minimal) {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        generate(list, 0, minimal, out);
        return compress(out.toByteArray());
    }

    private static void generate(Collection<Integer> set, int level,
            boolean minimal, ByteArrayOutputStream out) {
        int size = set.size();
        if (size <= 1) {
            out.write(size);
            return;
        }
        if (size < MAX_SIZE) {
            int max = minimal ? size : Math.min(MAX_SIZE - 1, size * 2);
            for (int s = size; s <= max; s++) {
                // Try a few hash functions ("offset" is basically the hash
                // function index). We could try fewer hash functions, and
                // instead use a larger size and remember the position of the
                // hole (especially for the minimal perfect case), but that's
                // more complicated.
                nextOffset:
                for (int offset = 0; offset < OFFSETS; offset++) {
                    int bits = 0;
                    for (int x : set) {
                        int h = hash(x, level, offset, s);
                        if ((bits & (1 << h)) != 0) {
                            continue nextOffset;
                        }
                        bits |= 1 << h;
                    }
                    out.write(s * OFFSETS + offset);
                    return;
                }
            }
        }
        // Split the set into multiple smaller sets. We could try to split more
        // evenly by trying out multiple hash functions, but that's more
        // complicated.
        int split;
        if (minimal) {
            split = size > 150 ? size / 83 : (size + 3) / 4;
        } else {
            split = size > 265 ? size / 142 : (size + 5) / 7;
        }
        split = Math.min(MAX_SPLIT - 1, Math.max(2, split));
        out.write(split);
        List<List<Integer>> lists = new ArrayList<List<Integer>>(split);
        for (int i = 0; i < split; i++) {
            lists.add(new ArrayList<Integer>(size / split));
        }
        for (int x : set) {
            lists.get(hash(x, level, 0, split)).add(x);
        }
        for (List<Integer> s2 : lists) {
            generate(s2, level + 1, minimal, out);
        }
    }

    /**
     * Calculate the hash of a key. The result depends on the key, the
     * recursion level, and the offset.
     *
     * @param x the key
     * @param level the recursion level
     * @param offset the index of the hash function
     * @param size the size of the bucket
     * @return the hash (a value between 0, inclusive, and size, exclusive)
     */
    private static int hash(int x, int level, int offset, int size) {
        x += level * OFFSETS + offset;
        x = ((x >>> 16) ^ x) * 0x45d9f3b;
        x = ((x >>> 16) ^ x) * 0x45d9f3b;
        x = (x >>> 16) ^ x;
        return Math.abs(x % size);
    }

    /**
     * Compress the hash description using Huffman coding.
     *
     * @param d the data
     * @return the compressed data
     */
    private static byte[] compress(byte[] d) {
        Deflater deflater = new Deflater();
        deflater.setStrategy(Deflater.HUFFMAN_ONLY);
        deflater.setInput(d);
        deflater.finish();
        ByteArrayOutputStream out2 = new ByteArrayOutputStream(d.length);
        byte[] buffer = new byte[1024];
        while (!deflater.finished()) {
            int count = deflater.deflate(buffer);
            out2.write(buffer, 0, count);
        }
        deflater.end();
        return out2.toByteArray();
    }

    /**
     * Decompress the hash description using Huffman coding.
     *
     * @param d the data
     * @return the decompressed data
     */
    private static byte[] expand(byte[] d) {
        Inflater inflater = new Inflater();
        inflater.setInput(d);
        ByteArrayOutputStream out = new ByteArrayOutputStream(d.length);
        byte[] buffer = new byte[1024];
        try {
            while (!inflater.finished()) {
                int count = inflater.inflate(buffer);
                out.write(buffer, 0, count);
            }
            inflater.end();
        } catch (Exception e) {
            throw new IllegalArgumentException(e);
        }
        return out.toByteArray();
    }

}