/* * Copyright 2004-2014 H2 Group. Multiple-Licensed under the MPL 2.0, * and the EPL 1.0 (http://h2database.com/html/license.html). * Initial Developer: H2 Group */ package org.h2.test.unit; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.util.BitSet; import java.util.HashSet; import java.util.Random; import java.util.Set; import java.util.concurrent.TimeUnit; import org.h2.dev.hash.MinimalPerfectHash; import org.h2.dev.hash.MinimalPerfectHash.LongHash; import org.h2.dev.hash.MinimalPerfectHash.StringHash; import org.h2.dev.hash.MinimalPerfectHash.UniversalHash; import org.h2.dev.hash.PerfectHash; import org.h2.test.TestBase; /** * Tests the perfect hash tool. */ public class TestPerfectHash extends TestBase { /** * Run just this test. * * @param a ignored */ public static void main(String... a) throws Exception { TestPerfectHash test = (TestPerfectHash) TestBase.createCaller().init(); test.measure(); largeFile(); test.test(); test.measure(); } private static void largeFile() throws IOException { largeFile("sequence.txt"); for (int i = 1; i <= 4; i++) { largeFile("unique" + i + ".txt"); } largeFile("enwiki-20140811-all-titles.txt"); } private static void largeFile(String s) throws IOException { String fileName = System.getProperty("user.home") + "/temp/" + s; if (!new File(fileName).exists()) { System.out.println("not found: " + fileName); return; } RandomAccessFile f = new RandomAccessFile(fileName, "r"); byte[] data = new byte[(int) f.length()]; f.readFully(data); UniversalHash<Text> hf = new UniversalHash<Text>() { @Override public int hashCode(Text o, int index, int seed) { return o.hashCode(index, seed); } }; f.close(); HashSet<Text> set = new HashSet<Text>(); Text t = new Text(data, 0); while (true) { set.add(t); int end = t.getEnd(); if (end >= data.length - 1) { break; } t = new Text(data, end + 1); if (set.size() % 1000000 == 0) { System.out.println("size: " + set.size()); } } System.out.println("file: " + s); System.out.println("size: " + set.size()); long time = System.nanoTime(); byte[] desc = MinimalPerfectHash.generate(set, hf); time = System.nanoTime() - time; System.out.println("millis: " + TimeUnit.NANOSECONDS.toMillis(time)); System.out.println("len: " + desc.length); int bits = desc.length * 8; System.out.println(((double) bits / set.size()) + " bits/key"); } /** * Measure the hash functions. */ public void measure() { int size = 1000000; testMinimal(size / 10); int s; long time = System.nanoTime(); s = testMinimal(size); time = System.nanoTime() - time; System.out.println((double) s / size + " bits/key (minimal) in " + TimeUnit.NANOSECONDS.toMillis(time) + " ms"); time = System.nanoTime(); s = testMinimalWithString(size); time = System.nanoTime() - time; System.out.println((double) s / size + " bits/key (minimal; String keys) in " + TimeUnit.NANOSECONDS.toMillis(time) + " ms"); time = System.nanoTime(); s = test(size, true); time = System.nanoTime() - time; System.out.println((double) s / size + " bits/key (minimal old) in " + TimeUnit.NANOSECONDS.toMillis(time) + " ms"); time = System.nanoTime(); s = test(size, false); time = System.nanoTime() - time; System.out.println((double) s / size + " bits/key (not minimal) in " + TimeUnit.NANOSECONDS.toMillis(time) + " ms"); } @Override public void test() { testBrokenHashFunction(); for (int i = 0; i < 100; i++) { testMinimal(i); } for (int i = 100; i <= 100000; i *= 10) { testMinimal(i); } for (int i = 0; i < 100; i++) { test(i, true); test(i, false); } for (int i = 100; i <= 100000; i *= 10) { test(i, true); test(i, false); } } private void testBrokenHashFunction() { int size = 10000; Random r = new Random(10000); HashSet<String> set = new HashSet<String>(size); while (set.size() < size) { set.add("x " + r.nextDouble()); } for (int test = 1; test < 10; test++) { final int badUntilLevel = test; UniversalHash<String> badHash = new UniversalHash<String>() { @Override public int hashCode(String o, int index, int seed) { if (index < badUntilLevel) { return 0; } return StringHash.getFastHash(o, index, seed); } }; byte[] desc = MinimalPerfectHash.generate(set, badHash); testMinimal(desc, set, badHash); } } private int test(int size, boolean minimal) { Random r = new Random(size); HashSet<Integer> set = new HashSet<Integer>(); while (set.size() < size) { set.add(r.nextInt()); } byte[] desc = PerfectHash.generate(set, minimal); int max = test(desc, set); if (minimal) { assertEquals(size - 1, max); } else { if (size > 10) { assertTrue(max < 1.5 * size); } } return desc.length * 8; } private int test(byte[] desc, Set<Integer> set) { int max = -1; HashSet<Integer> test = new HashSet<Integer>(); PerfectHash hash = new PerfectHash(desc); for (int x : set) { int h = hash.get(x); assertTrue(h >= 0); assertTrue(h <= set.size() * 3); max = Math.max(max, h); assertFalse(test.contains(h)); test.add(h); } return max; } private int testMinimal(int size) { Random r = new Random(size); HashSet<Long> set = new HashSet<Long>(size); while (set.size() < size) { set.add((long) r.nextInt()); } LongHash hf = new LongHash(); byte[] desc = MinimalPerfectHash.generate(set, hf); int max = testMinimal(desc, set, hf); assertEquals(size - 1, max); return desc.length * 8; } private int testMinimalWithString(int size) { Random r = new Random(size); HashSet<String> set = new HashSet<String>(size); while (set.size() < size) { set.add("x " + r.nextDouble()); } StringHash hf = new StringHash(); byte[] desc = MinimalPerfectHash.generate(set, hf); int max = testMinimal(desc, set, hf); assertEquals(size - 1, max); return desc.length * 8; } private <K> int testMinimal(byte[] desc, Set<K> set, UniversalHash<K> hf) { int max = -1; BitSet test = new BitSet(); MinimalPerfectHash<K> hash = new MinimalPerfectHash<K>(desc, hf); for (K x : set) { int h = hash.get(x); assertTrue(h >= 0); assertTrue(h <= set.size() * 3); max = Math.max(max, h); assertFalse(test.get(h)); test.set(h); } return max; } /** * A text. */ static class Text { /** * The byte data (may be shared, so must not be modified). */ final byte[] data; /** * The start location. */ final int start; Text(byte[] data, int start) { this.data = data; this.start = start; } /** * The hash code (using a universal hash function). * * @param index the hash function index * @param seed the random seed * @return the hash code */ public int hashCode(int index, int seed) { if (index < 8) { int x = (index * 0x9f3b) ^ seed; int result = seed; int p = start; while (true) { int c = data[p++] & 255; if (c == '\n') { break; } x = 31 + x * 0x9f3b; result ^= x * (1 + c); } return result; } int end = getEnd(); return StringHash.getSipHash24(data, start, end, index, seed); } int getEnd() { int end = start; while (data[end] != '\n') { end++; } return end; } @Override public int hashCode() { return hashCode(0, 0); } @Override public boolean equals(Object other) { if (other == this) { return true; } else if (!(other instanceof Text)) { return false; } Text o = (Text) other; int end = getEnd(); int s2 = o.start; int e2 = o.getEnd(); if (e2 - s2 != end - start) { return false; } for (int s1 = start; s1 < end; s1++, s2++) { if (data[s1] != o.data[s2]) { return false; } } return true; } @Override public String toString() { return new String(data, start, getEnd() - start); } } }