PerfectStringHash.java example

Explorer
rrd4j-master
- src
  - main
    - java
  - test
    - java
      - org
        rrd4j
        TestDemo.java
        TestLSL.java
        TutorialTest.java
        converter
        ConverterTest.java
        core
        BigStringsTest.java
        LongRunning.java
        RrdDbPoolTest.java
        RrdDbTest.java
        RrdDefTest.java
        RrdNioBackendTest.java
        TestArchiveSelection.java
        UriTest.java
        jrrd
        ExportImport.java
        JrrdTest.java
        timespec
        TimeParserTest.java
        data
        AggregatorTest.java
        DataProcessorTest.java
        RpnCalculatorTest.java
        VariableTest.java
        graph
        CommentTextTest.java
        DownSamplerTest.java
        DummyGraph.java
        ImageFormatTest.java
        ImageWriterTest.java
        LogarithmicTest.java
        PercentileTest.java
        SimpleTimeLabelFormatTest.java
        SpanTest.java
        StackTest.java
        TimeAxisText.java
        ValueAxisTest.java
/*
 * Copyright 2010 Tom Gibara
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0

 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 */
package com.tomgibara.crinch.hashing;

import java.math.BigInteger;
import java.util.Arrays;
import java.util.Comparator;

/**
 * <p>
 * A "minimal perfect hash" for Strings. After construction with an array of
 * <em>n</em> unique non-null strings, an instance of this class will return a
 * unique hash value <em>h</em> (0 <= h < n) for any string <em>s</em> in the
 * array. A negative has value will typically be returned for a string that is
 * not in the array.
 * </p>
 * 
 * <p>
 * However, the supplied array is <em>not</em> retained. This means that the
 * implementation cannot necessarily confirm that a string is not in the
 * supplied array. Where this implementation cannot distinguish that a string is
 * not in the array, a 'valid' hash value may be returned. Under no
 * circumstances will a hash value be returned that is greater than or equal to
 * <em>n</em>.
 * </p>
 * 
 * <p>
 * <strong>IMPORTANT NOTE:</strong> The array of strings supplied to the
 * constructor will be mutated: it is re-ordered so that
 * <code>hash(a[i]) == i</code>. Application code must generally use this
 * information to map hash values back onto the appropriate string value.
 * </p>
 * 
 * <p>
 * <strong>NOTE:</strong> Good performance of this algorithm is predicated on
 * string hash values being cached by the <code>String</code> class. Experience
 * indicates that is is a good assumption.
 * </p>
 * 
 * 
 * @author Tom Gibara
 */

public class PerfectStringHash implements Hash<String> {

    // statics

    /**
     * Comparator used to order the supplied string array. Hashcodes take
     * priority, we will do a binary search on those. Otherwise, lengths take
     * priority over character ordering because the hash algorithm prefers to
     * compare lengths, it's faster.
     */

    private static final Comparator<String> comparator = new Comparator<String>() {
        @Override
        public int compare(String s1, String s2) {
            final int h1 = s1.hashCode();
            final int h2 = s2.hashCode();
            if (h1 == h2) {
                final int d = s1.length() - s2.length();
                return d == 0 ? s1.compareTo(s2) : d;
            }
            return h1 < h2 ? -1 : 1;
        }
    };

    /**
     * Builds a (typically v. small) decision tree for distinguishing strings
     * that share the same hash value.
     * 
     * @param values
     *            the string values to distinguish
     * @param start
     *            the index from which the values should be read
     * @param length
     *            the number of string values that need to be distinguished
     * @param pivots
     *            the array that will hold our decision nodes
     * @param pivotIndex
     *            the index at which the tree should be written
     */
    private static void generatePivots(String[] values, int start, int length, int[] pivots, int pivotIndex) {
        final int capacity = Integer.highestOneBit(length - 1) << 1;
        final int depth = Integer.numberOfTrailingZeros(capacity);
        pivots[ pivotIndex << 1     ] = depth;
        pivots[(pivotIndex << 1) + 1] = length;
        pivotIndex++;
        //build the array
        for (int i = 0; i < depth; i++) {
            int step = capacity >> i;
        for (int j = (1 << (depth-i-1)) - 1; j < capacity; j += step) {
            final int part;
            final int comp;
            if (j >= length - 1) {
                part = Integer.MIN_VALUE;
                comp = 0;
            } else {
                final String v1 = values[start + j];
                final String v2 = values[start + j + 1];
                final int l1 = v1.length();
                final int l2 = v2.length();
                if (l1 == l2) {
                    int tPart = -1;
                    int tComp = -1;
                    for (int k = 0; k < l1; k++) {
                        final char c1 = v1.charAt(k);
                        final char c2 = v2.charAt(k);
                        if (c1 == c2) continue;
                        if (c1 < c2) { //must occur at some point because we have already checked that the two strings are unequal
                            tPart = k;
                            tComp = c1;
                        } else {
                            //shouldn't be possible - we've sorted the strings to avoid this case
                            throw new IllegalStateException();
                        }
                        break;
                    }
                    //check if we've been passed a duplicated value
                    if (tPart == -1) throw new IllegalArgumentException("duplicate value: " + v1);
                    part = tPart;
                    comp = tComp;
                } else {
                    part = -1;
                    comp = l1;
                }
            }
            pivots[ pivotIndex<<1     ] = part;
            pivots[(pivotIndex<<1) + 1] = comp;
            pivotIndex++;
        }
        }
    }

    // fields

    /**
     * The hashcodes of the supplied strings.
     */

    private final int[] hashes;

    /**
     * Stores two ints for every string, an offset into the pivot array (-1 if
     * not necessary) and the depth of the decision tree that is rooted there.
     */

    private final int[] offsets;

    /**
     * Stores two ints for every decision, the index at which a character
     * comparison needs to be made, followed by the character value to be
     * compared against; or -1 to indicate a length comparison, followed by the
     * length to be compared against.
     */

    private final int[] pivots;

    /**
     * Cache a range object which indicates the range of hash values generated.
     */

    private final HashRange range;

    /**
     * Constructs a minimal perfect string hashing over the supplied strings.
     * 
     * @param values
     *            an array of unique non-null strings that will be reordered
     *            such that <code>hash(values[i]) == i</code>.
     */

    public PerfectStringHash(final String values[]) {
        final int length = values.length;
        if (length == 0) throw new IllegalArgumentException("No values supplied");

        final int[] hashes = new int[length];
        final int[] offsets = new int[2 * length];
        final int[] runLengths = new int[length];

        //sort values so that we can assume ordering by hashcode, length and char[]
        Arrays.sort(values, comparator);

        //pull the hashcodes into an array for analysis
        for (int i = 0; i < length; i++) hashes[i] = values[i].hashCode();

        //test for unique hashes
        int offset = 0;
        if (length > 1) {
            int previousHash = hashes[0];
            int runLength = 1;
            for (int i = 1; i <= length; i++) {
                int currentHash = i == length ? ~previousHash : hashes[i];
                if (currentHash == previousHash) {
                    runLength++;
                } else {
                    if (runLength > 1) {
                        final int firstIndex = i - runLength;
                        for (int j = i - 1; j >= firstIndex; j--) {
                            runLengths[j] = runLength;
                            //offset points to the first node in decision tree
                            offsets[ j<<1     ] = offset;
                            //adjustment is number of indices to first duplicate
                            offsets[(j<<1) + 1] = j - firstIndex;
                        }
                        //extra one for recording depth
                        offset += (Integer.highestOneBit(runLength - 1) << 1);
                        runLength = 1;
                    } else {
                        runLengths[i-1] = 1;
                        offsets[(i-1)<<1] = -1;
                    }
                }
                previousHash = currentHash;
            }
        }

        //shortcut for when all hashes are unique
        if (offset == 0) {
            this.hashes = hashes;
            this.offsets = null;
            this.pivots = null;
            this.range = new HashRange(0, length - 1);
            return;
        }

        //build the decision trees
        final int[] pivots = new int[offset * 2];
        for (int i = 0; i < length;) {
            final int runLength = runLengths[i];
            if (runLength > 1)  generatePivots(values, i, runLength, pivots, (int) offsets[i << 1]);
            i += runLength;
        }

        //setup our state
        this.pivots = pivots;
        this.offsets = offsets;
        this.hashes = hashes;
        this.range = new HashRange(0, length - 1);
    }

    // hash generator methods

    @Override
    public HashRange getRange() {
        return range;
    }

    @Override
    public BigInteger hashAsBigInt(String value) {
        return BigInteger.valueOf(hash(value));
    }

    //TODO decide whether to throw an IAE if -1 is returned from hash
    @Override
    public int hashAsInt(String value) {
        return hash(value);
    }

    @Override
    public long hashAsLong(String value) {
        return hash(value);
    }

    /**
     * Generates a hashcode for the supplied string.
     * 
     * @param value
     *            any string, not null
     * @return a minimal hashcode for the supplied string, or -1
     */

    private int hash(String value) {
        final int h = value.hashCode();
        final int index = Arrays.binarySearch(hashes, h);
        final int[] pivots = this.pivots;
        if (pivots == null || index < 0) return index;

        final int offset = offsets[index << 1];
        if (offset == -1) return index;

        final int depth = pivots[(offset << 1)    ];
        final int count = pivots[(offset << 1) + 1];
        int i = 0;
        for (int d = 0; d < depth; d++) {
            final int t = (offset + (1 << d) + i) << 1;
            final int part = pivots[t    ];
            final int comp = pivots[t + 1];
            final boolean right;
            if (part == Integer.MIN_VALUE) { //easy case - no right value
                right = false;
            } else if (part == -1) { //compare length
                right = value.length() > comp;
            } else { //lengths are equal, compare character
                right = value.charAt(part) > (char) comp;
            }
            i <<= 1;
            if (right) i++;
        }
        return i >= count ? -1 : index + i - offsets[(index << 1) + 1];
    }

}