/* * Copyright 2001-2004 Sean Owen * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.planetj.math.rabinhash; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.ObjectOutputStream; import java.io.Serializable; import java.io.ObjectInputStream; import java.net.URL; import java.nio.ByteBuffer; import java.nio.IntBuffer; /** * <p>This class provides an implementation of a hash function based on Rabin fingerprints, one * which can efficiently produce a 32-bit hash value for a sequence of bytes. It does so by considering * strings of bytes as large polynomials over GF(2) -- that is, with coefficients of 0 and 1 -- * and then reducing them modulo some irreducible polynomial of degree 32. The result is a hash function * with very satisfactory properties. In addition the polynomial operations are fast in hardware; * even in this Java implementation the speed is reasonable.</p> * * <p>Methods in this class can compute a hash value for an array of bytes, chars or ints, as well as * any {@link java.io.Serializable} object, String, file, or resource denoted by URL.</p> * * <p>Methods of this class are all thread-safe, and hash function objects are immutable.</p> * * <p>Polynomials of degree 32 are used frequently in this code, and are represented efficiently as * <code>int</code>s. An <code>int</code> has 32 bits, whereas a polynomial of degree 32 has 33 coefficients. * Therefore, the high-order bit of the <code>int</code> is the degree 31 term's * coefficient, and the low-order bit is the constant coefficient.</p> * * <p>For example the integer 0x00000803, in binary, is:</p> * * <p><code>00000000 00000000 00001000 00000011</code></p> * * <p>Therefore it correponds to the polynomial:</p> * * <p><code>x<sup>32</sup> + x<sup>11</sup> + x + 1</code></p> * * <p>The implementation is derived from the paper "Some applications of Rabin's fingerprinting method" * by Andrei Broder. See <a href="http://server3.pa-x.dec.com/SRC/publications/src-papers.html"> * http://server3.pa-x.dec.com/SRC/publications/src-papers.html</a> for a full citation and the paper * in PDF format.</p> * * * @author Sean Owen * @version 2.0 * @since 2.0 */ public final class RabinHashFunction32 implements Serializable, Cloneable { /** Represents x<sup>32</sup> + x<sup>7</sup> + x<sup>3</sup> + x<sup>2</sup> + 1. */ private static final int DEFAULT_IRREDUCIBLE_POLY = 0x0000008D; /** Default hash function, provided for convenience. */ public static final RabinHashFunction32 DEFAULT_HASH_FUNCTION = new RabinHashFunction32(DEFAULT_IRREDUCIBLE_POLY); private static final int P_DEGREE = 32; private static final int X_P_DEGREE = 1 << (P_DEGREE - 1); private static final int READ_BUFFER_SIZE = 1024; private final int P; private transient int[] table32, table40, table48, table56; /** * <p>Creates a RabinHashFunction32 based on the specified polynomial.</p> * * <p>This class does not test the polynomial for irreducibility; therefore this constructor should * only be used with polynomials that are already known to be irreducible, or else the hash function * will not perform optimally.</p> * * @param P a degree 32 polynomial over GF(2), represented as an <code>int</code> */ public RabinHashFunction32(final int P) { this.P = P; initializeTables(); } private void initializeTables() { final int[] mods = new int[P_DEGREE]; // We want to have mods[i] == x^(P_DEGREE+i) mods[0] = P; for (int i = 1; i < P_DEGREE; i++) { final int lastMod = mods[i - 1]; // x^i == x(x^(i-1)) (mod P) int thisMod = lastMod << 1; // if x^(i-1) had a x_(P_DEGREE-1) term then x^i has a // x^P_DEGREE term that 'fell off' the top end. // Since x^P_DEGREE == P (mod P), we should add P // to account for this: if ((lastMod & X_P_DEGREE) != 0) { thisMod ^= P; } mods[i] = thisMod; } // Let i be a number between 0 and 255 (i.e. a byte). // Let its bits be b0, b1, ..., b7. // Let Q32 be the polynomial b0*x^39 + b1*x^38 + ... + b7*x^32 (mod P). // Then table32[i] is Q32, represented as an int (see below). // Likewise Q40 be the polynomial b0*x^47 + b1*x^46 + ... + b7*x^40 (mod P). // table40[i] is Q40, represented as an int. Likewise table48 and table56. table32 = new int[256]; table40 = new int[256]; table48 = new int[256]; table56 = new int[256]; for (int i = 0; i < 256; i++) { int c = i; for (int j = 0; j < 8 && c > 0; j++) { if ((c & 1) != 0) { table32[i] ^= mods[j]; table40[i] ^= mods[j + 8]; table48[i] ^= mods[j + 16]; table56[i] ^= mods[j + 24]; } c >>>= 1; } } } /** * @return irreducible polynomial used in this hash function, represented as an <code>int</code> */ public int getP() { return P; } private int computeWShifted(final int w) { return table32[w & 0xFF] ^ table40[(w >>> 8) & 0xFF] ^ table48[(w >>> 16) & 0xFF] ^ table56[(w >>> 24) & 0xFF]; } /** * <p>Return the Rabin hash value of an array of bytes.</p> * * @param A the array of bytes * @return the hash value * @throws NullPointerException if A is null */ public int hash(final byte[] A) { return hash(A, 0, A.length, 0); } int hash(final byte[] A, final int offset, final int length, int w) { int s = offset; // First, process a few bytes so that the number of bytes remaining is a multiple of 4. // This makes the later loop easier. final int starterBytes = length % 4; if (starterBytes != 0) { final int max = offset + starterBytes; while (s < max) { w = (w << 8) ^ (A[s] & 0xFF); s++; } } final int max = offset + length; while (s < max) { w = computeWShifted(w) ^ (A[s] << 24) ^ ((A[s + 1] & 0xFF) << 16) ^ ((A[s + 2] & 0xFF) << 8) ^ (A[s + 3] & 0xFF); s += 4; } return w; } /** * <p>Return the Rabin hash value of an array of chars.</p> * * @param A the array of chars * @return the hash value * @throws NullPointerException if A is null */ public int hash(final char[] A) { int w, s; // If an odd number of characters, process the first char so that the number remaining // is a multiple of 2. This makes the later loop easier. if (A.length % 2 == 1) { w = A[0] & 0xFFFF; s = 1; } else { w = 0; s = 0; } while (s < A.length) { w = computeWShifted(w) ^ ((A[s] & 0xFFFF) << 16) ^ (A[s + 1] & 0xFFFF); s += 2; } return w; } /** * <p>Returns the Rabin hash value of an array of <code>int</code>s. This method is the most efficient of * all the hash methods, so it should be used when possible.</p> * * @param A array of <code>int</code>s * @return the hash value * @throws NullPointerException if A is null */ public int hash(final int[] A) { int w = 0; for (int s = 0; s < A.length; s++) { w = computeWShifted(w) ^ A[s]; } return w; } /** * <p>Returns the Rabin hash value of a ByteBuffer.</p> * * @param A ByteBuffer * @return the hash value * @throws NullPointerException if A is null */ public int hash(final ByteBuffer A) { return hash(A.asIntBuffer()); } /** * <p>Returns the Rabin hash value of an IntBuffer.</p> * * @param A IntBuffer * @return the hash value * @throws NullPointerException if A is null */ public int hash(final IntBuffer A) { int w = 0; while (A.hasRemaining()) { w = computeWShifted(w) ^ A.get(); } return w; } /** * <p>Returns the Rabin hash value of a serializable object.</p> * * @return the hash value * @param obj the object to be hashed * @throws NullPointerException if obj is null */ public int hash(final Serializable obj) { if (obj == null) { throw new NullPointerException(); } final ByteArrayOutputStream baos = new ByteArrayOutputStream(); try { final ObjectOutputStream oos = new ObjectOutputStream(baos); oos.writeObject(obj); } catch (IOException ioe) { // can't happen } return hash(baos.toByteArray()); } /** * <p>Computes the Rabin hash value of a String.</p> * * @param s the string to be hashed * @return the hash value * @throws NullPointerException if s is null */ public int hash(final String s) { return hash(s.toCharArray()); } /** * <p>Computes the Rabin hash value of the contents of a file.</p> * * @return the hash value of the file * @param f the file to be hashed * @throws FileNotFoundException if the file cannot be found * @throws IOException if an error occurs while reading the file * @throws NullPointerException if f is null */ public int hash(final File f) throws FileNotFoundException, IOException { if (f == null) { throw new NullPointerException(); } final FileInputStream fis = new FileInputStream(f); try { return hash(fis); } finally { fis.close(); } } /** * <p>Computes the Rabin hash value of the contents of a file, specified by URL.</p> * * @return the hash value of the file * @param url the URL of the file to be hashed * @throws IOException if an error occurs while reading from the URL * @throws NullPointerException if url is null */ public int hash(final URL url) throws IOException { final InputStream is = url.openStream(); try { return hash(is); } finally { is.close(); } } /** * <p>Computes the Rabin hash value of the data from an <code>InputStream</code>.</p> * * @param is the InputStream to hash * @return the hash value of the data from the InputStream * @throws IOException if an error occurs while reading from the InputStream * @throws NullPointerException if stream is null */ public int hash(final InputStream is) throws IOException { final byte[] buffer = new byte[READ_BUFFER_SIZE]; int w = 0; int bytesRead; while ((bytesRead = is.read(buffer)) > 0) { w = hash(buffer, 0, bytesRead, w); } return w; } public boolean equals(final Object o) { return o instanceof RabinHashFunction32 && ((RabinHashFunction32) o).P == P; } public int hashCode() { return P; } public String toString() { return "RabinHashFunction32[P: " + RabinHashFunctionUtils.polynomialToString(P) + "]"; } private void readObject(final ObjectInputStream stream) throws IOException, ClassNotFoundException { stream.defaultReadObject(); initializeTables(); } }