HashFunction.java example

Explorer
learning-bittorrent-master
package com.limegroup.gnutella.routing;

import com.limegroup.gnutella.FileManager;
import com.limegroup.gnutella.util.I18NConvert;
import com.limegroup.gnutella.util.StringUtils;

/** 
 * The official platform-independent hashing function for query-routing.  The
 * key property is that it allows interpolation of hash tables of different
 * sizes.  More formally, with x>=0, n>=0, k>=0, 0<=r<=n,<ul>
 * <li>2 ^ k * hash(x, n) <= hash(x, n+k) < 2 ^ (k+1) * hash(x, n);</li>
 * <li>hash(x, n-r) = int(hash(x, n) / 2 ^ r).</li>
 * </ul>
 *
 * This version should now work cross-platform, however it is not intended
 * to be secure, only very fast to compute.  See Chapter 12.3.2. of CLR
 * for details of multiplication-based algorithms.
 */
public class HashFunction {
    //private static final double A=(Math.sqrt(5.0)-1.0)/2.0;
    //private static final long TWO_31=0x80000000l;
    //private static final int A_INT=(int)(A*TWO_31); //=1327217884
    private static final int A_INT=0x4F1BBCDC;
        
    /**
     * Returns the n-<b>bit</b> hash of x, where n="bits".  That is, the
     * returned value value can fit in "bits" unsigned bits, and is
     * between 0 and (2^bits)-1.
     */
    private static int hashFast(int x, byte bits) {
        // Keep only the "bits" highest bits of the 32 *lowest* bits of the
        // product (ignore overflowing bits of the 64-bit product result).
        // The constant factor should distribute equally each byte of x in
        // the returned bits.
        return (int)(x * A_INT) >>> (32 - bits);
    }

    /*
     * Returns the n-bit hash of x.toLowerCase(), where n=<tt>bits</tt>.
     * That is, the returned value value can fit in "<tt>bits</tt>" unsigned
     * bits, and is between 0 and <tt>(2 ^ bits) - 1</tt>.
     *
     * @param x the string to hash
     * @param bits the number of bits to use in the resulting answer
     * @return the hash value
     * @see hash(String,int,int,byte)
     */    
    public static int hash(String x, byte bits) {
        return hash(x, 0, x.length(), bits);
    }       

    /**
     * Returns the same value as hash(x.substring(start, end), bits), but tries
     * to avoid allocations.<p>
     *
     * Note that x is lower-cased when hashing, using a locale-neutral
     * character case conversion based on the UTF-16 representation of the
     * source string to hash.  So it is stable across all platforms and locales.
     * However this does not only convert ASCII characters but ALL Unicode
     * characters having a single lowercase mapping character.  No attempt is
     * made here to remove accents and diacritics.<p>
     *
     * The string is supposed to be in NFC canonical form, but this is not
     * enforced here.  Conversion to lowercase of characters uses Unicode rules
     * built into the the java.lang.Character core class, excluding all special
     * case rules (N-to-1, 1-to-M, N-to-M, locale-sensitive and contextual).<p>
     *
     * A better way to hash strings would be to use String conversion in the
     * Locale.US context (for stability across servents) after transformation
     * to NFKD and removal of all diacritics from hashed keywords.  If needed,
     * this should be done before splitting the query string into hashable
     * keywords.
     *
     * @param x the string to hash
     * @param bits the number of bits to use in the resulting answer
     * @param start the start offset of the substring to hash
     * @param end just PAST the end of the substring to hash
     * @return the hash value 
     */   
    public static int hash(String x, int start, int end, byte bits) {
        //1. First turn x[start...end-1] into a number by treating all 4-byte
        //chunks as a little-endian quadword, and XOR'ing the result together.
        //We pad x with zeroes as needed. 
        //    To avoid having do deal with special cases, we do this by XOR'ing
        //a rolling value one byte at a time, taking advantage of the fact that
        //x XOR 0==x.
        int xor=0;  //the running total
        int j=0;    //the byte position in xor.  INVARIANT: j==8*((i-start)%4)
        for (int i=start; i<end; i++) {
            // internationalization be damned? Not a problem here:
            // we just hash the lower 8 bits of the lowercase UTF-16 code-units
            // representing characters, ignoring only the high 8 bits that
            // indicate a Unicode page, and it is not very widely distributed
            // even though they could also have feeded the hash function.
            xor ^= (Character.toLowerCase(x.charAt(i)) & 0xFF) << j;
            j = (j + 8) & 24;
        }
        //2. Now map number to range 0 - (2^bits-1).
        return hashFast(xor, bits);
    }


    /** 
     * Returns a list of canonicalized keywords in the given file name, suitable
     * for passing to hash(String,int).  The returned keywords are
     * lower-cased, though that is not strictly needed as hash ignores
     * case.<p>
     *
     * This function is not consistent for case conversion: it uses a locale
     * dependant String conversion, which also considers special casing rules
     * (N-to-1, 1-to-M, N-to-N, locale-sensitive and contextual variants),
     * unlike the simplified case conversion done in
     * <tt>hash(String, int, int, byte)</tt>, which is locale-neutral.<p>
     *
     * A better way to hash strings would be to use String conversion in the
     * Locale.US context (for stability across servents) after transformation
     * to NFKD and removal of all diacritics from hashed keywords.  If needed,
     * this should be done before splitting the file name string into hashable
     * keywords. Then we should remove the unneeded toLowerCase() call in
     * the <tt>hash(String, int, int, byte)</tt> function.
     * 
     * @param fileName The name of the file to break up into keywords.  These
     *  keywords will subsequently be hashed for inclusion in the bit vector.
     */
    public static String[] keywords(String filePath) {
        //TODO1: this isn't a proper implementation.  It should really be
        //to tokenized by ALL non-alphanumeric characters.

        //TODO2: perhaps we should do an English-specific version that accounts
        //for plurals, common keywords, etc.  But that's only necessary for 
        //our own files, since the assumption is that queries have already been
        //canonicalized. 
        return StringUtils.split(
            // TODO: a better canonicalForm(query) function here that
            // also removes accents by converting first to NFKD and keeping
            // only PRIMARY differences
            I18NConvert.instance().getNorm(filePath),
            FileManager.DELIMITERS);
    }

    /** 
     * Returns the index of the keyword starting at or after the i'th position
     * of query, or -1 if no such luck.
     */
    public static int keywordStart(String query, int i) {
        //Search for the first character that is not a delimiterer TODO3: we can
        //make this O(|DELIMITERS|) times faster by converting
        //FileManager.DELIMITERS into a Set in this' static initializer.  But
        //then we have to allocate Strings here.  Can work around the problem,
        //but it's trouble.
        final String DELIMITERS=FileManager.DELIMITERS;
        for ( ; i<query.length() ; i++) {
            char c=query.charAt(i);
            //If c not in DELIMITERS, declare success.
            if (DELIMITERS.indexOf(c)<0)
                return i;
        }
        return -1;
    }   

    /** 
     * Returns the index just past the end of the keyword starting at the i'th
     * position of query, or query.length() if no such index.
     */
    public static int keywordEnd(String query, int i) {
        //Search for the first character that is a delimiter.  
        //TODO3: see above
        final String DELIMITERS=FileManager.DELIMITERS;
        for ( ; i<query.length() ; i++) {
            char c=query.charAt(i);
            //If c in DELIMITERS, declare success.
            if (DELIMITERS.indexOf(c)>=0)
                return i;
        }
        return query.length();
    }    
        

    /**
     * @return an array of strings with the original strings and prefixes
     */
    public static String[] getPrefixes(String[] words){
        // 1. Count the number of words that can have prefixes (5 chars or more)
        int prefixable = 0;
        for (int i = 0; i < words.length; i++) {
            if (words[i].length() > 4)
                prefixable++;
        }
        // 2. If none, just returns the same words (saves allocations)
        if (prefixable == 0)
            return words;
        // 3. Create an expanded array with words and prefixes
        final String[] retArray = new String[words.length + prefixable * 2];
        int j = 0;
        for (int i = 0; i < words.length; i++) {
            final String word = words[i];
            retArray[j++] = word;
            final int len = word.length();
            if (len > 4) {
                retArray[j++] = word.substring(0, len - 1);
                retArray[j++] = word.substring(0, len - 2);
            }
        }
        return retArray;
    }
}