package com.limegroup.gnutella.routing;
import com.limegroup.gnutella.FileManager;
import com.limegroup.gnutella.util.I18NConvert;
import com.limegroup.gnutella.util.StringUtils;
/**
* The official platform-independent hashing function for query-routing. The
* key property is that it allows interpolation of hash tables of different
* sizes. More formally, with x>=0, n>=0, k>=0, 0<=r<=n,<ul>
* <li>2 ^ k * hash(x, n) <= hash(x, n+k) < 2 ^ (k+1) * hash(x, n);</li>
* <li>hash(x, n-r) = int(hash(x, n) / 2 ^ r).</li>
* </ul>
*
* This version should now work cross-platform, however it is not intended
* to be secure, only very fast to compute. See Chapter 12.3.2. of CLR
* for details of multiplication-based algorithms.
*/
public class HashFunction {
    /**
     * Multiplier for the multiplication-method hash (CLR chapter 12.3.2):
     * the integer part of ((sqrt(5) - 1) / 2) * 2^31, the golden-ratio
     * constant scaled to 32 bits.  Multiplying by it spreads every byte
     * of the input across the high-order bits of the product.
     */
    private static final int A_INT = 0x4F1BBCDC;

    /**
     * Hashes an integer down to {@code bits} unsigned bits.
     *
     * @param x    the value to hash
     * @param bits the width of the result; the return value fits in
     *             {@code bits} unsigned bits, i.e. lies in
     *             {@code [0, 2^bits - 1]}
     * @return the {@code bits} highest bits of the low 32 bits of
     *         {@code x * A_INT}
     */
    private static int hashFast(int x, byte bits) {
        // The multiplication may overflow: only the low 32 bits of the
        // product are meant to survive.  The unsigned right shift then
        // keeps the "bits" most-significant of those, which are the
        // best-mixed ones.
        int product = x * A_INT;
        return product >>> (32 - bits);
    }

    /**
     * Returns the {@code bits}-bit hash of the lower-cased form of
     * {@code x}.  The result fits in {@code bits} unsigned bits, i.e.
     * lies in {@code [0, 2^bits - 1]}.
     *
     * @param x    the string to hash
     * @param bits the number of bits to use in the resulting answer
     * @return the hash value
     * @see #hash(String, int, int, byte)
     */
    public static int hash(String x, byte bits) {
        return hash(x, 0, x.length(), bits);
    }

    /**
     * Returns the same value as {@code hash(x.substring(start, end), bits)}
     * while avoiding the substring allocation.<p>
     *
     * Characters are lower-cased while hashing using a locale-neutral,
     * per-code-unit conversion over the string's UTF-16 representation,
     * so the result is stable across platforms and locales.  This maps
     * ALL Unicode characters that have a single lowercase mapping, not
     * just ASCII; no attempt is made to strip accents or diacritics.<p>
     *
     * The input is expected (but not required) to be in NFC canonical
     * form.  Lower-casing relies on the Unicode rules built into
     * {@code java.lang.Character}, excluding all special case rules
     * (N-to-1, 1-to-M, N-to-M, locale-sensitive and contextual).<p>
     *
     * A better scheme would lower-case in the Locale.US context (for
     * stability across servents) after converting to NFKD and stripping
     * diacritics from hashed keywords; if done, that belongs before the
     * query string is split into keywords.
     *
     * @param x     the string to hash
     * @param start the start offset of the substring to hash
     * @param end   just PAST the end of the substring to hash
     * @param bits  the number of bits to use in the resulting answer
     * @return the hash value
     */
    public static int hash(String x, int start, int end, byte bits) {
        // Step 1: fold x[start..end-1] into one int.  Each run of four
        // characters is treated as a little-endian quadword built from
        // the characters' low bytes, and the quadwords are XOR'ed
        // together.  Working one byte at a time (x XOR 0 == x) gives
        // implicit zero-padding, so a short tail needs no special case.
        int folded = 0;
        int shift = 0; // bit offset inside the current quadword: 0, 8, 16, 24
        for (int i = start; i < end; i++) {
            // Only the low 8 bits of the lower-cased UTF-16 code unit
            // contribute; the high byte (the Unicode page) is dropped,
            // as it is not widely distributed anyway.
            int lowByte = Character.toLowerCase(x.charAt(i)) & 0xFF;
            folded ^= lowByte << shift;
            shift = (shift + 8) % 32;
        }
        // Step 2: map the folded value into [0, 2^bits - 1].
        return hashFast(folded, bits);
    }

    /**
     * Returns the canonicalized keywords of the given file name, suitable
     * for passing to {@code hash(String, byte)}.<p>
     *
     * Note this is not consistent with the hash function's own case
     * handling: normalization here may apply locale-dependent and
     * special casing rules (N-to-1, 1-to-M, N-to-M, contextual),
     * unlike the simplified locale-neutral conversion in
     * <tt>hash(String, int, int, byte)</tt>.<p>
     *
     * A better approach would normalize in the Locale.US context (for
     * stability across servents) via NFKD with diacritics removed,
     * before splitting into keywords — which would also let
     * <tt>hash(String, int, int, byte)</tt> drop its toLowerCase call.
     *
     * @param filePath the name of the file to break into keywords; the
     *        keywords are subsequently hashed for inclusion in the bit
     *        vector
     * @return the normalized keyword tokens
     */
    public static String[] keywords(String filePath) {
        // TODO1: not a complete implementation — it should really
        // tokenize on ALL non-alphanumeric characters.
        // TODO2: an English-specific variant (plurals, stop words, ...)
        // could help, but only for our own files, since queries are
        // assumed to arrive already canonicalized.
        // TODO: a better canonicalForm(query) here would also strip
        // accents by converting to NFKD and keeping PRIMARY differences.
        String normalized = I18NConvert.instance().getNorm(filePath);
        return StringUtils.split(normalized, FileManager.DELIMITERS);
    }

    /**
     * Returns the index of the keyword starting at or after position
     * {@code i} of {@code query}, or -1 if there is none.
     *
     * @param query the string to scan
     * @param i     the position to start scanning from
     * @return the index of the first non-delimiter character at or
     *         after {@code i}, or -1 if only delimiters remain
     */
    public static int keywordStart(String query, int i) {
        // TODO3: could be |DELIMITERS| times faster with a Set built in
        // a static initializer, but that forces String allocations here.
        final String delims = FileManager.DELIMITERS;
        final int n = query.length();
        // Skip over delimiter characters.
        while (i < n && delims.indexOf(query.charAt(i)) >= 0)
            i++;
        return (i < n) ? i : -1;
    }

    /**
     * Returns the index just past the end of the keyword starting at
     * position {@code i} of {@code query}, or {@code query.length()}
     * if the keyword runs to the end of the string.
     *
     * @param query the string to scan
     * @param i     the position to start scanning from
     * @return the index of the first delimiter at or after {@code i},
     *         or {@code query.length()} if there is none
     */
    public static int keywordEnd(String query, int i) {
        // TODO3: see keywordStart.
        final String delims = FileManager.DELIMITERS;
        final int n = query.length();
        // Skip over non-delimiter characters.
        while (i < n && delims.indexOf(query.charAt(i)) < 0)
            i++;
        return i;
    }

    /**
     * Expands an array of keywords with prefixes of the longer ones.
     * Each word of five or more characters contributes two prefixes:
     * the word minus its last character, and minus its last two.
     *
     * @param words the original keywords
     * @return an array containing the original words plus the prefixes;
     *         the input array itself when no word is long enough
     *         (saves an allocation)
     */
    public static String[] getPrefixes(String[] words) {
        // 1. Count the extra slots needed: two per prefixable word.
        int extra = 0;
        for (String word : words) {
            if (word.length() > 4)
                extra += 2;
        }
        // 2. Nothing prefixable: hand the caller's array straight back.
        if (extra == 0)
            return words;
        // 3. Build the expanded array of words followed by their prefixes.
        final String[] result = new String[words.length + extra];
        int out = 0;
        for (String word : words) {
            result[out++] = word;
            final int len = word.length();
            if (len > 4) {
                result[out++] = word.substring(0, len - 1);
                result[out++] = word.substring(0, len - 2);
            }
        }
        return result;
    }
}