package org.weiboad.ragnar.server.util;
import java.math.BigInteger;
import java.util.StringTokenizer;
public class SimHash {
private String tokens;
private BigInteger strSimHash;
private int hashbits = 128;
public SimHash(String tokens) {
this.tokens = tokens;
this.strSimHash = this.simHash();
}
public SimHash(String tokens, int hashbits) {
this.tokens = tokens;
this.hashbits = hashbits;
this.strSimHash = this.simHash();
}
public BigInteger getSimHash() {
return this.strSimHash;
}
public BigInteger simHash() {
int[] v = new int[this.hashbits];
StringTokenizer stringTokens = new StringTokenizer(this.tokens);
while (stringTokens.hasMoreTokens()) {
String temp = stringTokens.nextToken();
BigInteger t = this.hash(temp);
for (int i = 0; i < this.hashbits; i++) {
BigInteger bitmask = new BigInteger("1").shiftLeft(i);
if (t.and(bitmask).signum() != 0) {
v[i] += 1;
} else {
v[i] -= 1;
}
}
}
BigInteger fingerprint = new BigInteger("0");
for (int i = 0; i < this.hashbits; i++) {
if (v[i] >= 0) {
fingerprint = fingerprint.add(new BigInteger("1").shiftLeft(i));
}
}
return fingerprint;
}
private BigInteger hash(String source) {
if (source == null || source.length() == 0) {
return new BigInteger("0");
} else {
char[] sourceArray = source.toCharArray();
BigInteger x = BigInteger.valueOf(((long) sourceArray[0]) << 7);
BigInteger m = new BigInteger("1000003");
BigInteger mask = new BigInteger("2").pow(this.hashbits).subtract(
new BigInteger("1"));
for (char item : sourceArray) {
BigInteger temp = BigInteger.valueOf((long) item);
x = x.multiply(m).xor(temp).and(mask);
}
x = x.xor(new BigInteger(String.valueOf(source.length())));
if (x.equals(new BigInteger("-1"))) {
x = new BigInteger("-2");
}
return x;
}
}
public int hammingDistance(SimHash other) {
BigInteger m = new BigInteger("1").shiftLeft(this.hashbits).subtract(
new BigInteger("1"));
BigInteger x = this.strSimHash.xor(other.strSimHash).and(m);
int tot = 0;
while (x.signum() != 0) {
tot += 1;
x = x.and(x.subtract(new BigInteger("1")));
}
return tot;
}
/*public static void main(String[] args) {
String s = "This is a test string for testing";
SimHash hash1 = new SimHash(s, 128);
System.out.println(hash1.strSimHash + " " + hash1.strSimHash.bitLength());
s = "This is a test string for testing also";
SimHash hash2 = new SimHash(s, 128);
System.out.println(hash2.strSimHash+ " " + hash2.strSimHash.bitCount());
s = "This is a test string for testing als";
SimHash hash3 = new SimHash(s, 128);
System.out.println(hash3.strSimHash+ " " + hash3.strSimHash.bitCount());
System.out.println("============================");
System.out.println(hash1.hammingDistance(hash2));
System.out.println(hash1.hammingDistance(hash3));
}*/
}