package com.cse10.duplicateDetector;
import java.util.List;
import java.util.Locale;
/**
 * Computes SimHash fingerprints (64 bit and 32 bit) for text documents.
 *
 * <p>A document is split into tokens by the supplied {@link WordSegmenter}; every
 * token is lower-cased and hashed by a {@link HashCalculator}. For each bit position
 * a counter is incremented when the token hash has that bit set and decremented
 * otherwise. The fingerprint has a bit set wherever the final counter is positive,
 * i.e. wherever more tokens had the bit set than cleared.
 *
 * <p>Created by Chamath on 1/2/2015.
 */
public class SimHashCalculator {

    /** Number of bits in the fingerprint produced by {@link #getSimhash64Value(String)}. */
    private static final int BITS_64 = 64;

    /** Number of bits in the fingerprint produced by {@link #getSimhash32Value(String)}. */
    private static final int BITS_32 = 32;

    private final WordSegmenter wordSegmenter;
    private final HashCalculator hashCalculator;

    /**
     * Creates a calculator that tokenizes documents with the given segmenter.
     *
     * @param wordSegmenter segmenter used to split a document into tokens
     */
    public SimHashCalculator(WordSegmenter wordSegmenter) {
        this.wordSegmenter = wordSegmenter;
        this.hashCalculator = new HashCalculator();
    }

    /**
     * Calculates the 64 bit signature/fingerprint value for the given document.
     *
     * @param document text to fingerprint
     * @return the 64 bit SimHash; {@code 0} when the segmenter yields no tokens
     */
    public long getSimhash64Value(String document) {
        int[] votes = new int[BITS_64];
        List<String> tokens = wordSegmenter.getWords(document);
        for (String token : tokens) {
            // Locale.ROOT keeps the fingerprint independent of the JVM default locale.
            String normalized = token.toLowerCase(Locale.ROOT);
            addVotes(votes, hashCalculator.getHash64Value(normalized));
        }
        return collapseVotes(votes);
    }

    /**
     * Calculates the 32 bit signature/fingerprint value for the given document.
     *
     * <p>Tokens are lower-cased exactly as in {@link #getSimhash64Value(String)} so
     * both fingerprints treat letter case the same way.
     *
     * @param document text to fingerprint
     * @return the 32 bit SimHash held in the low 32 bits of the result; the upper
     *         32 bits are always zero, so the value is never negative
     */
    public long getSimhash32Value(String document) {
        int[] votes = new int[BITS_32];
        List<String> tokens = wordSegmenter.getWords(document);
        for (String token : tokens) {
            String normalized = token.toLowerCase(Locale.ROOT);
            // The int hash is widened to long; only its low 32 bits are inspected
            // because the vote array has 32 entries.
            addVotes(votes, hashCalculator.getHash32Value(normalized));
        }
        return collapseVotes(votes);
    }

    /**
     * Adds one token's contribution to the per-bit counters: +1 where the hash has
     * the bit set, -1 where it is clear. Only the lowest {@code votes.length} bits
     * of {@code tokenHash} are examined.
     */
    private static void addVotes(int[] votes, long tokenHash) {
        for (int bit = 0; bit < votes.length; bit++) {
            if (((tokenHash >>> bit) & 1L) == 1L) {
                votes[bit]++;
            } else {
                votes[bit]--;
            }
        }
    }

    /**
     * Builds the fingerprint from the per-bit counters: bit {@code i} is set when
     * {@code votes[i]} is strictly positive. Ties (counter equal to zero) leave the
     * bit cleared.
     */
    private static long collapseVotes(int[] votes) {
        long fingerprint = 0L;
        for (int bit = 0; bit < votes.length; bit++) {
            // Threshold is 0, not 1: a majority of a single token must still count,
            // otherwise every one-token document would collapse to fingerprint 0.
            if (votes[bit] > 0) {
                fingerprint |= 1L << bit;
            }
        }
        return fingerprint;
    }
}