package ivory.ffg.data;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.Set;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.kamikaze.pfordelta.PForDelta;
/**
* Document vector representation using the hashing technique described
* in the SIGIR 2011 submission.
*
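* <p>A typical round trip, sketched under the assumption that {@code terms}
* and {@code queryTerms} hold non-negative term ids (both names are
* illustrative):</p>
*
* <pre>
* DocumentVectorHashedArray dv = DocumentVectorHashedArray.newInstance(terms);
* int[] hashedDocument = dv.decompressDocument();    // document in the hashed space
* int[] hashedQuery = dv.transformTerms(queryTerms); // query terms, mapped the same way
* </pre>
*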
* @author Nima Asadi
*/
public class DocumentVectorHashedArray implements DocumentVector {
private static final int[] CAPACITY = new int[]{256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
private static final int[] SHIFT = new int[]{8, 9, 10, 11, 12, 13, 14, 15};
private static final int[] MASK = new int[]{0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF};
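// Invariant tying the three tables together: CAPACITY[i] == 1 << SHIFT[i] and
// MASK[i] == CAPACITY[i] - 1, so hash(key, i) always falls in [0, CAPACITY[i]).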
/**
* \theta_{Collisions}
*/
private static final int MAX_COLLISIONS = 20;
private int[] exceptionBlock; //Block that contains the exceptions
private int[] compBlock; //Compressed document vector
private int documentLength; //Length of the original document
private byte exceptionLength; //Length of the exception table
private byte mask; //Stores 32 - \omega_m (the shift that keeps the low \omega_m bits)
private byte hashIndex; //MASK[hashIndex] is \omega_h
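// On-disk layout, as written by write() and consumed by readFields():
//   exceptionLength (byte); if nonzero, exceptionBlock length (int) followed by its ints;
//   documentLength (int); compBlock length (int) followed by its ints; mask (byte); hashIndex (byte).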
@Override public void write(DataOutput output) throws IOException {
Preconditions.checkNotNull(output);
output.writeByte(exceptionLength);
if(exceptionLength > 0) {
output.writeInt(exceptionBlock.length);
for(int i = 0; i < exceptionBlock.length; i++) {
output.writeInt(exceptionBlock[i]);
}
}
output.writeInt(documentLength);
output.writeInt(compBlock.length);
for(int i = 0; i < compBlock.length; i++) {
output.writeInt(compBlock[i]);
}
output.writeByte(mask);
output.writeByte(hashIndex);
}
@Override public void readFields(DataInput input) throws IOException {
Preconditions.checkNotNull(input);
exceptionLength = input.readByte();
if(exceptionLength > 0) {
exceptionBlock = new int[input.readInt()];
for(int i = 0; i < exceptionBlock.length; i++) {
exceptionBlock[i] = input.readInt();
}
}
documentLength = input.readInt();
compBlock = new int[input.readInt()];
for(int i = 0; i < compBlock.length; i++) {
compBlock[i] = input.readInt();
}
mask = input.readByte();
hashIndex = input.readByte();
}
/**
* Reads and returns a document vector from the given input.
*
* @param input DataInput to read from
* @return A document vector
* @throws IOException if reading from the input fails
*/
public static DocumentVectorHashedArray readInstance(DataInput input) throws IOException {
Preconditions.checkNotNull(input);
DocumentVectorHashedArray document = new DocumentVectorHashedArray();
document.readFields(input);
return document;
}
private DocumentVectorHashedArray() {
}
// This hash function can be replaced by any integer hash function.
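// Example with hashIndex = 0 (an 8-bit table): the bits above bit 8 are folded
// onto the low 8 bits with XOR, e.g.
// hash(0x1234, 0) = ((0x1234 >>> 8) ^ (0x1234 & 0xFF)) & 0xFF = 0x12 ^ 0x34 = 0x26.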
private static int hash(int key, int hashIndex) {
// Preconditions.checkArgument(hashIndex >= 0 && hashIndex < SHIFT.length);
return ((key >>> SHIFT[hashIndex]) ^ (key & MASK[hashIndex])) & MASK[hashIndex];
}
//Applies \omega_m: keeps the low-order \omega_m (= 32 - mask) bits of the key
private int lowMask(int key) {
return (key & (0xFFFFFFFF >>> mask));
}
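// Maps query term ids into the space the document was encoded in. Cases 1 and 3
// apply only the low-bit mask; case 2(a) masks and then hashes; case 2(b) first
// consults the exception table and hashes only the terms that are not exceptions.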
@Override public int[] transformTerms(int[] terms) {
Preconditions.checkNotNull(terms);
int[] hashedTerms = new int[terms.length];
if(exceptionLength == 0) {
if(hashIndex < CAPACITY.length) {
// Case 2(a)
for(int i = 0; i < terms.length; i++) {
hashedTerms[i] = hash(lowMask(terms[i]), hashIndex);
}
return hashedTerms;
} else {
// Cases 1 and 3: no hash table in use, so only the low-bit mask applies
for(int i = 0; i < terms.length; i++) {
hashedTerms[i] = lowMask(terms[i]);
}
return hashedTerms;
}
} else {
// Case 2(b)
int[] exception = new int[exceptionLength];
PForDelta.decompressOneBlock(exception, exceptionBlock, exceptionLength);
for(int i = 0; i < terms.length; i++) {
hashedTerms[i] = lowMask(terms[i]);
}
boolean inserted;
for(int t = 0; t < hashedTerms.length; t++) {
inserted = false;
for(int i = 0; i < exception.length; i += 2) {
if(exception[i] == hashedTerms[t]) {
hashedTerms[t] = exception[i + 1];
inserted = true;
break;
}
}
if(!inserted) {
hashedTerms[t] = hash(hashedTerms[t], hashIndex);
}
}
return hashedTerms;
}
}
@Override public int getDocumentLength() {
return documentLength;
}
@Override public int[] decompressDocument() throws IOException {
int[] decomp = new int[documentLength];
PForDelta.decompressOneBlock(decomp, compBlock, documentLength);
return decomp;
}
@Override public int[][] decompressPositions(int[] terms) throws IOException {
Preconditions.checkNotNull(terms);
return DocumentVectorUtility.getPositions(decompressDocument(), transformTerms(terms));
}
/**
* Creates an instance of this class by transforming the given
* document vector into the new (hashed) space. Note that the input
* array is modified in place during the transformation.
*
* @param document Input document vector (a flat array of term ids).
* @return Transformed document vector, including the hash-function parameters.
*/
public static DocumentVectorHashedArray newInstance(int[] document) {
Preconditions.checkNotNull(document);
Set<Integer> termids = Sets.newHashSet();
Set<Integer> origTermids = Sets.newHashSet();
// Construct the set of unique terms within the input document.
for(int i = 0; i < document.length; i++) {
origTermids.add(document[i]);
}
// Search for \omega_m: greedily clear the highest-order bits of every term id,
// stopping at the first bit whose removal merges two distinct term ids.
int msb = 31;
for(int i = 31; i >= 0; i--) {
int mask = ~(1<<i);
int[] tempDocument = new int[document.length];
termids.clear();
for(int j = 0; j < document.length; j++) {
int d = document[j];
tempDocument[j] = (d & mask);
termids.add(tempDocument[j]);
}
int collision = origTermids.size() - termids.size();
if(collision == 0) {
for(int j = 0; j < document.length; j++) {
document[j] = tempDocument[j];
}
msb = i;
} else {
break;
}
}
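// Example: if every term id fits in 20 bits and clearing bit 19 would merge two
// distinct ids, the loop ends with msb = 20, giving mask = 32 - 20 = 12 below.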
// Case 1: if \omega_m <= \theta_\omega
if(msb <= SHIFT[0]) {
DocumentVectorHashedArray compDocument = new DocumentVectorHashedArray();
compDocument.compBlock = PForDelta.compressOneBlockOpt(document, document.length);
compDocument.documentLength = document.length;
compDocument.exceptionLength = (byte) 0;
compDocument.hashIndex = (byte) CAPACITY.length;
compDocument.mask = (byte) (32 - msb);
return compDocument;
}
termids.clear();
for(int i = 0; i < document.length; i++) {
termids.add(document[i]);
}
int[] uniqueTerms = new int[termids.size()];
int uniqueIndex = 0;
for(int term: termids) {
uniqueTerms[uniqueIndex++] = term;
}
Arrays.sort(uniqueTerms);
int hashIndex = 0;
for(int i = 0; i < CAPACITY.length; i++) {
if(uniqueTerms.length < CAPACITY[i] - 1) { //excluding zero
hashIndex = i;
break;
}
}
List<Integer> collisions = Lists.newArrayList();
List<Integer> code = Lists.newArrayList();
// Search for \omega_h such that |Collisions| <= \theta_{collisions}
do {
termids.clear();
collisions.clear();
for(int i = uniqueTerms.length - 1; i >= 0; i--) {
int h = hash(uniqueTerms[i], hashIndex);
if(!termids.contains(h)) {
termids.add(h);
} else {
collisions.add(uniqueTerms[i]);
}
}
hashIndex++;
} while(collisions.size() > MAX_COLLISIONS && hashIndex < CAPACITY.length);
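// The loop increments once past the accepted table; step back to the
// hash index actually used in the last attempt.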
hashIndex--;
if(collisions.size() == 0) {
// Case 2(a): |Collisions| == 0
for(int i = 0; i < document.length; i++) {
document[i] = hash(document[i], hashIndex);
}
DocumentVectorHashedArray compDocument = new DocumentVectorHashedArray();
compDocument.compBlock = PForDelta.compressOneBlockOpt(document, document.length);
compDocument.documentLength = document.length;
compDocument.exceptionLength = (byte) 0;
compDocument.hashIndex = (byte) hashIndex;
compDocument.mask = (byte) (32 - msb);
return compDocument;
} else if(collisions.size() > MAX_COLLISIONS) {
// Case 3: |Collisions| > \theta_collisions for all possible \omega_h
DocumentVectorHashedArray compDocument = new DocumentVectorHashedArray();
compDocument.compBlock = PForDelta.compressOneBlockOpt(document, document.length);
compDocument.documentLength = document.length;
compDocument.exceptionLength = (byte) 0;
compDocument.hashIndex = (byte) CAPACITY.length;
compDocument.mask = (byte) (32 - msb);
return compDocument;
}
// Begin Case 2(b): find exceptions and manually construct a hash table for them
int id = 1;
for(int i = 0; i < collisions.size(); i++) {
while(termids.contains(id)) {
id++;
}
code.add(id);
termids.add(id);
}
// Encode exceptions as a flat array of <term, assigned code> pairs
int[] exception = new int[collisions.size() * 2];
int pos = 0;
for(int i = 0; i < collisions.size(); i++) {
exception[pos++] = collisions.get(i);
exception[pos++] = code.get(i);
}
// Encode the document using the manual table as well as the integer hash function,
// with parameters \omega_h and \omega_m
for(int i = 0; i < document.length; i++) {
boolean inserted = false;
for(int j = 0; j < exception.length; j += 2) {
if(exception[j] == document[i]) {
document[i] = exception[j + 1];
inserted = true;
break;
}
}
if(!inserted) {
document[i] = hash(document[i], hashIndex);
}
}
DocumentVectorHashedArray compDocument = new DocumentVectorHashedArray();
compDocument.compBlock = PForDelta.compressOneBlockOpt(document, document.length);
compDocument.exceptionBlock = PForDelta.compressOneBlockOpt(exception, exception.length);
compDocument.documentLength = document.length;
compDocument.exceptionLength = (byte) exception.length;
compDocument.hashIndex = (byte) hashIndex;
compDocument.mask = (byte) (32 - msb);
return compDocument;
}
@Override public boolean equals(Object o) {
if(this == o) {
return true;
}
// Return false instead of throwing so the equals() contract is honored.
if(!(o instanceof DocumentVectorHashedArray)) {
return false;
}
DocumentVectorHashedArray other = (DocumentVectorHashedArray) o;
if(this.hashIndex != other.hashIndex || this.mask != other.mask ||
this.exceptionLength != other.exceptionLength || this.documentLength != other.documentLength) {
return false;
}
// Arrays.equals is null-safe; exceptionBlock is null when there are no exceptions.
return Arrays.equals(this.compBlock, other.compBlock)
&& Arrays.equals(this.exceptionBlock, other.exceptionBlock);
}
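// equals() is overridden above, so hashCode() must stay consistent with it.
// A minimal sketch: Arrays.hashCode is null-safe for the optional exception block.
@Override public int hashCode() {
int result = 31 * Arrays.hashCode(compBlock) + Arrays.hashCode(exceptionBlock);
result = 31 * result + documentLength;
result = 31 * result + exceptionLength;
result = 31 * result + mask;
return 31 * result + hashIndex;
}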
}