package ivory.ffg.data;
import ivory.bloomir.data.CompressedPostings;
import ivory.core.data.index.TermPositions;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.kamikaze.pfordelta.PForDelta;
/**
* A compressed positional postings list representation. This class uses
* PForDelta to compress a given set of document ids into
* blocks of equal size.
*
* In this implementation, the positions of each document are stored as gaps
* (differences between consecutive positions) and are compressed into
* PForDelta blocks as well.
*
* @author Nima Asadi
*/
public class CompressedPositionalPostings extends CompressedPostings {
private int[][] positionalSkipListCompressedBlock; // Compressed forward index to positions
private int positionalSkipListLastBlockSize;
private int[][] positionalCompressedBlock; // Encoded positions
private int positionalLastBlockSize;
private int[][] tfCompressedBlock;
private int tfLastBlockSize;
private int currentBlock = -1;
private int currentPositionsBlock = -1;
private int[] skipList = null;
private int[] termFrequency = null;
private int[] positions = null;
private CompressedPositionalPostings() {
super();
setBlockSize(128);
}
/**
* Constructs an instance of this class by encoding the document ids as well as their term positions
*
* @param data An array of document ids
* @param positions List of TermPosition objects, one for each document id
* @return A compressed positional postings list object
*/
public static CompressedPositionalPostings newInstance(int[] data, List<TermPositions> positions) throws IOException {
Preconditions.checkNotNull(data);
Preconditions.checkNotNull(positions);
Preconditions.checkArgument(data.length == positions.size());
CompressedPositionalPostings postings = new CompressedPositionalPostings();
// Use the super class to encode the document ids
postings.compressData(data);
// Create a forward index to the position array and encode the positions using
int[] skipList = new int[data.length];
int[] tf = new int[data.length];
int skipListIndex = 0;
List<Integer> buffer = Lists.newArrayList();
for(int i = 0; i < positions.size(); i++) {
int[] pos = positions.get(i).getPositions();
tf[skipListIndex] = pos.length;
skipList[skipListIndex++] = buffer.size();
if(pos.length > 0) {
buffer.add(pos[0]);
for(int j = 1; j < pos.length; j++) {
buffer.add(pos[j] - pos[j - 1]);
}
}
}
int[] tempPositions = new int[buffer.size()];
for(int i = 0; i < buffer.size(); i++) {
tempPositions[i] = buffer.get(i);
}
postings.positionalCompressedBlock =
DocumentVectorUtility.compressData(tempPositions, getBlockSize(), false);
postings.positionalLastBlockSize =
DocumentVectorUtility.lastBlockSize(tempPositions.length,
postings.positionalCompressedBlock.length,
getBlockSize());
postings.positionalSkipListCompressedBlock =
DocumentVectorUtility.compressData(skipList, getBlockSize(), true);
postings.positionalSkipListLastBlockSize =
DocumentVectorUtility.lastBlockSize(skipList.length,
postings.positionalSkipListCompressedBlock.length,
getBlockSize());
postings.tfCompressedBlock =
DocumentVectorUtility.compressData(tf, getBlockSize(), false);
postings.tfLastBlockSize =
DocumentVectorUtility.lastBlockSize(tf.length,
postings.tfCompressedBlock.length,
getBlockSize());
return postings;
}
/**
* Decompresses the forward index
*/
private void decompressMetaData(int index) {
int block = index/getBlockSize();
if(block == positionalSkipListCompressedBlock.length - 1) {
skipList = new int[positionalSkipListLastBlockSize];
termFrequency = new int[tfLastBlockSize];
} else {
skipList = new int[getBlockSize()];
termFrequency = new int[getBlockSize()];
}
PForDelta.decompressOneBlock(skipList, positionalSkipListCompressedBlock[block], skipList.length);
for(int i = 1; i < skipList.length; i++) {
skipList[i] += skipList[i - 1];
}
PForDelta.decompressOneBlock(termFrequency, tfCompressedBlock[block], termFrequency.length);
}
/**
* Retrieves positions for a document given a forward index and the index of the document
* in the document id vector.
*
* @param index Index of the document id in the document id vector
* @return Positions of the occurrences of the term within a document
*/
public int[] decompressPositions(int index) throws IOException {
Preconditions.checkArgument(index >= 0);
if((index/getBlockSize()) != currentBlock || skipList == null) {
decompressMetaData(index);
currentBlock = index/getBlockSize();
}
int[] buffer = new int[termFrequency[index % getBlockSize()]];
int beginOffset = skipList[index%getBlockSize()];
int endOffset = beginOffset + buffer.length - 1;
int block = beginOffset/getBlockSize();
if(block != currentPositionsBlock || positions == null) {
if(block == positionalCompressedBlock.length - 1) {
positions = new int[positionalLastBlockSize];
} else {
positions = new int[getBlockSize()];
}
PForDelta.decompressOneBlock(positions, positionalCompressedBlock[block], positions.length);
currentPositionsBlock = block;
}
beginOffset %= getBlockSize();
int bufferIndex = 0;
int endBlock = endOffset/getBlockSize();
endOffset %= getBlockSize();
if(endBlock != block) {
buffer[bufferIndex++] = positions[beginOffset];
for(int i = beginOffset + 1; i < positions.length; i++) {
buffer[bufferIndex] = positions[i] + buffer[bufferIndex - 1];
bufferIndex++;
}
for(int i = block + 1; i < endBlock; i++) {
PForDelta.decompressOneBlock(positions, positionalCompressedBlock[i], positions.length);
for(int j = 0; j < positions.length; j++) {
buffer[bufferIndex] = positions[j] + buffer[bufferIndex - 1];
bufferIndex++;
}
}
if(endBlock == positionalCompressedBlock.length - 1) {
positions = new int[positionalLastBlockSize];
} else {
positions = new int[getBlockSize()];
}
PForDelta.decompressOneBlock(positions, positionalCompressedBlock[endBlock], positions.length);
currentPositionsBlock = endBlock;
for(int i = 0; i <= endOffset; i++) {
buffer[bufferIndex] = positions[i] + buffer[bufferIndex - 1];
bufferIndex++;
}
} else {
buffer[bufferIndex++] = positions[beginOffset];
for(int i = beginOffset + 1; i <= endOffset; i++) {
buffer[bufferIndex] = positions[i] + buffer[bufferIndex - 1];
bufferIndex++;
}
}
return buffer;
}
public void close() {
termFrequency = null;
skipList = null;
positions = null;
currentBlock = -1;
currentPositionsBlock = -1;
}
@Override public void write(DataOutput output) throws IOException {
Preconditions.checkNotNull(output);
super.write(output);
output.writeInt(positionalSkipListLastBlockSize);
output.writeInt(positionalSkipListCompressedBlock.length);
for(int i = 0; i < positionalSkipListCompressedBlock.length; i++) {
output.writeInt(positionalSkipListCompressedBlock[i].length);
for(int j = 0; j < positionalSkipListCompressedBlock[i].length; j++) {
output.writeInt(positionalSkipListCompressedBlock[i][j]);
}
}
output.writeInt(tfLastBlockSize);
output.writeInt(tfCompressedBlock.length);
for(int i = 0; i < tfCompressedBlock.length; i++) {
output.writeInt(tfCompressedBlock[i].length);
for(int j = 0; j < tfCompressedBlock[i].length; j++) {
output.writeInt(tfCompressedBlock[i][j]);
}
}
output.writeInt(positionalLastBlockSize);
output.writeInt(positionalCompressedBlock.length);
for(int i = 0; i < positionalCompressedBlock.length; i++) {
output.writeInt(positionalCompressedBlock[i].length);
for(int j = 0; j < positionalCompressedBlock[i].length; j++) {
output.writeInt(positionalCompressedBlock[i][j]);
}
}
}
@Override public void readFields(DataInput input) throws IOException {
Preconditions.checkNotNull(input);
super.readFields(input);
positionalSkipListLastBlockSize = input.readInt();
positionalSkipListCompressedBlock = new int[input.readInt()][];
for(int i = 0; i < positionalSkipListCompressedBlock.length; i++) {
positionalSkipListCompressedBlock[i] = new int[input.readInt()];
for(int j = 0; j < positionalSkipListCompressedBlock[i].length; j++) {
positionalSkipListCompressedBlock[i][j] = input.readInt();
}
}
tfLastBlockSize = input.readInt();
tfCompressedBlock = new int[input.readInt()][];
for(int i = 0; i < tfCompressedBlock.length; i++) {
tfCompressedBlock[i] = new int[input.readInt()];
for(int j = 0; j < tfCompressedBlock[i].length; j++) {
tfCompressedBlock[i][j] = input.readInt();
}
}
positionalLastBlockSize = input.readInt();
positionalCompressedBlock = new int[input.readInt()][];
for(int i = 0; i < positionalCompressedBlock.length; i++) {
positionalCompressedBlock[i] = new int[input.readInt()];
for(int j = 0; j < positionalCompressedBlock[i].length; j++) {
positionalCompressedBlock[i][j] = input.readInt();
}
}
}
/**
* Reads and returns an instance of this class from input
*
* @param input DataInput
* @return An instance of the compressed positional postings list
*/
public static CompressedPositionalPostings readInstance(DataInput input) throws IOException {
Preconditions.checkNotNull(input);
CompressedPositionalPostings postings = new CompressedPositionalPostings();
postings.readFields(input);
return postings;
}
@Override public boolean equals(Object o) {
Preconditions.checkNotNull(o);
Preconditions.checkArgument(o instanceof CompressedPositionalPostings);
if(!super.equals(o)) {
return false;
}
CompressedPositionalPostings other = (CompressedPositionalPostings) o;
if(this.positionalSkipListLastBlockSize != other.positionalSkipListLastBlockSize) {
return false;
}
if(this.positionalSkipListCompressedBlock.length !=
other.positionalSkipListCompressedBlock.length) {
return false;
}
for(int i = 0; i < this.positionalSkipListCompressedBlock.length; i++) {
if(this.positionalSkipListCompressedBlock[i].length !=
other.positionalSkipListCompressedBlock[i].length) {
return false;
}
for(int j = 0; j < positionalSkipListCompressedBlock[i].length; j++) {
if(this.positionalSkipListCompressedBlock[i][j] !=
other.positionalSkipListCompressedBlock[i][j]) {
return false;
}
}
}
if(this.tfLastBlockSize != other.tfLastBlockSize) {
return false;
}
if(this.tfCompressedBlock.length !=
other.tfCompressedBlock.length) {
return false;
}
for(int i = 0; i < this.tfCompressedBlock.length; i++) {
if(this.tfCompressedBlock[i].length !=
other.tfCompressedBlock[i].length) {
return false;
}
for(int j = 0; j < tfCompressedBlock[i].length; j++) {
if(this.tfCompressedBlock[i][j] !=
other.tfCompressedBlock[i][j]) {
return false;
}
}
}
if(this.positionalLastBlockSize !=
other.positionalLastBlockSize) {
return false;
}
if(this.positionalCompressedBlock.length !=
other.positionalCompressedBlock.length) {
return false;
}
for(int i = 0; i < this.positionalCompressedBlock.length; i++) {
if(this.positionalCompressedBlock[i].length !=
other.positionalCompressedBlock[i].length) {
return false;
}
for(int j = 0; j < positionalCompressedBlock[i].length; j++) {
if(this.positionalCompressedBlock[i][j] !=
other.positionalCompressedBlock[i][j]) {
return false;
}
}
}
return true;
}
}