package ivory.ffg.data; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.util.List; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.kamikaze.pfordelta.PForDelta; import ivory.core.data.index.TermPositions; /** * Implementation of an (mini-)indexed document vector refered to as IDV. * In this class, term ids are encoded using PForDelta whereas positions * are compressed using the gamma coding. * * @author Nima Asadi */ public class DocumentVectorMiniInvertedIndex implements DocumentVector { private static final int BLOCK_SIZE = 128; private int documentLength; // Length of the document vector private int termidsLastBlockSize; private int[][] termids; // Compressed term id vectors private int positionalSkipListLastBlockSize; //Length of the forward index vector (holds the index of position arrays) private int[][] positionalSkipListCompressedBlock; // Compressed forward index vector private int tfLastBlockSize; private int[][] tfCompressedBlock; // Compressed forward index vector private int positionalLastBlockSize; private int[][] positionalCompressedBlock; // Array of positions protected DocumentVectorMiniInvertedIndex() { } /** * Constructs an indexed document vector given the input term id vector and the term positions * * @param data Array of term ids * @param positions List of TermPosition objects, one for each term id * @return A compressed, indexed document vector. */ public static DocumentVectorMiniInvertedIndex newInstance(int[] data, List<TermPositions> positions, int documentLength) throws IOException { Preconditions.checkNotNull(data); Preconditions.checkNotNull(positions); Preconditions.checkArgument(data.length == positions.size()); Preconditions.checkArgument(documentLength >= data.length); DocumentVectorMiniInvertedIndex postings = new DocumentVectorMiniInvertedIndex(); // Compress term ids using PForDelta postings.documentLength = documentLength; postings.termids = DocumentVectorUtility.compressData(data, BLOCK_SIZE, true); postings.termidsLastBlockSize = DocumentVectorUtility.lastBlockSize(data.length, postings.termids.length, BLOCK_SIZE); // Create a forward index to the position array and encode the positions int[] skipList = new int[data.length]; int[] tf = new int[data.length]; int skipListIndex = 0; List<Integer> buffer = Lists.newArrayList(); for(int i = 0; i < positions.size(); i++) { int[] pos = positions.get(i).getPositions(); tf[skipListIndex] = pos.length; skipList[skipListIndex++] = buffer.size(); if(pos.length > 0) { buffer.add(pos[0]); for(int j = 1; j < pos.length; j++) { buffer.add(pos[j] - pos[j - 1]); } } } int[] tempPositions = new int[buffer.size()]; for(int i = 0; i < buffer.size(); i++) { tempPositions[i] = buffer.get(i); } postings.positionalCompressedBlock = DocumentVectorUtility.compressData(tempPositions, BLOCK_SIZE, false); postings.positionalLastBlockSize = DocumentVectorUtility.lastBlockSize(tempPositions.length, postings.positionalCompressedBlock.length, BLOCK_SIZE); postings.positionalSkipListCompressedBlock = DocumentVectorUtility.compressData(skipList, BLOCK_SIZE, true); postings.positionalSkipListLastBlockSize = DocumentVectorUtility.lastBlockSize(skipList.length, postings.positionalSkipListCompressedBlock.length, BLOCK_SIZE); postings.tfCompressedBlock = DocumentVectorUtility.compressData(tf, BLOCK_SIZE, false); postings.tfLastBlockSize = DocumentVectorUtility.lastBlockSize(tf.length, postings.tfCompressedBlock.length, BLOCK_SIZE); return postings; } /** * Decompresses the forward index to position array. * * @return Decompressed forward index used when retrieving positions */ private int[] decompressSkipList(int block) { int[] outBlock = null; if(block == positionalSkipListCompressedBlock.length - 1) { outBlock = new int[positionalSkipListLastBlockSize]; } else { outBlock = new int[BLOCK_SIZE]; } PForDelta.decompressOneBlock(outBlock, positionalSkipListCompressedBlock[block], outBlock.length); for(int i = 1; i < outBlock.length; i++) { outBlock[i] += outBlock[i - 1]; } return outBlock; } private int[] decompressTermFrequency(int block) { int[] outBlock = null; if(block == tfCompressedBlock.length - 1) { outBlock = new int[tfLastBlockSize]; } else { outBlock = new int[BLOCK_SIZE]; } PForDelta.decompressOneBlock(outBlock, tfCompressedBlock[block], outBlock.length); return outBlock; } /** * Decompresses the term id array * * @return Decompressed term id array */ private int[] decompressTermids(int block) { int[] outBlock = null; if(block == termids.length - 1) { outBlock = new int[termidsLastBlockSize]; } else { outBlock = new int[BLOCK_SIZE]; } PForDelta.decompressOneBlock(outBlock, termids[block], outBlock.length); for(int i = 1; i < outBlock.length; i++) { outBlock[i] += outBlock[i - 1]; } return outBlock; } @Override public int[][] decompressPositions(int[] terms) throws IOException { Preconditions.checkNotNull(terms); int[][] positions = new int[terms.length][]; if(documentLength == 0) { return positions; } int skipListBlock = -1; int positionBlock = -1; int[] tempSkipList = null; int[] tempTermFrequency = null; int[] tempPositions = null; int found = 0; for(int i = 0; i < termids.length; i++) { if(found >= terms.length) { return positions; } int[] tids = decompressTermids(i); for(int j = 0; j < tids.length; j++) { if(found >= terms.length) { return positions; } for(int q = 0; q < terms.length; q++) { if(terms[q] == tids[j]) { found++; if(i != skipListBlock || tempSkipList == null) { tempSkipList = decompressSkipList(i); tempTermFrequency = decompressTermFrequency(i); skipListBlock = i; } positions[q] = new int[tempTermFrequency[j]]; int beginOffset = tempSkipList[j]; int endOffset = beginOffset + positions[q].length - 1; int block = beginOffset/BLOCK_SIZE; if(block != positionBlock || tempPositions == null) { if(block == positionalCompressedBlock.length - 1) { tempPositions = new int[positionalLastBlockSize]; } else { tempPositions = new int[BLOCK_SIZE]; } PForDelta.decompressOneBlock(tempPositions, positionalCompressedBlock[block], tempPositions.length); positionBlock = block; } beginOffset %= BLOCK_SIZE; int bufferIndex = 0; int endBlock = endOffset/BLOCK_SIZE; endOffset %= BLOCK_SIZE; if(endBlock != block) { positions[q][bufferIndex++] = tempPositions[beginOffset]; for(int o = beginOffset + 1; o < tempPositions.length; o++) { positions[q][bufferIndex] = tempPositions[o] + positions[q][bufferIndex - 1]; bufferIndex++; } for(int k = block + 1; k < endBlock; k++) { PForDelta.decompressOneBlock(tempPositions, positionalCompressedBlock[k], tempPositions.length); for(int l = 0; l < tempPositions.length; l++) { positions[q][bufferIndex] = tempPositions[l] + positions[q][bufferIndex - 1]; bufferIndex++; } } if(endBlock == positionalCompressedBlock.length - 1) { tempPositions = new int[positionalLastBlockSize]; } else { tempPositions = new int[BLOCK_SIZE]; } PForDelta.decompressOneBlock(tempPositions, positionalCompressedBlock[endBlock], tempPositions.length); positionBlock = endBlock; for(int o = 0; o <= endOffset; o++) { positions[q][bufferIndex] = tempPositions[o] + positions[q][bufferIndex - 1]; bufferIndex++; } } else { positions[q][bufferIndex++] = tempPositions[beginOffset]; for(int o = beginOffset + 1; o <= endOffset; o++) { positions[q][bufferIndex] = tempPositions[o] + positions[q][bufferIndex - 1]; bufferIndex++; } } } } } } return positions; } @Override public int[] decompressDocument() throws IOException { throw new UnsupportedOperationException("Implementation not available!"); } @Override public int[] transformTerms(int[] terms) { return terms; } @Override public int getDocumentLength() { return documentLength; } @Override public void write(DataOutput output) throws IOException { Preconditions.checkNotNull(output); output.writeInt(documentLength); output.writeInt(termidsLastBlockSize); output.writeInt(termids.length); for(int i = 0; i < termids.length; i++) { output.writeInt(termids[i].length); for(int j = 0; j < termids[i].length; j++) { output.writeInt(termids[i][j]); } } output.writeInt(positionalSkipListLastBlockSize); output.writeInt(positionalSkipListCompressedBlock.length); for(int i = 0; i < positionalSkipListCompressedBlock.length; i++) { output.writeInt(positionalSkipListCompressedBlock[i].length); for(int j = 0; j < positionalSkipListCompressedBlock[i].length; j++) { output.writeInt(positionalSkipListCompressedBlock[i][j]); } } output.writeInt(tfLastBlockSize); output.writeInt(tfCompressedBlock.length); for(int i = 0; i < tfCompressedBlock.length; i++) { output.writeInt(tfCompressedBlock[i].length); for(int j = 0; j < tfCompressedBlock[i].length; j++) { output.writeInt(tfCompressedBlock[i][j]); } } output.writeInt(positionalLastBlockSize); output.writeInt(positionalCompressedBlock.length); for(int i = 0; i < positionalCompressedBlock.length; i++) { output.writeInt(positionalCompressedBlock[i].length); for(int j = 0; j < positionalCompressedBlock[i].length; j++) { output.writeInt(positionalCompressedBlock[i][j]); } } } @Override public void readFields(DataInput input) throws IOException { Preconditions.checkNotNull(input); documentLength = input.readInt(); termidsLastBlockSize = input.readInt(); termids = new int[input.readInt()][]; for(int i = 0; i < termids.length; i++) { termids[i] = new int[input.readInt()]; for(int j = 0; j < termids[i].length; j++) { termids[i][j] = input.readInt(); } } positionalSkipListLastBlockSize = input.readInt(); positionalSkipListCompressedBlock = new int[input.readInt()][]; for(int i = 0; i < positionalSkipListCompressedBlock.length; i++) { positionalSkipListCompressedBlock[i] = new int[input.readInt()]; for(int j = 0; j < positionalSkipListCompressedBlock[i].length; j++) { positionalSkipListCompressedBlock[i][j] = input.readInt(); } } tfLastBlockSize = input.readInt(); tfCompressedBlock = new int[input.readInt()][]; for(int i = 0; i < tfCompressedBlock.length; i++) { tfCompressedBlock[i] = new int[input.readInt()]; for(int j = 0; j < tfCompressedBlock[i].length; j++) { tfCompressedBlock[i][j] = input.readInt(); } } positionalLastBlockSize = input.readInt(); positionalCompressedBlock = new int[input.readInt()][]; for(int i = 0; i < positionalCompressedBlock.length; i++) { positionalCompressedBlock[i] = new int[input.readInt()]; for(int j = 0; j < positionalCompressedBlock[i].length; j++) { positionalCompressedBlock[i][j] = input.readInt(); } } } /** * Reads and returns an instance of this class from input. * * @param input DataInput * @return An instance of this class. */ public static DocumentVectorMiniInvertedIndex readInstance(DataInput input) throws IOException { Preconditions.checkNotNull(input); DocumentVectorMiniInvertedIndex postings = new DocumentVectorMiniInvertedIndex(); postings.readFields(input); return postings; } @Override public boolean equals(Object o) { Preconditions.checkNotNull(o); Preconditions.checkArgument(o instanceof DocumentVectorMiniInvertedIndex); DocumentVectorMiniInvertedIndex other = (DocumentVectorMiniInvertedIndex) o; if(this.documentLength != other.documentLength || this.termidsLastBlockSize != other.termidsLastBlockSize || this.termids.length != other.termids.length || this.positionalSkipListLastBlockSize != other.positionalSkipListLastBlockSize || this.positionalSkipListCompressedBlock.length != other.positionalSkipListCompressedBlock.length || this.tfLastBlockSize != other.tfLastBlockSize || this.tfCompressedBlock.length != other.tfCompressedBlock.length || this.positionalLastBlockSize != other.positionalLastBlockSize || this.positionalCompressedBlock.length != other.positionalCompressedBlock.length) { return false; } for(int i = 0; i < termids.length; i++) { if(this.termids[i].length != other.termids[i].length) { return false; } for(int j = 0; j < termids[i].length; j++) { if(this.termids[i][j] != other.termids[i][j]) { return false; } } } for(int i = 0; i < positionalSkipListCompressedBlock.length; i++) { if(this.positionalSkipListCompressedBlock[i].length != other.positionalSkipListCompressedBlock[i].length) { return false; } for(int j = 0; j < positionalSkipListCompressedBlock[i].length; j++) { if(this.positionalSkipListCompressedBlock[i][j] != other.positionalSkipListCompressedBlock[i][j]) { return false; } } } for(int i = 0; i < tfCompressedBlock.length; i++) { if(this.tfCompressedBlock[i].length != other.tfCompressedBlock[i].length) { return false; } for(int j = 0; j < tfCompressedBlock[i].length; j++) { if(this.tfCompressedBlock[i][j] != other.tfCompressedBlock[i][j]) { return false; } } } for(int i = 0; i < positionalCompressedBlock.length; i++) { if(this.positionalCompressedBlock[i].length != other.positionalCompressedBlock[i].length) { return false; } for(int j = 0; j < positionalCompressedBlock[i].length; j++) { if(this.positionalCompressedBlock[i][j] != other.positionalCompressedBlock[i][j]) { return false; } } } return true; } }