package ivory.ffg.data; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.DataInput; import java.io.IOException; import java.util.List; import com.google.common.collect.Lists; import com.kamikaze.pfordelta.PForDelta; import ivory.core.compression.BitInputStream; import ivory.core.compression.BitOutputStream; import ivory.core.data.document.IntDocVector; import ivory.core.data.index.TermPositions; /** * Auxiliary functions * * @author Nima Asadi */ public class DocumentVectorUtility { public static final int BLOCK_SIZE = 128; public static final int MAX_POSITIONS = 100; private static final int[] TEMP_POSITIONS = new int[MAX_POSITIONS]; /** * Given a document vector and an array of query terms, this function * constructs the positions. * * @param doc Document vector * @param terms Query terms * @return Position array for every query term */ public static int[][] getPositions(int[] doc, int[] terms) { int[][] positions = new int[terms.length][]; int pindex = 0; for(int i = 0; i < terms.length; i++) { pindex = 0; for(int j = 0; j < doc.length && pindex < TEMP_POSITIONS.length; j++) { if(doc[j] == terms[i]) { TEMP_POSITIONS[pindex++] = j + 1; } } positions[i] = new int[pindex]; for(int j = 0; j < positions[i].length; j++) { positions[i][j] = TEMP_POSITIONS[j]; } } return positions; } /** * Serializes the positions using gamma codes * * @param positions Array of positions for a term * @return Serialized positions (using gamma codes) */ public static byte[] serializePositions(int[] positions) throws IOException { ByteArrayOutputStream b = new ByteArrayOutputStream(); BitOutputStream t = new BitOutputStream(b); t.writeGamma(positions.length); for (int i = 0; i < positions.length; i++) { if (i == 0) { t.writeGamma(positions[0] + 1); } else { int pgap = positions[i] - positions[i - 1]; t.writeGamma(pgap); } } t.padAndFlush(); t.close(); return b.toByteArray(); } /** * Deserializes the gamma-encoded positions. * * @param bytes Serialized positions * @return A decoded integer array of positions */ public static int[] deserializePositions(byte[] bytes) throws IOException { ByteArrayInputStream byteStream = new ByteArrayInputStream(bytes); BitInputStream bitStream = new BitInputStream(byteStream); int[] positions = new int[bitStream.readGamma()]; for(int i = 0; i < positions.length; i++) { if (i == 0) { positions[i] = bitStream.readGamma() - 1; } else { positions[i] = (positions[i - 1] + bitStream.readGamma()); } } bitStream.close(); return positions; } /** * Compresses positions using PForDelta compression * * @return Serialized positions (using PForDelta) */ public static int[][] compressData(int[] data, int blockSize, boolean computeGaps) { // Data is stored in blocks of equal size.. int nbBlocks = (int) Math.ceil(((double) data.length) / ((double) blockSize)); int[][] compressedBlocks = new int[nbBlocks][]; int[] temp = new int[blockSize]; // Compress all blocks except for the last block which might // contain fewer elements. for(int i = 0; i < nbBlocks - 1; i++) { if(!computeGaps) { for(int j = 0; j < temp.length; j++) { temp[j] = data[i * blockSize + j]; } } else { temp[0] = data[i * blockSize]; int pre = temp[0]; for(int j = 1; j < temp.length; j++) { temp[j] = data[i * blockSize + j] - pre; pre = data[i * blockSize + j]; } } compressedBlocks[i] = PForDelta.compressOneBlockOpt(temp, blockSize); } // Compress the last block int remaining = lastBlockSize(data.length, nbBlocks, blockSize); temp = new int[remaining]; if(!computeGaps) { for(int j = 0; j < temp.length; j++) { temp[j] = data[(nbBlocks - 1) * blockSize + j]; } } else { temp[0] = data[(nbBlocks - 1) * blockSize]; int pre = temp[0]; for(int j = 1; j < temp.length; j++) { temp[j] = data[(nbBlocks - 1) * blockSize + j] - pre; pre = data[(nbBlocks - 1) * blockSize + j]; } } compressedBlocks[nbBlocks - 1] = PForDelta.compressOneBlockOpt(temp, remaining); return compressedBlocks; } public static int lastBlockSize(int dataLength, int nbBlocks, int blockSize) { return dataLength - ((nbBlocks - 1) * blockSize); } /** * Factory method * * @param documentVectorClass DocumentVector class * @param document IntDocVector (term positions start from 1) * @return New DocumentVector (term positions start from 1) */ public static DocumentVector newInstance(String documentVectorClass, IntDocVector document) throws Exception { IntDocVector.Reader r = document.getReader(); if(documentVectorClass.equals(DocumentVectorMiniInvertedIndex.class.getName())) { List<Integer> termids = Lists.newArrayList(); List<TermPositions> positions = Lists.newArrayList(); int cnt = 0; while(r.hasMoreTerms()) { termids.add(r.nextTerm()); int[] p = r.getPositions(); positions.add(new TermPositions(p, r.getTf())); for(int j = 0; j < p.length; j++) { if(p[j] > cnt) { cnt = p[j]; } } } int[] data = new int[termids.size()]; for(int i = 0; i < termids.size(); i++) { data[i] = termids.get(i); } return DocumentVectorMiniInvertedIndex.newInstance(data, positions, cnt); } int cnt = 0; while(r.hasMoreTerms()) { r.nextTerm(); int[] p = r.getPositions(); for(int j = 0; j < p.length; j++) { if(p[j] > cnt) { cnt = p[j]; } } } r = document.getReader(); int[] data = new int[cnt]; while(r.hasMoreTerms()) { int id = r.nextTerm(); int[] p = r.getPositions(); for(int j = 0; j < p.length; j++) { data[p[j] - 1] = id; } } if(documentVectorClass.equals(DocumentVectorHashedArray.class.getName())) { return DocumentVectorHashedArray.newInstance(data); } else if(documentVectorClass.equals(DocumentVectorPForDeltaArray.class.getName())) { return DocumentVectorPForDeltaArray.newInstance(data); } else if(documentVectorClass.equals(DocumentVectorVIntArray.class.getName())) { return DocumentVectorVIntArray.newInstance(data); } else { throw new ClassNotFoundException("DocumentVector " + documentVectorClass + " class not found!"); } } /** * Reads an instance of DocumentVector from input * * @param input DataInput * @param documentVectorClass DocumentVector class * @return DocumentVector object */ public static DocumentVector readInstance(String documentVectorClass, DataInput input) throws Exception { if(documentVectorClass.equals(DocumentVectorMiniInvertedIndex.class.getName())) { return DocumentVectorMiniInvertedIndex.readInstance(input); } else if(documentVectorClass.equals(DocumentVectorHashedArray.class.getName())) { return DocumentVectorHashedArray.readInstance(input); } else if(documentVectorClass.equals(DocumentVectorPForDeltaArray.class.getName())) { return DocumentVectorPForDeltaArray.readInstance(input); } else if(documentVectorClass.equals(DocumentVectorVIntArray.class.getName())) { return DocumentVectorVIntArray.readInstance(input); } else { throw new ClassNotFoundException("DocumentVector " + documentVectorClass + " class not found!"); } } }