package com.alimama.mdrill.buffer; /** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.util.Arrays; /** * Implementation of the optimized PForDelta algorithm for sorted integer arrays. The basic ideas are based on * * 1. Original algorithm from * http://homepages.cwi.nl/~heman/downloads/msthesis.pdf * * 2. Optimization and * variation from http://www2008.org/papers/pdf/p387-zhangA.pdf * * 3. Further optimization * http://www2009.org/proceedings/pdf/p401.pdf * * As a part of the PForDelta implementation, Simple16 is used to compress exceptions. The original Simple16 algorithm can also be found in the above literatures. * @author hao yan, hyan2008@gmail.com */ // nocommit -- must merge our 2 pfor impls before landing on trunk // nocommit -- need serious random unit test for these int encoders public class PForDelta{ //All possible values of b in the PForDelta algorithm private static final int[] POSSIBLE_B = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,16,20,28}; // Max number of bits to store an uncompressed value private static final int MAX_BITS = 32; // Header records the value of b and the number of exceptions in the block private static final int HEADER_NUM = 1; // Header size in bits private static final int HEADER_SIZE = MAX_BITS * HEADER_NUM; private static final int[] MASK = {0x00000000, 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff}; public static void main(String[] args) { long t1=System.currentTimeMillis(); int[] test=new int[100000]; for(int j=0;j<test.length;j++) { test[j]=(int) (500*Math.random()); } for(int i=0;i<1000;i++) { int[] compress=compressOneBlock(test, test.length); int[] result=decompressOneBlock(compress,test.length); // System.out.println(Arrays.toString(test)); // System.out.println(Arrays.toString(compress)); // System.out.println(Arrays.equals(result, test)+","+compress.length+","+test.length+","+compress.length*100d/test.length); } System.out.println(System.currentTimeMillis()-t1); } /** * Compress one block of blockSize integers using PForDelta with the optimal parameter b * @param inBlock the block to be compressed * @param blockSize the block size * @return the compressed block */ public static int[] compressOneBlock(final int[] inBlock, int blockSize) { // find the best b that can lead to the smallest overall compressed size int currentB = POSSIBLE_B[0]; int tmpB = currentB; int optSize = estimateCompressedSize(inBlock, tmpB, blockSize); for (int i = 1; i < POSSIBLE_B.length; ++i) { tmpB = POSSIBLE_B[i]; int curSize = estimateCompressedSize(inBlock, tmpB, blockSize); if(curSize < optSize) { currentB = tmpB; optSize = curSize; } } // compress the block using the above best b int[] outBlock = compressOneBlockCore(inBlock, currentB, blockSize); return outBlock; } /** * Decompress one block using PForDelta * @param inBlock the block to be decompressed * @param blockSize the number of elements in the decompressed block * @return the decompressed block */ public static int[] decompressOneBlock(int[] inBlock, int blockSize) { int[] expPos = new int[blockSize]; int[] expHighBits = new int[blockSize]; int[] outBlock = new int[blockSize]; assert inBlock != null; /* if(inBlock == null) { System.out.println("error: compBlock is null"); return null; } */ int expNum = inBlock[0] & 0x3ff; int bits = (inBlock[0]>>>10) & (0x1f); // decompress the b-bit slots int offset = HEADER_SIZE; int compressedBits = 0; if(bits == 0) { Arrays.fill(outBlock,0); } else { compressedBits = decompressBBitSlots(outBlock, inBlock, blockSize, bits); } offset += compressedBits; // decompress exceptions if(expNum>0) { compressedBits = decompressBlockByS16(expPos, inBlock, offset, expNum); offset += compressedBits; compressedBits = decompressBlockByS16(expHighBits, inBlock, offset, expNum); offset += compressedBits; for (int i = 0; i < expNum; i++) { int curExpPos = expPos[i] ; int curHighBits = expHighBits[i]; outBlock[curExpPos] = (outBlock[curExpPos] & MASK[bits]) | ((curHighBits & MASK[32-bits] ) << bits); } } return outBlock; } /** * Estimate the compressed size in ints of a block * @param inputBlock the block to be compressed * @param bits the value of the parameter b * @param blockSize the block size * @return the compressed size in ints * @throws IllegalArgumentException */ private static int estimateCompressedSize(int[] inputBlock, int bits, int blockSize) throws IllegalArgumentException { int maxNoExp = (1<<bits)-1; // Size of the header and the bits-bit slots int outputOffset = HEADER_SIZE + bits * blockSize; int expNum = 0; for (int i = 0; i<blockSize; ++i) { if (inputBlock[i] > maxNoExp) { expNum++; } } outputOffset += (expNum<<5); return outputOffset; } /** * The core implementation of compressing a block with blockSize integers using PForDelta with the given parameter b * @param inputBlock the block to be compressed * @param bits the the value of the parameter b * @param blockSize the block size * @return the compressed block * @throws IllegalArgumentException */ private static int[] compressOneBlockCore(int[] inputBlock, int bits, int blockSize) throws IllegalArgumentException { int[] expPos = new int[blockSize]; int[] expHighBits = new int[blockSize]; int maxCompBitSize = HEADER_SIZE + blockSize * (MAX_BITS + MAX_BITS + MAX_BITS) + 32; int[] tmpCompressedBlock = new int[(maxCompBitSize>>>5)]; int outputOffset = HEADER_SIZE; int expUpperBound = 1<<bits; int expNum = 0; // compress the b-bit slots for (int i = 0; i<blockSize; ++i) { assert inputBlock[i] >= 0: "input value is " + inputBlock[i]; /* if(inputBlock[i] < 0) { System.out.println("haha<0: [" + i +"]" + inputBlock[i]); } */ if (inputBlock[i] < expUpperBound) { writeBits(tmpCompressedBlock, inputBlock[i], outputOffset, bits); } else // exp { // store the lower bits-bits of the exception writeBits(tmpCompressedBlock, inputBlock[i] & MASK[bits], outputOffset, bits); // write the position of exception expPos[expNum] = i; // write the higher 32-bits bits of the exception expHighBits[expNum] = (inputBlock[i] >>> bits) & MASK[32-bits]; expNum++; } outputOffset += bits; } // the first int in the compressed block stores the value of b and the number of exceptions tmpCompressedBlock[0] = ((bits & MASK[10]) << 10) | (expNum & 0x3ff); // compress exceptions if(expNum>0) { int compressedBitSize = compressBlockByS16(tmpCompressedBlock, outputOffset, expPos, expNum, blockSize, inputBlock); outputOffset += compressedBitSize; compressedBitSize = compressBlockByS16(tmpCompressedBlock, outputOffset, expHighBits, expNum, blockSize, inputBlock); outputOffset += compressedBitSize; } // discard the redundant parts in the tmpCompressedBlock int compressedSizeInInts = (outputOffset+31)>>>5; int[] compBlock; compBlock = new int[compressedSizeInInts]; System.arraycopy(tmpCompressedBlock,0, compBlock, 0, compressedSizeInInts); return compBlock; } /** * Decompress b-bit slots * @param outDecompSlots decompressed block which is the output * @param inCompBlock the compressed block which is the input * @param blockSize the block size * @param bits the value of the parameter b * @return the compressed size in bits of the data that has been decompressed */ private static int decompressBBitSlots(int[] outDecompSlots, int[] inCompBlock, int blockSize, int bits) { int compressedBitSize = 0; int offset = HEADER_SIZE; for(int i =0; i<blockSize; i++) { outDecompSlots[i] = readBits(inCompBlock, offset, bits); offset += bits; } compressedBitSize = bits * blockSize; return compressedBitSize; } /** * Compress a block of blockSize integers using Simple16 algorithm * @param outCompBlock the compressed block which is the output * @param outStartOffsetInBits the start offset in bits of the compressed block * @param inBlock the block to be compressed * @param blockSize the block size * @return the compressed size in bits */ private static int compressBlockByS16(int[] outCompBlock, int outStartOffsetInBits, int[] inBlock, int blockSize, int oriBlockSize, int[] oriInputBlock) { int outOffset = (outStartOffsetInBits+31)>>>5; int num, inOffset=0, numLeft; for(numLeft=blockSize; numLeft>0; numLeft -= num) { num = Simple16.s16Compress(outCompBlock, outOffset, inBlock, inOffset, numLeft, blockSize, oriBlockSize, oriInputBlock); assert num >= 0; /* if(num<0) { System.out.println("oops: s16 get -1 "); } */ outOffset++; inOffset += num; } int compressedBitSize = (outOffset<<5)-outStartOffsetInBits; return compressedBitSize; } /** * Decompress a block of blockSize integers using Simple16 algorithm * @param outDecompBlock the decompressed block which is the output * @param inCompBlock the compressed block which is the input * @param blockSize the block size * @param inStartOffsetInBits the start offset in bits of the compressed block * @return the compressed size in bits of the data that has been decompressed */ private static int decompressBlockByS16(int[] outDecompBlock, int[] inCompBlock, int inStartOffsetInBits, int blockSize) { int inOffset = (inStartOffsetInBits+31)>>>5; int num, outOffset=0, numLeft; for(numLeft=blockSize; numLeft>0; numLeft -= num) { num = Simple16.s16Decompress(outDecompBlock, outOffset, inCompBlock, inOffset, numLeft); outOffset += num; inOffset++; } int compressedBitSize = (inOffset<<5)-inStartOffsetInBits; return compressedBitSize; } /** * Write a certain number of bits of an integer into an integer array starting from the given start offset * * @param out the output array * @param val the integer to be written * @param outOffset the start offset in bits in the output array * @param bits the number of bits to be written (bits>=0) */ private static final void writeBits(int[] out, int val, int outOffset, int bits) { if(bits == 0) return; final int index = outOffset >>> 5; final int skip = outOffset & 0x1f; val &= (0xffffffff >>> (32 - bits)); out[index] |= (val << skip); if (32 - skip < bits) { out[index + 1] |= (val >>> (32 - skip)); } } /** * Read a certain number of bits of an integer into an integer array starting from the given start offset * * @param in the input array * @param val the integer to be read * @param inOffset the start offset in bits in the input array * @param bits the number of bits to be read, unlike writeBits(), readBits() does not deal with bits==0 and thus bits must > 0. When bits ==0, the calling functions will just skip the entire bits-bit slots without decoding them * @return the bits bits of the input */ private static final int readBits(int[] in, final int inOffset, final int bits) { final int index = inOffset >>> 5; final int skip = inOffset & 0x1f; int val = in[index] >>> skip; if (32 - skip < bits) { val |= (in[index + 1] << (32 - skip)); } return val & (0xffffffff >>> (32 - bits)); } }