/** * Copyright 2014 National University of Ireland, Galway. * * This file is part of the SIREn project. Project and contact information: * * https://github.com/rdelbru/SIREn * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.sindice.siren.index.codecs.block; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.IntsRef; import org.sindice.siren.index.codecs.block.AForFrameCompressor.FrameCompressor; /** * Implementation of {@link BlockCompressor} based on the Adaptive Frame Of * Reference algorithm. * * <p> * * Adaptive Frame Of Reference (AFOR) attempts to retain the best of FOR * (Frame Of Reference), i.e., a very efficient compression and decompression * algorithm using highly-optimised routines, while providing a better tolerance * against outliers and therefore achieving a higher compression ratio. Compared * to PFOR, AFOR does not rely on the encoding of exceptions in the presence of * outliers. Instead, AFOR partitions a block into multiple frames of variable * length, the partition and the length of the frames being chosen appropriately * in order to adapt the encoding to the value distribution. * * <p> * * For more information about AFOR, please refer to the journal article * <a href="http://dx.doi.org/10.1016/j.websem.2011.04.004">Searching web * data: An entity retrieval and high-performance indexing model</a>. */ public class AForBlockCompressor extends BlockCompressor { /** Size of header in buffer */ protected final int HEADER_SIZE = 0; protected final FrameCompressor[] compressors = AForFrameCompressor.compressors; private final long[][] maxFrames = new long[6][]; protected static final int MAX_FRAME_SIZE = 32; protected static final int MIN_FRAME_SIZE = 8; public AForBlockCompressor() { // initialise the arrays that will host the 6 configurations maxFrames[0] = new long[4]; maxFrames[1] = new long[3]; maxFrames[2] = new long[3]; maxFrames[3] = new long[3]; maxFrames[4] = new long[2]; maxFrames[5] = new long[1]; } @Override public void compress(final IntsRef input, final BytesRef output) { assert input.ints.length % 32 == 0; final int[] uncompressedData = input.ints; final byte[] compressedData = output.bytes; // prepare the input buffer before starting the compression this.prepareInputBuffer(input); while (input.offset < input.length) { for (final long compressorCode : this.frameCompressorCodes(uncompressedData, input.offset, input.length)) { compressedData[output.offset] = (byte) compressorCode; this.compressors[(int) compressorCode].compress(input, output); } } // flip buffer input.offset = 0; output.length = output.offset; output.offset = 0; } @Override public int maxCompressedSize(final int arraySize) { // the number of windows final int numberOfWindows = (int) Math.ceil((float) arraySize / (float) this.getWindowSize()); // 1 byte for the frame code, and 32 integers of 4 bytes each final int maxSize = numberOfWindows * (1 + (32 * 4)); return HEADER_SIZE + maxSize; } /** * Prepare the input buffer for compression * <p> * This method will fill with 0 the portion of the input array that will be * covered by the frame window but that is outside the block. * <p> * This is necessary in order to avoid the compression instructions to behave * unexpectedly. The compression instructions are optimised to work over * frame window of integers that can be encoded with a certain number of bits. * If the last frame window of the block contains unexpected integers, i.e., * integers with a larger number of bits than expected, then the encoding * of the last integer of a block can be corrupted. By filling the array with * 0, we avoid such problem as 0 does not have consequence in the compression * instructions. */ private void prepareInputBuffer(final IntsRef input) { final int[] ints = input.ints; final int length = input.length; // the number of windows final int numberOfWindows = (int) Math.ceil((float) length / (float) this.getWindowSize()); // compute the subset of the array that will be covered by the frame window final int frameWindowCoverage = numberOfWindows * MAX_FRAME_SIZE; // fill with 0 the portion of the array that is outside the block but will // be covered by the sliding frame window for (int j = length; j < frameWindowCoverage; j++) { ints[j] = 0; } } /** * Determine the frame compression codes for the next frame window. Each frame * compression code is defined by * <ul> * <li> the number of frame bits to be used for compression; * <li> the size of the frame * </ul> * The method tries to find the best configuration (i.e., the one with the * smallest size in term of bytes and the easier to decompress) of frame size and frame bits * for the current frame window. Currently it is based on six configurations: * <ul> * <li> one frame of 32 integers: [32] * <li> two frames of 16 integers: [16,16] * <li> one frame of 8 integers, followed by one frame of 16 integers and one * frame of 8 integers: [8, 16, 8] * <li> one frame of 16 integers, followed by two frames of 8 integers: * [16, 8, 8] * <li> two frames of 8 integers, followed by one frame of 16 integers: * [8, 8, 16] * <li> four frames of 8 integers: [8,8,8,8] * </ul> */ private long[] frameCompressorCodes(final int[] unCompressedData, final int offset, final int length) { // Get the maximum integer for each frame of minimum size for (int i = 0; i < maxFrames[0].length; i++) { long max = 0; final int frameOffset = MIN_FRAME_SIZE * i; final int frameStart = offset + frameOffset; // if we reach the end of the block, stop checking for max integers for (int j = frameStart; j < length && j < frameStart + MIN_FRAME_SIZE; j++) { max = max >= (unCompressedData[j] & 0xFFFFFFFFL) ? max : (unCompressedData[j] & 0xFFFFFFFFL); } // Derive the frame compressor code from the max // 66 is the code of the special 8x0 frame compressor and 67 the code of the // first frame compressor for sequence of 8 integers maxFrames[0][i] = max == 0 ? 66 : logNextHigherPowerOf2(max) + 67; } // Choose the best config among the six int bestSize = this.getSize(0); int bestConfig = 0; for (int i = 1; i < 6; i++) { final int size = this.getSize(i); if (size <= bestSize) { bestSize = size; bestConfig = i; } } return maxFrames[bestConfig]; } private int getSize(final int config) { switch (config) { case 0: return (int) (((maxFrames[0][0] + maxFrames[0][1] + maxFrames[0][2] + maxFrames[0][3] - 264) << 3) + 32); case 1: maxFrames[1][0] = maxFrames[0][0]; maxFrames[1][1] = maxFrames[0][1]; maxFrames[1][2] = maxFrames[0][2] > maxFrames[0][3] ? maxFrames[0][2] - 33 : maxFrames[0][3] - 33; return (int) (((maxFrames[1][0] + maxFrames[1][1] - 132) << 3) + ((maxFrames[1][2] - 33) << 4) + 24); case 2: maxFrames[2][0] = maxFrames[0][0] > maxFrames[0][1] ? maxFrames[0][0] - 33 : maxFrames[0][1] - 33; maxFrames[2][1] = maxFrames[0][2]; maxFrames[2][2] = maxFrames[0][3]; return (int) (((maxFrames[2][1] + maxFrames[2][2] - 132) << 3) + ((maxFrames[2][0] - 33) << 4) + 24); case 3: maxFrames[3][0] = maxFrames[0][0]; maxFrames[3][1] = maxFrames[0][1] > maxFrames[0][2] ? maxFrames[0][1] - 33 : maxFrames[0][2] - 33; maxFrames[3][2] = maxFrames[0][3]; return (int) (((maxFrames[3][0] + maxFrames[3][2] - 132) << 3) + ((maxFrames[3][1] - 33) << 4) + 24); case 4: maxFrames[4][0] = maxFrames[2][0]; maxFrames[4][1] = maxFrames[1][2]; return (int) (((maxFrames[4][0] + maxFrames[4][1] - 66) << 4) + 16); case 5: maxFrames[5][0] = maxFrames[4][0] > maxFrames[4][1] ? maxFrames[4][0] - 33: maxFrames[4][1] - 33; return (int) ((maxFrames[5][0] << 5) + 8); default: throw new Error("AFor: Unknown config"); } } /** * Lookup table for finding the log base 2 */ private static final int[] LogTable256 = new int[256]; static { LogTable256[0] = LogTable256[1] = 0; for (int i = 2; i < 256; i++) { LogTable256[i] = 1 + LogTable256[i / 2]; } } /** * Optimised routine for finding the log base 2 of an integer. * * @see http://graphics.stanford.edu/~seander/bithacks.html#IntegerLogLookup */ private static int logNextHigherPowerOf2(final long v) { long t, tt; tt = v >> 16; if (tt > 0) { return (t = tt >> 8) > 0 ? 24 + LogTable256[(int) t] : 16 + LogTable256[(int) tt]; } else { return (t = v >> 8) > 0 ? 8 + LogTable256[(int) t] : LogTable256[(int) v]; } } @Override public int getWindowSize() { return AForBlockCompressor.MAX_FRAME_SIZE; } }