AForBlockCompressor.java example

Explorer
siren-master
/**
 * Copyright 2014 National University of Ireland, Galway.
 *
 * This file is part of the SIREn project. Project and contact information:
 *
 *  https://github.com/rdelbru/SIREn
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.sindice.siren.index.codecs.block;

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRef;
import org.sindice.siren.index.codecs.block.AForFrameCompressor.FrameCompressor;

/**
 * Implementation of {@link BlockCompressor} based on the Adaptive Frame Of
 * Reference algorithm.
 *
 * <p>
 *
 * Adaptive Frame Of Reference (AFOR) attempts to retain the best of FOR
 * (Frame Of Reference), i.e., a very efficient compression and decompression
 * algorithm using highly-optimised routines, while providing a better tolerance
 * against outliers and therefore achieving a higher compression ratio. Compared
 * to PFOR, AFOR does not rely on the encoding of exceptions in the presence of
 * outliers. Instead, AFOR partitions a block into multiple frames of variable
 * length, the partition and the length of the frames being chosen appropriately
 * in order to adapt the encoding to the value distribution.
 *
 * <p>
 *
 * For more information about AFOR, please refer to the journal article
 * <a href="http://dx.doi.org/10.1016/j.websem.2011.04.004">Searching web
 * data: An entity retrieval and high-performance indexing model</a>.
 */
public class AForBlockCompressor extends BlockCompressor {

  /** Size of header in buffer */
  protected final int HEADER_SIZE = 0;

  /** Frame compressors, indexed by frame compressor code. */
  protected final FrameCompressor[] compressors = AForFrameCompressor.compressors;

  /**
   * Scratch space holding, for each of the six frame configurations, the frame
   * compressor codes of its frames. Row 0 is filled by
   * {@link #frameCompressorCodes(int[], int, int)}; rows 1 to 5 are derived
   * from it by {@link #getSize(int)}.
   * <p>
   * This state is mutated on every call to
   * {@link #compress(IntsRef, BytesRef)}, so a single instance must not be used
   * by several threads at the same time.
   */
  private final long[][]            maxFrames       = new long[6][];

  protected static final int          MAX_FRAME_SIZE  = 32;
  protected static final int          MIN_FRAME_SIZE  = 8;

  public AForBlockCompressor() {
    // initialise the arrays that will host the 6 configurations
    maxFrames[0] = new long[4];
    maxFrames[1] = new long[3];
    maxFrames[2] = new long[3];
    maxFrames[3] = new long[3];
    maxFrames[4] = new long[2];
    maxFrames[5] = new long[1];
  }

  /**
   * Compress the integers of {@code input} into {@code output}, one frame
   * window at a time.
   * <p>
   * For each frame of the chosen configuration, the frame compressor code is
   * written as one byte at the current output offset and the matching frame
   * compressor is then invoked. The frame compressors are expected to advance
   * {@code input.offset} and {@code output.offset} themselves (they are not
   * advanced here). On return, {@code input.offset} and {@code output.offset}
   * are reset to 0 and {@code output.length} is set to the number of bytes
   * written.
   *
   * @param input the integers to compress; the backing array length must be a
   *        multiple of 32
   * @param output the buffer receiving the compressed bytes
   */
  @Override
  public void compress(final IntsRef input, final BytesRef output) {
    assert input.ints.length % 32 == 0;
    final int[] uncompressedData = input.ints;
    final byte[] compressedData = output.bytes;

    // prepare the input buffer before starting the compression
    this.prepareInputBuffer(input);

    while (input.offset < input.length) {
      for (final long compressorCode : this.frameCompressorCodes(uncompressedData, input.offset, input.length)) {
        compressedData[output.offset] = (byte) compressorCode;
        this.compressors[(int) compressorCode].compress(input, output);
      }
    }

    // flip buffer
    input.offset = 0;
    output.length = output.offset;
    output.offset = 0;
  }

  /**
   * Return an upper bound of the number of bytes needed to compress
   * {@code arraySize} integers.
   *
   * @param arraySize the number of integers to compress
   * @return the header size plus, for each frame window, 1 byte for the frame
   *         code and 32 integers of 4 bytes each
   */
  @Override
  public int maxCompressedSize(final int arraySize) {
    // the number of windows; computed with integer arithmetic because a float
    // cannot represent every int above 2^24 and would round the count down
    final int numberOfWindows = ceilDiv(arraySize, this.getWindowSize());
    // 1 byte for the frame code, and 32 integers of 4 bytes each
    final int maxSize = numberOfWindows * (1 + (32 * 4));
    return HEADER_SIZE + maxSize;
  }

  /**
   * Prepare the input buffer for compression
   * <p>
   * This method will fill with 0 the portion of the input array that will be
   * covered by the frame window but that is outside the block.
   * <p>
   * This is necessary in order to avoid the compression instructions to behave
   * unexpectedly. The compression instructions are optimised to work over
   * frame window of integers that can be encoded with a certain number of bits.
   * If the last frame window of the block contains unexpected integers, i.e.,
   * integers with a larger number of bits than expected, then the encoding
   * of the last integer of a block can be corrupted. By filling the array with
   * 0, we avoid such problem as 0 does not have consequence in the compression
   * instructions.
   */
  private void prepareInputBuffer(final IntsRef input) {
    final int[] ints = input.ints;
    final int length = input.length;
    // the number of windows (exact integer ceiling, see maxCompressedSize)
    final int numberOfWindows = ceilDiv(length, this.getWindowSize());
    // compute the subset of the array that will be covered by the frame window
    final int frameWindowCoverage = numberOfWindows * MAX_FRAME_SIZE;
    // fill with 0 the portion of the array that is outside the block but will
    // be covered by the sliding frame window
    for (int j = length; j < frameWindowCoverage; j++) {
      ints[j] = 0;
    }
  }

  /**
   * Exact ceiling of {@code dividend / divisor} using integer arithmetic only.
   * <p>
   * Gives the same result as {@code Math.ceil} on the real quotient for any
   * {@code dividend} (including negative ones, where truncation already is
   * the ceiling), without the precision loss of a float division.
   *
   * @param dividend the value to divide
   * @param divisor the divisor, expected to be strictly positive
   * @return the smallest integer greater than or equal to the quotient
   */
  private static int ceilDiv(final int dividend, final int divisor) {
    final int quotient = dividend / divisor;
    return dividend % divisor > 0 ? quotient + 1 : quotient;
  }

  /**
   * Determine the frame compression codes for the next frame window. Each frame
   * compression code is defined by
   * <ul>
   * <li> the number of frame bits to be used for compression;
   * <li> the size of the frame
   * </ul>
   * The method tries to find the best configuration (i.e., the one with the
   * smallest size and the easier to decompress) of frame size and frame bits
   * for the current frame window. Currently it is based on six configurations:
   * <ul>
   * <li> one frame of 32 integers: [32]
   * <li> two frames of 16 integers: [16,16]
   * <li> one frame of 8 integers, followed by one frame of 16 integers and one
   * frame of 8 integers: [8, 16, 8]
   * <li> one frame of 16 integers, followed by two frames of 8 integers:
   * [16, 8, 8]
   * <li> two frames of 8 integers, followed by one frame of 16 integers:
   * [8, 8, 16]
   * <li> four frames of 8 integers: [8,8,8,8]
   * </ul>
   * When two configurations have the same size, the one evaluated last wins.
   *
   * @param unCompressedData the integers being compressed
   * @param offset the index of the first integer of the current frame window
   * @param length the end (exclusive) of the valid integers in the array
   * @return the frame compressor codes of the best configuration, one per
   *         frame; this is an internal array that is overwritten by the next
   *         call
   */
  private long[] frameCompressorCodes(final int[] unCompressedData, final int offset, final int length) {
    // Get the maximum integer for each frame of minimum size
    for (int i = 0; i < maxFrames[0].length; i++) {
      long max = 0;

      final int frameStart = offset + (MIN_FRAME_SIZE * i);
      final int frameEnd = frameStart + MIN_FRAME_SIZE;

      // if we reach the end of the block, stop checking for max integers
      for (int j = frameStart; j < length && j < frameEnd; j++) {
        // the mask makes the integer be compared as an unsigned 32-bit value
        max = Math.max(max, unCompressedData[j] & 0xFFFFFFFFL);
      }

      // Derive the frame compressor code from the max
      // 66 is the code of the special 8x0 frame compressor and 67 the code of the
      // first frame compressor for sequence of 8 integers
      maxFrames[0][i] = max == 0 ? 66 : logNextHigherPowerOf2(max) + 67;
    }

    // Choose the best config among the six. The configs MUST be evaluated in
    // increasing order: getSize(4) and getSize(5) reuse the codes computed by
    // the earlier calls.
    int bestSize = this.getSize(0);
    int bestConfig = 0;

    for (int i = 1; i < maxFrames.length; i++) {
      final int size = this.getSize(i);
      // '<=' so that, on a tie, the later configuration is preferred
      if (size <= bestSize) {
        bestSize = size;
        bestConfig = i;
      }
    }

    return maxFrames[bestConfig];
  }

  /**
   * Compute the frame compressor codes of the given configuration into
   * {@code maxFrames[config]} and return the size estimate used to compare
   * configurations.
   * <p>
   * {@code maxFrames[0]} must have been filled beforehand. Config 4 reads the
   * codes computed for configs 1 and 2, and config 5 reads the codes computed
   * for config 4, so the configs must be evaluated in increasing order.
   * <p>
   * NOTE(review): the arithmetic is consistent with codes 66+ being 8-integer
   * frames, codes 33+ 16-integer frames and codes 0+ 32-integer frames
   * (subtracting 33 moves a code to the frame size twice as large), and with
   * the result being a size in bits including 8 bits per frame code - confirm
   * against AForFrameCompressor.
   *
   * @param config the configuration index, from 0 to 5
   * @return the size estimate of the configuration
   * @throws IllegalArgumentException if {@code config} is not in [0, 5]
   */
  private int getSize(final int config) {
    switch (config) {
      case 0:
        return (int) (((maxFrames[0][0] + maxFrames[0][1] + maxFrames[0][2] + maxFrames[0][3] - 264) << 3) + 32);

      case 1:
        maxFrames[1][0] = maxFrames[0][0];
        maxFrames[1][1] = maxFrames[0][1];
        maxFrames[1][2] = Math.max(maxFrames[0][2], maxFrames[0][3]) - 33;
        return (int) (((maxFrames[1][0] + maxFrames[1][1] - 132) << 3) + ((maxFrames[1][2] - 33) << 4) + 24);

      case 2:
        maxFrames[2][0] = Math.max(maxFrames[0][0], maxFrames[0][1]) - 33;
        maxFrames[2][1] = maxFrames[0][2];
        maxFrames[2][2] = maxFrames[0][3];
        return (int) (((maxFrames[2][1] + maxFrames[2][2] - 132) << 3) + ((maxFrames[2][0] - 33) << 4) + 24);

      case 3:
        maxFrames[3][0] = maxFrames[0][0];
        maxFrames[3][1] = Math.max(maxFrames[0][1], maxFrames[0][2]) - 33;
        maxFrames[3][2] = maxFrames[0][3];
        return (int) (((maxFrames[3][0] + maxFrames[3][2] - 132) << 3) + ((maxFrames[3][1] - 33) << 4) + 24);

      case 4:
        // reuse the merged codes already computed by configs 2 and 1
        maxFrames[4][0] = maxFrames[2][0];
        maxFrames[4][1] = maxFrames[1][2];
        return (int) (((maxFrames[4][0] + maxFrames[4][1] - 66) << 4) + 16);

      case 5:
        // merge the two codes computed by config 4
        maxFrames[5][0] = Math.max(maxFrames[4][0], maxFrames[4][1]) - 33;
        return (int) ((maxFrames[5][0] << 5) + 8);

      default:
        throw new IllegalArgumentException("AFor: Unknown config");
    }
  }

  /**
   * Find the integer log base 2 of a value, i.e., the index of its highest set
   * bit.
   * <p>
   * Despite its name, this method rounds down: it returns 0 for 1, 1 for 2 and
   * 3, 2 for 4 to 7, and so on. It returns 0 for values lower than 2.
   *
   * @param v the value, expected to fit in an unsigned 32-bit integer
   * @return the floor of the log base 2 of {@code v}
   */
  private static int logNextHigherPowerOf2(final long v) {
    return v <= 1 ? 0 : 63 - Long.numberOfLeadingZeros(v);
  }

  /**
   * Return the number of integers covered by one frame window.
   *
   * @return {@link #MAX_FRAME_SIZE}
   */
  @Override
  public int getWindowSize() {
    return AForBlockCompressor.MAX_FRAME_SIZE;
  }

}