/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.data.types.lng.array;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import java.util.stream.LongStream;

import org.diqube.data.serialize.DataSerializable;
import org.diqube.data.serialize.DeserializationException;
import org.diqube.data.serialize.SerializationException;
import org.diqube.data.serialize.thrift.v1.SLongCompressedArrayBitEfficient;

/**
 * A {@link CompressedLongArray} that stores the long values in a bit-efficient way.
 *
 * <p>
 * This means that it inspects the uncompressed values for min and max values and determines the number of bits actually
 * needed to represent all of those values. It then stores this number of bits per entry in the array.
 *
 * <p>
 * Internal representation of these bits is another long array, where each of the input values is represented with a
 * leading bit that designates a positive or a negative value (only if negative values are present at all in the input
 * array) followed by the bit representation of the absolute value of the input value. This representation forces us to
 * take special care of values {@link Long#MIN_VALUE}: their absolute value cannot be represented as a positive long, so
 * they do not fit into a sign-bit-plus-magnitude representation of at most 64 bits. Their positions are stored
 * separately instead.
 *
 * @author Bastian Gloeckle
 */
@DataSerializable(thriftClass = SLongCompressedArrayBitEfficient.class)
public class BitEfficientLongArray extends AbstractExplorableCompressedLongArray<SLongCompressedArrayBitEfficient> {
  /** Number of elements in the array. Available after {@link #prepareCompression(long[], boolean)}. */
  private int size;

  /**
   * Number of bits used per compressed value, including potential sign-bits. Available after
   * {@link #prepareCompression(long[], boolean)}.
   */
  private int numberOfBitsPerValue;

  /**
   * true if input array was sorted, and compressed version of elements is sorted, too, therefore. Available after
   * {@link #prepareCompression(long[], boolean)}.
   */
  private boolean isSorted;

  /**
   * true if the uncompressed version of the array contains the same value for each index. Available after
   * {@link #prepareCompression(long[], boolean)}.
   */
  private boolean isSameValue;

  /**
   * true if our compression contains a sign-bit first. Available after {@link #prepareCompression(long[], boolean)}.
   */
  private boolean containsSignBit;

  /**
   * The array containing the bit-compressed values. This does not contain the {@link Long#MIN_VALUE} elements of the
   * uncompressed input array at all. This array may be <code>null</code> if the input array contained only
   * {@link Long#MIN_VALUE}s or {@link #size} == 0.
   */
  private long[] compressedValues;

  /**
   * Indices of the input array where {@link Long#MIN_VALUE} was found. This array is sorted, but may be
   * <code>null</code> in case there were no {@link Long#MIN_VALUE}s in the input array or {@link #size} == 0.
   */
  private int[] longMinValueLocations;

  /** Number of {@link Long#MIN_VALUE}s in input array. Available after {@link #prepareCompression(long[], boolean)}. */
  private int numberOfLongMinValues;

  /**
   * Minimum decompressed value, not counting {@link Long#MIN_VALUE}. Available after
   * {@link #prepareCompression(long[], boolean)}.
   */
  private long minValue;

  /**
   * Minimum decompressed value, counting {@link Long#MIN_VALUE}. Available after
   * {@link #prepareCompression(long[], boolean)}.
   */
  private long absoluteMinValue;

  /** Maximum decompressed value. Available after {@link #prepareCompression(long[], boolean)}. */
  private long maxValue;

  /**
   * Instantiate new {@link BitEfficientLongArray} and execute compression right away.
   *
   * @param originalValues
   *          The uncompressed values.
   * @param isSorted
   *          true if the values are sorted.
   */
  public BitEfficientLongArray(long[] originalValues, boolean isSorted) {
    super();
    compress(originalValues, isSorted);
  }

  /**
   * Instantiate new {@link BitEfficientLongArray} in {@link State#EXPLORING}.
   */
  public BitEfficientLongArray() {
    super();
  }

  /**
   * Computes the expected compression ratio from the fields that {@link #doPrepareCompression(long[], boolean)} has
   * set; the parameters themselves are not inspected here.
   */
  @Override
  protected double doExpectedCompressionRatio(long[] originalValues, boolean isSorted) {
    return calculateApproxCompressionRatio(numberOfBitsPerValue, isSameValue, size, numberOfLongMinValues);
  }

  /**
   * Prepare the compression, but do not execute compression itself. After calling this method, following fields are
   * set:
   *
   * <ul>
   * <li>{@link #isSorted}
   * <li>{@link #size}
   * <li>{@link #numberOfLongMinValues}
   * <li>{@link #absoluteMinValue}
   * <li>{@link #minValue}
   * <li>{@link #maxValue}
   * <li>{@link #isSameValue}
   * <li>{@link #numberOfBitsPerValue}
   * <li>{@link #containsSignBit}
   * </ul>
   *
   * For an empty input only {@link #isSorted}, {@link #size} and {@link #numberOfLongMinValues} are (re)set.
   *
   * @param originalValues
   *          The uncompressed values.
   * @param isSorted
   *          true if the values are sorted.
   */
  @Override
  protected void doPrepareCompression(long[] originalValues, boolean isSorted) {
    this.isSorted = isSorted;
    size = originalValues.length;
    // Reset before the early return below, so an empty input can never carry a stale MIN_VALUE count into
    // doCompress.
    numberOfLongMinValues = 0;

    if (size == 0)
      return;

    // find min and max
    if (isSorted) {
      absoluteMinValue = originalValues[0];
      maxValue = originalValues[originalValues.length - 1];
    } else {
      absoluteMinValue = Long.MAX_VALUE;
      maxValue = Long.MIN_VALUE;
      for (int i = 0; i < originalValues.length; i++) {
        long current = originalValues[i];
        if (current < absoluteMinValue)
          absoluteMinValue = current;
        if (current > maxValue)
          maxValue = current;
        // as we traverse the whole array anyway, we can count the number of MIN_VALUES we see, as we might need this
        // later.
        if (current == Long.MIN_VALUE)
          numberOfLongMinValues++;
      }
    }

    isSameValue = absoluteMinValue == maxValue;

    if (absoluteMinValue == Long.MIN_VALUE) {
      // Special case: We cannot represent Long.MIN_VALUE in our representation within 64 bits, we do not use two's
      // complement representation. Therefore having Long.MIN_VALUE in the input array is a special case: We store the
      // locations of the MIN_VALUEs in a separate array, and do not use the compression algorithm below for these
      // values.
      if (isSameValue) {
        // Input array has ONLY Long.MIN_VALUE
        numberOfLongMinValues = size;
        minValue = Long.MIN_VALUE;
        numberOfBitsPerValue = 0;
        return;
      }

      // Now we need to know how many times MIN_VALUE is inside the array. We know this already, if the array is not
      // sorted (counted that when traversing the array before). As the array does not only contain MIN_VALUE here, it
      // has at least two elements.
      if (isSorted) {
        if (originalValues[1] != Long.MIN_VALUE)
          numberOfLongMinValues = 1;
        else
          numberOfLongMinValues = binarySearchForLastOccurenceOfLongMinValue(originalValues);
      }

      // Find new min value (ignoring Long.MIN_VALUE), so we will be able to calculate the optimal bit-representation
      // for all values that we need to compress later.
      if (isSorted) {
        minValue = originalValues[numberOfLongMinValues];
      } else {
        minValue = maxValue;
        for (int i = 0; i < originalValues.length; i++) {
          if (originalValues[i] != Long.MIN_VALUE && originalValues[i] < minValue)
            minValue = originalValues[i];
        }
      }
    } else
      minValue = absoluteMinValue;

    // Calculate number of bits needed per input value.
    numberOfBitsPerValue = 0;
    if (maxValue > 0)
      numberOfBitsPerValue = numberOfBitsNeededForPositiveLong(maxValue);

    containsSignBit = false;
    if (minValue < 0) {
      // minValue cannot be Long.MIN_VALUE here, so Math.abs is safe. The sign bit is needed for every value.
      numberOfBitsPerValue =
          Math.max(numberOfBitsPerValue + 1, numberOfBitsNeededForPositiveLong(Math.abs(minValue)) + 1);
      containsSignBit = true;
    }

    if (numberOfBitsPerValue == 0)
      // This could happen if maxValue == minValue == 0 -> let's store at least one bit.
      numberOfBitsPerValue = 1;
  }

  /**
   * Executes the compression based on the fields prepared by {@link #doPrepareCompression(long[], boolean)}: records
   * the locations of all {@link Long#MIN_VALUE}s and packs all other values into {@link #compressedValues}.
   *
   * @param originalValues
   *          The uncompressed values.
   * @param isSorted
   *          true if the values are sorted.
   */
  @Override
  protected void doCompress(long[] originalValues, boolean isSorted) {
    if (numberOfLongMinValues > 0) {
      // now we can instantiate our array which will hold the locations of Long.MIN_VALUE.
      longMinValueLocations = new int[numberOfLongMinValues];

      if (isSorted) {
        // In a sorted array all MIN_VALUEs are at the front.
        for (int i = 0; i < numberOfLongMinValues; i++)
          longMinValueLocations[i] = i;
      } else {
        int nextLocationIdx = 0;
        for (int i = 0; i < originalValues.length; i++) {
          if (originalValues[i] == Long.MIN_VALUE)
            longMinValueLocations[nextLocationIdx++] = i;
        }
      }

      if (isSameValue)
        // Input array has ONLY Long.MIN_VALUE
        return;
    }

    // Start compressing the values. The total number of bits is computed as long: an int multiplication overflows for
    // large inputs (e.g. > ~33M values at 64 bits each). A freshly allocated array is zero-filled already.
    long totalNumberOfBits = (long) numberOfBitsPerValue * (size - numberOfLongMinValues);
    compressedValues = new long[(int) ((totalNumberOfBits + 63) / 64)];

    // Loop-invariant masks: the magnitude occupies bits 0..numberOfMagnitudeBits-1, the sign bit (if any) sits right
    // above it.
    int numberOfMagnitudeBits = containsSignBit ? numberOfBitsPerValue - 1 : numberOfBitsPerValue;
    long magnitudeMask = createBitMask(0, numberOfMagnitudeBits - 1);
    long signBit = containsSignBit ? 1L << (numberOfBitsPerValue - 1) : 0L;

    int longMinValuesSeen = 0;
    for (int pos = 0; pos < originalValues.length; pos++) {
      long original = originalValues[pos];

      // Skip positions with Long.MIN_VALUE, as they have been handled separately.
      if (original == Long.MIN_VALUE) {
        longMinValuesSeen++;
        continue;
      }

      // calculate the bit representation we want to store. This value will at most have numberOfBitsPerValue set, at
      // bit numbers 0..numberOfBitsPerValue-1.
      long value;
      if (containsSignBit) {
        // make value positive, so we do not have to handle two's complement representation below.
        long magnitude = (original < 0) ? -original : original;
        value = magnitude & magnitudeMask;
        if (original < 0)
          value |= signBit; // set sign bit, marking this value as negative.
      } else {
        // Positive value.
        value = original & magnitudeMask;
      }

      long bitOffset = (long) (pos - longMinValuesSeen) * numberOfBitsPerValue;

      // find the long inside compressedValues where the first (or all) bits of our new value will be stored.
      int compressedPos = (int) Math.floorDiv(bitOffset, 64L);
      // Uppermost bit: Bit number from the right side of the long that contains the uppermost bit of our compressed
      // value. Bit numbers are 0-based, means right-most and least-valued bit has number 0.
      int compressedPosUppermostBit = 63 - (int) (bitOffset % 64L);
      int spaceLeftInCompressedLong = compressedPosUppermostBit + 1;

      if (spaceLeftInCompressedLong >= numberOfBitsPerValue) {
        // New compressed value fits fully into the compressed long.
        compressedValues[compressedPos] |= value << (spaceLeftInCompressedLong - numberOfBitsPerValue);
      } else {
        // Split value between two compressed longs: upper bits go to the end of the current long, the remaining lower
        // bits to the start of the next one.
        compressedValues[compressedPos] |= value >>> (numberOfBitsPerValue - spaceLeftInCompressedLong);
        compressedValues[compressedPos + 1] |= value << (64 - numberOfBitsPerValue + spaceLeftInCompressedLong);
      }
    }
  }

  /**
   * Does a binary search on a sorted array for the last occurrence of {@link Long#MIN_VALUE}.
   *
   * Expects that the array does (1) not only contain {@link Long#MIN_VALUE} and (2) that at least the first two entries
   * in the array are {@link Long#MIN_VALUE}.
   *
   * @return number of times the array contains {@link Long#MIN_VALUE}. These values then occur at indices 0..result-1.
   */
  private int binarySearchForLastOccurenceOfLongMinValue(long[] originalValues) {
    // Invariant: originalValues[lo] == MIN_VALUE and originalValues[high] != MIN_VALUE, guaranteed initially by the
    // documented preconditions. Therefore mid + 1 and mid - 1 below are always valid indices.
    int lo = 0;
    int high = originalValues.length - 1;
    while (true) {
      int mid = lo + (high - lo) / 2;
      if (originalValues[mid] == Long.MIN_VALUE) {
        if (originalValues[mid + 1] != Long.MIN_VALUE)
          return mid + 1;
        lo = mid;
      } else {
        if (originalValues[mid - 1] == Long.MIN_VALUE)
          return mid;
        high = mid;
      }
    }
  }

  /**
   * @return Number of bits needed to represent the given non-negative value, at least 1.
   */
  private static int numberOfBitsNeededForPositiveLong(long positiveValue) {
    if (positiveValue == 0)
      return 1;
    return 64 - Long.numberOfLeadingZeros(positiveValue);
  }

  @Override
  public boolean isSameValue() {
    if (size == 0)
      return true;
    return isSameValue;
  }

  @Override
  public int size() {
    return size;
  }

  @Override
  public boolean isSorted() {
    return isSorted;
  }

  @Override
  public long[] decompressedArray() {
    if (size == 0)
      return new long[0];

    // Stateful function iterating linearly through longMinValueLocations and finding Long.MIN_VALUEs. It relies on
    // being called with strictly increasing indices, which the loop below guarantees. It is only invoked by
    // get(int, Function) if longMinValueLocations != null.
    Function<Integer, Integer> minValueFn = new Function<Integer, Integer>() {
      private int nextLongMinValueLocationIdx = 0;

      @Override
      public Integer apply(Integer t) {
        if (nextLongMinValueLocationIdx == longMinValueLocations.length)
          // all MIN_VALUEs have been passed already.
          return longMinValueLocations.length;

        if (longMinValueLocations[nextLongMinValueLocationIdx] == t) {
          nextLongMinValueLocationIdx++;
          // value is Long.MIN_VALUE
          return null;
        }
        return nextLongMinValueLocationIdx;
      }
    };

    long[] res = new long[size];
    for (int i = 0; i < size; i++)
      res[i] = get(i, minValueFn);
    return res;
  }

  @Override
  public long get(int index) throws ArrayIndexOutOfBoundsException {
    return get(index, idx -> {
      // Execute logarithmic binary search
      int longMinValueIndex = Arrays.binarySearch(longMinValueLocations, idx);
      if (longMinValueIndex >= 0)
        // value at index is actually MIN_VALUE.
        return null;

      // value at index is not MIN_VALUE: binarySearch returned (-(insertion point) - 1), and the insertion point is
      // exactly the number of MIN_VALUES that have index < our index.
      return -longMinValueIndex - 1;
    });
  }

  @Override
  public List<Long> getMultiple(List<Integer> sortedIndices) throws ArrayIndexOutOfBoundsException {
    List<Long> res = new ArrayList<>(sortedIndices.size());
    // as this compression has a approx O(1) access time on get(), we can simply execute get() here in a loop.
    for (int idx : sortedIndices)
      res.add(get(idx));
    return res;
  }

  /**
   * Executes {@link #get(int)}, but using a separate function that is capable of identifying if the value at that index
   * is {@link Long#MIN_VALUE} and if not, how many {@link Long#MIN_VALUE}s are available before that index.
   *
   * @param index
   *          The index of the value to decompress.
   * @param minValueFn
   *          Function gets index, returns <code>null</code> if there is a {@link Long#MIN_VALUE} at this index,
   *          otherwise it returns the number of {@link Long#MIN_VALUE}s that have index &lt; the passed index. Only
   *          called if the array contains at least one {@link Long#MIN_VALUE}.
   * @return The decompressed value at the given index.
   * @throws ArrayIndexOutOfBoundsException
   *           If index is negative or not smaller than {@link #size()}.
   */
  public long get(int index, Function<Integer, Integer> minValueFn) throws ArrayIndexOutOfBoundsException {
    if (index < 0 || index >= size)
      throw new ArrayIndexOutOfBoundsException("Tried to access index " + index + " but size is " + size);

    // If the array contains Long.MIN_VALUEs, we need to know (1) if the queried index has value Long.MIN_VALUE and if
    // not, (2) how many Long.MIN_VALUEs have been stored for indices that are smaller than the current index: We did
    // not put any bits into compressedValues for the Long.MIN_VALUE input values, we therefore need this information to
    // calculate the position of the compressed value in compressedValues we want to decompress.
    int numberOfLongMinValuesBeforeCurrentPos = 0;
    if (longMinValueLocations != null) {
      Integer minValueResult = minValueFn.apply(index);
      if (minValueResult == null)
        return Long.MIN_VALUE;
      numberOfLongMinValuesBeforeCurrentPos = minValueResult;
    }

    long bitOffset = (long) (index - numberOfLongMinValuesBeforeCurrentPos) * numberOfBitsPerValue;

    // Calculate the long in compressedValues that holds the first (or all) bits of the value we want to load.
    int compressedPos = (int) Math.floorDiv(bitOffset, 64L);
    // Uppermost bit: Bit number from the right side of the long that contains the uppermost bit of our compressed
    // value. Bit numbers are 0-based, means right-most and least-valued bit has number 0.
    int compressedPosUppermostBit = 63 - (int) (bitOffset % 64L);
    int numberOfBitsStoredInCompressedLong = compressedPosUppermostBit + 1;

    long value;
    if (numberOfBitsStoredInCompressedLong >= numberOfBitsPerValue) {
      // Compressed value is fully contained in one compressedValue[] entry.
      int lowestBit = compressedPosUppermostBit - numberOfBitsPerValue + 1;
      value = compressedValues[compressedPos] & createBitMask(lowestBit, compressedPosUppermostBit);
      value = value >>> lowestBit;
    } else {
      // Compressed value is split across two compressedValue[] entries.
      // upper bits
      value = compressedValues[compressedPos] & createBitMask(0, compressedPosUppermostBit);
      value = value << (numberOfBitsPerValue - compressedPosUppermostBit - 1);
      // lower bits
      int numberOfLowerBits = numberOfBitsPerValue - numberOfBitsStoredInCompressedLong;
      long lowerBits = compressedValues[compressedPos + 1] & createBitMask(64 - numberOfLowerBits, 63);
      value |= lowerBits >>> (64 - numberOfLowerBits);
    }

    // Check if the compressed value is negative.
    long signBit = 1L << (numberOfBitsPerValue - 1);
    if (containsSignBit && (value & signBit) != 0) {
      value &= ~signBit; // unset uppermost bit
      value = -value;
    }

    return value;
  }

  /**
   * Writes the state of this array into the given thrift object. If there are no compressed values (input consisted of
   * {@link Long#MIN_VALUE}s only), an empty list of compressed values is written.
   */
  @Override
  public void serialize(DataSerializationHelper mgr, SLongCompressedArrayBitEfficient target)
      throws SerializationException {
    target.setSize(size);
    target.setNumberOfBitsPerValue(numberOfBitsPerValue);
    target.setIsSorted(isSorted);
    target.setIsSameValue(isSameValue);
    target.setContainsSignBit(containsSignBit);
    target.setMinValue(minValue);
    target.setAbsoluteMinValue(absoluteMinValue);
    target.setMaxValue(maxValue);
    if (longMinValueLocations != null)
      target.setLongMinValueLocations(IntStream.of(longMinValueLocations).boxed().collect(Collectors.toList()));
    // compressedValues is null if the input contained only Long.MIN_VALUEs; LongStream.of(null) would throw a
    // NullPointerException, so serialize an empty list in that case.
    long[] valuesToSerialize = (compressedValues != null) ? compressedValues : new long[0];
    target.setCompressedValues(LongStream.of(valuesToSerialize).boxed().collect(Collectors.toList()));
  }

  /**
   * Restores the state of this array from the given thrift object. {@link #numberOfLongMinValues} is not read from the
   * source but derived from the deserialized locations of {@link Long#MIN_VALUE}.
   */
  @Override
  public void deserialize(DataSerializationHelper mgr, SLongCompressedArrayBitEfficient source)
      throws DeserializationException {
    size = source.getSize();
    numberOfBitsPerValue = source.getNumberOfBitsPerValue();
    isSorted = source.isIsSorted();
    isSameValue = source.isIsSameValue();
    containsSignBit = source.isContainsSignBit();

    List<Long> serializedValues = source.getCompressedValues();
    compressedValues =
        (serializedValues != null) ? serializedValues.stream().mapToLong(Long::longValue).toArray() : null;

    // Explicitly reset to null if absent, so no state of an earlier deserialization/compression survives.
    if (source.isSetLongMinValueLocations())
      longMinValueLocations = source.getLongMinValueLocations().stream().mapToInt(Integer::intValue).toArray();
    else
      longMinValueLocations = null;
    numberOfLongMinValues = (longMinValueLocations != null) ? longMinValueLocations.length : 0;

    minValue = source.getMinValue();
    absoluteMinValue = source.getAbsoluteMinValue();
    maxValue = source.getMaxValue();
  }

  /**
   * Both indices are expected to be in the range 0..63; if the lower index is greater than the upper index, no bit is
   * set.
   *
   * @return A long having the bits at the specific locations set (lower- and upper-bound indices included)
   */
  private static long createBitMask(int idxLowestBitSet, int idxUppermostBitSet) {
    if (idxLowestBitSet > idxUppermostBitSet)
      return 0L;
    int numberOfBitsSet = idxUppermostBitSet - idxLowestBitSet + 1;
    // (1L << 64) would wrap around to 1L, so the full-width mask needs to be special-cased.
    long lowBits = (numberOfBitsSet >= 64) ? -1L : (1L << numberOfBitsSet) - 1;
    return lowBits << idxLowestBitSet;
  }

  /**
   * Calculates the ratio of average bits used per entry after compression to the 64 bits used per uncompressed entry.
   * Each {@link Long#MIN_VALUE} is accounted with 64 bits.
   */
  private static double calculateApproxCompressionRatio(int numberOfBitsPerValue, boolean isSameValue, int size,
      int numberOfLongMinValues) {
    if (size == 0)
      return 1.;

    if (numberOfLongMinValues > 0 && isSameValue)
      // Only MIN_VALUEs.
      return 1.;

    // All arithmetic is done in double: "numberOfLongMinValues * 64" as int could overflow for large arrays.
    double avgBitsPerEntryCompressed =
        ((((double) size - numberOfLongMinValues) * numberOfBitsPerValue) + numberOfLongMinValues * 64.) / size;
    double uncompressedNumberOfBitsPerValue = 64.;

    return avgBitsPerEntryCompressed / uncompressedNumberOfBitsPerValue;
  }

  @Override
  public long calculateApproximateSizeInBytes() {
    // Use long arithmetic throughout, int multiplication could overflow for very large arrays.
    long compressedValuesBytes = (compressedValues != null) ? compressedValues.length * 8L : 0L;
    long longMinValueLocationsBytes = (longMinValueLocations != null) ? longMinValueLocations.length * 4L : 0L;
    return 16 + // object header of this
        39 + // small fields (= non array fields)
        compressedValuesBytes + //
        longMinValueLocationsBytes;
  }

  /**
   * Calculate an approximate compression ratio based not on a full array (use
   * {@link #expectedCompressionRatio(long[], boolean)} for that), but based on a few values calculated for an input
   * array.
   *
   * @param min
   *          Minimum value of the input array. If the array contains {@link Long#MIN_VALUE}, this parameter should
   *          contain the value of the <b>second smallest</b> number in the array (= ignore {@link Long#MIN_VALUE}!).
   *          This parameter is not expected to be set to {@link Long#MIN_VALUE}. This value is ignored if the input
   *          array only contains {@link Long#MIN_VALUE} (-> size == numberOfLongMinValues).
   * @param max
   *          Maximum value of the input array. This value is ignored if the input array only contains
   *          {@link Long#MIN_VALUE} (-> size == numberOfLongMinValues).
   * @param size
   *          Overall number of elements in the input array.
   * @param numberOfLongMinValues
   *          The number of times the input array contains the value {@link Long#MIN_VALUE}.
   * @return Compression ratio as defined as result in {@link #expectedCompressionRatio(long[], boolean)}.
   */
  public static double calculateApproxCompressionRatio(long min, long max, int size, int numberOfLongMinValues) {
    if (size == numberOfLongMinValues)
      return 1.;

    // As soon as there is one negative value, every compressed value carries a sign bit - also the positive ones. As
    // min <= max, a negative max implies a negative min.
    int numberOfSignBits = (min < 0) ? 1 : 0;
    int numberOfBitsPerMinValue = numberOfBitsNeededForPositiveLong(Math.abs(min)) + numberOfSignBits;
    int numberOfBitsPerMaxValue = numberOfBitsNeededForPositiveLong(Math.abs(max)) + numberOfSignBits;

    boolean sameValue = min == max && numberOfLongMinValues == 0;

    return calculateApproxCompressionRatio(Math.max(1, Math.max(numberOfBitsPerMinValue, numberOfBitsPerMaxValue)),
        sameValue, size, numberOfLongMinValues);
  }
}