/** * diqube: Distributed Query Base. * * Copyright (C) 2015 Bastian Gloeckle * * This file is part of diqube. * * diqube is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.diqube.data.types.lng.array; import java.util.ArrayList; import java.util.List; import java.util.TreeSet; import java.util.function.Supplier; import java.util.stream.Collectors; import java.util.stream.LongStream; import org.diqube.data.serialize.DataSerializable; import org.diqube.data.serialize.DeserializationException; import org.diqube.data.serialize.SerializationException; import org.diqube.data.serialize.thrift.v1.SLongCompressedArray; import org.diqube.data.serialize.thrift.v1.SLongCompressedArrayRLE; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.collect.Iterators; import com.google.common.collect.PeekingIterator; /** * Run-Length-Encoding for long arrays. * * <p> * A run length encoding encodes repeated values into a count and the value itself. Therefore this encoding is * meaningful if the input array contains long consecutive parts with the same value. * * <p> * This {@link CompressedLongArray} implements {@link TransitiveExplorableCompressedLongArray} and can therefore be * backed by other {@link ExplorableCompressedLongArray} where two arrays would be created: One holding the counts and * one holding the values themselves. * * <p> * Be aware that the simple {@link #get(int)} method has linear runtime in this compression! * * @author Bastian Gloeckle */ @DataSerializable(thriftClass = SLongCompressedArrayRLE.class) public class RunLengthLongArray extends AbstractTransitiveExplorableCompressedLongArray<SLongCompressedArrayRLE> { private static final Logger logger = LoggerFactory.getLogger(RunLengthLongArray.class); /** If the decompressed array is sorted. */ private boolean isSorted; private long numberOfDifferentTuples; private long maxValue; private long minValue; private long secondMinValue; private long maxCount; private long minCount; /** Contains compressed values. If <code>null</code>, the compressed values are in {@link #delegateCompressedValue} */ private long[] compressedValues; /** * Contains compressed counts. If <code>null</code>, the compressed values are in {@link #delegateCompressedCounts} */ private long[] compressedCounts; /** Contains compressed values. If <code>null</code>, the compressed values are in {@link #compressedValues} */ private ExplorableCompressedLongArray<?> delegateCompressedValue = null; /** Contains compressed counts. If <code>null</code>, the compressed values are in {@link #compressedCounts} */ private ExplorableCompressedLongArray<?> delegateCompressedCounts = null; /** size of the uncompressed array */ private int size; public RunLengthLongArray() { super(); } public RunLengthLongArray(long[] inputArray, boolean isSorted) { super(); compress(inputArray, isSorted); } @Override protected void doPrepareCompression(long[] inputArray, boolean isSorted) { this.isSorted = isSorted; size = inputArray.length; maxValue = Long.MIN_VALUE; minValue = Long.MAX_VALUE; secondMinValue = Long.MAX_VALUE; maxCount = Long.MIN_VALUE; minCount = Long.MAX_VALUE; numberOfDifferentTuples = 1; long lastValue; long lastCount = 0; lastValue = inputArray[0]; for (int pos = 0; pos < inputArray.length; pos++) { if (inputArray[pos] == lastValue) lastCount++; else { if (lastCount > maxCount) maxCount = lastCount; if (lastCount < minCount) minCount = lastCount; lastValue = inputArray[pos]; lastCount = 1; numberOfDifferentTuples++; } if (lastValue > maxValue) maxValue = lastValue; if (lastValue < minValue) { if (minValue < secondMinValue) secondMinValue = minValue; minValue = lastValue; } if (lastValue < secondMinValue && lastValue > minValue) secondMinValue = lastValue; } if (lastCount > maxCount) maxCount = lastCount; if (lastCount < minCount) minCount = lastCount; if (secondMinValue > maxValue) secondMinValue = maxValue; } @Override protected double doTransitiveExpectedCompressionRatio(long[] inputArray, boolean isSorted, TransitiveCompressionRatioCalculator transitiveCalculator) { double valueCompression = transitiveCalculator.calculateTransitiveCompressionRatio(minValue, secondMinValue, maxValue, numberOfDifferentTuples); double countCompression = transitiveCalculator.calculateTransitiveCompressionRatio(minCount, minCount, maxCount, numberOfDifferentTuples); double resRatio = (numberOfDifferentTuples * (valueCompression + countCompression)) / size; logger.trace( "Res ratio: {}, Value ratio: {}, count ratio: {}, minValue: {}, secondMinValue: {}, " + "maxValue: {}, minCount: {}, maxCount: {}, numberOfDifferentTuples: {}", resRatio, valueCompression, countCompression, minValue, secondMinValue, maxValue, minCount, maxCount, numberOfDifferentTuples); return resRatio; } @Override protected double doExpectedCompressionRatio(long[] inputArray, boolean isSorted) { return (numberOfDifferentTuples * 2) / inputArray.length; } @Override protected void doCompress(long[] inputArray, boolean isSorted) { compressedValues = new long[(int) numberOfDifferentTuples]; compressedCounts = new long[(int) numberOfDifferentTuples]; int resultPos = 0; long lastValue; long lastCount = 0; lastValue = inputArray[0]; for (int pos = 0; pos < inputArray.length; pos++) { if (inputArray[pos] == lastValue) lastCount++; else { compressedCounts[resultPos] = lastCount; compressedValues[resultPos] = lastValue; resultPos++; lastValue = inputArray[pos]; lastCount = 1; } } compressedCounts[resultPos] = lastCount; compressedValues[resultPos] = lastValue; } @Override public void compress(long[] inputArray, boolean isSorted, Supplier<ExplorableCompressedLongArray<?>> transitiveSupplier) throws IllegalStateException { compress(inputArray, isSorted); delegateCompressedCounts = transitiveSupplier.get(); delegateCompressedCounts.compress(compressedCounts, false); compressedCounts = null; delegateCompressedValue = transitiveSupplier.get(); delegateCompressedValue.compress(compressedValues, isSorted); compressedValues = null; } @Override public boolean isSameValue() { return numberOfDifferentTuples == 1; } @Override public int size() { return size; } @Override public boolean isSorted() { return isSorted; } @Override public long[] decompressedArray() { int internalSize; if (compressedCounts != null) internalSize = compressedCounts.length; else internalSize = delegateCompressedCounts.size(); long[] res = new long[size]; int resPos = 0; for (int internalPos = 0; internalPos < internalSize; internalPos++) { long count; long value; if (compressedCounts != null) { count = compressedCounts[internalPos]; value = compressedValues[internalPos]; } else { count = delegateCompressedCounts.get(internalPos); value = delegateCompressedValue.get(internalPos); } for (int i = 0; i < count; i++) res[resPos++] = value; } return res; } @Override public long get(int index) throws ArrayIndexOutOfBoundsException { int internalSize; if (compressedCounts != null) internalSize = compressedCounts.length; else internalSize = delegateCompressedCounts.size(); if (index < 0 || index >= size()) throw new ArrayIndexOutOfBoundsException("Index out of bounds"); if (index == 0) { if (compressedValues != null) return compressedValues[0]; return delegateCompressedValue.get(0); } int decompressedCount = 0; for (int pos = 0; pos < internalSize - 1; pos++) { long lengthValue; if (compressedValues != null) lengthValue = compressedCounts[pos]; else lengthValue = delegateCompressedCounts.get(pos); decompressedCount += lengthValue; if (decompressedCount > index) { if (compressedValues != null) return compressedValues[pos]; return delegateCompressedValue.get(pos); } } // should never happen if (compressedValues != null) return compressedValues[internalSize - 1]; return delegateCompressedValue.get(internalSize - 1); } @Override public List<Long> getMultiple(List<Integer> sortedIndices) throws ArrayIndexOutOfBoundsException { // first: Find the internal indices that we need to resolve List<Integer> internalIndicesToResolveSorted = new ArrayList<>(); try { int internalSize; if (compressedCounts != null) internalSize = compressedCounts.length; else internalSize = delegateCompressedCounts.size(); PeekingIterator<Integer> sortedIndicesIt = Iterators.peekingIterator(sortedIndices.iterator()); if (sortedIndicesIt.peek() < 0 || sortedIndicesIt.peek() >= size) throw new ArrayIndexOutOfBoundsException("Array index out of bounds: Requested index " + sortedIndicesIt.peek() + " but have only " + size + " elements."); int decompressedCount = 0; for (int pos = 0; pos < internalSize && sortedIndicesIt.hasNext(); pos++) { long lengthValue; if (compressedValues != null) lengthValue = compressedCounts[pos]; else lengthValue = delegateCompressedCounts.get(pos); decompressedCount += lengthValue; while (sortedIndicesIt.hasNext() && sortedIndicesIt.peek() < decompressedCount) { internalIndicesToResolveSorted.add(pos); sortedIndicesIt.next(); if (sortedIndicesIt.hasNext() && (sortedIndicesIt.peek() < 0 || sortedIndicesIt.peek() >= size)) throw new ArrayIndexOutOfBoundsException("Array index out of bounds: Requested index " + sortedIndicesIt.peek() + " but have only " + size + " elements."); } } } catch (Throwable t) { throw t; } // second: resolve those internal indices List<Long> res = new ArrayList<>(); if (compressedValues != null) { for (int idx : internalIndicesToResolveSorted) res.add(compressedValues[idx]); } else { // unique-ify indices to resolve List<Integer> delegateIdx = new ArrayList<>(new TreeSet<>(internalIndicesToResolveSorted)); List<Long> delegateRes = delegateCompressedValue.getMultiple(delegateIdx); PeekingIterator<Integer> delegateIdxIt = Iterators.peekingIterator(delegateIdx.iterator()); PeekingIterator<Long> delegateResIt = Iterators.peekingIterator(delegateRes.iterator()); for (int idx : internalIndicesToResolveSorted) { while (delegateIdxIt.peek() != idx) { delegateIdxIt.next(); delegateResIt.next(); } res.add(delegateResIt.peek()); } } return res; } @Override public void serialize(DataSerializationHelper mgr, SLongCompressedArrayRLE target) throws SerializationException { target.setSize(size); target.setIsSorted(isSorted); target.setNumberOfDifferentTuples(numberOfDifferentTuples); target.setMaxValue(maxValue); target.setMaxCount(maxCount); target.setMinValue(minValue); target.setSecondMinValue(secondMinValue); target.setMinCount(minCount); if (compressedValues != null) { target.setCompressedValues(LongStream.of(compressedValues).boxed().collect(Collectors.toList())); target.setCompressedCounts(LongStream.of(compressedCounts).boxed().collect(Collectors.toList())); } else { target.setDelegateCompressedValue(mgr.serializeChild(SLongCompressedArray.class, delegateCompressedValue)); target.setDelegateCompressedCounts(mgr.serializeChild(SLongCompressedArray.class, delegateCompressedCounts)); } } @SuppressWarnings("unchecked") @Override public void deserialize(DataSerializationHelper mgr, SLongCompressedArrayRLE source) throws DeserializationException { size = source.getSize(); isSorted = source.isIsSorted(); numberOfDifferentTuples = source.getNumberOfDifferentTuples(); maxValue = source.getMaxValue(); maxCount = source.getMaxCount(); minValue = source.getMinValue(); secondMinValue = source.getSecondMinValue(); minCount = source.getMinCount(); if (source.isSetCompressedValues()) { compressedValues = source.getCompressedValues().stream().mapToLong(Long::longValue).toArray(); compressedCounts = source.getCompressedCounts().stream().mapToLong(Long::longValue).toArray(); delegateCompressedValue = null; delegateCompressedCounts = null; } else { delegateCompressedValue = mgr.deserializeChild(ExplorableCompressedLongArray.class, source.getDelegateCompressedValue()); delegateCompressedCounts = mgr.deserializeChild(ExplorableCompressedLongArray.class, source.getDelegateCompressedCounts()); compressedValues = null; compressedCounts = null; } } @Override public long calculateApproximateSizeInBytes() { return 16 + // object header of this 53 + // small fields ((compressedCounts != null) ? compressedCounts.length * 8 : 0) + ((compressedValues != null) ? compressedValues.length * 8 : 0) + ((delegateCompressedCounts != null) ? delegateCompressedCounts.calculateApproximateSizeInBytes() : 0) + ((delegateCompressedValue != null) ? delegateCompressedValue.calculateApproximateSizeInBytes() : 0); } }