/**
 * diqube: Distributed Query Base.
 *
 * Copyright (C) 2015 Bastian Gloeckle
 *
 * This file is part of diqube.
 *
 * diqube is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package org.diqube.loader.compression;

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;

import org.diqube.data.types.lng.array.BitEfficientLongArray;
import org.diqube.data.types.lng.array.CompressedLongArray;
import org.diqube.data.types.lng.array.ReferenceBasedLongArray;
import org.diqube.data.types.lng.array.RunLengthLongArray;
import org.diqube.data.types.lng.dict.ArrayCompressedLongDictionary;
import org.diqube.data.types.lng.dict.ConstantLongDictionary;
import org.diqube.data.types.lng.dict.EmptyLongDictionary;
import org.diqube.data.types.lng.dict.LongDictionary;
import org.diqube.loader.compression.CompressedLongArrayBuilder.BitEfficientCompressionStrategy;
import org.diqube.loader.compression.CompressedLongArrayBuilder.ReferenceAndBitEfficientCompressionStrategy;
import org.diqube.util.Pair;

/**
 * Builds compressed {@link LongDictionary}s.
 *
 * <p>
 * It currently builds {@link ArrayCompressedLongDictionary} objects with the better of two compressions (for empty or
 * single-value inputs it builds an {@link EmptyLongDictionary} or {@link ConstantLongDictionary} instead):
 *
 * <ul>
 * <li>A plain {@link BitEfficientLongArray}</li>
 * <li>A {@link ReferenceBasedLongArray} with a {@link BitEfficientLongArray} inside.</li>
 * </ul>
 *
 * These compressions are well suited for dictionaries. {@link BitEfficientLongArray} uses a special case for
 * {@link Long#MIN_VALUE}, with which the {@link BitEfficientLongArray#get(int)} method might degenerate to O(log m),
 * m being the number of MIN_VALUEs in the array. As a dictionary contains each value only once, though,
 * {@link Long#MIN_VALUE} appears in the array at most once and log m is effectively constant - the array-access
 * complexity is therefore constant in both compression scenarios, and access to the dictionary itself is at most
 * logarithmic. As a counterexample: it would not be meaningful to use {@link RunLengthLongArray}s in dictionaries, as
 * their get method is already linear and we would end up with a worst-case dictionary access of O(n log m).
 *
 * TODO #83: Extract super-interface.
 *
 * @author Bastian Gloeckle
 */
public class CompressedLongDictionaryBuilder {
  private NavigableMap<Long, Long> entityMap;
  private String name;

  /**
   * @param entityMap
   *          Map from decompressed long value to the temporary ID that has already been assigned to it.
   */
  public CompressedLongDictionaryBuilder fromEntityMap(NavigableMap<Long, Long> entityMap) {
    this.entityMap = entityMap;
    return this;
  }

  public CompressedLongDictionaryBuilder withDictionaryName(String name) {
    this.name = name;
    return this;
  }

  /**
   * Compresses the values and builds a new {@link LongDictionary} from them.
   *
   * <p>
   * In addition to the new {@link LongDictionary}, this method returns a map that maps the temporary value IDs (as
   * provided in the map in {@link #fromEntityMap(NavigableMap)}) to the final IDs assigned by this builder. That map
   * contains an entry only if the ID of a specific value was actually changed by this builder.
   *
   * @return A {@link Pair} of the newly built {@link LongDictionary} and the ID map (from temporary IDs to final IDs,
   *         containing only the tuples where the ID was actually changed).
   */
  public Pair<LongDictionary<?>, Map<Long, Long>> build() {
    if (entityMap.size() == 0) {
      return new Pair<>(new EmptyLongDictionary(), new HashMap<>());
    } else if (entityMap.size() == 1) {
      Entry<Long, Long> entry = entityMap.entrySet().iterator().next();
      LongDictionary<?> dict = new ConstantLongDictionary(entry.getKey());
      Map<Long, Long> valueMap = new HashMap<>();
      if (entry.getValue() != 0L)
        valueMap.put(entry.getValue(), 0L);
      return new Pair<>(dict, valueMap);
    }

    Map<Long, Long> idMap = new HashMap<Long, Long>();

    // entityMap is sorted by value; the final ID of each value is its index in that ordering.
    long[] uncompressed = new long[entityMap.size()];
    Iterator<Entry<Long, Long>> entryIt = entityMap.entrySet().iterator();
    for (int i = 0; i < uncompressed.length; i++) {
      Entry<Long, Long> entry = entryIt.next();
      uncompressed[i] = entry.getKey();
      if (i != entry.getValue()) {
        // Remember that the temporary ID of this value was replaced by the final ID i.
        idMap.put(entry.getValue(), (long) i);
      }
    }

    @SuppressWarnings("unchecked")
    CompressedLongArrayBuilder compressedBuilder =
        new CompressedLongArrayBuilder().withLogName(name).withValues(uncompressed)
            .withStrategies(BitEfficientCompressionStrategy.class, ReferenceAndBitEfficientCompressionStrategy.class);
    CompressedLongArray<?> compressedArray = compressedBuilder.build();

    LongDictionary<?> dictRes = new ArrayCompressedLongDictionary(compressedArray);

    return new Pair<>(dictRes, idMap);
  }
}
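/*
 * Usage sketch (illustrative addition, not taken from the original source; the column name and the values below are
 * made up, and java.util.TreeMap would have to be imported). A loader would typically pass in a NavigableMap from
 * decompressed value to the temporary ID it assigned while reading the data:
 *
 *   NavigableMap<Long, Long> tempIdsByValue = new TreeMap<>();
 *   tempIdsByValue.put(10L, 2L);  // value 10 was temporarily assigned ID 2
 *   tempIdsByValue.put(42L, 0L);  // value 42 was temporarily assigned ID 0
 *   tempIdsByValue.put(500L, 1L); // value 500 was temporarily assigned ID 1
 *
 *   Pair<LongDictionary<?>, Map<Long, Long>> res = new CompressedLongDictionaryBuilder() //
 *       .withDictionaryName("exampleCol") //
 *       .fromEntityMap(tempIdsByValue) //
 *       .build();
 *
 * The builder assigns final IDs in ascending value order (10 -> 0, 42 -> 1, 500 -> 2), so the returned ID map in this
 * example would be {2=0, 0=1, 1=2}, i.e. every temporary ID was changed.
 */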