/** * diqube: Distributed Query Base. * * Copyright (C) 2015 Bastian Gloeckle * * This file is part of diqube. * * diqube is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as * published by the Free Software Foundation, either version 3 of the * License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package org.diqube.loader.compression; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.NavigableMap; import java.util.SortedSet; import java.util.TreeMap; import java.util.function.Function; import java.util.function.Supplier; import java.util.stream.Stream; import org.apache.thrift.TBase; import org.diqube.data.serialize.DeserializationException; import org.diqube.data.serialize.SerializationException; import org.diqube.data.types.str.dict.ParentNode; import org.diqube.data.types.str.dict.StringDictionary; import org.diqube.data.types.str.dict.TerminalNode; import org.diqube.data.types.str.dict.TrieNode; import org.diqube.data.types.str.dict.TrieStringDictionary; import org.diqube.util.Pair; import org.diqube.util.SortedSetUnionStreamSupplier; import com.google.common.base.Strings; /** * Builds a compressed string dictionary out of a map that contains values and temporary ids. * * TODO #83: Extract super-interface. * * @author Bastian Gloeckle */ public class CompressedStringDictionaryBuilder { private NavigableMap<String, Long> entityMap; /** * @param entityMap * From decompressed string value to temporary Column Value IDs that have been assigned already. */ public CompressedStringDictionaryBuilder fromEntityMap(NavigableMap<String, Long> entityMap) { this.entityMap = entityMap; return this; } /** * Build the dictionary. * * @return {@link Pair} containing the new {@link StringDictionary} and an ID change map (maps from temporary ID that * was provided in {@link #fromEntityMap(Map)} to the final ID assigned in the resulting dict). */ public Pair<StringDictionary<?>, Map<Long, Long>> build() { SortedSet<String> keys = (SortedSet<String>) entityMap.keySet(); Map<Long, Long> idMap = new HashMap<>(); long newId = 0; for (String key : keys) { long curId = newId++; if (entityMap.get(key) != curId) idMap.put(entityMap.get(key), curId); } ConstructionParentNode root = new ConstructionParentNode(); ConstructionParentNode curNode = root; String curNodePrefix = ""; newId = 0; // note that the keys are traversed in sorted order already! for (String stringValue : keys) { // go up the current tree until our prefix matches, this might go up as far as the root node! while (!stringValue.startsWith(curNodePrefix)) { curNodePrefix = curNodePrefix.substring(0, curNodePrefix.length() - curNode.getParentToThisStringLength()); curNode = curNode.getParent(); } String remaining = stringValue.substring(curNodePrefix.length(), stringValue.length()).intern(); // check if there is a key that has a common prefix with our key. Note that there can be only one such key! See // class comment of TrieStringDictionary for why this is true. List<String> possiblyInterestingKeys = new LinkedList<>(); possiblyInterestingKeys.add(curNode.getChildTerminals().floorKey(remaining)); possiblyInterestingKeys.add(curNode.getChildTerminals().ceilingKey(remaining)); possiblyInterestingKeys.add(curNode.getChildNodes().floorKey(remaining)); possiblyInterestingKeys.add(curNode.getChildNodes().ceilingKey(remaining)); String interestingKey = null; String interestingCommonPrefix = null; for (String possiblyInterestingKey : possiblyInterestingKeys) { if (possiblyInterestingKey == null || possiblyInterestingKey.equals("")) // ignore the empty-string-terminal nodes - they will not match our new string. continue; String tmp = Strings.commonPrefix(possiblyInterestingKey, remaining); if (!"".equals(tmp)) { interestingKey = possiblyInterestingKey; interestingCommonPrefix = tmp.intern(); break; } } if (interestingKey != null) { // we found an entry with a common prefix - create new parent node and move the old node there and our new // string, too. ConstructionParentNode newParent = new ConstructionParentNode(); newParent.setParent(curNode); newParent.setParentToThisStringLength(interestingCommonPrefix.length()); newParent.getChildTerminals().put(removePrefix(remaining, interestingCommonPrefix), new TerminalNode(newId++)); if (curNode.getChildNodes().containsKey(interestingKey)) { ConstructionParentNode nodeToMove = curNode.getChildNodes().get(interestingKey); nodeToMove .setParentToThisStringLength(nodeToMove.getParentToThisStringLength() - interestingCommonPrefix.length()); newParent.getChildNodes().put(removePrefix(interestingKey, interestingCommonPrefix), nodeToMove); curNode.getChildNodes().remove(interestingKey); } else { // curNode.getChildTerminals().containsKey(interestingKey) newParent.getChildTerminals().put(removePrefix(interestingKey, interestingCommonPrefix), curNode.getChildTerminals().get(interestingKey)); curNode.getChildTerminals().remove(interestingKey); } curNode.getChildNodes().put(interestingCommonPrefix, newParent); // continue working in the new parent. curNode = newParent; curNodePrefix += interestingCommonPrefix; } else { // there was no node with a common prefix. add a new terminal node! curNode.getChildTerminals().put(remaining, new TerminalNode(newId++)); } } TrieStringDictionary res = new TrieStringDictionary(root.constructFinalNode(), entityMap.firstKey(), entityMap.lastKey(), entityMap.size() - 1); return new Pair<>(res, idMap); } private String removePrefix(String orig, String prefix) { if (prefix.length() == orig.length()) return "".intern(); return orig.substring(prefix.length(), orig.length()).intern(); } /** * Just like a {@link ParentNode}, but with additional information that is required while building the trie. * * After building the trie, for an instance of this class the real {@link ParentNode} can be created using * {@link #constructFinalNode()}. */ private static class ConstructionParentNode extends TrieNode<TBase<?, ?>> { private int parentToThisStringLength; private ConstructionParentNode parent; private NavigableMap<String, ConstructionParentNode> childNodes = new TreeMap<>(); private NavigableMap<String, TerminalNode> childTerminals = new TreeMap<>(); public NavigableMap<String, ConstructionParentNode> getChildNodes() { return childNodes; } public NavigableMap<String, TerminalNode> getChildTerminals() { return childTerminals; } public ConstructionParentNode getParent() { return parent; } public int getParentToThisStringLength() { return parentToThisStringLength; } public void setParentToThisStringLength(int parentToThisStringLength) { this.parentToThisStringLength = parentToThisStringLength; } public void setParent(ConstructionParentNode parent) { this.parent = parent; } /** * @return The actual {@link ParentNode} object for this {@link ConstructionParentNode}. This method actually * returns the recursive result, where all child nodes are created and returned, too - correctly wired of * course. */ public ParentNode constructFinalNode() { Function<String, TrieNode<?>> getFinalTrieNode = new Function<String, TrieNode<?>>() { @Override public TrieNode<?> apply(String key) { if (childTerminals.containsKey(key)) return childTerminals.get(key); return childNodes.get(key).constructFinalNode(); } }; Supplier<Stream<String>> allKeyStream = new SortedSetUnionStreamSupplier<>( // (SortedSet<String>) this.childNodes.keySet(), (SortedSet<String>) this.childTerminals.keySet()); TrieNode<?>[] childNodes = allKeyStream.get().map(getFinalTrieNode).toArray(l -> new TrieNode[l]); char[][] childChars = allKeyStream.get().map(s -> s.toCharArray()).toArray(l -> new char[l][]); long minId, maxId; if (childNodes[0] instanceof TerminalNode) minId = ((TerminalNode) childNodes[0]).getTerminalId(); else minId = ((ParentNode) childNodes[0]).getMinId(); if (childNodes[childNodes.length - 1] instanceof TerminalNode) maxId = ((TerminalNode) childNodes[childNodes.length - 1]).getTerminalId(); else maxId = ((ParentNode) childNodes[childNodes.length - 1]).getMaxId(); return new ParentNode(childChars, childNodes, minId, maxId); } @Override public void serialize(org.diqube.data.serialize.DataSerialization.DataSerializationHelper mgr, TBase<?, ?> target) throws SerializationException { // noop } @Override public void deserialize(org.diqube.data.serialize.DataSerialization.DataSerializationHelper mgr, TBase<?, ?> source) throws DeserializationException { // noop } @Override public long calculateApproximateSizeInBytes() { return 0; } } }