// Copyright 2014 The Bazel Authors. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package com.google.devtools.build.lib.util; import static java.nio.charset.StandardCharsets.UTF_8; import com.google.common.collect.Lists; import com.google.devtools.build.lib.concurrent.ThreadSafety.ThreadCompatible; import com.google.devtools.build.lib.concurrent.ThreadSafety.ThreadSafe; import java.util.ArrayList; /** * Provides memory-efficient bidirectional mapping String <-> unique integer. * Uses byte-wise compressed prefix trie internally. * <p> * Class allows index retrieval for the given string, addition of the new * index and string retrieval for the given index. It also allows efficient * serialization of the internal data structures. * <p> * Internally class stores list of nodes with each node containing byte[] * representation of compressed trie node: * <pre> * varint32 parentIndex; // index of the parent node * varint32 keylen; // length of the node key * byte[keylen] key; // node key data * repeated jumpEntry { // Zero or more jump entries, referencing child nodes * byte key // jump key (first byte of the child node key) * varint32 nodeIndex // child index * } * <p> * Note that jumpEntry key byte is actually duplicated in the child node * instance. This is done to improve performance of the index->string * lookup (so we can avoid jump table parsing during this lookup). * <p> * Root node of the trie must have parent id pointing to itself. * <p> * TODO(bazel-team): (2010) Consider more fine-tuned locking mechanism - e.g. * distinguishing between read and write locks. */ @ThreadSafe public class CompactStringIndexer extends AbstractIndexer { private static final int NOT_FOUND = -1; private ArrayList<byte[]> nodes; // Compressed prefix trie nodes. private int rootId; // Root node id. /* * Creates indexer instance. */ public CompactStringIndexer (int expectedCapacity) { Preconditions.checkArgument(expectedCapacity > 0); nodes = Lists.newArrayListWithExpectedSize(expectedCapacity); rootId = NOT_FOUND; } /** * Allocates new node index. Must be called only from * synchronized methods. */ private int allocateIndex() { nodes.add(null); return nodes.size() - 1; } /** * Replaces given node record with the new one. Must be called only from * synchronized methods. * <p> * Subclasses can override this method to be notified when an update actually * takes place. */ @ThreadCompatible protected void updateNode(int index, byte[] content) { nodes.set(index, content); } /** * Returns parent id for the given node content. * * @return parent node id */ private int getParentId(byte[] content) { int[] intHolder = new int[1]; VarInt.getVarInt(content, 0, intHolder); return intHolder[0]; } /** * Creates new node using specified key suffix. Must be called from * synchronized methods. * * @param parentNode parent node id * @param key original key that is being added to the indexer * @param offset node key offset in the original key. * * @return new node id corresponding to the given key */ private int createNode(int parentNode, byte[] key, int offset) { int index = allocateIndex(); int len = key.length - offset; Preconditions.checkState(len >= 0); // Content consists of parent id, key length and key. There are no jump records. byte[] content = new byte[VarInt.varIntSize(parentNode) + VarInt.varIntSize(len) + len]; // Add parent id. int contentOffset = VarInt.putVarInt(parentNode, content, 0); // Add node key length. contentOffset = VarInt.putVarInt(len, content, contentOffset); // Add node key content. System.arraycopy(key, offset, content, contentOffset, len); updateNode(index, content); return index; } /** * Updates jump entry index in the given node. * * @param node node id to update * @param oldIndex old jump entry index * @param newIndex updated jump entry index */ private void updateJumpEntry(int node, int oldIndex, int newIndex) { byte[] content = nodes.get(node); int[] intHolder = new int[1]; int offset = VarInt.getVarInt(content, 0, intHolder); // parent id offset = VarInt.getVarInt(content, offset, intHolder); // key length offset += intHolder[0]; // Offset now points to the first jump entry. while (offset < content.length) { int next = VarInt.getVarInt(content, offset + 1, intHolder); // jump index if (intHolder[0] == oldIndex) { // Substitute oldIndex value with newIndex. byte[] newContent = new byte[content.length + VarInt.varIntSize(newIndex) - VarInt.varIntSize(oldIndex)]; System.arraycopy(content, 0, newContent, 0, offset + 1); offset = VarInt.putVarInt(newIndex, newContent, offset + 1); System.arraycopy(content, next, newContent, offset, content.length - next); updateNode(node, newContent); return; } else { offset = next; } } StringBuilder builder = new StringBuilder().append("Index ").append(oldIndex) .append(" is not present in the node ").append(node).append(", "); dumpNodeContent(builder, content); throw new IllegalArgumentException(builder.toString()); } /** * Creates new branch node content at the predefined location, splitting * prefix from the given node and optionally adding another child node * jump entry. * * @param originalNode node that will be split * @param newBranchNode new branch node id * @param splitOffset offset at which to split original node key * @param indexKey optional additional jump key * @param childIndex optional additional jump index. Optional jump entry will * be skipped if this index is set to NOT_FOUND. */ private void createNewBranchNode(int originalNode, int newBranchNode, int splitOffset, byte indexKey, int childIndex) { byte[] content = nodes.get(originalNode); int[] intHolder = new int[1]; int keyOffset = VarInt.getVarInt(content, 0, intHolder); // parent id // If original node is a root node, new branch node will become new root. So set parent id // appropriately (for root node it is set to the node's own id). int parentIndex = (originalNode == intHolder[0] ? newBranchNode : intHolder[0]); keyOffset = VarInt.getVarInt(content, keyOffset, intHolder); // key length Preconditions.checkState(intHolder[0] >= splitOffset); // Calculate new content size. int newSize = VarInt.varIntSize(parentIndex) + VarInt.varIntSize(splitOffset) + splitOffset + 1 + VarInt.varIntSize(originalNode) + (childIndex != NOT_FOUND ? 1 + VarInt.varIntSize(childIndex) : 0); // New content consists of parent id, new key length, truncated key and two jump records. byte[] newContent = new byte[newSize]; // Add parent id. int contentOffset = VarInt.putVarInt(parentIndex, newContent, 0); // Add adjusted key length. contentOffset = VarInt.putVarInt(splitOffset, newContent, contentOffset); // Add truncated key content and first jump key. System.arraycopy(content, keyOffset, newContent, contentOffset, splitOffset + 1); // Add index for the first jump key. contentOffset = VarInt.putVarInt(originalNode, newContent, contentOffset + splitOffset + 1); // If requested, add additional jump entry. if (childIndex != NOT_FOUND) { // Add second jump key. newContent[contentOffset] = indexKey; // Add index for the second jump key. VarInt.putVarInt(childIndex, newContent, contentOffset + 1); } updateNode(newBranchNode, newContent); } /** * Inject newly created branch node into the trie data structure. Method * will update parent node jump entry to point to the new branch node (or * will update root id if branch node becomes new root) and will truncate * key prefix from the original node that was split (that prefix now * resides in the branch node). * * @param originalNode node that will be split * @param newBranchNode new branch node id * @param commonPrefixLength how many bytes should be split into the new branch node. */ private void injectNewBranchNode(int originalNode, int newBranchNode, int commonPrefixLength) { byte[] content = nodes.get(originalNode); int parentId = getParentId(content); if (originalNode == parentId) { rootId = newBranchNode; // update root index } else { updateJumpEntry(parentId, originalNode, newBranchNode); } // Truncate prefix from the original node and set its parent to the our new branch node. int[] intHolder = new int[1]; int suffixOffset = VarInt.getVarInt(content, 0, intHolder); // parent id suffixOffset = VarInt.getVarInt(content, suffixOffset, intHolder); // key length int len = intHolder[0] - commonPrefixLength; Preconditions.checkState(len >= 0); suffixOffset += commonPrefixLength; // New content consists of parent id, new key length and duplicated key suffix. byte[] newContent = new byte[VarInt.varIntSize(newBranchNode) + VarInt.varIntSize(len) + (content.length - suffixOffset)]; // Add parent id. int contentOffset = VarInt.putVarInt(newBranchNode, newContent, 0); // Add new key length. contentOffset = VarInt.putVarInt(len, newContent, contentOffset); // Add key and jump table. System.arraycopy(content, suffixOffset, newContent, contentOffset, content.length - suffixOffset); updateNode(originalNode, newContent); } /** * Adds new child node (that uses specified key suffix) to the given * current node. * Example: * <pre> * Had "ab". Adding "abcd". * * 1:"ab",'c'->2 * 1:"ab" -> \ * 2:"cd" * </pre> */ private int addChildNode(int parentNode, byte[] key, int keyOffset) { int child = createNode(parentNode, key, keyOffset); byte[] content = nodes.get(parentNode); // Add jump table entry to the parent node. int entryOffset = content.length; // New content consists of original content and additional jump record. byte[] newContent = new byte[entryOffset + 1 + VarInt.varIntSize(child)]; // Copy original content. System.arraycopy(content, 0, newContent, 0, entryOffset); // Add jump key. newContent[entryOffset] = key[keyOffset]; // Add jump index. VarInt.putVarInt(child, newContent, entryOffset + 1); updateNode(parentNode, newContent); return child; } /** * Splits node into two at the specified offset. * Example: * <pre> * Had "abcd". Adding "ab". * * 2:"ab",'c'->1 * 1:"abcd" -> \ * 1:"cd" * </pre> */ private int splitNodeSuffix(int nodeToSplit, int commonPrefixLength) { int newBranchNode = allocateIndex(); // Create new node with truncated key. createNewBranchNode(nodeToSplit, newBranchNode, commonPrefixLength, (byte) 0, NOT_FOUND); injectNewBranchNode(nodeToSplit, newBranchNode, commonPrefixLength); return newBranchNode; } /** * Splits node into two at the specified offset and adds another leaf. * Example: * <pre> * Had "abcd". Adding "abef". * * 3:"ab",'c'->1,'e'->2 * 1:"abcd" -> / \ * 1:"cd" 2:"ef" * </pre> */ private int addBranch(int nodeToSplit, byte[] key, int offset, int commonPrefixLength) { int newBranchNode = allocateIndex(); int child = createNode(newBranchNode, key, offset + commonPrefixLength); // Create new node with the truncated key and reference to the new child node. createNewBranchNode(nodeToSplit, newBranchNode, commonPrefixLength, key[offset + commonPrefixLength], child); injectNewBranchNode(nodeToSplit, newBranchNode, commonPrefixLength); return child; } private int findOrCreateIndexInternal(int node, byte[] key, int offset, boolean createIfNotFound) { byte[] content = nodes.get(node); int[] intHolder = new int[1]; int contentOffset = VarInt.getVarInt(content, 0, intHolder); // parent id contentOffset = VarInt.getVarInt(content, contentOffset, intHolder); // key length int skyKeyLen = intHolder[0]; int remainingKeyLen = key.length - offset; int minKeyLen = Math.min(skyKeyLen, remainingKeyLen); // Compare given key/offset content with the node key. Skip first key byte for recursive // calls - this byte is equal to the byte in the jump entry and was already compared. for (int i = (offset > 0 ? 1 : 0); i < minKeyLen; i++) { // compare key if (key[offset + i] != content[contentOffset + i]) { // Mismatch found somewhere in the middle of the node key. If requested, node // should be split and another leaf added for the new key. return createIfNotFound ? addBranch(node, key, offset, i) : NOT_FOUND; } } if (remainingKeyLen > minKeyLen) { // Node key matched portion of the key - find appropriate jump entry. If found - recursion. // If not - mismatch (we will add new child node if requested). contentOffset += skyKeyLen; while (contentOffset < content.length) { if (key[offset + skyKeyLen] == content[contentOffset]) { // compare index value VarInt.getVarInt(content, contentOffset + 1, intHolder); // Found matching jump entry - recursively compare the child. return findOrCreateIndexInternal(intHolder[0], key, offset + skyKeyLen, createIfNotFound); } else { // Jump entry key does not match. Skip rest of the entry data. contentOffset = VarInt.getVarInt(content, contentOffset + 1, intHolder); } } // There are no matching jump entries - report mismatch or create a new leaf if necessary. return createIfNotFound ? addChildNode(node, key, offset + skyKeyLen) : NOT_FOUND; } else if (skyKeyLen > minKeyLen) { // Key suffix is a subset of the node key. Report mismatch or split the node if requested). return createIfNotFound ? splitNodeSuffix(node, minKeyLen) : NOT_FOUND; } else { // Node key exactly matches key suffix - return associated index value. return node; } } private synchronized int findOrCreateIndex(byte[] key, boolean createIfNotFound) { if (rootId == NOT_FOUND) { // Root node does not seem to exist - create it if needed. if (createIfNotFound) { rootId = createNode(0, key, 0); Preconditions.checkState(rootId == 0); return 0; } else { return NOT_FOUND; } } return findOrCreateIndexInternal(rootId, key, 0, createIfNotFound); } private byte[] reconstructKeyInternal(int node, int suffixSize) { byte[] content = nodes.get(node); Preconditions.checkNotNull(content); int[] intHolder = new int[1]; int contentOffset = VarInt.getVarInt(content, 0, intHolder); // parent id int parentNode = intHolder[0]; contentOffset = VarInt.getVarInt(content, contentOffset, intHolder); // key length int len = intHolder[0]; byte[] key; if (node != parentNode) { // We haven't reached root node yet. Make a recursive call, adjusting suffix length. key = reconstructKeyInternal(parentNode, suffixSize + len); } else { // We are in a root node. Finally allocate array for the key. It will be filled up // on our way back from recursive call tree. key = new byte[suffixSize + len]; } // Fill appropriate portion of the full key with the node key content. System.arraycopy(content, contentOffset, key, key.length - suffixSize - len, len); return key; } private byte[] reconstructKey(int node) { return reconstructKeyInternal(node, 0); } /* (non-Javadoc) * @see com.google.devtools.build.lib.util.StringIndexer#clear() */ @Override public synchronized void clear() { nodes.clear(); } /* (non-Javadoc) * @see com.google.devtools.build.lib.util.StringIndexer#size() */ @Override public synchronized int size() { return nodes.size(); } protected int getOrCreateIndexForBytes(byte[] bytes) { return findOrCreateIndex(bytes, true); } protected synchronized boolean addBytes(byte[] bytes) { int count = nodes.size(); int index = getOrCreateIndexForBytes(bytes); return index >= count; } protected int getIndexForBytes(byte[] bytes) { return findOrCreateIndex(bytes, false); } /* (non-Javadoc) * @see com.google.devtools.build.lib.util.StringIndexer#getOrCreateIndex(java.lang.String) */ @Override public int getOrCreateIndex(String s) { return getOrCreateIndexForBytes(string2bytes(s)); } /* (non-Javadoc) * @see com.google.devtools.build.lib.util.StringIndexer#getIndex(java.lang.String) */ @Override public int getIndex(String s) { return getIndexForBytes(string2bytes(s)); } /* (non-Javadoc) * @see com.google.devtools.build.lib.util.StringIndexer#addString(java.lang.String) */ @Override public boolean addString(String s) { return addBytes(string2bytes(s)); } protected synchronized byte[] getBytesForIndex(int i) { Preconditions.checkArgument(i >= 0); if (i >= nodes.size()) { return null; } return reconstructKey(i); } /* (non-Javadoc) * @see com.google.devtools.build.lib.util.StringIndexer#getStringForIndex(int) */ @Override public String getStringForIndex(int i) { byte[] bytes = getBytesForIndex(i); return bytes != null ? bytes2string(bytes) : null; } private void dumpNodeContent(StringBuilder builder, byte[] content) { int[] intHolder = new int[1]; int offset = VarInt.getVarInt(content, 0, intHolder); builder.append("parent: ").append(intHolder[0]); offset = VarInt.getVarInt(content, offset, intHolder); int len = intHolder[0]; builder.append(", len: ").append(len).append(", key: \"") .append(new String(content, offset, len, UTF_8)).append('"'); offset += len; while (offset < content.length) { builder.append(", '").append(new String(content, offset, 1, UTF_8)).append("': "); offset = VarInt.getVarInt(content, offset + 1, intHolder); builder.append(intHolder[0]); } builder.append(", size: ").append(content.length); } private int dumpContent(StringBuilder builder, int node, int indent, boolean[] seen) { for(int i = 0; i < indent; i++) { builder.append(" "); } builder.append(node).append(": "); if (node >= nodes.size()) { builder.append("OUT_OF_BOUNDS\n"); return 0; } else if (seen[node]) { builder.append("ALREADY_SEEN\n"); return 0; } seen[node] = true; byte[] content = nodes.get(node); if (content == null) { builder.append("NULL\n"); return 0; } dumpNodeContent(builder, content); builder.append("\n"); int contentSize = content.length; int[] intHolder = new int[1]; int contentOffset = VarInt.getVarInt(content, 0, intHolder); // parent id contentOffset = VarInt.getVarInt(content, contentOffset, intHolder); // key length contentOffset += intHolder[0]; while (contentOffset < content.length) { contentOffset = VarInt.getVarInt(content, contentOffset + 1, intHolder); contentSize += dumpContent(builder, intHolder[0], indent + 1, seen); } return contentSize; } @Override public synchronized String toString() { StringBuilder builder = new StringBuilder(); builder.append("size = ").append(nodes.size()).append("\n"); if (!nodes.isEmpty()) { int contentSize = dumpContent(builder, rootId, 0, new boolean[nodes.size()]); builder.append("contentSize = ").append(contentSize).append("\n"); } return builder.toString(); } }