/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize;

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.util.ByteRange;
import org.apache.hadoop.hbase.util.ByteRangeUtils;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CollectionUtils;
import org.apache.hadoop.hbase.util.SimpleMutableByteRange;
import org.apache.hadoop.hbase.util.Strings;

import com.google.common.collect.Lists;

/**
 * Individual node in a Trie structure. Each node is one of 3 types:
 * <ul>
 * <li>Branch: an internal trie node that may have a token and must have multiple children, but
 * does not represent an actual input byte[]; hence its numOccurrences is 0.
 * <li>Leaf: a node with no children and a numOccurrences >= 1. Its token represents the last
 * bytes in the input byte[]s.
 * <li>Nub: a combination of a branch and a leaf. Its token represents the last bytes of some
 * input byte[]s, so it has numOccurrences >= 1, but it also has child nodes which represent
 * input byte[]s that add bytes to this node's input byte[].
 * </ul>
 * <br><br>
 * Example inputs (numInputs=7):<br>
 * 0: AAA<br>
 * 1: AAA<br>
 * 2: AAB<br>
 * 3: AAB<br>
 * 4: AAB<br>
 * 5: AABQQ<br>
 * 6: AABQQ<br>
 * <br><br>
 * Resulting TokenizerNodes:<br>
 * AA &lt;- branch, numOccurrences=0, tokenStartOffset=0, token.length=2<br>
 * A &lt;- leaf, numOccurrences=2, tokenStartOffset=2, token.length=1<br>
 * B &lt;- nub, numOccurrences=3, tokenStartOffset=2, token.length=1<br>
 * QQ &lt;- leaf, numOccurrences=2, tokenStartOffset=3, token.length=2<br>
 * <br><br>
 * numInputs == 7 == sum(numOccurrences) == 0 + 2 + 3 + 2
 */
@InterfaceAudience.Private
public class TokenizerNode {
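
  /*
   * Illustrative sketch (not part of this class): building the example trie from the class
   * comment above. This assumes the surrounding Tokenizer exposes a matching
   * addSorted(ByteRange) and that inputs arrive pre-sorted, as required by
   * TokenizerNode.addSorted(..) below.
   *
   *   Tokenizer tokenizer = new Tokenizer();
   *   for (String s : new String[] { "AAA", "AAA", "AAB", "AAB", "AAB", "AABQQ", "AABQQ" }) {
   *     tokenizer.addSorted(new SimpleMutableByteRange(Bytes.toBytes(s)));
   *   }
   *   // the trie now contains branch "AA", leaf "A" (x2), nub "B" (x3), and leaf "QQ" (x2)
   */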

  /*
   * Ref to data structure wrapper
   */
  protected Tokenizer builder;

  /******************************************************************
   * Tree content/structure used during tokenization
   ******************************************************************/

  /*
   * ref to parent trie node
   */
  protected TokenizerNode parent;

  /*
   * node depth in trie, irrespective of each node's token length
   */
  protected int nodeDepth;

  /*
   * start index of this token in original byte[]
   */
  protected int tokenStartOffset;

  /*
   * bytes for this trie node. can be length 0 in root node
   */
  protected ByteRange token;

  /*
   * A count of occurrences in the input byte[]s, not the trie structure. 0 for branch nodes, 1+
   * for nubs and leaves. If the same byte[] is added to the trie multiple times, this is the
   * only thing that changes in the tokenizer. As a result, duplicate byte[]s are very
   * inexpensive to encode.
   */
  protected int numOccurrences;

  /*
   * The maximum fan-out of a byte[] trie is 256, so there are a maximum of 256 child nodes.
   */
  protected ArrayList<TokenizerNode> children;

  /*
   * Fields used later in the encoding process for sorting the nodes into the order they'll be
   * written to the output byte[]. With these fields, the TokenizerNode and therefore Tokenizer
   * are not generic data structures but instead are specific to HBase PrefixTree encoding.
   */

  /*
   * unique id assigned to each TokenizerNode
   */
  protected long id;

  /*
   * set >= 0 for nubs and leaves
   */
  protected int firstInsertionIndex = -1;

  /*
   * A positive value indicating how many bytes before the end of the block this node will start.
   * If the section is 55 bytes and negativeIndex is 9, then the node will start at 46.
   */
  protected int negativeIndex = 0;

  /*
   * The offset in the output array at which to start writing this node's token bytes. Influenced
   * by the lengths of all tokens sorted before this one.
   */
  protected int outputArrayOffset = -1;


  /*********************** construct *****************************/

  public TokenizerNode(Tokenizer builder, TokenizerNode parent, int nodeDepth,
      int tokenStartOffset, int tokenOffset, int tokenLength) {
    this.token = new SimpleMutableByteRange();
    reconstruct(builder, parent, nodeDepth, tokenStartOffset, tokenOffset, tokenLength);
    this.children = Lists.newArrayList();
  }

  /*
   * Sub-constructor for initializing all fields without allocating a new object. Used by the
   * regular constructor.
   */
  public void reconstruct(Tokenizer builder, TokenizerNode parent, int nodeDepth,
      int tokenStartOffset, int tokenOffset, int tokenLength) {
    this.builder = builder;
    this.id = builder.nextNodeId();
    this.parent = parent;
    this.nodeDepth = nodeDepth;
    builder.submitMaxNodeDepthCandidate(nodeDepth);
    this.tokenStartOffset = tokenStartOffset;
    this.token.set(builder.tokens, tokenOffset, tokenLength);
    this.numOccurrences = 1;
  }

  /*
   * Clear the state of this node so that it looks like it was just allocated.
   */
  public void reset() {
    builder = null;
    parent = null;
    nodeDepth = 0;
    tokenStartOffset = 0;
    token.unset();
    numOccurrences = 0;
    children.clear(); // branches & nubs

    // ids/offsets. used during writing to byte[]
    id = 0;
    firstInsertionIndex = -1; // set >= 0 for nubs and leaves
    negativeIndex = 0;
    outputArrayOffset = -1;
  }


  /************************* building *********************************/

  /*
   * <li>Only public method used during the tokenization process
   * <li>Requires that the input ByteRange sorts after the previous input, and therefore after
   * all previous inputs
   * <li>Only looks at bytes of the input array that align with this node's token
   */
  public void addSorted(final ByteRange bytes) { // recursively build the tree

    /*
     * Recurse deeper into the existing trie structure
     */
    if (matchesToken(bytes) && CollectionUtils.notEmpty(children)) {
      TokenizerNode lastChild = CollectionUtils.getLast(children);
      if (lastChild.partiallyMatchesToken(bytes)) {
        lastChild.addSorted(bytes);
        return;
      }
    }

    /*
     * Recursion ended. We must do one of the following:
     * <li>1: increment numOccurrences if this input was equal to the previous
     * <li>2: convert this node from a leaf to a nub, and add a new child leaf
     * <li>3: split this node into a branch and leaf, and then add a second leaf
     */

    // add it as a child of this node
    int numIdenticalTokenBytes = numIdenticalBytes(bytes); // should be <= token.length
    int tailOffset = tokenStartOffset + numIdenticalTokenBytes;
    int tailLength = bytes.getLength() - tailOffset;
    if (numIdenticalTokenBytes == token.getLength()) {
      if (tailLength == 0) { // identical to this node (case 1)
        incrementNumOccurrences(1);
      } else { // identical to this node, but with extra trailing bytes (leaf -> nub) (case 2)
        int childNodeDepth = nodeDepth + 1;
        int childTokenStartOffset = tokenStartOffset + numIdenticalTokenBytes;
        TokenizerNode newChildNode = builder.addNode(this, childNodeDepth, childTokenStartOffset,
          bytes, tailOffset);
        addChild(newChildNode);
      }
    } else { // numIdenticalTokenBytes < token.length: split into branch/leaf, then add a second leaf (case 3)
      split(numIdenticalTokenBytes, bytes);
    }
  }
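
  /*
   * Illustrative trace (inputs shown as strings for brevity; addSorted actually takes a
   * ByteRange). Starting from a trie containing only "AAA", each later input from the class
   * comment's example hits one of the three cases above:
   *
   *   addSorted("AAA")   // case 1: identical to the existing node; numOccurrences 1 -> 2
   *   addSorted("AAB")   // case 3: "AAA" splits into branch "AA" with leaves "A" and "B"
   *   addSorted("AABQQ") // case 2: leaf "B" becomes a nub and gains the child leaf "QQ"
   */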

  protected void addChild(TokenizerNode node) {
    node.setParent(this);
    children.add(node);
  }

  /**
   * Called when we need to convert a leaf node into a branch with 2 leaves. Comments inside the
   * method assume we have token BAA starting at tokenStartOffset=0 and are adding BOO. The
   * output will be 3 nodes:<br>
   * <ul>
   * <li>1: B &lt;- branch
   * <li>2: AA &lt;- leaf
   * <li>3: OO &lt;- leaf
   * </ul>
   *
   * @param numTokenBytesToRetain => 1 (the B)
   * @param bytes => BOO
   */
  protected void split(int numTokenBytesToRetain, final ByteRange bytes) {
    int childNodeDepth = nodeDepth;
    int childTokenStartOffset = tokenStartOffset + numTokenBytesToRetain;

    // create leaf AA
    TokenizerNode firstChild = builder.addNode(this, childNodeDepth, childTokenStartOffset,
      token, numTokenBytesToRetain);
    firstChild.setNumOccurrences(numOccurrences); // do before clearing this node's numOccurrences
    token.setLength(numTokenBytesToRetain); // shorten current token from BAA to B
    numOccurrences = 0; // current node is now a branch
    moveChildrenToDifferentParent(firstChild); // point the new leaf (AA) to the new branch (B)
    addChild(firstChild); // add the new leaf (AA) to the branch's (B's) children

    // create leaf OO
    TokenizerNode secondChild = builder.addNode(this, childNodeDepth, childTokenStartOffset,
      bytes, tokenStartOffset + numTokenBytesToRetain);
    addChild(secondChild); // add the new leaf (OO) to the branch's (B's) children

    // we inserted branch node B as a new level above/before the two children, so increment the
    // depths of the children below
    firstChild.incrementNodeDepthRecursively();
    secondChild.incrementNodeDepthRecursively();
  }

  protected void incrementNodeDepthRecursively() {
    ++nodeDepth;
    builder.submitMaxNodeDepthCandidate(nodeDepth);
    for (int i = 0; i < children.size(); ++i) {
      children.get(i).incrementNodeDepthRecursively();
    }
  }

  protected void moveChildrenToDifferentParent(TokenizerNode newParent) {
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      child.setParent(newParent);
      newParent.children.add(child);
    }
    children.clear();
  }


  /************************ byte[] utils *************************/

  protected boolean partiallyMatchesToken(ByteRange bytes) {
    return numIdenticalBytes(bytes) > 0;
  }

  protected boolean matchesToken(ByteRange bytes) {
    return numIdenticalBytes(bytes) == getTokenLength();
  }

  protected int numIdenticalBytes(ByteRange bytes) {
    return ByteRangeUtils.numEqualPrefixBytes(token, bytes, tokenStartOffset);
  }
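
  /*
   * For example (illustrative only): if this node's token is "B" with tokenStartOffset=2, then
   * for input "AABQQ" the comparison starts at input index 2, numIdenticalBytes returns 1, and
   * matchesToken is true; for input "AACQQ" it returns 0 and partiallyMatchesToken is false.
   */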

  /***************** moving nodes around ************************/

  public void appendNodesToExternalList(List<TokenizerNode> appendTo, boolean includeNonLeaves,
      boolean includeLeaves) {
    if (includeNonLeaves && !isLeaf() || includeLeaves && isLeaf()) {
      appendTo.add(this);
    }
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      child.appendNodesToExternalList(appendTo, includeNonLeaves, includeLeaves);
    }
  }

  public int setInsertionIndexes(int nextIndex) {
    int newNextIndex = nextIndex;
    if (hasOccurrences()) {
      setFirstInsertionIndex(nextIndex);
      newNextIndex += numOccurrences;
    }
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      newNextIndex = child.setInsertionIndexes(newNextIndex);
    }
    return newNextIndex;
  }

  public void appendOutputArrayOffsets(List<Integer> offsets) {
    if (hasOccurrences()) {
      offsets.add(outputArrayOffset);
    }
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      child.appendOutputArrayOffsets(offsets);
    }
  }


  /***************** searching *********************************/

  /*
   * Do a trie style search through the tokenizer. One option for looking up families or
   * qualifiers during encoding, but currently unused in favor of tracking families and
   * qualifiers as they are added.
   *
   * Keeping code pending further performance testing.
   */
  public void getNode(TokenizerRowSearchResult resultHolder, byte[] key, int keyOffset,
      int keyLength) {
    int thisNodeDepthPlusLength = tokenStartOffset + token.getLength();

    // quick check if the key is shorter than this node (may not work for binary search)
    if (CollectionUtils.isEmpty(children)) {
      if (thisNodeDepthPlusLength < keyLength) { // ran out of bytes
        resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
        return;
      }
    }

    // all token bytes must match
    for (int i = 0; i < token.getLength(); ++i) {
      if (key[tokenStartOffset + keyOffset + i] != token.get(i)) {
        // TODO return whether it's before or after so we can binary search
        resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
        return;
      }
    }

    if (thisNodeDepthPlusLength == keyLength && numOccurrences > 0) {
      resultHolder.set(TokenizerRowSearchPosition.MATCH, this); // MATCH
      return;
    }

    if (CollectionUtils.notEmpty(children)) {
      // TODO binary search the children
      for (int i = 0; i < children.size(); ++i) {
        TokenizerNode child = children.get(i);
        child.getNode(resultHolder, key, keyOffset, keyLength);
        if (resultHolder.isMatch()) {
          return;
        } else if (resultHolder.getDifference() == TokenizerRowSearchPosition.BEFORE) {
          // passed it, so it doesn't exist
          resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
          return;
        }
        // key is still AFTER the current node, so continue searching
      }
    }

    // checked all children (or there were no children), and didn't find it
    resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
    return;
  }


  /****************** writing back to byte[]'s *************************/

  public byte[] getNewByteArray() {
    byte[] arrayToFill = new byte[tokenStartOffset + token.getLength()];
    fillInBytes(arrayToFill);
    return arrayToFill;
  }

  public void fillInBytes(byte[] arrayToFill) {
    for (int i = 0; i < token.getLength(); ++i) {
      arrayToFill[tokenStartOffset + i] = token.get(i);
    }
    if (parent != null) {
      parent.fillInBytes(arrayToFill);
    }
  }
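
  /*
   * Illustrative sketch using the class comment's example: for the "QQ" leaf
   * (tokenStartOffset=3, token.length=2, parents "B" then "AA"), getNewByteArray() allocates a
   * 5-byte array, writes "QQ" at indexes 3-4, then recurses up the parent chain, which fills in
   * "B" at index 2 and "AA" at indexes 0-1, reconstructing the full input "AABQQ".
   */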

  /************************** printing ***********************/

  @Override
  public String toString() {
    String s = "";
    if (parent == null) {
      s += "R ";
    } else {
      s += getBnlIndicator(false) + " " + Bytes.toString(parent.getNewByteArray());
    }
    s += "[" + Bytes.toString(token.deepCopyToNewArray()) + "]";
    if (numOccurrences > 0) {
      s += "x" + numOccurrences;
    }
    return s;
  }

  public String getPaddedTokenAndOccurrenceString() {
    StringBuilder sb = new StringBuilder();
    sb.append(getBnlIndicator(true));
    sb.append(Strings.padFront(numOccurrences + "", ' ', 3));
    sb.append(Strings.padFront(nodeDepth + "", ' ', 3));
    if (outputArrayOffset >= 0) {
      sb.append(Strings.padFront(outputArrayOffset + "", ' ', 3));
    }
    sb.append(" ");
    for (int i = 0; i < tokenStartOffset; ++i) {
      sb.append(" ");
    }
    sb.append(Bytes.toString(token.deepCopyToNewArray()).replaceAll(" ", "_"));
    return sb.toString();
  }

  public String getBnlIndicator(boolean indent) {
    if (indent) {
      if (isNub()) {
        return " N ";
      }
      return isBranch() ? "B  " : "  L";
    }
    if (isNub()) {
      return "N";
    }
    return isBranch() ? "B" : "L";
  }


  /********************** count different node types ********************/

  public int getNumBranchNodesIncludingThisNode() {
    if (isLeaf()) {
      return 0;
    }
    int totalFromThisPlusChildren = isBranch() ? 1 : 0;
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      totalFromThisPlusChildren += child.getNumBranchNodesIncludingThisNode();
    }
    return totalFromThisPlusChildren;
  }

  public int getNumNubNodesIncludingThisNode() {
    if (isLeaf()) {
      return 0;
    }
    int totalFromThisPlusChildren = isNub() ? 1 : 0;
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      totalFromThisPlusChildren += child.getNumNubNodesIncludingThisNode();
    }
    return totalFromThisPlusChildren;
  }

  public int getNumLeafNodesIncludingThisNode() {
    if (isLeaf()) {
      return 1;
    }
    int totalFromChildren = 0;
    for (int i = 0; i < children.size(); ++i) {
      TokenizerNode child = children.get(i);
      totalFromChildren += child.getNumLeafNodesIncludingThisNode();
    }
    return totalFromChildren;
  }


  /*********************** simple read-only methods *******************************/

  public int getNodeDepth() {
    return nodeDepth;
  }

  public int getTokenLength() {
    return token.getLength();
  }

  public boolean hasOccurrences() {
    return numOccurrences > 0;
  }

  public boolean isRoot() {
    return this.parent == null;
  }

  public int getNumChildren() {
    return CollectionUtils.nullSafeSize(children);
  }

  public TokenizerNode getLastChild() {
    if (CollectionUtils.isEmpty(children)) {
      return null;
    }
    return CollectionUtils.getLast(children);
  }

  public boolean isLeaf() {
    return CollectionUtils.isEmpty(children) && hasOccurrences();
  }

  public boolean isBranch() {
    return CollectionUtils.notEmpty(children) && !hasOccurrences();
  }

  public boolean isNub() {
    return CollectionUtils.notEmpty(children) && hasOccurrences();
  }
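
  /*
   * Mapping the predicates above to the class comment's example: the "AA" node has children but
   * no occurrences (isBranch), the "A" and "QQ" nodes have occurrences but no children (isLeaf),
   * and the "B" node has both (isNub).
   */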

  /********************** simple mutation methods *************************/

  /**
   * Each occurrence > 1 indicates a repeat of the previous entry. This can be called directly
   * by an external class without going through the process of detecting a repeat if it is a
   * known repeat by some external mechanism. PtEncoder uses this when adding cells to a row if
   * it knows the new cells are part of the current row.
   * @param d increment by this amount
   */
  public void incrementNumOccurrences(int d) {
    numOccurrences += d;
  }


  /************************* autogenerated get/set ******************/

  public int getTokenOffset() {
    return tokenStartOffset;
  }

  public TokenizerNode getParent() {
    return parent;
  }

  public ByteRange getToken() {
    return token;
  }

  public int getNumOccurrences() {
    return numOccurrences;
  }

  public void setParent(TokenizerNode parent) {
    this.parent = parent;
  }

  public void setNumOccurrences(int numOccurrences) {
    this.numOccurrences = numOccurrences;
  }

  public ArrayList<TokenizerNode> getChildren() {
    return children;
  }

  public long getId() {
    return id;
  }

  public int getFirstInsertionIndex() {
    return firstInsertionIndex;
  }

  public void setFirstInsertionIndex(int firstInsertionIndex) {
    this.firstInsertionIndex = firstInsertionIndex;
  }

  public int getNegativeIndex() {
    return negativeIndex;
  }

  public void setNegativeIndex(int negativeIndex) {
    this.negativeIndex = negativeIndex;
  }

  public int getOutputArrayOffset() {
    return outputArrayOffset;
  }

  public void setOutputArrayOffset(int outputArrayOffset) {
    this.outputArrayOffset = outputArrayOffset;
  }

  public void setId(long id) {
    this.id = id;
  }

  public void setBuilder(Tokenizer builder) {
    this.builder = builder;
  }

  public void setTokenOffset(int tokenOffset) {
    this.tokenStartOffset = tokenOffset;
  }

  public void setToken(ByteRange token) {
    this.token = token;
  }

}
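
/*
 * End-to-end sketch (illustrative, continuing the assumptions from the sketch at the top of the
 * class): after the trie is built, the encoder flattens it and assigns per-entry insertion
 * indexes, e.g.
 *
 *   List<TokenizerNode> nodes = Lists.newArrayList();
 *   root.appendNodesToExternalList(nodes, true, true); // collect branches, nubs, and leaves
 *   root.setInsertionIndexes(0); // pre-order; each nub/leaf claims numOccurrences indexes
 *
 * With the 7-input example, setInsertionIndexes(0) assigns firstInsertionIndex 0 to leaf "A"
 * (2 occurrences), 2 to nub "B" (3 occurrences), and 5 to leaf "QQ" (2 occurrences).
 */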