/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.codec.prefixtree.encode.tokenize;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.hbase.util.ByteRange;
import org.apache.hadoop.hbase.util.ByteRangeUtils;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CollectionUtils;
import org.apache.hadoop.hbase.util.SimpleMutableByteRange;
import org.apache.hadoop.hbase.util.Strings;
import com.google.common.collect.Lists;
/**
* Individual node in a Trie structure. Each node is one of 3 types:
* <ul>
* <li>Branch: an internal trie node that may have a token and must have multiple children, but does
* not represent an actual input byte[], hence its numOccurrences is 0
 * <li>Leaf: a node with no children and where numOccurrences is >= 1. Its token represents the
 * last bytes in the input byte[]s.
 * <li>Nub: a combination of a branch and leaf. Its token represents the last bytes of input
 * byte[]s and has numOccurrences >= 1, but it also has child nodes which represent input byte[]s
 * that add bytes to this node's input byte[].
* </ul>
* <br><br>
 * Example inputs (numInputs=7):<br>
 * 0: AAA<br>
 * 1: AAA<br>
 * 2: AAB<br>
 * 3: AAB<br>
 * 4: AAB<br>
 * 5: AABQQ<br>
 * 6: AABQQ<br>
 * <br><br>
 * Resulting TokenizerNodes:<br>
 * AA <- branch, numOccurrences=0, tokenStartOffset=0, token.length=2<br>
 * A  <- leaf, numOccurrences=2, tokenStartOffset=2, token.length=1<br>
 * B  <- nub, numOccurrences=3, tokenStartOffset=2, token.length=1<br>
 * QQ <- leaf, numOccurrences=2, tokenStartOffset=3, token.length=2<br>
* <br><br>
* numInputs == 7 == sum(numOccurrences) == 0 + 2 + 3 + 2
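 * <br><br>
 * An illustrative sketch of how inputs reach these nodes (assuming the surrounding Tokenizer
 * exposes an addSorted(ByteRange) method that delegates to its root TokenizerNode, as used during
 * PrefixTree encoding; inputs must already be sorted, with duplicates repeated):
 * <pre>
 * Tokenizer tokenizer = new Tokenizer();
 * tokenizer.addSorted(new SimpleMutableByteRange(Bytes.toBytes("AAA")));
 * tokenizer.addSorted(new SimpleMutableByteRange(Bytes.toBytes("AAB")));
 * tokenizer.addSorted(new SimpleMutableByteRange(Bytes.toBytes("AABQQ")));
 * // the root's descendants now form the branch/leaf/nub structure described above
 * </pre>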
*/
@InterfaceAudience.Private
public class TokenizerNode {
/*
* Ref to data structure wrapper
*/
protected Tokenizer builder;
/******************************************************************
* Tree content/structure used during tokenization
* ****************************************************************/
/*
* ref to parent trie node
*/
protected TokenizerNode parent;
/*
* node depth in trie, irrespective of each node's token length
*/
protected int nodeDepth;
/*
* start index of this token in original byte[]
*/
protected int tokenStartOffset;
/*
* bytes for this trie node. can be length 0 in root node
*/
protected ByteRange token;
/*
* A count of occurrences in the input byte[]s, not the trie structure. 0 for branch nodes, 1+ for
* nubs and leaves. If the same byte[] is added to the trie multiple times, this is the only thing
* that changes in the tokenizer. As a result, duplicate byte[]s are very inexpensive to encode.
*/
protected int numOccurrences;
/*
* The maximum fan-out of a byte[] trie is 256, so there are a maximum of 256
* child nodes.
*/
protected ArrayList<TokenizerNode> children;
/*
* Fields used later in the encoding process for sorting the nodes into the order they'll be
* written to the output byte[]. With these fields, the TokenizerNode and therefore Tokenizer
* are not generic data structures but instead are specific to HBase PrefixTree encoding.
*/
/*
* unique id assigned to each TokenizerNode
*/
protected long id;
/*
* set >=0 for nubs and leaves
*/
protected int firstInsertionIndex = -1;
/*
 * A positive value indicating how many bytes before the end of the output block this node's token
 * will start. If the block is 55 bytes and negativeIndex is 9, then the token starts at byte 46.
*/
protected int negativeIndex = 0;
/*
* The offset in the output array at which to start writing this node's token bytes. Influenced
* by the lengths of all tokens sorted before this one.
*/
protected int outputArrayOffset = -1;
/*********************** construct *****************************/
public TokenizerNode(Tokenizer builder, TokenizerNode parent, int nodeDepth,
int tokenStartOffset, int tokenOffset, int tokenLength) {
this.token = new SimpleMutableByteRange();
reconstruct(builder, parent, nodeDepth, tokenStartOffset, tokenOffset, tokenLength);
this.children = Lists.newArrayList();
}
/*
* Sub-constructor for initializing all fields without allocating a new object. Used by the
* regular constructor.
*/
public void reconstruct(Tokenizer builder, TokenizerNode parent, int nodeDepth,
int tokenStartOffset, int tokenOffset, int tokenLength) {
this.builder = builder;
this.id = builder.nextNodeId();
this.parent = parent;
this.nodeDepth = nodeDepth;
builder.submitMaxNodeDepthCandidate(nodeDepth);
this.tokenStartOffset = tokenStartOffset;
this.token.set(builder.tokens, tokenOffset, tokenLength);
this.numOccurrences = 1;
}
/*
* Clear the state of this node so that it looks like it was just allocated.
*/
public void reset() {
builder = null;
parent = null;
nodeDepth = 0;
tokenStartOffset = 0;
token.unset();
numOccurrences = 0;
children.clear();// branches & nubs
// ids/offsets. used during writing to byte[]
id = 0;
firstInsertionIndex = -1;// set >=0 for nubs and leaves
negativeIndex = 0;
outputArrayOffset = -1;
}
/************************* building *********************************/
/*
* <li>Only public method used during the tokenization process
* <li>Requires that the input ByteRange sort after the previous, and therefore after all previous
* inputs
* <li>Only looks at bytes of the input array that align with this node's token
*/
public void addSorted(final ByteRange bytes) {// recursively build the tree
/*
* Recurse deeper into the existing trie structure
*/
if (matchesToken(bytes) && CollectionUtils.notEmpty(children)) {
TokenizerNode lastChild = CollectionUtils.getLast(children);
if (lastChild.partiallyMatchesToken(bytes)) {
lastChild.addSorted(bytes);
return;
}
}
/*
* Recursion ended. We must either
* <li>1: increment numOccurrences if this input was equal to the previous
* <li>2: convert this node from a leaf to a nub, and add a new child leaf
* <li>3: split this node into a branch and leaf, and then add a second leaf
*/
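    // Illustrative trace using the class-level example inputs: a second "AAA" hits case 1 on the
    // existing "AAA" leaf; the first "AAB" hits case 3, splitting that leaf into branch "AA" with
    // leaves "A" and "B"; the first "AABQQ" hits case 2 on the "B" leaf, turning it into a nub
    // with child leaf "QQ".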
// add it as a child of this node
int numIdenticalTokenBytes = numIdenticalBytes(bytes);// should be <= token.length
int tailOffset = tokenStartOffset + numIdenticalTokenBytes;
int tailLength = bytes.getLength() - tailOffset;
if (numIdenticalTokenBytes == token.getLength()) {
if (tailLength == 0) {// identical to this node (case 1)
incrementNumOccurrences(1);
} else {// identical to this node, but with a few extra tailing bytes. (leaf -> nub) (case 2)
int childNodeDepth = nodeDepth + 1;
int childTokenStartOffset = tokenStartOffset + numIdenticalTokenBytes;
TokenizerNode newChildNode = builder.addNode(this, childNodeDepth, childTokenStartOffset,
bytes, tailOffset);
addChild(newChildNode);
}
    } else {// 0 < numIdenticalTokenBytes < token.getLength(): split into branch/leaf, then add second leaf (case 3)
split(numIdenticalTokenBytes, bytes);
}
}
protected void addChild(TokenizerNode node) {
node.setParent(this);
children.add(node);
}
/**
* Called when we need to convert a leaf node into a branch with 2 leaves. Comments inside the
* method assume we have token BAA starting at tokenStartOffset=0 and are adding BOO. The output
* will be 3 nodes:<br>
* <ul>
* <li>1: B <- branch
* <li>2: AA <- leaf
* <li>3: OO <- leaf
* </ul>
*
* @param numTokenBytesToRetain => 1 (the B)
* @param bytes => BOO
*/
protected void split(int numTokenBytesToRetain, final ByteRange bytes) {
int childNodeDepth = nodeDepth;
int childTokenStartOffset = tokenStartOffset + numTokenBytesToRetain;
//create leaf AA
TokenizerNode firstChild = builder.addNode(this, childNodeDepth, childTokenStartOffset,
token, numTokenBytesToRetain);
firstChild.setNumOccurrences(numOccurrences);// do before clearing this node's numOccurrences
token.setLength(numTokenBytesToRetain);//shorten current token from BAA to B
numOccurrences = 0;//current node is now a branch
moveChildrenToDifferentParent(firstChild);//point the new leaf (AA) to the new branch (B)
addChild(firstChild);//add the new leaf (AA) to the branch's (B's) children
//create leaf OO
TokenizerNode secondChild = builder.addNode(this, childNodeDepth, childTokenStartOffset,
bytes, tokenStartOffset + numTokenBytesToRetain);
    addChild(secondChild);//add the new leaf (OO) to the branch's (B's) children
// we inserted branch node B as a new level above/before the two children, so increment the
// depths of the children below
firstChild.incrementNodeDepthRecursively();
secondChild.incrementNodeDepthRecursively();
}
protected void incrementNodeDepthRecursively() {
++nodeDepth;
builder.submitMaxNodeDepthCandidate(nodeDepth);
for (int i = 0; i < children.size(); ++i) {
children.get(i).incrementNodeDepthRecursively();
}
}
protected void moveChildrenToDifferentParent(TokenizerNode newParent) {
for (int i = 0; i < children.size(); ++i) {
TokenizerNode child = children.get(i);
child.setParent(newParent);
newParent.children.add(child);
}
children.clear();
}
/************************ byte[] utils *************************/
protected boolean partiallyMatchesToken(ByteRange bytes) {
return numIdenticalBytes(bytes) > 0;
}
protected boolean matchesToken(ByteRange bytes) {
return numIdenticalBytes(bytes) == getTokenLength();
}
protected int numIdenticalBytes(ByteRange bytes) {
return ByteRangeUtils.numEqualPrefixBytes(token, bytes, tokenStartOffset);
}
/***************** moving nodes around ************************/
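  /**
   * Depth-first, append this node and its descendants to the given list, including each node only
   * if it matches the leaf/non-leaf filter flags.
   */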
public void appendNodesToExternalList(List<TokenizerNode> appendTo, boolean includeNonLeaves,
boolean includeLeaves) {
    if ((includeNonLeaves && !isLeaf()) || (includeLeaves && isLeaf())) {
appendTo.add(this);
}
for (int i = 0; i < children.size(); ++i) {
TokenizerNode child = children.get(i);
child.appendNodesToExternalList(appendTo, includeNonLeaves, includeLeaves);
}
}
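  /**
   * Depth-first, assign each node that has occurrences the insertion index of its first
   * occurrence, then return the next unused index.
   */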
public int setInsertionIndexes(int nextIndex) {
int newNextIndex = nextIndex;
if (hasOccurrences()) {
setFirstInsertionIndex(nextIndex);
newNextIndex += numOccurrences;
}
for (int i = 0; i < children.size(); ++i) {
TokenizerNode child = children.get(i);
newNextIndex = child.setInsertionIndexes(newNextIndex);
}
return newNextIndex;
}
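  /**
   * Depth-first, append the outputArrayOffset of every node that has occurrences to the given
   * list.
   */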
public void appendOutputArrayOffsets(List<Integer> offsets) {
if (hasOccurrences()) {
offsets.add(outputArrayOffset);
}
for (int i = 0; i < children.size(); ++i) {
TokenizerNode child = children.get(i);
child.appendOutputArrayOffsets(offsets);
}
}
/***************** searching *********************************/
/*
 * Do a trie style search through the tokenizer. One option for looking up families or qualifiers
 * during encoding, but currently unused in favor of tracking families and qualifiers as they are
 * added.
*
* Keeping code pending further performance testing.
*/
public void getNode(TokenizerRowSearchResult resultHolder, byte[] key, int keyOffset,
int keyLength) {
int thisNodeDepthPlusLength = tokenStartOffset + token.getLength();
// quick check if the key is shorter than this node (may not work for binary search)
if (CollectionUtils.isEmpty(children)) {
if (thisNodeDepthPlusLength < keyLength) {// ran out of bytes
resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
return;
}
}
// all token bytes must match
for (int i = 0; i < token.getLength(); ++i) {
if (key[tokenStartOffset + keyOffset + i] != token.get(i)) {
// TODO return whether it's before or after so we can binary search
resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
return;
}
}
if (thisNodeDepthPlusLength == keyLength && numOccurrences > 0) {
resultHolder.set(TokenizerRowSearchPosition.MATCH, this);// MATCH
return;
}
if (CollectionUtils.notEmpty(children)) {
// TODO binary search the children
for (int i = 0; i < children.size(); ++i) {
TokenizerNode child = children.get(i);
child.getNode(resultHolder, key, keyOffset, keyLength);
if (resultHolder.isMatch()) {
return;
} else if (resultHolder.getDifference() == TokenizerRowSearchPosition.BEFORE) {
// passed it, so it doesn't exist
resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
return;
}
// key is still AFTER the current node, so continue searching
}
}
// checked all children (or there were no children), and didn't find it
resultHolder.set(TokenizerRowSearchPosition.NO_MATCH, null);
return;
}
/****************** writing back to byte[]'s *************************/
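  /**
   * Allocate a byte[] large enough to hold the full input that ends at this node and fill it by
   * copying this node's token and every ancestor's token into their original positions.
   */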
public byte[] getNewByteArray() {
byte[] arrayToFill = new byte[tokenStartOffset + token.getLength()];
fillInBytes(arrayToFill);
return arrayToFill;
}
public void fillInBytes(byte[] arrayToFill) {
for (int i = 0; i < token.getLength(); ++i) {
arrayToFill[tokenStartOffset + i] = token.get(i);
}
if (parent != null) {
parent.fillInBytes(arrayToFill);
}
}
/************************** printing ***********************/
@Override
public String toString() {
String s = "";
if (parent == null) {
s += "R ";
} else {
s += getBnlIndicator(false) + " " + Bytes.toString(parent.getNewByteArray());
}
s += "[" + Bytes.toString(token.deepCopyToNewArray()) + "]";
if (numOccurrences > 0) {
s += "x" + numOccurrences;
}
return s;
}
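  /**
   * Debug rendering of this node: branch/nub/leaf indicator, occurrence count, node depth,
   * optional output array offset, and the token indented to its start offset.
   */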
public String getPaddedTokenAndOccurrenceString() {
StringBuilder sb = new StringBuilder();
sb.append(getBnlIndicator(true));
sb.append(Strings.padFront(numOccurrences + "", ' ', 3));
sb.append(Strings.padFront(nodeDepth + "", ' ', 3));
if (outputArrayOffset >= 0) {
sb.append(Strings.padFront(outputArrayOffset + "", ' ', 3));
}
sb.append(" ");
for (int i = 0; i < tokenStartOffset; ++i) {
sb.append(" ");
}
sb.append(Bytes.toString(token.deepCopyToNewArray()).replaceAll(" ", "_"));
return sb.toString();
}
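  /**
   * Single-character branch/nub/leaf indicator for this node, optionally padded for column
   * alignment in the printing helpers.
   */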
public String getBnlIndicator(boolean indent) {
if (indent) {
if (isNub()) {
return " N ";
}
return isBranch() ? "B " : " L";
}
if (isNub()) {
return "N";
}
return isBranch() ? "B" : "L";
}
/********************** count different node types ********************/
public int getNumBranchNodesIncludingThisNode() {
if (isLeaf()) {
return 0;
}
int totalFromThisPlusChildren = isBranch() ? 1 : 0;
for (int i = 0; i < children.size(); ++i) {
TokenizerNode child = children.get(i);
totalFromThisPlusChildren += child.getNumBranchNodesIncludingThisNode();
}
return totalFromThisPlusChildren;
}
public int getNumNubNodesIncludingThisNode() {
if (isLeaf()) {
return 0;
}
int totalFromThisPlusChildren = isNub() ? 1 : 0;
for (int i = 0; i < children.size(); ++i) {
TokenizerNode child = children.get(i);
totalFromThisPlusChildren += child.getNumNubNodesIncludingThisNode();
}
return totalFromThisPlusChildren;
}
public int getNumLeafNodesIncludingThisNode() {
if (isLeaf()) {
return 1;
}
int totalFromChildren = 0;
for (int i = 0; i < children.size(); ++i) {
TokenizerNode child = children.get(i);
totalFromChildren += child.getNumLeafNodesIncludingThisNode();
}
return totalFromChildren;
}
/*********************** simple read-only methods *******************************/
public int getNodeDepth() {
return nodeDepth;
}
public int getTokenLength() {
return token.getLength();
}
public boolean hasOccurrences() {
return numOccurrences > 0;
}
public boolean isRoot() {
return this.parent == null;
}
public int getNumChildren() {
return CollectionUtils.nullSafeSize(children);
}
public TokenizerNode getLastChild() {
if (CollectionUtils.isEmpty(children)) {
return null;
}
return CollectionUtils.getLast(children);
}
public boolean isLeaf() {
return CollectionUtils.isEmpty(children) && hasOccurrences();
}
public boolean isBranch() {
return CollectionUtils.notEmpty(children) && !hasOccurrences();
}
public boolean isNub() {
return CollectionUtils.notEmpty(children) && hasOccurrences();
}
/********************** simple mutation methods *************************/
/**
 * Each occurrence beyond the first indicates a repeat of the previous entry. This can be called
 * directly by an external class, without going through the repeat-detection logic in addSorted,
 * when the caller already knows the new entry is a repeat. PtEncoder uses this when adding cells
 * to a row if it knows the new cells are part of the current row.
* @param d increment by this amount
*/
public void incrementNumOccurrences(int d) {
numOccurrences += d;
}
/************************* autogenerated get/set ******************/
public int getTokenOffset() {
return tokenStartOffset;
}
public TokenizerNode getParent() {
return parent;
}
public ByteRange getToken() {
return token;
}
public int getNumOccurrences() {
return numOccurrences;
}
public void setParent(TokenizerNode parent) {
this.parent = parent;
}
public void setNumOccurrences(int numOccurrences) {
this.numOccurrences = numOccurrences;
}
public ArrayList<TokenizerNode> getChildren() {
return children;
}
public long getId() {
return id;
}
public int getFirstInsertionIndex() {
return firstInsertionIndex;
}
public void setFirstInsertionIndex(int firstInsertionIndex) {
this.firstInsertionIndex = firstInsertionIndex;
}
public int getNegativeIndex() {
return negativeIndex;
}
public void setNegativeIndex(int negativeIndex) {
this.negativeIndex = negativeIndex;
}
public int getOutputArrayOffset() {
return outputArrayOffset;
}
public void setOutputArrayOffset(int outputArrayOffset) {
this.outputArrayOffset = outputArrayOffset;
}
public void setId(long id) {
this.id = id;
}
public void setBuilder(Tokenizer builder) {
this.builder = builder;
}
public void setTokenOffset(int tokenOffset) {
this.tokenStartOffset = tokenOffset;
}
public void setToken(ByteRange token) {
this.token = token;
}
}