package com.colloquial.arithcode; /** <P>A node in a depth-bounded suffix tree that represents counts of * sequences of bytes. Nodes in the trie are accessed through * sequences of bytes in the same way as the {@link java.util.Map#get} * method of {@link java.util.Map}. Sequences of bytes are added as * descendants of the current node if necessary and their counts are * incremented and rescaled if necessary through an increment * operation that works in the same way as {@link java.util.Map#put} in * {@link java.util.Map}. * <P>The entire trie is accessible through the root node. A byte is * stored with each node, and a (scaled) count of the times the the * sequence of bytes leading to this node in the trie were seen. Each * node contains a linked list of child nodes, the first of which is * provided as a member element. Each node also represents a member * of the linked list of siblings, and the next sibling is provided as * a member element. * * <P>The nodes provide the cumulative probability estimates for the * model through the <code>total</code>, <code>interval</code> and * <code>intervalEscape</code> methods. * * @author <a href="http://www.colloquial.com/carp/">Bob Carpenter</a> * @version 1.1 * @see PPMModel * @since 1.0 */ final class PPMNode { /** Construct a node with the specified byte and next sibling. * @param b Byte represented by node. * @param nextSibling The next daughter node in the list of daughters. */ PPMNode(byte b, PPMNode nextSibling) { _byte = b; _nextSibling = nextSibling; } /** Construct a node with the specified byte. * @param b Byte represented by node. */ PPMNode(byte b) { this(b,null); } /** Returns <code>true</code> if the number of children for * this node is <code>1</code>. * @param bytes Bytes that have been seen in escaped context that should not be considered children. * @return <code>true</code> if the scaled number of outcomes for this node is <code>1</code>. */ boolean isDeterministic(ByteSet excludedBytes) { return _firstChild._nextSibling == null; // already checked _firstChild != null by calling isChildless first /* doing it right is about 10-12% slower and less than .01 b/B better int numOutcomes = 0; for (PPMNode node = _firstChild; node != null; node = node._nextSibling) if (!excludedBytes.contains(node._byte) && ++numOutcomes > 1) return false; return numOutcomes == 1; */ } /** Returns <code>true</code> if this node has no children, not counting * specified exclusions. * @param excludedBytes Bytes to exclude as children * @return <code>true</code> if this node has no children, not counting */ boolean isChildless(ByteSet excludedBytes) { // return _firstChild == null; // not much faster and compresses less due to added escapes for (PPMNode node = _firstChild; node != null; node = node._nextSibling) { if (!excludedBytes.contains(node._byte)) return false; } return true; } /** Total count for this node, not including those bytes in the specified set. * @param excludedBytes Set of bytes to exclude from counts. * @return Total count for this node. */ int totalCount(ByteSet excludedBytes) { int count = _numberOfOutcomes; for (PPMNode child = _firstChild; child != null; child = child._nextSibling) if (!excludedBytes.contains(child._byte)) count += child._count; return count; } /** Calculates the interval for the specified byte from this node and writes * it into the specified array. * @param b Byte whose interval is calcuated. * @param excludedBytes Set of bytes to exclude from counts. * @param result Array in which to write the range for the specified byte. */ void interval(int i, ByteSet excludedBytes, int[] result) { interval(Converter.integerToByte(i),excludedBytes,result); } /** Calculates the interval for the specified byte from this node and writes * it into the specified array. * @param b Byte whose interval is calcuated. * @param excludedBytes Set of bytes to exclude from counts. * @param result Array in which to write the range for the specified byte. */ private void interval(byte b, ByteSet excludedBytes, int[] result) { result[0] = 0; for (PPMNode dtrNode = _firstChild; dtrNode != null; dtrNode = dtrNode._nextSibling) { if (excludedBytes.contains(dtrNode._byte)) continue; if (dtrNode._byte == b) { result[1] = result[0] + dtrNode._count; result[2] = result[1] + _numberOfOutcomes; for (dtrNode = dtrNode._nextSibling; dtrNode != null; dtrNode = dtrNode._nextSibling) if (!excludedBytes.contains(dtrNode._byte)) result[2] += dtrNode._count; return; } result[0] += dtrNode._count; } } /** The interval for the escape count, less the set of excluded bytes. * @param excludedBytes Set of bytes to exclude from counts. * @param result Array into which to write the range for the specified bytes. */ void intervalEscape(ByteSet excludedBytes, int[] result) { result[2] = (result[1] = totalCount(excludedBytes)); result[0] = result[1]-_numberOfOutcomes; } /** Increment the counts for this node for the string specified in * the buffer. * @param buffer Buffer of bytes from which to read event to increment. */ void increment(ByteBuffer buffer) { if (buffer.length() > 0) increment(buffer.bytes(),buffer.offset(),buffer.length()); } /** Returns <code>true</code> if this node has a child with the specified byte, specified * as an integer. * @param b Byte coded as integer to check. * @return </code>true</code> if there is a child node with the specified byte. */ boolean hasDaughter(int i) { return hasDaughter(Converter.integerToByte(i)); } /** Returns <code>true</code> if this node has a child with the specified byte. * @param b Byte to check. * @return </code>true</code> if there is a child node with the specified byte. */ private boolean hasDaughter(byte b) { for (PPMNode dtrNode = _firstChild; dtrNode != null; dtrNode = dtrNode._nextSibling) if (dtrNode._byte == b) return true; return false; } /** Retrieves the symbol for which the midCount is between its low and high * counts (inclusive on low, exclusive on high). * @param midCount Count for which to find symbol. * @param excludedBytes Set of bytes to exclude from counts. * @return Symbol with specified count. */ int pointToSymbol(int midCount, ByteSet excludedBytes) { int highCount = 0; for (PPMNode child = _firstChild; child != null; child = child._nextSibling) { if (excludedBytes.contains(child._byte)) continue; highCount += child._count; if (highCount > midCount) return Converter.byteToInteger(child._byte); } return ArithCodeModel.ESCAPE; } /** Extends this node with the given sequence of bytes, specified * by an array, offset and length. * @param bytes Byte array providing bytes to extend. * @param offset Index of first byte in array. * @param length Number of bytes to extend. */ void complete(byte[] bytes, int offset, int length) { PPMNode node = this; while (length > 0) { ++node._numberOfOutcomes; node = node._firstChild = new PPMNode(bytes[offset]); ++offset; --length; } } /** Increment the count of all of the nodes along the sequence of * bytes determined by the specified array, beginning at the specified * offset and continuing for the specified length number of bytes. * @param bytes Array from which to read bytes. * @param offset Index of first byte to read from array. * @param length Total number of bytes to read from array. */ void increment(byte[] bytes, int offset, int length) { if (_firstChild == null) { ++_numberOfOutcomes; _firstChild = new PPMNode(bytes[offset]); if (length > 1) _firstChild.complete(bytes,offset+1,length-1); return; } PPMNode previousChild = null; // move to front for (PPMNode child = _firstChild; true; child = child._nextSibling) { if (child._byte == bytes[offset]) { if (length > 1) child.increment(bytes,offset+1,length-1); if (previousChild != null) { // move to front previousChild._nextSibling = child._nextSibling; child._nextSibling = _firstChild; _firstChild = child; } if (++child._count > MAX_INDIVIDUAL_COUNT) rescale(); return; } if (child._nextSibling == null) { ++_numberOfOutcomes; _firstChild = new PPMNode(bytes[offset],_firstChild); // start in front if (length > 1) _firstChild.complete(bytes,offset+1,length-1); // start in front return; } previousChild = child; // move to front } } /** The byte for this node. */ final byte _byte; /** The scaled count for this node. */ private short _count = 1; /** The scaled number of outcomes used to calculate escape likelihoods. */ private short _numberOfOutcomes; // implied = 0; /** The first child of this node. */ PPMNode _firstChild; // implied = null; /** The next sibling of this node. */ PPMNode _nextSibling; // implied = null; /** Prunes this node and its children, returning <code>null</code> if * the node's count is too low and pruning all children with counts * too low. * @return This node if it is above the minimum number of counts. */ /* private PPMNode prune() { if (_count < MIN_PRUNE_COUNT) return (PPMNode) null; if (_firstChild != null) _firstChild = _firstChild.pruneSiblings(); return this; } */ /** Prunes the siblings of this node, returning the next sibling * or <code>null</code> if there aren't any. * @return Linked list of siblings above the pruning threshold. */ /* private PPMNode pruneSiblings() { if (_count < MIN_PRUNE_COUNT) { if (_nextSibling == null) return null; return _nextSibling.pruneSiblings(); } if (_firstChild != null) _firstChild = _firstChild.pruneSiblings(); if (_nextSibling == null) return this; _nextSibling = _nextSibling.pruneSiblings(); return this; } */ /** Rescale all of the counts of the children of this node. * Divides by 2, rounding up, but eliminates all nodes that * fall below count threshold. Total number of outcomes is also * rescaled, but it will never fall below <code>1</code> to allow * possiblity for escapes. */ private void rescale() { _numberOfOutcomes = (short)((_numberOfOutcomes + 1)/2); _firstChild = _firstChild.rescaleSiblings(); } /** Rescale the counts on this node and the siblings of this node. Divides by 2, rounding * up, so no count every drops below 1. Returns rescaled node, which may not * be original sibling or may be <code>null</code> if siblings scale below */ private PPMNode rescaleSiblings() { _count >>= 1; // cheap divide by 2 if (_nextSibling == null) return (_count < MIN_COUNT) ? null : this; if (_count < MIN_COUNT) return _nextSibling.rescaleSiblings(); _nextSibling = _nextSibling.rescaleSiblings(); return this; } /** Minimum count for a node to survive pruning. */ // private static final int MIN_PRUNE_COUNT = 2; /** Minimum count for which to retain a node during rescaling. Surprisingly insensitive. */ private static final int MIN_COUNT = 128; /** Maximum count for daughter node before rescaling all daughters. Max value is * 8K; higher values cause overflow in the arithmetic coder. Higher values compress * better, lower values are generally faster up to the point they cause thrashing. * 8K is about .01 b/B more compressed, and about 25% slower vs. 4K. * 2K is about .01 b/B less compressed, and roughly same speed as 4K. */ private static final int MAX_INDIVIDUAL_COUNT = 8*1024; }