package com.colloquial.arithcode;
/** Provides a cumulative, adaptive byte model implementing
* prediction by partial matching up to a specified maximum context size.
* Uses Method C for estimation.
*
* Constants that control behavior include the maximum total count before
* rescaling, and the minimum count to retain after rescaling (an escape
* is always maintained with a count of at least <code>1</code>).
* <P>
* For more details, see <a href="../../../tutorial.html">The Arithmetic Coding Tutorial</a>.
*
* @author <a href="http://www.colloquial.com/carp/">Bob Carpenter</a>
* @version 1.1
* @since 1.0
*/
public final class PPMModel implements ArithCodeModel {

    /** Construct a new model with the specified maximum length of
     * context to use for prediction.  A length of <code>0</code> is accepted;
     * in that case no trie context is ever selected and every symbol is
     * coded through the backoff model.
     * @param maxContextLength Maximum length of context to use for prediction; must not be negative.
     * @throws IllegalArgumentException If <code>maxContextLength</code> is negative.
     */
    public PPMModel(int maxContextLength) {
        if (maxContextLength < 0) {
            throw new IllegalArgumentException("maxContextLength must be non-negative, was "
                                               + maxContextLength);
        }
        _maxContextLength = maxContextLength;
        // one slot more than the context: the buffer holds the context plus the byte being counted
        _buffer = new ByteBuffer(maxContextLength + 1);
    }

    // specified in ArithCodeModel
    public boolean escaped(int symbol) {
        // without a context node the backoff model is used, and it is never escaped from
        if (_contextNode == null) {
            return false;
        }
        return symbol == ArithCodeModel.EOF || !_contextNode.hasDaughter(symbol);
    }

    // specified in ArithCodeModel
    public void exclude(int i) {
        _excludedBytes.add(i);
    }

    // specified in ArithCodeModel
    public void interval(int symbol, int[] result) {
        if (symbol == ArithCodeModel.EOF) {
            // end-of-file is always coded by the backoff model; the model is not updated for it
            _backoffModel.interval(ArithCodeModel.EOF, result, _excludedBytes);
        } else if (symbol == ArithCodeModel.ESCAPE) {
            intervalEscape(result);
        } else {
            intervalByte(symbol, result);
        }
    }

    // specified in ArithCodeModel
    public int pointToSymbol(int count) {
        if (_contextNode != null) {
            return _contextNode.pointToSymbol(count, _excludedBytes);
        }
        return _backoffModel.pointToSymbol(count, _excludedBytes);
    }

    // specified in ArithCodeModel
    public int totalCount() {
        if (_contextNode == null) {
            return _backoffModel.totalCount(_excludedBytes);
        }
        return _contextNode.totalCount(_excludedBytes);
    }

    // specified in ArithCodeModel
    public void increment(int i) {
        increment(Converter.integerToByte(i));
    }

    /** Exclude all of the bytes in the specified byte set.
     * @param bytesToExclude Set of bytes to exclude from outcome.
     * @since 1.1
     */
    public void exclude(ByteSet bytesToExclude) {
        _excludedBytes.add(bytesToExclude);
    }

    // Count of bytes coded to use in pruning.
    // private int _byteCount; // implied = 0; uncomment for pruning

    /** Model to use for short contexts, i.e. whenever no context node is selected. */
    private final ExcludingAdaptiveUnigramModel _backoffModel = new ExcludingAdaptiveUnigramModel();

    /** Nodes at depth 1 in the model, indexed by the integer value of the
     * first byte of a context (as produced by <code>Converter.byteToInteger</code>).
     * Order 0 predictions are not stored here; they are left to the backoff unigram model.
     */
    private final PPMNode[] _contexts = new PPMNode[256];

    /** Maximum context length to search in trie.  Maximum count will
     * be for maximum context length plus one.
     */
    private final int _maxContextLength;

    /** Root of the trie structure of counts.  Dummy byte as symbol.
     * NOTE(review): this field is not referenced anywhere else in this class;
     * confirm the PPMNode constructor has no side effects before removing it.
     */
    private final PPMNode _rootNode = new PPMNode((byte)'.');

    /** Current context length.
     */
    private int _contextLength; // implied = 0;

    /** Current context node, or <code>null</code> when the backoff model is in use.
     */
    private PPMNode _contextNode; // = null;

    /** Bytes buffered for use as context.
     */
    private final ByteBuffer _buffer;

    /** Storage for the excluded bytes.
     */
    private final ByteSet _excludedBytes = new ByteSet();

    /** Returns interval for byte specified as an integer in 0 to 255 range,
     * taken from the current context node if there is one and from the
     * backoff model otherwise, and then updates the model with that byte.
     * @param i Integer specification of byte in 0 to 255 range.
     * @param result Array specifying cumulative probability for byte i.
     */
    private void intervalByte(int i, int[] result) {
        if (_contextNode != null) {
            _contextNode.interval(i, _excludedBytes, result);
        } else {
            _backoffModel.interval(i, result, _excludedBytes);
        }
        increment(i);
    }

    /** Returns interval for escape in current context, excludes every byte
     * seen as a child of that context, and then backs off to a shorter context.
     * @param result Array for specifying cumulative probability for escape symbol in current context.
     * @throws IllegalStateException If there is no current context node to escape from.
     */
    private void intervalEscape(int[] result) {
        if (_contextNode == null) {
            // previously surfaced as a bare NullPointerException on the next line
            throw new IllegalStateException("escape requested but no context node is active");
        }
        _contextNode.intervalEscape(_excludedBytes, result);
        if (_contextLength >= MIN_CONTEXT_LENGTH) {
            // outcomes already offered by this context must not be counted again in shorter ones
            for (PPMNode child = _contextNode._firstChild; child != null; child = child._nextSibling) {
                _excludedBytes.add(child._byte);
            }
        }
        --_contextLength; // could decrement longer contexts more for a speedup in some cases
        getContextNodeLongToShort();
    }

    // code used for pruning is edited out and marked as follows
    //PRUNE private void prune() {
    //PRUNE for (int i = 0; i < 256; ++i) if (_contexts[i] != null) _contexts[i] = _contexts[i].prune();
    //PRUNE }

    /** Adds the given byte to the context buffer, updates the trie along the
     * buffered byte sequence, selects the context for the next symbol and
     * clears the exclusion set.  Counting (and presumably any rescaling) is
     * delegated to <code>PPMNode.increment</code>.  Called by both encoding and decoding.
     * @param b Byte to add to model.
     */
    private void increment(byte b) {
        _buffer.buffer(b);
        byte firstByte = _buffer.bytes()[_buffer.offset()];
        int firstIndex = Converter.byteToInteger(firstByte);
        if (_contexts[firstIndex] == null) {
            _contexts[firstIndex] = new PPMNode(firstByte);
        }
        if (_buffer.length() > 1) {
            // the first byte selects the trie; the remaining bytes are the path to count
            _contexts[firstIndex].increment(_buffer.bytes(),
                                            _buffer.offset() + 1,
                                            _buffer.length() - 1);
        }
        // _backoffModel.increment(Converter.byteToInteger(b)); // updates backoff model; best to exclude it by .1 b/B!
        _contextLength = Math.min(_maxContextLength, _buffer.length());
        // NOTE(review): the search below runs before the exclusion set is cleared, so it
        // still sees exclusions gathered while coding the previous symbol.  Reordering may
        // change which context is chosen and therefore the coded output, so the order is kept.
        getContextNodeBinarySearch();
        _excludedBytes.clear();
        //PRUNE if (++_byteCount == PRUNE_INTERVAL) { _byteCount = 0; prune(); } // pruning
    }

    /** Use binary search to set the context node up to the currently
     * specified context length.  May set it to <code>null</code> if
     * not found.
     * <P>
     * At each probed length: a missing or childless node lowers the upper
     * bound; a deterministic node is recorded and also lowers the upper bound;
     * any other node raises the lower bound and is recorded only while no
     * deterministic node has been recorded yet.
     * The exact probe sequence decides which context is chosen, so it must not
     * be altered without accepting a change in the coded output.
     */
    private void getContextNodeBinarySearch() {
        int low = MIN_CONTEXT_LENGTH;
        int high = _contextLength;
        _contextLength = MIN_CONTEXT_LENGTH - 1; // not sure we need this
        _contextNode = null;
        boolean foundDeterministic = false;
        while (high >= low) {
            int contextLength = (high + low) / 2;
            PPMNode contextNode = lookupNode(contextLength);
            if (contextNode == null || contextNode.isChildless(_excludedBytes)) {
                // nothing usable at this length: continue among shorter lengths
                high = (contextLength < high) ? contextLength : high - 1;
            } else if (contextNode.isDeterministic(_excludedBytes)) {
                _contextLength = contextLength;
                _contextNode = contextNode;
                foundDeterministic = true;
                high = (contextLength < high) ? contextLength : high - 1;
            } else {
                if (!foundDeterministic) {
                    _contextLength = contextLength;
                    _contextNode = contextNode;
                }
                // usable but not deterministic: continue among longer lengths
                low = (contextLength > low) ? contextLength : low + 1;
            }
        }
    }

    /* un-used variant looking up context lengths by starting with shortest and
     * continuing to increase until found.
    private void getContextNodeShortToLong() {
    int maxContextLength = _contextLength;
    _contextNode = null;
    _contextLength = MIN_CONTEXT_LENGTH-1;
    for (int contextLength = MIN_CONTEXT_LENGTH; contextLength <= maxContextLength; ++contextLength) {
    PPMNode node = lookupNode(contextLength);
    if (node == null || node.isChildless(_excludedBytes)) {
    continue; // return; lose around .01 b/B total (not even average) with return, but 25% slower
    }
    _contextNode = node;
    _contextLength = contextLength;
    if (node.isDeterministic(_excludedBytes)) return;
    }
    }
    */

    /** Starting at the current context length, count down in length to set
     * a valid context or give up.  The first length whose node exists and is
     * not childless (given the exclusions) is taken; while that node is
     * deterministic and the next shorter node exists and is deterministic too,
     * the shorter node replaces it.  If no length qualifies, the context node
     * is set to <code>null</code>.
     * Could also implement this in short to long order.
     */
    private void getContextNodeLongToShort() {
        while (_contextLength >= MIN_CONTEXT_LENGTH) {
            _contextNode = lookupNode(_contextLength);
            if (_contextNode == null || _contextNode.isChildless(_excludedBytes)) {
                --_contextLength;
                continue;
            }
            while (_contextLength > MIN_CONTEXT_LENGTH && _contextNode.isDeterministic(_excludedBytes)) {
                // backoff to shortest deterministic node if context node is deterministic
                PPMNode backoffNode = lookupNode(_contextLength - 1);
                if (backoffNode == null || !backoffNode.isDeterministic(_excludedBytes)) {
                    return;
                }
                _contextNode = backoffNode;
                --_contextLength;
            }
            return;
        }
        _contextNode = null;
    }

    /** Returns node from the current byte buffer of
     * the specified context length, or null if there isn't one.
     * The context is the last <code>contextLength</code> buffered bytes.
     * @param contextLength Number of bytes of context used.
     * @return Node found at that context.
     */
    private PPMNode lookupNode(int contextLength) {
        byte[] bytes = _buffer.bytes();
        int start = _buffer.offset() + _buffer.length() - contextLength;
        PPMNode node = _contexts[Converter.byteToInteger(bytes[start])];
        if (node == null) {
            return null;
        }
        return lookup(node, bytes, start + 1, contextLength - 1);
    }

    /** Looks up a node from the given bytes, offset and length starting
     * from the specified node.
     * @param node Node from which to search.
     * @param bytes Sequence of bytes to search.
     * @param offset Offset into sequence of bytes of the first byte.
     * @param length Number of bytes to look up.
     * @return Node found for the given bytes, the starting node itself if
     * <code>length</code> is zero, or <code>null</code> if there is no matching path.
     */
    private static PPMNode lookup(PPMNode node, byte[] bytes, int offset, int length) {
        if (length == 0) {
            return node;
        }
        PPMNode child = node._firstChild;
        while (length > 0 && child != null) {
            if (bytes[offset] == child._byte) {
                if (length == 1) {
                    return child;
                }
                // matched this byte: descend one level and move to the next byte
                child = child._firstChild;
                ++offset;
                --length;
            } else {
                child = child._nextSibling;
            }
        }
        return null;
    }

    /** Minimum context length to look down sequence of nodes.
     * Shorter contexts use backoff model.
     */
    private static final int MIN_CONTEXT_LENGTH = 1;

    // Period between prunings in number of bytes.
    //PRUNE private static final int PRUNE_INTERVAL = 250000; // loses about 10% compression rate, saves lots of space
}