package com.colloquial.arithcode;
import java.io.IOException;
import java.util.HashMap;
/** Model of a sequence of tokens. Limited to
* 256 distinct tokens (to enable coding as PPM).
*/
public final class TokenStreamEncoder {
public TokenStreamEncoder(ArithEncoder encoder, int tokenSequenceOrder, PPMModel tokenBytesModel) {
_encoder = encoder;
_tokenBytesModel = tokenBytesModel;
_tokenSequenceModel = new PPMModel(tokenSequenceOrder);
_tokenToSymbolMap = new HashMap();
}
public void encode(String token) throws IOException {
if (_tokenToSymbolMap.containsKey(token)) {
encodeToken(((Integer) (_tokenToSymbolMap.get(token))).intValue());
} else {
_tokenToSymbolMap.put(token,new Integer(_nextTokenIndex++));
encodeToken(_nextTokenIndex);
++_nextTokenIndex; // must do after encodeToken, because encodeToken uses it
encodeBytes(token.getBytes(LATIN1));
}
}
private void encodeToken(int symbol) throws IOException {
for (int i = _nextTokenIndex+1; i < 256; ++i) _tokenSequenceModel.exclude(i);
encode(_tokenSequenceModel,symbol);
}
private void encodeBytes(byte[] bytes) throws IOException {
for (int i = 0; i < bytes.length; ++i) {
_tokenBytesModel.exclude(LATIN1_UNUSED_BYTES);
encode(_tokenBytesModel,Converter.byteToInteger(bytes[i]));
}
_tokenBytesModel.exclude(LATIN1_UNUSED_BYTES);
encode(_tokenBytesModel,0); // uses 0 as separator, which can be trouble if 0 is a valid character
}
private void encode(PPMModel model, int symbol) throws IOException {
// COPIED VERBATIM FROM ArithCodeOutputStream
while (model.escaped(symbol)) {
model.interval(ArithCodeModel.ESCAPE,_interval); // have already done complete walk to compute escape
_encoder.encode(_interval);
}
model.interval(symbol,_interval); // have already done walk to element to compute escape
_encoder.encode(_interval);
}
/** Arithmetic encoder used for encoding symbols and the bytes making
* them up.
*/
private final ArithEncoder _encoder;
/** Interval used for coding ranges.
*/
private final int[] _interval = new int[3]; // ** COPIED VERBATIM FROM ArithCodeOutputStream ***
/** Index of next token, which must fall between 0 and 255 inclusive.
*/
private int _nextTokenIndex = 0;
/** MOdel for the bytes making up the tokens.
*/
private final PPMModel _tokenBytesModel;
/** Model for the sequence of tokens, encoded as bytes, making up
* the token stream.
*/
private final PPMModel _tokenSequenceModel;
/** Maps each token string to an Integer used to encode it.
*/
private final HashMap _tokenToSymbolMap;
private final static String LATIN1 = "ISO-8859-1";
private static final ByteSet LATIN1_UNUSED_BYTES = new ByteSet();
static {
for (int i = 1; i <= 8; ++i) LATIN1_UNUSED_BYTES.add(i);
for (int i = 11; i <= 12; ++i) LATIN1_UNUSED_BYTES.add(i);
for (int i = 14; i <= 31; ++i) LATIN1_UNUSED_BYTES.add(i);
for (int i = 127; i <= 159; ++i) LATIN1_UNUSED_BYTES.add(i);
}
}