package org.basex.util; import org.basex.util.list.ByteList; /** * This class compresses and decompresses tokens. It is inspired by the * Huffman coding, but was simplified to speed up processing. * * @author BaseX Team 2005-12, BSD License * @author Christian Gruen */ public final class Compress extends ByteList { /** Temporary value. */ private int pc; /** Pack offset. */ private int po; /** * Compresses the specified text. * @param txt text to be packed * @return packed text */ public byte[] pack(final byte[] txt) { // initialize compression final int tl = txt.length; reset(); Num.set(list, tl, 0); size = Num.length(tl); pc = 0; po = 0; // write packer version bit (0) push(0, 1); // relate upper with lower case and write mapping bit int lc = 0; for(final byte b : txt) lc += b >= 'A' && b <= 'Z' ? -1 : 1; final byte[] pack = lc >= 0 ? PACK1 : PACK2; push(lc >= 0 ? 1 : 0, 1); // compress all characters for(int t = 0; t < tl; t++) { int b = txt[t]; if(b >= 0) b = pack[b]; if(b >= 0x00 && b < 0x08) { // 1 xxx push(1 | b << 1, 4); } else if(b >= 0x08 && b < 0x10) { // 01 xxx push(2 | b << 2, 5); } else if(b >= 0x10 && b < 0x20) { // 001 xxxx push(4 | b << 3, 7); } else if(b >= 0x20 && b < 0x40) { // 0001 xxxxx push(8 | b << 4, 9); } else { // 0000 xxxxxxxx push(b << 4, 12); } } if(po != 0) add(pc); return size() < tl ? toArray() : txt; } /** * Pushes bits to the byte cache. * @param b value to be pushed. * @param s number of bits */ private void push(final int b, final int s) { int bb = b, oo = po, cc = pc; for(int i = 0; i < s; i++) { cc |= (bb & 1) << oo; bb >>= 1; if(++oo == 8) { add(cc); oo = 0; cc = 0; } } po = oo; pc = cc; } /** Current unpack position. */ private int uc; /** UNpack offset. */ private int uo; /** * Decompresses the specified text. * @param txt text to be unpacked * @return unpacked text */ public synchronized byte[] unpack(final byte[] txt) { // initialize decompression list = txt; size = txt.length; uc = Num.length(txt, 0); uo = 0; // read packer bit pull(); // choose mapping final byte[] unpack = pull() ? UNPACK1 : UNPACK2; // decompress all characters final int l = Num.get(txt, 0); final byte[] res = new byte[l]; for(int r = 0; r < l; r++) { final int b; if(pull()) { // 1 xxx b = pull(3); } else if(pull()) { // 01 xxx b = pull(3) | 0x08; } else if(pull()) { // 001 xxxx b = pull(4) | 0x10; } else if(pull()) { // 0001 xxxxx b = pull(5) | 0x20; } else { // 0000 xxxxxxxx b = pull(8); } res[r] = (byte) (b >= 128 ? b : unpack[b]); } return res; } /** * Pulls the specified number of bit and returns the result. * @param s number of bytes * @return result */ private int pull(final int s) { int oo = uo, cc = uc, x = 0; for(int i = 0; i < s; i++) { if((list[cc] & 1 << oo) != 0) x |= 1 << i; if(++oo == 8) { oo = 0; ++cc; } } uo = oo; uc = cc; return x; } /** * Pulls a single bit. * @return result */ private boolean pull() { final boolean b = (list[uc] & 1 << uo) != 0; if(++uo == 8) { uo = 0; ++uc; } return b; } /** First mapping for unpacking data. */ private static final byte[] UNPACK1 = { 0x20, 0x61, 0x65, 0x6E, 0x69, 0x6F, 0x72, 0x73, // encode via 1 xxx 0x74, 0x6C, 0x75, 0x68, 0x64, 0x63, 0x67, 0x6D, // encode via 01 xxx 0x70, 0x79, 0x62, 0x6B, 0x66, 0x76, 0x43, 0x53, // encode via 001 xxxx 0x77, 0x4D, 0x41, 0x42, 0x50, 0x7A, 0x2E, 0x0A, 0x54, 0x52, 0x4B, 0x4C, 0x47, 0x4E, 0x48, 0x6A, // encode via 0001 xxxxx 0x45, 0x49, 0x44, 0x46, 0x4A, 0x78, 0x4F, 0x71, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x2D, 0x27, 0x2C, 0x22, 0x3F, 0x56, 0x57, 0x55, 0x5A, 0x59, 0x51, 0x58, 0x09, 0x40, 0x28, 0x2F, 0x29, 0x2B, 0x7E, 0x21, 0x23, // encode via 0000 xxxxxxxx 0x24, 0x25, 0x26, 0x2A, 0x3B, 0x3C, 0x3D, 0x3E, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, 0x60, 0x7B, 0x7C, 0x7D, 0x7F, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F }; /** First mapping for packing data. */ private static final byte[] PACK1 = new byte[UNPACK1.length]; /** Second mapping for unpacking data. */ private static final byte[] UNPACK2 = new byte[UNPACK1.length]; /** Second mapping for packing data. */ private static final byte[] PACK2 = new byte[UNPACK2.length]; // initializes the character mappings static { final int pl = UNPACK1.length; for(int p = 0; p < pl; p++) { final byte b1 = UNPACK1[p]; // swap lower and upper case in second mapping final byte b2 = (byte) (b1 >= 'A' && b1 <= 'Z' ? b1 + 0x20 : b1 >= 'a' && b1 <= 'z' ? b1 - 0x20 : b1); UNPACK2[p] = b2; PACK1[b1] = (byte) p; PACK2[b2] = (byte) p; } } }