/**
 * Byte Array Compressor
 *
 * The (de-)compression algorithm is quite simple and memory efficient.
 *
 * The compression is done using a dictionary of words. Words are detected
 * by predefined separators (see isSep()). The dictionary of the words is
 * the (de-)compression array itself.
 *
 * @author Steffen Rusitschka, Siemens AG, CT IC 6
 * @author Dmitri Toropov, Siemens AG, CT IC 6
 *
 */
package jade.imtp.leap.JICP;

import java.io.*;

public class JICPCompressor1 {

    /** Marker byte introducing either a dictionary reference or an escaped literal 200. */
    private static final int WORD_MAGIC = 200;
    /** Marker byte introducing a run-length encoded sequence. */
    private static final int RLE_MAGIC = 201;
    /** Size of the word dictionary; indexes 0..253 fit into the byte following WORD_MAGIC. */
    private static final int MAX_WORDS = 254;
    /** Byte following WORD_MAGIC when a literal byte 200 is meant instead of a word reference. */
    private static final int MAGIC_ESCAPE = 255;
    /** Out-of-band value (not a byte): "nothing pending" for the RLE writer and "past the end" for getValue(). */
    private static final int RLE_FLUSH = 0x100;

    // Run-length state: the value of the pending/current run and its remaining length.
    private int rleOldValue = RLE_FLUSH;
    private int rleOccurrence = 0;

    // Uncompressed data: the input while compressing, the growing output while decompressing.
    private byte[] ba;

    /**
     * Tells whether a value terminates a word. Everything except letters,
     * digits, '-' and '_' is a separator; this includes all values above 127
     * and the RLE_FLUSH sentinel.
     */
    private static boolean isSep(int value) {
        boolean wordChar = (value >= 'A' && value <= 'Z')
            || (value >= 'a' && value <= 'z')
            || (value >= '0' && value <= '9')
            || value == '-'
            || value == '_';
        return !wordChar;
    }

    /**
     * Run-length encoding write.
     * Values are buffered until a different value arrives (or the run reaches
     * 255). Runs longer than 2 are written as (RLE_MAGIC, count, value); runs of
     * 1 or 2 RLE_MAGIC bytes are written as (RLE_MAGIC, count); everything else
     * is written verbatim. Passing RLE_FLUSH writes out the pending run.
     */
    private void rleWrite(ByteArrayOutputStream baos, int value) {
        if (rleOldValue == RLE_FLUSH) {
            // nothing pending yet: just start a new run
            rleOldValue = value;
            rleOccurrence = 1;
            return;
        }
        if (value != rleOldValue || rleOccurrence == 255) {
            if (rleOldValue == RLE_MAGIC || rleOccurrence > 2) {
                baos.write(RLE_MAGIC);
                baos.write(rleOccurrence);
                if (rleOccurrence > 2) {
                    baos.write(rleOldValue);
                }
            }
            else {
                for (int i = 0; i < rleOccurrence; ++i) {
                    baos.write(rleOldValue);
                }
            }
            rleOccurrence = 0;
        }
        rleOccurrence++;
        rleOldValue = value;
    }

    /**
     * Run-length encoding read.
     *
     * @return the next decoded byte (0..255), or -1 at the end of the stream
     * @throws IllegalArgumentException if a run header carries a length below 1
     *         (including a stream truncated right after RLE_MAGIC); such a length
     *         is never produced by rleWrite() and would otherwise make the
     *         decoder emit RLE_MAGIC bytes without ever reading further input
     */
    private int rleRead(ByteArrayInputStream bais) {
        if (rleOccurrence == 0) {
            rleOldValue = bais.read();
            if (rleOldValue == RLE_MAGIC) {
                rleOccurrence = bais.read();
                if (rleOccurrence < 1) {
                    throw new IllegalArgumentException("malformed compressed data: RLE run length " + rleOccurrence);
                }
                if (rleOccurrence > 2) {
                    rleOldValue = bais.read();
                }
                // else: a run of 1 or 2 RLE_MAGIC bytes, value stays RLE_MAGIC
            }
            else {
                rleOccurrence = 1;
            }
        }
        rleOccurrence--;
        return rleOldValue;
    }

    /**
     * Returns the unsigned byte at index, or RLE_FLUSH (a separator) when
     * index lies past the end of the array.
     */
    private int getValue(int index) {
        return index >= ba.length ? RLE_FLUSH : (ba[index] & 255);
    }

    /**
     * Stores value at index, growing the array (by roughly 25%) when needed.
     */
    private void setValue(int index, int value) {
        if (index >= ba.length) {
            byte[] newba = new byte[index * 5 / 4 + 1];
            System.arraycopy(ba, 0, newba, 0, ba.length);
            ba = newba;
        }
        ba[index] = (byte) value;
    }

    /**
     * Writes one uncompressed value; a literal WORD_MAGIC byte is followed by
     * MAGIC_ESCAPE so the decoder does not mistake it for a word reference.
     */
    private void writeLiteral(ByteArrayOutputStream baos, int ch) {
        rleWrite(baos, ch);
        if (ch == WORD_MAGIC) {
            rleWrite(baos, MAGIC_ESCAPE);
        }
    }

    /**
     * Looks up the word starting at beginIndex among the first wordCount
     * dictionary entries.
     * Two words are equal when all their characters match and both end at a
     * separator; the separators themselves do not have to be equal, because
     * the separator is always transmitted on its own.
     *
     * @return the dictionary slot of the matching word, or -1 if there is none
     */
    private int findWord(int[] wordIndexes, int wordCount, int beginIndex) {
        for (int wi = 0; wi < wordCount; ++wi) {
            int existingWordIndex = wordIndexes[wi];
            for (int j = 0; ; ++j) {
                int ch1 = getValue(existingWordIndex + j);
                int ch2 = getValue(beginIndex + j);
                boolean end1 = isSep(ch1);
                boolean end2 = isSep(ch2);
                if (end1 || end2) {
                    if (end1 && end2) {
                        return wi; // both words end here: they are the same
                    }
                    break; // one word is a proper prefix of the other
                }
                if (ch1 != ch2) {
                    break; // words are different
                }
            }
        }
        return -1;
    }

    /**
     * Stores beginIndex in the next dictionary slot, wrapping around to slot 0
     * once MAX_WORDS entries were used.
     *
     * @return the slot to use for the next registration
     */
    private static int registerWord(int[] wordIndexes, int wordIndex, int beginIndex) {
        if (wordIndex == MAX_WORDS) {
            wordIndex = 0;
        }
        wordIndexes[wordIndex] = beginIndex;
        return wordIndex + 1;
    }

    /**
     * compress()
     *
     * algorithm:
     *
     * If during compression a word (more than 2 characters between two
     * separators) is detected, its position inside the array is stored in the
     * wordIndexes[] array, but only if the word itself was not found in the
     * dictionary before. If it was found, only a magic byte (WORD_MAGIC) and the
     * index of the word in the wordIndexes[] array are stored, followed by the
     * separator. A literal magic byte is encoded as (WORD_MAGIC, 255). This
     * limits the number of words that are possible inside the wordIndexes[]
     * array to 254 (= MAX_WORDS constant). The resulting stream is additionally
     * run-length encoded.
     *
     * @param ba the data to compress, may be null
     * @return the compressed data, or null if ba is null
     */
    public static byte[] compress(byte[] ba) {
        if (ba == null) {
            return null;
        }
        return new JICPCompressor1().compressHelper(ba);
    }

    private byte[] compressHelper(byte[] uba) {
        int[] wordIndexes = new int[MAX_WORDS];
        int beginIndex = 0;     // start of the word currently being scanned
        int wordIndex = 0;      // next dictionary slot to fill
        int lastWordIndex = 0;  // number of dictionary slots in use
        ByteArrayOutputStream baos = new ByteArrayOutputStream();

        ba = uba; // set ba, so it is accessible through getValue().

        // The loop deliberately runs one step past the end: getValue(ba.length)
        // yields RLE_FLUSH, which terminates the last word and flushes the RLE writer.
        for (int i = 0; i <= ba.length; ++i) {
            int sep = getValue(i);
            if (!isSep(sep)) {
                continue;
            }

            boolean foundWord = false;
            if ((i - beginIndex) > 2) {
                int wi = findWord(wordIndexes, lastWordIndex, beginIndex);
                if (wi >= 0) {
                    foundWord = true;
                    rleWrite(baos, WORD_MAGIC);
                    rleWrite(baos, wi);
                    // The separator may itself be byte 200 and must then be escaped.
                    writeLiteral(baos, sep);
                }
                else {
                    wordIndex = registerWord(wordIndexes, wordIndex, beginIndex);
                    if (wordIndex > lastWordIndex) {
                        lastWordIndex = wordIndex;
                    }
                }
            }

            if (!foundWord) {
                // copy the word (if any) and its separator verbatim
                for (int j = beginIndex; j <= i; ++j) {
                    writeLiteral(baos, getValue(j));
                }
            }
            beginIndex = i + 1;
        }

        return baos.toByteArray();
    }

    /**
     * decompress()
     *
     * algorithm:
     *
     * During decompression, the wordIndexes[] array will be rebuilt. This is
     * done by reading the compressed array, decoding it and looking for words.
     * If a magic byte (WORD_MAGIC) is detected, it will be decoded to a "real"
     * WORD_MAGIC if the following byte is 255 (WORD_MAGIC, 255 - sequence). If
     * the following byte is not 255, it is treated as an index in the
     * wordIndexes[] array. The index stored there is used as the beginning of a
     * word. This word will be copied to the end of the decoded stream. The end
     * of the word is detected by a separator (see isSep()).
     *
     * @param cba the compressed data, may be null
     * @return the decompressed data, or null if cba is null
     * @throws IllegalArgumentException if cba contains a malformed run-length header
     */
    public static byte[] decompress(byte[] cba) {
        if (cba == null) {
            return null;
        }
        return new JICPCompressor1().decompressHelper(cba);
    }

    private byte[] decompressHelper(byte[] cba) {
        int[] wordIndexes = new int[MAX_WORDS];
        int wordIndex = 0;     // next dictionary slot to fill
        int currentIndex = 0;  // number of bytes decoded so far
        int beginIndex = 0;    // start of the word currently being decoded
        ByteArrayInputStream bais = new ByteArrayInputStream(cba);

        ba = new byte[cba.length * 3 / 2]; // init ba for setValue() access.

        int ch;
        while ((ch = rleRead(bais)) != -1) {
            if (ch == WORD_MAGIC) {
                int wi = rleRead(bais);
                if (wi != MAGIC_ESCAPE) {
                    // Dictionary reference: copy the word up to its separator. The copy
                    // never reads past what was decoded before it started, so a bogus
                    // reference cannot feed on its own output.
                    int refWordIndex = wordIndexes[wi];
                    int copyLimit = currentIndex;
                    while (refWordIndex < copyLimit && !isSep(getValue(refWordIndex))) {
                        setValue(currentIndex++, getValue(refWordIndex++));
                    }
                    // Referenced words are not registered again: placing beginIndex behind
                    // the upcoming separator makes the length check below fail for it.
                    beginIndex = currentIndex + 1;
                    continue;
                }
                // (WORD_MAGIC, 255) is a literal byte 200. It is a separator like any
                // other, so it takes the regular path below; this keeps the dictionary
                // in step with the one built by the compressor.
            }

            if (isSep(ch)) {
                if ((currentIndex - beginIndex) > 2) {
                    wordIndex = registerWord(wordIndexes, wordIndex, beginIndex);
                }
                beginIndex = currentIndex + 1;
            }
            setValue(currentIndex++, ch);
        }

        byte[] newba = new byte[currentIndex];
        System.arraycopy(ba, 0, newba, 0, currentIndex);
        return newba;
    }
}