package com.colloquial.arithcode; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.sql.Timestamp; import java.util.Arrays; import java.util.Random; /** Runs test suite for arithmetic coding and decoding with all of th esupplied * compression models from {@link #main}. Behavior is specified with * the following arguments. * <UL> * <LI><code>-f <i>FileName</i>: </code> * Test specified file.</LI> * <LI><code>-s <i>Integer</i>: </code> * Sized tests up to specified number of bytes.</LI> * <LI><code>-g: </code> * Run small tests.</LI> * <LI><code><i>String</i>: </code> * Test specified string.</LI> * <LI><code>-c <i>Directory</i>: </code> * Test calgary corpus found in specified directory.</LI> * <LI><code>-x <i>Directory</i>: </code> * Test xml corpus found in specified directory.</LI> * </UL> * <P> * The Calgary corpus can be downloaded from: * <blockquote> * <a href="ftp://ftp.cpsc.ucalgary.ca/pub/projects/text.compression.corpus"> * ftp://ftp.cpsc.ucalgary.ca/pub/projects/text.compression.corpus * </a>. * </blockquote> * </P> * <P> * Because of the use of statics, only a single test should be run per virtual machine. * * @author <a href="http://www.colloquial.com/carp/">Bob Carpenter</a> * @version 1.1 * @see ArithCodeModel * @see ArithCodeInputStream * @see ArithCodeOutputStream * @see AdaptiveUnigramModel * @see UniformModel * @see PPMModel * @since 1.0 */ public final class Test { /** Runs test suite as specified by arguments. * <UL> * <LI><code>-f <i>FileName</i>: </code> * Test specified file.</LI> * <LI><code>-s <i>Integer</i>: </code> * Sized tests up to specified number of bytes.</LI> * <LI><code>-g: </code> * Run small tests.</LI> * <LI><code><i>String</i>: </code> * Test specified string.</LI> * <LI><code>-c <i>Directory</i>: </code> * Test calgary corpus found in specified directory.</LI> * <LI><code>-x <i>Directory</i>: </code> * Test James Cheney's XML corpus found in specified directory.</LI> * </UL> * @param args Parameters in fixed order. * @throws IOException If there is an underlying I/O exception during compression/decompression. */ public static void main(String[] args) throws IOException { System.out.println(); System.out.println("Start Time: " + new Timestamp(System.currentTimeMillis())); long startTime = System.currentTimeMillis(); _testSet.clear(); for (int i = 0; i < args.length; ++i) { if (args[i].equals("-f")) test(new File(args[++i])); else if (args[i].equals("-s")) testSize(Integer.parseInt(args[++i])); else if (args[i].equals("-c")) testCalgary(args[++i]); else if (args[i].equals("-x")) testXML(args[++i]); else if (args[i].equals("-g")) testFixed(); else test(args[i]); } System.out.println(); System.out.print("Total Time: " + timeToSeconds(elapsed(startTime))); System.out.println(_testSet); } /** Read all of the input from the given input stream and write * it to the given output stream. * @param in Input stream from which to read. * @param out Output stream to which to write. * @throws IOException If there is an exception reading or writing on the given streams. */ static void copyStream(InputStream in, OutputStream out) throws IOException { int i; while (true) { int j = in.read(); if (j == -1) { in.close(); out.close(); return; } out.write(j); } } /** Return elapsed time since specified time in milliseconds (1/1000 second). * @param start Time from which to measure. * @return Time since start time in milliseconds. */ static long elapsed(long start) { return System.currentTimeMillis() - start; } /** Convert specified time in milliseconds to a string in seconds. * @param t Time to convert to a string. * @return String representation of specified time. */ static String timeToSeconds(long t) { return ((double)t)/1000.0 + " seconds"; } /** Creates the test set to use for the tests. */ private static TestSet _testSet = new TestSet(); /** Hide unused constructor. */ private Test() { } /** Runs tests from 1 to give size, increasing size by a factor * of two at each step. For each size, a test is made of a constant * string consisting of repetitions of a single character, and a test of a random sequence of * letters and then a random sequence of bytes. * @param size Maximum size up to which to test. * @throws IOException If there is an underlying I/O exception during compression/decompression. */ private static void testSize(int size) throws IOException { StringBuffer constantSB = new StringBuffer("a"); Random random = new Random(); for (int k = 1; k <= size; k *= 2) { byte[] bs = new byte[k]; nextRandomAlphaNum(bs,random); test(bs); random.nextBytes(bs); test(bs); String constantString = constantSB.toString(); test(constantString.toString()); constantSB.append(constantSB.toString()); } } /** Fixed test suite. * @throws IOException If there is an underlying I/O exception during compression/decompression. */ private static void testFixed() throws IOException { test(""); test("The quick brown fox jumped over the lazy dog."); test("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"); test("01234567890~`!@#$%^&*()-_=+{[}]:;\"'<,>.?/|\\." + "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ."); } /** Runs a test on James Cheney's XML corpus. * @param path Name of directory in which to find the Calgary corpus. * @throws IOException If there is an underlying I/O exception during compression/decompression. */ private static void testXML(String path) throws IOException { // Description [name in Cheney's paper] test(new File(path,"play1.xml")); // Shakespearean play [play] // test(new File(path,"play2.xml")); // test(new File(path,"play3.xml")); test(new File(path,"treebank.xml")); // natural language parses [treebank] // test(new File(path,"w3c1.xml")); // xml spec from w3c test(new File(path,"w3c2.xml")); // [spec] test(new File(path,"w3c3.xml")); // [spec2] // test(new File(path,"w3c4.xml")); // test(new File(path,"w3c5.xml")); test(new File(path,"weblog.xml")); // web log [weblog] test(new File(path,"tpc.xml")); // (small) [tpc?] test(new File(path,"sprot.xml")); // (small) [sprot] test(new File(path,"elts.xml")); // statistical & scientific db [elts] test(new File(path,"stats1.xml")); // [stats] // test(new File(path,"stats2.xml")); // test(new File(path,"pcc1.xml")); // formal proof test(new File(path,"pcc2.xml")); // [proof] // test(new File(path,"pcc3.xml")); test(new File(path,"tal1.xml")); // annotated assembly language [tal] // test(new File(path,"tal2.xml")); // test(new File(path,"tal3.xml")); } /** Runs a test on the Calgary corpus. * @param path Name of directory in which to find the Calgary corpus. * @throws IOException If there is an underlying I/O exception during compression/decompression. */ private static void testCalgary(String path) throws IOException { test(new File(path,"progc")); // programs test(new File(path,"progl")); test(new File(path,"progp")); test(new File(path,"paper1")); // text test(new File(path,"paper2")); test(new File(path,"book1")); test(new File(path,"book2")); test(new File(path,"news")); // unedited news test(new File(path,"bib")); // formatted bibtex test(new File(path,"trans")); // terminal session test(new File(path,"obj1")); // executable test(new File(path,"obj2")); test(new File(path,"geo")); // geophysical data test(new File(path,"pic")); // bitmap } /** Tests compression/decompression of a given file. * @param file File to test. * @return <code>true</code> if the test succeeds. * @return <code>true</code> if the test succeeds. * @throws IOException If there is an underlying I/O exception. */ private static boolean test(File file) throws IOException { System.out.println("\nTesting File: " + file); FileInputStream in = new FileInputStream(file); int available = in.available(); byte[] bytes = new byte[available]; in.read(bytes,0,available); return testBytes(bytes); } /** Tests compression/decompression of a given string. String is * first rendered as bytes, given current localized default; see * {@link java.lang.String#getBytes}. * @param text String to test for compression/decompression. * @return <code>true</code> if the test succeeds. * @throws IOException If there is an underlying I/O exception. */ private static boolean test(String text) throws IOException { System.out.println(); System.out.println("Testing: /" + trim(text) + "/"); return testBytes(text.getBytes()); } /** Tests compression/decompression of a given sequence of bytes. * @param bytes Bytes to test for compression/decompression. * @return <code>true</code> if the test succeeds. * @throws IOException If there is an underlying I/O exception. */ private static boolean test(byte[] bytes) throws IOException { System.out.println(); System.out.println("Testing byte array with length: " + bytes.length); return testBytes(bytes); } /** Run a test of PPM on the specified bytes using a model of the * specified order. * @param bytes Bytes to test. * @param order Order of PPM model to use. * @return <code>true</code> if the test is successful. */ private static boolean testPPMBytes(byte[] bytes, int order) throws IOException { return testBytes(bytes, new PPMModel(order), new PPMModel(order), "PPM(" + order + ")" + (order < 10 ? " ": " ")); } /** Tests given sequence of bytes against various models. * @param bytes Bytes to test for compression/decompression. * @return <code>true</code> if the test succeeds. * @throws IOException If there is an underlying I/O exception. */ private static boolean testBytes(byte[] bytes) throws IOException { boolean pass = true; pass = testBytes(bytes, UniformModel.MODEL, UniformModel.MODEL, "Uniform ") && pass; pass = testBytes(bytes, new AdaptiveUnigramModel(), new AdaptiveUnigramModel(), "Unigram ") && pass; pass = testPPMBytes(bytes,0) && pass; pass = testPPMBytes(bytes,1) && pass; pass = testPPMBytes(bytes,2) && pass; pass = testPPMBytes(bytes,3) && pass; pass = testPPMBytes(bytes,4) && pass; pass = testPPMBytes(bytes,5) && pass; pass = testPPMBytes(bytes,6) && pass; pass = testPPMBytes(bytes,7) && pass; pass = testPPMBytes(bytes,8) && pass; // pass = testPPMBytes(bytes,9) && pass; pass = testPPMBytes(bytes,10) && pass; pass = testPPMBytes(bytes,12) && pass; pass = testPPMBytes(bytes,16) && pass; // pass = testPPMBytes(bytes,24) && pass; // pass = testPPMBytes(bytes,32) && pass; return pass; } /** Tests specified sequence of bytes with specified models for input and output, and specified name. * @param bytes Bytest to test. * @param modelIn Model to use for encoding. * @param modelOut Model to use for decoding. * @param name Name ot use for display. * @return <code>true</code> if the test succeeds. * @throws IOException If there is an underlying I/O exception. */ private static boolean testBytes(byte[] bytes, ArithCodeModel modelIn, ArithCodeModel modelOut, String name) throws IOException { ByteArrayInputStream textBytesIn = new ByteArrayInputStream(bytes); ByteArrayOutputStream codeBytesOut = new ByteArrayOutputStream(); long startTime = System.currentTimeMillis(); copyStream(textBytesIn, new ArithCodeOutputStream(codeBytesOut, modelIn)); long encodeTime = elapsed(startTime); modelIn = null; // can GC input model ByteArrayOutputStream textBytesOut = new ByteArrayOutputStream(); byte[] codeBytes = codeBytesOut.toByteArray(); startTime = System.currentTimeMillis(); copyStream(new ArithCodeInputStream(new ByteArrayInputStream(codeBytes),modelOut), textBytesOut); long decodeTime = elapsed(startTime); _testSet.record(name,bytes.length,codeBytes.length,encodeTime,decodeTime); boolean pass = Arrays.equals(bytes,textBytesOut.toByteArray()); System.out.print(" " + name + " "); System.out.print(intToString(bytes.length,9) + " -> " + intToString(codeBytes.length,9) + " B"); System.out.print(" "+ compressionRateString(bytes.length,codeBytes.length)); System.out.print(" enc: " + speedString(bytes.length,encodeTime)); System.out.print(" dec: " + speedString(bytes.length,decodeTime)); System.out.println(pass ? "" : "***** FAIL *****"); return pass; } /** Returns a string representation of the compression rate indicated by the specified * number of original bytes and compressed bytes. Expressed in bits per byte. * @param numOriginalBytes Number of uncompressed bytes. * @param numCompressedBytes Number of bytes in the compressed file. * @return String representation of compression rate. */ private static String compressionRateString(int numBytesIn, int numBytesOut) { double val = ((double) (int) (1000.0 * (((double) (numBytesOut * 8.0)) / (double) numBytesIn)))/1000.0; String result = (val > 1000) ? "?" : (val+ ""); while (result.length() < 6) result = result + " "; return result + "b/B"; } /** Convert an integer to a string, padding with spaces in the front * to provide a result of the specified minimum length. * @param n Integer to convert to string. * @param minLength Minimum length of result. * @return String representation of integer, padded to at least specified length. */ private static String intToString(int n, int minLength) { String s = Integer.toString(n); while (s.length() < minLength) s = ' ' + s; return s; } /** Returns a string representation of the speed of compression indicated by the specified * number of original bytes and time in milliseconds. * @param numBytes Number of uncompressed bytes. * @param numMillis Number of milliseconds. * @return String representation of number of bytes per millisecond. */ private static String speedString(int numBytes, long numMillis) { int kbS = ((int) ((double)numBytes / (double)numMillis)); return (kbS > 100000 ? " ?" : intToString(kbS,6)) + " kB/s"; } /** Truncates string to printable length, appending epenthetic dots if * it is truncated. * @param in String to truncate. * @return Truncated string. */ private static String trim(String in) { return (in.length() <= 32) ? in : (in.substring(0,32) + "..."); } /** Fills the specified byte array with random alphanumeric characters * generated by the specified randomizer. * @param bs Byte array to fill. * @param r Randomizer. */ private static void nextRandomAlphaNum(byte[] bs, Random r) { for (int i = 0; i < bs.length; ++i) { bs[i] = nextRandomAlphaNum(r); } } /** Generates the next random byte between the specified low and * high bytes inclusive, using the specified randomizer. * @param r Randomizer. * @param low Low end of byte range, inclusive. * @param high High end of byte range, inclusive. * @return Random byte in low to high range. */ private static byte nextByteRange(Random r, int low, int high) { return (byte) (low + r.nextInt(1 + high - low)); } /** Returns next random alphabetic or numeric byte as * determined by the specified randomizer. * @param r Randomizer. * @return Next random alpha-numeric byte. */ private static byte nextRandomAlphaNum(Random r) { if (r.nextBoolean()) return nextByteRange(r,(byte)'a',(byte)'z'); if (r.nextBoolean()) return nextByteRange(r,(byte)'A',(byte)'Z'); return nextByteRange(r,(byte)'0',(byte)'9'); } }