// AbstractRecordCompressorTestCase.java example
//
// Explorer
// blazegraph-master
// - database-master
/*

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

*/
/*
 * Created on May 2, 2009
 */

package com.bigdata.io.compression;

import java.io.ByteArrayOutputStream;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.Random;
import java.util.concurrent.TimeUnit;

import junit.framework.TestCase2;

import com.bigdata.io.DirectBufferPool;
import com.bigdata.io.SerializerUtil;

/**
 * Abstract base class for {@link IRecordCompressor} test suites.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 * 
 *          FIXME add unit tests using read-only {@link ByteBuffer}s and one
 *          using a {@link DirectBufferPool} to give us a direct buffer for
 *          testing. This will test the logic that handles cases when we cannot
 *          access the backing byte[].
 */
public abstract class AbstractRecordCompressorTestCase extends TestCase2
        implements IRecordCompressorFactory {

    /**
     * Default constructor. Delegates to the no-argument constructor of the
     * superclass.
     */
    public AbstractRecordCompressorTestCase() {
        super();
    }

    /**
     * Constructor which passes the given name through to the superclass.
     * 
     * @param name
     *            The name handed to the superclass constructor.
     */
    public AbstractRecordCompressorTestCase(final String name) {
        super(name);
    }

    /**
     * A bunch of words derived from a stopwords list that are used to generate
     * random strings comprised of a redundant set of terms. This approach was
     * chosen in order to give the compression algorithm some realistic data on
     * which to work.
     * <p>
     * The reference is <code>final</code> since the table is never reassigned;
     * the entries themselves are only read (see
     * {@link #getRandomRecord(int)}). Note that the entry "would" appears
     * twice in the table.
     */
    private static final String[] words = new String[] {
        "a",
        "a's",
        "able",
        "about",
        "above",
        "according",
        "accordingly",
        "across",
        "actually",
        "after",
        "afterwards",
        "again",
        "against",
        "ain't",
        "all",
        "allow",
        "allows",
        "almost",
        "alone",
        "along",
        "already",
        "also",
        "although",
        "always",
        "am",
        "among",
        "amongst",
        "an",
        "and",
        "another",
        "any",
        "anybody",
        "anyhow",
        "anyone",
        "anything",
        "anyway",
        "anyways",
        "anywhere",
        "apart",
        "appear",
        "appreciate",
        "appropriate",
        "are",
        "aren't",
        "around",
        "as",
        "aside",
        "ask",
        "asking",
        "associated",
        "at",
        "available",
        "away",
        "awfully",
        "b",
        "be",
        "became",
        "because",
        "become",
        "becomes",
        "becoming",
        "been",
        "before",
        "beforehand",
        "behind",
        "being",
        "believe",
        "below",
        "beside",
        "besides",
        "best",
        "better",
        "between",
        "beyond",
        "both",
        "brief",
        "but",
        "by",
        "c",
        "c'mon",
        "c's",
        "came",
        "can",
        "can't",
        "cannot",
        "cant",
        "cause",
        "causes",
        "certain",
        "certainly",
        "changes",
        "clearly",
        "co",
        "com",
        "come",
        "comes",
        "concerning",
        "consequently",
        "consider",
        "considering",
        "contain",
        "containing",
        "contains",
        "corresponding",
        "could",
        "couldn't",
        "course",
        "currently",
        "d",
        "definitely",
        "described",
        "despite",
        "did",
        "didn't",
        "different",
        "do",
        "does",
        "doesn't",
        "doing",
        "don't",
        "done",
        "down",
        "downwards",
        "during",
        "e",
        "each",
        "edu",
        "eg",
        "eight",
        "either",
        "else",
        "elsewhere",
        "enough",
        "entirely",
        "especially",
        "et",
        "etc",
        "even",
        "ever",
        "every",
        "everybody",
        "everyone",
        "everything",
        "everywhere",
        "ex",
        "exactly",
        "example",
        "except",
        "f",
        "far",
        "few",
        "fifth",
        "first",
        "five",
        "followed",
        "following",
        "follows",
        "for",
        "former",
        "formerly",
        "forth",
        "four",
        "from",
        "further",
        "furthermore",
        "g",
        "get",
        "gets",
        "getting",
        "given",
        "gives",
        "go",
        "goes",
        "going",
        "gone",
        "got",
        "gotten",
        "greetings",
        "h",
        "had",
        "hadn't",
        "happens",
        "hardly",
        "has",
        "hasn't",
        "have",
        "haven't",
        "having",
        "he",
        "he's",
        "hello",
        "help",
        "hence",
        "her",
        "here",
        "here's",
        "hereafter",
        "hereby",
        "herein",
        "hereupon",
        "hers",
        "herself",
        "hi",
        "him",
        "himself",
        "his",
        "hither",
        "hopefully",
        "how",
        "howbeit",
        "however",
        "i",
        "i'd",
        "i'll",
        "i'm",
        "i've",
        "ie",
        "if",
        "ignored",
        "immediate",
        "in",
        "inasmuch",
        "inc",
        "indeed",
        "indicate",
        "indicated",
        "indicates",
        "inner",
        "insofar",
        "instead",
        "into",
        "inward",
        "is",
        "isn't",
        "it",
        "it'd",
        "it'll",
        "it's",
        "its",
        "itself",
        "j",
        "just",
        "k",
        "keep",
        "keeps",
        "kept",
        "know",
        "knows",
        "known",
        "l",
        "last",
        "lately",
        "later",
        "latter",
        "latterly",
        "least",
        "less",
        "lest",
        "let",
        "let's",
        "like",
        "liked",
        "likely",
        "little",
        "look",
        "looking",
        "looks",
        "ltd",
        "m",
        "mainly",
        "many",
        "may",
        "maybe",
        "me",
        "mean",
        "meanwhile",
        "merely",
        "might",
        "more",
        "moreover",
        "most",
        "mostly",
        "much",
        "must",
        "my",
        "myself",
        "n",
        "name",
        "namely",
        "nd",
        "near",
        "nearly",
        "necessary",
        "need",
        "needs",
        "neither",
        "never",
        "nevertheless",
        "new",
        "next",
        "nine",
        "no",
        "nobody",
        "non",
        "none",
        "noone",
        "nor",
        "normally",
        "not",
        "nothing",
        "novel",
        "now",
        "nowhere",
        "o",
        "obviously",
        "of",
        "off",
        "often",
        "oh",
        "ok",
        "okay",
        "old",
        "on",
        "once",
        "one",
        "ones",
        "only",
        "onto",
        "or",
        "other",
        "others",
        "otherwise",
        "ought",
        "our",
        "ours",
        "ourselves",
        "out",
        "outside",
        "over",
        "overall",
        "own",
        "p",
        "particular",
        "particularly",
        "per",
        "perhaps",
        "placed",
        "please",
        "plus",
        "possible",
        "presumably",
        "probably",
        "provides",
        "q",
        "que",
        "quite",
        "qv",
        "r",
        "rather",
        "rd",
        "re",
        "really",
        "reasonably",
        "regarding",
        "regardless",
        "regards",
        "relatively",
        "respectively",
        "right",
        "s",
        "said",
        "same",
        "saw",
        "say",
        "saying",
        "says",
        "second",
        "secondly",
        "see",
        "seeing",
        "seem",
        "seemed",
        "seeming",
        "seems",
        "seen",
        "self",
        "selves",
        "sensible",
        "sent",
        "serious",
        "seriously",
        "seven",
        "several",
        "shall",
        "she",
        "should",
        "shouldn't",
        "since",
        "six",
        "so",
        "some",
        "somebody",
        "somehow",
        "someone",
        "something",
        "sometime",
        "sometimes",
        "somewhat",
        "somewhere",
        "soon",
        "sorry",
        "specified",
        "specify",
        "specifying",
        "still",
        "sub",
        "such",
        "sup",
        "sure",
        "t",
        "t's",
        "take",
        "taken",
        "tell",
        "tends",
        "th",
        "than",
        "thank",
        "thanks",
        "thanx",
        "that",
        "that's",
        "thats",
        "the",
        "their",
        "theirs",
        "them",
        "themselves",
        "then",
        "thence",
        "there",
        "there's",
        "thereafter",
        "thereby",
        "therefore",
        "therein",
        "theres",
        "thereupon",
        "these",
        "they",
        "they'd",
        "they'll",
        "they're",
        "they've",
        "think",
        "third",
        "this",
        "thorough",
        "thoroughly",
        "those",
        "though",
        "three",
        "through",
        "throughout",
        "thru",
        "thus",
        "to",
        "together",
        "too",
        "took",
        "toward",
        "towards",
        "tried",
        "tries",
        "truly",
        "try",
        "trying",
        "twice",
        "two",
        "u",
        "un",
        "under",
        "unfortunately",
        "unless",
        "unlikely",
        "until",
        "unto",
        "up",
        "upon",
        "us",
        "use",
        "used",
        "useful",
        "uses",
        "using",
        "usually",
        "uucp",
        "v",
        "value",
        "various",
        "very",
        "via",
        "viz",
        "vs",
        "w",
        "want",
        "wants",
        "was",
        "wasn't",
        "way",
        "we",
        "we'd",
        "we'll",
        "we're",
        "we've",
        "welcome",
        "well",
        "went",
        "were",
        "weren't",
        "what",
        "what's",
        "whatever",
        "when",
        "whence",
        "whenever",
        "where",
        "where's",
        "whereafter",
        "whereas",
        "whereby",
        "wherein",
        "whereupon",
        "wherever",
        "whether",
        "which",
        "while",
        "whither",
        "who",
        "who's",
        "whoever",
        "whole",
        "whom",
        "whose",
        "why",
        "will",
        "willing",
        "wish",
        "with",
        "within",
        "without",
        "won't",
        "wonder",
        "would",
        "would",
        "wouldn't",
        "x",
        "y",
        "yes",
        "yet",
        "you",
        "you'd",
        "you'll",
        "you're",
        "you've",
        "your",
        "yours",
        "yourself",
        "yourselves",
        "z",
        "zero"
        };

    /**
     * Source of randomness used when generating records and when choosing the
     * offset and length of the slices in the stress test. The generator is
     * created without an explicit seed, so the generated data differ from run
     * to run.
     */
    final Random r = new Random();
    
    /**
     * Generate a record comprised of <i>n</i> random terms selected from
     * {@link #words}. The terms are concatenated with single space separators
     * and then encoded as a byte[] which is returned to the caller.
     * <p>
     * The text is encoded using UTF-8 rather than the platform default
     * charset so that the generated bytes do not depend on the configuration
     * of the JVM on which the test is run.
     * 
     * @param n
     *            The #of terms to include in the record. When <i>n</i> is not
     *            positive an empty byte[] is returned.
     * 
     * @return The encoded record.
     */
    protected byte[] getRandomRecord(final int n) {

        final StringBuilder sb = new StringBuilder();

        for (int i = 0; i < n; i++) {

            if (i > 0) {
                // separator between terms (but not before the first term).
                sb.append(" ");
            }

            sb.append(words[r.nextInt(words.length)]);

        }

        // explicit charset: do not rely on the platform default encoding.
        return sb.toString().getBytes(StandardCharsets.UTF_8);

    }

//    /**
//     * Test helper applies the compression algorithm to the data and then
//     * verifies that the expected data can be recovered by applying the
//     * decompression algorithm.
//     * 
//     * @param c
//     *            The (de-)compressor.
//     * @param expected
//     *            The data to be compressed.
//     * 
//     * @return The #of bytes in the compressed record.
//     */
//    protected int doCompressionTestWithByteBuffer(RecordCompressor c, final byte[] expected) {
//
//        // default size to something that will be large enough.
//        ByteArrayOutputStream baos = new ByteArrayOutputStream(expected.length);
//
//        ByteBuffer buf = ByteBuffer.wrap(expected);
//        
//        // compress the data onto the buffer.
//        c.compress(buf, baos);
//
//        // obtain a copy of the compressed data.
//        final byte[] compressed = baos.array();
//        
//        // reset the buffer to receive the results.
//        buf.clear();
//        
//        // decompress the compressed data.
//        final byte[] actual = c.decompress(compressed);
//        
//        assertEquals(expected,actual);
//        
//        return compressed.length;
//
//    }

    /**
     * Test helper applies the compression algorithm to a slice of the data and
     * then verifies that the slice can be recovered by applying the
     * decompression algorithm.
     * <p>
     * The slice <code>expected[off .. off+len-1]</code> is wrapped as a
     * {@link ByteBuffer}, compressed onto a {@link ByteArrayOutputStream},
     * decompressed, and then compared byte-for-byte against the original
     * slice. The helper also asserts that compression advanced the buffer's
     * position to its limit without changing the limit, and that the
     * decompressed buffer has position zero and a limit of <i>len</i>.
     * 
     * @param c
     *            The (de-)compressor.
     * @param expected
     *            The array containing the data to be compressed.
     * @param off
     *            The offset of the first byte in <i>expected</i> to be
     *            compressed.
     * @param len
     *            The #of bytes in <i>expected</i>, starting at <i>off</i>, to
     *            be compressed.
     * 
     * @return The #of bytes in the compressed record.
     */
    protected int doCompressionTest(final IRecordCompressor c,
            final byte[] expected, final int off, final int len) {

        // default output buffer to something that will be large enough.
        final ByteArrayOutputStream baos = new ByteArrayOutputStream(
                expected.length);

        {

            // wrap up in a ByteBuffer.
            final ByteBuffer wrapper = ByteBuffer.wrap(expected, off, len);
            assertEquals(off + len, wrapper.limit());
            assertEquals(off, wrapper.position());

            // compress the data onto the output stream.
            c.compress(wrapper, baos);

            /*
             * verify that the position() was advanced to the limit() and that
             * the limit was not changed.
             */
            assertEquals(len + off, wrapper.limit());
            assertEquals(len + off, wrapper.position());

        }

        // obtain a copy of the compressed data.
        final byte[] compressed = baos.toByteArray();

        /*
         * Decompress the compressed data onto an exact fit byte[].
         */
        final byte[] actual;
        {

            /*
             * Decompress the compressed data. This returns a view onto a shared
             * buffer. The data between position() and limit() are the decompressed
             * data.
             */
            final ByteBuffer decompressed = c.decompress(compressed);

            assertEquals(0, decompressed.position());

            assertEquals(len, decompressed.limit());

            assertEquals(len, decompressed.remaining());

            /*
             * Copy the decompressed data into an exact fit buffer so that we
             * can use a test helper to verify the decompressed data.
             */

            actual = new byte[decompressed.remaining()];

            decompressed.get(actual);

        }

        /*
         * Verify the decompressed data.
         * 
         * Note: The loop is driven by the index into the decompressed data
         * (0..len-1) and the source index is derived as (off + j). Bounding
         * the source index by [len] instead would silently skip part (or all)
         * of the comparison whenever off > 0.
         */
        for (int j = 0; j < len; j++) {

            if (expected[off + j] != actual[j]) {

                fail("bytes differ at offset=" + j
                        + " in decompressed data: expected.length="
                        + expected.length + ", off=" + off + ", len=" + len);

            }

        }

        return compressed.length;

    }

    /**
     * Round-trip test: a randomly generated record of 10000 terms is
     * compressed and decompressed in its entirety using the compressor
     * returned by <code>getInstance()</code>.
     */
    public void test_recordCompressor01() {

        final IRecordCompressor compressor = getInstance();

        final byte[] data = getRandomRecord(10000);

        // the slice under test is the whole record.
        final int offset = 0;
        final int length = data.length;

        doCompressionTest(compressor, data, offset, length);

    }

    /**
     * Round-trip test using a compressor which has itself been passed through
     * {@link SerializerUtil} (serialized and then de-serialized) before it is
     * used to compress and decompress a randomly generated record of 10000
     * terms.
     */
    public void test_recordCompressor02() {

        // serialize and then de-serialize the compression provider.
        final Object roundTripped = SerializerUtil.deserialize(SerializerUtil
                .serialize(getInstance()));

        final IRecordCompressor compressor = (IRecordCompressor) roundTripped;

        final byte[] data = getRandomRecord(10000);

        doCompressionTest(compressor, data, 0, data.length);

    }

    /**
     * Round-trip test for the edge case of an empty (zero-length) record.
     */
    public void test_recordCompressor03() {

        final IRecordCompressor compressor = getInstance();

        // nothing to compress.
        final byte[] empty = new byte[0];

        doCompressionTest(compressor, empty, 0, empty.length);

    }
    
    /**
     * Stress test ability to compress and decompress data.
     * <p>
     * A single random record of 10000 terms is generated and then a series of
     * randomly chosen slices of that record are each compressed, decompressed
     * and verified. On the first pass and on every 100th pass thereafter the
     * compressor itself is serialized and de-serialized. A summary (record
     * count, total bytes, compressed bytes, compression ratio and throughput)
     * is written on stderr when the test completes.
     */
    public void test_recordCompressor_stressTest() {

        final long begin = System.nanoTime();

//        final int limit = 50000;
        final int limit = 2500;

        long sumBytes = 0L;
        long sumCompressed = 0L;

        IRecordCompressor c = getInstance();

        final byte[] expected = getRandomRecord(10000);

        for (int i = 0; i < limit; i++) {

            // offset is chosen from the first half of the record.
            final int off = r.nextInt(expected.length / 2);

            // length is chosen such that the slice lies within the record.
            final int len = r.nextInt(expected.length - off);

            sumBytes += len;

            sumCompressed += doCompressionTest(c, expected, off, len);

            if (i % 100 == 0) {

                // every so often, we (de-)serialize the compressor itself.
                c = (IRecordCompressor) SerializerUtil
                        .deserialize(SerializerUtil.serialize(c));

            }

        }

        final long elapsed = System.nanoTime() - begin;

        /*
         * Elapsed time in fractional seconds. TimeUnit#toSeconds() truncates
         * to whole seconds, which reports an infinite rate for runs shorter
         * than one second and a badly quantized rate otherwise.
         */
        final double elapsedSeconds = elapsed
                / (double) TimeUnit.SECONDS.toNanos(1);

        System.err.println("Compressed " + limit + " records totaling "
                + sumBytes + " bytes into " + sumCompressed + " bytes: ratio="
                + ((double) sumCompressed / (double) sumBytes) + ", rate="
                + (sumBytes / elapsedSeconds)
                + " bytes/sec: " + c);

    }

}