// TestIndexSegmentWithBloomFilter.java example

/**

Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Dec 21, 2006
 */

package com.bigdata.btree;

import java.io.File;
import java.io.IOException;
import java.util.Properties;
import java.util.UUID;

import com.bigdata.btree.keys.TestKeyBuilder;
import com.bigdata.journal.BufferMode;
import com.bigdata.journal.Options;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.rawstore.SimpleMemoryRawStore;
import com.bigdata.util.BytesUtil;

/**
 * Test builds trees on the journal, evicts them into an {@link IndexSegment},
 * and then compares the performance and correctness of index point tests with
 * and without the use of the bloom filter.
 * 
 * @todo compare performance with and without the bloom filter.
 * 
 * @todo test points that will not be in the index as well as those that are.
 * 
 * @todo report on the cost to construct the filter and its serialized size and
 *       runtime space.
 * 
 * @todo verify the target error rate.
 * 
 * @todo explore different error rates, including Fast.mostSignificantBit( n ) +
 *       1 which would provide an expectation of no false positives.
 * 
 * @todo Compare for each build algorithm, just like
 *       {@link TestIndexSegmentBuilderWithLargeTrees}.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class TestIndexSegmentWithBloomFilter extends AbstractBTreeTestCase {

    /**
     * Creates the test case without assigning it a name.
     */
    public TestIndexSegmentWithBloomFilter() {
        super();
    }

    /**
     * Creates the test case with the given name.
     *
     * @param name
     *            The name handed to the superclass constructor.
     */
    public TestIndexSegmentWithBloomFilter(String name) {

        super(name);

    }

    /**
     * Value passed as the <code>bufferNodes</code> argument to
     * <code>IndexSegmentBuilder.newInstance(...)</code> by every index segment
     * build in this test suite.
     */
    private static final boolean bufferNodes = true;
    
    /**
     * Returns the configuration for this test. The properties are obtained
     * from the superclass on the first call, extended with
     * {@link Options#BUFFER_MODE} set to {@link BufferMode#Disk} and
     * {@link Options#CREATE_TEMP_FILE} set to <code>"true"</code>, and then
     * cached so that later calls return the same instance.
     *
     * @return The (cached) properties.
     */
    public Properties getProperties() {

        // Already configured by an earlier call.
        if (properties != null) {
            return properties;
        }

        properties = super.getProperties();

        final String bufferMode = BufferMode.Disk.toString();
        properties.setProperty(Options.BUFFER_MODE, bufferMode);
        properties.setProperty(Options.CREATE_TEMP_FILE, "true");

        return properties;

    }

    /**
     * Cached test configuration. It is <code>null</code> until the first call
     * to {@link #getProperties()}, which populates it.
     */
    private Properties properties;

    /**
     * Creates a new, empty {@link BTree} on a fresh
     * {@link SimpleMemoryRawStore}. The index is given a random {@link UUID},
     * the indicated branching factor and the indicated bloom filter factory.
     * 
     * @param branchingFactor
     *            The branching factor for the new btree.
     * @param bloomFilterFactory
     *            The bloom filter factory that is set on the index metadata
     *            (the callers in this class always pass a factory).
     * 
     * @return The btree.
     */
    public BTree getBTree(int branchingFactor, BloomFilterFactory bloomFilterFactory) {

        // Describe the index first ...
        final IndexMetadata indexMetadata = new IndexMetadata(UUID.randomUUID());
        indexMetadata.setBranchingFactor(branchingFactor);
        indexMetadata.setBloomFilterFactory(bloomFilterFactory);

        // ... then create it on its own in-memory store.
        final IRawStore backingStore = new SimpleMemoryRawStore();

        return BTree.create(backingStore, indexMetadata);

    }
    
    /**
     * Branching factors for the source btree that is then used to build an
     * {@link IndexSegment}. This parameter indirectly determines both the #of
     * leaves and the #of entries in the source btree.
     * 
     * Note: Regardless of the branching factor in the source btree, the same
     * {@link IndexSegment} should be built for a given set of entries
     * (key-value pairs) and a given output branching factor for the
     * {@link IndexSegment}. However, input trees of different heights also
     * stress different parts of the algorithm.
     */
    final int[] branchingFactors = new int[]{3,4,5,10,13};//64};//128};//,512};
    
    /**
     * Stress test for building {@link IndexSegment}s from {@link BTree}s that
     * were populated with dense random keys in [1:n]. For every source
     * branching factor <code>m</code> in {@link #branchingFactors} a source
     * {@link BTree} is built with <code>n</code> equal to <code>m</code>,
     * <code>m^2</code> and <code>m^3</code>, using a bloom filter factory
     * configured for the same <code>n</code>. Each source tree is then handed
     * to {@link #doBuildIndexSegmentAndCompare(BTree)}, which builds index
     * segments for a variety of output branching factors and compares them
     * against the source tree.
     */
    public void test_randomDenseKeys() throws Exception {

        final double p = 1 / 64d; // error rate
        final double maxP = p * 10; // max error rate

        for (final int m : branchingFactors) {

            /*
             * @todo n = m^4 is not exercised since it overflows the initial
             * journal extent.
             */
            final int[] sizes = new int[] { m, m * m, m * m * m };

            for (final int n : sizes) {

                final BTree source = getBTree(m, new BloomFilterFactory(
                        n/* n */, p, maxP));

                doBuildIndexSegmentAndCompare(doSplitWithRandomDenseKeySequence(
                        source, m, n));

            }

        }

    }
    
    /**
     * Stress test for building {@link IndexSegment}s from {@link BTree}s that
     * were populated with sparse random keys. For every source branching
     * factor <code>m</code> in {@link #branchingFactors} a source
     * {@link BTree} is populated using the sizes <code>m</code>,
     * <code>m^2</code> and <code>m^3</code>. Each source tree is then handed to
     * {@link #doBuildIndexSegmentAndCompare(BTree)}, which builds index
     * segments for a variety of output branching factors and compares them
     * against the source tree.
     * <p>
     * Note: the source trees come from the single-argument
     * <code>getBTree(m)</code>, which is not defined in this class
     * (presumably inherited - confirm against the superclass).
     */
    public void test_randomSparseKeys() throws Exception {

        final int trace = 0;

        for (final int m : branchingFactors) {

            /*
             * @todo a size of m^4 is not exercised since it overflows the
             * initial journal extent.
             */
            final int[] sizes = new int[] { m, m * m, m * m * m };

            for (final int n : sizes) {

                final BTree source = doInsertRandomSparseKeySequenceTest(
                        getBTree(m), n, trace);

                doBuildIndexSegmentAndCompare(source);

            }

        }

    }

    /**
     * Test when the input tree is a root leaf with three values. The output
     * tree will also be a root leaf.
     * <p>
     * The source btree holds the keys 3, 5 and 7. An index segment with a
     * bloom filter is built from it. The bloom filter is probed both on the
     * builder (before serialization) and on the index segment loaded from the
     * file (after serialization). The loaded index segment is then compared
     * with the source btree and random lookups are run against both.
     * <p>
     * The index segment file is deleted when the test ends, whether it passed
     * or failed.
     * 
     * @throws Exception
     *             if the index segment can not be built or read back.
     */
    public void test_rootLeaf() throws Exception {

        final int m = 3; // for input and output trees.

        final BTree btree = getBTree(m, new BloomFilterFactory(100/* n */,
                1 / 64d/* p */, 1 / 32d/* maxP */));

        btree.insert(TestKeyBuilder.asSortKey(3), new SimpleEntry(3));
        btree.insert(TestKeyBuilder.asSortKey(5), new SimpleEntry(5));
        btree.insert(TestKeyBuilder.asSortKey(7), new SimpleEntry(7));

        final File outFile2 = new File(getName() + "_m" + m + "_bloom.seg");

        if (outFile2.exists() && !outFile2.delete()) {
            fail("Could not delete old index segment: " + outFile2.getAbsoluteFile());
        }

        final File tmpDir = outFile2.getAbsoluteFile().getParentFile();

        try {

            /*
             * Build the index segment with a bloom filter.
             */
            if (log.isInfoEnabled())
                log.info("Building index segment (w/ bloom): in(m="
                        + btree.getBranchingFactor() + ", nentries="
                        + btree.getEntryCount() + "), out(m=" + m + ")");

            final long commitTime = System.currentTimeMillis();

            final IndexSegmentBuilder builder2 = IndexSegmentBuilder.newInstance(
                    outFile2, tmpDir, btree.getEntryCount(), btree.rangeIterator(),
                    m, btree.getIndexMetadata(), commitTime,
                    true/* compactingMerge */, bufferNodes);

            // The returned checkpoint is not needed by this test.
            builder2.call();

            // the bloom filter instance before serialization.
            IBloomFilter bloomFilter = builder2.bloomFilter;

            // keys that were inserted: must be reported (no false negatives).
            assertTrue("3", bloomFilter.contains(i2k(3)));
            assertTrue("5", bloomFilter.contains(i2k(5)));
            assertTrue("7", bloomFilter.contains(i2k(7)));
            // keys that were not inserted: rejection relies on the filter
            // not producing a false positive for these keys.
            assertFalse("4", bloomFilter.contains(i2k(4)));
            assertFalse("9", bloomFilter.contains(i2k(9)));

            /*
             * Verify can load the index file and that the metadata
             * associated with the index file is correct (we are only
             * checking those aspects that are easily defined by the test
             * case and not, for example, those aspects that depend on the
             * specifics of the length of serialized nodes or leaves).
             */
            if (log.isInfoEnabled())
                log.info("Opening index segment w/ bloom filter.");

            final IndexSegment seg2 = new IndexSegmentStore(outFile2)
                    .loadIndexSegment();

            try {

                /*
                 * Verify the total index order.
                 */
                if (log.isInfoEnabled())
                    log.info("Verifying index segments.");

                assertSameBTree(btree, seg2);

                // the bloom filter instance that was de-serialized.
                bloomFilter = seg2.getBloomFilter();

                // keys that were inserted: must be reported (no false
                // negatives).
                assertTrue("3", bloomFilter.contains(i2k(3)));
                assertTrue("5", bloomFilter.contains(i2k(5)));
                assertTrue("7", bloomFilter.contains(i2k(7)));
                // keys that were not inserted: rejection relies on the
                // filter not producing a false positive for these keys.
                assertFalse("4", bloomFilter.contains(i2k(4)));
                assertFalse("9", bloomFilter.contains(i2k(9)));

                // Note: this is a very small index (3 keys) so the cast is
                // safe.
                final byte[][] keys = new byte[(int) btree.getEntryCount()][];
                final byte[][] vals = new byte[(int) btree.getEntryCount()][];

                getKeysAndValues(btree, keys, vals);

                doRandomLookupTest("btree", btree, keys, vals);
                doRandomLookupTest("w/ bloom", seg2, keys, vals);

            } finally {

                if (log.isInfoEnabled())
                    log.info("Closing index segments.");

                seg2.close();

            }

        } finally {

            // Clean up the segment file even when an assertion failed. The
            // file is only removed after the index segment was closed.
            if (outFile2.exists() && !outFile2.delete()) {

                log.warn("Could not delete index segment: " + outFile2);

            }

        }

    }
    
    /**
     * Test helper builds an index segment from the btree using several
     * different branching factors and each time compares the resulting total
     * ordering to the original btree.
     * <p>
     * For each output branching factor two index segments are built from the
     * same source btree: one without a bloom filter and one with a bloom
     * filter. The bloom filter is checked for false negatives both before and
     * after serialization.
     * <p>
     * The store backing the source btree is destroyed before this method
     * returns, whether or not the comparison succeeded.
     * 
     * @param btree
     *            The source btree.
     * 
     * @throws RuntimeException
     *             if the source btree has more than
     *             {@link Integer#MAX_VALUE} entries.
     */
    public void doBuildIndexSegmentAndCompare(final BTree btree)
            throws Exception {

        try {

            if (btree.getEntryCount() > Integer.MAX_VALUE) {
                /*
                 * This code can not validate a B+Tree with more than MAX_INT
                 * keys since it relies on materialization of the data in RAM
                 * within arrays and Java does not support int64 array indices.
                 */
                throw new RuntimeException(
                        "Too many entries to validate in RAM: entryCount="
                                + btree.getEntryCount());
            }

            // branching factors used for the index segment.
            final int[] outputBranchingFactors = new int[] { 3, 4, 5, 10, 20,
                    60, 100, 256, 1024, 4096, 8192 };

            for (final int m : outputBranchingFactors) {

                // build and verify with the next output branching factor.
                buildAndVerifySegmentPair(btree, m);

            }

        } finally {
            /*
             * Release the store backing the source btree.
             */
            if (log.isInfoEnabled())
                log.info("Closing journal.");
            btree.getStore().destroy();
        }

    }

    /**
     * Builds one index segment without a bloom filter and one with a bloom
     * filter from the source btree using the given output branching factor
     * and verifies both against the source btree and against one another.
     * Both index segments are closed and both files are deleted before this
     * method returns, even if a check failed.
     * 
     * @param btree
     *            The source btree.
     * @param m
     *            The branching factor for the generated index segments.
     */
    private void buildAndVerifySegmentPair(final BTree btree, final int m)
            throws Exception {

        final File outFile = new File(getName() + "_m" + m + ".seg");
        final File outFile2 = new File(getName() + "_m" + m + "_bloom.seg");

        if (outFile.exists() && !outFile.delete()) {
            fail("Could not delete old index segment: " + outFile.getAbsoluteFile());
        }

        if (outFile2.exists() && !outFile2.delete()) {
            fail("Could not delete old index segment: " + outFile2.getAbsoluteFile());
        }

        final File tmpDir = outFile.getAbsoluteFile().getParentFile();

        final long commitTime = System.currentTimeMillis();

        IndexSegment seg = null; // w/o bloom filter.
        IndexSegment seg2 = null; // w/ bloom filter.

        try {

            /*
             * Build the index segment without a bloom filter.
             */
            {

                if (log.isInfoEnabled())
                    log.info("Building index segment (w/o bloom): in(m="
                            + btree.getBranchingFactor() + ", nentries="
                            + btree.getEntryCount() + "), out(m=" + m + ")");

                final IndexMetadata metadata = btree.getIndexMetadata().clone();

                metadata.setBloomFilterFactory(null/* disable */);

                IndexSegmentBuilder.newInstance(outFile, tmpDir,
                        btree.getEntryCount(), btree.rangeIterator(), m,
                        metadata, commitTime, true/* compactingMerge */,
                        bufferNodes).call();

            }

            /*
             * Build the index segment with a bloom filter.
             */
            final IndexSegmentBuilder builder2;
            {

                if (log.isInfoEnabled())
                    log.info("Building index segment (w/ bloom): in(m="
                            + btree.getBranchingFactor() + ", nentries="
                            + btree.getEntryCount() + "), out(m=" + m + ")");

                final IndexMetadata metadata = btree.getIndexMetadata().clone();

                /*
                 * Note: Since we know the exact #of index entries in an index
                 * segment, both [n] and [maxP] will be ignored when it comes
                 * time to create the bloom filter for the index segment.
                 */
                metadata.setBloomFilterFactory(new BloomFilterFactory(
                        1/* n */, 1 / 64d/* p */, 1 / 32d/* maxP */));

                builder2 = IndexSegmentBuilder.newInstance(outFile2, tmpDir,
                        btree.getEntryCount(), btree.rangeIterator(), m,
                        metadata, commitTime, true/* compactingMerge */,
                        bufferNodes);

                builder2.call();

            }

            /*
             * Verify that both index files can be loaded.
             */
            if (log.isInfoEnabled())
                log.info("Opening index segment w/o bloom filter.");
            seg = new IndexSegmentStore(outFile).loadIndexSegment();

            if (log.isInfoEnabled())
                log.info("Opening index segment w/ bloom filter.");
            seg2 = new IndexSegmentStore(outFile2).loadIndexSegment();

            /*
             * Explicitly test the bloom filter against ground truth.
             */

            // Note: cast is safe - the caller checked the entryCount.
            final byte[][] keys = new byte[(int) btree.getEntryCount()][];
            final byte[][] vals = new byte[(int) btree.getEntryCount()][];

            getKeysAndValues(btree, keys, vals);

            // vet the bloom filter on the index segment builder.
            doBloomFilterTest("pre-serialization", builder2.bloomFilter, keys);

            // vet the bloom filter on the loaded index segment.
            doBloomFilterTest("post-serialization", seg2.getBloomFilter(), keys);

            /*
             * Verify index segments against the source btree and against one
             * another.
             */
            if (log.isInfoEnabled())
                log.info("Verifying index segments.");
            assertSameBTree(btree, seg);
            assertSameBTree(btree, seg2);
            // close seg w/ bloom filter and then verify with implicit reopen.
            seg2.close();
            assertSameBTree(seg, seg2);

        } finally {

            if (log.isInfoEnabled())
                log.info("Closing index segments.");

            // Close before deleting so the files are no longer in use.
            closeSegmentIfOpen(seg);
            closeSegmentIfOpen(seg2);

            deleteSegmentFile(outFile);
            deleteSegmentFile(outFile2);

        }

    }

    /**
     * Closes the index segment unless it is <code>null</code> or already
     * closed. The check keeps a cleanup after a failed assertion from closing
     * a segment that was closed by the test and never reopened.
     * 
     * @param segment
     *            The index segment (may be <code>null</code>).
     */
    private void closeSegmentIfOpen(final IndexSegment segment) {

        if (segment != null && segment.isOpen()) {

            segment.close();

        }

    }

    /**
     * Deletes the index segment file if it exists and logs a warning when the
     * file could not be deleted.
     * 
     * @param file
     *            The index segment file.
     */
    private void deleteSegmentFile(final File file) {

        if (file.exists() && !file.delete()) {

            log.warn("Could not delete index segment: " + file);

        }

    }

    /**
     * Checks the bloom filter for false negatives against the ground truth
     * set of keys. Every key is probed once, in a random order. A key that the
     * filter reports as absent is a false negative, which a bloom filter must
     * never produce, and fails the test.
     * 
     * @param label
     *            A label for the condition under test (logged at INFO).
     * @param bloomFilter
     *            The bloom filter to be probed.
     * @param keys
     *            The ground truth keys that were inserted into the bloom
     *            filter.
     */
    protected void doBloomFilterTest(String label, IBloomFilter bloomFilter, byte[][] keys) {

        // Report which condition is being verified.
        if (log.isInfoEnabled()) {
            log.info("\ncondition: " + label);// +", size="+bloomFilter.size());
        }

        // Visit the keys in a random permutation.
        final int[] probeOrder = getRandomOrder(keys.length);

        int i = 0;

        while (i < probeOrder.length) {

            final byte[] probe = keys[probeOrder[i]];

            final boolean reported = bloomFilter.contains(probe);

            assertTrue("false negative: i=" + i + ", key="
                    + BytesUtil.toString(probe), reported);

            i++;

        }

    }
}