/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Feb 26, 2008
 */

package com.bigdata.service;

import java.io.IOException;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.ExecutionException;

import com.bigdata.btree.BTree;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.keys.TestKeyBuilder;
import com.bigdata.btree.proc.BatchInsert.BatchInsertConstructor;
import com.bigdata.btree.proc.BatchRemove.BatchRemoveConstructor;
import com.bigdata.io.SerializerUtil;
import com.bigdata.journal.BufferMode;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TemporaryRawStore;
import com.bigdata.mdi.IMetadataIndex;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.resources.ResourceManager.Options;
import com.bigdata.service.ndx.ClientIndexView;
import com.bigdata.util.Bytes;

/**
 * Test suite verifies that inserts eventually split an index and that deletes
 * eventually cause the index partitions to be joined.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class TestSplitJoin extends AbstractEmbeddedFederationTestCase {

    /**
     * 
     */
    public TestSplitJoin() {
        super();
    }

    public TestSplitJoin(String name) {
        super(name);
    }

    /**
     * Overridden to specify the {@link BufferMode#Disk} mode and to lower the
     * threshold at which an overflow operation will be selected.
     */
    public Properties getProperties() {

        Properties properties = new Properties(super.getProperties());

        // overrides value set in the superclass.
        properties.setProperty(Options.BUFFER_MODE, BufferMode.Disk.toString());

        // this test does not rely on multiple data services.
        properties.setProperty(EmbeddedClient.Options.NDATA_SERVICES, "1");

        /*
         * Note: disable copy of small index segments to the new journal during
         * overflow so the behavior is more predictable.
         */
        properties.setProperty(Options.COPY_INDEX_THRESHOLD, "0");

        // Note: disables index partition moves.
        properties.setProperty(Options.MAXIMUM_MOVES_PER_TARGET, "0");

        // Note: make sure joins are enabled.
        properties.setProperty(Options.JOINS_ENABLED, "true");

        // Note: disable scatter splits.
        properties.setProperty(Options.SCATTER_SPLIT_ENABLED, "false");

//        /*
//         * Note: Together these properties disable incremental index builds. We
//         * need to do that since a compacting build is required before the
//         * rangeCount() for the view will drop, which is a precondition for the
//         * JOIN.
//         */
//        properties.setProperty(Options.MAXIMUM_JOURNALS_PER_VIEW, "2");
//        properties.setProperty(Options.MAXIMUM_SEGMENTS_PER_VIEW, "1");
//        properties.setProperty(Options.MAXIMUM_OPTIONAL_MERGES_PER_OVERFLOW, "" + Integer.MAX_VALUE);

        // turn off acceleration features.
        properties.setProperty(Options.ACCELERATE_OVERFLOW_THRESHOLD, "0");
        properties.setProperty(Options.ACCELERATE_SPLIT_THRESHOLD, "0");

        // Note: Set a low maximum shard size.
        properties.setProperty(Options.NOMINAL_SHARD_SIZE, "" + Bytes.megabyte);

//        properties.setProperty(Options.INITIAL_EXTENT, "" + 1 * Bytes.megabyte);
//        properties.setProperty(Options.MAXIMUM_EXTENT, "" + 1 * Bytes.megabyte);

        return properties;

    }
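
    /*
     * Net effect of the overrides above: copies, moves, and scatter splits
     * are disabled, joins are enabled, and the nominal shard size is small,
     * so the only index partition operations the test should observe are the
     * splits and joins which it provokes itself via forced overflows.
     */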

    /**
     * Test registers a scale-out index, writes data onto the initial index
     * partition, forces a split, verifies that the scale-out index has been
     * divided into two index partitions, and verifies that a range scan of the
     * scale-out index agrees with the ground truth. The test then goes on to
     * delete index entries until it forces a join of the index partitions and
     * verifies that the index partitions were in fact joined.
     * 
     * @throws IOException
     * @throws ExecutionException
     * @throws InterruptedException
     */
    public void test_splitJoin() throws IOException, InterruptedException,
            ExecutionException {

        /*
         * Register the index.
         */
        final String name = "testIndex";

        final UUID indexUUID = UUID.randomUUID();

        final int batchSize = 5000;
//        final int entryCountPerSplit = 400;
//        final double overCapacityMultiplier = 1.5;
//        final int minimumEntryCountPerSplit = 100;
        {

            final IndexMetadata indexMetadata = new IndexMetadata(name,
                    indexUUID);

//            // The threshold below which we will try to join index partitions.
//            ((DefaultSplitHandler) indexMetadata.getSplitHandler()).setMinimumEntryCount(minimumEntryCountPerSplit);
//
//            // The target #of index entries per partition.
//            ((DefaultSplitHandler) indexMetadata.getSplitHandler()).setEntryCountPerSplit(entryCountPerSplit);
//
//            // Overcapacity multiplier before an index partition will be split.
//            ((DefaultSplitHandler) indexMetadata.getSplitHandler()).setOverCapacityMultiplier(overCapacityMultiplier);

            // must support delete markers.
            indexMetadata.setDeleteMarkers(true);

            // register the scale-out index, creating a single index partition.
            fed.registerIndex(indexMetadata, dataService0.getServiceUUID());

        }

        /*
         * Verify the initial index partition.
         */
        final PartitionLocator pmd0;
        {

            final ClientIndexView ndx = (ClientIndexView) fed.getIndex(name,
                    ITx.UNISOLATED);

            final IMetadataIndex mdi = ndx.getMetadataIndex();

            assertEquals("#index partitions", 1, mdi.rangeCount(null, null));

            // This is the initial partition locator metadata record.
            pmd0 = mdi.get(new byte[] {});

            assertEquals("partitionId", 0L, pmd0.getPartitionId());

            assertEquals("dataServiceUUIDs", dataService0.getServiceUUID(),
                    pmd0.getDataServiceUUID());

        }

        assertEquals("partitionCount", 1, getPartitionCount(name));
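
        /*
         * Note: getPartitionCount(name) is inherited from the test harness. A
         * minimal equivalent (a sketch, assuming only the APIs already used
         * above) would count the partition locators in the metadata index:
         *
         *   final IMetadataIndex mdi = fed.getMetadataIndex(name, ITx.READ_COMMITTED);
         *   final int npart = (int) mdi.rangeCount(null, null);
         */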

        /*
         * Setup the ground truth B+Tree. This is backed by a temporary raw
         * store so that it does not overflow.
         */
        final BTree groundTruth;
        {

            final IndexMetadata indexMetadata = new IndexMetadata(indexUUID);

            groundTruth = BTree.create(new TemporaryRawStore(), indexMetadata);

        }

        /*
         * Populate the index with data until the journal for the data service
         * on which the initial partition resides overflows.
         * 
         * Note: With the (commented out) random key variant there can be
         * duplicate keys, which means that the resulting rangeCount can be
         * less than the #of tuples written on the index. (As written below,
         * the keys are generated in strictly increasing order, so each key is
         * distinct.) Either way, we handle this by looping until a scan of the
         * metadata index shows that an index partition split has occurred.
         * 
         * Note: The index split will occur asynchronously once (a) the index
         * partition has a sufficient #of entries; and (b) a group commit
         * occurs. However, this loop will continue to run, so writes will
         * continue to accumulate on the index partition on the live journal.
         * Once the overflow process completes the client will be notified that
         * the index partition which it has been addressing no longer exists on
         * the data service. At that point the client SHOULD re-try the
         * operation. Once the client returns from that retry we will notice
         * that the partition count has increased and exit this loop.
         */
        int nrounds = 0;
        long nwritten = 0L;
//        boolean done = false;
        int npartitions = -1;
        final long overflowCounter0 = dataService0
                .getAsynchronousOverflowCounter();
        long overflowCounter = overflowCounter0;
        while (npartitions < 2) {

            final int nentries = batchSize; // minimumEntryCountPerSplit;
//            final KV[] data = getRandomKeyValues(nentries);
            final byte[][] keys = new byte[nentries][];
            final byte[][] vals = new byte[nentries][];

            for (int i = 0; i < nentries; i++) {

                keys[i] = TestKeyBuilder.asSortKey(nwritten + i);

                vals[i] = SerializerUtil.serialize(nwritten + i);

            }

            // insert the data into the ground truth index.
            groundTruth.submit(0/* fromIndex */, nentries/* toIndex */, keys,
                    vals, BatchInsertConstructor.RETURN_NO_VALUES,
                    null/* handler */);

            /*
             * Set flag to force overflow on group commit.
             */
//            if (groundTruth.getEntryCount() >= overCapacityMultiplier * entryCountPerSplit) {

            dataService0.forceOverflow(false/* immediate */,
                    false/* compactingMerge */);

//                done = true;
//
//            }

            // insert the data into the scale-out index.
            fed.getIndex(name, ITx.UNISOLATED).submit(0/* fromIndex */,
                    nentries/* toIndex */, keys, vals,
                    BatchInsertConstructor.RETURN_NO_VALUES, null/* handler */);

            /*
             * The problem with comparing the entryCount on the ground truth
             * index against the rangeCount on the scale-out index is that
             * duplicate keys can be generated, in which case the scale-out
             * count can be larger than the ground truth count. A utility
             * method based on an iterator would return the real count for the
             * scale-out index; a sketch of such a method, exactRangeCount(),
             * is given at the end of this class.
             */
//            assertEquals("rangeCount", groundTruth.getEntryCount(), fed
//                    .getIndex(name, ITx.UNISOLATED).rangeCount(null, null));

            // wait until asynchronous overflow processing is done.
            overflowCounter = awaitAsynchronousOverflow(dataService0,
                    overflowCounter/* oldValue */);

            nrounds++;

            nwritten += nentries;

            // When GTE 2 the initial index partition was split.
            npartitions = getPartitionCount(name);

            if (log.isInfoEnabled())
                log.info("Populating the index: overflowCounter="
                        + overflowCounter + ", nrounds=" + nrounds
                        + ", nwritten=" + nwritten + ", nentries="
                        + groundTruth.getEntryCount() + " ("
                        + fed.getIndex(name, ITx.UNISOLATED).rangeCount() + ")"
                        + ", npartitions=" + npartitions);

            /*
             * Compare the index against ground truth after overflow.
             */
            if (log.isInfoEnabled())
                log.info("Verifying scale-out index against ground truth");

            assertSameEntryIterator(groundTruth, fed.getIndex(name,
                    ITx.UNISOLATED));

        }

//        // wait until overflow processing is done.
//        final long overflowCounter1 = awaitAsynchronousOverflow(dataService0,
//                overflowCounter0);
//
//        assertEquals("partitionCount", 2, getPartitionCount(name));
//
//        /*
//         * Compare the index against ground truth after overflow.
//         */
//
//        if (log.isInfoEnabled())
//            log.info("Verifying scale-out index against ground truth");
//
//        assertSameEntryIterator(groundTruth, fed.getIndex(name, ITx.UNISOLATED));

        /*
         * Get the key range for the left-most index partition and then delete
         * entries until the index partition underflows.
         */
        nrounds = 0;
        while (npartitions >= 2) {

            final byte[] fromKey = new byte[] {};

            final PartitionLocator locator = fed.getMetadataIndex(name,
                    ITx.READ_COMMITTED).get(fromKey);

            final byte[] toKey = locator.getRightSeparatorKey();

            // ground truth range count for that index partition.
            final int rangeCount = (int) groundTruth.rangeCount(fromKey, toKey);

            if (rangeCount == 0)
                fail("rangeCount=" + rangeCount + ", but expected non-zero");

            /*
             * #of entries to delete (seeking to trigger a join operation):
             * either just over half of a small remainder, or all but
             * batchSize entries of a larger one.
             */
            final int ndelete = rangeCount <= batchSize ? (rangeCount / 2) + 1
                    : rangeCount - batchSize;

            assertTrue("rangeCount=" + rangeCount + ", batchSize=" + batchSize
                    + ", ndelete=" + ndelete, ndelete > 0);

            final byte[][] keys = new byte[ndelete][];

            final ITupleIterator<?> itr = groundTruth.rangeIterator(fromKey,
                    toKey, ndelete, IRangeQuery.KEYS, null/* filter */);

            for (int i = 0; i < ndelete; i++) {

                keys[i] = itr.next().getKey();

            }

            if (log.isInfoEnabled())
                log.info("Will delete " + ndelete + " of " + rangeCount
                        + " entries from " + locator + " to trigger underflow");

            groundTruth.submit(0/* fromIndex */, ndelete/* toIndex */, keys,
                    null/* vals */,
                    BatchRemoveConstructor.RETURN_MUTATION_COUNT,
                    null/* handler */);

            // data service will overflow at the next group commit.
            dataService0.forceOverflow(false/* immediate */,
                    false/* compactingMerge */);

            // delete those tuples, triggering overflow.
            fed.getIndex(name, ITx.UNISOLATED).submit(0/* fromIndex */,
                    ndelete/* toIndex */, keys, null/* vals */,
                    BatchRemoveConstructor.RETURN_MUTATION_COUNT,
                    null/* handler */);

            // wait until asynchronous overflow processing is done.
            overflowCounter = awaitAsynchronousOverflow(dataService0,
                    overflowCounter/* oldValue */);

            nrounds++;

            // #of partitions afterwards.
            npartitions = getPartitionCount(name);

            if (log.isInfoEnabled())
                log.info("Deleting from the index: overflowCounter="
                        + overflowCounter + ", nrounds=" + nrounds
                        + ", nwritten=" + nwritten + ", nentries="
                        + groundTruth.getEntryCount() + " ("
                        + fed.getIndex(name, ITx.UNISOLATED).rangeCount(null,
                                null) + ")" + ", npartitions=" + npartitions);

            /*
             * Compare the index against ground truth after overflow.
             */
            if (log.isInfoEnabled())
                log.info("Verifying scale-out index against ground truth");

            assertSameEntryIterator(groundTruth, fed.getIndex(name,
                    ITx.UNISOLATED));

        }
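
        /*
         * Note: The JOIN is only selected once a compacting build has made
         * the deletes visible to rangeCount(): delete markers do not reduce
         * the reported range count until that build runs (see the retained
         * notes below).
         */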

//        // wait until overflow processing is done.
//        final long overflowCounter2 = awaitAsynchronousOverflow(dataService0,
//                overflowCounter1);
//
//        /*
//         * Confirm index partitions were NOT joined.
//         *
//         * Note: Even though we have deleted index entries the #of index entries
//         * in the partition (as reported by rangeCount()) WILL NOT be reduced
//         * until the next build task for that index partition. Therefore our
//         * post-overflow expectation is that an index build was performed and
//         * the #of index partitions WAS NOT changed.
//         */
//        assertEquals("partitionCount", 2, getPartitionCount(name));
//
//        /*
//         * Compare the index against ground truth after overflow.
//         */
//
//        if (log.isInfoEnabled())
//            log.info("Verifying scale-out index against ground truth");
//
//        assertSameEntryIterator(groundTruth, fed.getIndex(name, ITx.UNISOLATED));

//        /*
//         * Set the forceOverflow flag and then write another index entry on the
//         * other index partition (not the one that we are trying to underflow).
//         * This should trigger a JOIN operation now that the index partition has
//         * gone through a compacting build.
//         */
//        {
//
//            dataService0
//                    .forceOverflow(false/* immediate */, false/* compactingMerge */);
//
//            // find the locator for the last index partition.
//            final PartitionLocator locator = fed.getMetadataIndex(name,
//                    ITx.READ_COMMITTED).find(null);
//
//            final byte[][] keys = new byte[][] { locator.getLeftSeparatorKey() };
////            final byte[][] vals = new byte[][] { /* empty byte[] */ };
//
//            // overwrite the value (if any) under the left separator key.
//            groundTruth.submit(0/* fromIndex */, 1/* toIndex */, keys,
//                    null/* vals */, BatchRemoveConstructor.RETURN_MUTATION_COUNT,
//                    null/* handler */);
//
//            // overwrite the value (if any) under the left separator key.
//            fed.getIndex(name, ITx.UNISOLATED)
//                    .submit(0/* fromIndex */, 1/* toIndex */, keys, null/* vals */,
//                            BatchRemoveConstructor.RETURN_MUTATION_COUNT, null/* handler */);
//
//        }
//
//        // wait until overflow processing is done.
//        final long overflowCounter3 = awaitAsynchronousOverflow(dataService0,
//                overflowCounter2);

//        /*
//         * Confirm index partitions were joined.
//         *
//         * Note: Even though we have deleted index entries the #of index entries
//         * in the partition WILL NOT be reduced until the next build task for
//         * that index partition. Therefore our post-overflow expectation is that
//         * an index build was performed and the #of index partitions WAS NOT
//         * changed.
//         */

        // Confirm the index partitions were joined.
        assertEquals("partitionCount", 1, getPartitionCount(name));

        /*
         * Compare the index against ground truth after overflow.
         */
        if (log.isInfoEnabled())
            log.info("Verifying scale-out index against ground truth");

        assertSameEntryIterator(groundTruth, fed.getIndex(name, ITx.UNISOLATED));

    }
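
    /**
     * Exact tuple count for the scale-out index using a key-range scan.
     * <p>
     * Note: This is the utility method suggested by the note in the populate
     * loop above. It is a minimal sketch, assuming only the
     * {@link IRangeQuery} API already used by this test: rangeCount() over a
     * view with delete markers is an upper bound, while an iterator-based
     * count is exact.
     */
    private long exactRangeCount(final String name) {

        final ITupleIterator<?> itr = fed.getIndex(name, ITx.UNISOLATED)
                .rangeIterator(null/* fromKey */, null/* toKey */,
                        0/* capacity */, IRangeQuery.KEYS, null/* filter */);

        long n = 0L;

        // visit each undeleted tuple, counting as we go.
        while (itr.hasNext()) {

            itr.next();

            n++;

        }

        return n;

    }

}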