/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on Feb 26, 2008
 */

package com.bigdata.service;

import java.io.IOException;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.ExecutionException;

import com.bigdata.btree.BTree;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.keys.TestKeyBuilder;
import com.bigdata.btree.proc.BatchInsert.BatchInsertConstructor;
import com.bigdata.btree.proc.BatchRemove.BatchRemoveConstructor;
import com.bigdata.io.SerializerUtil;
import com.bigdata.journal.BufferMode;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TemporaryRawStore;
import com.bigdata.mdi.IMetadataIndex;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.resources.ResourceManager.Options;
import com.bigdata.service.ndx.ClientIndexView;
import com.bigdata.util.Bytes;

/**
 * Test suite verifies that inserts eventually split an index and that deletes
 * eventually cause the index partitions to be joined.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class TestSplitJoin extends AbstractEmbeddedFederationTestCase {

    /**
     * 
     */
    public TestSplitJoin() {
        super();
    }

    public TestSplitJoin(String name) {
        super(name);
    }

    /**
     * Overridden to specify the {@link BufferMode#Disk} mode and to lower the
     * threshold at which an overflow operation will be selected.
     */
    public Properties getProperties() {

        Properties properties = new Properties(super.getProperties());

        // overrides value set in the superclass.
        properties.setProperty(Options.BUFFER_MODE, BufferMode.Disk.toString());

        // this test does not rely on multiple data services.
        properties.setProperty(EmbeddedClient.Options.NDATA_SERVICES, "1");

        /*
         * Note: disable copy of small index segments to the new journal during
         * overflow so the behavior is more predictable.
         */
        properties.setProperty(Options.COPY_INDEX_THRESHOLD, "0");

        // Note: disables index partition moves.
        properties.setProperty(Options.MAXIMUM_MOVES_PER_TARGET, "0");

        // Note: make sure joins are enabled.
        properties.setProperty(Options.JOINS_ENABLED, "true");

        // Note: disable scatter splits.
        properties.setProperty(Options.SCATTER_SPLIT_ENABLED, "false");

//        /*
//         * Note: Together these properties disable incremental index builds. We
//         * need to do that since a compacting build is required before the
//         * rangeCount() for the view will drop, which is a precondition for the
//         * JOIN.
//         */
//        properties.setProperty(Options.MAXIMUM_JOURNALS_PER_VIEW, "2");
//        properties.setProperty(Options.MAXIMUM_SEGMENTS_PER_VIEW, "1");
//        properties.setProperty(Options.MAXIMUM_OPTIONAL_MERGES_PER_OVERFLOW, "" + Integer.MAX_VALUE);

        // turn off acceleration features.
        properties.setProperty(Options.ACCELERATE_OVERFLOW_THRESHOLD, "0");
        properties.setProperty(Options.ACCELERATE_SPLIT_THRESHOLD, "0");

        // Note: Set a low maximum shard size.
        properties.setProperty(Options.NOMINAL_SHARD_SIZE, "" + Bytes.megabyte);

//        properties.setProperty(Options.INITIAL_EXTENT, "" + 1 * Bytes.megabyte);
//        properties.setProperty(Options.MAXIMUM_EXTENT, "" + 1 * Bytes.megabyte);

        return properties;

    }
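
    /*
     * Net effect of the overrides above: copies, moves, and scatter splits
     * are disabled, joins are enabled, and the nominal shard size is small,
     * so the only index partition operations the test should observe are the
     * splits and joins which it provokes itself via forced overflows.
     */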

    /**
     * Test registers a scale-out index, writes data onto the initial index
     * partition, forces a split, verifies that the scale-out index has been
     * divided into two index partitions, and verifies that a range scan of the
     * scale-out index agrees with the ground truth. The test then goes on to
     * delete index entries until it forces a join of the index partitions and
     * verifies that the index partitions were in fact joined.
     * 
     * @throws IOException
     * @throws ExecutionException
     * @throws InterruptedException
     */
    public void test_splitJoin() throws IOException, InterruptedException,
            ExecutionException {

        /*
         * Register the index.
         */
        final String name = "testIndex";

        final UUID indexUUID = UUID.randomUUID();

        final int batchSize = 5000;
//        final int entryCountPerSplit = 400;
//        final double overCapacityMultiplier = 1.5;
//        final int minimumEntryCountPerSplit = 100;
        {

            final IndexMetadata indexMetadata = new IndexMetadata(name,
                    indexUUID);

//            // The threshold below which we will try to join index partitions.
//            ((DefaultSplitHandler) indexMetadata.getSplitHandler()).setMinimumEntryCount(minimumEntryCountPerSplit);
//
//            // The target #of index entries per partition.
//            ((DefaultSplitHandler) indexMetadata.getSplitHandler()).setEntryCountPerSplit(entryCountPerSplit);
//
//            // Overcapacity multiplier before an index partition will be split.
//            ((DefaultSplitHandler) indexMetadata.getSplitHandler()).setOverCapacityMultiplier(overCapacityMultiplier);

            // must support delete markers.
            indexMetadata.setDeleteMarkers(true);

            // register the scale-out index, creating a single index partition.
            fed.registerIndex(indexMetadata, dataService0.getServiceUUID());

        }

        /*
         * Verify the initial index partition.
         */
        final PartitionLocator pmd0;
        {

            final ClientIndexView ndx = (ClientIndexView) fed.getIndex(name,
                    ITx.UNISOLATED);

            final IMetadataIndex mdi = ndx.getMetadataIndex();

            assertEquals("#index partitions", 1, mdi.rangeCount(null, null));

            // This is the initial partition locator metadata record.
            pmd0 = mdi.get(new byte[] {});

            assertEquals("partitionId", 0L, pmd0.getPartitionId());

            assertEquals("dataServiceUUIDs", dataService0.getServiceUUID(),
                    pmd0.getDataServiceUUID());

        }

        assertEquals("partitionCount", 1, getPartitionCount(name));
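
        /*
         * Note: getPartitionCount(name) is inherited from the test harness. A
         * minimal equivalent (a sketch, assuming only the APIs already used
         * above) would count the partition locators in the metadata index:
         *
         *   final IMetadataIndex mdi = fed.getMetadataIndex(name, ITx.READ_COMMITTED);
         *   final int npart = (int) mdi.rangeCount(null, null);
         */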

        /*
         * Setup the ground truth B+Tree. This is backed by a temporary raw
         * store so that it does not overflow.
         */
        final BTree groundTruth;
        {

            final IndexMetadata indexMetadata = new IndexMetadata(indexUUID);

            groundTruth = BTree.create(new TemporaryRawStore(), indexMetadata);

        }

        /*
         * Populate the index with data until the journal for the data service
         * on which the initial partition resides overflows.
         * 
         * Note: With the (commented out) random key variant there can be
         * duplicate keys, which means that the resulting rangeCount can be
         * less than the #of tuples written on the index. (As written below,
         * the keys are generated in strictly increasing order, so each key is
         * distinct.) Either way, we handle this by looping until a scan of the
         * metadata index shows that an index partition split has occurred.
         * 
         * Note: The index split will occur asynchronously once (a) the index
         * partition has a sufficient #of entries; and (b) a group commit
         * occurs. However, this loop will continue to run, so writes will
         * continue to accumulate on the index partition on the live journal.
         * Once the overflow process completes the client will be notified that
         * the index partition which it has been addressing no longer exists on
         * the data service. At that point the client SHOULD re-try the
         * operation. Once the client returns from that retry we will notice
         * that the partition count has increased and exit this loop.
         */
        int nrounds = 0;
        long nwritten = 0L;
//        boolean done = false;
        int npartitions = -1;
        final long overflowCounter0 = dataService0
                .getAsynchronousOverflowCounter();
        long overflowCounter = overflowCounter0;
        while (npartitions < 2) {

            final int nentries = batchSize; // minimumEntryCountPerSplit;
//            final KV[] data = getRandomKeyValues(nentries);
            final byte[][] keys = new byte[nentries][];
            final byte[][] vals = new byte[nentries][];

            for (int i = 0; i < nentries; i++) {

                keys[i] = TestKeyBuilder.asSortKey(nwritten + i);

                vals[i] = SerializerUtil.serialize(nwritten + i);

            }

            // insert the data into the ground truth index.
            groundTruth.submit(0/* fromIndex */, nentries/* toIndex */, keys,
                    vals, BatchInsertConstructor.RETURN_NO_VALUES,
                    null/* handler */);

            /*
             * Set flag to force overflow on group commit.
             */
//            if (groundTruth.getEntryCount() >= overCapacityMultiplier * entryCountPerSplit) {

            dataService0.forceOverflow(false/* immediate */,
                    false/* compactingMerge */);

//                done = true;
//
//            }

            // insert the data into the scale-out index.
            fed.getIndex(name, ITx.UNISOLATED).submit(0/* fromIndex */,
                    nentries/* toIndex */, keys, vals,
                    BatchInsertConstructor.RETURN_NO_VALUES, null/* handler */);

            /*
             * The problem with comparing the entryCount on the ground truth
             * index against the rangeCount on the scale-out index is that
             * duplicate keys can be generated, in which case the scale-out
             * count can be larger than the ground truth count. A utility
             * method based on an iterator would return the real count for the
             * scale-out index; a sketch of such a method, exactRangeCount(),
             * is given at the end of this class.
             */
//            assertEquals("rangeCount", groundTruth.getEntryCount(), fed
//                    .getIndex(name, ITx.UNISOLATED).rangeCount(null, null));

            // wait until asynchronous overflow processing is done.
            overflowCounter = awaitAsynchronousOverflow(dataService0,
                    overflowCounter/* oldValue */);

            nrounds++;

            nwritten += nentries;

            // When GTE 2 the initial index partition was split.
            npartitions = getPartitionCount(name);

            if (log.isInfoEnabled())
                log.info("Populating the index: overflowCounter="
                        + overflowCounter + ", nrounds=" + nrounds
                        + ", nwritten=" + nwritten + ", nentries="
                        + groundTruth.getEntryCount() + " ("
                        + fed.getIndex(name, ITx.UNISOLATED).rangeCount() + ")"
                        + ", npartitions=" + npartitions);

            /*
             * Compare the index against ground truth after overflow.
             */
            if (log.isInfoEnabled())
                log.info("Verifying scale-out index against ground truth");

            assertSameEntryIterator(groundTruth, fed.getIndex(name,
                    ITx.UNISOLATED));

        }

//        // wait until overflow processing is done.
//        final long overflowCounter1 = awaitAsynchronousOverflow(dataService0,
//                overflowCounter0);
//
//        assertEquals("partitionCount", 2, getPartitionCount(name));
//
//        /*
//         * Compare the index against ground truth after overflow.
//         */
//
//        if (log.isInfoEnabled())
//            log.info("Verifying scale-out index against ground truth");
//
//        assertSameEntryIterator(groundTruth, fed.getIndex(name, ITx.UNISOLATED));

        /*
         * Get the key range for the left-most index partition and then delete
         * entries until the index partition underflows.
         */
        nrounds = 0;
        while (npartitions >= 2) {

            final byte[] fromKey = new byte[] {};

            final PartitionLocator locator = fed.getMetadataIndex(name,
                    ITx.READ_COMMITTED).get(fromKey);

            final byte[] toKey = locator.getRightSeparatorKey();

            // ground truth range count for that index partition.
            final int rangeCount = (int) groundTruth.rangeCount(fromKey, toKey);

            if (rangeCount == 0)
                fail("rangeCount=" + rangeCount + ", but expected non-zero");

            /*
             * #of entries to delete (seeking to trigger a join operation):
             * either just over half of a small remainder, or all but
             * batchSize entries of a larger one.
             */
            final int ndelete = rangeCount <= batchSize ? (rangeCount / 2) + 1
                    : rangeCount - batchSize;

            assertTrue("rangeCount=" + rangeCount + ", batchSize=" + batchSize
                    + ", ndelete=" + ndelete, ndelete > 0);

            final byte[][] keys = new byte[ndelete][];

            final ITupleIterator<?> itr = groundTruth.rangeIterator(fromKey,
                    toKey, ndelete, IRangeQuery.KEYS, null/* filter */);

            for (int i = 0; i < ndelete; i++) {

                keys[i] = itr.next().getKey();

            }

            if (log.isInfoEnabled())
                log.info("Will delete " + ndelete + " of " + rangeCount
                        + " entries from " + locator + " to trigger underflow");

            groundTruth.submit(0/* fromIndex */, ndelete/* toIndex */, keys,
                    null/* vals */,
                    BatchRemoveConstructor.RETURN_MUTATION_COUNT,
                    null/* handler */);

            // data service will overflow at the next group commit.
            dataService0.forceOverflow(false/* immediate */,
                    false/* compactingMerge */);

            // delete those tuples, triggering overflow.
            fed.getIndex(name, ITx.UNISOLATED).submit(0/* fromIndex */,
                    ndelete/* toIndex */, keys, null/* vals */,
                    BatchRemoveConstructor.RETURN_MUTATION_COUNT,
                    null/* handler */);

            // wait until asynchronous overflow processing is done.
            overflowCounter = awaitAsynchronousOverflow(dataService0,
                    overflowCounter/* oldValue */);

            nrounds++;

            // #of partitions afterwards.
            npartitions = getPartitionCount(name);

            if (log.isInfoEnabled())
                log.info("Deleting from the index: overflowCounter="
                        + overflowCounter + ", nrounds=" + nrounds
                        + ", nwritten=" + nwritten + ", nentries="
                        + groundTruth.getEntryCount() + " ("
                        + fed.getIndex(name, ITx.UNISOLATED).rangeCount(null,
                                null) + ")" + ", npartitions=" + npartitions);

            /*
             * Compare the index against ground truth after overflow.
             */
            if (log.isInfoEnabled())
                log.info("Verifying scale-out index against ground truth");

            assertSameEntryIterator(groundTruth, fed.getIndex(name,
                    ITx.UNISOLATED));

        }
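
        /*
         * Note: The JOIN is only selected once a compacting build has made
         * the deletes visible to rangeCount(): delete markers do not reduce
         * the reported range count until that build runs (see the retained
         * notes below).
         */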

//        // wait until overflow processing is done.
//        final long overflowCounter2 = awaitAsynchronousOverflow(dataService0,
//                overflowCounter1);
//
//        /*
//         * Confirm index partitions were NOT joined.
//         *
//         * Note: Even though we have deleted index entries the #of index entries
//         * in the partition (as reported by rangeCount()) WILL NOT be reduced
//         * until the next build task for that index partition. Therefore our
//         * post-overflow expectation is that an index build was performed and
//         * the #of index partitions WAS NOT changed.
//         */
//        assertEquals("partitionCount", 2, getPartitionCount(name));
//
//        /*
//         * Compare the index against ground truth after overflow.
//         */
//
//        if (log.isInfoEnabled())
//            log.info("Verifying scale-out index against ground truth");
//
//        assertSameEntryIterator(groundTruth, fed.getIndex(name, ITx.UNISOLATED));

//        /*
//         * Set the forceOverflow flag and then write another index entry on the
//         * other index partition (not the one that we are trying to underflow).
//         * This should trigger a JOIN operation now that the index partition has
//         * gone through a compacting build.
//         */
//        {
//
//            dataService0
//                    .forceOverflow(false/* immediate */, false/* compactingMerge */);
//
//            // find the locator for the last index partition.
//            final PartitionLocator locator = fed.getMetadataIndex(name,
//                    ITx.READ_COMMITTED).find(null);
//
//            final byte[][] keys = new byte[][] { locator.getLeftSeparatorKey() };
////            final byte[][] vals = new byte[][] { /* empty byte[] */ };
//
//            // overwrite the value (if any) under the left separator key.
//            groundTruth.submit(0/* fromIndex */, 1/* toIndex */, keys,
//                    null/* vals */, BatchRemoveConstructor.RETURN_MUTATION_COUNT,
//                    null/* handler */);
//
//            // overwrite the value (if any) under the left separator key.
//            fed.getIndex(name, ITx.UNISOLATED)
//                    .submit(0/* fromIndex */, 1/* toIndex */, keys, null/* vals */,
//                            BatchRemoveConstructor.RETURN_MUTATION_COUNT, null/* handler */);
//
//        }
//
//        // wait until overflow processing is done.
//        final long overflowCounter3 = awaitAsynchronousOverflow(dataService0,
//                overflowCounter2);

//        /*
//         * Confirm index partitions were joined.
//         *
//         * Note: Even though we have deleted index entries the #of index entries
//         * in the partition WILL NOT be reduced until the next build task for
//         * that index partition. Therefore our post-overflow expectation is that
//         * an index build was performed and the #of index partitions WAS NOT
//         * changed.
//         */

        // Confirm the index partitions were joined.
        assertEquals("partitionCount", 1, getPartitionCount(name));

        /*
         * Compare the index against ground truth after overflow.
         */
        if (log.isInfoEnabled())
            log.info("Verifying scale-out index against ground truth");

        assertSameEntryIterator(groundTruth, fed.getIndex(name, ITx.UNISOLATED));

    }
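
    /**
     * Exact tuple count for the scale-out index using a key-range scan.
     * <p>
     * Note: This is the utility method suggested by the note in the populate
     * loop above. It is a minimal sketch, assuming only the
     * {@link IRangeQuery} API already used by this test: rangeCount() over a
     * view with delete markers is an upper bound, while an iterator-based
     * count is exact.
     */
    private long exactRangeCount(final String name) {

        final ITupleIterator<?> itr = fed.getIndex(name, ITx.UNISOLATED)
                .rangeIterator(null/* fromKey */, null/* toKey */,
                        0/* capacity */, IRangeQuery.KEYS, null/* filter */);

        long n = 0L;

        // visit each undeleted tuple, counting as we go.
        while (itr.hasNext()) {

            itr.next();

            n++;

        }

        return n;

    }

}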