/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Mar 11, 2008
*/
package com.bigdata.service;
import java.io.IOException;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import com.bigdata.btree.BTree;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.ScatterSplitConfiguration;
import com.bigdata.btree.keys.TestKeyBuilder;
import com.bigdata.btree.proc.BatchInsert.BatchInsertConstructor;
import com.bigdata.io.SerializerUtil;
import com.bigdata.journal.BufferMode;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TemporaryRawStore;
import com.bigdata.mdi.IMetadataIndex;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.resources.ResourceManager;
import com.bigdata.resources.ResourceManager.Options;
import com.bigdata.service.ndx.ClientIndexView;
import com.bigdata.service.ndx.RawDataServiceTupleIterator;
import com.bigdata.util.Bytes;
/**
* Some unit tests for the scatter split of an index partition.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class TestScatterSplit extends AbstractEmbeddedFederationTestCase {
public TestScatterSplit() {
super();
}
public TestScatterSplit(String name) {
super(name);
}
/**
* Overridden to specify the {@link BufferMode#Disk} mode, to enable
* scatter splits, and to lower the thresholds at which overflow and split
* operations will be selected.
*/
public Properties getProperties() {
final Properties properties = new Properties(super.getProperties());
// overrides Transient in the base class.
properties.setProperty(Options.BUFFER_MODE, BufferMode.Disk
.toString());
// this test relies on 2 or more data services.
properties.setProperty(EmbeddedClient.Options.NDATA_SERVICES, "2");
// Note: disable copy of small index segments to the new journal during overflow.
properties.setProperty(Options.COPY_INDEX_THRESHOLD,"0");
// // set low minimum #of active partitions per data service.
// properties.setProperty(Options.MINIMUM_ACTIVE_INDEX_PARTITIONS,"1");
// // enable moves (one per target).
// properties.setProperty(ResourceManager.Options.MAXIMUM_MOVES_PER_TARGET,"1");
//
// // allow move of shards which would otherwise be split.
// properties.setProperty(ResourceManager.Options.MAXIMUM_MOVE_PERCENT_OF_SPLIT,"2.0");
//
// // disable the CPU threshold for moves.
// properties.setProperty(ResourceManager.Options.MOVE_PERCENT_CPU_TIME_THRESHOLD,".0");
// enable scatter splits
properties.setProperty(ResourceManager.Options.SCATTER_SPLIT_ENABLED,"true");
// /*
// * Note: Disables the initial round robin policy for the load balancer
// * service so that it will use our fake scores.
// */
// properties.setProperty(LoadBalancerService.Options.INITIAL_ROUND_ROBIN_UPDATE_COUNT, "0");
// turn off acceleration features.
properties.setProperty(Options.ACCELERATE_OVERFLOW_THRESHOLD, "0");
properties.setProperty(Options.ACCELERATE_SPLIT_THRESHOLD, "0");
// Note: Set a small nominal shard size so that splits are triggered quickly.
properties.setProperty(Options.NOMINAL_SHARD_SIZE, ""+Bytes.megabyte);
// properties.setProperty(Options.INITIAL_EXTENT, ""+1*Bytes.megabyte);
// properties.setProperty(Options.MAXIMUM_EXTENT, ""+1*Bytes.megabyte);
return properties;
}
/**
* Writes on a scale-out index until there is enough data to undergo a
* scatter split, verifies that the index was distributed into the expected
* #of shards per data service, and then validates the scale-out index
* against ground truth.
*
* @throws IOException
* @throws ExecutionException
* @throws InterruptedException
*/
public void test_scatterSplit() throws IOException, InterruptedException,
ExecutionException {
final int dataServiceCount = 2;
final int expectedIndexPartitionCount = 4;
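// Note: 4 index partitions scattered across 2 data services should leave
// 2 shards on each data service (this is verified at the end of the test).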
/*
* Register the index.
*/
final String name = "testIndex";
final UUID indexUUID = UUID.randomUUID();
{
final IndexMetadata indexMetadata = new IndexMetadata(name,indexUUID);
// scale-out indices must support delete markers (required for overflow processing).
indexMetadata.setDeleteMarkers(true);
/*
* Explicitly set up the scatter split operation to distribute 4
* index partitions across 2 data services (this is done explicitly
* in case the configuration defaults are changed).
*/
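// Note: percentOfSplitThreshold=.25 means that the scatter split may be
// triggered once the initial shard reaches ~25% of its nominal split size
// rather than waiting for a normal split.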
indexMetadata
.setScatterSplitConfiguration(new ScatterSplitConfiguration(
true,// enabled
.25,// percentOfSplitThreshold
dataServiceCount, //
expectedIndexPartitionCount//
));
// register the scale-out index, creating a single index partition.
fed.registerIndex(indexMetadata, dataService0.getServiceUUID());
}
/*
* Verify the initial index partition.
*/
final PartitionLocator pmd0;
{
final ClientIndexView ndx = (ClientIndexView) fed.getIndex(name,
ITx.UNISOLATED);
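// The metadata index maps separator keys onto partition locators (one
// entry per index partition of the scale-out index).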
final IMetadataIndex mdi = ndx.getMetadataIndex();
assertEquals("#index partitions", 1, mdi.rangeCount());
// This is the initial partition locator metadata record.
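// An empty byte[] addresses the partition spanning the start of the key
// range, which at this point is the only index partition.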
pmd0 = mdi.get(new byte[]{});
assertEquals("partitionId", 0L, pmd0.getPartitionId());
assertEquals("dataServiceUUID", dataService0
.getServiceUUID(), pmd0.getDataServiceUUID());
}
assertEquals("partitionCount", 1, getPartitionCount(name));
/*
* Set up the ground truth B+Tree: a local, unpartitioned copy of the
* same data used to validate the scale-out index.
*/
final BTree groundTruth;
{
final IndexMetadata indexMetadata = new IndexMetadata(indexUUID);
groundTruth = BTree.create(new TemporaryRawStore(), indexMetadata);
}
/*
* Populate the index with data until the journal for the data service
* on which the initial partition resides overflows.
*
* Note: The index split will occur asynchronously once (a) the index
* partition has a sufficient #of entries; and (b) a group commit
* occurs. However, this loop will continue to run, so writes will
* continue to accumulate on the index partition on the live journal.
* Once the overflow process completes, the client will be notified that
* the index partition which it has been addressing no longer exists on
* the data service. At that point the client SHOULD re-try the
* operation. Once the client returns from the retry we will notice that
* the partition count has increased and exit this loop.
*/
final int batchSize = 5000;
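// Snapshot the asynchronous overflow counter so that we can detect when
// each overflow cycle has completed.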
long overflowCounter = dataService0.getAsynchronousOverflowCounter();
int npartitions = -1;
{
if(log.isInfoEnabled())
log.info("Writing on indices to provoke overflow");
int nrounds = 0;
long nwritten = 0L;
while (npartitions <= 1) {
final byte[][] keys = new byte[batchSize][];
final byte[][] vals = new byte[batchSize][];
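// Generate monotonically increasing unsigned byte[] sort keys paired
// with the serialized long values.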
for (int i = 0; i < batchSize; i++) {
keys[i] = TestKeyBuilder.asSortKey(nwritten + i);
vals[i] = SerializerUtil.serialize(nwritten + i);
}
// insert the data into the ground truth index.
groundTruth
.submit(0/* fromIndex */, batchSize/* toIndex */, keys,
vals, BatchInsertConstructor.RETURN_NO_VALUES,
null/* handler */);
// Set flag to force overflow on group commit.
dataService0
.forceOverflow(false/* immediate */, false/* compactingMerge */);
// insert the data into the scale-out index.
fed.getIndex(name, ITx.UNISOLATED)
.submit(0/* fromIndex */, batchSize/* toIndex */, keys,
vals, BatchInsertConstructor.RETURN_NO_VALUES,
null/* handler */);
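// Wait until the forced overflow has been processed by the data service
// (the counter is incremented when asynchronous overflow completes).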
overflowCounter = awaitAsynchronousOverflow(dataService0,
overflowCounter);
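// The scale-out index must agree with the ground truth entry count
// after each round of writes.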
assertEquals("rangeCount", groundTruth.getEntryCount(), fed
.getIndex(name, ITx.UNISOLATED).rangeCount());
nrounds++;
nwritten += batchSize;
npartitions = getPartitionCount(name);
// if (log.isInfoEnabled())
// log.info
System.err.println
("Populating the index: overflowCounter="
+ overflowCounter + ", nrounds=" + nrounds
+ ", nwritten=" + nwritten + ", nentries="
+ groundTruth.getEntryCount() + " ("
+ fed.getIndex(name, ITx.UNISOLATED).rangeCount()
+ "), npartitions=" + npartitions);
/*
* Compare the index against ground truth after overflow.
*/
if(log.isInfoEnabled())
log.info("Verifying scale-out index against ground truth");
assertSameEntryIterator(groundTruth, fed.getIndex(name,
ITx.UNISOLATED));
}
}
/*
* Figure out how the scatter split distributed the index partitions and
* verify that the expected #of index partitions now exists on each data
* service.
*/
{
int ndataService0 = 0;// #of index partitions on data service 0.
int ndataService1 = 0;// #of index partitions on data service 1.
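// Scan all partition locators by reading directly against the metadata
// service for the scale-out index.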
final ITupleIterator itr = new RawDataServiceTupleIterator(
fed.getMetadataService(),//
MetadataService.getMetadataIndexName(name), //
ITx.READ_COMMITTED,//
true, // readConsistent
null, // fromKey
null, // toKey
0, // capacity,
IRangeQuery.DEFAULT,// flags
null // filter
);
int n = 0;
while (itr.hasNext()) {
final PartitionLocator locator = (PartitionLocator) SerializerUtil
.deserialize(itr.next().getValue());
System.err.println("locators["+n+"]="+locator);
if (locator.getDataServiceUUID().equals(dataService0
.getServiceUUID())) {
ndataService0++;
} else if (locator.getDataServiceUUID().equals(
dataService1.getServiceUUID())) {
ndataService1++;
} else {
fail("Not expecting partition move to this service: "
+ locator);
}
n++;
}
npartitions = getPartitionCount(name);
System.err.println("npartitions=" + npartitions);
System.err.println("npartitions(ds0)=" + ndataService0);
System.err.println("npartitions(ds1)=" + ndataService1);
// Verify expected #of partitions.
assertEquals("partitionCount=" + npartitions,
expectedIndexPartitionCount, npartitions);
assertEquals("#dataService0=" + ndataService0,
expectedIndexPartitionCount / 2, ndataService0);
assertEquals("#dataService1=" + ndataService0,
expectedIndexPartitionCount / 2, ndataService1);
}
}
}