/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Mar 11, 2008
*/
package com.bigdata.service;
import java.io.IOException;
import java.util.Properties;
import java.util.UUID;
import java.util.concurrent.ExecutionException;
import org.apache.log4j.Level;
import com.bigdata.btree.BTree;
import com.bigdata.btree.IRangeQuery;
import com.bigdata.btree.ITupleIterator;
import com.bigdata.btree.IndexMetadata;
import com.bigdata.btree.keys.KV;
import com.bigdata.btree.keys.TestKeyBuilder;
import com.bigdata.btree.proc.BatchInsert.BatchInsertConstructor;
import com.bigdata.counters.AbstractStatisticsCollector;
import com.bigdata.io.SerializerUtil;
import com.bigdata.journal.BufferMode;
import com.bigdata.journal.ITx;
import com.bigdata.journal.TemporaryRawStore;
import com.bigdata.mdi.IMetadataIndex;
import com.bigdata.mdi.PartitionLocator;
import com.bigdata.resources.ResourceManager;
import com.bigdata.resources.ResourceManager.Options;
import com.bigdata.service.ndx.ClientIndexView;
import com.bigdata.service.ndx.RawDataServiceTupleIterator;
import com.bigdata.util.Bytes;
/**
* Some unit tests for moving an index partition.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
/**
 * Some unit tests for moving an index partition between data services in an
 * embedded federation. The test provokes journal overflow to force an index
 * partition split, fakes the load balancer scores so that one data service
 * appears overloaded, then provokes another overflow so that overflow
 * processing selects a shard to MOVE to the under-utilized data service.
 * After each overflow the scale-out index is validated against a local
 * ground-truth {@link BTree}.
 *
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 */
public class TestMove extends AbstractEmbeddedFederationTestCase {

    public TestMove() {
        super();
    }

    public TestMove(String name) {
        super(name);
    }

    /**
     * Overridden to specify the {@link BufferMode#Disk} mode and to lower the
     * threshold at which an overflow operation will be selected, so the test
     * can drive overflow (and hence split/move decisions) deterministically.
     *
     * @return the configuration properties for the embedded federation.
     */
    public Properties getProperties() {

        final Properties properties = new Properties(super.getProperties());

        // overrides Transient in the base class.
        properties.setProperty(Options.BUFFER_MODE, BufferMode.Disk
                .toString());

        // this test relies on 2 or more data services.
        properties.setProperty(EmbeddedClient.Options.NDATA_SERVICES, "2");

        // Note: disable copy of small index segments to the new journal during overflow.
        properties.setProperty(Options.COPY_INDEX_THRESHOLD, "0");

        // set low minimum #of active partitions per data service.
        properties.setProperty(Options.MINIMUM_ACTIVE_INDEX_PARTITIONS, "1");

        // enable moves (one per target).
        properties.setProperty(ResourceManager.Options.MAXIMUM_MOVES_PER_TARGET, "1");

        // allow move of shards which would otherwise be split.
        properties.setProperty(ResourceManager.Options.MAXIMUM_MOVE_PERCENT_OF_SPLIT, "2.0");

        // disable the CPU threshold for moves.
        properties.setProperty(ResourceManager.Options.MOVE_PERCENT_CPU_TIME_THRESHOLD, ".0");

        // disable scatter split
        properties.setProperty(ResourceManager.Options.SCATTER_SPLIT_ENABLED, "false");

        /*
         * Note: Disables the initial round robin policy for the load balancer
         * service so that it will use our faked scores.
         */
        properties.setProperty(LoadBalancerService.Options.INITIAL_ROUND_ROBIN_UPDATE_COUNT, "0");

        // turn off acceleration features.
        properties.setProperty(Options.ACCELERATE_OVERFLOW_THRESHOLD, "0");
        properties.setProperty(Options.ACCELERATE_SPLIT_THRESHOLD, "0");

        // Note: Set a low maximum shard size.
        properties.setProperty(Options.NOMINAL_SHARD_SIZE, "" + Bytes.megabyte);

        // properties.setProperty(Options.INITIAL_EXTENT, ""+1*Bytes.megabyte);
        // properties.setProperty(Options.MAXIMUM_EXTENT, ""+1*Bytes.megabyte);

        return properties;

    }

    /**
     * Test forces a move of an index partition and validates the scale-out
     * index after the move against ground truth.
     *
     * @throws IOException
     * @throws ExecutionException
     * @throws InterruptedException
     */
    public void test_move() throws IOException, InterruptedException, ExecutionException {

        /*
         * Register the index.
         */
        final String name = "testIndex";
        final UUID indexUUID = UUID.randomUUID();
        {

            final IndexMetadata indexMetadata = new IndexMetadata(name, indexUUID);

            // must support delete markers
            indexMetadata.setDeleteMarkers(true);

            // register the scale-out index, creating a single index partition.
            fed.registerIndex(indexMetadata, dataService0.getServiceUUID());

        }

        /*
         * Verify the initial index partition: exactly one partition, with
         * partitionId 0, living on dataService0.
         */
        final PartitionLocator pmd0;
        {

            final ClientIndexView ndx = (ClientIndexView) fed.getIndex(name,
                    ITx.UNISOLATED);

            final IMetadataIndex mdi = ndx.getMetadataIndex();

            assertEquals("#index partitions", 1, mdi.rangeCount());

            // This is the initial partition locator metadata record.
            pmd0 = mdi.get(new byte[]{});

            assertEquals("partitionId", 0L, pmd0.getPartitionId());

            assertEquals("dataServiceUUID", dataService0
                    .getServiceUUID(), pmd0.getDataServiceUUID());

        }

        assertEquals("partitionCount", 1, getPartitionCount(name));

        /*
         * Setup the ground truth B+Tree. Every write applied to the scale-out
         * index below is mirrored here so the scale-out index can be compared
         * against a known-good local copy.
         */
        final BTree groundTruth;
        {

            final IndexMetadata indexMetadata = new IndexMetadata(indexUUID);

            groundTruth = BTree.create(new TemporaryRawStore(), indexMetadata);

        }

        /*
         * Populate the index with data until the initial journal for the
         * data service on which the initial partition resides overflows.
         *
         * Note: The index split will occur asynchronously once (a) the index
         * partition has a sufficient #of entries; and (b) a group commit
         * occurs. However, this loop will continue to run, so writes will
         * continue to accumulate on the index partition on the live journal.
         * Once the overflow process completes the client will be notified that
         * the index partition which it has been addressing no longer exists on
         * the data service. At that point the client SHOULD re-try the
         * operation. Once the client returns from the retry we will notice
         * that the partition count has increased and exit this loop.
         */
        final int batchSize = 5000;
        long overflowCounter = dataService0.getAsynchronousOverflowCounter();
        int npartitions = -1;
        {

            if (log.isInfoEnabled())
                log.info("Writing on indices to provoke overflow");

            int nrounds = 0;
            long nwritten = 0L;
            while (npartitions < 2) {

                // generate a batch of monotonically increasing keys/values.
                final byte[][] keys = new byte[batchSize][];
                final byte[][] vals = new byte[batchSize][];
                for (int i = 0; i < batchSize; i++) {
                    keys[i] = TestKeyBuilder.asSortKey(nwritten + i);
                    vals[i] = SerializerUtil.serialize(nwritten + i);
                }

                // insert the data into the ground truth index.
                groundTruth
                        .submit(0/* fromIndex */, batchSize/* toIndex */, keys,
                                vals, BatchInsertConstructor.RETURN_NO_VALUES,
                                null/* handler */);

                // Set flag to force overflow on group commit.
                dataService0
                        .forceOverflow(false/* immediate */, false/* compactingMerge */);

                // insert the data into the scale-out index.
                fed.getIndex(name, ITx.UNISOLATED)
                        .submit(0/* fromIndex */, batchSize/* toIndex */, keys,
                                vals, BatchInsertConstructor.RETURN_NO_VALUES,
                                null/* handler */);

                // wait until asynchronous overflow processing is done.
                overflowCounter = awaitAsynchronousOverflow(dataService0,
                        overflowCounter);

                assertEquals("rangeCount", groundTruth.getEntryCount(), fed
                        .getIndex(name, ITx.UNISOLATED).rangeCount());

                nrounds++;

                nwritten += batchSize;

                npartitions = getPartitionCount(name);

                if (log.isInfoEnabled())
                    log.info
                            ("Populating the index: overflowCounter="
                                    + overflowCounter + ", nrounds=" + nrounds
                                    + ", nwritten=" + nwritten + ", nentries="
                                    + groundTruth.getEntryCount() + " ("
                                    + fed.getIndex(name, ITx.UNISOLATED).rangeCount()
                                    + "), npartitions=" + npartitions);

                /*
                 * Compare the index against ground truth after overflow.
                 */
                if (log.isInfoEnabled())
                    log.info("Verifying scale-out index against ground truth");

                assertSameEntryIterator(groundTruth, fed.getIndex(name,
                        ITx.UNISOLATED));

            }

        }

        npartitions = getPartitionCount(name);

        // Verify at least 2 partitions.
        assertTrue("partitionCount=" + npartitions, npartitions >= 2);

        /*
         * Fake out the load balancer so that it will report the source data
         * service (dataService0) is "highly utilized" and the target data
         * service (dataService1) is "under utilized".
         */
        {

            if (log.isInfoEnabled())
                log.info("Setting up LBS for move.");

            // explicitly set the log level for the load balancer.
            LoadBalancerService.log.setLevel(Level.INFO);

            final AbstractEmbeddedLoadBalancerService lbs = ((AbstractEmbeddedLoadBalancerService) ((EmbeddedFederation) fed)
                    .getLoadBalancerService());

            final ServiceScore[] fakeServiceScores = new ServiceScore[2];

            fakeServiceScores[0] = new ServiceScore(
                    AbstractStatisticsCollector.fullyQualifiedHostName,
                    dataService0.getServiceUUID(), "dataService0", 1.0/* rawScore */);

            fakeServiceScores[1] = new ServiceScore(
                    AbstractStatisticsCollector.fullyQualifiedHostName,
                    dataService1.getServiceUUID(), "dataService1", 0.0/* rawScore */);

            // set the fake scores on the load balancer.
            lbs.setServiceScores(fakeServiceScores);

        }

        /*
         * Continue to populate the index until we can provoke another
         * overflow.
         *
         * Since we have configured the various thresholds appropriately this
         * overflow should select one of the index partitions to move over to
         * the other data service.
         */
        {

            if (log.isInfoEnabled())
                log.info("Writing on indices to provoke overflow");

            // int nrounds = 0;
            // long nwritten = 0L;
            // boolean done = false;
            // while (!done)
            {

                /*
                 * Just a little random data.
                 *
                 * Note: We have to write enough data so that the new updates
                 * are not just copied onto the new journal in order for the
                 * index partition(s) on which we write to be eligible for a
                 * move.
                 */
                final int nentries = 5000;
                final KV[] data = getRandomKeyValues(nentries);
                final byte[][] keys = new byte[nentries][];
                final byte[][] vals = new byte[nentries][];

                for (int i = 0; i < nentries; i++) {

                    keys[i] = data[i].key;

                    vals[i] = data[i].val;

                }

                // insert the data into the ground truth index.
                groundTruth
                        .submit(0/* fromIndex */, nentries/* toIndex */, keys,
                                vals, BatchInsertConstructor.RETURN_NO_VALUES,
                                null/* handler */);

                /*
                 * Set flag to force overflow on group commit.
                 */
                dataService0
                        .forceOverflow(false/* immediate */, true/* compactingMerge */);

                // insert the data into the scale-out index.
                fed.getIndex(name, ITx.UNISOLATED)
                        .submit(0/* fromIndex */, nentries/* toIndex */, keys,
                                vals, BatchInsertConstructor.RETURN_NO_VALUES,
                                null/* handler */);

                // wait until overflow processing is done.
                overflowCounter = awaitAsynchronousOverflow(dataService0,
                        overflowCounter);

                /*
                 * Compare the index against ground truth after overflow.
                 */
                if (log.isInfoEnabled())
                    log.info("Verifying scale-out index against ground truth");

                assertSameEntryIterator(groundTruth, fed.getIndex(name,
                        ITx.UNISOLATED));

            }

        }

        /*
         * Figure out which index partition was moved and verify that there is
         * now (at least) one index partition on each data service. The
         * locators are read directly from the metadata index on the metadata
         * service.
         */
        {

            int ndataService0 = 0;// #of index partitions on data service 0.
            int ndataService1 = 0;// #of index partitions on data service 1.

            final ITupleIterator<?> itr = new RawDataServiceTupleIterator(
                    fed.getMetadataService(),//
                    MetadataService.getMetadataIndexName(name), //
                    ITx.READ_COMMITTED,//
                    true, // readConsistent
                    null, // fromKey
                    null, // toKey
                    0, // capacity,
                    IRangeQuery.DEFAULT,// flags
                    null // filter
            );

            int n = 0;

            while (itr.hasNext()) {

                final PartitionLocator locator = (PartitionLocator) SerializerUtil
                        .deserialize(itr.next().getValue());

                if (log.isInfoEnabled())
                    log.info("locators[" + n + "]=" + locator);

                if (locator.getDataServiceUUID().equals(dataService0
                        .getServiceUUID())) {

                    ndataService0++;

                } else if (locator.getDataServiceUUID().equals(
                        dataService1.getServiceUUID())) {

                    ndataService1++;

                } else {

                    fail("Not expecting partition move to this service: "
                            + locator);

                }

                n++;

            }

            if (log.isInfoEnabled()) {
                log.info("npartitions=" + getPartitionCount(name));
                log.info("npartitions(ds0)=" + ndataService0);
                log.info("npartitions(ds1)=" + ndataService1);
            }

            assertEquals("#dataService0=" + ndataService0, 1, ndataService0);

            /*
             * Note: bug fix - the failure message previously reported
             * ndataService0 while asserting on ndataService1.
             */
            assertEquals("#dataService1=" + ndataService1, 1, ndataService1);

        }

    }

}