/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on May 6, 2009
*/
package com.bigdata.service.ndx.pipeline;
import java.math.BigInteger;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import com.bigdata.btree.keys.KVO;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.keys.TestKeyBuilder;
import com.bigdata.mdi.IMetadataIndex;
import com.bigdata.relation.accesspath.BlockingBuffer;
import com.bigdata.util.Bytes;
/**
* Stress test using key-range partitioned index ({@link IMetadataIndex}), which
* allows us to test the {@link AbstractMasterTask} under split, move, join and
* other kinds of index partition operations.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*
* FIXME Finish this stress test and enable in {@link TestAll}.
* <p>
* Note: {@link KeyBuilder#decodeBigInteger(int, byte[])} has since
* been written. However, the encoding of a {@link BigInteger} as an
* unsigned byte[] requires a 2 byte signum/runLength prefix. Thus,
* while {@link KeyBuilder#append(BigInteger)} and the decode method
* might be used to complete this test case, the separator keys will
* need to be properly formed initially in order to have the leading
* signum/runlength. The <code>null</code> for the last rightSeparator
* will also need to be handled specially.
*/
public class TestMasterTaskWithSplits extends AbstractKeyRangeMasterTestCase {
/**
*
*/
public TestMasterTaskWithSplits() {
}
/**
* @param arg0
*/
public TestMasterTaskWithSplits(String arg0) {
super(arg0);
}
/**
* Method returns a separator key which lies 1/2 between the given separator
* keys. This test suite uses long (64 bit) keys. An empty byte[]
* corresponds to ZERO (0L). A <code>null</code>, which may only appear as
* the right separator, corresponds to <code>2^64</code>. The math is
* performed using {@link BigInteger}.
*
* @param leftSeparator
* The left separator key. The left-most separator key is always
* an empty byte[] (<code>byte[0]</code>).
* @param rightSeparator
* The right separator key. The right-most separator key is
* always <code>null</code>.
*
* @return A separator key which lies 1/2-way between the given keys.
*/
protected byte[] getSeparatorKey(final byte[] leftSeparator,
final byte[] rightSeparator) {
final BigInteger v1 = decodeKey(leftSeparator);
final BigInteger v2 = decodeKey(rightSeparator);
final BigInteger vm = v1.add(v2).divide(BigInteger.valueOf(2));
return vm.toByteArray();
// final long leftKey = KeyBuilder.decodeLong(oldLocator
// .getLeftSeparatorKey(), 0);
//final long leftKey = new BigInteger(oldLocator.getLeftSeparatorKey()).longValue();
//
//final long rightKey = Long.MAX_VALUE;
//
//// divide the range in 1/2.
//final byte[] separatorKey = KeyBuilder
// .asSortKey((long)(rightKey - leftKey / 2));
}
/**
* <code>2^64</code>
*/
private final static BigInteger MAX_KEY = BigInteger
.valueOf(Long.MAX_VALUE).multiply(BigInteger.valueOf(2));
/**
* Convert an unsigned byte[] into a {@link BigInteger}. A <code>null</code>
* is understood as <code>2^64</code>. An empty byte[] is understood as a
* ZERO (0).
*
* @param key
* The bytes.
*
* @return The big integer value.
*
* @todo [See notes at the top of this file.]
*/
private BigInteger decodeKey(final byte[] key) {
if (key == null) {
return MAX_KEY;
}
if (key.length == 0)
return BigInteger.ZERO;
return new BigInteger(key);
}
/**
* Unit tests to verify the math used to compute the separator keys.
*
* FIXME This test passes because the assertions are correct, but it only
* lays out some known points and does not go further to verify that a
* desired translation between signed longs, unsigned byte[] keys, and
* {@link BigInteger} values is being carried out. [See notes at the top
* of this file.]
*/
public void test_decodeKey() {
// zero
assertEquals(BigInteger.valueOf(0), decodeKey(new byte[0]));
assertEquals(BigInteger.valueOf(0), decodeKey(new byte[] { 0 }));
assertEquals(BigInteger.valueOf(1), decodeKey(new byte[] { 1 }));
assertEquals(BigInteger.valueOf(-1), decodeKey(new byte[] { -1 }));
assertEquals(BigInteger.valueOf(Long.MIN_VALUE + 1),
decodeKey(TestKeyBuilder.asSortKey(1L)));
assertEquals(Long.MAX_VALUE, decodeKey(TestKeyBuilder.asSortKey(-1L))
.longValue());
assertEquals(MAX_KEY, decodeKey(null));
}
/**
* FIXME Verify that the separator keys are properly ordered. [See notes at
* the top of this file, but also note that we need to handle the
* <code>null</code> rightSeparator specially.]
*/
public void test_getSeparatorKey() {
assertEquals(
Long.MAX_VALUE,
decodeKey(
getSeparatorKey(new byte[0]/* leftSeparator */, null/* rightSeparator */))
.longValue());
}
/**
* Type-safe enumeration of index partition operations for this test.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
* @version $Id$
*/
private static enum OpCode {
/**
* Scatter-split an index partition (takes one index partition and
* produces N index partitions, where N is on the order of 1x to 4x the
* #of data services in a cluster.
*/
ScatterSplit,
/**
* Split an index partition into two index partitions.
*/
Split,
/**
* Join two index partitions.
*/
Join,
/**
* Move an index partition (changes its locator but does not change its
* key range).
*/
Move,
/**
* This is not an index partition operation but rather is used to
* signal the end of the test.
*/
Done;
};
/**
* Class models an operation and the delay until it occurs. A sequence of
* such operations forms a schedule for the test.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
* @version $Id$
*/
private static class Op {
/**
* The operation.
*/
public final OpCode code;
/**
* The delay until that operation.
*/
public final long delay;
/**
* The unit for that delay.
*/
public final TimeUnit unit;
public Op(OpCode code, long delay, TimeUnit unit) {
this.code = code;
this.delay = delay;
this.unit = unit;
}
}
/**
* Stress test for redirects.
* <p>
* Redirects are stored in an {@link IMetadataIndex} so we may test the
* behavior under SPLITs, MOVEs, or JOINs. The test writes {@link KVO}
* tuples on a {@link M master}. The master allocates the tuples to output
* buffers based on the {@link IMetadataIndex} mapping. A single thread
* executes an {@link Op}[] schedule while N concurrent producer threads
* write on the master. The test ends when the schedule is done.
*
* @throws InterruptedException
* @throws ExecutionException
*/
public void test_redirectStressTestWithSplits() throws InterruptedException,
ExecutionException {
/*
* Configuration for the stress test.
*/
/*
* #of concurrent producers.
*
* Note: Testing with GTE 150 threads is necessary to model a realistic
* use cases.
*/
final int nproducers = 200;
/*
* The minimum and maximum delay for writing a chunk. The actual write
* delays will have a uniform distribution within this range.
*
* Note: These values are based on observed delays for an RDF bulk data
* load operation on a 16 node cluster.
*/
final long minWriteDelay = 1000;
final long maxWriteDelay = 3000;
/*
* The minimum and maximum delay for producing a new chunk. The actual
* delays will have a uniform distribution within this range.
*
* Note: These values are SWAGs.
*/
final long minProducerDelay = 150;
final long maxProducerDelay = 750;
/*
* The size of a chunk generated by a producer (10k is typical of a
* deployed system).
*/
final int producerChunkSize = 1000; // vs 10000
/*
* Note: We should be able to use smaller chunks on the master and
* larger chunks on the client. The semantics of the chunk size are such
* that it controls the size of the chunks READ from the buffer, not the
* size of the chunks on the buffer. Therefore a 10k chunk producer
* feeding a master will cause 10k chunks to appear on the master. If
* the master has a 10k chunk size, then it will never need to combine
* chunks for its consumer (the sink). So the sink gets 10k chunks in
* its buffer. However, the chunkSize of the sink controls how large the
* writes will be on the index partition. So a 20k sink chunk size will
* cause 2 x 10k chunks to be combined and merge sorted before it writes
* on the index partition.
*
* @todo update the bigdataCluster.config appropriately.
*
* @todo when I made the subtaskChunkSize large enough that the code was
* actually combining chunks it uncovered a problem with the
* asynchronous writes which do not inherently protect against the
* presence of duplicate keys in the KVO[] stream. This was frowned upon
* for synchronous RPC, but for asynchronous writes it makes more sense
* to permit duplicates while still restricting the producers to
* generate ordered data. Therefore I am writing a series of unit tests
* for the ISplitter and then I will allow this case to be valid.
*/
final int masterQueueCapacity = 10;// vs 1000 (cluster config value).
final int masterChunkSize = 10000;
final long masterChunkTimeoutNanos = TimeUnit.MILLISECONDS.toNanos(50);
final int subtaskQueueCapacity = 50; // vs 500 (cluster config value).
final int subtaskChunkSize = 20000;
final long subtaskChunkTimeoutNanos = TimeUnit.MILLISECONDS
.toNanos(Long.MAX_VALUE);
/*
* The idle timeout for the sink (generally infinite unless using a
* KVOLatch to coordinate notification of results such as for the
* TERM2ID index).
*/
final long sinkIdleTimeout = Long.MAX_VALUE;
final long sinkPollTimeout = TimeUnit.MILLISECONDS.toNanos(50);
// The #of data services.
final int ndataServices = 40;
{
// Setup the mock data services.
for (int i = 0; i < ndataServices; i++) {
final UUID uuid = UUID.randomUUID();
dataServices.put(uuid, new DS(uuid) {
private final Random r = new Random();
/**
* Overridden to simulate the latency of the write operation.
*/
@Override
protected void acceptWrite(final L locator, final KVO<O>[] chunk) {
final long delayMillis = (long) (r.nextDouble() * (maxWriteDelay - minWriteDelay))
+ minWriteDelay;
if(log.isInfoEnabled())
log.info("Writing on " + chunk.length
+ " elements on " + locator + " (delay="
+ delayMillis + ") ...");
try {
Thread.sleep(delayMillis/* ms */);
} catch (InterruptedException ex) {
throw new RuntimeException(ex);
}
if(log.isInfoEnabled())
log.info("Wrote on " + this + ".");
}
});
}
}
final TimeUnit scheduleUnit = TimeUnit.SECONDS;// seconds or minutes.
final Op[] schedule = new Op[] {
// @todo include move & join as well.
// new Op(OpCode.ScatterSplit, 5, scheduleUnit), // FIXME test scatter split.
// new Op(OpCode.Split, 1, scheduleUnit),
// new Op(OpCode.Split, 1, scheduleUnit),
// new Op(OpCode.Split, 1, scheduleUnit),
/*
* Note: Always include this as the last operation or the test
* WILL NOT terminate!
*/
new Op(OpCode.Done, 10, scheduleUnit)
};
// duration of the stress test.
// final long timeoutMillis;
{
assert schedule[schedule.length - 1].code == OpCode.Done;
long t = 0L;
for (Op op : schedule) {
t += op.unit.toMillis(op.delay);
}
// timeoutMillis = t;
if (log.isInfoEnabled())
log.info("Test will run for " + t + "ms");
}
/*
* Stress test impl.
*/
// used to halt the producer(s) when the test is done.
final AtomicBoolean halt = new AtomicBoolean(false);
// the #of producers that are currently running.
final AtomicInteger producerCount = new AtomicInteger(0);
/**
* Writes on a master.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
* @version $Id$
*/
class ProducerTask implements Callable<Void> {
private final BlockingBuffer<KVO<O>[]> buffer;
public ProducerTask(final BlockingBuffer<KVO<O>[]> buffer) {
this.buffer = buffer;
}
public Void call() throws Exception {
producerCount.incrementAndGet();
try {
final KeyBuilder keyBuilder = new KeyBuilder(
Bytes.SIZEOF_LONG);
final Random r = new Random();
while (true) {
// Sleep to simulate latency in the production of new
// chunks.
Thread
.sleep(r
.nextInt((int) (maxProducerDelay - minProducerDelay))
+ minProducerDelay);
if (halt.get()
|| Thread.interrupted()) {
if (log.isInfoEnabled())
log.info("Producer halting.");
return null;
}
/*
* Note: keys have uniform distribution.
*/
final KVO<O>[] a = new KVO[producerChunkSize];
for (int i = 0; i < a.length; i++) {
final byte[] key = keyBuilder.reset().append(
r.nextLong()).getKey();
final byte[] val = new byte[2];
r.nextBytes(val);
a[i] = new KVO(key, val);
}
// ensure sorted order for the chunk.
Arrays.sort(a);
buffer.add(a);
}
} finally {
producerCount.decrementAndGet();
}
}
}
/**
* Issues redirects.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
* Thompson</a>
* @version $Id$
*/
class RedirectTask implements Callable<Void> {
private final M master;
private final Op[] schedule;
final Random r = new Random();
/**
*
* @param master
* @param times
* The delay times between redirects. The delay until the
* next redirect is chosen randomly from among the
* specified times.
*/
public RedirectTask(final M master, final Op[] schedule) {
this.master = master;
this.schedule = schedule;
}
public Void call() throws Exception {
for (Op op : schedule) {
if (halt.get() || Thread.interrupted()) {
if (log.isInfoEnabled())
log.info("Redirecter halting.");
// Done.
return null;
}
final long delayMillis = op.unit.toMillis(op.delay);
if (log.isInfoEnabled())
log.info("Will wait " + delayMillis
+ "ms before executing: " + op.code);
Thread.sleep(delayMillis);
if (halt.get() || Thread.interrupted()) {
if (log.isInfoEnabled())
log.info("Redirecter halting.");
// Done.
return null;
}
switch (op.code) {
case ScatterSplit:
scatterSplit(op);
break;
case Split:
split(op);
break;
case Join:
join(op);
break;
case Move:
move(op);
break;
case Done:
done(op);
break;
default:
throw new UnsupportedOperationException(op.code
.toString());
}
}
if(log.isInfoEnabled())
log.info("Redirecter halting.");
return null;
}
/**
* Handles scatterSplit. One index partition is selected. It is then
* scattered by dividing its key range into 2N equal parts, where N
* is the #of of data services. The locators for the index partition
* in the metadata index are updated to reflect the scatter split.
*
* @param op
*/
protected void scatterSplit(Op op) {
master.mdiLock.lock();
try {
// the #of existing partitions.
final long npartitions2 = master.mdi.getEntryCount();
assertTrue(npartitions2 != 0);
assertTrue(npartitions2 <= Integer.MAX_VALUE);
final int npartitions = (int) npartitions2;
// choose which one to split.
final int index = r.nextInt(npartitions);
// lookup that locator.
final L locator = (L) master.mdi.valueAt(index,
master.mdi.getLookupTuple()).getObject();
/*
* Evenly divide the key range of the locator into N key
* ranges. If the rightSeparator is null, then we divide the
* keys based on the a-priori knowledge that the keys are
* 8-bytes long so the maximum key is formed by encoding
* Long.MAX_VALUE using a KeyBuilder. However, when the
* rightSeparator is null on input, the rightSeparator of
* the last output index partition will always be null as
* well.
*/
// FIXME Finish scatter-split
if(true)
throw new UnsupportedOperationException();
// Notify DS so it will issue stale locator response.
final DS oldDS = dataServices.get(locator.getDataServiceUUID());
oldDS.notifyGone(locator);
} finally {
master.mdiLock.unlock();
}
}
/**
* Handle split of a randomly chosen index partition into two new
* index partitions.
*
* @param op
*/
protected void split(final Op op) {
master.mdiLock.lock();
try {
// the #of existing partitions.
final long npartitions2 = master.mdi.getEntryCount();
assertTrue(npartitions2 != 0);
assertTrue(npartitions2 <= Integer.MAX_VALUE);
final int npartitions = (int) npartitions2;
// choose which one to split.
final int index = r.nextInt(npartitions);
// lookup that locator.
final L oldLocator = (L) master.mdi.valueAt(index,
master.mdi.getLookupTuple()).getObject();
/*
* Divide the key range of the locator into 2 key ranges. If
* the rightSeparator is null, then we divide the keys based
* on the a-priori knowledge that the keys are 8-bytes long
* so the maximum key is formed by encoding Long.MAX_VALUE
* using a KeyBuilder. However, when the rightSeparator is
* null on input, the rightSeparator of the last output
* index partition will always be null as well.
*/
final byte[] separatorKey = getSeparatorKey(oldLocator
.getLeftSeparatorKey(), oldLocator
.getRightSeparatorKey());
final L newLeftSibling = new L(master.mdi
.incrementAndGetNextPartitionId(),
getRandomDataService().uuid, oldLocator
.getLeftSeparatorKey(), separatorKey);
final L newRightSibling = new L(master.mdi
.incrementAndGetNextPartitionId(),
getRandomDataService().uuid, separatorKey,
oldLocator.getRightSeparatorKey());
// remove old locator.
assertNotNull(master.mdi.remove(oldLocator
.getLeftSeparatorKey()));
// add new locators covering the same key-range.
master.mdi.insert(newLeftSibling.getLeftSeparatorKey(),
newLeftSibling);
master.mdi.insert(newRightSibling.getLeftSeparatorKey(),
newRightSibling);
// Notify DS so it will issue stale locator response.
final DS oldDS = dataServices.get(oldLocator
.getDataServiceUUID());
oldDS.notifyGone(oldLocator);
} finally {
master.mdiLock.unlock();
}
}
// FIXME handle join
protected void join(Op op) {
throw new UnsupportedOperationException();
}
// FIXME handle move
protected void move(Op op) {
throw new UnsupportedOperationException();
}
/**
* Cause the test to halt.
* @param op
*/
protected void done(Op op) {
// set flag - will cause producers and redirector to halt.
halt.set(true);
}
}
/*
* The master under test.
*/
final BlockingBuffer<KVO<O>[]> masterBuffer = new BlockingBuffer<KVO<O>[]>(
masterQueueCapacity, masterChunkSize, masterChunkTimeoutNanos,
TimeUnit.NANOSECONDS);
final M master = new M(masterStats, masterBuffer, executorService,
sinkIdleTimeout, sinkPollTimeout) {
protected BlockingBuffer<KVO<O>[]> newSubtaskBuffer() {
return new BlockingBuffer<KVO<O>[]>(
new ArrayBlockingQueue<KVO<O>[]>(subtaskQueueCapacity), //
subtaskChunkSize,//
subtaskChunkTimeoutNanos,//
TimeUnit.NANOSECONDS,//
true // ordered
);
}
};
/*
* Setup the initial index partition.
*
* Note: The mdiLock is not required here since no other threads are
* accessing the MDI until we start them below.
*/
master.mdiLock.lock();
try {
// choose initial data service randomly.
final UUID dataServiceUUID = dataServices.keySet().iterator().next();
final DS dataService = dataServices.get(dataServiceUUID);
final L locator = new L(//
// the initial partitionId
master.mdi.incrementAndGetNextPartitionId(),
// the initial data service.
dataServiceUUID,
// leftSeparator is initially an empty byte[].
new byte[0],
// rightSeparator is initially null.
null
);
// add to the MDI
master.mdi.insert(locator.getLeftSeparatorKey(), locator);
// and inform the DS.
dataService.notifyLocator(locator);
} finally {
master.mdiLock.unlock();
}
/*
* Setup redirector with its schedule of operations.
*/
final RedirectTask redirecter = new RedirectTask(master, schedule);
// Start the master.
{
// Wrap computation as FutureTask.
final FutureTask<H> ft = new FutureTask<H>(master);
// Set Future on BlockingBuffer.
masterBuffer.setFuture(ft);
// Start the consumer.
executorService.submit(ft);
}
// Setup producers.
final List<FutureTask<Void>> producerFutures = new LinkedList<FutureTask<Void>>();
for (int i = 0; i < nproducers; i++) {
// Wrap computation as FutureTask.
producerFutures.add(new FutureTask<Void>(new ProducerTask(
masterBuffer)));
}
// Start writing data.
for (FutureTask<Void> ft : producerFutures) {
executorService.submit(ft);
}
// start redirects.
final Future<Void> redirecterFuture = executorService.submit(redirecter);
try {
// periodically verify no errors in running tasks.
while (!halt.get()) {
/*
* End the test if anything is done.
*/
// check master.
if (masterBuffer.getFuture().isDone()) {
break;
}
// check redirecter
if (redirecterFuture.isDone()) {
break;
}
// check producers.
for (Future<Void> f : producerFutures) {
if (f.isDone()) {
break;
}
}
// sleep in 1/4 second intervals.
Thread.sleep(250/*ms*/);
}
/*
* Set [halt] (it may already be set) so that the redirector and the
* producers will all halt and their check their Futures for errors.
*/
if (log.isInfoEnabled())
log.info("Halting redirector and producers.");
// set flag causing tasks to halt.
halt.set(true);
// await termination and check redirector future for errors.
redirecterFuture.get();
// await termination and check producer futures for errors.
for (Future<Void> f : producerFutures) {
f.get();
}
if (log.isInfoEnabled())
log.info("Closing master buffer: " + masterBuffer);
// close the master : queued data should be drained by sinks.
masterBuffer.close();
// await termination and check future for errors in master.
while (true) {
try {
masterBuffer.getFuture().get(1000, TimeUnit.MILLISECONDS);
break;
} catch (TimeoutException e) {
if (log.isInfoEnabled())
log
.info("Waiting on master: ~subtaskCount="
+ masterStats.getActiveSinkCount()
+ ", ~elementsRemaining="
+ (masterStats.elementsIn.get() - masterStats.elementsOut
.get()));
}
}
} finally {
{
// show the subtask stats using an ordered map.
final Map<L, HS> subStats = new TreeMap<L, HS>(master.stats
.getSubtaskStats());
for (Map.Entry<L, HS> e : subStats.entrySet()) {
if(log.isInfoEnabled())
log.info(e.getKey() + " : " + e.getValue());
}
}
// show the master stats
if(log.isInfoEnabled())
log.info(master.stats.toString());
}
}
}