/*
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016.  All rights reserved.

Contact:
     SYSTAP, LLC DBA Blazegraph
     2501 Calvert ST NW #106
     Washington, DC 20008
     licenses@blazegraph.com

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Created on May 6, 2009
 */

package com.bigdata.service.ndx.pipeline;

import java.math.BigInteger;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.TreeMap;
import java.util.UUID;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;

import com.bigdata.btree.keys.KVO;
import com.bigdata.btree.keys.KeyBuilder;
import com.bigdata.btree.keys.TestKeyBuilder;
import com.bigdata.mdi.IMetadataIndex;
import com.bigdata.relation.accesspath.BlockingBuffer;
import com.bigdata.util.Bytes;

/**
 * Stress test using a key-range partitioned index ({@link IMetadataIndex}),
 * which allows us to test the {@link AbstractMasterTask} under split, move,
 * join and other kinds of index partition operations.
 * 
 * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
 * @version $Id$
 * 
 *          FIXME Finish this stress test and enable in {@link TestAll}.
 *          <p>
 *          Note: {@link KeyBuilder#decodeBigInteger(int, byte[])} has since
 *          been written. However, the encoding of a {@link BigInteger} as an
 *          unsigned byte[] requires a 2 byte signum/runLength prefix. Thus,
 *          while {@link KeyBuilder#append(BigInteger)} and the decode method
 *          might be used to complete this test case, the separator keys will
 *          need to be properly formed initially in order to have the leading
 *          signum/runLength prefix. The <code>null</code> for the last
 *          rightSeparator will also need to be handled specially.
 */
public class TestMasterTaskWithSplits extends AbstractKeyRangeMasterTestCase {

    /**
     * 
     */
    public TestMasterTaskWithSplits() {
    }

    /**
     * @param arg0
     *            The test name.
     */
    public TestMasterTaskWithSplits(String arg0) {
        super(arg0);
    }

    /**
     * Method returns a separator key which lies 1/2 of the way between the
     * given separator keys. This test suite uses long (64 bit) keys. An empty
     * byte[] corresponds to ZERO (0L). A <code>null</code>, which may only
     * appear as the right separator, corresponds to <code>2^64</code>. The
     * math is performed using {@link BigInteger}.
     * 
     * @param leftSeparator
     *            The left separator key. The left-most separator key is always
     *            an empty byte[] (<code>new byte[0]</code>).
     * @param rightSeparator
     *            The right separator key. The right-most separator key is
     *            always <code>null</code>.
     * 
     * @return A separator key which lies 1/2-way between the given keys.
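     *         <p>
     *         For example (see {@link #test_getSeparatorKey()}): given a
     *         <code>leftSeparator</code> of <code>new byte[0]</code> (ZERO)
     *         and a <code>rightSeparator</code> of <code>null</code> (the
     *         maximum key), the returned key decodes to
     *         <code>Long.MAX_VALUE</code>, i.e., the value 1/2-way through
     *         the total key range.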
     */
    protected byte[] getSeparatorKey(final byte[] leftSeparator,
            final byte[] rightSeparator) {

        final BigInteger v1 = decodeKey(leftSeparator);

        final BigInteger v2 = decodeKey(rightSeparator);

        final BigInteger vm = v1.add(v2).divide(BigInteger.valueOf(2));

        return vm.toByteArray();

    }

    /**
     * <code>2 * Long.MAX_VALUE</code>, i.e., <code>2^64 - 2</code>.
     * <p>
     * Note: The javadoc elsewhere in this class describes the value used for a
     * <code>null</code> right separator as <code>2^64</code>. The constant
     * computed here is actually two less than that, and
     * {@link #test_getSeparatorKey()} relies on the exact value (the midpoint
     * of <code>[0, 2^64 - 2]</code> is <code>Long.MAX_VALUE</code>).
     */
    private final static BigInteger MAX_KEY = BigInteger.valueOf(
            Long.MAX_VALUE).multiply(BigInteger.valueOf(2));

    /**
     * Convert an unsigned byte[] into a {@link BigInteger}. A
     * <code>null</code> is understood as the maximum key ({@link #MAX_KEY}).
     * An empty byte[] is understood as ZERO (0).
     * 
     * @param key
     *            The bytes.
     * 
     * @return The big integer value.
     * 
     * @todo [See notes at the top of this file.]
     */
    private BigInteger decodeKey(final byte[] key) {

        if (key == null)
            return MAX_KEY;

        if (key.length == 0)
            return BigInteger.ZERO;

        return new BigInteger(key);

    }

    /**
     * Unit tests to verify the math used to compute the separator keys.
     * 
     * FIXME This test passes because the assertions are correct, but it only
     * lays out some known points and does not go further to verify that a
     * desired translation between signed longs, unsigned byte[] keys, and
     * {@link BigInteger} values is being carried out. [See notes at the top of
     * this file.]
     */
    public void test_decodeKey() {

        // zero
        assertEquals(BigInteger.valueOf(0), decodeKey(new byte[0]));
        assertEquals(BigInteger.valueOf(0), decodeKey(new byte[] { 0 }));

        assertEquals(BigInteger.valueOf(1), decodeKey(new byte[] { 1 }));

        assertEquals(BigInteger.valueOf(-1), decodeKey(new byte[] { -1 }));

        assertEquals(BigInteger.valueOf(Long.MIN_VALUE + 1),
                decodeKey(TestKeyBuilder.asSortKey(1L)));

        assertEquals(Long.MAX_VALUE, decodeKey(TestKeyBuilder.asSortKey(-1L))
                .longValue());

        assertEquals(MAX_KEY, decodeKey(null));

    }

    /**
     * FIXME Verify that the separator keys are properly ordered. [See notes at
     * the top of this file, but also note that we need to handle the
     * <code>null</code> rightSeparator specially.]
     */
    public void test_getSeparatorKey() {

        assertEquals(Long.MAX_VALUE, decodeKey(
                getSeparatorKey(new byte[0]/* leftSeparator */,
                        null/* rightSeparator */)).longValue());

    }
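    /**
     * A minimal sketch (NOT used by the tests above) of how the separator key
     * math could be redone per the notes at the top of this file: do the
     * midpoint computation in {@link BigInteger} space, then re-encode using
     * {@link KeyBuilder#append(BigInteger)}, which writes the 2 byte
     * signum/runLength prefix expected by
     * {@link KeyBuilder#decodeBigInteger(int, byte[])}. The <code>null</code>
     * rightSeparator would still need the special handling shown in
     * {@link #decodeKey(byte[])}. The method name and the buffer sizing here
     * are assumptions made for illustration only.
     */
    private byte[] getSeparatorKeySketch(final BigInteger left,
            final BigInteger right) {

        // The midpoint of the key range: (left + right) / 2.
        final BigInteger mid = left.add(right).shiftRight(1);

        // Re-encode; the two extra bytes leave room for the prefix.
        return new KeyBuilder(Bytes.SIZEOF_LONG + 2).append(mid).getKey();

    }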
    /**
     * Type-safe enumeration of index partition operations for this test.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     * @version $Id$
     */
    private static enum OpCode {

        /**
         * Scatter-split an index partition (takes one index partition and
         * produces N index partitions, where N is on the order of 1x to 4x the
         * #of data services in a cluster).
         */
        ScatterSplit,

        /**
         * Split an index partition into two index partitions.
         */
        Split,

        /**
         * Join two index partitions.
         */
        Join,

        /**
         * Move an index partition (changes its locator but does not change its
         * key range).
         */
        Move,

        /**
         * This is not an index partition operation but rather is used to
         * signal the end of the test.
         */
        Done;

    }

    /**
     * Class models an operation and the delay until it occurs. A sequence of
     * such operations forms a schedule for the test.
     * 
     * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
     *         Thompson</a>
     * @version $Id$
     */
    private static class Op {

        /**
         * The operation.
         */
        public final OpCode code;

        /**
         * The delay until that operation.
         */
        public final long delay;

        /**
         * The unit for that delay.
         */
        public final TimeUnit unit;

        public Op(final OpCode code, final long delay, final TimeUnit unit) {

            this.code = code;

            this.delay = delay;

            this.unit = unit;

        }

    }

    /**
     * Stress test for redirects.
     * <p>
     * Redirects are stored in an {@link IMetadataIndex} so we may test the
     * behavior under SPLITs, MOVEs, or JOINs. The test writes {@link KVO}
     * tuples on a {@link M master}. The master allocates the tuples to output
     * buffers based on the {@link IMetadataIndex} mapping. A single thread
     * executes an {@link Op}[] schedule while N concurrent producer threads
     * write on the master. The test ends when the schedule is done.
     * 
     * @throws InterruptedException
     * @throws ExecutionException
     */
    public void test_redirectStressTestWithSplits()
            throws InterruptedException, ExecutionException {

        /*
         * Configuration for the stress test.
         */

        /*
         * #of concurrent producers.
         * 
         * Note: Testing with GTE 150 threads is necessary to model realistic
         * use cases.
         */
        final int nproducers = 200;

        /*
         * The minimum and maximum delay (in milliseconds) for writing a chunk.
         * The actual write delays will have a uniform distribution within this
         * range.
         * 
         * Note: These values are based on observed delays for an RDF bulk data
         * load operation on a 16 node cluster.
         */
        final long minWriteDelay = 1000;
        final long maxWriteDelay = 3000;

        /*
         * The minimum and maximum delay (in milliseconds) for producing a new
         * chunk. The actual delays will have a uniform distribution within
         * this range.
         * 
         * Note: These values are SWAGs.
         */
        final long minProducerDelay = 150;
        final long maxProducerDelay = 750;

        /*
         * The size of a chunk generated by a producer (10k is typical of a
         * deployed system).
         */
        final int producerChunkSize = 1000; // vs 10000

        /*
         * Note: We should be able to use smaller chunks on the master and
         * larger chunks on the client. The semantics of the chunk size are
         * such that it controls the size of the chunks READ from the buffer,
         * not the size of the chunks on the buffer. Therefore a 10k chunk
         * producer feeding a master will cause 10k chunks to appear on the
         * master. If the master has a 10k chunk size, then it will never need
         * to combine chunks for its consumer (the sink). So the sink gets 10k
         * chunks in its buffer. However, the chunkSize of the sink controls
         * how large the writes will be on the index partition. So a 20k sink
         * chunk size will cause 2 x 10k chunks to be combined and merge sorted
         * before it writes on the index partition.
         * 
         * @todo update the bigdataCluster.config appropriately.
         * 
         * @todo when I made the subtaskChunkSize large enough that the code
         * was actually combining chunks, it uncovered a problem with the
         * asynchronous writes, which do not inherently protect against the
         * presence of duplicate keys in the KVO[] stream. This was frowned
         * upon for synchronous RPC, but for asynchronous writes it makes more
         * sense to permit duplicates while still restricting the producers to
         * generate ordered data. Therefore I am writing a series of unit tests
         * for the ISplitter and then I will allow this case to be valid.
         */
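        /*
         * Illustration of that chunk math for the values configured below (an
         * estimate, not a measured figure): with producerChunkSize = 1,000,
         * masterChunkSize = 10,000 and subtaskChunkSize = 20,000, the master
         * combines ~10 producer chunks per read and a sink combines and merge
         * sorts ~2 of those, so on the order of 20 producer chunks back each
         * index partition write (fewer whenever a chunk timeout expires
         * first).
         */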
        final int masterQueueCapacity = 10; // vs 1000 (cluster config value).
        final int masterChunkSize = 10000;
        final long masterChunkTimeoutNanos = TimeUnit.MILLISECONDS.toNanos(50);

        final int subtaskQueueCapacity = 50; // vs 500 (cluster config value).
        final int subtaskChunkSize = 20000;
        final long subtaskChunkTimeoutNanos = TimeUnit.MILLISECONDS
                .toNanos(Long.MAX_VALUE);

        /*
         * The idle timeout for the sink (generally infinite unless using a
         * KVOLatch to coordinate notification of results such as for the
         * TERM2ID index).
         */
        final long sinkIdleTimeout = Long.MAX_VALUE;

        final long sinkPollTimeout = TimeUnit.MILLISECONDS.toNanos(50);

        // The #of data services.
        final int ndataServices = 40;
        {

            // Setup the mock data services.
            for (int i = 0; i < ndataServices; i++) {

                final UUID uuid = UUID.randomUUID();

                dataServices.put(uuid, new DS(uuid) {

                    private final Random r = new Random();

                    /**
                     * Overridden to simulate the latency of the write
                     * operation.
                     */
                    @Override
                    protected void acceptWrite(final L locator,
                            final KVO<O>[] chunk) {

                        final long delayMillis = (long) (r.nextDouble() * (maxWriteDelay - minWriteDelay))
                                + minWriteDelay;

                        if (log.isInfoEnabled())
                            log.info("Writing " + chunk.length
                                    + " elements on " + locator + " (delay="
                                    + delayMillis + ") ...");

                        try {
                            Thread.sleep(delayMillis/* ms */);
                        } catch (InterruptedException ex) {
                            throw new RuntimeException(ex);
                        }

                        if (log.isInfoEnabled())
                            log.info("Wrote on " + this + ".");

                    }

                });

            }

        }

        final TimeUnit scheduleUnit = TimeUnit.SECONDS; // seconds or minutes.

        final Op[] schedule = new Op[] {
                // @todo include move & join as well.
//                new Op(OpCode.ScatterSplit, 5, scheduleUnit), // FIXME test scatter split.
//                new Op(OpCode.Split, 1, scheduleUnit),
//                new Op(OpCode.Split, 1, scheduleUnit),
//                new Op(OpCode.Split, 1, scheduleUnit),
                /*
                 * Note: Always include this as the last operation or the test
                 * WILL NOT terminate!
                 */
                new Op(OpCode.Done, 10, scheduleUnit) };

        // duration of the stress test.
//        final long timeoutMillis;
        {

            assert schedule[schedule.length - 1].code == OpCode.Done;

            long t = 0L;

            for (Op op : schedule) {

                t += op.unit.toMillis(op.delay);

            }

//            timeoutMillis = t;

            if (log.isInfoEnabled())
                log.info("Test will run for " + t + "ms");

        }

        /*
         * Stress test impl.
         */

        // used to halt the producer(s) when the test is done.
        final AtomicBoolean halt = new AtomicBoolean(false);

        // the #of producers that are currently running.
        final AtomicInteger producerCount = new AtomicInteger(0);

        /**
         * Writes on a master.
         * 
         * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
         *         Thompson</a>
         * @version $Id$
         */
        class ProducerTask implements Callable<Void> {

            private final BlockingBuffer<KVO<O>[]> buffer;

            public ProducerTask(final BlockingBuffer<KVO<O>[]> buffer) {

                this.buffer = buffer;

            }

            public Void call() throws Exception {

                producerCount.incrementAndGet();

                try {

                    final KeyBuilder keyBuilder = new KeyBuilder(
                            Bytes.SIZEOF_LONG);

                    final Random r = new Random();

                    while (true) {

                        // Sleep to simulate latency in the production of new
                        // chunks.
                        Thread.sleep(r
                                .nextInt((int) (maxProducerDelay - minProducerDelay))
                                + minProducerDelay);

                        if (halt.get() || Thread.interrupted()) {

                            if (log.isInfoEnabled())
                                log.info("Producer halting.");

                            return null;

                        }

                        /*
                         * Note: keys have a uniform distribution.
                         */
                        final KVO<O>[] a = new KVO[producerChunkSize];

                        for (int i = 0; i < a.length; i++) {

                            final byte[] key = keyBuilder.reset().append(
                                    r.nextLong()).getKey();

                            final byte[] val = new byte[2];

                            r.nextBytes(val);

                            a[i] = new KVO<O>(key, val);

                        }

                        // ensure sorted order for the chunk.
                        Arrays.sort(a);

                        buffer.add(a);

                    }

                } finally {

                    producerCount.decrementAndGet();

                }

            }

        }
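        /*
         * Back-of-the-envelope offered load for the configuration above (an
         * illustration only, not a measured value): 200 producers, each
         * adding a 1,000 tuple chunk after a mean delay of ~450ms, present on
         * the order of 400k tuples/sec to the master. Since the master queue
         * holds only 10 chunks, producers will routinely block in
         * buffer.add(), which exercises the back-pressure path.
         */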
        /**
         * Issues redirects.
         * 
         * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan
         *         Thompson</a>
         * @version $Id$
         */
        class RedirectTask implements Callable<Void> {

            private final M master;

            private final Op[] schedule;

            final Random r = new Random();

            /**
             * @param master
             *            The master under test.
             * @param schedule
             *            The schedule of index partition operations, including
             *            the delay before each operation.
             */
            public RedirectTask(final M master, final Op[] schedule) {

                this.master = master;

                this.schedule = schedule;

            }

            public Void call() throws Exception {

                for (Op op : schedule) {

                    if (halt.get() || Thread.interrupted()) {

                        if (log.isInfoEnabled())
                            log.info("Redirecter halting.");

                        // Done.
                        return null;

                    }

                    final long delayMillis = op.unit.toMillis(op.delay);

                    if (log.isInfoEnabled())
                        log.info("Will wait " + delayMillis
                                + "ms before executing: " + op.code);

                    Thread.sleep(delayMillis);

                    if (halt.get() || Thread.interrupted()) {

                        if (log.isInfoEnabled())
                            log.info("Redirecter halting.");

                        // Done.
                        return null;

                    }

                    switch (op.code) {
                    case ScatterSplit:
                        scatterSplit(op);
                        break;
                    case Split:
                        split(op);
                        break;
                    case Join:
                        join(op);
                        break;
                    case Move:
                        move(op);
                        break;
                    case Done:
                        done(op);
                        break;
                    default:
                        throw new UnsupportedOperationException(op.code
                                .toString());
                    }

                }

                if (log.isInfoEnabled())
                    log.info("Redirecter halting.");

                return null;

            }

            /**
             * Handles scatter split. One index partition is selected at
             * random. It is then scattered by dividing its key range into 2N
             * equal parts, where N is the #of data services. The locators for
             * the index partition in the metadata index are updated to reflect
             * the scatter split.
             * 
             * @param op
             */
            protected void scatterSplit(final Op op) {

                master.mdiLock.lock();

                try {

                    // the #of existing partitions.
                    final long npartitions2 = master.mdi.getEntryCount();
                    assertTrue(npartitions2 != 0);
                    assertTrue(npartitions2 <= Integer.MAX_VALUE);
                    final int npartitions = (int) npartitions2;

                    // choose which one to split.
                    final int index = r.nextInt(npartitions);

                    // lookup that locator.
                    final L locator = (L) master.mdi.valueAt(index,
                            master.mdi.getLookupTuple()).getObject();

                    /*
                     * Evenly divide the key range of the locator into N key
                     * ranges. If the rightSeparator is null, then we divide
                     * the keys based on the a priori knowledge that the keys
                     * are 8 bytes long, so the maximum key is formed by
                     * encoding Long.MAX_VALUE using a KeyBuilder. However,
                     * when the rightSeparator is null on input, the
                     * rightSeparator of the last output index partition will
                     * always be null as well.
                     */

                    // FIXME Finish scatter-split.
                    if (true) throw new UnsupportedOperationException();

                    // Notify DS so it will issue a stale locator response.
                    final DS oldDS = dataServices.get(locator
                            .getDataServiceUUID());

                    oldDS.notifyGone(locator);

                } finally {

                    master.mdiLock.unlock();

                }

            }
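            /*
             * A sketch of the missing scatter-split key math (an assumption,
             * not exercised by this test): cut the chosen locator's key range
             * into 2N equal BigInteger intervals using the same decodeKey()
             * helper relied on by split(), e.g.:
             * 
             *   final int nparts = 2 * ndataServices;
             *   final BigInteger lo = decodeKey(locator.getLeftSeparatorKey());
             *   final BigInteger hi = decodeKey(locator.getRightSeparatorKey());
             *   final BigInteger w = hi.subtract(lo)
             *           .divide(BigInteger.valueOf(nparts));
             * 
             * The i-th new locator would span [lo + i*w, lo + (i+1)*w), with
             * the rightSeparator of the last output partition forced back to
             * null when the input's rightSeparator was null.
             */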
            /**
             * Handles split of a randomly chosen index partition into two new
             * index partitions.
             * 
             * @param op
             */
            protected void split(final Op op) {

                master.mdiLock.lock();

                try {

                    // the #of existing partitions.
                    final long npartitions2 = master.mdi.getEntryCount();
                    assertTrue(npartitions2 != 0);
                    assertTrue(npartitions2 <= Integer.MAX_VALUE);
                    final int npartitions = (int) npartitions2;

                    // choose which one to split.
                    final int index = r.nextInt(npartitions);

                    // lookup that locator.
                    final L oldLocator = (L) master.mdi.valueAt(index,
                            master.mdi.getLookupTuple()).getObject();

                    /*
                     * Divide the key range of the locator into 2 key ranges.
                     * If the rightSeparator is null, then we divide the keys
                     * based on the a priori knowledge that the keys are 8
                     * bytes long, so the maximum key is formed by encoding
                     * Long.MAX_VALUE using a KeyBuilder. However, when the
                     * rightSeparator is null on input, the rightSeparator of
                     * the last output index partition will always be null as
                     * well.
                     */

                    final byte[] separatorKey = getSeparatorKey(oldLocator
                            .getLeftSeparatorKey(), oldLocator
                            .getRightSeparatorKey());

                    final L newLeftSibling = new L(master.mdi
                            .incrementAndGetNextPartitionId(),
                            getRandomDataService().uuid, oldLocator
                                    .getLeftSeparatorKey(), separatorKey);

                    final L newRightSibling = new L(master.mdi
                            .incrementAndGetNextPartitionId(),
                            getRandomDataService().uuid, separatorKey,
                            oldLocator.getRightSeparatorKey());

                    // remove the old locator.
                    assertNotNull(master.mdi.remove(oldLocator
                            .getLeftSeparatorKey()));

                    // add new locators covering the same key range.
                    master.mdi.insert(newLeftSibling.getLeftSeparatorKey(),
                            newLeftSibling);

                    master.mdi.insert(newRightSibling.getLeftSeparatorKey(),
                            newRightSibling);

                    // Notify DS so it will issue a stale locator response.
                    final DS oldDS = dataServices.get(oldLocator
                            .getDataServiceUUID());

                    oldDS.notifyGone(oldLocator);

                } finally {

                    master.mdiLock.unlock();

                }

            }

            // FIXME handle join.
            protected void join(final Op op) {

                throw new UnsupportedOperationException();

            }

            // FIXME handle move.
            protected void move(final Op op) {

                throw new UnsupportedOperationException();

            }
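            /*
             * Sketches for the missing operations (assumptions based on the
             * split() handler above, not exercised by this test):
             * 
             * join: choose a locator with a right sibling in the MDI, remove
             * both locators while holding mdiLock, insert a single locator
             * spanning [left.getLeftSeparatorKey(),
             * right.getRightSeparatorKey()) with a new partitionId, and then
             * notifyGone() both old locators so the data services issue stale
             * locator responses.
             * 
             * move: remove a locator and re-insert one with the same separator
             * keys but a new partitionId and a different data service UUID
             * (via getRandomDataService()), again followed by notifyGone() on
             * the old locator.
             */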
            /**
             * Causes the test to halt.
             * 
             * @param op
             */
            protected void done(final Op op) {

                // set flag - will cause producers and redirector to halt.
                halt.set(true);

            }

        }

        /*
         * The master under test.
         */
        final BlockingBuffer<KVO<O>[]> masterBuffer = new BlockingBuffer<KVO<O>[]>(
                masterQueueCapacity, masterChunkSize, masterChunkTimeoutNanos,
                TimeUnit.NANOSECONDS);

        final M master = new M(masterStats, masterBuffer, executorService,
                sinkIdleTimeout, sinkPollTimeout) {

            @Override
            protected BlockingBuffer<KVO<O>[]> newSubtaskBuffer() {

                return new BlockingBuffer<KVO<O>[]>(
                        new ArrayBlockingQueue<KVO<O>[]>(subtaskQueueCapacity), //
                        subtaskChunkSize,//
                        subtaskChunkTimeoutNanos,//
                        TimeUnit.NANOSECONDS,//
                        true // ordered
                );

            }

        };

        /*
         * Setup the initial index partition.
         * 
         * Note: The mdiLock is not required here since no other threads are
         * accessing the MDI until we start them below.
         */
        master.mdiLock.lock();
        try {

            // choose the initial data service (effectively arbitrary since
            // the UUIDs are random).
            final UUID dataServiceUUID = dataServices.keySet().iterator()
                    .next();

            final DS dataService = dataServices.get(dataServiceUUID);

            final L locator = new L(//
                    // the initial partitionId.
                    master.mdi.incrementAndGetNextPartitionId(),
                    // the initial data service.
                    dataServiceUUID,
                    // leftSeparator is initially an empty byte[].
                    new byte[0],
                    // rightSeparator is initially null.
                    null);

            // add to the MDI.
            master.mdi.insert(locator.getLeftSeparatorKey(), locator);

            // and inform the DS.
            dataService.notifyLocator(locator);

        } finally {

            master.mdiLock.unlock();

        }

        /*
         * Setup the redirector with its schedule of operations.
         */
        final RedirectTask redirecter = new RedirectTask(master, schedule);

        // Start the master.
        {

            // Wrap computation as FutureTask.
            final FutureTask<H> ft = new FutureTask<H>(master);

            // Set Future on BlockingBuffer.
            masterBuffer.setFuture(ft);

            // Start the consumer.
            executorService.submit(ft);

        }

        // Setup producers.
        final List<FutureTask<Void>> producerFutures = new LinkedList<FutureTask<Void>>();

        for (int i = 0; i < nproducers; i++) {

            // Wrap computation as FutureTask.
            producerFutures.add(new FutureTask<Void>(new ProducerTask(
                    masterBuffer)));

        }

        // Start writing data.
        for (FutureTask<Void> ft : producerFutures) {

            executorService.submit(ft);

        }

        // Start redirects.
        final Future<Void> redirecterFuture = executorService
                .submit(redirecter);

        try {

            // periodically verify no errors in running tasks.
            while (!halt.get()) {

                /*
                 * End the test if anything is done.
                 */

                // check the master.
                if (masterBuffer.getFuture().isDone()) {

                    break;

                }

                // check the redirecter.
                if (redirecterFuture.isDone()) {

                    break;

                }

                /*
                 * Check the producers.
                 * 
                 * Note: a flag is used since a break inside the for loop
                 * would only exit that loop, not this polling loop.
                 */
                boolean producerDone = false;
                for (Future<Void> f : producerFutures) {

                    if (f.isDone()) {

                        producerDone = true;

                        break;

                    }

                }
                if (producerDone) {

                    break;

                }

                // sleep in 1/4 second intervals.
                Thread.sleep(250/* ms */);

            }

            /*
             * Set [halt] (it may already be set) so that the redirector and
             * the producers will all halt, and then check their Futures for
             * errors.
             */

            if (log.isInfoEnabled())
                log.info("Halting redirector and producers.");

            // set flag causing tasks to halt.
            halt.set(true);

            // await termination and check the redirector future for errors.
            redirecterFuture.get();

            // await termination and check the producer futures for errors.
            for (Future<Void> f : producerFutures) {

                f.get();

            }

            if (log.isInfoEnabled())
                log.info("Closing master buffer: " + masterBuffer);

            // close the master : queued data should be drained by the sinks.
            masterBuffer.close();

            // await termination and check the master's future for errors.
            while (true) {

                try {

                    masterBuffer.getFuture().get(1000, TimeUnit.MILLISECONDS);

                    break;

                } catch (TimeoutException e) {

                    if (log.isInfoEnabled())
                        log.info("Waiting on master: ~subtaskCount="
                                + masterStats.getActiveSinkCount()
                                + ", ~elementsRemaining="
                                + (masterStats.elementsIn.get() - masterStats.elementsOut
                                        .get()));

                }

            }

        } finally {

            {

                // show the subtask stats using an ordered map.
                final Map<L, HS> subStats = new TreeMap<L, HS>(master.stats
                        .getSubtaskStats());

                for (Map.Entry<L, HS> e : subStats.entrySet()) {

                    if (log.isInfoEnabled())
                        log.info(e.getKey() + " : " + e.getValue());

                }

            }

            // show the master stats.
            if (log.isInfoEnabled())
                log.info(master.stats.toString());

        }

    }

}