/** Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved. Contact: SYSTAP, LLC DBA Blazegraph 2501 Calvert ST NW #106 Washington, DC 20008 licenses@blazegraph.com This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; version 2 of the License. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ /* * Created on Oct 15, 2006 */ package com.bigdata.journal; import java.io.File; import java.io.IOException; import java.io.RandomAccessFile; import java.nio.ByteBuffer; import java.text.NumberFormat; import java.util.Properties; import java.util.UUID; import junit.framework.Test; import junit.framework.TestCase2; import junit.framework.TestSuite; import com.bigdata.btree.BTree; import com.bigdata.btree.IIndex; import com.bigdata.btree.IndexMetadata; import com.bigdata.btree.keys.IKeyBuilder; import com.bigdata.btree.keys.KeyBuilder; import com.bigdata.rawstore.IRawStore; import com.bigdata.testutil.ExperimentDriver; import com.bigdata.util.Bytes; /** * <p> * A benchmark that computes the raw platform write rate for pure sequential IO, * the journal's write rate, and both unisolated and isolated index write rates. * The overhead of the journal can be estimated by comparing the pure sequential * writes on the file system with the write rates when using each of the journal * modes. Likewise, the overhead of the index can be estimated in comparison to * the journal write rate, and the overehead of (large) transactions can be * estimated in comparison to the unisolated index write rate (this test does * not estimate the overhead for small transactions for a variety of reasons). * </p> * <p> * The results from this benchmark can be used to compare the performance of the * different {@link IBufferMode} implementations. The theoretical maximum for a * platform is the sustained write rate for the disk subsystem on which the * journal file is located - this can be obtained with a bit of research on your * disk drives, e.g., using <a href="http://www.storagereview.com/"> * storagereview.com </a>. It is generally achieved by * {@link BenchmarkBlockBasedOptimium} * </p> * <p> * Note: you should run these tests multiple times to make sure that you are * getting valid numbers for your platform. You should also compare the data * with the expected disk maximum write rate for your platform. You can monitor * your platform using "perfmon" on Windows or "vmstat" on Un*x. It is important * that your system has not swapped out parts of the JVM or the benchmark will * run poorly (this can be a problem with a memory-limited Windows platform). * </p> * <p> * Analysis: The Transient mode achieves 30x the raw write rate when compared to * any of the disk-backed modes (1,282 MB/sec vs 40 MB/sec). However, the index * write rates are essentially constant across the buffer modes (roughly * 20MB/sec for unisolated writes, which is ~50% of the journal write rate when * backed by disk, and 9MB/sec for isolated writes, or ~25% of the journal write * rate when backed by disk). The limiting factor for index writes is the btree * code itself (it tends to be key search). The limiting factor for the isolated * index writes is that the transaction write set overflows onto disk, so we * wind up doing much more IO for a large transaction (however small * transactions have very little overhead when compared to unisolated index * writes). The disk-only buffer mode does a little better than the * fully-buffered modes for the isolated writes - presumably since (a) the * monotonically increasing keys defeat the index node and leaf cache; and (b) * the disk-only mode is able to make more RAM available to the JVM since it * does not maintain the large in memory buffer. * </p> * * @see src/architecture/performance.xls. * * @todo Test the impact of an AIO strategy. * * FIXME Use the {@link ExperimentDriver} and compare the various buffer modes * and other variables and the write rates for the {@link IRawStore} vs * unisolated indices. Checkout the disk queue under the performance monitor and * make sure that we are driving the disk as hard as possible. * * @todo Quantify impact of the disk-only mode write cache. * * @todo Note that the target performance environment requires multiple journals * and multiple read-optimized databases. Do not over-optimize for a * single writer. Write benchmarks for write absorption rates for * concurrent writers with and without concurrent readers. * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ */ abstract public class BenchmarkJournalWriteRate extends TestCase2 { /** * */ public BenchmarkJournalWriteRate() { } /** * @param name */ public BenchmarkJournalWriteRate(String name) { super(name); } /** * Sets the initial extent for the test. */ public Properties getProperties() { Properties properties = super.getProperties(); properties.setProperty(Options.INITIAL_EXTENT,""+getInitialExtent()); properties.setProperty(Options.BUFFER_MODE, getBufferMode().toString()); properties.setProperty(Options.CREATE_TEMP_FILE, "true"); // Note: Forcing writes is generally MUCH slower. // properties.setProperty(Options.FORCE_WRITES, ForceEnum.ForceMetadata.toString()); properties.setProperty(Options.DELETE_ON_EXIT,"true"); return properties; } /** * The tests are performed with a record size of 4k, but you can vary the * "record size" to be larger or smaller. 4k was choosen as being * representative of the expected size of a node or leaf of a btree since * that is the primary kind of object that we write on the journal. */ protected int getRecordSize() { return Bytes.kilobyte32 * 4; } /** * The tests are performed with a 100M journal. */ protected long getInitialExtent() { return 100*Bytes.megabyte; } abstract protected BufferMode getBufferMode(); protected String getFilename() { return "benchmark-"+getBufferMode()+"-"+getName()+".jnl"; } /** * The branching factor used by the unisolated btree on the journal and by * the isolated btree iff a transaction is used to isolated the write set. * <p> * Note: A higher branching factor can be choosen for this test since the * btree writes use monotoically increasing keys. */ protected int getBranchingFactor() { // return 16; return 256; } Journal journal; void deleteFile() { try { File file = new File(getFilename()); if ( file.exists() && ! file.delete()) { System.err.println("Warning: could not delete: " + file.getAbsolutePath()); } } catch (Throwable t) { System.err.println("Warning: " + t); } } public void setUp() throws IOException { System.err.println("------------------\n"); deleteFile(); journal = new Journal( getProperties() ); } public void tearDown() throws IOException { try { journal.destroy(); } catch( IllegalStateException ex ) { System.err.println("Could not close the journal: "+ex); } } static NumberFormat cf; static NumberFormat fpf; static { cf = NumberFormat.getNumberInstance(); cf.setGroupingUsed(true); fpf = NumberFormat.getNumberInstance(); fpf.setGroupingUsed(false); fpf.setMaximumFractionDigits(2); } public void testRawRecordWriteRate() throws IOException { doRawRecordWriteRateTest(getRecordSize()); } /** * Test the index write rate using an index that does NOT support * transactional isolation using 32 bit integer keys and 128 byte values for * the index entries. */ public void testNonIsolatableIndexWriteRate() throws IOException { // register named index that does NOT support isolation. String name = "abc"; final BTree btree; { IndexMetadata metadata = new IndexMetadata(UUID.randomUUID()); metadata.setBranchingFactor(getBranchingFactor()); btree = BTree.create(journal, metadata); } journal.registerIndex(name, btree); journal.commit(); // NOT isolated. long tx = 0L; // run test. doIndexWriteRateTest(name, tx, 128); } /** * Test the index write rate using an index that supports transactional * isolation but without transactional isolation using 32 bit integer keys * and 128 byte values for the index entries. */ public void testUnisolatedIndexWriteRate() throws IOException { // register named index that can support isolation. String name = "abc"; final BTree btree; { IndexMetadata metadata = new IndexMetadata(UUID.randomUUID()); metadata.setBranchingFactor(getBranchingFactor()); metadata.setIsolatable(true); btree = BTree.create(journal, metadata); } journal.registerIndex(name, btree ); journal.commit(); // NOT isolated. long tx = 0L; // run test. doIndexWriteRateTest(name, tx, 128); } /** * Test the index write rate for a fully isolated transaction using 32 bit * integer keys and 128 byte values for the index entries. */ public void testIsolatedIndexWriteRate() throws IOException { // register named index that can support isolation. String name = "abc"; final BTree btree; { IndexMetadata metadata = new IndexMetadata(UUID.randomUUID()); metadata.setBranchingFactor(getBranchingFactor()); metadata.setIsolatable(true); btree = BTree.create(journal, metadata); } journal.registerIndex(name, btree ); journal.commit(); // fully isolated transaction. long tx = journal.newTx(ITx.UNISOLATED); // run test. doIndexWriteRateTest(name, tx, 128); } /** * Writes N records of the given size such that the journal will be filled * to capacity using the {@link IRawStore} interface (unisolated raw writes * not using an index). * * @param writeSize * The size of the object to be written. * * @return The elapsed time for the test. */ public long doRawRecordWriteRateTest(int writeSize) { Journal store = journal; System.err.println("Begin: bufferMode="+journal.getBufferStrategy().getBufferMode()); final long begin = System.currentTimeMillis(); final int nwrites = (int) journal.getBufferStrategy().getUserExtent() / writeSize; System.err.println("writeSize=" + writeSize + ", nwrites=" + nwrites); // the buffer is reused on each write. ByteBuffer data = ByteBuffer.allocate/*Direct*/(writeSize); for( int i=0; i<nwrites; i++ ) { data.put(0,(byte)i); // at least one non-zero byte. data.position( 0 ); data.limit( writeSize ); store.write(data); } final long elapsed = System.currentTimeMillis() - begin; final long bytesWritten = journal.getBufferStrategy().getNextOffset(); // MB/sec. final double writeRate = (bytesWritten/(double)Bytes.megabyte) / (elapsed / 1000d); System.err.println("Elapsed: " + elapsed + "(ms), bufferMode=" + journal.getBufferStrategy().getBufferMode() + ", recordSize=" + cf.format(writeSize) + ", nwrites=" + cf.format(nwrites) + ", writeRate=" + fpf.format(writeRate) + "MB/sec"); return elapsed; } /** * Writes N records of the given size such that the journal will be filled * to "near" capacity using either an isolated or unisolated {@link BTree} * to absorb the writes. The records are written in key order, so this is * the best cast for sequential key writes. The test ends before the journal * would overflow in order to measure only the cost of writes without buffer * extension handling. * <p> * Note that for transactional writes, the writes are buffered in memory and * then on disk, validated against the buffered writes, and finally * transferred to the unisolated index on the journal. Short transactions * are therefore very fast, but large transactions will be significantly * slower than the corresponding unisolated writes since there is several * times more IO for large transactions (write on tx buffer, read tx buffer * and validate against the unisolated index, read tx buffer and write on * the unisolated index). However, there is also logic to defeat validation * when no concurrent writes have occurred, so the worst case will not be * demonstrated by a single write process. * * @param name * The name of the index on which the writes will be performed. * The named index MUST have been registered by the caller and * that registration MUST have been committed. * * @param tx * The transaction identifier -or- 0L if the writes will not be * isolated by a transaction. * * @param valueSize * The size in bytes of the value to be written under each key. * * @return The elapsed time for the test. */ public long doIndexWriteRateTest(String name, long tx, int valueSize) { IKeyBuilder keyBuilder = new KeyBuilder(Bytes.SIZEOF_INT); // @todo rewrite as a Task submitted to the journal using that timestamp. IIndex ndx = (tx == 0 ? journal.getIndex(name) : journal.getLocalTransactionManager().getTx(tx).getIndex(name)); System.err.println("Begin: index write rate, isolated=" + (tx == 0 ? "no" : "yes") + ", isolatable=" + ndx.getIndexMetadata().isIsolatable() + ", bufferMode=" + journal.getBufferStrategy().getBufferMode()); // target percentage full to avoid journal overflow. final double percentFull = .90; // #of entries to insert into the index. final int nwrites = (int) (journal.getBufferStrategy().getExtent() * percentFull / valueSize); final long begin; { begin = System.currentTimeMillis(); for (int i = 0; i < nwrites; i++) { // key[] is new on each insert; keys are monotonically // increasing. final byte[] key = keyBuilder.reset().append(i).getKey(); // value[] is new on each insert. final byte[] value = new byte[valueSize]; value[0] = (byte) i; // at least one non-zero byte. ndx.insert(key, value); } } if (tx == 0L) { /* * Force to stable store when not using isolation (the transaction * does this anyway so this makes things more fair). */ final long beginCommit = System.currentTimeMillis(); final long elapsedWrite = beginCommit - begin; journal.commit(); final long elapsedCommit = System.currentTimeMillis() - beginCommit; System.err.println("Write : "+elapsedWrite+"(ms)"); System.err.println("Commit : "+elapsedCommit+"(ms)"); } else { /* * @todo track active vs validation vs commit time for transactions * so that I can report them here. */ // ITx t = journal.getTx(tx); // // final long beginPrepare = System.currentTimeMillis(); // // final long elapsedWrite = beginPrepare - begin; // // t.prepare(journal.nextTimestamp()); // // final long beginCommit = System.currentTimeMillis(); // // final long elapsedPrepare = beginCommit - beginPrepare; final long beginCommit = System.currentTimeMillis(); final long elapsedWrite = beginCommit - begin; journal.commit(tx); final long elapsedCommit = System.currentTimeMillis() - beginCommit; System.err.println("Write : "+elapsedWrite+"(ms)"); // System.err.println("Prepare: "+elapsedPrepare+"(ms)"); System.err.println("Commit : "+elapsedCommit+"(ms)"); } final long elapsed = System.currentTimeMillis() - begin; // The unisolated btree on which the data were actually written. final BTree btree = (BTree)journal.getIndex(name); final long nodesWritten = btree.getBtreeCounters().getNodesWritten(); final long leavesWritten = btree.getBtreeCounters().getLeavesWritten(); final long bytesWrittenByBTree = btree.getBtreeCounters().getBytesWritten(); final long bytesWritten = journal.getBufferStrategy().getNextOffset(); System.err.println("bytesWritten: btree="+bytesWrittenByBTree+", journal="+bytesWritten); System.err.println("btree counters: "+btree.getBtreeCounters()); final long recordsWritten = (nodesWritten + leavesWritten); final double averageRecordSize = bytesWrittenByBTree / (double)recordsWritten; // MB/sec. final double writeRate = (bytesWritten/(double)Bytes.megabyte) / (elapsed / 1000d); System.err.println("Elapsed: " + elapsed + "(ms), bufferMode=" + journal.getBufferStrategy().getBufferMode() + ", valueSize=" + cf.format(valueSize) + ", ninserts=" + cf.format(nwrites) + ", nrecordsWritten=" + recordsWritten + ", averageRecordSize=" + fpf.format(averageRecordSize) + ", branchingFactor="+btree.getBranchingFactor() + ", writeRate=" + fpf.format(writeRate) + "MB/sec"); return elapsed; } public static class BenchmarkTransientJournal extends BenchmarkJournalWriteRate { @Override protected BufferMode getBufferMode() {return BufferMode.Transient;} } public static class BenchmarkDirectJournal extends BenchmarkJournalWriteRate { @Override protected BufferMode getBufferMode() {return BufferMode.Direct;} } public static class BenchmarkMappedJournal extends BenchmarkJournalWriteRate { @Override protected BufferMode getBufferMode() {return BufferMode.Mapped;} } public static class BenchmarkDiskJournal extends BenchmarkJournalWriteRate { @Override protected BufferMode getBufferMode() {return BufferMode.Disk;} } public static class BenchmarkDiskRWJournal extends BenchmarkJournalWriteRate { @Override protected BufferMode getBufferMode() {return BufferMode.DiskRW;} } /** * <p> * Does N writes of M size data blocks on a pre-extended file using pure * sequential IO. Small writes may be used to estimate the maximum * throughput for large numbers of small writes. Large writes may be used to * estimate the absolute maximum throughput for your platform (OS + disk * system). * </p> * <p> * Note: This test is conducted without the use of a journal. It is bundled * in the same source code file so that we can compare the journal * performance with the raw IO performance of the platform. * </p> * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ */ public abstract static class AbstractBenchmarkOptimium extends TestCase2 { /** * The file is named for the test class. * <p> * Note: contents are not actually a journal. */ protected String getFilename() { return getClass().getSimpleName()+".bin"; } /** * Override to specify the record size. */ abstract public int getRecordSize(); /** * 100M */ protected int getInitialExtent() { return Bytes.megabyte32 * 100; } RandomAccessFile raf; void deleteFile() { try { File file = new File(getFilename()); if ( file.exists() && ! file.delete()) { System.err.println("Warning: could not delete: " + file.getAbsolutePath()); } } catch (Throwable t) { System.err.println("Warning: " + t); } } public void setUp() throws IOException { System.err.println("------------------\n"); deleteFile(); // Note: This makes it MUCH slower. boolean forceWrites = false; raf = new RandomAccessFile(getFilename(),"rw"+(forceWrites?"d":"")); } public void tearDown() throws IOException { raf.getChannel().force(false); raf.close(); deleteFile(); } public void testNoIsolation() throws IOException { doOptimiumWriteRateTest(getRecordSize()); } /** * * @param recordSize * @throws IOException */ public void doOptimiumWriteRateTest(int recordSize) throws IOException { final long begin = System.currentTimeMillis(); final int dataSize = getRecordSize(); final long initialExtent = getInitialExtent(); final int nwrites = (int) initialExtent / dataSize; System.err.println("Begin: optimum write rate test: #writes=" + nwrites + ", dataSize=" + dataSize); raf.setLength(initialExtent); ByteBuffer data = ByteBuffer.allocateDirect(dataSize); long pos = 0; for( int i=0; i<nwrites; i++ ) { data.put(0,(byte)i); // at least one non-zero byte. data.position( 0 ); data.limit( dataSize ); raf.getChannel().write(data,pos); pos += dataSize; } final long elapsed = System.currentTimeMillis() - begin; final long bytesWritten = raf.length(); // MB/sec. final double writeRate = (bytesWritten/(double)Bytes.megabyte) / (elapsed / 1000d); System.err.println("Elapsed: " + elapsed + "(ms), non-journal optimum, recordSize=" + cf.format(dataSize) + ", nwrites=" + cf.format(nwrites) + ", writeRate=" + fpf.format(writeRate) + "MB/sec"); } } /** * <p> * Writes the same amount of data, using <code>128</code> byte records on * a pre-extended file using pure sequential IO. This case should produce * the optimium throughput to disk for small IOs. * </p> * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ */ public static class BenchmarkSmallRecordOptimium extends AbstractBenchmarkOptimium { /** * 128 bytes. */ public int getRecordSize() { return 128; } } /** * <p> * Writes the same amount of data using large blocks on a pre-extended file * using pure sequential IO. This case should produce the "best-case" * optimium throughput to disk <i>for block-oriented IO</i>. In order for * the journal to approach this best case scenario, you need to be writing * large blocks. Note that the btree does exactly this, but the limiting * factor for throughput is the write on the btree data structures (mostly * key search) rather than the writes on the journal and their consequent * IO. * </p> * <p> * Note: This overrides several methods in the base class in order to * conduct a test without the use of a journal. * </p> * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ */ public static class BenchmarkBlockBasedOptimium extends AbstractBenchmarkOptimium { /** * 8k */ public int getRecordSize() { return Bytes.kilobyte32 * 8; } } /** * <p> * Writes the same amount of data using a single nio "write buffer" * operation on a pre-extended file. The buffer is a direct buffer, so it is * allocated in the OS memory. The write should be pure sequential IO. This * case should produce the "best-case" optimium throughput to disk <i>for * sustained IO</i>. The journal SHOULD NOT be able approach this best case * scenario. Comparison to this case should reveal the overhead of the * journal, Java, and block-oriented IO when compare to sustained sequential * data transfer from RAM to disk. Since block-based IO is, in fact, better, * one can only presume that the nio library has some problem with very * large writes. * </p> * <p> * Note: This overrides several methods in the base class in order to * conduct a test without the use of a journal. * </p> * <p> * Note: I have seen block-based IO perform better in cases where system * resources were low (the disk was nearly full). * </p> * * @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a> * @version $Id$ */ public static class BenchmarkSustainedTransferOptimium extends AbstractBenchmarkOptimium { /** * The entire extent in one sustained write. */ public int getRecordSize() { return (int) getInitialExtent(); } } /** * Runs the tests that have not been commented out :-) * <p> * Note: Running all benchmarks together can challange the VM by running low * on heap, native memory given over to direct buffers - and things can actually * slow down with more memory. */ public static Test suite() { TestSuite suite = new TestSuite("Benchmark Journal Write Rates"); suite.addTestSuite( BenchmarkTransientJournal.class ); // suite.addTestSuite( BenchmarkDirectJournal.class ); // suite.addTestSuite( BenchmarkMappedJournal.class ); suite.addTestSuite( BenchmarkDiskJournal.class ); suite.addTestSuite( BenchmarkDiskRWJournal.class ); suite.addTestSuite( BenchmarkSmallRecordOptimium.class ); suite.addTestSuite( BenchmarkBlockBasedOptimium.class ); suite.addTestSuite( BenchmarkSustainedTransferOptimium.class ); return suite; } /** * Main routine can be used for running the test under a performance * analyzer. * * @param args * Not used. * * @throws Exception */ public static void main(String[] args) throws Exception { BenchmarkTransientJournal test = new BenchmarkTransientJournal(); test.setUp(); try { /* * Choose one test to run. (You must setUp/tearDown for each test). */ test.testNonIsolatableIndexWriteRate(); // test.testUnisolatedIndexWriteRate(); // test.testIsolatedIndexWriteRate(); } finally { test.tearDown(); } } }