/**
Copyright (C) SYSTAP, LLC DBA Blazegraph 2006-2016. All rights reserved.
Contact:
SYSTAP, LLC DBA Blazegraph
2501 Calvert ST NW #106
Washington, DC 20008
licenses@blazegraph.com
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/*
* Created on Dec 21, 2006
*/
package com.bigdata.btree;
import java.io.File;
import java.io.IOException;
import java.util.Properties;
import java.util.UUID;
import com.bigdata.btree.keys.TestKeyBuilder;
import com.bigdata.journal.BufferMode;
import com.bigdata.journal.Options;
import com.bigdata.rawstore.IRawStore;
import com.bigdata.rawstore.SimpleMemoryRawStore;
import com.bigdata.util.BytesUtil;
/**
* Test builds trees on the journal, evicts them into an {@link IndexSegment},
* and then compares the performance and correctness of index point tests with
* and without the use of the bloom filter.
*
* @todo compare performance with and without the bloom filter.
*
* @todo test points that will not be in the index as well as those that are.
*
* @todo report on the cost to construct the filter and its serialized size and
* runtime space.
*
* @todo verify the target error rate.
*
* @todo explore different error rates, including Fast.mostSignificantBit( n ) +
* 1 which would provide an expectation of no false positives.
*
* @todo Compare for each build algorithm, just like
* {@link TestIndexSegmentBuilderWithLargeTrees}.
*
* @author <a href="mailto:thompsonbry@users.sourceforge.net">Bryan Thompson</a>
* @version $Id$
*/
public class TestIndexSegmentWithBloomFilter extends AbstractBTreeTestCase {
/**
* Creates an instance of the test case without a name.
*/
public TestIndexSegmentWithBloomFilter() {
}
/**
* Creates an instance of the test case with the given name.
*
* @param name
* The name handed to the superclass constructor.
*/
public TestIndexSegmentWithBloomFilter(String name) {
super(name);
}
/**
* The value passed as the final argument of
* <code>IndexSegmentBuilder.newInstance(...)</code> by every index segment
* build performed in this class.
*/
private static final boolean bufferNodes = true;
/** Lazily initialized by {@link #getProperties()}. */
private Properties properties;

/**
 * Returns the configuration used by this test. On the first call the
 * properties reported by the superclass are obtained,
 * {@link Options#BUFFER_MODE} is set to {@link BufferMode#Disk} and
 * {@link Options#CREATE_TEMP_FILE} is set to <code>"true"</code>. The same
 * object is handed back on every later call.
 *
 * @return The (cached) properties.
 */
public Properties getProperties() {
    if (properties != null) {
        // Already configured by an earlier call.
        return properties;
    }
    properties = super.getProperties();
    final String bufferMode = BufferMode.Disk.toString();
    properties.setProperty(Options.BUFFER_MODE, bufferMode);
    properties.setProperty(Options.CREATE_TEMP_FILE, "true");
    return properties;
}
/**
 * Creates a new {@link BTree} on a fresh {@link SimpleMemoryRawStore}. The
 * index is given a random {@link UUID}.
 *
 * @param branchingFactor
 *            The branching factor configured on the index metadata.
 * @param bloomFilterFactory
 *            The bloom filter factory configured on the index metadata.
 *
 * @return The new btree.
 */
public BTree getBTree(int branchingFactor, BloomFilterFactory bloomFilterFactory) {
    final IRawStore backingStore = new SimpleMemoryRawStore();
    final IndexMetadata indexMetadata = new IndexMetadata(UUID.randomUUID());
    indexMetadata.setBranchingFactor(branchingFactor);
    indexMetadata.setBloomFilterFactory(bloomFilterFactory);
    return BTree.create(backingStore, indexMetadata);
}
/**
* Branching factors for the source btree that is then used to build an
* {@link IndexSegment}. This parameter indirectly determines both the #of
* leaves and the #of entries in the source btree: for each branching factor
* <code>m</code> the tests in this class use <code>m</code>,
* <code>m*m</code> and <code>m*m*m</code> as the size of the source btree.
*
* Note: Regardless of the branching factor in the source btree, the same
* {@link IndexSegment} should be built for a given set of entries
* (key-value pairs) and a given output branching factor for the
* {@link IndexSegment}. However, input trees of different heights also
* stress different parts of the algorithm.
*/
final int[] branchingFactors = new int[]{3,4,5,10,13};//64};//128};//,512};
/**
 * Stress test for building {@link IndexSegment}s from {@link BTree}s that
 * were populated by <code>doSplitWithRandomDenseKeySequence</code>. For each
 * source branching factor <code>m</code> in {@link #branchingFactors}, three
 * source btrees are created, sized for <code>m</code>, <code>m*m</code> and
 * <code>m*m*m</code> entries. Each is configured with a bloom filter factory
 * for that many entries, a target error rate of 1/64 and a maximum error
 * rate ten times larger. Every source btree is then handed to
 * {@link #doBuildIndexSegmentAndCompare(BTree)}.
 *
 * @throws Exception
 *             if thrown by the build or the comparison.
 */
public void test_randomDenseKeys() throws Exception {
    final double errorRate = 1 / 64d;
    final double maxErrorRate = errorRate * 10;
    for (final int m : branchingFactors) {
        final int[] entryCounts = { m, m * m, m * m * m };
        for (final int n : entryCounts) {
            final BTree source = getBTree(m, new BloomFilterFactory(n,
                    errorRate, maxErrorRate));
            doBuildIndexSegmentAndCompare(doSplitWithRandomDenseKeySequence(
                    source, m, n));
        }
        // @todo m*m*m*m entries overflows the initial journal extent, so
        // that case is not exercised.
    }
}
/**
 * Stress test for building {@link IndexSegment}s from {@link BTree}s that
 * were populated by <code>doInsertRandomSparseKeySequenceTest</code>. For
 * each source branching factor <code>m</code> in {@link #branchingFactors},
 * that helper is invoked with <code>m</code>, <code>m*m</code> and
 * <code>m*m*m</code> as its second argument (presumably the #of keys to
 * insert) and the resulting btree is handed to
 * {@link #doBuildIndexSegmentAndCompare(BTree)}.
 * <p>
 * Note: the source btrees come from the single-argument
 * <code>getBTree(m)</code> rather than from the overload of this class that
 * accepts a bloom filter factory.
 *
 * @throws Exception
 *             if thrown by the build or the comparison.
 */
public void test_randomSparseKeys() throws Exception {
    final int trace = 0;
    for (final int m : branchingFactors) {
        final int[] sizes = { m, m * m, m * m * m };
        for (final int size : sizes) {
            final BTree source = getBTree(m);
            doBuildIndexSegmentAndCompare(doInsertRandomSparseKeySequenceTest(
                    source, size, trace));
        }
        // @todo m*m*m*m overflows the initial journal extent, so that case
        // is not exercised.
    }
}
/**
 * Test when the input tree holds just three entries (keys 3, 5 and 7) and a
 * branching factor of 3 is used for both the input tree and the generated
 * {@link IndexSegment}; the intent is that both trees consist of a single
 * root leaf.
 * <p>
 * The bloom filter is checked twice: once as held by the
 * {@link IndexSegmentBuilder} after the build and once as obtained from the
 * {@link IndexSegment} loaded from the generated file. The loaded index
 * segment is also compared against the source btree.
 *
 * @throws Exception
 *             if thrown while building, loading or verifying the index
 *             segment.
 */
public void test_rootLeaf() throws Exception {
    // Branching factor for both the input and the output tree.
    final int m = 3;
    // Keys that are inserted and keys that are never inserted.
    final int[] present = { 3, 5, 7 };
    final int[] absent = { 4, 9 };

    final BTree source = getBTree(m, new BloomFilterFactory(100, 1 / 64d,
            1 / 32d));
    for (final int k : present) {
        source.insert(TestKeyBuilder.asSortKey(k), new SimpleEntry(k));
    }

    final File segFile = new File(getName() + "_m" + m + "_bloom.seg");
    if (segFile.exists() && !segFile.delete()) {
        fail("Could not delete old index segment: " + segFile.getAbsoluteFile());
    }
    final File tmpDir = segFile.getAbsoluteFile().getParentFile();

    // Build the index segment with a bloom filter.
    if (log.isInfoEnabled()) {
        log.info("Building index segment (w/ bloom): in(m="
                + source.getBranchingFactor() + ", nentries=" + source.getEntryCount()
                + "), out(m=" + m + ")");
    }
    final long commitTime = System.currentTimeMillis();
    final IndexSegmentBuilder builder = IndexSegmentBuilder.newInstance(
            segFile, tmpDir, source.getEntryCount(), source.rangeIterator(),
            m, source.getIndexMetadata(), commitTime,
            true/* compactingMerge */, bufferNodes);
    // The returned checkpoint is not needed by this test.
    // @see BLZG-1501 (remove LRUNexus)
    builder.call();

    // The bloom filter instance before serialization.
    final IBloomFilter built = builder.bloomFilter;
    // No false negatives: every inserted key must be reported.
    for (final int k : present) {
        assertTrue(Integer.toString(k), built.contains(i2k(k)));
    }
    // Keys that were never inserted are expected to be rejected. A bloom
    // filter may report false positives, so this relies on these keys not
    // colliding.
    for (final int k : absent) {
        assertFalse(Integer.toString(k), built.contains(i2k(k)));
    }

    /*
     * Verify that the index file can be loaded and that the loaded index
     * segment agrees with the source btree.
     */
    if (log.isInfoEnabled()) {
        log.info("Opening index segment w/ bloom filter.");
    }
    final IndexSegment seg = new IndexSegmentStore(segFile).loadIndexSegment();
    try {
        // Verify the total index order.
        if (log.isInfoEnabled()) {
            log.info("Verifying index segments.");
        }
        assertSameBTree(source, seg);

        // The bloom filter instance that was de-serialized.
        final IBloomFilter loaded = seg.getBloomFilter();
        // No false negatives: every inserted key must be reported.
        for (final int k : present) {
            assertTrue(Integer.toString(k), loaded.contains(i2k(k)));
        }
        // Expected rejections (see the caveat on false positives above).
        for (final int k : absent) {
            assertFalse(Integer.toString(k), loaded.contains(i2k(k)));
        }

        // Note: this is a very small index (3 keys) so the cast is safe.
        final int n = (int) source.getEntryCount();
        final byte[][] keys = new byte[n][];
        final byte[][] vals = new byte[n][];
        getKeysAndValues(source, keys, vals);
        doRandomLookupTest("btree", source, keys, vals);
        doRandomLookupTest("w/ bloom", seg, keys, vals);
    } finally {
        if (log.isInfoEnabled()) {
            log.info("Closing index segments.");
        }
        seg.close();
    }

    if (!segFile.delete()) {
        log.warn("Could not delete index segment: " + segFile);
    }
}
/**
 * Test helper builds index segments from the btree using several different
 * output branching factors. For each output branching factor two index
 * segments are built from the same source btree: one with the bloom filter
 * disabled and one with a bloom filter (target error rate 1/64). The bloom
 * filter is checked for false negatives both as held by the builder
 * (pre-serialization) and as read back from the index segment file
 * (post-serialization). Both index segments are compared against the source
 * btree and against one another.
 * <p>
 * The backing store of the source btree is destroyed before this method
 * returns, whether or not the verification succeeded.
 *
 * @param btree
 *            The source btree.
 *
 * @throws RuntimeException
 *             if the source btree has more than {@link Integer#MAX_VALUE}
 *             entries.
 * @throws Exception
 *             if thrown while building or verifying an index segment.
 */
public void doBuildIndexSegmentAndCompare(final BTree btree)
        throws Exception {
    try {
        if (btree.getEntryCount() > Integer.MAX_VALUE) {
            /*
             * This code can not validate a B+Tree with more than MAX_INT keys
             * since it relies on materialization of the data in RAM within
             * arrays and Java does not support int64 array indices.
             */
            throw new RuntimeException(
                    "Can not validate a B+Tree with more than Integer.MAX_VALUE entries: entryCount="
                            + btree.getEntryCount());
        }
        // Branching factors used for the index segment. Named so that it
        // does not shadow the field holding the source branching factors.
        final int[] outputBranchingFactors = new int[] { 3, 4, 5, 10, 20, 60, 100,
                256, 1024, 4096, 8192 };
        for (int i = 0; i < outputBranchingFactors.length; i++) {
            final int m = outputBranchingFactors[i];
            final File outFile = new File(getName() + "_m" + m + ".seg");
            final File outFile2 = new File(getName() + "_m" + m + "_bloom.seg");
            if (outFile.exists() && !outFile.delete()) {
                fail("Could not delete old index segment: " + outFile.getAbsoluteFile());
            }
            if (outFile2.exists() && !outFile2.delete()) {
                fail("Could not delete old index segment: " + outFile2.getAbsoluteFile());
            }
            final File tmpDir = outFile.getAbsoluteFile().getParentFile();
            /*
             * Build the index segments. The same commit time is used for both.
             */
            final long commitTime = System.currentTimeMillis();
            {
                if (log.isInfoEnabled())
                    log.info("Building index segment (w/o bloom): in(m="
                            + btree.getBranchingFactor() + ", nentries=" + btree.getEntryCount()
                            + "), out(m=" + m + ")");
                // Work on a clone so the source btree's metadata is untouched.
                final IndexMetadata metadata = btree.getIndexMetadata().clone();
                metadata.setBloomFilterFactory(null/*disable*/);
                IndexSegmentBuilder.newInstance(outFile, tmpDir, btree.getEntryCount(),
                        btree.rangeIterator(), m, metadata, commitTime,
                        true/*compactingMerge*/, bufferNodes).call();
            }
            final IndexSegmentBuilder builder2;
            {
                if (log.isInfoEnabled())
                    log.info("Building index segment (w/ bloom): in(m="
                            + btree.getBranchingFactor() + ", nentries=" + btree.getEntryCount()
                            + "), out(m=" + m + ")");
                final IndexMetadata metadata = btree.getIndexMetadata().clone();
                /*
                 * Note: Since we know the exact #of index entries in an index
                 * segment, both [n] and [maxP] will be ignored when it comes
                 * time to create the bloom filter for the index segment.
                 */
                metadata.setBloomFilterFactory(new BloomFilterFactory(
                        1/* n */, 1 / 64d/* p */, 1 / 32d/* maxP */));
                builder2 = IndexSegmentBuilder.newInstance(outFile2, tmpDir,
                        btree.getEntryCount(), btree.rangeIterator(), m,
                        metadata, commitTime, true/* compactingMerge */,
                        bufferNodes);
                builder2.call();
            }
            /*
             * Verify that both index files can be loaded. The contents are
             * verified against the source btree further below.
             */
            if (log.isInfoEnabled())
                log.info("Opening index segment w/o bloom filter.");
            final IndexSegment seg = new IndexSegmentStore(outFile).loadIndexSegment();
            if (log.isInfoEnabled())
                log.info("Opening index segment w/ bloom filter.");
            final IndexSegment seg2 = new IndexSegmentStore(outFile2).loadIndexSegment();
            /*
             * Explicitly test the bloom filter against ground truth.
             */
            // Note: cast is safe - we check entryCount above.
            final byte[][] keys = new byte[(int) btree.getEntryCount()][];
            final byte[][] vals = new byte[(int) btree.getEntryCount()][];
            getKeysAndValues(btree, keys, vals);
            /*
             * vet the bloom filter on the index segment builder
             * (pre-serialization).
             */
            doBloomFilterTest("pre-serialization", builder2.bloomFilter, keys);
            /*
             * vet the bloom filter on the loaded index segment
             * (post-serialization).
             */
            doBloomFilterTest("post-serialization", seg2.getBloomFilter(), keys);
            /*
             * Verify index segments against the source btree and against one
             * another.
             */
            if (log.isInfoEnabled())
                log.info("Verifying index segments.");
            assertSameBTree(btree, seg);
            assertSameBTree(btree, seg2);
            // close seg w/ bloom filter and then verify with implicit reopen.
            seg2.close();
            assertSameBTree(seg, seg2);
            if (log.isInfoEnabled())
                log.info("Closing index segments.");
            seg.close();
            seg2.close();
            if (!outFile.delete()) {
                log.warn("Could not delete index segment: " + outFile);
            }
            if (!outFile2.delete()) {
                log.warn("Could not delete index segment: " + outFile2);
            }
        } // build index segment with the next branching factor.
    } finally {
        /*
         * Destroy the backing store of the source btree.
         */
        if (log.isInfoEnabled())
            log.info("Closing journal.");
        btree.getStore().destroy();
    }
}
/**
 * Test the bloom filter for false negatives given the ground truth set of
 * keys. The keys are probed in a random order; if the filter reports that
 * any of them is absent then that is a false negative and the test fails.
 * Bloom filters are not supposed to have false negatives.
 *
 * @param label
 *            A label for the condition under test (only used for logging).
 * @param bloomFilter
 *            The bloom filter under test.
 * @param keys
 *            The ground truth keys that were inserted into the bloom
 *            filter.
 */
protected void doBloomFilterTest(String label, IBloomFilter bloomFilter, byte[][] keys) {
    if (log.isInfoEnabled()) {
        log.info("\ncondition: " + label);
    }
    // Visit the keys in a random order.
    final int[] permutation = getRandomOrder(keys.length);
    int i = 0;
    for (final int index : permutation) {
        final byte[] key = keys[index];
        final boolean reported = bloomFilter.contains(key);
        assertTrue("false negative: i=" + i + ", key="
                + BytesUtil.toString(key), reported);
        i++;
    }
}
}