/*
 *
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.regionserver;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.TreeSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.*;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.io.hfile.BlockCache;
import org.apache.hadoop.hbase.io.hfile.CacheConfig;
import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.NoOpDataBlockEncoder;
import org.apache.hadoop.hbase.io.hfile.TestHFileWriterV2;
import org.apache.hadoop.hbase.regionserver.StoreFile.BloomType;
import org.apache.hadoop.hbase.util.BloomFilterFactory;
import org.apache.hadoop.hbase.util.ByteBloomFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.CompoundBloomFilter;
import org.apache.hadoop.hbase.util.CompoundBloomFilterBase;
import org.apache.hadoop.hbase.util.CompoundBloomFilterWriter;
import org.junit.Before;
import org.junit.Test;
import org.junit.experimental.categories.Category;

/**
 * Tests writing Bloom filter blocks in the same part of the file as data
 * blocks.
 */
@Category(MediumTests.class)
public class TestCompoundBloomFilter {

  private static final HBaseTestingUtility TEST_UTIL =
      new HBaseTestingUtility();

  private static final Log LOG =
      LogFactory.getLog(TestCompoundBloomFilter.class);

  private static final int NUM_TESTS = 9;
  private static final BloomType BLOOM_TYPES[] = { BloomType.ROW,
      BloomType.ROW, BloomType.ROWCOL, BloomType.ROWCOL, BloomType.ROW,
      BloomType.ROWCOL, BloomType.ROWCOL, BloomType.ROWCOL, BloomType.ROW };

  private static final int NUM_KV[];
  static {
    final int N = 10000; // Only used in initialization.
    NUM_KV = new int[] { 21870, N, N, N, N, 1000, N, 7500, 7500 };
    assert NUM_KV.length == NUM_TESTS;
  }

  private static final int BLOCK_SIZES[];
  static {
    final int blkSize = 65536;
    BLOCK_SIZES = new int[] { 512, 1000, blkSize, blkSize, blkSize, 128, 300,
        blkSize, blkSize };
    assert BLOCK_SIZES.length == NUM_TESTS;
  }

  /**
   * Be careful not to specify too high a Bloom filter block size, otherwise
   * there will only be one oversized chunk and the observed false positive
   * rate will be too low.
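   *
   * As a rough illustration using standard Bloom filter sizing math (the
   * numbers here are examples, not taken from the test): at a 1% target
   * error rate a Bloom filter needs about -ln(0.01) / (ln 2)^2 ~ 9.6 bits
   * per key, so a 4096-byte (32768-bit) chunk is "full" at roughly 3400
   * keys. Feeding far fewer keys than that into each chunk leaves every
   * chunk over-allocated and pushes the observed false positive rate below
   * the target.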
   */
  private static final int BLOOM_BLOCK_SIZES[] = { 1000, 4096, 4096, 4096,
      8192, 128, 1024, 600, 600 };
  static {
    assert BLOOM_BLOCK_SIZES.length == NUM_TESTS;
  }

  private static final double TARGET_ERROR_RATES[] = { 0.025, 0.01, 0.015,
      0.01, 0.03, 0.01, 0.01, 0.07, 0.07 };
  static {
    assert TARGET_ERROR_RATES.length == NUM_TESTS;
  }

  /** A false positive rate that is obviously too high. */
  private static final double TOO_HIGH_ERROR_RATE;
  static {
    double m = 0;
    for (double errorRate : TARGET_ERROR_RATES)
      m = Math.max(m, errorRate);
    TOO_HIGH_ERROR_RATE = m + 0.03;
  }

  private static Configuration conf;
  private static CacheConfig cacheConf;
  private FileSystem fs;
  private BlockCache blockCache;

  /** A message of the form "in test#<number>:" to include in logging. */
  private String testIdMsg;

  private static final int GENERATION_SEED = 2319;
  private static final int EVALUATION_SEED = 135;

  @Before
  public void setUp() throws IOException {
    conf = TEST_UTIL.getConfiguration();

    // This test requires the most recent HFile format (i.e. v2).
    conf.setInt(HFile.FORMAT_VERSION_KEY, HFile.MAX_FORMAT_VERSION);

    fs = FileSystem.get(conf);

    cacheConf = new CacheConfig(conf);
    blockCache = cacheConf.getBlockCache();
    assertNotNull(blockCache);
  }

  private List<KeyValue> createSortedKeyValues(Random rand, int n) {
    List<KeyValue> kvList = new ArrayList<KeyValue>(n);
    for (int i = 0; i < n; ++i)
      kvList.add(TestHFileWriterV2.randomKeyValue(rand));
    Collections.sort(kvList, KeyValue.COMPARATOR);
    return kvList;
  }

  @Test
  public void testCompoundBloomFilter() throws IOException {
    conf.setBoolean(BloomFilterFactory.IO_STOREFILE_BLOOM_ENABLED, true);
    for (int t = 0; t < NUM_TESTS; ++t) {
      conf.setFloat(BloomFilterFactory.IO_STOREFILE_BLOOM_ERROR_RATE,
          (float) TARGET_ERROR_RATES[t]);

      testIdMsg = "in test #" + t + ":";
      Random generationRand = new Random(GENERATION_SEED);
      List<KeyValue> kvs = createSortedKeyValues(generationRand, NUM_KV[t]);
      BloomType bt = BLOOM_TYPES[t];
      Path sfPath = writeStoreFile(t, bt, kvs);
      readStoreFile(t, bt, kvs, sfPath);
    }
  }

  /**
   * Validates the false positive ratio by computing its z-value and comparing
   * it to the provided threshold.
   *
   * @param falsePosRate experimental false positive rate
   * @param nTrials the number of Bloom filter checks
   * @param zValueBoundary z-value boundary, positive for an upper bound and
   *          negative for a lower bound
   * @param cbf the compound Bloom filter we are using
   * @param additionalMsg additional message to include in log output and
   *          assertion failures
   */
  private void validateFalsePosRate(double falsePosRate, int nTrials,
      double zValueBoundary, CompoundBloomFilter cbf, String additionalMsg) {
    double p = BloomFilterFactory.getErrorRate(conf);
    double zValue = (falsePosRate - p) / Math.sqrt(p * (1 - p) / nTrials);

    String assortedStatsStr = " (targetErrorRate=" + p + ", falsePosRate="
        + falsePosRate + ", nTrials=" + nTrials + ")";
    LOG.info("z-value is " + zValue + assortedStatsStr);

    boolean isUpperBound = zValueBoundary > 0;

    if (isUpperBound && zValue > zValueBoundary ||
        !isUpperBound && zValue < zValueBoundary) {
      String errorMsg = "False positive rate z-value " + zValue + " is "
          + (isUpperBound ? "higher" : "lower") + " than " + zValueBoundary
          + assortedStatsStr + ". Per-chunk stats:\n"
          + cbf.formatTestingStats();
      fail(errorMsg + additionalMsg);
    }
  }
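
  /*
   * A sanity check of the z-test above with illustrative numbers (these are
   * examples, not values from any particular test case): with a target error
   * rate p = 0.01 and nTrials = 100,000 queries, the standard deviation of
   * the observed false positive rate is sqrt(0.01 * 0.99 / 100000) ~ 3.1e-4,
   * so an upper boundary of z = 1.96 only fails observed rates above roughly
   * 0.01 + 1.96 * 3.1e-4 ~ 0.0106.
   */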
  private void readStoreFile(int t, BloomType bt, List<KeyValue> kvs,
      Path sfPath) throws IOException {
    StoreFile sf = new StoreFile(fs, sfPath, conf, cacheConf, bt,
        NoOpDataBlockEncoder.INSTANCE);
    StoreFile.Reader r = sf.createReader();
    final boolean pread = true; // does not really matter
    StoreFileScanner scanner = r.getStoreFileScanner(true, pread);

    {
      // Test for false negatives (not allowed).
      int numChecked = 0;
      for (KeyValue kv : kvs) {
        byte[] row = kv.getRow();
        boolean present = isInBloom(scanner, row, kv.getQualifier());
        assertTrue(testIdMsg + " Bloom filter false negative on row "
            + Bytes.toStringBinary(row) + " after " + numChecked
            + " successful checks", present);
        ++numChecked;
      }
    }

    // Test for false positives (some percentage allowed). We test in two
    // modes: "fake lookup", which ignores the key distribution, and
    // production mode.
    for (boolean fakeLookupEnabled : new boolean[] { true, false }) {
      ByteBloomFilter.setFakeLookupMode(fakeLookupEnabled);
      try {
        String fakeLookupModeStr = ", fake lookup is "
            + (fakeLookupEnabled ? "enabled" : "disabled");
        CompoundBloomFilter cbf =
            (CompoundBloomFilter) r.getGeneralBloomFilter();
        cbf.enableTestingStats();
        int numFalsePos = 0;
        Random rand = new Random(EVALUATION_SEED);
        int nTrials = NUM_KV[t] * 10;
        for (int i = 0; i < nTrials; ++i) {
          byte[] query = TestHFileWriterV2.randomRowOrQualifier(rand);
          if (isInBloom(scanner, query, bt, rand)) {
            numFalsePos += 1;
          }
        }
        double falsePosRate = numFalsePos * 1.0 / nTrials;
        LOG.debug(String.format(testIdMsg
            + " False positives: %d out of %d (%f)", numFalsePos, nTrials,
            falsePosRate) + fakeLookupModeStr);

        // Check for obvious Bloom filter crashes.
        assertTrue("False positive is too high: " + falsePosRate
            + " (greater than " + TOO_HIGH_ERROR_RATE + ")"
            + fakeLookupModeStr, falsePosRate < TOO_HIGH_ERROR_RATE);

        // Now a more precise check to see if the false positive rate is not
        // too high. The reason we use a relaxed restriction for the
        // real-world case as opposed to the "fake lookup" case is that our
        // hash functions are not completely independent.
        double maxZValue = fakeLookupEnabled ? 1.96 : 2.5;
        validateFalsePosRate(falsePosRate, nTrials, maxZValue, cbf,
            fakeLookupModeStr);

        // For checking the lower bound we need to eliminate the last chunk,
        // because it is frequently smaller and the false positive rate in it
        // is too low. This does not help if there is only one under-sized
        // chunk, though.
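        // (A lower-bound z-value of -2.58 corresponds to roughly the 0.5th
        // percentile of the standard normal distribution, so the check below
        // only fails when the observed false positive rate is suspiciously
        // low, i.e. the filter is likely over-allocated.)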
        int nChunks = cbf.getNumChunks();
        if (nChunks > 1) {
          numFalsePos -= cbf.getNumPositivesForTesting(nChunks - 1);
          nTrials -= cbf.getNumQueriesForTesting(nChunks - 1);
          falsePosRate = numFalsePos * 1.0 / nTrials;
          LOG.info(testIdMsg + " False positive rate without last chunk is "
              + falsePosRate + fakeLookupModeStr);
        }

        validateFalsePosRate(falsePosRate, nTrials, -2.58, cbf,
            fakeLookupModeStr);
      } finally {
        ByteBloomFilter.setFakeLookupMode(false);
      }
    }

    r.close(true); // end of test so evictOnClose
  }

  private boolean isInBloom(StoreFileScanner scanner, byte[] row, BloomType bt,
      Random rand) {
    return isInBloom(scanner, row,
        TestHFileWriterV2.randomRowOrQualifier(rand));
  }

  private boolean isInBloom(StoreFileScanner scanner, byte[] row,
      byte[] qualifier) {
    Scan scan = new Scan(row, row);
    TreeSet<byte[]> columns = new TreeSet<byte[]>(Bytes.BYTES_COMPARATOR);
    columns.add(qualifier);
    return scanner.shouldUseScanner(scan, columns, Long.MIN_VALUE);
  }

  private Path writeStoreFile(int t, BloomType bt, List<KeyValue> kvs)
      throws IOException {
    conf.setInt(BloomFilterFactory.IO_STOREFILE_BLOOM_BLOCK_SIZE,
        BLOOM_BLOCK_SIZES[t]);
    conf.setBoolean(CacheConfig.CACHE_BLOCKS_ON_WRITE_KEY, true);
    cacheConf = new CacheConfig(conf);

    StoreFile.Writer w = new StoreFile.WriterBuilder(conf, cacheConf, fs,
        BLOCK_SIZES[t])
        .withOutputDir(TEST_UTIL.getDataTestDir())
        .withBloomType(bt)
        .withChecksumType(HFile.DEFAULT_CHECKSUM_TYPE)
        .withBytesPerChecksum(HFile.DEFAULT_BYTES_PER_CHECKSUM)
        .build();

    assertTrue(w.hasGeneralBloom());
    assertTrue(w.getGeneralBloomWriter() instanceof CompoundBloomFilterWriter);
    CompoundBloomFilterWriter cbbf =
        (CompoundBloomFilterWriter) w.getGeneralBloomWriter();

    int keyCount = 0;
    KeyValue prev = null;
    LOG.debug("Total keys/values to insert: " + kvs.size());
    for (KeyValue kv : kvs) {
      w.append(kv);

      // Validate the key count in the Bloom filter.
      boolean newKey = true;
      if (prev != null) {
        newKey = !(bt == BloomType.ROW
            ? KeyValue.COMPARATOR.matchingRows(kv, prev)
            : KeyValue.COMPARATOR.matchingRowColumn(kv, prev));
      }
      if (newKey)
        ++keyCount;
      assertEquals(keyCount, cbbf.getKeyCount());

      prev = kv;
    }
    w.close();

    return w.getPath();
  }

  @Test
  public void testCompoundBloomSizing() {
    int bloomBlockByteSize = 4096;
    int bloomBlockBitSize = bloomBlockByteSize * 8;
    double targetErrorRate = 0.01;
    // The maximum number of keys a chunk of this size can hold at the target
    // error rate.
    long maxKeysPerChunk = ByteBloomFilter.idealMaxKeys(bloomBlockBitSize,
        targetErrorRate);

    // Round-tripping through the key count should nearly reproduce the
    // original chunk size; the tiny shortfall comes from rounding the key
    // count down to an integer.
    long bloomSize1 = bloomBlockByteSize * 8;
    long bloomSize2 = ByteBloomFilter.computeBitSize(maxKeysPerChunk,
        targetErrorRate);

    double bloomSizeRatio = (bloomSize2 * 1.0 / bloomSize1);
    assertTrue(Math.abs(bloomSizeRatio - 0.9999) < 0.0001);
  }

  @Test
  public void testCreateKey() {
    CompoundBloomFilterBase cbfb = new CompoundBloomFilterBase();
    byte[] row = "myRow".getBytes();
    byte[] qualifier = "myQualifier".getBytes();
    // A row-only Bloom key is created with a zero-length qualifier.
    byte[] rowKey = cbfb.createBloomKey(row, 0, row.length, row, 0, 0);
    byte[] rowColKey = cbfb.createBloomKey(row, 0, row.length, qualifier, 0,
        qualifier.length);
    KeyValue rowKV = KeyValue.createKeyValueFromKey(rowKey);
    KeyValue rowColKV = KeyValue.createKeyValueFromKey(rowColKey);
    assertEquals(rowKV.getTimestamp(), rowColKV.getTimestamp());
    assertEquals(Bytes.toStringBinary(rowKV.getRow()),
        Bytes.toStringBinary(rowColKV.getRow()));
    assertEquals(0, rowKV.getQualifier().length);
  }
}