/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.facebook.presto.orc;

import com.facebook.presto.orc.TupleDomainOrcPredicate.ColumnReference;
import com.facebook.presto.orc.metadata.OrcMetadataReader;
import com.facebook.presto.orc.metadata.statistics.ColumnStatistics;
import com.facebook.presto.orc.metadata.statistics.HiveBloomFilter;
import com.facebook.presto.orc.metadata.statistics.IntegerStatistics;
import com.facebook.presto.orc.proto.OrcProto;
import com.facebook.presto.orc.protobuf.CodedInputStream;
import com.facebook.presto.spi.predicate.Domain;
import com.facebook.presto.spi.predicate.TupleDomain;
import com.facebook.presto.spi.type.Type;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.primitives.Longs;
import io.airlift.slice.Slice;
import org.apache.hive.common.util.BloomFilter;
import org.testng.annotations.Test;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.math.BigDecimal;
import java.sql.Timestamp;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Optional;

import static com.facebook.presto.orc.TupleDomainOrcPredicate.checkInBloomFilter;
import static com.facebook.presto.orc.TupleDomainOrcPredicate.extractDiscreteValues;
import static com.facebook.presto.spi.type.BigintType.BIGINT;
import static com.facebook.presto.spi.type.BooleanType.BOOLEAN;
import static com.facebook.presto.spi.type.DateType.DATE;
import static com.facebook.presto.spi.type.DoubleType.DOUBLE;
import static com.facebook.presto.spi.type.IntegerType.INTEGER;
import static com.facebook.presto.spi.type.VarbinaryType.VARBINARY;
import static com.facebook.presto.spi.type.VarcharType.VARCHAR;
import static io.airlift.slice.Slices.utf8Slice;
import static io.airlift.slice.Slices.wrappedBuffer;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;
import static org.testng.Assert.fail;

/**
 * Tests for bloom filter handling in the ORC reader: round-tripping filters
 * through {@link HiveBloomFilter} and the ORC protobuf index, probing filters
 * with typed predicate values, and stripe pruning via
 * {@link TupleDomainOrcPredicate#matches}.
 */
public class TestOrcBloomFilters
{
    private static final String TEST_STRING = "ORC_STRING";
    private static final String TEST_STRING_NOT_WRITTEN = "ORC_STRING_not";
    private static final int TEST_INTEGER = 12345;
    private static final String COLUMN_0 = "bigint_0";
    private static final String COLUMN_1 = "bigint_1";

    // Predicate values keyed by the native Java object, mapped to the SQL type used to probe the filter
    private static final Map<Object, Type> TEST_VALUES = ImmutableMap.<Object, Type>builder()
            .put(utf8Slice(TEST_STRING), VARCHAR)
            .put(wrappedBuffer(new byte[] {12, 34, 56}), VARBINARY)
            .put(4312L, BIGINT)
            .put(123, INTEGER)
            .put(234.567, DOUBLE)
            .build();

    /**
     * A {@link HiveBloomFilter} rebuilt from the bit set, bit size and hash
     * function count of a Hive {@link BloomFilter} must answer membership
     * queries the same way as the filter it was built from.
     */
    @Test
    public void testHiveBloomFilterSerde()
            throws Exception
    {
        BloomFilter bloomFilter = new BloomFilter(1_000_000L, 0.05);

        // String
        bloomFilter.addString(TEST_STRING);
        assertTrue(bloomFilter.testString(TEST_STRING));
        assertFalse(bloomFilter.testString(TEST_STRING_NOT_WRITTEN));

        // Integer
        bloomFilter.addLong(TEST_INTEGER);
        assertTrue(bloomFilter.testLong(TEST_INTEGER));
        assertFalse(bloomFilter.testLong(TEST_INTEGER + 1));

        // Re-construct
        HiveBloomFilter hiveBloomFilter = new HiveBloomFilter(
                ImmutableList.copyOf(Longs.asList(bloomFilter.getBitSet())),
                bloomFilter.getBitSize(),
                bloomFilter.getNumHashFunctions());

        // String
        assertTrue(hiveBloomFilter.testString(TEST_STRING));
        assertFalse(hiveBloomFilter.testString(TEST_STRING_NOT_WRITTEN));

        // Integer
        assertTrue(hiveBloomFilter.testLong(TEST_INTEGER));
        assertFalse(hiveBloomFilter.testLong(TEST_INTEGER + 1));
    }

    /**
     * Serializes a bloom filter into an ORC {@code BloomFilterIndex} and reads
     * it back twice: once through {@link OrcMetadataReader} and once by parsing
     * the protobuf message directly.
     */
    @Test
    public void testOrcHiveBloomFilterSerde()
            throws Exception
    {
        BloomFilter bloomFilterWrite = new BloomFilter(1000L, 0.05);

        bloomFilterWrite.addString(TEST_STRING);
        assertTrue(bloomFilterWrite.testString(TEST_STRING));

        OrcProto.BloomFilter bloomFilter = toOrcBloomFilter(bloomFilterWrite);
        OrcProto.BloomFilterIndex bloomFilterIndex = OrcProto.BloomFilterIndex.getDefaultInstance();
        byte[] bytes = serializeBloomFilterToIndex(bloomFilter, bloomFilterIndex);

        // Read through method
        InputStream inputStream = new ByteArrayInputStream(bytes);
        OrcMetadataReader metadataReader = new OrcMetadataReader();
        List<HiveBloomFilter> bloomFilters = metadataReader.readBloomFilterIndexes(inputStream);

        assertEquals(bloomFilters.size(), 1);
        HiveBloomFilter bloomFilterFromReader = bloomFilters.get(0);

        assertTrue(bloomFilterFromReader.testString(TEST_STRING));
        assertFalse(bloomFilterFromReader.testString(TEST_STRING_NOT_WRITTEN));

        // TestNG argument order is (actual, expected)
        assertEquals(bloomFilterFromReader.getBitSize(), bloomFilterWrite.getBitSize());
        assertEquals(bloomFilterFromReader.getNumHashFunctions(), bloomFilterWrite.getNumHashFunctions());

        // Validate bit set
        assertTrue(Arrays.equals(bloomFilterFromReader.getBitSet(), bloomFilterWrite.getBitSet()));

        // Read directly: allows better inspection of the bit sets (helped to fix a lot of bugs)
        CodedInputStream input = CodedInputStream.newInstance(bytes);
        OrcProto.BloomFilterIndex deserializedBloomFilterIndex = OrcProto.BloomFilterIndex.parseFrom(input);
        List<OrcProto.BloomFilter> bloomFilterList = deserializedBloomFilterIndex.getBloomFilterList();
        assertEquals(bloomFilterList.size(), 1);

        OrcProto.BloomFilter bloomFilterRead = bloomFilterList.get(0);

        // Validate contents of ORC bloom filter bit set
        assertTrue(Arrays.equals(Longs.toArray(bloomFilterRead.getBitsetList()), bloomFilterWrite.getBitSet()));

        // hash functions
        assertEquals(bloomFilterRead.getNumHashFunctions(), bloomFilterWrite.getNumHashFunctions());

        // number of 64-bit words in the bit set
        assertEquals(bloomFilterRead.getBitsetCount(), bloomFilterWrite.getBitSet().length);
    }

    /**
     * Appends {@code bloomFilter} to a builder derived from
     * {@code bloomFilterIndex} and returns the serialized bytes of the
     * resulting index. Asserts that the built index holds exactly one filter,
     * so the supplied index is expected to contain none.
     */
    private static byte[] serializeBloomFilterToIndex(OrcProto.BloomFilter bloomFilter, OrcProto.BloomFilterIndex bloomFilterIndex)
            throws IOException
    {
        assertTrue(bloomFilter.isInitialized());

        OrcProto.BloomFilterIndex.Builder builder = bloomFilterIndex.toBuilder();
        builder.addBloomFilter(bloomFilter);

        OrcProto.BloomFilterIndex index = builder.build();
        assertTrue(index.isInitialized());
        assertEquals(index.getBloomFilterCount(), 1);

        ByteArrayOutputStream os = new ByteArrayOutputStream();
        index.writeTo(os);
        return os.toByteArray();
    }

    /**
     * Builds an ORC protobuf bloom filter carrying the bit set and hash
     * function count of {@code bloomFilter}.
     */
    private static OrcProto.BloomFilter toOrcBloomFilter(BloomFilter bloomFilter)
    {
        OrcProto.BloomFilter.Builder builder = OrcProto.BloomFilter.newBuilder();
        builder.addAllBitset(Longs.asList(bloomFilter.getBitSet()));
        builder.setNumHashFunctions(bloomFilter.getNumHashFunctions());
        return builder.build();
    }

    /**
     * Every value written to the bloom filter must be reported as present when
     * probed with its SQL type.
     */
    @Test
    public void testBloomFilterPredicateValuesExisting()
            throws Exception
    {
        BloomFilter bloomFilter = new BloomFilter(TEST_VALUES.size() * 10, 0.01);

        for (Object o : TEST_VALUES.keySet()) {
            addValue(bloomFilter, o);
        }

        for (Map.Entry<Object, Type> testValue : TEST_VALUES.entrySet()) {
            boolean matched = checkInBloomFilter(bloomFilter, testValue.getKey(), testValue.getValue());
            // report the class of the probed value, not of the map entry wrapping it
            assertTrue(matched, "type " + testValue.getKey().getClass());
        }

        // test unsupported type: can be supported by ORC but is not implemented yet
        assertTrue(checkInBloomFilter(bloomFilter, new Date(), DATE), "unsupported type DATE should always return true");
    }

    /**
     * Adds {@code o} to {@code bloomFilter} using the add method that matches
     * its runtime class; fails the test for a class with no mapping.
     */
    private static void addValue(BloomFilter bloomFilter, Object o)
    {
        if (o instanceof Long) {
            bloomFilter.addLong((Long) o);
        }
        else if (o instanceof Integer) {
            bloomFilter.addLong((Integer) o);
        }
        else if (o instanceof String) {
            bloomFilter.addString((String) o);
        }
        else if (o instanceof BigDecimal) {
            bloomFilter.addString(o.toString());
        }
        else if (o instanceof Slice) {
            bloomFilter.addString(((Slice) o).toStringUtf8());
        }
        else if (o instanceof Timestamp) {
            bloomFilter.addLong(((Timestamp) o).getTime());
        }
        else if (o instanceof Double) {
            bloomFilter.addDouble((Double) o);
        }
        else {
            fail("Unsupported type " + o.getClass());
        }
    }

    /**
     * Probing a bloom filter to which nothing was added must report every
     * supported value as absent.
     */
    @Test
    public void testBloomFilterPredicateValuesNonExisting()
            throws Exception
    {
        BloomFilter bloomFilter = new BloomFilter(TEST_VALUES.size() * 10, 0.01);

        for (Map.Entry<Object, Type> testValue : TEST_VALUES.entrySet()) {
            boolean matched = checkInBloomFilter(bloomFilter, testValue.getKey(), testValue.getValue());
            assertFalse(matched, "type " + testValue.getKey().getClass());
        }

        // test unsupported type: can be supported by ORC but is not implemented yet
        assertTrue(checkInBloomFilter(bloomFilter, new Date(), DATE), "unsupported type DATE should always return true");
    }

    /**
     * A single-value domain must yield exactly that one value from
     * {@code extractDiscreteValues}.
     */
    @Test
    public void testExtractValuesFromSingleDomain()
            throws Exception
    {
        Map<Type, Object> testValues = ImmutableMap.<Type, Object>builder()
                .put(BOOLEAN, true)
                .put(INTEGER, 1234L)
                .put(BIGINT, 4321L)
                .put(DOUBLE, 0.123)
                .put(VARCHAR, utf8Slice(TEST_STRING))
                .build();

        for (Map.Entry<Type, Object> testValue : testValues.entrySet()) {
            Domain predicateDomain = Domain.singleValue(testValue.getKey(), testValue.getValue());
            Optional<Collection<Object>> discreteValues = extractDiscreteValues(predicateDomain.getValues());
            assertTrue(discreteValues.isPresent());

            Collection<Object> objects = discreteValues.get();
            assertEquals(objects.size(), 1);
            assertEquals(objects.iterator().next(), testValue.getValue());
        }
    }

    /**
     * Simulates a query over two columns where only the first appears in the
     * WHERE clause, evaluated against statistics with a matching bloom filter,
     * with an empty bloom filter, and without any bloom filter.
     */
    @Test
    public void testMatches()
            throws Exception
    {
        // stripe column
        Domain testingColumnHandleDomain = Domain.singleValue(BIGINT, 1234L);
        TupleDomain.ColumnDomain<String> column0 = new TupleDomain.ColumnDomain<>(COLUMN_0, testingColumnHandleDomain);

        // the predicate consists of bigint_0 = 1234
        TupleDomain<String> effectivePredicate = TupleDomain.fromColumnDomains(Optional.of(ImmutableList.of(column0)));
        TupleDomain<String> emptyEffectivePredicate = TupleDomain.all();

        // predicate column references
        List<ColumnReference<String>> columnReferences = ImmutableList.<ColumnReference<String>>builder()
                .add(new ColumnReference<>(COLUMN_0, 0, BIGINT))
                .add(new ColumnReference<>(COLUMN_1, 1, BIGINT))
                .build();

        TupleDomainOrcPredicate<String> predicate = new TupleDomainOrcPredicate<>(effectivePredicate, columnReferences, true);
        TupleDomainOrcPredicate<String> emptyPredicate = new TupleDomainOrcPredicate<>(emptyEffectivePredicate, columnReferences, true);

        // assemble a matching and a non-matching bloom filter;
        // the empty snapshot must be taken before the value is added
        HiveBloomFilter hiveBloomFilter = new HiveBloomFilter(new BloomFilter(1000, 0.01));
        OrcProto.BloomFilter emptyOrcBloomFilter = toOrcBloomFilter(hiveBloomFilter);
        hiveBloomFilter.addLong(1234);
        OrcProto.BloomFilter orcBloomFilter = toOrcBloomFilter(hiveBloomFilter);

        Map<Integer, ColumnStatistics> matchingStatisticsByColumnIndex =
                ImmutableMap.of(0, integerStatistics(toHiveBloomFilter(orcBloomFilter)));
        Map<Integer, ColumnStatistics> nonMatchingStatisticsByColumnIndex =
                ImmutableMap.of(0, integerStatistics(toHiveBloomFilter(emptyOrcBloomFilter)));
        Map<Integer, ColumnStatistics> withoutBloomFilterStatisticsByColumnIndex =
                ImmutableMap.of(0, integerStatistics(null));

        assertTrue(predicate.matches(1L, matchingStatisticsByColumnIndex));
        assertTrue(predicate.matches(1L, withoutBloomFilterStatisticsByColumnIndex));
        assertFalse(predicate.matches(1L, nonMatchingStatisticsByColumnIndex));
        assertTrue(emptyPredicate.matches(1L, matchingStatisticsByColumnIndex));
    }

    /**
     * Creates column statistics holding integer statistics of (10, 2000) and
     * the given bloom filter, which may be {@code null} to model statistics
     * without a bloom filter; all other constructor arguments are {@code null}.
     */
    private static ColumnStatistics integerStatistics(HiveBloomFilter bloomFilter)
    {
        return new ColumnStatistics(
                null,
                null,
                new IntegerStatistics(10L, 2000L),
                null,
                null,
                null,
                null,
                bloomFilter);
    }

    /**
     * Converts an ORC protobuf bloom filter into a {@link HiveBloomFilter},
     * deriving the bit size as 64 bits per entry of the protobuf bit set.
     */
    private static HiveBloomFilter toHiveBloomFilter(OrcProto.BloomFilter orcBloomFilter)
    {
        return new HiveBloomFilter(
                orcBloomFilter.getBitsetList(),
                orcBloomFilter.getBitsetCount() * 64,
                orcBloomFilter.getNumHashFunctions());
    }
}