/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.operator.aggregation;
import com.facebook.presto.metadata.MetadataManager;
import com.facebook.presto.spi.Page;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.block.BlockBuilder;
import com.facebook.presto.spi.block.BlockBuilderStatus;
import com.facebook.presto.spi.type.Type;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import io.airlift.slice.Slice;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import static com.facebook.presto.spi.type.DoubleType.DOUBLE;
import static io.airlift.testing.Assertions.assertLessThan;
import static org.testng.Assert.assertEquals;
/**
 * Shared test cases for approximate count-distinct aggregation functions.
 *
 * <p>Subclasses supply the aggregation function under test, the type of the
 * values it consumes, and a generator of random values of that type. Every
 * test is run once per maximum standard error returned by
 * {@link #provideStandardErrors()}.
 */
public abstract class AbstractTestApproximateCountDistinct
{
    /** @return the aggregation function under test */
    public abstract InternalAggregationFunction getAggregationFunction();

    /** @return the type of the values fed to the aggregation */
    public abstract Type getValueType();

    /**
     * @return a random value whose Java representation matches
     * {@link #getValueType()}; see {@link #createBlock} for the supported representations
     */
    public abstract Object randomValue();

    protected static final MetadataManager metadata = MetadataManager.createTestMetadataManager();

    /** Supplies the maximum standard errors every test is executed with. */
    @DataProvider(name = "provideStandardErrors")
    public Object[][] provideStandardErrors()
    {
        return new Object[][] {
                { 0.0230 }, // 2k buckets
                { 0.0115 }, // 8k buckets
        };
    }

    /** An empty input must produce a count of zero. */
    @Test(dataProvider = "provideStandardErrors")
    public void testNoPositions(double maxStandardError)
            throws Exception
    {
        assertCount(ImmutableList.of(), maxStandardError, 0);
    }

    /** A single value must produce a count of exactly one. */
    @Test(dataProvider = "provideStandardErrors")
    public void testSinglePosition(double maxStandardError)
            throws Exception
    {
        assertCount(ImmutableList.of(randomValue()), maxStandardError, 1);
    }

    /** Nulls must not contribute to the count. */
    @Test(dataProvider = "provideStandardErrors")
    public void testAllPositionsNull(double maxStandardError)
            throws Exception
    {
        assertCount(Collections.nCopies(100, null), maxStandardError, 0);
    }

    /**
     * Interleaving nulls with the values must yield the same estimate as the
     * values alone.
     */
    @Test(dataProvider = "provideStandardErrors")
    public void testMixedNullsAndNonNulls(double maxStandardError)
            throws Exception
    {
        List<Object> baseline = createRandomSample(10000, 15000);

        // Randomly insert nulls
        // We need to retain the preexisting order to ensure that the HLL can generate the same estimates.
        Iterator<Object> iterator = baseline.iterator();
        List<Object> mixed = new ArrayList<>();
        while (iterator.hasNext()) {
            mixed.add(ThreadLocalRandom.current().nextBoolean() ? null : iterator.next());
        }

        assertCount(mixed, maxStandardError, estimateGroupByCount(baseline, maxStandardError));
    }

    /**
     * Checks the distribution of the relative estimation error over 500 random
     * samples: its mean must be within 1e-2 of zero and its standard deviation
     * within 1e-2 of {@code maxStandardError}.
     */
    @Test(dataProvider = "provideStandardErrors")
    public void testMultiplePositions(double maxStandardError)
            throws Exception
    {
        DescriptiveStatistics stats = new DescriptiveStatistics();

        for (int i = 0; i < 500; ++i) {
            int uniques = ThreadLocalRandom.current().nextInt(20000) + 1;

            List<Object> values = createRandomSample(uniques, (int) (uniques * 1.5));

            long actual = estimateGroupByCount(values, maxStandardError);
            double error = (actual - uniques) * 1.0 / uniques;

            stats.addValue(error);
        }

        // Compare the magnitude of the mean: a signed comparison would accept
        // an arbitrarily large negative bias (systematic under-counting).
        assertLessThan(Math.abs(stats.getMean()), 1.0e-2);
        assertLessThan(Math.abs(stats.getStandardDeviation() - maxStandardError), 1.0e-2);
    }

    /**
     * The partial aggregation path must produce the same estimate as the
     * grouped aggregation path for identical input.
     */
    @Test(dataProvider = "provideStandardErrors")
    public void testMultiplePositionsPartial(double maxStandardError)
            throws Exception
    {
        for (int i = 0; i < 100; ++i) {
            int uniques = ThreadLocalRandom.current().nextInt(20000) + 1;
            List<Object> values = createRandomSample(uniques, (int) (uniques * 1.5));
            assertEquals(estimateCountPartial(values, maxStandardError), estimateGroupByCount(values, maxStandardError));
        }
    }

    /**
     * Asserts that the plain and partial aggregation paths, and for non-empty
     * input also the grouped path, report {@code expectedCount}.
     */
    private void assertCount(List<Object> values, double maxStandardError, long expectedCount)
    {
        // NOTE(review): the grouped path is skipped for empty input, presumably
        // because a grouped aggregation over no rows has no group to read a
        // result from - confirm against AggregationTestUtils.
        if (!values.isEmpty()) {
            assertEquals(estimateGroupByCount(values, maxStandardError), expectedCount);
        }
        assertEquals(estimateCount(values, maxStandardError), expectedCount);
        assertEquals(estimateCountPartial(values, maxStandardError), expectedCount);
    }

    /** Runs the function through {@code AggregationTestUtils.groupedAggregation}. */
    private long estimateGroupByCount(List<Object> values, double maxStandardError)
    {
        Object result = AggregationTestUtils.groupedAggregation(getAggregationFunction(), createPage(values, maxStandardError));
        return (long) result;
    }

    /** Runs the function through {@code AggregationTestUtils.aggregation}. */
    private long estimateCount(List<Object> values, double maxStandardError)
    {
        Object result = AggregationTestUtils.aggregation(getAggregationFunction(), createPage(values, maxStandardError));
        return (long) result;
    }

    /** Runs the function through {@code AggregationTestUtils.partialAggregation}. */
    private long estimateCountPartial(List<Object> values, double maxStandardError)
    {
        Object result = AggregationTestUtils.partialAggregation(getAggregationFunction(), createPage(values, maxStandardError));
        return (long) result;
    }

    /**
     * Builds the input page: one block holding {@code values} and one DOUBLE
     * block repeating {@code maxStandardError} for every position. An empty
     * list yields a page with zero positions and no blocks.
     */
    private Page createPage(List<Object> values, double maxStandardError)
    {
        if (values.isEmpty()) {
            return new Page(0);
        }
        else {
            return new Page(values.size(),
                    createBlock(getValueType(), values),
                    createBlock(DOUBLE, ImmutableList.copyOf(Collections.nCopies(values.size(), maxStandardError))));
        }
    }

    /**
     * Produce a block of the given type containing the given values, in order.
     * A {@code null} element is appended as a null position; every other
     * element is written according to the type's Java representation
     * ({@code boolean}, {@code long}, {@code double} or {@link Slice}).
     *
     * @throws UnsupportedOperationException if a non-null value is encountered
     * and the type's Java representation is none of the above
     */
    private static Block createBlock(Type type, List<Object> values)
    {
        BlockBuilder blockBuilder = type.createBlockBuilder(new BlockBuilderStatus(), values.size());

        // The Java representation is looked up once for the whole block.
        Class<?> javaType = type.getJavaType();

        for (Object value : values) {
            if (value == null) {
                blockBuilder.appendNull();
            }
            else if (javaType == boolean.class) {
                type.writeBoolean(blockBuilder, (Boolean) value);
            }
            else if (javaType == long.class) {
                type.writeLong(blockBuilder, (Long) value);
            }
            else if (javaType == double.class) {
                type.writeDouble(blockBuilder, (Double) value);
            }
            else if (javaType == Slice.class) {
                Slice slice = (Slice) value;
                type.writeSlice(blockBuilder, slice, 0, slice.length());
            }
            else {
                throw new UnsupportedOperationException("not yet implemented: " + javaType);
            }
        }

        return blockBuilder.build();
    }

    /**
     * Creates a list of {@code total} values containing {@code uniques}
     * distinct values: the distinct values come first, followed by randomly
     * chosen repeats of elements already in the list.
     *
     * @throws IllegalArgumentException if {@code uniques > total}, or if
     * repeats are requested but there are no distinct values to repeat
     */
    private List<Object> createRandomSample(int uniques, int total)
    {
        Preconditions.checkArgument(uniques <= total, "uniques (%s) must be <= total (%s)", uniques, total);
        // Padding picks from the values already present, so there must be at
        // least one unless nothing is requested at all.
        Preconditions.checkArgument(uniques > 0 || total <= 0, "uniques (%s) must be positive when total (%s) is positive", uniques, total);

        List<Object> result = new ArrayList<>(total);
        result.addAll(makeRandomSet(uniques));

        Random random = ThreadLocalRandom.current();
        while (result.size() < total) {
            int index = random.nextInt(result.size());
            result.add(result.get(index));
        }

        return result;
    }

    /**
     * Collects {@code count} distinct values from {@link #randomValue()}.
     *
     * <p>NOTE(review): this loops until enough distinct values have been seen,
     * so it would not terminate if {@code randomValue()} can produce fewer than
     * {@code count} distinct values - verify for each subclass.
     */
    private Set<Object> makeRandomSet(int count)
    {
        Set<Object> result = new HashSet<>();
        while (result.size() < count) {
            result.add(randomValue());
        }
        return result;
    }
}