/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.transforms;
import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.beam.sdk.TestUtils;
import org.apache.beam.sdk.testing.NeedsRunner;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.testing.ValidatesRunner;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.junit.runners.Parameterized;
import org.junit.runners.Suite;
/**
* Tests for the ApproximateUnique transform.
*/
@RunWith(Suite.class)
@Suite.SuiteClasses({
ApproximateUniqueTest.ApproximateUniqueWithDuplicatesTest.class,
ApproximateUniqueTest.ApproximateUniqueVariationsTest.class,
ApproximateUniqueTest.ApproximateUniqueMiscTest.class
})
public class ApproximateUniqueTest implements Serializable {
// implements Serializable just to make it easy to use anonymous inner DoFn subclasses
@Rule
public final transient TestPipeline p = TestPipeline.create();
private static class VerifyEstimateFn implements SerializableFunction<Long, Void> {
private final long uniqueCount;
private final int sampleSize;
private VerifyEstimateFn(final long uniqueCount, final int sampleSize) {
this.uniqueCount = uniqueCount;
this.sampleSize = sampleSize;
}
@Override
public Void apply(final Long estimate) {
verifyEstimate(uniqueCount, sampleSize, estimate);
return null;
}
}
/**
* Checks that the estimation error, i.e., the difference between
* {@code uniqueCount} and {@code estimate} is less than
* {@code 2 / sqrt(sampleSize}).
*/
private static void verifyEstimate(final long uniqueCount,
final int sampleSize,
final long estimate) {
if (uniqueCount < sampleSize) {
assertEquals("Number of hashes is less than the sample size. "
+ "Estimate should be exact", uniqueCount, estimate);
}
final double error = 100.0 * Math.abs(estimate - uniqueCount) / uniqueCount;
final double maxError = 100.0 * 2 / Math.sqrt(sampleSize);
assertTrue("Estimate=" + estimate + " Actual=" + uniqueCount + " Error="
+ error + "%, MaxError=" + maxError + "%.", error < maxError);
assertTrue("Estimate=" + estimate + " Actual=" + uniqueCount + " Error="
+ error + "%, MaxError=" + maxError + "%.", error < maxError);
}
private static class VerifyEstimatePerKeyFn
implements SerializableFunction<Iterable<KV<Long, Long>>, Void> {
private final int sampleSize;
private VerifyEstimatePerKeyFn(final int sampleSize) {
this.sampleSize = sampleSize;
}
@Override
public Void apply(final Iterable<KV<Long, Long>> estimatePerKey) {
for (final KV<Long, Long> result : estimatePerKey) {
verifyEstimate(result.getKey(), sampleSize, result.getValue());
}
return null;
}
}
/**
* Tests for ApproximateUnique with duplicates.
*/
@RunWith(Parameterized.class)
public static class ApproximateUniqueWithDuplicatesTest extends
ApproximateUniqueTest {
@Parameterized.Parameter
public int elementCount;
@Parameterized.Parameter(1)
public int uniqueCount;
@Parameterized.Parameter(2)
public int sampleSize;
@Parameterized.Parameters(name = "total_{0}_unique_{1}_sample_{2}")
public static Iterable<Object[]> data() throws IOException {
return ImmutableList.<Object[]>builder()
.add(
new Object[] {
100, 100, 100
},
new Object[] {
1000, 1000, 100
},
new Object[] {
1500, 1000, 100
},
new Object[] {
10000, 1000, 100
})
.build();
}
private void runApproximateUniqueWithDuplicates(final int elementCount,
final int uniqueCount, final int sampleSize) {
assert elementCount >= uniqueCount;
final List<Double> elements = Lists.newArrayList();
for (int i = 0; i < elementCount; i++) {
elements.add(1.0 / (i % uniqueCount + 1));
}
Collections.shuffle(elements);
final PCollection<Double> input = p.apply(Create.of(elements));
final PCollection<Long> estimate =
input.apply(ApproximateUnique.<Double>globally(sampleSize));
PAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));
p.run();
}
@Test
@Category(NeedsRunner.class)
public void testApproximateUniqueWithDuplicates() {
runApproximateUniqueWithDuplicates(elementCount, uniqueCount, sampleSize);
}
}
/**
* Tests for ApproximateUnique with different sample sizes.
*/
@RunWith(Parameterized.class)
public static class ApproximateUniqueVariationsTest extends ApproximateUniqueTest {
private static final int TEST_PAGES = 100;
private static final List<String> TEST_LINES =
new ArrayList<>(TEST_PAGES * TestUtils.LINES.size());
static {
for (int i = 0; i < TEST_PAGES; i++) {
TEST_LINES.addAll(TestUtils.LINES);
}
}
@Parameterized.Parameter
public int sampleSize;
@Parameterized.Parameters(name = "sampleSize_{0}")
public static Iterable<Object[]> data() throws IOException {
return ImmutableList.<Object[]>builder()
.add(new Object[] {
16
},
new Object[] {
64
},
new Object[] {
128
},
new Object[] {
256
},
new Object[] {
512
},
new Object[] {
1000
},
new Object[] {
2014
},
new Object[] {
15
})
.build();
}
/**
* Applies {@code ApproximateUnique(sampleSize)} verifying that the estimation
* error falls within the maximum allowed error of {@code 2/sqrt(sampleSize)}.
*/
private void runApproximateUniquePipeline(final int sampleSize) {
final PCollection<String> input = p.apply(Create.of(TEST_LINES));
final PCollection<Long> approximate =
input.apply(ApproximateUnique.<String>globally(sampleSize));
final PCollectionView<Long> exact =
input
.apply(Distinct.<String>create())
.apply(Count.<String>globally())
.apply(View.<Long>asSingleton());
final PCollection<KV<Long, Long>> approximateAndExact = approximate
.apply(ParDo.of(new DoFn<Long, KV<Long, Long>>() {
@ProcessElement
public void processElement(final ProcessContext c) {
c.output(KV.of(c.element(), c.sideInput(exact)));
}
}).withSideInputs(exact));
PAssert.that(approximateAndExact).satisfies(new VerifyEstimatePerKeyFn(sampleSize));
p.run();
}
/**
* Applies {@link ApproximateUnique} for different sample sizes and verifies
* that the estimation error falls within the maximum allowed error of
* {@code 2 / sqrt(sampleSize)}.
*/
@Test
@Category(NeedsRunner.class)
public void testApproximateUniqueWithDifferentSampleSizes() {
if (sampleSize > 16) {
runApproximateUniquePipeline(sampleSize);
} else {
try {
p.enableAbandonedNodeEnforcement(false);
runApproximateUniquePipeline(15);
fail("Accepted sampleSize < 16");
} catch (final IllegalArgumentException e) {
assertTrue("Expected an exception due to sampleSize < 16",
e.getMessage().startsWith("ApproximateUnique needs a sampleSize >= 16"));
}
}
}
}
/**
* Further tests for ApproximateUnique.
*/
@RunWith(JUnit4.class)
public static class ApproximateUniqueMiscTest extends ApproximateUniqueTest {
@Test
public void testEstimationErrorToSampleSize() {
assertEquals(40000, ApproximateUnique.sampleSizeFromEstimationError(0.01));
assertEquals(10000, ApproximateUnique.sampleSizeFromEstimationError(0.02));
assertEquals(2500, ApproximateUnique.sampleSizeFromEstimationError(0.04));
assertEquals(1600, ApproximateUnique.sampleSizeFromEstimationError(0.05));
assertEquals(400, ApproximateUnique.sampleSizeFromEstimationError(0.1));
assertEquals(100, ApproximateUnique.sampleSizeFromEstimationError(0.2));
assertEquals(25, ApproximateUnique.sampleSizeFromEstimationError(0.4));
assertEquals(16, ApproximateUnique.sampleSizeFromEstimationError(0.5));
}
@Test
@Category(ValidatesRunner.class)
public void testApproximateUniqueWithSmallInput() {
final PCollection<Integer> input = p.apply(
Create.of(Arrays.asList(1, 2, 3, 3)));
final PCollection<Long> estimate = input
.apply(ApproximateUnique.<Integer>globally(1000));
PAssert.thatSingleton(estimate).isEqualTo(3L);
p.run();
}
@Test
@Category(NeedsRunner.class)
public void testApproximateUniqueWithSkewedDistributionsAndLargeSampleSize() {
runApproximateUniqueWithSkewedDistributions(10000, 2000, 1000);
}
private void runApproximateUniqueWithSkewedDistributions(final int elementCount,
final int uniqueCount,
final int sampleSize) {
final List<Integer> elements = Lists.newArrayList();
// Zipf distribution with approximately elementCount items.
final double s = 1 - 1.0 * uniqueCount / elementCount;
final double maxCount = Math.pow(uniqueCount, s);
for (int k = 0; k < uniqueCount; k++) {
final int count = Math.max(1, (int) Math.round(maxCount * Math.pow(k, -s)));
// Element k occurs count times.
for (int c = 0; c < count; c++) {
elements.add(k);
}
}
final PCollection<Integer> input = p.apply(Create.of(elements));
final PCollection<Long> estimate =
input.apply(ApproximateUnique.<Integer>globally(sampleSize));
PAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));
p.run();
}
@Test
@Category(NeedsRunner.class)
public void testApproximateUniquePerKey() {
final List<KV<Long, Long>> elements = Lists.newArrayList();
final List<Long> keys = ImmutableList.of(20L, 50L, 100L);
final int elementCount = 1000;
final int sampleSize = 100;
// Use the key as the number of unique values.
for (final long uniqueCount : keys) {
for (long value = 0; value < elementCount; value++) {
elements.add(KV.of(uniqueCount, value % uniqueCount));
}
}
final PCollection<KV<Long, Long>> input = p.apply(Create.of(elements));
final PCollection<KV<Long, Long>> counts =
input.apply(ApproximateUnique.<Long, Long>perKey(sampleSize));
PAssert.that(counts).satisfies(new VerifyEstimatePerKeyFn(sampleSize));
p.run();
}
@Test
public void testApproximateUniqueGetName() {
assertEquals("ApproximateUnique.PerKey", ApproximateUnique.<Long, Long>perKey(16).getName());
assertEquals("ApproximateUnique.Globally", ApproximateUnique.<Integer>globally(16).getName());
}
@Test
public void testDisplayData() {
final ApproximateUnique.Globally<Integer> specifiedSampleSize =
ApproximateUnique.globally(1234);
final ApproximateUnique.PerKey<String, Integer> specifiedMaxError =
ApproximateUnique.perKey(0.1234);
assertThat(DisplayData.from(specifiedSampleSize), hasDisplayItem("sampleSize", 1234));
final DisplayData maxErrorDisplayData = DisplayData.from(specifiedMaxError);
assertThat(maxErrorDisplayData, hasDisplayItem("maximumEstimationError", 0.1234));
assertThat("calculated sampleSize should be included", maxErrorDisplayData,
hasDisplayItem("sampleSize"));
}
}
}