// ApproximateUniqueTest.java example
//
// Explorer
// beam-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.transforms;

import static org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.beam.sdk.TestUtils;
import org.apache.beam.sdk.testing.NeedsRunner;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.testing.ValidatesRunner;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.junit.runners.Parameterized;
import org.junit.runners.Suite;

/**
 * Tests for the ApproximateUnique transform.
 */
@RunWith(Suite.class)
@Suite.SuiteClasses({
    ApproximateUniqueTest.ApproximateUniqueWithDuplicatesTest.class,
    ApproximateUniqueTest.ApproximateUniqueVariationsTest.class,
    ApproximateUniqueTest.ApproximateUniqueMiscTest.class
})
public class ApproximateUniqueTest implements Serializable {
  // implements Serializable just to make it easy to use anonymous inner DoFn subclasses

  @Rule
  public final transient TestPipeline p = TestPipeline.create();

  /**
   * Serializable check for a single, global estimate: hands the estimate to
   * {@code verifyEstimate} together with the expected number of distinct elements
   * and the sample size captured at construction time.
   */
  private static class VerifyEstimateFn implements SerializableFunction<Long, Void> {
    private final long expectedUniqueCount;
    private final int configuredSampleSize;

    private VerifyEstimateFn(final long uniqueCount, final int sampleSize) {
      expectedUniqueCount = uniqueCount;
      configuredSampleSize = sampleSize;
    }

    /** Verifies {@code estimate}; always returns {@code null}. */
    @Override
    public Void apply(final Long estimate) {
      final long actualEstimate = estimate.longValue();
      verifyEstimate(expectedUniqueCount, configuredSampleSize, actualEstimate);
      return null;
    }
  }

  /**
   * Asserts that {@code estimate} is an acceptable approximation of {@code uniqueCount}.
   *
   * <p>If {@code uniqueCount} is smaller than {@code sampleSize} the estimate has to be
   * exact. Otherwise the relative estimation error, i.e.
   * {@code |estimate - uniqueCount| / uniqueCount}, has to be strictly less than
   * {@code 2 / sqrt(sampleSize)}.
   *
   * @param uniqueCount the true number of distinct elements
   * @param sampleSize the sample size the estimate was computed with
   * @param estimate the estimated number of distinct elements
   */
  private static void verifyEstimate(final long uniqueCount,
                                     final int sampleSize,
                                     final long estimate) {
    if (uniqueCount < sampleSize) {
      assertEquals("Number of hashes is less than the sample size. "
                       + "Estimate should be exact", uniqueCount, estimate);
      // An exact estimate has zero error, so the bound below holds trivially. Returning
      // here also avoids the 0 / 0 = NaN relative error for uniqueCount == 0, which would
      // otherwise fail the bound check despite the estimate being correct.
      return;
    }

    // Both values are expressed as percentages to keep the failure message readable.
    final double error = 100.0 * Math.abs(estimate - uniqueCount) / uniqueCount;
    final double maxError = 100.0 * 2 / Math.sqrt(sampleSize);

    assertTrue("Estimate=" + estimate + " Actual=" + uniqueCount + " Error="
                   + error + "%, MaxError=" + maxError + "%.", error < maxError);
  }

  /**
   * Serializable check for a collection of {@code (key, value)} results. For every pair
   * the key is passed to {@code verifyEstimate} as the true number of distinct elements
   * and the value as the estimate.
   */
  private static class VerifyEstimatePerKeyFn
      implements SerializableFunction<Iterable<KV<Long, Long>>, Void> {

    private final int configuredSampleSize;

    private VerifyEstimatePerKeyFn(final int sampleSize) {
      configuredSampleSize = sampleSize;
    }

    /** Verifies each pair in turn; always returns {@code null}. */
    @Override
    public Void apply(final Iterable<KV<Long, Long>> estimatePerKey) {
      for (final KV<Long, Long> entry : estimatePerKey) {
        final long expectedUniqueCount = entry.getKey();
        final long estimate = entry.getValue();
        verifyEstimate(expectedUniqueCount, configuredSampleSize, estimate);
      }
      return null;
    }
  }

  /**
   * Parameterized tests that run {@code ApproximateUnique.globally} over inputs in which
   * values are repeated.
   */
  @RunWith(Parameterized.class)
  public static class ApproximateUniqueWithDuplicatesTest extends ApproximateUniqueTest {

    /** Total number of elements fed into the pipeline. */
    @Parameterized.Parameter(0)
    public int elementCount;

    /** Number of distinct values among those elements. */
    @Parameterized.Parameter(1)
    public int uniqueCount;

    /** Sample size handed to the transform. */
    @Parameterized.Parameter(2)
    public int sampleSize;

    /** Supplies the {@code {elementCount, uniqueCount, sampleSize}} triples to run with. */
    @Parameterized.Parameters(name = "total_{0}_unique_{1}_sample_{2}")
    public static Iterable<Object[]> data() throws IOException {
      return ImmutableList.<Object[]>of(
          new Object[] {100, 100, 100},
          new Object[] {1000, 1000, 100},
          new Object[] {1500, 1000, 100},
          new Object[] {10000, 1000, 100});
    }

    /**
     * Builds {@code elementCount} doubles drawn from {@code uniqueCount} distinct values
     * ({@code 1/1, 1/2, ..., 1/uniqueCount}), shuffles them, and verifies the global
     * estimate produced for the given {@code sampleSize}.
     */
    private void runApproximateUniqueWithDuplicates(final int elementCount,
                                                    final int uniqueCount, final int sampleSize) {
      assert elementCount >= uniqueCount;

      final List<Double> values = new ArrayList<>();
      int produced = 0;
      while (produced < elementCount) {
        // Cycle through the uniqueCount distinct denominators.
        final int slot = produced % uniqueCount;
        values.add(1.0 / (slot + 1));
        produced++;
      }
      Collections.shuffle(values);

      final PCollection<Long> estimate = p
          .apply(Create.of(values))
          .apply(ApproximateUnique.<Double>globally(sampleSize));

      final VerifyEstimateFn verifier = new VerifyEstimateFn(uniqueCount, sampleSize);
      PAssert.thatSingleton(estimate).satisfies(verifier);

      p.run();
    }

    /** Runs the duplicate-heavy pipeline with the current parameter triple. */
    @Test
    @Category(NeedsRunner.class)
    public void testApproximateUniqueWithDuplicates() {
      runApproximateUniqueWithDuplicates(elementCount, uniqueCount, sampleSize);
    }
  }

  /**
   * Tests for ApproximateUnique with different sample sizes.
   */
  @RunWith(Parameterized.class)
  public static class ApproximateUniqueVariationsTest extends ApproximateUniqueTest {

    // Number of times the TestUtils.LINES fixture is repeated in TEST_LINES.
    private static final int TEST_PAGES = 100;
    // Pipeline input for this test class: TEST_PAGES back-to-back copies of TestUtils.LINES,
    // so every line occurs at least TEST_PAGES times. The list is created with its final
    // size as initial capacity.
    private static final List<String> TEST_LINES =
        new ArrayList<>(TEST_PAGES * TestUtils.LINES.size());

    static {
      // Fill TEST_LINES once, when the class is initialized.
      for (int i = 0; i < TEST_PAGES; i++) {
        TEST_LINES.addAll(TestUtils.LINES);
      }
    }

    @Parameterized.Parameter
    public int sampleSize;

    /** Supplies the sample sizes to run with, one single-element row per size. */
    @Parameterized.Parameters(name = "sampleSize_{0}")
    public static Iterable<Object[]> data() throws IOException {
      final int[] sampleSizes = {16, 64, 128, 256, 512, 1000, 2014, 15};

      final ImmutableList.Builder<Object[]> rows = ImmutableList.builder();
      for (final int size : sampleSizes) {
        rows.add(new Object[] {size});
      }
      return rows.build();
    }

    /**
     * Applies {@code ApproximateUnique(sampleSize)} verifying that the estimation
     * error falls within the maximum allowed error of {@code 2/sqrt(sampleSize)}.
     *
     * <p>The exact distinct count of the same input is computed with
     * {@code Distinct} followed by {@code Count} and supplied as a side input, so each
     * estimate can be paired with the true value it approximates.
     *
     * @param sampleSize the sample size passed to {@code ApproximateUnique.globally}
     */
    private void runApproximateUniquePipeline(final int sampleSize) {
      final PCollection<String> input = p.apply(Create.of(TEST_LINES));
      final PCollection<Long> approximate =
          input.apply(ApproximateUnique.<String>globally(sampleSize));
      final PCollectionView<Long> exact =
          input
              .apply(Distinct.<String>create())
              .apply(Count.<String>globally())
              .apply(View.<Long>asSingleton());

      // VerifyEstimatePerKeyFn reads the key as the true distinct count and the value as
      // the estimate, so the exact count must be the key. Emitting the pair the other way
      // round would measure the error relative to the approximation instead.
      final PCollection<KV<Long, Long>> exactAndApproximate = approximate
          .apply(ParDo.of(new DoFn<Long, KV<Long, Long>>() {

            @ProcessElement
            public void processElement(final ProcessContext c) {
              c.output(KV.of(c.sideInput(exact), c.element()));
            }
          }).withSideInputs(exact));

      PAssert.that(exactAndApproximate).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

      p.run();
    }

    /**
     * Applies {@link ApproximateUnique} for different sample sizes and verifies
     * that the estimation error falls within the maximum allowed error of
     * {@code 2 / sqrt(sampleSize)}.
     *
     * <p>Sample sizes of 16 and above are run through the pipeline. Smaller sample sizes
     * are expected to be rejected with an {@link IllegalArgumentException} whose message
     * starts with {@code "ApproximateUnique needs a sampleSize >= 16"}.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testApproximateUniqueWithDifferentSampleSizes() {
      // 16 is the smallest accepted value according to the expected error message, so it
      // belongs to the valid branch.
      if (sampleSize >= 16) {
        runApproximateUniquePipeline(sampleSize);
      } else {
        try {
          // Presumably needed because the pipeline is never run when the transform is
          // rejected; verify against TestPipeline's abandoned-node rules.
          p.enableAbandonedNodeEnforcement(false);
          // Use the parameter under test rather than a fixed value, so the test name and
          // the exercised sample size always agree.
          runApproximateUniquePipeline(sampleSize);
          fail("Accepted sampleSize < 16");
        } catch (final IllegalArgumentException e) {
          assertTrue("Expected an exception due to sampleSize < 16",
                     e.getMessage().startsWith("ApproximateUnique needs a sampleSize >= 16"));
        }
      }
    }
  }

  /**
   * Further tests for ApproximateUnique.
   */
  @RunWith(JUnit4.class)
  public static class ApproximateUniqueMiscTest extends ApproximateUniqueTest {

    /**
     * Checks {@code ApproximateUnique.sampleSizeFromEstimationError} against a table of
     * estimation errors and the sample sizes expected for them.
     */
    @Test
    public void testEstimationErrorToSampleSize() {
      final double[] estimationErrors = {0.01, 0.02, 0.04, 0.05, 0.1, 0.2, 0.4, 0.5};
      final int[] expectedSampleSizes = {40000, 10000, 2500, 1600, 400, 100, 25, 16};

      for (int i = 0; i < estimationErrors.length; i++) {
        assertEquals(
            expectedSampleSizes[i],
            ApproximateUnique.sampleSizeFromEstimationError(estimationErrors[i]));
      }
    }

    /**
     * Four integers with three distinct values and a sample size of 1000: the result is
     * asserted to be exactly 3.
     */
    @Test
    @Category(ValidatesRunner.class)
    public void testApproximateUniqueWithSmallInput() {
      final List<Integer> values = Arrays.asList(1, 2, 3, 3);

      final PCollection<Long> estimate = p
          .apply(Create.of(values))
          .apply(ApproximateUnique.<Integer>globally(1000));

      PAssert.thatSingleton(estimate).isEqualTo(3L);

      p.run();
    }


    /** Runs the skewed-distribution pipeline with fixed element, unique and sample counts. */
    @Test
    @Category(NeedsRunner.class)
    public void testApproximateUniqueWithSkewedDistributionsAndLargeSampleSize() {
      final int totalElements = 10000;
      final int distinctElements = 2000;
      final int sampleSize = 1000;
      runApproximateUniqueWithSkewedDistributions(totalElements, distinctElements, sampleSize);
    }

    /**
     * Runs {@code ApproximateUnique.globally(sampleSize)} over integers
     * {@code 0 .. uniqueCount - 1} whose frequencies follow a Zipf-like distribution and
     * verifies the resulting estimate against {@code uniqueCount}.
     *
     * <p>The integer {@code k} has rank {@code k + 1} and is repeated
     * {@code round((uniqueCount / rank)^s)} times, but at least once, where
     * {@code s = 1 - uniqueCount / elementCount}.
     *
     * @param elementCount approximate total number of elements to generate
     * @param uniqueCount number of distinct elements to generate
     * @param sampleSize the sample size passed to the transform
     */
    private void runApproximateUniqueWithSkewedDistributions(final int elementCount,
                                                             final int uniqueCount,
                                                             final int sampleSize) {
      final List<Integer> elements = Lists.newArrayList();
      // Zipf distribution with approximately elementCount items.
      final double s = 1 - 1.0 * uniqueCount / elementCount;
      final double maxCount = Math.pow(uniqueCount, s);
      for (int k = 0; k < uniqueCount; k++) {
        // Ranks are 1-based. Using k itself made the first term Math.pow(0, -s), which is
        // Infinity for s > 0; after Math.round and the narrowing cast that became -1 and
        // was clamped to 1, so the element meant to be the most frequent occurred once.
        final int rank = k + 1;
        final int count = Math.max(1, (int) Math.round(maxCount * Math.pow(rank, -s)));
        // Element k occurs count times.
        for (int c = 0; c < count; c++) {
          elements.add(k);
        }
      }

      final PCollection<Integer> input = p.apply(Create.of(elements));
      final PCollection<Long> estimate =
          input.apply(ApproximateUnique.<Integer>globally(sampleSize));

      PAssert.thatSingleton(estimate).satisfies(new VerifyEstimateFn(uniqueCount, sampleSize));

      p.run();
    }

    /**
     * Runs {@code ApproximateUnique.perKey} over three keys. Each key doubles as the
     * number of distinct values generated for it, which lets the per-key verifier compare
     * every estimate with its own key.
     */
    @Test
    @Category(NeedsRunner.class)
    public void testApproximateUniquePerKey() {
      final int sampleSize = 100;
      final int valuesPerKey = 1000;
      final List<Long> distinctCounts = ImmutableList.of(20L, 50L, 100L);

      final List<KV<Long, Long>> keyedValues = new ArrayList<>();
      for (final long distinctCount : distinctCounts) {
        // The modulo keeps exactly distinctCount different values under this key.
        long index = 0;
        while (index < valuesPerKey) {
          keyedValues.add(KV.of(distinctCount, index % distinctCount));
          index++;
        }
      }

      final PCollection<KV<Long, Long>> estimates = p
          .apply(Create.of(keyedValues))
          .apply(ApproximateUnique.<Long, Long>perKey(sampleSize));

      PAssert.that(estimates).satisfies(new VerifyEstimatePerKeyFn(sampleSize));

      p.run();
    }

    /** Checks the names reported by the per-key and global variants of the transform. */
    @Test
    public void testApproximateUniqueGetName() {
      final String perKeyName = ApproximateUnique.<Long, Long>perKey(16).getName();
      assertEquals("ApproximateUnique.PerKey", perKeyName);

      final String globallyName = ApproximateUnique.<Integer>globally(16).getName();
      assertEquals("ApproximateUnique.Globally", globallyName);
    }

    /**
     * Checks the display data of a transform built from a sample size and of one built
     * from a maximum estimation error; the latter must also expose a {@code sampleSize}
     * item.
     */
    @Test
    public void testDisplayData() {
      final ApproximateUnique.Globally<Integer> bySampleSize =
          ApproximateUnique.globally(1234);
      final ApproximateUnique.PerKey<String, Integer> byMaxError =
          ApproximateUnique.perKey(0.1234);

      final DisplayData bySampleSizeData = DisplayData.from(bySampleSize);
      assertThat(bySampleSizeData, hasDisplayItem("sampleSize", 1234));

      final DisplayData byMaxErrorData = DisplayData.from(byMaxError);
      assertThat(byMaxErrorData, hasDisplayItem("maximumEstimationError", 0.1234));
      assertThat(
          "calculated sampleSize should be included",
          byMaxErrorData,
          hasDisplayItem("sampleSize"));
    }
  }

}