/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples;

import static org.hamcrest.Matchers.equalTo;

import com.google.common.base.MoreObjects;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.ThreadLocalRandom;
import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.examples.common.WriteOneFilePerWindow.PerWindowFiles;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.StreamingOptions;
import org.apache.beam.sdk.testing.SerializableMatcher;
import org.apache.beam.sdk.testing.StreamingIT;
import org.apache.beam.sdk.testing.TestPipeline;
import org.apache.beam.sdk.testing.TestPipelineOptions;
import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.util.ExplicitShardedFile;
import org.apache.beam.sdk.util.FluentBackoff;
import org.apache.beam.sdk.util.NumberedShardedFile;
import org.apache.beam.sdk.util.ShardedFile;
import org.apache.beam.sdk.util.Sleeper;
import org.hamcrest.Description;
import org.hamcrest.TypeSafeMatcher;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.junit.BeforeClass;
import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/** End-to-end integration test of {@link WindowedWordCount}. */
@RunWith(JUnit4.class)
public class WindowedWordCountIT {

  @Rule public TestName testName = new TestName();

  private static final String DEFAULT_INPUT = "gs://apache-beam-samples/shakespeare/sonnets.txt";
  static final int MAX_READ_RETRIES = 4;
  static final Duration DEFAULT_SLEEP_DURATION = Duration.standardSeconds(10L);
  static final FluentBackoff BACK_OFF_FACTORY =
      FluentBackoff.DEFAULT
          .withInitialBackoff(DEFAULT_SLEEP_DURATION)
          .withMaxRetries(MAX_READ_RETRIES);

  /**
   * Options for the {@link WindowedWordCount} integration test.
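   *
   * <p>Mixes the example's own {@link WindowedWordCount.Options} with {@link TestPipelineOptions}
   * (test timeout and on-success matcher) and {@link StreamingOptions} (batch versus streaming
   * execution), so each test configures all three through a single options object.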
   */
  public interface WindowedWordCountITOptions
      extends WindowedWordCount.Options, TestPipelineOptions, StreamingOptions {}

  @BeforeClass
  public static void setUp() {
    PipelineOptionsFactory.register(TestPipelineOptions.class);
  }

  @Test
  public void testWindowedWordCountInBatchDynamicSharding() throws Exception {
    WindowedWordCountITOptions options = batchOptions();
    // This is the default value, but make it explicit
    options.setNumShards(null);
    testWindowedWordCountPipeline(options);
  }

  @Test
  public void testWindowedWordCountInBatchStaticSharding() throws Exception {
    WindowedWordCountITOptions options = batchOptions();
    options.setNumShards(3);
    testWindowedWordCountPipeline(options);
  }

  // TODO: add a test with streaming and dynamic sharding after resolving
  // https://issues.apache.org/jira/browse/BEAM-1438

  @Test
  @Category(StreamingIT.class)
  public void testWindowedWordCountInStreamingStaticSharding() throws Exception {
    WindowedWordCountITOptions options = streamingOptions();
    options.setNumShards(3);
    testWindowedWordCountPipeline(options);
  }

  private WindowedWordCountITOptions defaultOptions() throws Exception {
    WindowedWordCountITOptions options =
        TestPipeline.testingPipelineOptions().as(WindowedWordCountITOptions.class);
    options.setInputFile(DEFAULT_INPUT);
    options.setTestTimeoutSeconds(1200L);
    // Bound event timestamps to a fixed one-hour range so the expected windows are predictable.
    options.setMinTimestampMillis(0L);
    options.setMaxTimestampMillis(Duration.standardHours(1).getMillis());
    options.setWindowSize(10);
    // Write to a unique location per test method so concurrent runs do not collide.
    options.setOutput(
        FileSystems.matchNewResource(options.getTempRoot(), true)
            .resolve(
                String.format(
                    "WindowedWordCountIT.%s-%tFT%<tH:%<tM:%<tS.%<tL+%s",
                    testName.getMethodName(), new Date(), ThreadLocalRandom.current().nextInt()),
                StandardResolveOptions.RESOLVE_DIRECTORY)
            .resolve("output", StandardResolveOptions.RESOLVE_DIRECTORY)
            .resolve("results", StandardResolveOptions.RESOLVE_FILE)
            .toString());
    return options;
  }

  private WindowedWordCountITOptions streamingOptions() throws Exception {
    WindowedWordCountITOptions options = defaultOptions();
    options.setStreaming(true);
    return options;
  }

  private WindowedWordCountITOptions batchOptions() throws Exception {
    WindowedWordCountITOptions options = defaultOptions();
    // This is the default value, but make it explicit
    options.setStreaming(false);
    return options;
  }

  private void testWindowedWordCountPipeline(WindowedWordCountITOptions options) throws Exception {
    String outputPrefix = options.getOutput();
    PerWindowFiles filenamePolicy = new PerWindowFiles(outputPrefix);

    // One hour of timestamps in 10-minute windows yields six expected sets of output shards.
    List<ShardedFile> expectedOutputFiles = Lists.newArrayListWithCapacity(6);
    for (int startMinute : ImmutableList.of(0, 10, 20, 30, 40, 50)) {
      final Instant windowStart =
          new Instant(options.getMinTimestampMillis())
              .plus(Duration.standardMinutes(startMinute));
      expectedOutputFiles.add(
          new NumberedShardedFile(
              filenamePolicy.filenamePrefixForWindow(
                      new IntervalWindow(
                          windowStart, windowStart.plus(Duration.standardMinutes(10))))
                  + "*"));
    }

    ShardedFile inputFile = new ExplicitShardedFile(Collections.singleton(options.getInputFile()));

    // For this integration test, the input is tiny, so we can build the expected counts here.
    SortedMap<String, Long> expectedWordCounts = new TreeMap<>();
    for (String line :
        inputFile.readFilesWithRetries(Sleeper.DEFAULT, BACK_OFF_FACTORY.backoff())) {
      String[] words = line.split(ExampleUtils.TOKENIZER_PATTERN);
      for (String word : words) {
        if (!word.isEmpty()) {
          expectedWordCounts.put(
              word, MoreObjects.firstNonNull(expectedWordCounts.get(word), 0L) + 1L);
        }
      }
    }

    options.setOnSuccessMatcher(new WordCountsMatcher(expectedWordCounts, expectedOutputFiles));
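
    // Run the pipeline under test; the test runner configured through TestPipelineOptions applies
    // the on-success matcher registered above once the pipeline finishes.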
    WindowedWordCount.main(TestPipeline.convertToArgs(options));
  }

  /**
   * A matcher that bakes in the expected word counts, reads the actual counts back from the
   * sharded output files, and compares the two.
   */
  private static class WordCountsMatcher extends TypeSafeMatcher<PipelineResult>
      implements SerializableMatcher<PipelineResult> {

    private static final Logger LOG = LoggerFactory.getLogger(WordCountsMatcher.class);

    private final SortedMap<String, Long> expectedWordCounts;
    private final List<ShardedFile> outputFiles;
    private SortedMap<String, Long> actualCounts;

    public WordCountsMatcher(
        SortedMap<String, Long> expectedWordCounts, List<ShardedFile> outputFiles) {
      this.expectedWordCounts = expectedWordCounts;
      this.outputFiles = outputFiles;
    }

    @Override
    public boolean matchesSafely(PipelineResult pipelineResult) {
      try {
        // Load the output data from every expected shard.
        List<String> outputLines = new ArrayList<>();
        for (ShardedFile outputFile : outputFiles) {
          outputLines.addAll(
              outputFile.readFilesWithRetries(Sleeper.DEFAULT, BACK_OFF_FACTORY.backoff()));
        }

        // Since the windowing is nondeterministic (timestamps are assigned at random), we only
        // check the per-word sums across all windows.
        actualCounts = new TreeMap<>();
        for (String line : outputLines) {
          String[] splits = line.split(": ");
          String word = splits[0];
          long count = Long.parseLong(splits[1]);
          Long current = actualCounts.get(word);
          if (current == null) {
            actualCounts.put(word, count);
          } else {
            actualCounts.put(word, current + count);
          }
        }

        return actualCounts.equals(expectedWordCounts);
      } catch (Exception e) {
        throw new RuntimeException(
            String.format("Failed to read from sharded output: %s", outputFiles), e);
      }
    }

    @Override
    public void describeTo(Description description) {
      equalTo(expectedWordCounts).describeTo(description);
    }

    @Override
    public void describeMismatchSafely(PipelineResult pResult, Description description) {
      equalTo(expectedWordCounts).describeMismatch(actualCounts, description);
    }
  }
}