/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.runners.spark.stateful;

import com.google.common.base.Stopwatch;
import com.google.common.collect.Iterators;
import com.google.common.collect.Lists;
import java.io.Closeable;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import org.apache.beam.runners.core.metrics.MetricsContainerStepMap;
import org.apache.beam.runners.spark.coders.CoderHelpers;
import org.apache.beam.runners.spark.io.EmptyCheckpointMark;
import org.apache.beam.runners.spark.io.MicrobatchSource;
import org.apache.beam.runners.spark.io.SparkUnboundedSource.Metadata;
import org.apache.beam.runners.spark.translation.SparkRuntimeContext;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.io.Source;
import org.apache.beam.sdk.io.UnboundedSource;
import org.apache.beam.sdk.metrics.MetricsContainer;
import org.apache.beam.sdk.metrics.MetricsEnvironment;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.transforms.windowing.PaneInfo;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.spark.streaming.State;
import org.apache.spark.streaming.StateSpec;
import org.joda.time.Instant;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Option;
import scala.Tuple2;
import scala.runtime.AbstractFunction3;

/**
 * A class containing {@link org.apache.spark.streaming.StateSpec} mappingFunctions.
 */
public class StateSpecFunctions {
  private static final Logger LOG = LoggerFactory.getLogger(StateSpecFunctions.class);

  /**
   * A helper class that is essentially a {@link Serializable} {@link AbstractFunction3}.
   */
  private abstract static class SerializableFunction3<T1, T2, T3, T4>
      extends AbstractFunction3<T1, T2, T3, T4> implements Serializable {
  }

  /**
   * A {@link org.apache.spark.streaming.StateSpec} function to support reading from
   * an {@link UnboundedSource}.
   *
   * <p>This StateSpec function expects the following:
   * <ul>
   * <li>Key: The (partitioned) Source to read from.</li>
   * <li>Value: An optional {@link UnboundedSource.CheckpointMark} to start from.</li>
   * <li>State: A byte representation of the (previously) persisted CheckpointMark.</li>
   * </ul>
   * And returns an iterator over all read values (for the micro-batch).
   *
   * <p>This stateful operation could be described as a flatMap over a single-element stream,
   * which outputs all the elements read from the {@link UnboundedSource} for this micro-batch.
   * Since micro-batches are bounded, the provided UnboundedSource is wrapped by a
   * {@link MicrobatchSource} that applies bounds in the form of duration and max records
   * (per micro-batch).
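   *
   * <p>For illustration only, a rough sketch of how this function could be plugged into Spark's
   * {@code mapWithState} (the input stream {@code sourceDStream} and the concrete bindings of
   * {@code T} and {@code CheckpointMarkT} are assumed here, not provided by this class):
   *
   * <pre>{@code
   * JavaMapWithStateDStream<Source<T>, CheckpointMarkT, Tuple2<byte[], Instant>,
   *     Tuple2<Iterable<byte[]>, Metadata>> mapped =
   *         sourceDStream.mapWithState(StateSpec.function(
   *             StateSpecFunctions.<T, CheckpointMarkT>mapSourceFunction(
   *                 runtimeContext, stepName)));
   * }</pre>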
   *
   * <p>In order to avoid using Spark Guava's classes which pollute the
   * classpath, we use the {@link StateSpec#function(scala.Function3)} signature which employs
   * scala's native {@link scala.Option}, instead of the
   * {@link StateSpec#function(org.apache.spark.api.java.function.Function3)} signature,
   * which employs Guava's {@link com.google.common.base.Optional}.
   *
   * <p>See also <a href="https://issues.apache.org/jira/browse/SPARK-4819">SPARK-4819</a>.</p>
   *
   * @param runtimeContext    A serializable {@link SparkRuntimeContext}.
   * @param stepName          The name of the step whose reader this function runs; used to scope
   *                          the metrics reported while reading.
   * @param <T>               The type of the input stream elements.
   * @param <CheckpointMarkT> The type of the {@link UnboundedSource.CheckpointMark}.
   * @return The appropriate {@link org.apache.spark.streaming.StateSpec} function.
   */
  public static <T, CheckpointMarkT extends UnboundedSource.CheckpointMark>
      scala.Function3<Source<T>, scala.Option<CheckpointMarkT>, State<Tuple2<byte[], Instant>>,
          Tuple2<Iterable<byte[]>, Metadata>> mapSourceFunction(
              final SparkRuntimeContext runtimeContext, final String stepName) {

    return new SerializableFunction3<Source<T>, Option<CheckpointMarkT>,
        State<Tuple2<byte[], Instant>>, Tuple2<Iterable<byte[]>, Metadata>>() {

      @Override
      public Tuple2<Iterable<byte[]>, Metadata> apply(
          Source<T> source,
          scala.Option<CheckpointMarkT> startCheckpointMark,
          State<Tuple2<byte[], Instant>> state) {

        MetricsContainerStepMap metricsContainers = new MetricsContainerStepMap();
        MetricsContainer metricsContainer = metricsContainers.getContainer(stepName);

        // Add metrics container to the scope of org.apache.beam.sdk.io.Source.Reader methods
        // since they may report metrics.
        try (Closeable ignored = MetricsEnvironment.scopedMetricsContainer(metricsContainer)) {
          // source as MicrobatchSource
          MicrobatchSource<T, CheckpointMarkT> microbatchSource =
              (MicrobatchSource<T, CheckpointMarkT>) source;

          // Initial high/low watermarks.
          Instant lowWatermark = BoundedWindow.TIMESTAMP_MIN_VALUE;
          final Instant highWatermark;

          // If state exists, use it; otherwise it's the first time, so use the
          // startCheckpointMark. startCheckpointMark may be EmptyCheckpointMark (the Spark Java
          // API tries to apply Optional(null)), which is handled by the UnboundedSource
          // implementation.
          Coder<CheckpointMarkT> checkpointCoder = microbatchSource.getCheckpointMarkCoder();
          CheckpointMarkT checkpointMark;
          if (state.exists()) {
            // previous (output) watermark is now the low watermark.
            lowWatermark = state.get()._2();
            checkpointMark = CoderHelpers.fromByteArray(state.get()._1(), checkpointCoder);
            LOG.info("Continue reading from an existing CheckpointMark.");
          } else if (startCheckpointMark.isDefined()
              && !startCheckpointMark.get().equals(EmptyCheckpointMark.get())) {
            checkpointMark = startCheckpointMark.get();
            LOG.info("Start reading from a provided CheckpointMark.");
          } else {
            checkpointMark = null;
            LOG.info("No CheckpointMark provided, start reading from default.");
          }

          // create reader.
          final MicrobatchSource.Reader/*<T>*/ microbatchReader;
          final Stopwatch stopwatch = Stopwatch.createStarted();
          long readDurationMillis = 0;

          try {
            microbatchReader = (MicrobatchSource.Reader) microbatchSource.getOrCreateReader(
                runtimeContext.getPipelineOptions(), checkpointMark);
          } catch (IOException e) {
            throw new RuntimeException(e);
          }

          // read microbatch as a serialized collection.
          final List<byte[]> readValues = new ArrayList<>();
          WindowedValue.FullWindowedValueCoder<T> coder =
              WindowedValue.FullWindowedValueCoder.of(
                  source.getDefaultOutputCoder(),
                  GlobalWindow.Coder.INSTANCE);
          try {
            // measure how long a read takes per-partition.
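            // The reader follows the Source API read contract: start() positions the reader at
            // the first element (returning false when no element is available), and advance()
            // moves to each subsequent element until it too returns false. Here, "no more
            // elements" also reflects the bounds (duration and max records) that
            // MicrobatchSource applies to the micro-batch.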
            boolean finished = !microbatchReader.start();
            while (!finished) {
              @SuppressWarnings("unchecked")
              final WindowedValue<T> wv = WindowedValue.of((T) microbatchReader.getCurrent(),
                  microbatchReader.getCurrentTimestamp(), GlobalWindow.INSTANCE,
                  PaneInfo.NO_FIRING);
              readValues.add(CoderHelpers.toByteArray(wv, coder));
              finished = !microbatchReader.advance();
            }

            // end-of-read watermark is the high watermark, but don't allow decrease.
            final Instant sourceWatermark = microbatchReader.getWatermark();
            highWatermark =
                sourceWatermark.isAfter(lowWatermark) ? sourceWatermark : lowWatermark;

            readDurationMillis = stopwatch.stop().elapsed(TimeUnit.MILLISECONDS);

            LOG.info("Source id {} spent {} millis on reading.", microbatchSource.getId(),
                readDurationMillis);

            // if the Source does not supply a CheckpointMark skip updating the state.
            @SuppressWarnings("unchecked")
            final CheckpointMarkT finishedReadCheckpointMark =
                (CheckpointMarkT) microbatchReader.getCheckpointMark();
            byte[] codedCheckpoint = new byte[0];
            if (finishedReadCheckpointMark != null) {
              codedCheckpoint = CoderHelpers.toByteArray(finishedReadCheckpointMark,
                  checkpointCoder);
            } else {
              LOG.info("Skipping checkpoint marking because the reader failed to supply one.");
            }

            // persist the end-of-read (high) watermark for following read, where it will become
            // the next low watermark.
            state.update(new Tuple2<>(codedCheckpoint, highWatermark));
          } catch (IOException e) {
            throw new RuntimeException("Failed to read from reader.", e);
          }

          final ArrayList<byte[]> payload =
              Lists.newArrayList(Iterators.unmodifiableIterator(readValues.iterator()));

          return new Tuple2<>((Iterable<byte[]>) payload, new Metadata(
              readValues.size(),
              lowWatermark,
              highWatermark,
              readDurationMillis,
              metricsContainers));

        } catch (IOException e) {
          throw new RuntimeException(e);
        }
      }
    };
  }
}