/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.runners.spark.io;

import static com.google.common.base.Preconditions.checkArgument;

import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.runners.spark.translation.SparkRuntimeContext;
import org.apache.beam.sdk.io.Source;
import org.apache.beam.sdk.io.UnboundedSource;
import org.apache.spark.api.java.JavaSparkContext$;
import org.apache.spark.rdd.RDD;
import org.apache.spark.streaming.StreamingContext;
import org.apache.spark.streaming.Time;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.dstream.InputDStream;
import org.apache.spark.streaming.scheduler.RateController;
import org.apache.spark.streaming.scheduler.RateController$;
import org.apache.spark.streaming.scheduler.rate.RateEstimator;
import org.apache.spark.streaming.scheduler.rate.RateEstimator$;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;

/**
 * A {@link SourceDStream} is an {@link InputDStream} of {@link SourceRDD.Unbounded}s.
 *
 * <p>This InputDStream will create a stream of partitioned {@link UnboundedSource}s,
 * and their respective (optional) starting {@link UnboundedSource.CheckpointMark}.
 *
 * <p>The underlying Source is actually a {@link MicrobatchSource} with bounds on read duration
 * and max records, both of which are set here.
 * The read duration bound is determined by {@link SparkPipelineOptions#getReadTimePercentage()}
 * and {@link SparkPipelineOptions#getMinReadTimeMillis()}.
 * The max records bound is controlled by the {@link RateController} mechanism.
 */
class SourceDStream<T, CheckpointMarkT extends UnboundedSource.CheckpointMark>
    extends InputDStream<Tuple2<Source<T>, CheckpointMarkT>> {

  private static final Logger LOG = LoggerFactory.getLogger(SourceDStream.class);

  private final UnboundedSource<T, CheckpointMarkT> unboundedSource;
  private final SparkRuntimeContext runtimeContext;
  private final Duration boundReadDuration;
  // Reader cache interval to expire readers if they haven't been accessed in the last microbatch.
  // The reason we expire readers is that upon executor death/addition, source split ownership can
  // be reshuffled between executors. When this happens we want to close and expire unused readers
  // on the executor in case it regains ownership of the source split in the future - to avoid
  // resuming from an earlier checkpoint.
  private final double readerCacheInterval;
  // The number of partitions for the DStream is final and remains the same throughout the entire
  // lifetime of the pipeline, including when resuming from checkpoint.
  private final int numPartitions;
  // The initial parallelism, set by Spark's backend, is determined once when the job starts.
  // When resuming/recovering from checkpoint, the DStream is reconstructed and this
  // property should not be reset.
  private final int initialParallelism;
  // The bound on max records is optional.
  // If set explicitly via PipelineOptions it takes precedence;
  // otherwise it may be activated via the RateController.
  private final long boundMaxRecords;
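
  // Worked example of the reader cache interval computed in the constructor below
  // (illustrative batch interval, not a default of this codebase): with
  // options.getBatchIntervalMillis() == 1000, readers expire after
  // 1.5 * 1000 = 1500 ms without access - one full batch interval plus a 50%
  // allowance for scheduling latency - so a reader that was not used in the last
  // microbatch is closed rather than kept around with a stale checkpoint.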

  SourceDStream(
      StreamingContext ssc,
      UnboundedSource<T, CheckpointMarkT> unboundedSource,
      SparkRuntimeContext runtimeContext,
      Long boundMaxRecords) {
    super(ssc, JavaSparkContext$.MODULE$.<scala.Tuple2<Source<T>, CheckpointMarkT>>fakeClassTag());
    this.unboundedSource = unboundedSource;
    this.runtimeContext = runtimeContext;

    SparkPipelineOptions options =
        runtimeContext.getPipelineOptions().as(SparkPipelineOptions.class);

    // Reader cache expiration interval. 50% of the batch interval is added to accommodate latency.
    this.readerCacheInterval = 1.5 * options.getBatchIntervalMillis();

    this.boundReadDuration = boundReadDuration(options.getReadTimePercentage(),
        options.getMinReadTimeMillis());
    // Set the initial parallelism once.
    this.initialParallelism = ssc().sparkContext().defaultParallelism();
    checkArgument(this.initialParallelism > 0, "Number of partitions must be greater than zero.");

    this.boundMaxRecords = boundMaxRecords;

    try {
      this.numPartitions =
          createMicrobatchSource()
              .split(options)
              .size();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  @Override
  public scala.Option<RDD<Tuple2<Source<T>, CheckpointMarkT>>> compute(Time validTime) {
    RDD<scala.Tuple2<Source<T>, CheckpointMarkT>> rdd =
        new SourceRDD.Unbounded<>(
            ssc().sparkContext(), runtimeContext, createMicrobatchSource(), numPartitions);
    return scala.Option.apply(rdd);
  }

  private MicrobatchSource<T, CheckpointMarkT> createMicrobatchSource() {
    return new MicrobatchSource<>(unboundedSource, boundReadDuration, initialParallelism,
        computeReadMaxRecords(), -1, id(), readerCacheInterval);
  }

  private long computeReadMaxRecords() {
    if (boundMaxRecords > 0) {
      LOG.info("Max records per batch has been set to {}, as configured in the PipelineOptions.",
          boundMaxRecords);
      return boundMaxRecords;
    } else {
      final scala.Option<Long> rateControlledMax = rateControlledMaxRecords();
      if (rateControlledMax.isDefined()) {
        LOG.info("Max records per batch has been set to {}, as advised by the rate controller.",
            rateControlledMax.get());
        return rateControlledMax.get();
      } else {
        LOG.info("Max records per batch has not been limited by either the configuration "
            + "or the rate controller, and will remain unlimited for the current batch "
            + "({}).", Long.MAX_VALUE);
        return Long.MAX_VALUE;
      }
    }
  }

  @Override
  public void start() { }

  @Override
  public void stop() { }

  @Override
  public String name() {
    return "Beam UnboundedSource [" + id() + "]";
  }

  /**
   * The number of partitions is exposed so clients of {@link SourceDStream} can use it to set
   * appropriate partitioning for operations such as {@link JavaPairDStream#mapWithState}.
   */
  int getNumPartitions() {
    return numPartitions;
  }

  //---- Bound by time.

  // Return the larger of the proportional read time (the percentage of the batch duration
  // dedicated to reading) and the configured minimum read time.
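  // Worked example (illustrative values, not defaults of this codebase): with a
  // 5,000 ms batch duration, readTimePercentage = 0.1 and minReadTimeMillis = 200,
  // the proportional duration is round(5000 * 0.1) = 500 ms; since 500 ms exceeds
  // the 200 ms lower bound, reads are bounded to 500 ms of each batch.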
  private Duration boundReadDuration(double readTimePercentage, long minReadTimeMillis) {
    long batchDurationMillis = ssc().graph().batchDuration().milliseconds();
    Duration proportionalDuration =
        new Duration(Math.round(batchDurationMillis * readTimePercentage));
    Duration lowerBoundDuration = new Duration(minReadTimeMillis);
    Duration readDuration = proportionalDuration.isLongerThan(lowerBoundDuration)
        ? proportionalDuration : lowerBoundDuration;
    LOG.info("Read duration set to: " + readDuration);
    return readDuration;
  }

  //---- Bound by records.

  private scala.Option<Long> rateControlledMaxRecords() {
    final scala.Option<RateController> rateControllerOption = rateController();
    final scala.Option<Long> rateLimitPerBatch;
    final long rateLimitPerSec;
    if (rateControllerOption.isDefined()
        && ((rateLimitPerSec = rateControllerOption.get().getLatestRate()) > 0)) {
      final long batchDurationSec = ssc().graph().batchDuration().milliseconds() / 1000;
      rateLimitPerBatch = scala.Option.apply(rateLimitPerSec * batchDurationSec);
    } else {
      rateLimitPerBatch = scala.Option.empty();
    }
    return rateLimitPerBatch;
  }

  private final RateController rateController =
      new SourceRateController(id(),
          RateEstimator$.MODULE$.create(ssc().conf(), ssc().graph().batchDuration()));

  @Override
  public scala.Option<RateController> rateController() {
    if (RateController$.MODULE$.isBackPressureEnabled(ssc().conf())) {
      return scala.Option.apply(rateController);
    } else {
      return scala.Option.empty();
    }
  }

  private static class SourceRateController extends RateController {

    private SourceRateController(int id, RateEstimator rateEstimator) {
      super(id, rateEstimator);
    }

    // The latest rate is pulled via getLatestRate() in rateControlledMaxRecords(),
    // so there is nothing to publish here.
    @Override
    public void publish(long rate) { }
  }
}
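
// Worked example of the record bound (illustrative values, not defaults of this
// codebase): with Spark back-pressure enabled ("spark.streaming.backpressure.enabled"),
// suppose the rate controller's latest rate is 1,000 records/sec and the batch
// duration is 2,000 ms. Then batchDurationSec = 2000 / 1000 = 2 (integer division,
// so sub-second batches would yield 0) and rateLimitPerBatch = 1000 * 2 = 2,000
// records for the next microbatch. An explicit boundMaxRecords set via
// PipelineOptions always takes precedence over this rate-controlled bound.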