/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.runners.spark.io;
import static com.google.common.base.Preconditions.checkArgument;

import com.google.common.collect.Lists;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import org.apache.beam.runners.spark.util.GlobalWatermarkHolder.SparkWatermarks;
import org.apache.beam.sdk.coders.CannotProvideCoderException;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TimestampedValue;
import org.apache.beam.sdk.values.WindowingStrategy;
import org.joda.time.Duration;
import org.joda.time.Instant;
/**
* Create an input stream from Queue. For SparkRunner tests only.
*
* <p>To properly compose a stream of micro-batches with their Watermarks, please keep in mind
 * that there are eventually two queues here - one for batches and another for Watermarks.
*
* <p>While both queues advance according to Spark's batch-interval, there is a slight difference
* in how data is pushed into the stream compared to the advancement of Watermarks since Watermarks
 * advance on the onBatchCompleted hook call, so if you want to set the watermark advance for a
 * specific batch it should be called before that batch.
* Also keep in mind that being a queue that is polled per batch interval, if there is a need to
 * "hold" the same Watermark without advancing it, this should be stated explicitly or the Watermark
* will advance as soon as it can (in the next batch completed hook).
*
* <p>Example 1:
*
* {@code
* CreateStream.<TimestampedValue<String>>withBatchInterval(batchDuration)
* .nextBatch(
* TimestampedValue.of("foo", endOfGlobalWindow),
* TimestampedValue.of("bar", endOfGlobalWindow))
* .advanceNextBatchWatermarkToInfinity();
* }
* The first batch will see the default start-of-time WM of
* {@link BoundedWindow#TIMESTAMP_MIN_VALUE} and any following batch will see
* the end-of-time WM {@link BoundedWindow#TIMESTAMP_MAX_VALUE}.
*
* <p>Example 2:
*
* {@code
* CreateStream.<TimestampedValue<String>>withBatchInterval(batchDuration)
* .nextBatch(
* TimestampedValue.of(1, instant))
* .advanceWatermarkForNextBatch(instant.plus(Duration.standardMinutes(20)))
* .nextBatch(
* TimestampedValue.of(2, instant))
* .nextBatch(
* TimestampedValue.of(3, instant))
* .advanceWatermarkForNextBatch(instant.plus(Duration.standardMinutes(30)))
* }
* The first batch will see the start-of-time WM and the second will see the advanced (+20 min.) WM.
* The third WM will see the WM advanced to +30 min, because this is the next advancement of the WM
 * regardless of where it was called in the construction of CreateStream.
* //TODO: write a proper Builder enforcing all those rules mentioned.
* @param <T> stream type.
*/
public final class CreateStream<T> extends PTransform<PBegin, PCollection<T>> {

  private final Duration batchInterval;
  // FIFO queue of micro-batches: stream input order follows population order.
  private final Queue<Iterable<TimestampedValue<T>>> batches = new ArrayDeque<>();
  // Watermarks queued for upcoming batches; peekLast() chains synchronized processing times.
  private final Deque<SparkWatermarks> times = new ArrayDeque<>();
  private final Coder<T> coder;
  private Instant initialSystemTime;
  // The most recently enqueued watermark; enforces that watermarks never decrease.
  private Instant lowWatermark = BoundedWindow.TIMESTAMP_MIN_VALUE; //for test purposes.

  private CreateStream(Duration batchInterval, Instant initialSystemTime, Coder<T> coder) {
    this.batchInterval = batchInterval;
    this.initialSystemTime = initialSystemTime;
    this.coder = coder;
  }

  /**
   * Creates a new {@link CreateStream} with the given output coder and batch interval,
   * starting at synchronized processing time zero.
   *
   * @param coder the {@link Coder} of the stream's elements.
   * @param batchInterval the expected Spark batch interval the stream advances by.
   */
  public static <T> CreateStream<T> of(Coder<T> coder, Duration batchInterval) {
    return new CreateStream<>(batchInterval, new Instant(0), coder);
  }

  /**
   * Enqueue next micro-batch elements.
   * This is backed by a {@link Queue} so stream input order would keep the population order (FIFO).
   *
   * @throws IllegalArgumentException if any element's timestamp is at or beyond
   *     {@link BoundedWindow#TIMESTAMP_MAX_VALUE}, which is reserved for the end-of-time watermark.
   */
  @SafeVarargs
  public final CreateStream<T> nextBatch(TimestampedValue<T>... batchElements) {
    // Validate timestamps of the timestamped elements (no raw-type cast needed —
    // the varargs parameter is already strongly typed).
    for (TimestampedValue<T> element : batchElements) {
      checkArgument(
          element.getTimestamp().isBefore(BoundedWindow.TIMESTAMP_MAX_VALUE),
          "Elements must have timestamps before %s. Got: %s",
          BoundedWindow.TIMESTAMP_MAX_VALUE,
          element.getTimestamp());
    }
    batches.offer(Arrays.asList(batchElements));
    return this;
  }

  /**
   * Enqueue next micro-batch of non-timestamped elements; each element is wrapped as a
   * {@link TimestampedValue} at the minimum timestamp.
   */
  @SafeVarargs
  public final CreateStream<T> nextBatch(T... batchElements) {
    List<TimestampedValue<T>> timestamped = new ArrayList<>(batchElements.length);
    // as TimestampedValue.
    for (T element : batchElements) {
      timestamped.add(TimestampedValue.atMinimumTimestamp(element));
    }
    batches.offer(timestamped);
    return this;
  }

  /**
   * Adds an empty batch.
   */
  public CreateStream<T> emptyBatch() {
    batches.offer(Collections.<TimestampedValue<T>>emptyList());
    return this;
  }

  /** Set the initial synchronized processing time. */
  public CreateStream<T> initialSystemTimeAt(Instant initialSystemTime) {
    this.initialSystemTime = initialSystemTime;
    return this;
  }

  /**
   * Advances the watermark in the next batch.
   *
   * @throws IllegalArgumentException if the watermark would decrease, or would reach
   *     {@link BoundedWindow#TIMESTAMP_MAX_VALUE} (use
   *     {@link #advanceNextBatchWatermarkToInfinity()} for that).
   */
  public CreateStream<T> advanceWatermarkForNextBatch(Instant newWatermark) {
    checkArgument(
        !newWatermark.isBefore(lowWatermark), "The watermark is not allowed to decrease!");
    checkArgument(
        newWatermark.isBefore(BoundedWindow.TIMESTAMP_MAX_VALUE),
        "The Watermark cannot progress beyond the maximum. Got: %s. Maximum: %s",
        newWatermark,
        BoundedWindow.TIMESTAMP_MAX_VALUE);
    return advance(newWatermark);
  }

  /**
   * Advances the watermark in the next batch to the end-of-time.
   */
  public CreateStream<T> advanceNextBatchWatermarkToInfinity() {
    return advance(BoundedWindow.TIMESTAMP_MAX_VALUE);
  }

  /**
   * Enqueues a {@link SparkWatermarks} for the next batch: low watermark is the previously set
   * watermark, high watermark is {@code newWatermark}, and the synchronized processing time
   * advances by one batch interval from the last enqueued time (or the initial system time).
   */
  private CreateStream<T> advance(Instant newWatermark) {
    // advance the system time.
    Instant currentSynchronizedProcessingTime = times.peekLast() == null ? initialSystemTime
        : times.peekLast().getSynchronizedProcessingTime();
    Instant nextSynchronizedProcessingTime = currentSynchronizedProcessingTime.plus(batchInterval);
    checkArgument(
        nextSynchronizedProcessingTime.isAfter(currentSynchronizedProcessingTime),
        "Synchronized processing time must always advance.");
    times.offer(new SparkWatermarks(lowWatermark, newWatermark, nextSynchronizedProcessingTime));
    lowWatermark = newWatermark;
    return this;
  }

  /** Get the underlying queue representing the mock stream of micro-batches. */
  public Queue<Iterable<TimestampedValue<T>>> getBatches() {
    return batches;
  }

  /**
   * Get times so they can be pushed into the
   * {@link org.apache.beam.runners.spark.util.GlobalWatermarkHolder}.
   */
  public Queue<SparkWatermarks> getTimes() {
    return times;
  }

  @Override
  public PCollection<T> expand(PBegin input) {
    return PCollection.createPrimitiveOutputInternal(
        input.getPipeline(), WindowingStrategy.globalDefault(), PCollection.IsBounded.UNBOUNDED);
  }

  @Override
  protected Coder<T> getDefaultOutputCoder() throws CannotProvideCoderException {
    return coder;
  }
}