/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.runners.core;
import static com.google.common.base.Preconditions.checkState;
import com.google.common.annotations.VisibleForTesting;
import java.io.Serializable;
import javax.annotation.Nullable;
import org.apache.beam.sdk.state.ReadableState;
import org.apache.beam.sdk.state.WatermarkHoldState;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.transforms.windowing.PaneInfo.Timing;
import org.apache.beam.sdk.transforms.windowing.TimestampCombiner;
import org.apache.beam.sdk.transforms.windowing.Window.ClosingBehavior;
import org.apache.beam.sdk.util.WindowTracing;
import org.apache.beam.sdk.values.WindowingStrategy;
import org.joda.time.Duration;
import org.joda.time.Instant;
/**
* Implements the logic to hold the output watermark for a computation back
* until it has seen all the elements it needs based on the input watermark for the
* computation.
*
* <p>The backend ensures the output watermark can never progress beyond the
* input watermark for a computation. GroupAlsoByWindows computations may add a 'hold'
* to the output watermark in order to prevent it progressing beyond a time within a window.
* The hold will be 'cleared' when the associated pane is emitted.
*
* <p>This class is only intended for use by {@link ReduceFnRunner}. The two evolve together and
* will likely break any other uses.
*
* @param <W> The kind of {@link BoundedWindow} the hold is for.
*/
class WatermarkHold<W extends BoundedWindow> implements Serializable {
/**
 * Builds the state tag under which the output watermark hold for elements is stored.
 *
 * @param timestampCombiner the combiner the {@code "hold"} watermark state is created with
 * @return the {@code "hold"} watermark tag after being passed through
 *     {@code StateTags.makeSystemTagInternal}
 */
public static <W extends BoundedWindow>
    StateTag<WatermarkHoldState> watermarkHoldTagForTimestampCombiner(
        TimestampCombiner timestampCombiner) {
  final StateTag<WatermarkHoldState> holdTag =
      StateTags.<W>watermarkStateInternal("hold", timestampCombiner);
  return StateTags.<WatermarkHoldState>makeSystemTagInternal(holdTag);
}
/**
* Tag for state containing end-of-window and garbage collection output watermark holds.
* (We can't piggy-back on the data hold state since the timestampCombiner may be
* {@link TimestampCombiner#EARLIEST}, in which case every pane
* would take the end-of-window time as its element time.)
*
* <p>NOTE(review): the combiner named above looks suspect. Element holds are checked to lie at
* or before the end of the window, so an earliest-timestamp combiner would presumably keep the
* element time; a latest-timestamp combiner seems to be the case that is meant. Confirm.
*
* <p>This tag itself is always created with {@link TimestampCombiner#EARLIEST}.
*/
@VisibleForTesting
public static final StateTag<WatermarkHoldState> EXTRA_HOLD_TAG =
StateTags.makeSystemTagInternal(StateTags.watermarkStateInternal(
"extra", TimestampCombiner.EARLIEST));
/** Source of the current input and output watermark times. */
private final TimerInternals timerInternals;

/** Strategy supplying the timestamp combiner, window function and allowed lateness. */
private final WindowingStrategy<?, W> windowingStrategy;

/** Tag of the element hold state, derived from the strategy's timestamp combiner. */
private final StateTag<WatermarkHoldState> elementHoldTag;

/**
 * Creates a hold manager for the given timers and windowing strategy.
 *
 * @param timerInternals queried for the current input and output watermarks
 * @param windowingStrategy strategy whose timestamp combiner determines the element hold tag
 */
public WatermarkHold(TimerInternals timerInternals, WindowingStrategy<?, W> windowingStrategy) {
  final TimestampCombiner combiner = windowingStrategy.getTimestampCombiner();
  this.elementHoldTag = watermarkHoldTagForTimestampCombiner(combiner);
  this.windowingStrategy = windowingStrategy;
  this.timerInternals = timerInternals;
}
/**
* Add a hold to prevent the output watermark progressing beyond the (possibly adjusted) timestamp
* of the element in {@code context}. We allow the actual hold time to be shifted later by the
* {@link TimestampCombiner}, but no further than the end of the window. The hold will
* remain until cleared by {@link #extractAndRelease}. Return the timestamp at which the hold
* was placed, or {@literal null} if no hold was placed.
*
* <p>In the following we'll write {@code E} to represent an element's timestamp after passing
* through the window strategy's output time function, {@code IWM} for the local input watermark,
* {@code OWM} for the local output watermark, and {@code GCWM} for the garbage collection
* watermark (which is at {@code IWM - getAllowedLateness}). Time progresses from left to right,
* and we write {@code [ ... ]} to denote a bounded window with implied lower bound.
*
* <p>Note that the GCWM will be the same as the IWM if {@code getAllowedLateness}
* is {@code ZERO}.
*
* <p>Here are the cases we need to handle. They are conceptually considered in the
* sequence written since if getAllowedLateness is ZERO the GCWM is the same as the IWM.
* <ol>
* <li>(Normal)
* <pre>
*          |
*      [   | E        ]
*          |
*         IWM
* </pre>
* This is, hopefully, the common and happy case. The element is locally on-time and can
* definitely make it to an {@code ON_TIME} pane which we can still set an end-of-window timer
* for. We place an element hold at E, which may contribute to the {@code ON_TIME} pane's
* timestamp (depending on the output time function). Thus the OWM will not proceed past E
* until the next pane fires.
*
* <li>(Discard - no target window)
* <pre>
*                       |                            |
*      [     E        ] |                            |
*                       |                            |
*                     GCWM  <-getAllowedLateness->  IWM
* </pre>
* The element is very locally late. The window has been garbage collected, thus there
* is no target pane E could be assigned to. We discard E.
*
* <li>(Unobservably late)
* <pre>
*          |    |
*      [   | E  |     ]
*          |    |
*         OWM  IWM
* </pre>
* The element is locally late, however we can still treat this case as for 'Normal' above
* since the IWM has not yet passed the end of the window and the element is ahead of the
* OWM. In effect, we get to 'launder' the locally late element and consider it as locally
* on-time because no downstream computation can observe the difference.
*
* <li>(Maybe late 1)
* <pre>
*          |            |
*      [   | E        ] |
*          |            |
*         OWM          IWM
* </pre>
* The end-of-window timer may have already fired for this window, and thus an {@code ON_TIME}
* pane may have already been emitted. However, if timer firings have been delayed then it
* is possible the {@code ON_TIME} pane has not yet been emitted. We can't place an element
* hold since we can't be sure if it will be cleared promptly. Thus this element *may* find
* its way into an {@code ON_TIME} pane, but if so it will *not* contribute to that pane's
* timestamp. We may however set a garbage collection hold if required.
*
* <li>(Maybe late 2)
* <pre>
*               |   |
*      [     E  |   | ]
*               |   |
*              OWM IWM
* </pre>
* The end-of-window timer has not yet fired, so this element may still appear in an
* {@code ON_TIME} pane. However the element is too late to contribute to the output
* watermark hold, and thus won't contribute to the pane's timestamp. We can still place an
* end-of-window hold.
*
* <li>(Maybe late 3)
* <pre>
*               |       |
*      [     E  |     ] |
*               |       |
*              OWM     IWM
* </pre>
* As for the (Maybe late 2) case, however we don't even know if the end-of-window timer
* has already fired, or it is about to fire. We can place only the garbage collection hold,
* if required.
*
* <li>(Definitely late)
* <pre>
*                       |   |
*      [     E        ] |   |
*                       |   |
*                      OWM IWM
* </pre>
* The element is definitely too late to make an {@code ON_TIME} pane. We are too late to
* place an end-of-window hold. We can still place a garbage collection hold if required.
*
* </ol>
*/
@Nullable
public Instant addHolds(ReduceFn<?, ?, ?, W>.ProcessValueContext context) {
  // An element hold is preferred; it is only skipped when the element arrived too late.
  final Instant elementHold = addElementHold(context);
  if (elementHold != null) {
    return elementHold;
  }
  // Otherwise fall back to an end-of-window hold or, failing that, a garbage collection hold.
  return addEndOfWindowOrGarbageCollectionHolds(context, false /*paneIsEmpty*/);
}
/**
 * Maps {@code timestamp} through the window function's output time and the strategy's
 * {@link TimestampCombiner}, and returns the result.
 *
 * <p>The result is validated: it may not be earlier than {@code timestamp}, and if
 * {@code timestamp} was not already past the end of {@code window} the result may not be
 * past it either.
 *
 * @throws IllegalStateException if either validation fails
 */
private Instant shift(Instant timestamp, W window) {
  final TimestampCombiner combiner = windowingStrategy.getTimestampCombiner();
  final Instant shifted =
      combiner.assign(
          window, windowingStrategy.getWindowFn().getOutputTime(timestamp, window));

  // The hold may only ever move forward in time.
  final boolean movedEarlier = shifted.isBefore(timestamp);
  checkState(
      !movedEarlier,
      "TimestampCombiner moved element from %s to earlier time %s for window %s",
      BoundedWindow.formatTimestamp(timestamp),
      BoundedWindow.formatTimestamp(shifted),
      window);

  // An element that started inside the window must not be pushed beyond the window's end.
  final boolean startedInsideWindow = !timestamp.isAfter(window.maxTimestamp());
  final boolean pushedPastWindowEnd =
      startedInsideWindow && shifted.isAfter(window.maxTimestamp());
  checkState(
      !pushedPastWindowEnd,
      "TimestampCombiner moved element from %s to %s which is beyond end of "
          + "window %s",
      timestamp,
      shifted,
      window);

  return shifted;
}
/**
 * Attempt to add an 'element hold'. Return the {@link Instant} at which the hold was
 * added (ie the element timestamp plus any forward shift requested by the
 * {@link WindowingStrategy#getTimestampCombiner}), or {@literal null} if no hold was added.
 *
 * <p>The hold is only added if both:
 * <ol>
 * <li>The backend will be able to respect it. In other words the output watermark cannot
 * be ahead of the proposed hold time.
 * <li>A timer will be set (by {@link ReduceFnRunner}) to clear the hold by the end of the
 * window. In other words the input watermark cannot be ahead of the end of the window.
 * </ol>
 *
 * <p>The hold ensures the pane which incorporates the element will not be considered late by
 * any downstream computation when it is eventually emitted.
 */
@Nullable
private Instant addElementHold(ReduceFn<?, ?, ?, W>.ProcessValueContext context) {
  // Give the window function a chance to move the hold timestamp forward to encourage progress.
  // (A later hold implies less impediment to the output watermark making progress, which in
  // turn encourages end-of-window triggers to fire earlier in following computations.)
  final Instant elementHold = shift(context.timestamp(), context.window());
  final Instant outputWM = timerInternals.currentOutputWatermarkTime();
  final Instant inputWM = timerInternals.currentInputWatermarkTime();

  final String outcome;
  // Stays null unless the hold is actually written to state below.
  @Nullable Instant placedHold = null;

  // TODO: These case labels could be tightened.
  // See the case analysis in addHolds above for the motivation.
  if (outputWM != null && elementHold.isBefore(outputWM)) {
    outcome = "too late to effect output watermark";
  } else if (context.window().maxTimestamp().isBefore(inputWM)) {
    outcome = "too late for end-of-window timer";
  } else {
    checkState(
        !elementHold.isAfter(BoundedWindow.TIMESTAMP_MAX_VALUE),
        "Element hold %s is beyond end-of-time",
        elementHold);
    context.state().access(elementHoldTag).add(elementHold);
    outcome = "on time";
    placedHold = elementHold;
  }

  WindowTracing.trace(
      "WatermarkHold.addHolds: element hold at {} is {} for "
          + "key:{}; window:{}; inputWatermark:{}; outputWatermark:{}",
      elementHold,
      outcome,
      context.key(),
      context.window(),
      inputWM,
      outputWM);

  return placedHold;
}
/**
 * Tries an end-of-window hold first and, only if that could not be placed, a garbage
 * collection hold.
 *
 * @return the {@link Instant} of whichever hold was added, or {@literal null} if neither was
 */
@Nullable
private Instant addEndOfWindowOrGarbageCollectionHolds(
    ReduceFn<?, ?, ?, W>.Context context, boolean paneIsEmpty) {
  final Instant endOfWindowHold = addEndOfWindowHold(context, paneIsEmpty);
  if (endOfWindowHold != null) {
    return endOfWindowHold;
  }
  return addGarbageCollectionHold(context, paneIsEmpty);
}
/**
 * Attempt to add an 'end-of-window hold'. Return the {@link Instant} at which the hold was added
 * (ie the end of window time), or {@literal null} if no end of window hold is possible and we
 * should fall back to a garbage collection hold.
 *
 * <p>We only add the hold if we can be sure a timer will be set (by {@link ReduceFnRunner})
 * to clear it. In other words, the input watermark cannot be ahead of the end of window time.
 *
 * <p>An end-of-window hold is added in two situations:
 * <ol>
 * <li>An incoming element came in behind the output watermark (so we are too late for placing
 * the usual element hold), but it may still be possible to include the element in an
 * {@link Timing#ON_TIME} pane. We place the end of window hold to ensure that pane will
 * not be considered late by any downstream computation.
 * <li>We guarantee an {@link Timing#ON_TIME} pane will be emitted for all windows which saw at
 * least one element, even if that {@link Timing#ON_TIME} pane is empty. Thus when elements in
 * a pane are processed due to a fired trigger we must set both an end of window timer and an
 * end of window hold. Again, the hold ensures the {@link Timing#ON_TIME} pane will not be
 * considered late by any downstream computation.
 * </ol>
 *
 * <p>{@code paneIsEmpty} is not consulted here; the hold always goes to {@link #EXTRA_HOLD_TAG}.
 *
 * @throws IllegalStateException if the end of window is before the output watermark or beyond
 *     {@link BoundedWindow#TIMESTAMP_MAX_VALUE}
 */
@Nullable
private Instant addEndOfWindowHold(ReduceFn<?, ?, ?, W>.Context context, boolean paneIsEmpty) {
  final Instant outputWatermark = timerInternals.currentOutputWatermarkTime();
  final Instant inputWatermark = timerInternals.currentInputWatermarkTime();
  final Instant endOfWindow = context.window().maxTimestamp();

  final boolean inputPassedEndOfWindow = endOfWindow.isBefore(inputWatermark);
  if (inputPassedEndOfWindow) {
    WindowTracing.trace(
        "WatermarkHold.addEndOfWindowHold: end-of-window hold at {} is too late for "
            + "end-of-window timer for key:{}; window:{}; inputWatermark:{}; outputWatermark:{}",
        endOfWindow,
        context.key(),
        context.window(),
        inputWatermark,
        outputWatermark);
    return null;
  }

  final boolean notBehindOutput =
      outputWatermark == null || !endOfWindow.isBefore(outputWatermark);
  checkState(
      notBehindOutput,
      "End-of-window hold %s cannot be before output watermark %s",
      endOfWindow,
      outputWatermark);
  checkState(
      !endOfWindow.isAfter(BoundedWindow.TIMESTAMP_MAX_VALUE),
      "End-of-window hold %s is beyond end-of-time",
      endOfWindow);

  // If paneIsEmpty then this hold is just for empty ON_TIME panes, so we want to keep
  // the hold away from the combining function in elementHoldTag.
  // However if !paneIsEmpty then it could make sense to use the elementHoldTag here.
  // Alas, onMerge is forced to add an end of window or garbage collection hold without
  // knowing whether an element hold is already in place (stopping to check is too expensive).
  // Thus it would end up adding an element hold at the end of the window which could
  // upset the elementHoldTag combining function.
  context.state().access(EXTRA_HOLD_TAG).add(endOfWindow);

  WindowTracing.trace(
      "WatermarkHold.addEndOfWindowHold: end-of-window hold at {} is on time for "
          + "key:{}; window:{}; inputWatermark:{}; outputWatermark:{}",
      endOfWindow,
      context.key(),
      context.window(),
      inputWatermark,
      outputWatermark);
  return endOfWindow;
}
/**
 * Attempt to add a 'garbage collection hold' if it is required. Return the {@link Instant} at
 * which the hold was added (ie the end of window time plus allowed lateness, truncated to just
 * below {@link BoundedWindow#TIMESTAMP_MAX_VALUE} when it would reach or exceed that value),
 * or {@literal null} if no hold was added.
 *
 * <p>We only add the hold if it is distinct from what would be added by
 * {@link #addEndOfWindowHold}. In other words, {@link WindowingStrategy#getAllowedLateness}
 * must be non-zero.
 *
 * <p>A garbage collection hold is added in two situations:
 * <ol>
 * <li>An incoming element came in behind the output watermark, and was too late for placing
 * the usual element hold or an end of window hold. Place the garbage collection hold so that
 * we can guarantee when the pane is finally triggered its output will not be dropped due to
 * excessive lateness by any downstream computation.
 * <li>The {@link WindowingStrategy#getClosingBehavior()} is
 * {@link ClosingBehavior#FIRE_ALWAYS}, and thus we guarantee a final pane will be emitted
 * for all windows which saw at least one element. Again, the garbage collection hold guarantees
 * that any empty final pane can be given a timestamp which will not be considered beyond
 * allowed lateness by any downstream computation.
 * </ol>
 *
 * <p>We use {@code paneIsEmpty} to distinguish cases 1 and 2.
 *
 * @throws IllegalStateException if the hold would be before the input watermark
 */
@Nullable
private Instant addGarbageCollectionHold(
    ReduceFn<?, ?, ?, W>.Context context, boolean paneIsEmpty) {
  final Instant outputWatermark = timerInternals.currentOutputWatermarkTime();
  final Instant inputWatermark = timerInternals.currentInputWatermarkTime();
  final Instant requestedGcTime =
      LateDataUtils.garbageCollectionTime(context.window(), windowingStrategy);

  final boolean hasAllowedLateness =
      windowingStrategy.getAllowedLateness().isLongerThan(Duration.ZERO);
  if (!hasAllowedLateness) {
    WindowTracing.trace(
        "WatermarkHold.addGarbageCollectionHold: garbage collection hold at {} is unnecessary "
            + "since no allowed lateness for key:{}; window:{}; inputWatermark:{}; "
            + "outputWatermark:{}",
        requestedGcTime,
        context.key(),
        context.window(),
        inputWatermark,
        outputWatermark);
    return null;
  }

  final boolean emptyPaneWillNotFire =
      paneIsEmpty
          && context.windowingStrategy().getClosingBehavior()
              == ClosingBehavior.FIRE_IF_NON_EMPTY;
  if (emptyPaneWillNotFire) {
    WindowTracing.trace(
        "WatermarkHold.addGarbageCollectionHold: garbage collection hold at {} is unnecessary "
            + "since empty pane and FIRE_IF_NON_EMPTY for key:{}; window:{}; inputWatermark:{}; "
            + "outputWatermark:{}",
        requestedGcTime,
        context.key(),
        context.window(),
        inputWatermark,
        outputWatermark);
    return null;
  }

  // If the garbage collection hold is past the timestamp we can represent, instead truncate
  // to the maximum timestamp that is not positive infinity. This ensures all windows will
  // eventually be garbage collected.
  final Instant gcHold =
      requestedGcTime.isBefore(BoundedWindow.TIMESTAMP_MAX_VALUE)
          ? requestedGcTime
          : BoundedWindow.TIMESTAMP_MAX_VALUE.minus(Duration.millis(1L));

  checkState(
      !gcHold.isBefore(inputWatermark),
      "Garbage collection hold %s cannot be before input watermark %s",
      gcHold,
      inputWatermark);
  checkState(
      !gcHold.isAfter(BoundedWindow.TIMESTAMP_MAX_VALUE),
      "Garbage collection hold %s is beyond end-of-time",
      gcHold);

  // Stored under EXTRA_HOLD_TAG rather than elementHoldTag, for the same reason as the
  // end-of-window hold: it must stay away from the element hold's combining function.
  context.state().access(EXTRA_HOLD_TAG).add(gcHold);

  WindowTracing.trace(
      "WatermarkHold.addGarbageCollectionHold: garbage collection hold at {} is on time for "
          + "key:{}; window:{}; inputWatermark:{}; outputWatermark:{}",
      gcHold,
      context.key(),
      context.window(),
      inputWatermark,
      outputWatermark);
  return gcHold;
}
/**
* Prefetch watermark holds in preparation for merging.
*
* <p>Only the element hold state is prefetched here; the extra hold tag is not touched by
* this method.
*
* @param state the merging state accessor handed to {@code StateMerging.prefetchWatermarks}
*/
public void prefetchOnMerge(MergingStateAccessor<?, W> state) {
StateMerging.prefetchWatermarks(state, elementHoldTag);
}
/**
* Updates the watermark hold when windows merge if it is possible the merged value does
* not equal all of the existing holds. For example, if the new window implies a later
* watermark hold, then earlier holds may be released.
*
* <p>In order, this: traces the merge with the current watermarks, merges the element hold
* state via {@code StateMerging.mergeWatermarks}, clears any extra hold, and then adds a fresh
* end-of-window or garbage collection hold for {@code context.window()} with
* {@code paneIsEmpty} set to {@code false}.
*
* @param context the merge context supplying the key, window and state being merged
*/
public void onMerge(ReduceFn<?, ?, ?, W>.OnMergeContext context) {
WindowTracing.debug("WatermarkHold.onMerge: for key:{}; window:{}; inputWatermark:{}; "
+ "outputWatermark:{}",
context.key(), context.window(), timerInternals.currentInputWatermarkTime(),
timerInternals.currentOutputWatermarkTime());
// Merge the element holds of the windows being merged.
StateMerging.mergeWatermarks(context.state(), elementHoldTag, context.window());
// If we had a cheap way to determine if we have an element hold then we could
// avoid adding an unnecessary end-of-window or garbage collection hold.
// Simply reading the above merged watermark would impose an additional read for the
// common case that the active window has just one underlying state address window and
// the hold depends on the min of the element timestamps.
// At least one merged window must be non-empty for the merge to have been triggered.
// Drop any previous extra hold before re-adding one below.
StateMerging.clear(context.state(), EXTRA_HOLD_TAG);
// The pane is treated as non-empty (see the note above about merged windows).
addEndOfWindowOrGarbageCollectionHolds(context, false /*paneIsEmpty*/);
}
/**
* Result of {@link #extractAndRelease}: the hold that was in place before the holds were
* cleared, together with any hold that was added afterwards.
*/
public static class OldAndNewHolds {
// The earliest hold read before clearing; extractAndRelease replaces a missing or
// past-end-of-window value with the end of the window, so it is never null there.
public final Instant oldHold;
// The end-of-window or garbage collection hold added after clearing, or null if none was
// added (for example because the window is finished).
@Nullable
public final Instant newHold;
/**
* Creates a result pair.
*
* @param oldHold the hold that applied before clearing
* @param newHold the hold added after clearing, or {@code null} if none
*/
public OldAndNewHolds(Instant oldHold, @Nullable Instant newHold) {
this.oldHold = oldHold;
this.newHold = newHold;
}
}
/**
 * Requests that both the element hold state and the extra hold state for {@code context} be
 * read later, ahead of a call to {@link #extractAndRelease}.
 */
public void prefetchExtract(final ReduceFn<?, ?, ?, W>.Context context) {
  final WatermarkHoldState elementHolds = context.state().access(elementHoldTag);
  elementHolds.readLater();
  final WatermarkHoldState extraHolds = context.state().access(EXTRA_HOLD_TAG);
  extraHolds.readLater();
}
/**
 * Return (a future for) the earliest hold for {@code context}. Clear all the holds after
 * reading, but add/restore an end-of-window or garbage collection hold if required.
 *
 * <p>The returned timestamp is the output timestamp according to the {@link TimestampCombiner}
 * from the windowing strategy of this {@link WatermarkHold}, combined across all the non-late
 * elements in the current pane. If there is no such value the timestamp is the end
 * of the window.
 *
 * <p>The state cells are accessed and the call is traced immediately; reading, clearing and
 * re-adding of holds happen only when the returned state's {@code read()} is invoked.
 *
 * @param context the context supplying the key, window and state
 * @param isFinished if {@code true}, no replacement hold is added after clearing
 */
public ReadableState<OldAndNewHolds> extractAndRelease(
    final ReduceFn<?, ?, ?, W>.Context context, final boolean isFinished) {
  WindowTracing.debug(
      "WatermarkHold.extractAndRelease: for key:{}; window:{}; inputWatermark:{}; "
          + "outputWatermark:{}",
      context.key(),
      context.window(),
      timerInternals.currentInputWatermarkTime(),
      timerInternals.currentOutputWatermarkTime());

  final WatermarkHoldState elementHoldState = context.state().access(elementHoldTag);
  final WatermarkHoldState extraHoldState = context.state().access(EXTRA_HOLD_TAG);

  return new ReadableState<OldAndNewHolds>() {
    @Override
    public ReadableState<OldAndNewHolds> readLater() {
      elementHoldState.readLater();
      extraHoldState.readLater();
      return this;
    }

    @Override
    public OldAndNewHolds read() {
      // Read both the element and extra holds.
      final Instant elementHold = elementHoldState.read();
      final Instant extraHold = extraHoldState.read();

      // Take the earlier of the two, treating null as "no hold". On a tie the extra hold wins.
      final boolean elementHoldIsEarliest =
          elementHold != null && (extraHold == null || elementHold.isBefore(extraHold));
      Instant oldHold = elementHoldIsEarliest ? elementHold : extraHold;

      if (oldHold == null || oldHold.isAfter(context.window().maxTimestamp())) {
        // If no hold (eg because all elements came in behind the output watermark), or
        // the hold was for garbage collection, take the end of window as the result.
        WindowTracing.debug(
            "WatermarkHold.extractAndRelease.read: clipping from {} to end of window "
                + "for key:{}; window:{}",
            oldHold,
            context.key(),
            context.window());
        oldHold = context.window().maxTimestamp();
      }

      WindowTracing.debug(
          "WatermarkHold.extractAndRelease.read: clearing for key:{}; window:{}",
          context.key(),
          context.window());

      // Clear the underlying state to allow the output watermark to progress.
      elementHoldState.clear();
      extraHoldState.clear();

      // Only need to leave behind an end-of-window or garbage collection hold
      // if future elements will be processed.
      @Nullable
      final Instant newHold =
          isFinished
              ? null
              : addEndOfWindowOrGarbageCollectionHolds(context, true /*paneIsEmpty*/);

      return new OldAndNewHolds(oldHold, newHold);
    }
  };
}
/**
 * Clears both the element hold state and the extra hold state for {@code context}, after
 * tracing the call together with the current input and output watermarks.
 */
public void clearHolds(ReduceFn<?, ?, ?, W>.Context context) {
  WindowTracing.debug(
      "WatermarkHold.clearHolds: For key:{}; window:{}; inputWatermark:{}; outputWatermark:{}",
      context.key(),
      context.window(),
      timerInternals.currentInputWatermarkTime(),
      timerInternals.currentOutputWatermarkTime());

  final WatermarkHoldState elementHolds = context.state().access(elementHoldTag);
  elementHolds.clear();
  final WatermarkHoldState extraHolds = context.state().access(EXTRA_HOLD_TAG);
  extraHolds.clear();
}
/**
 * Reads the current element (data) hold without clearing it. For debugging only.
 *
 * @return the value read from the element hold state, or {@code null} if none
 */
@Nullable
public Instant getDataCurrent(ReduceFn<?, ?, ?, W>.Context context) {
  final WatermarkHoldState elementHolds = context.state().access(elementHoldTag);
  return elementHolds.read();
}
}