/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.runners.core;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.Iterables;
import org.apache.beam.sdk.metrics.Counter;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.state.TimeDomain;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.util.WindowTracing;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.WindowingStrategy;
import org.joda.time.Instant;
/**
 * A customized {@link DoFnRunner} that handles late data dropping for
 * a {@link KeyedWorkItem} input {@link DoFn}.
 *
 * <p>It expands windows before checking data lateness: every element of the incoming
 * {@link KeyedWorkItem} is turned into one value per window, and each per-window value is
 * dropped if its window has already expired with respect to the input watermark. The timers
 * of the work item are forwarded unchanged.
 *
 * <p>{@link KeyedWorkItem KeyedWorkItems} are always in empty windows.
 *
 * @param <K> key type
 * @param <InputT> input value element type
 * @param <OutputT> output value element type
 * @param <W> window type
 */
public class LateDataDroppingDoFnRunner<K, InputT, OutputT, W extends BoundedWindow>
    implements DoFnRunner<KeyedWorkItem<K, InputT>, KV<K, OutputT>> {

  /** Name of the counter that is bumped once for every per-window value dropped as late. */
  public static final String DROPPED_DUE_TO_LATENESS = "droppedDueToLateness";

  private final DoFnRunner<KeyedWorkItem<K, InputT>, KV<K, OutputT>> doFnRunner;
  private final LateDataFilter lateDataFilter;

  /**
   * Creates a runner that filters late elements before delegating to {@code doFnRunner}.
   *
   * @param doFnRunner the runner that receives the filtered work items
   * @param windowingStrategy used to compute the garbage collection time of each window
   * @param timerInternals source of the current input and output watermarks
   */
  public LateDataDroppingDoFnRunner(
      DoFnRunner<KeyedWorkItem<K, InputT>, KV<K, OutputT>> doFnRunner,
      WindowingStrategy<?, ?> windowingStrategy,
      TimerInternals timerInternals) {
    this.doFnRunner = doFnRunner;
    this.lateDataFilter = new LateDataFilter(windowingStrategy, timerInternals);
  }

  /** Delegates to the wrapped runner. */
  @Override
  public void startBundle() {
    doFnRunner.startBundle();
  }

  /**
   * Rebuilds the incoming work item with its late elements removed (timers are kept as-is)
   * and hands the result to the wrapped runner.
   */
  @Override
  public void processElement(WindowedValue<KeyedWorkItem<K, InputT>> elem) {
    KeyedWorkItem<K, InputT> original = elem.getValue();
    Iterable<WindowedValue<InputT>> nonLateElements =
        lateDataFilter.filter(original.key(), original.elementsIterable());
    KeyedWorkItem<K, InputT> filtered =
        KeyedWorkItems.workItem(original.key(), original.timersIterable(), nonLateElements);
    doFnRunner.processElement(elem.withValue(filtered));
  }

  /** Delegates to the wrapped runner; timers are never filtered here. */
  @Override
  public void onTimer(
      String timerId, BoundedWindow window, Instant timestamp, TimeDomain timeDomain) {
    doFnRunner.onTimer(timerId, window, timestamp, timeDomain);
  }

  /** Delegates to the wrapped runner. */
  @Override
  public void finishBundle() {
    doFnRunner.finishBundle();
  }

  /**
   * It filters late data in a {@link KeyedWorkItem}.
   */
  @VisibleForTesting
  static class LateDataFilter {
    private final WindowingStrategy<?, ?> windowingStrategy;
    private final TimerInternals timerInternals;
    private final Counter droppedDueToLateness;

    /**
     * @param windowingStrategy used to compute the garbage collection time of each window
     * @param timerInternals source of the current input and output watermarks
     */
    public LateDataFilter(
        WindowingStrategy<?, ?> windowingStrategy,
        TimerInternals timerInternals) {
      this.windowingStrategy = windowingStrategy;
      this.timerInternals = timerInternals;
      this.droppedDueToLateness =
          Metrics.counter(LateDataDroppingDoFnRunner.class, DROPPED_DUE_TO_LATENESS);
    }

    /**
     * Returns an {@code Iterable<WindowedValue<InputT>>} that only contains
     * non-late input elements.
     *
     * <p>Each input element is expanded into one value per window. The dropped-element counter
     * and the debug trace are updated exactly once per call, during an eager pass over the
     * expanded values. The returned iterable is a lazy Guava view, so the expiry check is
     * evaluated again, against the then-current input watermark, whenever it is iterated.
     *
     * @param key the key of the work item, used only in the debug trace
     * @param elements the elements of the work item, each possibly in several windows
     */
    public <K, InputT> Iterable<WindowedValue<InputT>> filter(
        final K key, Iterable<WindowedValue<InputT>> elements) {
      Iterable<WindowedValue<InputT>> perWindowElements = expandWindows(elements);

      // Bump the counter separately since we don't want multiple iterations to
      // increase it multiple times.
      for (WindowedValue<InputT> input : perWindowElements) {
        BoundedWindow window = Iterables.getOnlyElement(input.getWindows());
        if (canDropDueToExpiredWindow(window)) {
          // The element is too late for this window.
          droppedDueToLateness.inc();
          WindowTracing.debug(
              "LateDataFilter.filter: Dropping element at {} for key:{}; window:{} "
                  + "since too far behind inputWatermark:{}; outputWatermark:{}",
              input.getTimestamp(), key, window, timerInternals.currentInputWatermarkTime(),
              timerInternals.currentOutputWatermarkTime());
        }
      }

      return Iterables.filter(
          perWindowElements,
          new Predicate<WindowedValue<InputT>>() {
            @Override
            public boolean apply(WindowedValue<InputT> input) {
              BoundedWindow window = Iterables.getOnlyElement(input.getWindows());
              return !canDropDueToExpiredWindow(window);
            }
          });
    }

    /**
     * Returns a lazy view of {@code elements} in which every value appears once per window it
     * belongs to, each copy carrying exactly one window.
     */
    private static <InputT> Iterable<WindowedValue<InputT>> expandWindows(
        Iterable<WindowedValue<InputT>> elements) {
      return Iterables.concat(
          Iterables.transform(
              elements,
              new Function<WindowedValue<InputT>, Iterable<WindowedValue<InputT>>>() {
                @Override
                public Iterable<WindowedValue<InputT>> apply(final WindowedValue<InputT> input) {
                  return Iterables.transform(
                      input.getWindows(),
                      new Function<BoundedWindow, WindowedValue<InputT>>() {
                        @Override
                        public WindowedValue<InputT> apply(BoundedWindow window) {
                          return WindowedValue.of(
                              input.getValue(), input.getTimestamp(), window, input.getPane());
                        }
                      });
                }
              }));
    }

    /** Is {@code window} expired w.r.t. the garbage collection watermark? */
    private boolean canDropDueToExpiredWindow(BoundedWindow window) {
      Instant inputWM = timerInternals.currentInputWatermarkTime();
      // Strictly before: a window whose garbage collection time equals the watermark is kept.
      return LateDataUtils.garbageCollectionTime(window, windowingStrategy).isBefore(inputWM);
    }
  }
}