/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.examples.cookbook; import com.google.api.services.bigquery.model.TableFieldSchema; import com.google.api.services.bigquery.model.TableReference; import com.google.api.services.bigquery.model.TableRow; import com.google.api.services.bigquery.model.TableSchema; import java.util.ArrayList; import java.util.List; import java.util.concurrent.TimeUnit; import org.apache.beam.examples.common.ExampleBigQueryTableOptions; import org.apache.beam.examples.common.ExampleOptions; import org.apache.beam.examples.common.ExampleUtils; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.io.TextIO; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.Description; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.StreamingOptions; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.GroupByKey; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.windowing.AfterEach; import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime; import org.apache.beam.sdk.transforms.windowing.AfterWatermark; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; import org.apache.beam.sdk.transforms.windowing.FixedWindows; import org.apache.beam.sdk.transforms.windowing.Repeatedly; import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionList; import org.joda.time.Duration; import org.joda.time.Instant; /** * This example illustrates the basic concepts behind triggering. It shows how to use different * trigger definitions to produce partial (speculative) results before all the data is processed and * to control when updated results are produced for late data. The example performs a streaming * analysis of the data coming in from a text file and writes the results to BigQuery. It divides * the data into {@link Window windows} to be processed, and demonstrates using various kinds of * {@link org.apache.beam.sdk.transforms.windowing.Trigger triggers} to control when the results for * each window are emitted. * * <p>This example uses a portion of real traffic data from San Diego freeways. It contains * readings from sensor stations set up along each freeway. Each sensor reading includes a * calculation of the 'total flow' across all lanes in that freeway direction. * * <p>Concepts: * <pre> * 1. The default triggering behavior * 2. Late data with the default trigger * 3. How to get speculative estimates * 4. Combining late data and speculative estimates * </pre> * * <p>Before running this example, it will be useful to familiarize yourself with Beam triggers * and understand the concept of 'late data', * See: <a href="https://beam.apache.org/documentation/programming-guide/#triggers"> * https://beam.apache.org/documentation/programming-guide/#triggers</a> * * <p>The example is configured to use the default BigQuery table from the example common package * (there are no defaults for a general Beam pipeline). * You can override them by using the {@code --bigQueryDataset}, and {@code --bigQueryTable} * options. If the BigQuery table do not exist, the example will try to create them. * * <p>The pipeline outputs its results to a BigQuery table. * Here are some queries you can use to see interesting results: * Replace {@code <enter_table_name>} in the query below with the name of the BigQuery table. * Replace {@code <enter_window_interval>} in the query below with the window interval. * * <p>To see the results of the default trigger, * Note: When you start up your pipeline, you'll initially see results from 'late' data. Wait after * the window duration, until the first pane of non-late data has been emitted, to see more * interesting results. * {@code SELECT * FROM enter_table_name WHERE trigger_type = "default" ORDER BY window DESC} * * <p>To see the late data i.e. dropped by the default trigger, * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "withAllowedLateness" and * (timing = "LATE" or timing = "ON_TIME") and freeway = "5" ORDER BY window DESC, processing_time} * * <p>To see the the difference between accumulation mode and discarding mode, * {@code SELECT * FROM <enter_table_name> WHERE (timing = "LATE" or timing = "ON_TIME") AND * (trigger_type = "withAllowedLateness" or trigger_type = "sequential") and freeway = "5" ORDER BY * window DESC, processing_time} * * <p>To see speculative results every minute, * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "speculative" and freeway = "5" * ORDER BY window DESC, processing_time} * * <p>To see speculative results every five minutes after the end of the window * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "sequential" and timing != "EARLY" * and freeway = "5" ORDER BY window DESC, processing_time} * * <p>To see the first and the last pane for a freeway in a window for all the trigger types, * {@code SELECT * FROM <enter_table_name> WHERE (isFirst = true or isLast = true) ORDER BY window} * * <p>To reduce the number of results for each query we can add additional where clauses. * For examples, To see the results of the default trigger, * {@code SELECT * FROM <enter_table_name> WHERE trigger_type = "default" AND freeway = "5" AND * window = "<enter_window_interval>"} * * <p>The example will try to cancel the pipelines on the signal to terminate the process (CTRL-C) * and then exits. */ public class TriggerExample { //Numeric value of fixed window duration, in minutes public static final int WINDOW_DURATION = 30; // Constants used in triggers. // Speeding up ONE_MINUTE or FIVE_MINUTES helps you get an early approximation of results. // ONE_MINUTE is used only with processing time before the end of the window public static final Duration ONE_MINUTE = Duration.standardMinutes(1); // FIVE_MINUTES is used only with processing time after the end of the window public static final Duration FIVE_MINUTES = Duration.standardMinutes(5); // ONE_DAY is used to specify the amount of lateness allowed for the data elements. public static final Duration ONE_DAY = Duration.standardDays(1); /** * This transform demonstrates using triggers to control when data is produced for each window * Consider an example to understand the results generated by each type of trigger. * The example uses "freeway" as the key. Event time is the timestamp associated with the data * element and processing time is the time when the data element gets processed in the pipeline. * For freeway 5, suppose there are 10 elements in the [10:00:00, 10:30:00) window. * Key (freeway) | Value (total_flow) | event time | processing time * 5 | 50 | 10:00:03 | 10:00:47 * 5 | 30 | 10:01:00 | 10:01:03 * 5 | 30 | 10:02:00 | 11:07:00 * 5 | 20 | 10:04:10 | 10:05:15 * 5 | 60 | 10:05:00 | 11:03:00 * 5 | 20 | 10:05:01 | 11.07:30 * 5 | 60 | 10:15:00 | 10:27:15 * 5 | 40 | 10:26:40 | 10:26:43 * 5 | 60 | 10:27:20 | 10:27:25 * 5 | 60 | 10:29:00 | 11:11:00 * * <p>Beam tracks a watermark which records up to what point in event time the data is * complete. For the purposes of the example, we'll assume the watermark is approximately 15m * behind the current processing time. In practice, the actual value would vary over time based * on the systems knowledge of the current delay and contents of the backlog (data * that has not yet been processed). * * <p>If the watermark is 15m behind, then the window [10:00:00, 10:30:00) (in event time) would * close at 10:44:59, when the watermark passes 10:30:00. */ static class CalculateTotalFlow extends PTransform <PCollection<KV<String, Integer>>, PCollectionList<TableRow>> { private int windowDuration; CalculateTotalFlow(int windowDuration) { this.windowDuration = windowDuration; } @Override public PCollectionList<TableRow> expand(PCollection<KV<String, Integer>> flowInfo) { // Concept #1: The default triggering behavior // By default Beam uses a trigger which fires when the watermark has passed the end of the // window. This would be written {@code Repeatedly.forever(AfterWatermark.pastEndOfWindow())}. // The system also defaults to dropping late data -- data which arrives after the watermark // has passed the event timestamp of the arriving element. This means that the default trigger // will only fire once. // Each pane produced by the default trigger with no allowed lateness will be the first and // last pane in the window, and will be ON_TIME. // The results for the example above with the default trigger and zero allowed lateness // would be: // Key (freeway) | Value (total_flow) | number_of_records | isFirst | isLast | timing // 5 | 260 | 6 | true | true | ON_TIME // At 11:03:00 (processing time) the system watermark may have advanced to 10:54:00. As a // result, when the data record with event time 10:05:00 arrives at 11:03:00, it is considered // late, and dropped. PCollection<TableRow> defaultTriggerResults = flowInfo .apply("Default", Window // The default window duration values work well if you're running the default input // file. You may want to adjust the window duration otherwise. .<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(windowDuration))) // The default trigger first emits output when the system's watermark passes the end // of the window. .triggering(Repeatedly.forever(AfterWatermark.pastEndOfWindow())) // Late data is dropped .withAllowedLateness(Duration.ZERO) // Discard elements after emitting each pane. // With no allowed lateness and the specified trigger there will only be a single // pane, so this doesn't have a noticeable effect. See concept 2 for more details. .discardingFiredPanes()) .apply(new TotalFlow("default")); // Concept #2: Late data with the default trigger // This uses the same trigger as concept #1, but allows data that is up to ONE_DAY late. This // leads to each window staying open for ONE_DAY after the watermark has passed the end of the // window. Any late data will result in an additional pane being fired for that same window. // The first pane produced will be ON_TIME and the remaining panes will be LATE. // To definitely get the last pane when the window closes, use // .withAllowedLateness(ONE_DAY, ClosingBehavior.FIRE_ALWAYS). // The results for the example above with the default trigger and ONE_DAY allowed lateness // would be: // Key (freeway) | Value (total_flow) | number_of_records | isFirst | isLast | timing // 5 | 260 | 6 | true | false | ON_TIME // 5 | 60 | 1 | false | false | LATE // 5 | 30 | 1 | false | false | LATE // 5 | 20 | 1 | false | false | LATE // 5 | 60 | 1 | false | false | LATE PCollection<TableRow> withAllowedLatenessResults = flowInfo .apply("WithLateData", Window .<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(windowDuration))) // Late data is emitted as it arrives .triggering(Repeatedly.forever(AfterWatermark.pastEndOfWindow())) // Once the output is produced, the pane is dropped and we start preparing the next // pane for the window .discardingFiredPanes() // Late data is handled up to one day .withAllowedLateness(ONE_DAY)) .apply(new TotalFlow("withAllowedLateness")); // Concept #3: How to get speculative estimates // We can specify a trigger that fires independent of the watermark, for instance after // ONE_MINUTE of processing time. This allows us to produce speculative estimates before // all the data is available. Since we don't have any triggers that depend on the watermark // we don't get an ON_TIME firing. Instead, all panes are either EARLY or LATE. // We also use accumulatingFiredPanes to build up the results across each pane firing. // The results for the example above for this trigger would be: // Key (freeway) | Value (total_flow) | number_of_records | isFirst | isLast | timing // 5 | 80 | 2 | true | false | EARLY // 5 | 100 | 3 | false | false | EARLY // 5 | 260 | 6 | false | false | EARLY // 5 | 320 | 7 | false | false | LATE // 5 | 370 | 9 | false | false | LATE // 5 | 430 | 10 | false | false | LATE PCollection<TableRow> speculativeResults = flowInfo .apply("Speculative" , Window .<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(windowDuration))) // Trigger fires every minute. .triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane() // Speculative every ONE_MINUTE .plusDelayOf(ONE_MINUTE))) // After emitting each pane, it will continue accumulating the elements so that each // approximation includes all of the previous data in addition to the newly arrived // data. .accumulatingFiredPanes() .withAllowedLateness(ONE_DAY)) .apply(new TotalFlow("speculative")); // Concept #4: Combining late data and speculative estimates // We can put the previous concepts together to get EARLY estimates, an ON_TIME result, // and LATE updates based on late data. // Each time a triggering condition is satisfied it advances to the next trigger. // If there are new elements this trigger emits a window under following condition: // > Early approximations every minute till the end of the window. // > An on-time firing when the watermark has passed the end of the window // > Every five minutes of late data. // Every pane produced will either be EARLY, ON_TIME or LATE. // The results for the example above for this trigger would be: // Key (freeway) | Value (total_flow) | number_of_records | isFirst | isLast | timing // 5 | 80 | 2 | true | false | EARLY // 5 | 100 | 3 | false | false | EARLY // 5 | 260 | 6 | false | false | EARLY // [First pane fired after the end of the window] // 5 | 320 | 7 | false | false | ON_TIME // 5 | 430 | 10 | false | false | LATE // For more possibilities of how to build advanced triggers, see {@link Trigger}. PCollection<TableRow> sequentialResults = flowInfo .apply("Sequential", Window .<KV<String, Integer>>into(FixedWindows.of(Duration.standardMinutes(windowDuration))) .triggering(AfterEach.inOrder( Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane() // Speculative every ONE_MINUTE .plusDelayOf(ONE_MINUTE)).orFinally(AfterWatermark.pastEndOfWindow()), Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane() // Late data every FIVE_MINUTES .plusDelayOf(FIVE_MINUTES)))) .accumulatingFiredPanes() // For up to ONE_DAY .withAllowedLateness(ONE_DAY)) .apply(new TotalFlow("sequential")); // Adds the results generated by each trigger type to a PCollectionList. PCollectionList<TableRow> resultsList = PCollectionList.of(defaultTriggerResults) .and(withAllowedLatenessResults) .and(speculativeResults) .and(sequentialResults); return resultsList; } } ////////////////////////////////////////////////////////////////////////////////////////////////// // The remaining parts of the pipeline are needed to produce the output for each // concept above. Not directly relevant to understanding the trigger examples. /** * Calculate total flow and number of records for each freeway and format the results to TableRow * objects, to save to BigQuery. */ static class TotalFlow extends PTransform <PCollection<KV<String, Integer>>, PCollection<TableRow>> { private String triggerType; public TotalFlow(String triggerType) { this.triggerType = triggerType; } @Override public PCollection<TableRow> expand(PCollection<KV<String, Integer>> flowInfo) { PCollection<KV<String, Iterable<Integer>>> flowPerFreeway = flowInfo .apply(GroupByKey.<String, Integer>create()); PCollection<KV<String, String>> results = flowPerFreeway.apply(ParDo.of( new DoFn<KV<String, Iterable<Integer>>, KV<String, String>>() { @ProcessElement public void processElement(ProcessContext c) throws Exception { Iterable<Integer> flows = c.element().getValue(); Integer sum = 0; Long numberOfRecords = 0L; for (Integer value : flows) { sum += value; numberOfRecords++; } c.output(KV.of(c.element().getKey(), sum + "," + numberOfRecords)); } })); PCollection<TableRow> output = results.apply(ParDo.of(new FormatTotalFlow(triggerType))); return output; } } /** * Format the results of the Total flow calculation to a TableRow, to save to BigQuery. * Adds the triggerType, pane information, processing time and the window timestamp. * */ static class FormatTotalFlow extends DoFn<KV<String, String>, TableRow> { private String triggerType; public FormatTotalFlow(String triggerType) { this.triggerType = triggerType; } @ProcessElement public void processElement(ProcessContext c, BoundedWindow window) throws Exception { String[] values = c.element().getValue().split(","); TableRow row = new TableRow() .set("trigger_type", triggerType) .set("freeway", c.element().getKey()) .set("total_flow", Integer.parseInt(values[0])) .set("number_of_records", Long.parseLong(values[1])) .set("window", window.toString()) .set("isFirst", c.pane().isFirst()) .set("isLast", c.pane().isLast()) .set("timing", c.pane().getTiming().toString()) .set("event_time", c.timestamp().toString()) .set("processing_time", Instant.now().toString()); c.output(row); } } /** * Extract the freeway and total flow in a reading. * Freeway is used as key since we are calculating the total flow for each freeway. */ static class ExtractFlowInfo extends DoFn<String, KV<String, Integer>> { @ProcessElement public void processElement(ProcessContext c) throws Exception { String[] laneInfo = c.element().split(","); if (laneInfo[0].equals("timestamp")) { // Header row return; } if (laneInfo.length < 48) { //Skip the invalid input. return; } String freeway = laneInfo[2]; Integer totalFlow = tryIntegerParse(laneInfo[7]); // Ignore the records with total flow 0 to easily understand the working of triggers. // Skip the records with total flow -1 since they are invalid input. if (totalFlow == null || totalFlow <= 0) { return; } c.output(KV.of(freeway, totalFlow)); } } /** * Inherits standard configuration options. */ public interface TrafficFlowOptions extends ExampleOptions, ExampleBigQueryTableOptions, StreamingOptions { @Description("Input file to read from") @Default.String("gs://apache-beam-samples/traffic_sensor/" + "Freeways-5Minaa2010-01-01_to_2010-02-15.csv") String getInput(); void setInput(String value); @Description("Numeric value of window duration for fixed windows, in minutes") @Default.Integer(WINDOW_DURATION) Integer getWindowDuration(); void setWindowDuration(Integer value); } public static void main(String[] args) throws Exception { TrafficFlowOptions options = PipelineOptionsFactory.fromArgs(args) .withValidation() .as(TrafficFlowOptions.class); options.setStreaming(true); options.setBigQuerySchema(getSchema()); ExampleUtils exampleUtils = new ExampleUtils(options); exampleUtils.setup(); Pipeline pipeline = Pipeline.create(options); TableReference tableRef = getTableReference(options.getProject(), options.getBigQueryDataset(), options.getBigQueryTable()); PCollectionList<TableRow> resultList = pipeline .apply("ReadMyFile", TextIO.read().from(options.getInput())) .apply("InsertRandomDelays", ParDo.of(new InsertDelays())) .apply(ParDo.of(new ExtractFlowInfo())) .apply(new CalculateTotalFlow(options.getWindowDuration())); for (int i = 0; i < resultList.size(); i++){ resultList.get(i).apply(BigQueryIO.writeTableRows() .to(tableRef) .withSchema(getSchema())); } PipelineResult result = pipeline.run(); // ExampleUtils will try to cancel the pipeline and the injector before the program exits. exampleUtils.waitToFinish(result); } /** * Add current time to each record. * Also insert a delay at random to demo the triggers. */ public static class InsertDelays extends DoFn<String, String> { private static final double THRESHOLD = 0.001; // MIN_DELAY and MAX_DELAY in minutes. private static final int MIN_DELAY = 1; private static final int MAX_DELAY = 100; @ProcessElement public void processElement(ProcessContext c) throws Exception { Instant timestamp = Instant.now(); if (Math.random() < THRESHOLD){ int range = MAX_DELAY - MIN_DELAY; int delayInMinutes = (int) (Math.random() * range) + MIN_DELAY; long delayInMillis = TimeUnit.MINUTES.toMillis(delayInMinutes); timestamp = new Instant(timestamp.getMillis() - delayInMillis); } c.outputWithTimestamp(c.element(), timestamp); } } /** Sets the table reference. */ private static TableReference getTableReference(String project, String dataset, String table){ TableReference tableRef = new TableReference(); tableRef.setProjectId(project); tableRef.setDatasetId(dataset); tableRef.setTableId(table); return tableRef; } /** Defines the BigQuery schema used for the output. */ private static TableSchema getSchema() { List<TableFieldSchema> fields = new ArrayList<>(); fields.add(new TableFieldSchema().setName("trigger_type").setType("STRING")); fields.add(new TableFieldSchema().setName("freeway").setType("STRING")); fields.add(new TableFieldSchema().setName("total_flow").setType("INTEGER")); fields.add(new TableFieldSchema().setName("number_of_records").setType("INTEGER")); fields.add(new TableFieldSchema().setName("window").setType("STRING")); fields.add(new TableFieldSchema().setName("isFirst").setType("BOOLEAN")); fields.add(new TableFieldSchema().setName("isLast").setType("BOOLEAN")); fields.add(new TableFieldSchema().setName("timing").setType("STRING")); fields.add(new TableFieldSchema().setName("event_time").setType("TIMESTAMP")); fields.add(new TableFieldSchema().setName("processing_time").setType("TIMESTAMP")); TableSchema schema = new TableSchema().setFields(fields); return schema; } private static Integer tryIntegerParse(String number) { try { return Integer.parseInt(number); } catch (NumberFormatException e) { return null; } } }