/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples.complete;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.List;
import java.util.Map;
import org.apache.avro.reflect.Nullable;
import org.apache.beam.examples.common.ExampleBigQueryTableOptions;
import org.apache.beam.examples.common.ExampleOptions;
import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.windowing.SlidingWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

/**
 * A Beam Example that runs in both batch and streaming modes with traffic sensor data.
 * You can configure the running mode by setting {@literal --streaming} to true or false.
 *
 * <p>Concepts: The batch and streaming runners, GroupByKey, sliding windows.
 *
 * <p>This example analyzes traffic sensor data using SlidingWindows. For each window,
 * it calculates the average speed over the window for some small set of predefined 'routes',
 * and looks for 'slowdowns' in those routes. It writes its results to a BigQuery table.
 *
 * <p>The pipeline reads traffic sensor data from {@literal --inputFile}.
 *
 * <p>The example is configured to use the default BigQuery table from the example common package
 * (there are no defaults for a general Beam pipeline). You can override the defaults by using the
 * {@literal --bigQueryDataset} and {@literal --bigQueryTable} options. If the BigQuery table does
 * not exist, the example will try to create it.
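 *
 * <p>A minimal sketch of one way to launch the example from Java. The project, dataset, and
 * table values here are placeholders for illustration, not defaults shipped with the example:
 * <pre>{@code
 * TrafficRoutes.main(new String[] {
 *     "--project=YOUR_PROJECT",
 *     "--bigQueryDataset=YOUR_DATASET",
 *     "--bigQueryTable=YOUR_TABLE"
 * });
 * }</pre>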
 *
 * <p>The example will try to cancel the pipelines on the signal to terminate the process (CTRL-C)
 * and then exit.
 */
public class TrafficRoutes {

  // Instantiate some small predefined San Diego routes to analyze
  static Map<String, String> sdStations = buildStationInfo();
  static final int WINDOW_DURATION = 3;  // Default sliding window duration in minutes
  static final int WINDOW_SLIDE_EVERY = 1;  // Default window 'slide every' setting in minutes

  /**
   * This class holds information about a station reading's average speed.
   */
  @DefaultCoder(AvroCoder.class)
  static class StationSpeed implements Comparable<StationSpeed> {
    @Nullable String stationId;
    @Nullable Double avgSpeed;
    @Nullable Long timestamp;

    public StationSpeed() {}

    public StationSpeed(String stationId, Double avgSpeed, Long timestamp) {
      this.stationId = stationId;
      this.avgSpeed = avgSpeed;
      this.timestamp = timestamp;
    }

    public String getStationId() {
      return this.stationId;
    }

    public Double getAvgSpeed() {
      return this.avgSpeed;
    }

    @Override
    public int compareTo(StationSpeed other) {
      return Long.compare(this.timestamp, other.timestamp);
    }
  }

  /**
   * This class holds information about a route's speed/slowdown.
   */
  @DefaultCoder(AvroCoder.class)
  static class RouteInfo {
    @Nullable String route;
    @Nullable Double avgSpeed;
    @Nullable Boolean slowdownEvent;

    public RouteInfo() {}

    public RouteInfo(String route, Double avgSpeed, Boolean slowdownEvent) {
      this.route = route;
      this.avgSpeed = avgSpeed;
      this.slowdownEvent = slowdownEvent;
    }

    public String getRoute() {
      return this.route;
    }

    public Double getAvgSpeed() {
      return this.avgSpeed;
    }

    public Boolean getSlowdownEvent() {
      return this.slowdownEvent;
    }
  }

  /**
   * Extract the timestamp field from the input string, and use it as the element timestamp.
   */
  static class ExtractTimestamps extends DoFn<String, String> {
    private static final DateTimeFormatter dateTimeFormat =
        DateTimeFormat.forPattern("MM/dd/yyyy HH:mm:ss");

    @ProcessElement
    public void processElement(ProcessContext c) throws Exception {
      String[] items = c.element().split(",");
      String timestamp = tryParseTimestamp(items);
      if (timestamp != null) {
        try {
          c.outputWithTimestamp(c.element(), new Instant(dateTimeFormat.parseMillis(timestamp)));
        } catch (IllegalArgumentException e) {
          // Skip the invalid input.
        }
      }
    }
  }

  /**
   * Filter out readings for the stations along predefined 'routes', and output
   * (station, speed info) keyed on route.
   */
  static class ExtractStationSpeedFn extends DoFn<String, KV<String, StationSpeed>> {

    @ProcessElement
    public void processElement(ProcessContext c) {
      String[] items = c.element().split(",");
      String stationType = tryParseStationType(items);
      // For this analysis, use only 'main line' station types
      if (stationType != null && stationType.equals("ML")) {
        Double avgSpeed = tryParseAvgSpeed(items);
        String stationId = tryParseStationId(items);
        // For this simple example, filter out everything but some hardwired routes.
        if (avgSpeed != null && stationId != null && sdStations.containsKey(stationId)) {
          StationSpeed stationSpeed =
              new StationSpeed(stationId, avgSpeed, c.timestamp().getMillis());
          // The tuple key is the 'route' name stored in the 'sdStations' hash.
          KV<String, StationSpeed> outputValue = KV.of(sdStations.get(stationId), stationSpeed);
          c.output(outputValue);
        }
      }
    }
  }

  /**
   * For a given route, track average speed for the window. Calculate whether
   * traffic is currently slowing down, via a predefined threshold.
   * If a supermajority of speeds in this sliding window are less than the previous reading,
   * we call this a 'slowdown'. Note: these calculations are for example purposes only, and are
   * unrealistic and oversimplified.
   */
  static class GatherStats
      extends DoFn<KV<String, Iterable<StationSpeed>>, KV<String, RouteInfo>> {
    @ProcessElement
    public void processElement(ProcessContext c) throws IOException {
      String route = c.element().getKey();
      double speedSum = 0.0;
      int speedCount = 0;
      int speedups = 0;
      int slowdowns = 0;
      List<StationSpeed> infoList = Lists.newArrayList(c.element().getValue());
      // StationSpeeds sort by embedded timestamp.
      Collections.sort(infoList);
      Map<String, Double> prevSpeeds = new HashMap<>();
      // For all stations in the route, sum (non-null) speeds. Keep a count of the non-null speeds.
      for (StationSpeed item : infoList) {
        Double speed = item.getAvgSpeed();
        if (speed != null) {
          speedSum += speed;
          speedCount++;
          Double lastSpeed = prevSpeeds.get(item.getStationId());
          if (lastSpeed != null) {
            if (lastSpeed < speed) {
              speedups += 1;
            } else {
              slowdowns += 1;
            }
          }
          prevSpeeds.put(item.getStationId(), speed);
        }
      }
      if (speedCount == 0) {
        // No average to compute.
        return;
      }
      double speedAvg = speedSum / speedCount;
      // The 'supermajority' threshold: flag a slowdown when slowdown transitions outnumber
      // speedups at least two-to-one, i.e. at least two-thirds of the observed transitions.
      boolean slowdownEvent = slowdowns >= 2 * speedups;
      RouteInfo routeInfo = new RouteInfo(route, speedAvg, slowdownEvent);
      c.output(KV.of(route, routeInfo));
    }
  }

  /**
   * Format the results of the slowdown calculations to a TableRow, to save to BigQuery.
   */
  static class FormatStatsFn extends DoFn<KV<String, RouteInfo>, TableRow> {
    @ProcessElement
    public void processElement(ProcessContext c) {
      RouteInfo routeInfo = c.element().getValue();
      TableRow row = new TableRow()
          .set("avg_speed", routeInfo.getAvgSpeed())
          .set("slowdown_event", routeInfo.getSlowdownEvent())
          .set("route", c.element().getKey())
          .set("window_timestamp", c.timestamp().toString());
      c.output(row);
    }

    /**
     * Defines the BigQuery schema used for the output.
     */
    static TableSchema getSchema() {
      List<TableFieldSchema> fields = new ArrayList<>();
      fields.add(new TableFieldSchema().setName("route").setType("STRING"));
      fields.add(new TableFieldSchema().setName("avg_speed").setType("FLOAT"));
      fields.add(new TableFieldSchema().setName("slowdown_event").setType("BOOLEAN"));
      fields.add(new TableFieldSchema().setName("window_timestamp").setType("TIMESTAMP"));
      TableSchema schema = new TableSchema().setFields(fields);
      return schema;
    }
  }

  /**
   * This PTransform extracts speed info from traffic station readings.
   * It groups the readings by 'route' and analyzes traffic slowdown for that route.
   * Lastly, it formats the results for BigQuery.
   */
  static class TrackSpeed
      extends PTransform<PCollection<KV<String, StationSpeed>>, PCollection<TableRow>> {
    @Override
    public PCollection<TableRow> expand(PCollection<KV<String, StationSpeed>> stationSpeed) {
      // Apply a GroupByKey transform to collect a list of all station
      // readings for a given route.
      PCollection<KV<String, Iterable<StationSpeed>>> timeGroup =
          stationSpeed.apply(GroupByKey.<String, StationSpeed>create());

      // Analyze 'slowdown' over the route readings.
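      // Note: the input was windowed into overlapping SlidingWindows upstream, so this
      // grouping (and the stats computed below) happens independently for each window;
      // a single reading contributes to every window that contains its timestamp.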
      PCollection<KV<String, RouteInfo>> stats =
          timeGroup.apply(ParDo.of(new GatherStats()));

      // Format the results for writing to BigQuery
      PCollection<TableRow> results = stats.apply(ParDo.of(new FormatStatsFn()));

      return results;
    }
  }

  static class ReadFileAndExtractTimestamps extends PTransform<PBegin, PCollection<String>> {
    private final String inputFile;

    public ReadFileAndExtractTimestamps(String inputFile) {
      this.inputFile = inputFile;
    }

    @Override
    public PCollection<String> expand(PBegin begin) {
      return begin
          .apply(TextIO.read().from(inputFile))
          .apply(ParDo.of(new ExtractTimestamps()));
    }
  }

  /**
   * Options supported by {@link TrafficRoutes}.
   *
   * <p>Inherits standard configuration options.
   */
  public interface TrafficRoutesOptions extends ExampleOptions, ExampleBigQueryTableOptions {
    @Description("Path of the file to read from")
    @Default.String("gs://apache-beam-samples/traffic_sensor/"
        + "Freeways-5Minaa2010-01-01_to_2010-02-15_test2.csv")
    String getInputFile();
    void setInputFile(String value);

    @Description("Numeric value of sliding window duration, in minutes")
    @Default.Integer(WINDOW_DURATION)
    Integer getWindowDuration();
    void setWindowDuration(Integer value);

    @Description("Numeric value of window 'slide every' setting, in minutes")
    @Default.Integer(WINDOW_SLIDE_EVERY)
    Integer getWindowSlideEvery();
    void setWindowSlideEvery(Integer value);
  }

  /**
   * Sets up and starts the streaming pipeline.
   *
   * @throws IOException if there is a problem setting up resources
   */
  public static void main(String[] args) throws IOException {
    TrafficRoutesOptions options =
        PipelineOptionsFactory.fromArgs(args)
            .withValidation()
            .as(TrafficRoutesOptions.class);
    options.setBigQuerySchema(FormatStatsFn.getSchema());

    // Using ExampleUtils to set up required resources.
    ExampleUtils exampleUtils = new ExampleUtils(options);
    exampleUtils.setup();

    Pipeline pipeline = Pipeline.create(options);
    TableReference tableRef = new TableReference();
    tableRef.setProjectId(options.getProject());
    tableRef.setDatasetId(options.getBigQueryDataset());
    tableRef.setTableId(options.getBigQueryTable());

    pipeline
        .apply("ReadLines", new ReadFileAndExtractTimestamps(options.getInputFile()))
        // row... => <station route, station speed> ...
        .apply(ParDo.of(new ExtractStationSpeedFn()))
        // map the incoming data stream into sliding windows.
        .apply(Window.<KV<String, StationSpeed>>into(
            SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                .every(Duration.standardMinutes(options.getWindowSlideEvery()))))
        .apply(new TrackSpeed())
        .apply(BigQueryIO.writeTableRows().to(tableRef)
            .withSchema(FormatStatsFn.getSchema()));

    // Run the pipeline.
    PipelineResult result = pipeline.run();

    // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
    exampleUtils.waitToFinish(result);
  }

  private static Double tryParseAvgSpeed(String[] inputItems) {
    try {
      return Double.parseDouble(tryParseString(inputItems, 9));
    } catch (NumberFormatException e) {
      return null;
    } catch (NullPointerException e) {
      return null;
    }
  }

  private static String tryParseStationType(String[] inputItems) {
    return tryParseString(inputItems, 4);
  }

  private static String tryParseStationId(String[] inputItems) {
    return tryParseString(inputItems, 1);
  }

  private static String tryParseTimestamp(String[] inputItems) {
    return tryParseString(inputItems, 0);
  }

  private static String tryParseString(String[] inputItems, int index) {
    // Use '>' rather than '>=' here: inputItems[index] is only in bounds when length > index.
    return inputItems.length > index
        ? inputItems[index]
        : null;
  }

  /**
   * Define some small hard-wired San Diego 'routes' to track based on sensor station ID.
   */
  private static Map<String, String> buildStationInfo() {
    Map<String, String> stations = new Hashtable<String, String>();
    stations.put("1108413", "SDRoute1"); // from freeway 805 S
    stations.put("1108699", "SDRoute2"); // from freeway 78 E
    stations.put("1108702", "SDRoute2");
    return stations;
  }
}