/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.examples.complete;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.avro.reflect.Nullable;
import org.apache.beam.examples.common.ExampleBigQueryTableOptions;
import org.apache.beam.examples.common.ExampleOptions;
import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.windowing.SlidingWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
/**
* A Beam Example that runs in both batch and streaming modes with traffic sensor data.
* You can configure the running mode by setting {@literal --streaming} to true or false.
*
* <p>Concepts: The batch and streaming runners, sliding windows,
* use of the AvroCoder to encode a custom class, and custom Combine transforms.
*
* <p>This example analyzes traffic sensor data using SlidingWindows. For each window,
* it finds the lane that had the highest flow recorded, for each sensor station. It writes
* those max values along with auxiliary info to a BigQuery table.
*
* <p>The pipeline reads traffic sensor data from {@literal --inputFile}.
*
* <p>The example is configured to use the default BigQuery table from the example common package
* (there are no defaults for a general Beam pipeline).
* You can override it by using the {@literal --bigQueryDataset} and {@literal --bigQueryTable}
* options. If the BigQuery table does not exist, the example will try to create it.
*
* <p>The example will try to cancel the pipeline on the signal to terminate the process (CTRL-C)
* and then exit.
*/
public class TrafficMaxLaneFlow {
/** Default sliding window duration, in minutes; used as the default of {@code --windowDuration}. */
static final int WINDOW_DURATION = 60; // Default sliding window duration in minutes
/** Default window 'slide every' period, in minutes; used as the default of {@code --windowSlideEvery}. */
static final int WINDOW_SLIDE_EVERY = 5; // Default window 'slide every' setting in minutes
/**
 * Value object for a single lane of a station reading, carrying the lane's own measurements
 * together with the station-level fields of the reading it came from.
 *
 * <p>Instances are encoded with {@link AvroCoder}. Every field is declared {@code @Nullable}, and
 * the no-argument constructor leaves all of them {@code null}.
 */
@DefaultCoder(AvroCoder.class)
static class LaneInfo {
  @Nullable String stationId;
  @Nullable String lane;
  @Nullable String direction;
  @Nullable String freeway;
  @Nullable String recordedTimestamp;
  @Nullable Integer laneFlow;
  @Nullable Integer totalFlow;
  @Nullable Double laneAO;
  @Nullable Double laneAS;

  /** Creates an instance whose fields are all {@code null}. */
  public LaneInfo() {}

  /**
   * Creates a fully populated lane record.
   *
   * @param stationId identifier of the sensor station
   * @param lane label of the lane within the reading
   * @param direction direction field of the reading
   * @param freeway freeway field of the reading
   * @param timestamp timestamp of the reading, kept as the raw string
   * @param laneFlow flow recorded for this lane
   * @param laneAO average occupancy recorded for this lane
   * @param laneAS average speed recorded for this lane
   * @param totalFlow total flow of the reading
   */
  public LaneInfo(
      String stationId,
      String lane,
      String direction,
      String freeway,
      String timestamp,
      Integer laneFlow,
      Double laneAO,
      Double laneAS,
      Integer totalFlow) {
    // Station-level fields shared by every lane of the same reading.
    this.stationId = stationId;
    this.direction = direction;
    this.freeway = freeway;
    this.recordedTimestamp = timestamp;
    this.totalFlow = totalFlow;
    // Lane-specific fields.
    this.lane = lane;
    this.laneFlow = laneFlow;
    this.laneAO = laneAO;
    this.laneAS = laneAS;
  }

  /** Returns the station identifier. */
  public String getStationId() {
    return stationId;
  }

  /** Returns the lane label. */
  public String getLane() {
    return lane;
  }

  /** Returns the direction field of the reading. */
  public String getDirection() {
    return direction;
  }

  /** Returns the freeway field of the reading. */
  public String getFreeway() {
    return freeway;
  }

  /** Returns the reading's timestamp as the raw string it was recorded with. */
  public String getRecordedTimestamp() {
    return recordedTimestamp;
  }

  /** Returns the flow recorded for this lane. */
  public Integer getLaneFlow() {
    return laneFlow;
  }

  /** Returns the average occupancy recorded for this lane. */
  public Double getLaneAO() {
    return laneAO;
  }

  /** Returns the average speed recorded for this lane. */
  public Double getLaneAS() {
    return laneAS;
  }

  /** Returns the total flow of the reading. */
  public Integer getTotalFlow() {
    return totalFlow;
  }
}
/**
 * Re-emits each input line with its element timestamp set from the line's first comma-separated
 * field, parsed as {@code MM/dd/yyyy HH:mm:ss}.
 *
 * <p>Lines that split into no fields, or whose first field is rejected with an
 * {@link IllegalArgumentException}, are dropped.
 */
static class ExtractTimestamps extends DoFn<String, String> {
  private static final DateTimeFormatter dateTimeFormat =
      DateTimeFormat.forPattern("MM/dd/yyyy HH:mm:ss");

  @ProcessElement
  public void processElement(DoFn<String, String>.ProcessContext c) throws Exception {
    String line = c.element();
    String[] fields = line.split(",");
    if (fields.length == 0) {
      // Nothing to parse (for example a line made only of separators).
      return;
    }
    try {
      long eventMillis = dateTimeFormat.parseMillis(fields[0]);
      c.outputWithTimestamp(line, new Instant(eventMillis));
    } catch (IllegalArgumentException ignored) {
      // Skip the invalid input.
    }
  }
}
/**
 * Extracts flow information for each of the 8 lanes in a reading and outputs one
 * {@code KV<stationId, LaneInfo>} per lane. This lets a downstream combine determine which lane
 * had the max flow for a station over the span of a window while keeping the associated
 * information.
 *
 * <p>Lanes are read in order; extraction stops at the first lane whose flow, average occupancy or
 * average speed cannot be parsed, so a reading may yield fewer than 8 elements. Lines with too few
 * fields to hold all lane columns are skipped entirely.
 */
static class ExtractFlowInfoFn extends DoFn<String, KV<String, LaneInfo>> {
  /** Number of lanes read from each line. */
  private static final int LANE_COUNT = 8;

  /** Column stride between the same measurement of two consecutive lanes. */
  private static final int LANE_STRIDE = 5;

  /** Highest column index read: the average speed of the last lane (8 + 5 * 8 = 48). */
  private static final int MAX_FIELD_INDEX = 8 + LANE_STRIDE * LANE_COUNT;

  @ProcessElement
  public void processElement(ProcessContext c) {
    String[] items = c.element().split(",");
    // The loop below reads up to items[MAX_FIELD_INDEX], so at least MAX_FIELD_INDEX + 1 fields
    // are required. (The previous "< 48" guard let 48-field lines through, which then failed
    // with ArrayIndexOutOfBoundsException on the last lane.)
    if (items.length <= MAX_FIELD_INDEX) {
      // Skip the invalid input.
      return;
    }

    // Station-level fields of the reading.
    String timestamp = items[0];
    String stationId = items[1];
    String freeway = items[2];
    String direction = items[3];
    Integer totalFlow = tryIntParse(items[7]);

    for (int i = 1; i <= LANE_COUNT; ++i) {
      int laneOffset = LANE_STRIDE * i;
      Integer laneFlow = tryIntParse(items[6 + laneOffset]);
      Double laneAvgOccupancy = tryDoubleParse(items[7 + laneOffset]);
      Double laneAvgSpeed = tryDoubleParse(items[8 + laneOffset]);
      if (laneFlow == null || laneAvgOccupancy == null || laneAvgSpeed == null) {
        // No usable data for this lane: stop, the remaining lanes are not examined.
        return;
      }
      LaneInfo laneInfo = new LaneInfo(stationId, "lane" + i, direction, freeway, timestamp,
          laneFlow, laneAvgOccupancy, laneAvgSpeed, totalFlow);
      c.output(KV.of(stationId, laneInfo));
    }
  }
}
/**
 * Combine function for {@code Combine.perKey} that selects the {@link LaneInfo} with the largest
 * lane flow among its inputs. A custom combiner is used instead of a plain max so that the whole
 * record travelling with the winning flow value is retained.
 *
 * <p>Entries with a {@code null} or negative flow are never selected; on a tie the later entry
 * wins. When no entry qualifies, an empty {@link LaneInfo} is returned.
 */
public static class MaxFlow implements SerializableFunction<Iterable<LaneInfo>, LaneInfo> {
  @Override
  public LaneInfo apply(Iterable<LaneInfo> input) {
    LaneInfo best = new LaneInfo();
    int bestFlow = 0;
    for (LaneInfo candidate : input) {
      Integer candidateFlow = candidate.getLaneFlow();
      if (candidateFlow == null || candidateFlow < bestFlow) {
        continue;
      }
      bestFlow = candidateFlow;
      best = candidate;
    }
    return best;
  }
}
/**
 * Converts each {@code <stationId, max-flow LaneInfo>} pair into a BigQuery {@link TableRow},
 * adding the element's timestamp as the {@code window_timestamp} column.
 */
static class FormatMaxesFn extends DoFn<KV<String, LaneInfo>, TableRow> {
  @ProcessElement
  public void processElement(ProcessContext c) {
    KV<String, LaneInfo> stationMax = c.element();
    LaneInfo info = stationMax.getValue();

    TableRow row = new TableRow();
    row.set("station_id", stationMax.getKey());
    row.set("direction", info.getDirection());
    row.set("freeway", info.getFreeway());
    row.set("lane_max_flow", info.getLaneFlow());
    row.set("lane", info.getLane());
    row.set("avg_occ", info.getLaneAO());
    row.set("avg_speed", info.getLaneAS());
    row.set("total_flow", info.getTotalFlow());
    row.set("recorded_timestamp", info.getRecordedTimestamp());
    row.set("window_timestamp", c.timestamp().toString());
    c.output(row);
  }

  /** Defines the BigQuery schema used for the output. */
  static TableSchema getSchema() {
    List<TableFieldSchema> columns = new ArrayList<>();
    columns.add(column("station_id", "STRING"));
    columns.add(column("direction", "STRING"));
    columns.add(column("freeway", "STRING"));
    columns.add(column("lane_max_flow", "INTEGER"));
    columns.add(column("lane", "STRING"));
    columns.add(column("avg_occ", "FLOAT"));
    columns.add(column("avg_speed", "FLOAT"));
    columns.add(column("total_flow", "INTEGER"));
    columns.add(column("window_timestamp", "TIMESTAMP"));
    columns.add(column("recorded_timestamp", "STRING"));
    return new TableSchema().setFields(columns);
  }

  /** Builds one schema column with the given name and BigQuery type. */
  private static TableFieldSchema column(String name, String type) {
    return new TableFieldSchema().setName(name).setType(type);
  }
}
/**
 * Composite transform that reduces the per-lane readings of each station (within the current
 * window) to the one with the max lane flow, using the {@link MaxFlow} combiner, and then formats
 * each result as a BigQuery row with {@link FormatMaxesFn}.
 */
static class MaxLaneFlow
    extends PTransform<PCollection<KV<String, LaneInfo>>, PCollection<TableRow>> {
  @Override
  public PCollection<TableRow> expand(PCollection<KV<String, LaneInfo>> flowInfo) {
    return flowInfo
        // <stationId, LaneInfo>... => <stationId, max lane flow info>
        .apply(Combine.<String, LaneInfo>perKey(new MaxFlow()))
        // <stationId, max lane flow info>... => row...
        .apply(ParDo.of(new FormatMaxesFn()));
  }
}
/**
 * Root transform that reads text lines from the configured file and passes them through
 * {@link ExtractTimestamps}.
 */
static class ReadFileAndExtractTimestamps extends PTransform<PBegin, PCollection<String>> {
  private final String inputFile;

  /**
   * @param inputFile path or pattern handed to {@code TextIO.read().from(...)}
   */
  public ReadFileAndExtractTimestamps(String inputFile) {
    this.inputFile = inputFile;
  }

  @Override
  public PCollection<String> expand(PBegin begin) {
    PCollection<String> rawLines = begin.apply(TextIO.read().from(inputFile));
    return rawLines.apply(ParDo.of(new ExtractTimestamps()));
  }
}
/**
* Options supported by {@link TrafficMaxLaneFlow}.
*
* <p>Adds the input file and the two sliding-window settings to the options inherited from
* {@link ExampleOptions} and {@link ExampleBigQueryTableOptions}.
*/
public interface TrafficMaxLaneFlowOptions extends ExampleOptions, ExampleBigQueryTableOptions {
/** Path of the traffic sensor file to read; the default points at a sample CSV under {@code gs://}. */
@Description("Path of the file to read from")
@Default.String("gs://apache-beam-samples/traffic_sensor/"
+ "Freeways-5Minaa2010-01-01_to_2010-02-15_test2.csv")
String getInputFile();
void setInputFile(String value);
/** Sliding window duration in minutes; defaults to {@code WINDOW_DURATION}. */
@Description("Numeric value of sliding window duration, in minutes")
@Default.Integer(WINDOW_DURATION)
Integer getWindowDuration();
void setWindowDuration(Integer value);
/** Period, in minutes, at which a new sliding window starts; defaults to {@code WINDOW_SLIDE_EVERY}. */
@Description("Numeric value of window 'slide every' setting, in minutes")
@Default.Integer(WINDOW_SLIDE_EVERY)
Integer getWindowSlideEvery();
void setWindowSlideEvery(Integer value);
}
/**
 * Builds the traffic max-lane-flow pipeline from the command-line options and runs it.
 *
 * <p>The pipeline reads the input file, assigns element timestamps, extracts per-lane readings,
 * places them into sliding windows, keeps the max-flow lane per station and writes the result to
 * the configured BigQuery table.
 *
 * @param args command-line arguments parsed into {@link TrafficMaxLaneFlowOptions}
 * @throws IOException if there is a problem setting up resources
 */
public static void main(String[] args) throws IOException {
  TrafficMaxLaneFlowOptions options =
      PipelineOptionsFactory.fromArgs(args).withValidation().as(TrafficMaxLaneFlowOptions.class);
  options.setBigQuerySchema(FormatMaxesFn.getSchema());

  // Using ExampleUtils to set up required resources.
  ExampleUtils exampleUtils = new ExampleUtils(options);
  exampleUtils.setup();

  Pipeline pipeline = Pipeline.create(options);

  // Destination table, taken from the project / dataset / table options.
  TableReference tableRef = new TableReference()
      .setProjectId(options.getProject())
      .setDatasetId(options.getBigQueryDataset())
      .setTableId(options.getBigQueryTable());

  // line... => timestamped line...
  PCollection<String> timestampedLines =
      pipeline.apply("ReadLines", new ReadFileAndExtractTimestamps(options.getInputFile()));

  // timestamped line... => <stationId, LaneInfo> (one per lane) ...
  PCollection<KV<String, LaneInfo>> laneReadings =
      timestampedLines.apply(ParDo.of(new ExtractFlowInfoFn()));

  // Map the incoming data into sliding windows.
  PCollection<KV<String, LaneInfo>> windowedReadings =
      laneReadings.apply(
          Window.<KV<String, LaneInfo>>into(
              SlidingWindows.of(Duration.standardMinutes(options.getWindowDuration()))
                  .every(Duration.standardMinutes(options.getWindowSlideEvery()))));

  // <stationId, LaneInfo>... => row... => BigQuery
  windowedReadings
      .apply(new MaxLaneFlow())
      .apply(BigQueryIO.writeTableRows().to(tableRef).withSchema(FormatMaxesFn.getSchema()));

  // Run the pipeline.
  PipelineResult result = pipeline.run();

  // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
  exampleUtils.waitToFinish(result);
}
/**
 * Parses a decimal integer, returning {@code null} instead of throwing when the text is not a
 * valid integer.
 *
 * @param number text to parse
 * @return the parsed value, or {@code null} if {@code number} cannot be parsed as an int
 */
private static Integer tryIntParse(String number) {
  Integer parsed;
  try {
    parsed = Integer.valueOf(number);
  } catch (NumberFormatException ignored) {
    // Not a valid integer: report "no value" to the caller.
    parsed = null;
  }
  return parsed;
}
/**
 * Parses a floating-point number, returning {@code null} instead of throwing when the text is
 * missing or not a valid number.
 *
 * @param number text to parse; may be {@code null}
 * @return the parsed value, or {@code null} if {@code number} is {@code null} or cannot be parsed
 *     as a double
 */
private static Double tryDoubleParse(String number) {
  // Double.parseDouble(null) throws NullPointerException rather than NumberFormatException, so
  // handle null explicitly to keep the "null on failure" contract shared with tryIntParse.
  if (number == null) {
    return null;
  }
  try {
    return Double.parseDouble(number);
  } catch (NumberFormatException e) {
    return null;
  }
}
}