/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.examples.complete.game;
import java.util.HashMap;
import java.util.Map;
import java.util.TimeZone;
import org.apache.beam.examples.complete.game.utils.WriteToText;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Filter;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.WithTimestamps;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.joda.time.DateTimeZone;
import org.joda.time.Duration;
import org.joda.time.Instant;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
/**
* This class is the second in a series of four pipelines that tell a story in a 'gaming'
* domain, following {@link UserScore}. In addition to the concepts introduced in {@link UserScore},
* new concepts include: windowing and element timestamps; use of {@code Filter.by()}.
*
* <p>This pipeline processes data collected from gaming events in batch, building on {@link
* UserScore} but using fixed windows. It calculates the sum of scores per team, for each window,
* optionally allowing specification of two timestamps before and after which data is filtered out.
* This allows a model where late data collected after the intended analysis window can be included,
* and any late-arriving data prior to the beginning of the analysis window can be removed as well.
* By using windowing and adding element timestamps, we can do finer-grained analysis than with the
* {@link UserScore} pipeline. However, our batch processing is high-latency, in that we don't get
* results from plays at the beginning of the batch's time period until the batch is processed.
*
* <p>To execute this pipeline, specify the pipeline configuration like this:
* <pre>{@code
* --tempLocation=YOUR_TEMP_DIRECTORY
* --runner=YOUR_RUNNER
* --output=YOUR_OUTPUT_DIRECTORY
* (possibly options specific to your runner or permissions for your temp/output locations)
* }
* </pre>
*
* <p>Optionally include {@code --input} to specify the batch input file path.
* To indicate a time after which the data should be filtered out, include the
* {@code --stopMin} arg. E.g., {@code --stopMin=2015-10-18-23-59} indicates that any data
* timestamped after 23:59 PST on 2015-10-18 should not be included in the analysis.
* To indicate a time before which data should be filtered out, include the {@code --startMin} arg.
* If you're using the default input specified in {@link UserScore},
* "gs://apache-beam-samples/game/gaming_data*.csv", then
* {@code --startMin=2015-11-16-16-10 --stopMin=2015-11-17-16-10} are good values.
*/
public class HourlyTeamScore extends UserScore {

  /**
   * Time zone used both to parse the {@code --startMin}/{@code --stopMin} options and to print
   * window start times. "America/Los_Angeles" is the region that the deprecated three-letter id
   * "PST" resolves to, so it follows daylight-saving rules rather than being a fixed offset.
   */
  private static final DateTimeZone PACIFIC_ZONE =
      DateTimeZone.forTimeZone(TimeZone.getTimeZone("America/Los_Angeles"));

  /** Formats the start of a window (millisecond precision) for the text output. */
  private static final DateTimeFormatter WINDOW_START_FORMAT =
      DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS").withZone(PACIFIC_ZONE);

  /** Parses the minute-granularity {@code --startMin} and {@code --stopMin} option values. */
  private static final DateTimeFormatter MINUTE_FORMAT =
      DateTimeFormat.forPattern("yyyy-MM-dd-HH-mm").withZone(PACIFIC_ZONE);

  /**
   * Options supported by {@link HourlyTeamScore}.
   *
   * <p>Adds the fixed window size and the two cutoff timestamps to the options inherited from
   * {@link UserScore.Options}.
   */
  interface Options extends UserScore.Options {

    /** Returns the fixed window size in minutes; defaults to 60. */
    @Description("Numeric value of fixed window duration, in minutes")
    @Default.Integer(60)
    Integer getWindowDuration();

    void setWindowDuration(Integer value);

    /**
     * Returns the lower cutoff in {@code yyyy-MM-dd-HH-mm} form; the default is early enough
     * that nothing is filtered out unless the option is set.
     */
    @Description(
        "String representation of the first minute after which to generate results, "
            + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST. "
            + "Any input data timestamped prior to that minute won't be included in the sums.")
    @Default.String("1970-01-01-00-00")
    String getStartMin();

    void setStartMin(String value);

    /**
     * Returns the upper cutoff in {@code yyyy-MM-dd-HH-mm} form; the default is far enough in
     * the future that nothing is filtered out unless the option is set.
     */
    @Description(
        "String representation of the first minute for which to not generate results, "
            + "in the format: yyyy-MM-dd-HH-mm . This time should be in PST. "
            + "Any input data timestamped after that minute won't be included in the sums.")
    @Default.String("2100-01-01-00-00")
    String getStopMin();

    void setStopMin(String value);
  }

  /**
   * Create a map of information that describes how to write pipeline output to text. This map
   * is passed to the {@link WriteToText} constructor to write team score sums and
   * includes information about window start time.
   *
   * @return a new mutable map from output column name ({@code team}, {@code total_score},
   *     {@code window_start}) to the function that produces that column's value
   */
  protected static Map<String, WriteToText.FieldFn<KV<String, Integer>>> configureOutput() {
    Map<String, WriteToText.FieldFn<KV<String, Integer>>> config = new HashMap<>();
    config.put("team", (c, w) -> c.element().getKey());
    config.put("total_score", (c, w) -> c.element().getValue());
    config.put(
        "window_start",
        (c, w) -> {
          // The cast fails with ClassCastException for any window that is not an IntervalWindow;
          // main() only ever applies FixedWindows before writing.
          IntervalWindow window = (IntervalWindow) w;
          return WINDOW_START_FORMAT.print(window.start());
        });
    return config;
  }

  /**
   * Run a batch pipeline to do windowed analysis of the data.
   *
   * <p>Reads the input text, parses it into game events, drops events outside the
   * {@code startMin}/{@code stopMin} range, assigns event timestamps, groups into fixed windows,
   * sums the scores per team and writes the sums as text. Blocks until the pipeline finishes.
   *
   * @param args command-line flags that populate {@link Options}
   * @throws Exception if option parsing or pipeline execution fails
   */
  // [START DocInclude_HTSMain]
  public static void main(String[] args) throws Exception {
    // Begin constructing a pipeline configured by commandline flags.
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline pipeline = Pipeline.create(options);

    // Resolve both cutoffs up front so the filter lambdas only capture final Instants.
    final Instant stopMinTimestamp = new Instant(MINUTE_FORMAT.parseMillis(options.getStopMin()));
    final Instant startMinTimestamp =
        new Instant(MINUTE_FORMAT.parseMillis(options.getStartMin()));

    // Read 'gaming' events from a text file.
    pipeline
        .apply(TextIO.read().from(options.getInput()))
        // Parse the incoming data.
        .apply("ParseGameEvent", ParDo.of(new ParseEventFn()))

        // Filter out data before and after the given times so that it is not included
        // in the calculations. As we collect data in batches (say, by day), the batch for the day
        // that we want to analyze could potentially include some late-arriving data from the
        // previous day. If so, we want to weed it out. Similarly, if we include data from the
        // following day (to scoop up late-arriving events from the day we're analyzing), we need
        // to weed out events that fall after the time period we want to analyze.
        // NOTE(review): both comparisons are strict, so an event stamped exactly at startMin is
        // dropped as well; confirm that this is the intended boundary.
        // [START DocInclude_HTSFilters]
        .apply(
            "FilterStartTime",
            Filter.by(
                (GameActionInfo gInfo) -> gInfo.getTimestamp() > startMinTimestamp.getMillis()))
        .apply(
            "FilterEndTime",
            Filter.by(
                (GameActionInfo gInfo) -> gInfo.getTimestamp() < stopMinTimestamp.getMillis()))
        // [END DocInclude_HTSFilters]

        // [START DocInclude_HTSAddTsAndWindow]
        // Add an element timestamp based on the event log, and apply fixed windowing.
        .apply(
            "AddEventTimestamps",
            WithTimestamps.of((GameActionInfo i) -> new Instant(i.getTimestamp())))
        .apply(
            "FixedWindowsTeam",
            Window.<GameActionInfo>into(
                FixedWindows.of(Duration.standardMinutes(options.getWindowDuration()))))
        // [END DocInclude_HTSAddTsAndWindow]

        // Extract and sum teamname/score pairs from the event data.
        .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
        // NOTE(review): the trailing 'true' presumably selects windowed output in WriteToText;
        // confirm against that class.
        .apply(
            "WriteTeamScoreSums",
            new WriteToText<KV<String, Integer>>(options.getOutput(), configureOutput(), true));

    pipeline.run().waitUntilFinish();
  }
  // [END DocInclude_HTSMain]
}