/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.examples.complete.game;
import java.util.HashMap;
import java.util.Map;
import org.apache.avro.reflect.Nullable;
import org.apache.beam.examples.complete.game.utils.WriteToText;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.metrics.Counter;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.Validation;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This class is the first in a series of four pipelines that tell a story in a 'gaming' domain.
* Concepts: batch processing, reading input from text files, writing output to
* text files, using standalone DoFns, use of the sum per key transform, and use of
* Java 8 lambda syntax.
*
* <p>In this gaming scenario, many users play, as members of different teams, over the course of a
* day, and their actions are logged for processing. Some of the logged game events may be late-
* arriving, if users play on mobile devices and go transiently offline for a period.
*
* <p>This pipeline does batch processing of data collected from gaming events. It calculates the
* sum of scores per user, over an entire batch of gaming data (collected, say, for each day). The
* batch processing will not include any late data that arrives after the day's cutoff point.
*
* <p>To execute this pipeline, specify the pipeline configuration like this:
* <pre>{@code
* --tempLocation=YOUR_TEMP_DIRECTORY
* --runner=YOUR_RUNNER
* --output=YOUR_OUTPUT_DIRECTORY
* (possibly options specific to your runner or permissions for your temp/output locations)
* }
* </pre>
*
* <p>Optionally include the --input argument to specify a batch input file.
* See the --input default value for example batch data file, or use {@code injector.Injector} to
* generate your own batch data.
*/
public class UserScore {
/**
* Class to hold info about a game event.
*/
@DefaultCoder(AvroCoder.class)
static class GameActionInfo {
@Nullable String user;
@Nullable String team;
@Nullable Integer score;
@Nullable Long timestamp;
public GameActionInfo() {}
public GameActionInfo(String user, String team, Integer score, Long timestamp) {
this.user = user;
this.team = team;
this.score = score;
this.timestamp = timestamp;
}
public String getUser() {
return this.user;
}
public String getTeam() {
return this.team;
}
public Integer getScore() {
return this.score;
}
public String getKey(String keyname) {
if (keyname.equals("team")) {
return this.team;
} else { // return username as default
return this.user;
}
}
public Long getTimestamp() {
return this.timestamp;
}
}
/**
* Parses the raw game event info into GameActionInfo objects. Each event line has the following
* format: username,teamname,score,timestamp_in_ms,readable_time
* e.g.:
* user2_AsparagusPig,AsparagusPig,10,1445230923951,2015-11-02 09:09:28.224
* The human-readable time string is not used here.
*/
static class ParseEventFn extends DoFn<String, GameActionInfo> {
// Log and count parse errors.
private static final Logger LOG = LoggerFactory.getLogger(ParseEventFn.class);
private final Counter numParseErrors = Metrics.counter("main", "ParseErrors");
@ProcessElement
public void processElement(ProcessContext c) {
String[] components = c.element().split(",");
try {
String user = components[0].trim();
String team = components[1].trim();
Integer score = Integer.parseInt(components[2].trim());
Long timestamp = Long.parseLong(components[3].trim());
GameActionInfo gInfo = new GameActionInfo(user, team, score, timestamp);
c.output(gInfo);
} catch (ArrayIndexOutOfBoundsException | NumberFormatException e) {
numParseErrors.inc();
LOG.info("Parse error on " + c.element() + ", " + e.getMessage());
}
}
}
/**
* A transform to extract key/score information from GameActionInfo, and sum the scores. The
* constructor arg determines whether 'team' or 'user' info is extracted.
*/
// [START DocInclude_USExtractXform]
public static class ExtractAndSumScore
extends PTransform<PCollection<GameActionInfo>, PCollection<KV<String, Integer>>> {
private final String field;
ExtractAndSumScore(String field) {
this.field = field;
}
@Override
public PCollection<KV<String, Integer>> expand(
PCollection<GameActionInfo> gameInfo) {
return gameInfo
.apply(MapElements
.into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
.via((GameActionInfo gInfo) -> KV.of(gInfo.getKey(field), gInfo.getScore())))
.apply(Sum.<String>integersPerKey());
}
}
// [END DocInclude_USExtractXform]
/**
* Options supported by {@link UserScore}.
*/
public interface Options extends PipelineOptions {
@Description("Path to the data file(s) containing game data.")
// The default maps to two large Google Cloud Storage files (each ~12GB) holding two subsequent
// day's worth (roughly) of data.
@Default.String("gs://apache-beam-samples/game/gaming_data*.csv")
String getInput();
void setInput(String value);
// Set this required option to specify where to write the output.
@Description("Path of the file to write to.")
@Validation.Required
String getOutput();
void setOutput(String value);
}
/**
* Create a map of information that describes how to write pipeline output to text. This map
* is passed to the {@link WriteToText} constructor to write user score sums.
*/
protected static Map<String, WriteToText.FieldFn<KV<String, Integer>>>
configureOutput() {
Map<String, WriteToText.FieldFn<KV<String, Integer>>> config =
new HashMap<String, WriteToText.FieldFn<KV<String, Integer>>>();
config.put("user", (c, w) -> c.element().getKey());
config.put("total_score", (c, w) -> c.element().getValue());
return config;
}
/**
* Run a batch pipeline.
*/
// [START DocInclude_USMain]
public static void main(String[] args) throws Exception {
// Begin constructing a pipeline configured by commandline flags.
Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
Pipeline pipeline = Pipeline.create(options);
// Read events from a text file and parse them.
pipeline
.apply(TextIO.read().from(options.getInput()))
.apply("ParseGameEvent", ParDo.of(new ParseEventFn()))
// Extract and sum username/score pairs from the event data.
.apply("ExtractUserScore", new ExtractAndSumScore("user"))
.apply(
"WriteUserScoreSums",
new WriteToText<KV<String, Integer>>(
options.getOutput(),
configureOutput(),
false));
// Run the batch pipeline.
pipeline.run().waitUntilFinish();
}
// [END DocInclude_USMain]
}