/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.examples.complete.game; import java.util.HashMap; import java.util.Map; import java.util.TimeZone; import org.apache.beam.examples.common.ExampleUtils; import org.apache.beam.examples.complete.game.utils.WriteWindowedToBigQuery; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.PipelineResult; import org.apache.beam.sdk.extensions.gcp.options.GcpOptions; import org.apache.beam.sdk.io.gcp.pubsub.PubsubIO; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.Description; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.transforms.Combine; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.MapElements; import org.apache.beam.sdk.transforms.Mean; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.Sum; import org.apache.beam.sdk.transforms.Values; import org.apache.beam.sdk.transforms.View; import org.apache.beam.sdk.transforms.windowing.BoundedWindow; import org.apache.beam.sdk.transforms.windowing.FixedWindows; import org.apache.beam.sdk.transforms.windowing.IntervalWindow; import org.apache.beam.sdk.transforms.windowing.Sessions; import org.apache.beam.sdk.transforms.windowing.TimestampCombiner; import org.apache.beam.sdk.transforms.windowing.Window; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; import org.apache.beam.sdk.values.PCollectionView; import org.apache.beam.sdk.values.TypeDescriptors; import org.joda.time.DateTimeZone; import org.joda.time.Duration; import org.joda.time.Instant; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * This class is the fourth in a series of four pipelines that tell a story in a 'gaming' * domain, following {@link UserScore}, {@link HourlyTeamScore}, and {@link LeaderBoard}. * New concepts: session windows and finding session duration; use of both * singleton and non-singleton side inputs. * * <p>This pipeline builds on the {@link LeaderBoard} functionality, and adds some "business * intelligence" analysis: abuse detection and usage patterns. The pipeline derives the Mean user * score sum for a window, and uses that information to identify likely spammers/robots. (The robots * have a higher click rate than the human users). The 'robot' users are then filtered out when * calculating the team scores. * * <p>Additionally, user sessions are tracked: that is, we find bursts of user activity using * session windows. Then, the mean session duration information is recorded in the context of * subsequent fixed windowing. (This could be used to tell us what games are giving us greater * user retention). * * <p>Run {@code org.apache.beam.examples.complete.game.injector.Injector} to generate * pubsub data for this pipeline. The {@code Injector} documentation provides more detail. * * <p>To execute this pipeline, specify the pipeline configuration like this: * <pre>{@code * --project=YOUR_PROJECT_ID * --tempLocation=gs://YOUR_TEMP_DIRECTORY * --runner=YOUR_RUNNER * --dataset=YOUR-DATASET * --topic=projects/YOUR-PROJECT/topics/YOUR-TOPIC * } * </pre> * * <p>The BigQuery dataset you specify must already exist. The PubSub topic you specify should * be the same topic to which the Injector is publishing. */ public class GameStats extends LeaderBoard { private static final String TIMESTAMP_ATTRIBUTE = "timestamp_ms"; private static DateTimeFormatter fmt = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss.SSS") .withZone(DateTimeZone.forTimeZone(TimeZone.getTimeZone("PST"))); /** * Filter out all but those users with a high clickrate, which we will consider as 'spammy' uesrs. * We do this by finding the mean total score per user, then using that information as a side * input to filter out all but those user scores that are larger than * {@code (mean * SCORE_WEIGHT)}. */ // [START DocInclude_AbuseDetect] public static class CalculateSpammyUsers extends PTransform<PCollection<KV<String, Integer>>, PCollection<KV<String, Integer>>> { private static final Logger LOG = LoggerFactory.getLogger(CalculateSpammyUsers.class); private static final double SCORE_WEIGHT = 2.5; @Override public PCollection<KV<String, Integer>> expand(PCollection<KV<String, Integer>> userScores) { // Get the sum of scores for each user. PCollection<KV<String, Integer>> sumScores = userScores .apply("UserSum", Sum.<String>integersPerKey()); // Extract the score from each element, and use it to find the global mean. final PCollectionView<Double> globalMeanScore = sumScores.apply(Values.<Integer>create()) .apply(Mean.<Integer>globally().asSingletonView()); // Filter the user sums using the global mean. PCollection<KV<String, Integer>> filtered = sumScores .apply("ProcessAndFilter", ParDo // use the derived mean total score as a side input .of(new DoFn<KV<String, Integer>, KV<String, Integer>>() { private final Counter numSpammerUsers = Metrics.counter("main", "SpammerUsers"); @ProcessElement public void processElement(ProcessContext c) { Integer score = c.element().getValue(); Double gmc = c.sideInput(globalMeanScore); if (score > (gmc * SCORE_WEIGHT)) { LOG.info("user " + c.element().getKey() + " spammer score " + score + " with mean " + gmc); numSpammerUsers.inc(); c.output(c.element()); } } }).withSideInputs(globalMeanScore)); return filtered; } } // [END DocInclude_AbuseDetect] /** * Calculate and output an element's session duration. */ private static class UserSessionInfoFn extends DoFn<KV<String, Integer>, Integer> { @ProcessElement public void processElement(ProcessContext c, BoundedWindow window) { IntervalWindow w = (IntervalWindow) window; int duration = new Duration( w.start(), w.end()).toPeriod().toStandardMinutes().getMinutes(); c.output(duration); } } /** * Options supported by {@link GameStats}. */ interface Options extends LeaderBoard.Options { @Description("Numeric value of fixed window duration for user analysis, in minutes") @Default.Integer(60) Integer getFixedWindowDuration(); void setFixedWindowDuration(Integer value); @Description("Numeric value of gap between user sessions, in minutes") @Default.Integer(5) Integer getSessionGap(); void setSessionGap(Integer value); @Description("Numeric value of fixed window for finding mean of user session duration, " + "in minutes") @Default.Integer(30) Integer getUserActivityWindowDuration(); void setUserActivityWindowDuration(Integer value); @Description("Prefix used for the BigQuery table names") @Default.String("game_stats") String getGameStatsTablePrefix(); void setGameStatsTablePrefix(String value); } /** * Create a map of information that describes how to write pipeline output to BigQuery. This map * is used to write information about team score sums. */ protected static Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>> configureWindowedWrite() { Map<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>> tableConfigure = new HashMap<String, WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>>(); tableConfigure.put( "team", new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>( "STRING", (c, w) -> c.element().getKey())); tableConfigure.put( "total_score", new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>( "INTEGER", (c, w) -> c.element().getValue())); tableConfigure.put( "window_start", new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>( "STRING", (c, w) -> { IntervalWindow window = (IntervalWindow) w; return fmt.print(window.start()); })); tableConfigure.put( "processing_time", new WriteWindowedToBigQuery.FieldInfo<KV<String, Integer>>( "STRING", (c, w) -> fmt.print(Instant.now()))); return tableConfigure; } /** * Create a map of information that describes how to write pipeline output to BigQuery. This map * is used to write information about mean user session time. */ protected static Map<String, WriteWindowedToBigQuery.FieldInfo<Double>> configureSessionWindowWrite() { Map<String, WriteWindowedToBigQuery.FieldInfo<Double>> tableConfigure = new HashMap<String, WriteWindowedToBigQuery.FieldInfo<Double>>(); tableConfigure.put( "window_start", new WriteWindowedToBigQuery.FieldInfo<Double>( "STRING", (c, w) -> { IntervalWindow window = (IntervalWindow) w; return fmt.print(window.start()); })); tableConfigure.put( "mean_duration", new WriteWindowedToBigQuery.FieldInfo<Double>("FLOAT", (c, w) -> c.element())); return tableConfigure; } public static void main(String[] args) throws Exception { Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class); // Enforce that this pipeline is always run in streaming mode. options.setStreaming(true); ExampleUtils exampleUtils = new ExampleUtils(options); Pipeline pipeline = Pipeline.create(options); // Read Events from Pub/Sub using custom timestamps PCollection<GameActionInfo> rawEvents = pipeline .apply(PubsubIO.readStrings() .withTimestampAttribute(TIMESTAMP_ATTRIBUTE).fromTopic(options.getTopic())) .apply("ParseGameEvent", ParDo.of(new ParseEventFn())); // Extract username/score pairs from the event stream PCollection<KV<String, Integer>> userEvents = rawEvents.apply("ExtractUserScore", MapElements .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers())) .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore()))); // Calculate the total score per user over fixed windows, and // cumulative updates for late data. final PCollectionView<Map<String, Integer>> spammersView = userEvents .apply("FixedWindowsUser", Window.<KV<String, Integer>>into( FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))) // Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate. // These might be robots/spammers. .apply("CalculateSpammyUsers", new CalculateSpammyUsers()) // Derive a view from the collection of spammer users. It will be used as a side input // in calculating the team score sums, below. .apply("CreateSpammersView", View.<String, Integer>asMap()); // [START DocInclude_FilterAndCalc] // Calculate the total score per team over fixed windows, // and emit cumulative updates for late data. Uses the side input derived above-- the set of // suspected robots-- to filter out scores from those users from the sum. // Write the results to BigQuery. rawEvents .apply("WindowIntoFixedWindows", Window.<GameActionInfo>into( FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration())))) // Filter out the detected spammer users, using the side input derived above. .apply("FilterOutSpammers", ParDo .of(new DoFn<GameActionInfo, GameActionInfo>() { @ProcessElement public void processElement(ProcessContext c) { // If the user is not in the spammers Map, output the data element. if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) { c.output(c.element()); } } }).withSideInputs(spammersView)) // Extract and sum teamname/score pairs from the event data. .apply("ExtractTeamScore", new ExtractAndSumScore("team")) // [END DocInclude_FilterAndCalc] // Write the result to BigQuery .apply("WriteTeamSums", new WriteWindowedToBigQuery<KV<String, Integer>>( options.as(GcpOptions.class).getProject(), options.getDataset(), options.getGameStatsTablePrefix() + "_team", configureWindowedWrite())); // [START DocInclude_SessionCalc] // Detect user sessions-- that is, a burst of activity separated by a gap from further // activity. Find and record the mean session lengths. // This information could help the game designers track the changing user engagement // as their set of games changes. userEvents .apply("WindowIntoSessions", Window.<KV<String, Integer>>into( Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap()))) .withTimestampCombiner(TimestampCombiner.END_OF_WINDOW)) // For this use, we care only about the existence of the session, not any particular // information aggregated over it, so the following is an efficient way to do that. .apply(Combine.perKey(x -> 0)) // Get the duration per session. .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn())) // [END DocInclude_SessionCalc] // [START DocInclude_Rewindow] // Re-window to process groups of session sums according to when the sessions complete. .apply("WindowToExtractSessionMean", Window.<Integer>into( FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration())))) // Find the mean session duration in each window. .apply(Mean.<Integer>globally().withoutDefaults()) // Write this info to a BigQuery table. .apply("WriteAvgSessionLength", new WriteWindowedToBigQuery<Double>( options.as(GcpOptions.class).getProject(), options.getDataset(), options.getGameStatsTablePrefix() + "_sessions", configureSessionWindowWrite())); // [END DocInclude_Rewindow] // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the // command line. PipelineResult result = pipeline.run(); exampleUtils.waitToFinish(result); } }