/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples.complete;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.datastore.v1.client.DatastoreHelper.makeKey;
import static com.google.datastore.v1.client.DatastoreHelper.makeValue;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.base.MoreObjects;
import com.google.datastore.v1.Entity;
import com.google.datastore.v1.Key;
import com.google.datastore.v1.Value;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.beam.examples.common.ExampleBigQueryTableOptions;
import org.apache.beam.examples.common.ExampleOptions;
import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.datastore.DatastoreIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.StreamingOptions;
import org.apache.beam.sdk.options.Validation;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.Filter;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Partition;
import org.apache.beam.sdk.transforms.Partition.PartitionFn;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.Top;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.SlidingWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.transforms.windowing.WindowFn;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;
import org.joda.time.Duration;

/**
 * An example that computes the most popular hash tags
 * for every prefix, which can be used for auto-completion.
 *
 * <p>Concepts: Using the same pipeline in both streaming and batch, combiners,
 * composite transforms.
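 *
 * <p>For example, given (purely illustrative) input containing the hashtags {@code #beam},
 * {@code #beam} and {@code #bigdata}, the computed completions would include entries such as:
 * <pre>{@code
 *   "b"   -> [(beam, 2), (bigdata, 1)]
 *   "be"  -> [(beam, 2)]
 *   "big" -> [(bigdata, 1)]
 * }</pre>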
 *
 * <p>To execute this pipeline in streaming mode, specify:
 * <pre>{@code
 *   --streaming
 * }</pre>
 *
 * <p>To change the runner, specify:
 * <pre>{@code
 *   --runner=YOUR_SELECTED_RUNNER
 * }</pre>
 * See examples/java/README.md for instructions about how to configure different runners.
 *
 * <p>This will update the Cloud Datastore every 5 seconds based on the last
 * 30 minutes of data received.
 */
public class AutoComplete {

  /**
   * A PTransform that takes as input a list of tokens and returns
   * the most common tokens per prefix.
   */
  public static class ComputeTopCompletions
      extends PTransform<PCollection<String>,
                         PCollection<KV<String, List<CompletionCandidate>>>> {
    private final int candidatesPerPrefix;
    private final boolean recursive;

    protected ComputeTopCompletions(int candidatesPerPrefix, boolean recursive) {
      this.candidatesPerPrefix = candidatesPerPrefix;
      this.recursive = recursive;
    }

    public static ComputeTopCompletions top(int candidatesPerPrefix, boolean recursive) {
      return new ComputeTopCompletions(candidatesPerPrefix, recursive);
    }

    @Override
    public PCollection<KV<String, List<CompletionCandidate>>> expand(PCollection<String> input) {
      PCollection<CompletionCandidate> candidates = input
          // First count how often each token appears.
          .apply(Count.<String>perElement())

          // Map the KV outputs of Count into our own CompletionCandidate class.
          .apply("CreateCompletionCandidates", ParDo.of(
              new DoFn<KV<String, Long>, CompletionCandidate>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  c.output(new CompletionCandidate(c.element().getKey(), c.element().getValue()));
                }
              }));

      // Compute the top via either a flat or recursive algorithm.
      if (recursive) {
        return candidates
            .apply(new ComputeTopRecursive(candidatesPerPrefix, 1))
            .apply(Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
      } else {
        return candidates
            .apply(new ComputeTopFlat(candidatesPerPrefix, 1));
      }
    }
  }

  /**
   * Lower latency, but more expensive.
   */
  private static class ComputeTopFlat
      extends PTransform<PCollection<CompletionCandidate>,
                         PCollection<KV<String, List<CompletionCandidate>>>> {
    private final int candidatesPerPrefix;
    private final int minPrefix;

    public ComputeTopFlat(int candidatesPerPrefix, int minPrefix) {
      this.candidatesPerPrefix = candidatesPerPrefix;
      this.minPrefix = minPrefix;
    }

    @Override
    public PCollection<KV<String, List<CompletionCandidate>>> expand(
        PCollection<CompletionCandidate> input) {
      return input
          // For each completion candidate, map it to all prefixes.
          .apply(ParDo.of(new AllPrefixes(minPrefix)))

          // Find and return the top candidates for each prefix.
          .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix)
              .withHotKeyFanout(new HotKeyFanout()));
    }

    private static class HotKeyFanout implements SerializableFunction<String, Integer> {
      @Override
      public Integer apply(String input) {
        return (int) Math.pow(4, 5 - input.length());
      }
    }
  }

  /**
   * Cheaper but higher latency.
   *
   * <p>Returns two PCollections: the first contains the top prefixes of length greater
   * than minPrefix, and the second contains the top prefixes of length exactly minPrefix.
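   *
   * <p>As a rough sketch of the recursion (the prefixes below are illustrative): the top
   * candidates for a prefix such as {@code "be"} are computed from the top candidates already
   * found for the longer prefixes {@code "bea"}, {@code "bes"}, ..., together with the candidates
   * whose value is exactly {@code "be"}. Any candidate in the top N for {@code "be"} must also
   * appear in the top N for one of those longer prefixes (or be {@code "be"} itself), so far
   * less data is re-ranked at each level, at the cost of a deeper, higher-latency pipeline.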
   */
  private static class ComputeTopRecursive
      extends PTransform<PCollection<CompletionCandidate>,
                         PCollectionList<KV<String, List<CompletionCandidate>>>> {
    private final int candidatesPerPrefix;
    private final int minPrefix;

    public ComputeTopRecursive(int candidatesPerPrefix, int minPrefix) {
      this.candidatesPerPrefix = candidatesPerPrefix;
      this.minPrefix = minPrefix;
    }

    private class KeySizePartitionFn
        implements PartitionFn<KV<String, List<CompletionCandidate>>> {
      @Override
      public int partitionFor(KV<String, List<CompletionCandidate>> elem, int numPartitions) {
        return elem.getKey().length() > minPrefix ? 0 : 1;
      }
    }

    private static class FlattenTops
        extends DoFn<KV<String, List<CompletionCandidate>>, CompletionCandidate> {
      @ProcessElement
      public void processElement(ProcessContext c) {
        for (CompletionCandidate cc : c.element().getValue()) {
          c.output(cc);
        }
      }
    }

    @Override
    public PCollectionList<KV<String, List<CompletionCandidate>>> expand(
        PCollection<CompletionCandidate> input) {
      if (minPrefix > 10) {
        // Base case, partitioning to return the output in the expected format.
        return input
            .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
            .apply(Partition.of(2, new KeySizePartitionFn()));
      } else {
        // If a candidate is in the top N for prefix a...b, it must also be in the top
        // N for a...bX for every X, which is typically a much smaller set to consider.
        // First, compute the top candidates for prefixes of size at least minPrefix + 1.
        PCollectionList<KV<String, List<CompletionCandidate>>> larger = input
            .apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));
        // Consider the top candidates for each prefix of length minPrefix + 1...
        PCollection<KV<String, List<CompletionCandidate>>> small = PCollectionList
            .of(larger.get(1).apply(ParDo.of(new FlattenTops())))
            // ...together with those (previously excluded) candidates of length
            // exactly minPrefix...
            .and(input.apply(Filter.by(
                new SerializableFunction<CompletionCandidate, Boolean>() {
                  @Override
                  public Boolean apply(CompletionCandidate c) {
                    return c.getValue().length() == minPrefix;
                  }
                })))
            .apply("FlattenSmall", Flatten.<CompletionCandidate>pCollections())
            // ...set the key to be the minPrefix-length prefix...
            .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
            // ...and (re)apply the Top operator to all of them together.
            .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix));

        PCollection<KV<String, List<CompletionCandidate>>> flattenLarger = larger
            .apply("FlattenLarge", Flatten.<KV<String, List<CompletionCandidate>>>pCollections());

        return PCollectionList.of(flattenLarger).and(small);
      }
    }
  }

  /**
   * A DoFn that keys each candidate by all its prefixes.
   */
  private static class AllPrefixes
      extends DoFn<CompletionCandidate, KV<String, CompletionCandidate>> {
    private final int minPrefix;
    private final int maxPrefix;

    public AllPrefixes(int minPrefix) {
      this(minPrefix, Integer.MAX_VALUE);
    }

    public AllPrefixes(int minPrefix, int maxPrefix) {
      this.minPrefix = minPrefix;
      this.maxPrefix = maxPrefix;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
      String word = c.element().value;
      for (int i = minPrefix; i <= Math.min(word.length(), maxPrefix); i++) {
        c.output(KV.of(word.substring(0, i), c.element()));
      }
    }
  }

  /**
   * Class used to store tag-count pairs.
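   *
   * <p>Candidates are ordered first by count and then lexicographically by value, so that
   * {@code Top.largestPerKey} keeps the most frequent tags for each prefix (for example, with
   * illustrative values, {@code (beam, 2)} ranks above {@code (bigdata, 1)}).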
   */
  @DefaultCoder(AvroCoder.class)
  static class CompletionCandidate implements Comparable<CompletionCandidate> {
    private long count;
    private String value;

    public CompletionCandidate(String value, long count) {
      this.value = value;
      this.count = count;
    }

    public long getCount() {
      return count;
    }

    public String getValue() {
      return value;
    }

    // Empty constructor required for Avro decoding.
    public CompletionCandidate() {}

    @Override
    public int compareTo(CompletionCandidate o) {
      if (this.count < o.count) {
        return -1;
      } else if (this.count == o.count) {
        return this.value.compareTo(o.value);
      } else {
        return 1;
      }
    }

    @Override
    public boolean equals(Object other) {
      if (other instanceof CompletionCandidate) {
        CompletionCandidate that = (CompletionCandidate) other;
        return this.count == that.count && this.value.equals(that.value);
      } else {
        return false;
      }
    }

    @Override
    public int hashCode() {
      return Long.valueOf(count).hashCode() ^ value.hashCode();
    }

    @Override
    public String toString() {
      return "CompletionCandidate[" + value + ", " + count + "]";
    }
  }

  /**
   * Takes as input a set of strings, and emits each #hashtag found therein.
   */
  static class ExtractHashtags extends DoFn<String, String> {
    @ProcessElement
    public void processElement(ProcessContext c) {
      Matcher m = Pattern.compile("#\\S+").matcher(c.element());
      while (m.find()) {
        c.output(m.group().substring(1));
      }
    }
  }

  /**
   * Formats the top candidates per prefix as BigQuery table rows.
   */
  static class FormatForBigquery extends DoFn<KV<String, List<CompletionCandidate>>, TableRow> {
    @ProcessElement
    public void processElement(ProcessContext c) {
      List<TableRow> completions = new ArrayList<>();
      for (CompletionCandidate cc : c.element().getValue()) {
        completions.add(new TableRow()
            .set("count", cc.getCount())
            .set("tag", cc.getValue()));
      }
      TableRow row = new TableRow()
          .set("prefix", c.element().getKey())
          .set("tags", completions);
      c.output(row);
    }

    /**
     * Defines the BigQuery schema used for the output.
     */
    static TableSchema getSchema() {
      List<TableFieldSchema> tagFields = new ArrayList<>();
      tagFields.add(new TableFieldSchema().setName("count").setType("INTEGER"));
      tagFields.add(new TableFieldSchema().setName("tag").setType("STRING"));
      List<TableFieldSchema> fields = new ArrayList<>();
      fields.add(new TableFieldSchema().setName("prefix").setType("STRING"));
      fields.add(new TableFieldSchema()
          .setName("tags").setType("RECORD").setMode("REPEATED").setFields(tagFields));
      return new TableSchema().setFields(fields);
    }
  }

  /**
   * Takes as input the top candidates per prefix, and emits an entity
   * suitable for writing to Cloud Datastore.
   *
   * <p>Note: We use ancestor keys for strong consistency.
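   * Every completion entity for a given kind is written under a single ancestor key, so the
   * per-prefix completion lists can be read back with strongly consistent ancestor queries.
   * For example (with the default options and an illustrative prefix), the entity for prefix
   * {@code "be"} has the key path {@code (autocomplete-demo, "root") / (autocomplete-demo, "be")}.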
   * See the Cloud Datastore documentation on
   * <a href="https://cloud.google.com/datastore/docs/concepts/structuring_for_strong_consistency">
   * Structuring Data for Strong Consistency</a>.
   */
  static class FormatForDatastore extends DoFn<KV<String, List<CompletionCandidate>>, Entity> {
    private String kind;
    private String ancestorKey;

    public FormatForDatastore(String kind, String ancestorKey) {
      this.kind = kind;
      this.ancestorKey = ancestorKey;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
      Entity.Builder entityBuilder = Entity.newBuilder();
      Key key = makeKey(makeKey(kind, ancestorKey).build(), kind, c.element().getKey()).build();

      entityBuilder.setKey(key);
      List<Value> candidates = new ArrayList<>();
      Map<String, Value> properties = new HashMap<>();
      for (CompletionCandidate tag : c.element().getValue()) {
        // Each candidate becomes an embedded entity carrying its own tag and count.
        Entity.Builder tagEntity = Entity.newBuilder();
        tagEntity.putProperties("tag", makeValue(tag.value).build());
        tagEntity.putProperties("count", makeValue(tag.count).build());
        candidates.add(makeValue(tagEntity).build());
      }
      properties.put("candidates", makeValue(candidates).build());
      entityBuilder.putAllProperties(properties);
      c.output(entityBuilder.build());
    }
  }

  /**
   * Options supported by this class.
   *
   * <p>Inherits standard Beam example configuration options.
   */
  public interface Options
      extends ExampleOptions, ExampleBigQueryTableOptions, StreamingOptions {
    @Description("Input text file")
    @Validation.Required
    String getInputFile();
    void setInputFile(String value);

    @Description("Whether to use the recursive algorithm")
    @Default.Boolean(true)
    Boolean getRecursive();
    void setRecursive(Boolean value);

    @Description("Cloud Datastore entity kind")
    @Default.String("autocomplete-demo")
    String getKind();
    void setKind(String value);

    @Description("Whether to output to BigQuery")
    @Default.Boolean(true)
    Boolean getOutputToBigQuery();
    void setOutputToBigQuery(Boolean value);

    @Description("Whether to output to Cloud Datastore")
    @Default.Boolean(false)
    Boolean getOutputToDatastore();
    void setOutputToDatastore(Boolean value);

    @Description("Cloud Datastore ancestor key")
    @Default.String("root")
    String getDatastoreAncestorKey();
    void setDatastoreAncestorKey(String value);

    @Description("Cloud Datastore output project ID, defaults to project ID")
    String getOutputProject();
    void setOutputProject(String value);
  }

  public static void main(String[] args) throws IOException {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    options.setBigQuerySchema(FormatForBigquery.getSchema());
    ExampleUtils exampleUtils = new ExampleUtils(options);

    // We support running the same pipeline in either
    // batch or windowed streaming mode.
    WindowFn<Object, ?> windowFn;
    if (options.isStreaming()) {
      checkArgument(
          !options.getOutputToDatastore(), "DatastoreIO is not supported in streaming.");
      windowFn =
          SlidingWindows.of(Duration.standardMinutes(30)).every(Duration.standardSeconds(5));
    } else {
      windowFn = new GlobalWindows();
    }

    // Create the pipeline.
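    // It reads lines of text, extracts the #hashtags, windows them (sliding windows in streaming
    // mode, the global window in batch), computes the top completions per prefix, and writes the
    // results to BigQuery and/or Cloud Datastore depending on the options above.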
    Pipeline p = Pipeline.create(options);

    PCollection<KV<String, List<CompletionCandidate>>> toWrite = p
        .apply(TextIO.read().from(options.getInputFile()))
        .apply(ParDo.of(new ExtractHashtags()))
        .apply(Window.<String>into(windowFn))
        .apply(ComputeTopCompletions.top(10, options.getRecursive()));

    if (options.getOutputToDatastore()) {
      toWrite
          .apply("FormatForDatastore", ParDo.of(new FormatForDatastore(options.getKind(),
              options.getDatastoreAncestorKey())))
          .apply(DatastoreIO.v1().write().withProjectId(MoreObjects.firstNonNull(
              options.getOutputProject(), options.getProject())));
    }
    if (options.getOutputToBigQuery()) {
      exampleUtils.setupBigQueryTable();

      TableReference tableRef = new TableReference();
      tableRef.setProjectId(options.getProject());
      tableRef.setDatasetId(options.getBigQueryDataset());
      tableRef.setTableId(options.getBigQueryTable());

      toWrite
          .apply(ParDo.of(new FormatForBigquery()))
          .apply(BigQueryIO.writeTableRows()
              .to(tableRef)
              .withSchema(FormatForBigquery.getSchema())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(options.isStreaming()
                  ? BigQueryIO.Write.WriteDisposition.WRITE_APPEND
                  : BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
    }

    // Run the pipeline.
    PipelineResult result = p.run();

    // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
    exampleUtils.waitToFinish(result);
  }
}