/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples.complete;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.datastore.v1.client.DatastoreHelper.makeKey;
import static com.google.datastore.v1.client.DatastoreHelper.makeValue;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.base.MoreObjects;
import com.google.datastore.v1.Entity;
import com.google.datastore.v1.Key;
import com.google.datastore.v1.Value;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.beam.examples.common.ExampleBigQueryTableOptions;
import org.apache.beam.examples.common.ExampleOptions;
import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.datastore.DatastoreIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.StreamingOptions;
import org.apache.beam.sdk.options.Validation;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.Filter;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Partition;
import org.apache.beam.sdk.transforms.Partition.PartitionFn;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.Top;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.SlidingWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.transforms.windowing.WindowFn;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;
import org.joda.time.Duration;

/**
 * An example that computes the most popular hash tags
 * for every prefix, which can be used for auto-completion.
 *
 * <p>Concepts: Using the same pipeline in both streaming and batch, combiners,
 * composite transforms.
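 *
 * <p>For example, given (purely illustrative) input containing the hashtags {@code #beam},
 * {@code #beam} and {@code #bigdata}, the computed completions would include entries such as:
 * <pre>{@code
 *   "b"   -> [(beam, 2), (bigdata, 1)]
 *   "be"  -> [(beam, 2)]
 *   "big" -> [(bigdata, 1)]
 * }</pre>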
 *
 * <p>To execute this pipeline in streaming mode, specify:
 * <pre>{@code
 *   --streaming
 * }</pre>
 *
 * <p>To change the runner, specify:
 * <pre>{@code
 *   --runner=YOUR_SELECTED_RUNNER
 * }</pre>
 * See examples/java/README.md for instructions about how to configure different runners.
 *
 * <p>This will update the Cloud Datastore every 5 seconds based on the last
 * 30 minutes of data received.
 */
public class AutoComplete {

  /**
   * A PTransform that takes as input a list of tokens and returns
   * the most common tokens per prefix.
   */
  public static class ComputeTopCompletions
      extends PTransform<PCollection<String>,
                         PCollection<KV<String, List<CompletionCandidate>>>> {
    private final int candidatesPerPrefix;
    private final boolean recursive;

    protected ComputeTopCompletions(int candidatesPerPrefix, boolean recursive) {
      this.candidatesPerPrefix = candidatesPerPrefix;
      this.recursive = recursive;
    }

    public static ComputeTopCompletions top(int candidatesPerPrefix, boolean recursive) {
      return new ComputeTopCompletions(candidatesPerPrefix, recursive);
    }

    @Override
    public PCollection<KV<String, List<CompletionCandidate>>> expand(PCollection<String> input) {
      PCollection<CompletionCandidate> candidates = input
          // First count how often each token appears.
          .apply(Count.<String>perElement())

          // Map the KV outputs of Count into our own CompletionCandidate class.
          .apply("CreateCompletionCandidates", ParDo.of(
              new DoFn<KV<String, Long>, CompletionCandidate>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                  c.output(new CompletionCandidate(c.element().getKey(), c.element().getValue()));
                }
              }));

      // Compute the top via either a flat or recursive algorithm.
      if (recursive) {
        return candidates
            .apply(new ComputeTopRecursive(candidatesPerPrefix, 1))
            .apply(Flatten.<KV<String, List<CompletionCandidate>>>pCollections());
      } else {
        return candidates
            .apply(new ComputeTopFlat(candidatesPerPrefix, 1));
      }
    }
  }

  /**
   * Lower latency, but more expensive.
   */
  private static class ComputeTopFlat
      extends PTransform<PCollection<CompletionCandidate>,
                         PCollection<KV<String, List<CompletionCandidate>>>> {
    private final int candidatesPerPrefix;
    private final int minPrefix;

    public ComputeTopFlat(int candidatesPerPrefix, int minPrefix) {
      this.candidatesPerPrefix = candidatesPerPrefix;
      this.minPrefix = minPrefix;
    }

    @Override
    public PCollection<KV<String, List<CompletionCandidate>>> expand(
        PCollection<CompletionCandidate> input) {
      return input
          // For each completion candidate, map it to all prefixes.
          .apply(ParDo.of(new AllPrefixes(minPrefix)))

          // Find and return the top candidates for each prefix.
          .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix)
              .withHotKeyFanout(new HotKeyFanout()));
    }

    private static class HotKeyFanout implements SerializableFunction<String, Integer> {
      @Override
      public Integer apply(String input) {
        return (int) Math.pow(4, 5 - input.length());
      }
    }
  }

  /**
   * Cheaper but higher latency.
   *
   * <p>Returns two PCollections: the first contains the top prefixes of length greater
   * than minPrefix, and the second contains the top prefixes of length exactly minPrefix.
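   *
   * <p>As a rough sketch of the recursion (the prefixes below are illustrative): the top
   * candidates for a prefix such as {@code "be"} are computed from the top candidates already
   * found for the longer prefixes {@code "bea"}, {@code "bes"}, ..., together with the candidates
   * whose value is exactly {@code "be"}. Any candidate in the top N for {@code "be"} must also
   * appear in the top N for one of those longer prefixes (or be {@code "be"} itself), so far
   * less data is re-ranked at each level, at the cost of a deeper, higher-latency pipeline.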
   */
  private static class ComputeTopRecursive
      extends PTransform<PCollection<CompletionCandidate>,
                         PCollectionList<KV<String, List<CompletionCandidate>>>> {
    private final int candidatesPerPrefix;
    private final int minPrefix;

    public ComputeTopRecursive(int candidatesPerPrefix, int minPrefix) {
      this.candidatesPerPrefix = candidatesPerPrefix;
      this.minPrefix = minPrefix;
    }

    private class KeySizePartitionFn
        implements PartitionFn<KV<String, List<CompletionCandidate>>> {
      @Override
      public int partitionFor(KV<String, List<CompletionCandidate>> elem, int numPartitions) {
        return elem.getKey().length() > minPrefix ? 0 : 1;
      }
    }

    private static class FlattenTops
        extends DoFn<KV<String, List<CompletionCandidate>>, CompletionCandidate> {
      @ProcessElement
      public void processElement(ProcessContext c) {
        for (CompletionCandidate cc : c.element().getValue()) {
          c.output(cc);
        }
      }
    }

    @Override
    public PCollectionList<KV<String, List<CompletionCandidate>>> expand(
        PCollection<CompletionCandidate> input) {
      if (minPrefix > 10) {
        // Base case, partitioning to return the output in the expected format.
        return input
            .apply(new ComputeTopFlat(candidatesPerPrefix, minPrefix))
            .apply(Partition.of(2, new KeySizePartitionFn()));
      } else {
        // If a candidate is in the top N for prefix a...b, it must also be in the top
        // N for a...bX for every X, which is typically a much smaller set to consider.
        // First, compute the top candidates for prefixes of size at least minPrefix + 1.
        PCollectionList<KV<String, List<CompletionCandidate>>> larger = input
            .apply(new ComputeTopRecursive(candidatesPerPrefix, minPrefix + 1));
        // Consider the top candidates for each prefix of length minPrefix + 1...
        PCollection<KV<String, List<CompletionCandidate>>> small = PCollectionList
            .of(larger.get(1).apply(ParDo.of(new FlattenTops())))
            // ...together with those (previously excluded) candidates of length
            // exactly minPrefix...
            .and(input.apply(Filter.by(
                new SerializableFunction<CompletionCandidate, Boolean>() {
                  @Override
                  public Boolean apply(CompletionCandidate c) {
                    return c.getValue().length() == minPrefix;
                  }
                })))
            .apply("FlattenSmall", Flatten.<CompletionCandidate>pCollections())
            // ...set the key to be the minPrefix-length prefix...
            .apply(ParDo.of(new AllPrefixes(minPrefix, minPrefix)))
            // ...and (re)apply the Top operator to all of them together.
            .apply(Top.<String, CompletionCandidate>largestPerKey(candidatesPerPrefix));

        PCollection<KV<String, List<CompletionCandidate>>> flattenLarger = larger
            .apply("FlattenLarge", Flatten.<KV<String, List<CompletionCandidate>>>pCollections());

        return PCollectionList.of(flattenLarger).and(small);
      }
    }
  }

  /**
   * A DoFn that keys each candidate by all its prefixes.
   */
  private static class AllPrefixes
      extends DoFn<CompletionCandidate, KV<String, CompletionCandidate>> {
    private final int minPrefix;
    private final int maxPrefix;

    public AllPrefixes(int minPrefix) {
      this(minPrefix, Integer.MAX_VALUE);
    }

    public AllPrefixes(int minPrefix, int maxPrefix) {
      this.minPrefix = minPrefix;
      this.maxPrefix = maxPrefix;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
      String word = c.element().value;
      for (int i = minPrefix; i <= Math.min(word.length(), maxPrefix); i++) {
        c.output(KV.of(word.substring(0, i), c.element()));
      }
    }
  }

  /**
   * Class used to store tag-count pairs.
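   *
   * <p>Candidates are ordered first by count and then lexicographically by value, so that
   * {@code Top.largestPerKey} keeps the most frequent tags for each prefix (for example, with
   * illustrative values, {@code (beam, 2)} ranks above {@code (bigdata, 1)}).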
   */
  @DefaultCoder(AvroCoder.class)
  static class CompletionCandidate implements Comparable<CompletionCandidate> {
    private long count;
    private String value;

    public CompletionCandidate(String value, long count) {
      this.value = value;
      this.count = count;
    }

    public long getCount() {
      return count;
    }

    public String getValue() {
      return value;
    }

    // Empty constructor required for Avro decoding.
    public CompletionCandidate() {}

    @Override
    public int compareTo(CompletionCandidate o) {
      if (this.count < o.count) {
        return -1;
      } else if (this.count == o.count) {
        return this.value.compareTo(o.value);
      } else {
        return 1;
      }
    }

    @Override
    public boolean equals(Object other) {
      if (other instanceof CompletionCandidate) {
        CompletionCandidate that = (CompletionCandidate) other;
        return this.count == that.count && this.value.equals(that.value);
      } else {
        return false;
      }
    }

    @Override
    public int hashCode() {
      return Long.valueOf(count).hashCode() ^ value.hashCode();
    }

    @Override
    public String toString() {
      return "CompletionCandidate[" + value + ", " + count + "]";
    }
  }

  /**
   * Takes as input a set of strings, and emits each #hashtag found therein.
   */
  static class ExtractHashtags extends DoFn<String, String> {
    @ProcessElement
    public void processElement(ProcessContext c) {
      Matcher m = Pattern.compile("#\\S+").matcher(c.element());
      while (m.find()) {
        c.output(m.group().substring(1));
      }
    }
  }

  /**
   * Formats the top candidates per prefix as BigQuery table rows.
   */
  static class FormatForBigquery extends DoFn<KV<String, List<CompletionCandidate>>, TableRow> {
    @ProcessElement
    public void processElement(ProcessContext c) {
      List<TableRow> completions = new ArrayList<>();
      for (CompletionCandidate cc : c.element().getValue()) {
        completions.add(new TableRow()
            .set("count", cc.getCount())
            .set("tag", cc.getValue()));
      }
      TableRow row = new TableRow()
          .set("prefix", c.element().getKey())
          .set("tags", completions);
      c.output(row);
    }

    /**
     * Defines the BigQuery schema used for the output.
     */
    static TableSchema getSchema() {
      List<TableFieldSchema> tagFields = new ArrayList<>();
      tagFields.add(new TableFieldSchema().setName("count").setType("INTEGER"));
      tagFields.add(new TableFieldSchema().setName("tag").setType("STRING"));
      List<TableFieldSchema> fields = new ArrayList<>();
      fields.add(new TableFieldSchema().setName("prefix").setType("STRING"));
      fields.add(new TableFieldSchema()
          .setName("tags").setType("RECORD").setMode("REPEATED").setFields(tagFields));
      return new TableSchema().setFields(fields);
    }
  }

  /**
   * Takes as input the top candidates per prefix, and emits an entity
   * suitable for writing to Cloud Datastore.
   *
   * <p>Note: We use ancestor keys for strong consistency.
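   * Every completion entity for a given kind is written under a single ancestor key, so the
   * per-prefix completion lists can be read back with strongly consistent ancestor queries.
   * For example (with the default options and an illustrative prefix), the entity for prefix
   * {@code "be"} has the key path {@code (autocomplete-demo, "root") / (autocomplete-demo, "be")}.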
   * See the Cloud Datastore documentation on
   * <a href="https://cloud.google.com/datastore/docs/concepts/structuring_for_strong_consistency">
   * Structuring Data for Strong Consistency</a>.
   */
  static class FormatForDatastore extends DoFn<KV<String, List<CompletionCandidate>>, Entity> {
    private String kind;
    private String ancestorKey;

    public FormatForDatastore(String kind, String ancestorKey) {
      this.kind = kind;
      this.ancestorKey = ancestorKey;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
      Entity.Builder entityBuilder = Entity.newBuilder();
      Key key = makeKey(makeKey(kind, ancestorKey).build(), kind, c.element().getKey()).build();

      entityBuilder.setKey(key);
      List<Value> candidates = new ArrayList<>();
      Map<String, Value> properties = new HashMap<>();
      for (CompletionCandidate tag : c.element().getValue()) {
        // Each candidate becomes an embedded entity carrying its own tag and count.
        Entity.Builder tagEntity = Entity.newBuilder();
        tagEntity.putProperties("tag", makeValue(tag.value).build());
        tagEntity.putProperties("count", makeValue(tag.count).build());
        candidates.add(makeValue(tagEntity).build());
      }
      properties.put("candidates", makeValue(candidates).build());
      entityBuilder.putAllProperties(properties);
      c.output(entityBuilder.build());
    }
  }

  /**
   * Options supported by this class.
   *
   * <p>Inherits standard Beam example configuration options.
   */
  public interface Options
      extends ExampleOptions, ExampleBigQueryTableOptions, StreamingOptions {
    @Description("Input text file")
    @Validation.Required
    String getInputFile();
    void setInputFile(String value);

    @Description("Whether to use the recursive algorithm")
    @Default.Boolean(true)
    Boolean getRecursive();
    void setRecursive(Boolean value);

    @Description("Cloud Datastore entity kind")
    @Default.String("autocomplete-demo")
    String getKind();
    void setKind(String value);

    @Description("Whether to output to BigQuery")
    @Default.Boolean(true)
    Boolean getOutputToBigQuery();
    void setOutputToBigQuery(Boolean value);

    @Description("Whether to output to Cloud Datastore")
    @Default.Boolean(false)
    Boolean getOutputToDatastore();
    void setOutputToDatastore(Boolean value);

    @Description("Cloud Datastore ancestor key")
    @Default.String("root")
    String getDatastoreAncestorKey();
    void setDatastoreAncestorKey(String value);

    @Description("Cloud Datastore output project ID, defaults to project ID")
    String getOutputProject();
    void setOutputProject(String value);
  }

  public static void main(String[] args) throws IOException {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    options.setBigQuerySchema(FormatForBigquery.getSchema());
    ExampleUtils exampleUtils = new ExampleUtils(options);

    // We support running the same pipeline in either
    // batch or windowed streaming mode.
    WindowFn<Object, ?> windowFn;
    if (options.isStreaming()) {
      checkArgument(
          !options.getOutputToDatastore(), "DatastoreIO is not supported in streaming.");
      windowFn =
          SlidingWindows.of(Duration.standardMinutes(30)).every(Duration.standardSeconds(5));
    } else {
      windowFn = new GlobalWindows();
    }

    // Create the pipeline.
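    // It reads lines of text, extracts the #hashtags, windows them (sliding windows in streaming
    // mode, the global window in batch), computes the top completions per prefix, and writes the
    // results to BigQuery and/or Cloud Datastore depending on the options above.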
    Pipeline p = Pipeline.create(options);

    PCollection<KV<String, List<CompletionCandidate>>> toWrite = p
        .apply(TextIO.read().from(options.getInputFile()))
        .apply(ParDo.of(new ExtractHashtags()))
        .apply(Window.<String>into(windowFn))
        .apply(ComputeTopCompletions.top(10, options.getRecursive()));

    if (options.getOutputToDatastore()) {
      toWrite
          .apply("FormatForDatastore", ParDo.of(new FormatForDatastore(options.getKind(),
              options.getDatastoreAncestorKey())))
          .apply(DatastoreIO.v1().write().withProjectId(MoreObjects.firstNonNull(
              options.getOutputProject(), options.getProject())));
    }
    if (options.getOutputToBigQuery()) {
      exampleUtils.setupBigQueryTable();

      TableReference tableRef = new TableReference();
      tableRef.setProjectId(options.getProject());
      tableRef.setDatasetId(options.getBigQueryDataset());
      tableRef.setTableId(options.getBigQueryTable());

      toWrite
          .apply(ParDo.of(new FormatForBigquery()))
          .apply(BigQueryIO.writeTableRows()
              .to(tableRef)
              .withSchema(FormatForBigquery.getSchema())
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
              .withWriteDisposition(options.isStreaming()
                  ? BigQueryIO.Write.WriteDisposition.WRITE_APPEND
                  : BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
    }

    // Run the pipeline.
    PipelineResult result = p.run();

    // ExampleUtils will try to cancel the pipeline and the injector before the program exits.
    exampleUtils.waitToFinish(result);
  }
}