StreamingWordExtract.java example

Explorer
beam-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples.complete;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import java.io.IOException;
import java.util.ArrayList;
import org.apache.beam.examples.common.ExampleBigQueryTableOptions;
import org.apache.beam.examples.common.ExampleOptions;
import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.StreamingOptions;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;

/**
 * A streaming Beam Example using BigQuery output.
 *
 * <p>This pipeline example reads lines of the input text file, splits each line
 * into individual words, capitalizes those words, and writes the output to
 * a BigQuery table.
 *
 * <p>The example is configured to use the default BigQuery table from the example common package
 * (there are no defaults for a general Beam pipeline).
 * You can override them by using the {@literal --bigQueryDataset}, and {@literal --bigQueryTable}
 * options. If the BigQuery table do not exist, the example will try to create them.
 *
 * <p>The example will try to cancel the pipelines on the signal to terminate the process (CTRL-C)
 * and then exits.
 */
public class StreamingWordExtract {

  /** A {@link DoFn} that tokenizes lines of text into individual words. */
  static class ExtractWords extends DoFn<String, String> {
    @ProcessElement
    public void processElement(ProcessContext c) {
      String[] words = c.element().split(ExampleUtils.TOKENIZER_PATTERN);
      for (String word : words) {
        if (!word.isEmpty()) {
          c.output(word);
        }
      }
    }
  }

  /** A {@link DoFn} that uppercases a word. */
  static class Uppercase extends DoFn<String, String> {
    @ProcessElement
    public void processElement(ProcessContext c) {
      c.output(c.element().toUpperCase());
    }
  }

  /**
   * Converts strings into BigQuery rows.
   */
  static class StringToRowConverter extends DoFn<String, TableRow> {
    /**
     * In this example, put the whole string into single BigQuery field.
     */
    @ProcessElement
    public void processElement(ProcessContext c) {
      c.output(new TableRow().set("string_field", c.element()));
    }

    static TableSchema getSchema() {
      return new TableSchema().setFields(new ArrayList<TableFieldSchema>() {
            // Compose the list of TableFieldSchema from tableSchema.
            {
              add(new TableFieldSchema().setName("string_field").setType("STRING"));
            }
      });
    }
  }

  /**
   * Options supported by {@link StreamingWordExtract}.
   *
   * <p>Inherits standard configuration options.
   */
  private interface StreamingWordExtractOptions
      extends ExampleOptions, ExampleBigQueryTableOptions, StreamingOptions {
    @Description("Path of the file to read from")
    @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt")
    String getInputFile();
    void setInputFile(String value);
  }

  /**
   * Sets up and starts streaming pipeline.
   *
   * @throws IOException if there is a problem setting up resources
   */
  public static void main(String[] args) throws IOException {
    StreamingWordExtractOptions options = PipelineOptionsFactory.fromArgs(args)
        .withValidation()
        .as(StreamingWordExtractOptions.class);
    options.setStreaming(true);

    options.setBigQuerySchema(StringToRowConverter.getSchema());
    ExampleUtils exampleUtils = new ExampleUtils(options);
    exampleUtils.setup();

    Pipeline pipeline = Pipeline.create(options);

    String tableSpec = new StringBuilder()
        .append(options.getProject()).append(":")
        .append(options.getBigQueryDataset()).append(".")
        .append(options.getBigQueryTable())
        .toString();
    pipeline
        .apply("ReadLines", TextIO.read().from(options.getInputFile()))
        .apply(ParDo.of(new ExtractWords()))
        .apply(ParDo.of(new Uppercase()))
        .apply(ParDo.of(new StringToRowConverter()))
        .apply(BigQueryIO.writeTableRows().to(tableSpec)
            .withSchema(StringToRowConverter.getSchema()));

    PipelineResult result = pipeline.run();

    // ExampleUtils will try to cancel the pipeline before the program exists.
    exampleUtils.waitToFinish(result);
  }
}