/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.examples.complete;

import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import org.apache.beam.examples.common.ExampleBigQueryTableOptions;
import org.apache.beam.examples.common.ExampleOptions;
import org.apache.beam.examples.common.ExampleUtils;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.options.Default;
import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.options.StreamingOptions;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;

/**
 * A streaming Beam Example using BigQuery output.
 *
 * <p>This pipeline example reads lines of the input text file, splits each line
 * into individual words, capitalizes those words, and writes the output to
 * a BigQuery table.
 *
 * <p>The example is configured to use the default BigQuery table from the example common package
 * (there are no defaults for a general Beam pipeline).
 * You can override them by using the {@literal --bigQueryDataset}, and {@literal --bigQueryTable}
 * options. If the BigQuery table does not exist, the example will try to create it.
 *
 * <p>The example will try to cancel the pipelines on the signal to terminate the process (CTRL-C)
 * and then exits.
 */
public class StreamingWordExtract {

  /**
   * A {@link DoFn} that tokenizes lines of text into individual words.
   *
   * <p>Each input line is split on {@link ExampleUtils#TOKENIZER_PATTERN}; empty tokens are
   * dropped and every remaining token is emitted as its own element.
   */
  static class ExtractWords extends DoFn<String, String> {
    @ProcessElement
    public void processElement(ProcessContext c) {
      String[] words = c.element().split(ExampleUtils.TOKENIZER_PATTERN);
      for (String word : words) {
        if (!word.isEmpty()) {
          c.output(word);
        }
      }
    }
  }

  /** A {@link DoFn} that uppercases a word. */
  static class Uppercase extends DoFn<String, String> {
    @ProcessElement
    public void processElement(ProcessContext c) {
      // Locale.ROOT keeps the casing rules independent of the default locale of the JVM
      // that happens to execute this function, so the same input always yields the same output.
      c.output(c.element().toUpperCase(Locale.ROOT));
    }
  }

  /**
   * Converts strings into BigQuery rows.
   */
  static class StringToRowConverter extends DoFn<String, TableRow> {

    /** Name of the single column; shared by the emitted rows and the table schema. */
    private static final String FIELD_NAME = "string_field";

    /**
     * In this example, put the whole string into single BigQuery field.
     */
    @ProcessElement
    public void processElement(ProcessContext c) {
      c.output(new TableRow().set(FIELD_NAME, c.element()));
    }

    /**
     * Builds the schema matching the rows produced by {@link #processElement}.
     *
     * @return a new schema with one {@code STRING} column named {@code string_field}
     */
    static TableSchema getSchema() {
      // A plain list instead of double-brace initialization: no anonymous ArrayList
      // subclass is created on every call.
      List<TableFieldSchema> fields = new ArrayList<>();
      fields.add(new TableFieldSchema().setName(FIELD_NAME).setType("STRING"));
      return new TableSchema().setFields(fields);
    }
  }

  /**
   * Options supported by {@link StreamingWordExtract}.
   *
   * <p>Inherits standard configuration options.
   *
   * <p>Declared {@code public} because it is handed to {@link PipelineOptionsFactory} together
   * with public option interfaces that live in another package.
   */
  public interface StreamingWordExtractOptions
      extends ExampleOptions, ExampleBigQueryTableOptions, StreamingOptions {
    @Description("Path of the file to read from")
    @Default.String("gs://apache-beam-samples/shakespeare/kinglear.txt")
    String getInputFile();

    void setInputFile(String value);
  }

  /**
   * Sets up and starts streaming pipeline.
   *
   * @param args command-line arguments parsed into {@link StreamingWordExtractOptions}
   * @throws IOException if there is a problem setting up resources
   */
  public static void main(String[] args) throws IOException {
    StreamingWordExtractOptions options =
        PipelineOptionsFactory.fromArgs(args)
            .withValidation()
            .as(StreamingWordExtractOptions.class);
    options.setStreaming(true);
    options.setBigQuerySchema(StringToRowConverter.getSchema());

    ExampleUtils exampleUtils = new ExampleUtils(options);
    exampleUtils.setup();

    Pipeline pipeline = Pipeline.create(options);

    // Table spec in the form "<project>:<dataset>.<table>".
    String tableSpec =
        options.getProject()
            + ":"
            + options.getBigQueryDataset()
            + "."
            + options.getBigQueryTable();

    pipeline
        .apply("ReadLines", TextIO.read().from(options.getInputFile()))
        .apply(ParDo.of(new ExtractWords()))
        .apply(ParDo.of(new Uppercase()))
        .apply(ParDo.of(new StringToRowConverter()))
        .apply(
            BigQueryIO.writeTableRows()
                .to(tableSpec)
                .withSchema(StringToRowConverter.getSchema()));

    PipelineResult result = pipeline.run();

    // ExampleUtils will try to cancel the pipeline before the program exits.
    exampleUtils.waitToFinish(result);
  }
}