/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.examples.cookbook; import com.google.api.services.bigquery.model.TableFieldSchema; import com.google.api.services.bigquery.model.TableRow; import com.google.api.services.bigquery.model.TableSchema; import java.util.ArrayList; import java.util.List; import org.apache.beam.sdk.Pipeline; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO; import org.apache.beam.sdk.metrics.Counter; import org.apache.beam.sdk.metrics.Metrics; import org.apache.beam.sdk.options.Default; import org.apache.beam.sdk.options.Description; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.options.PipelineOptionsFactory; import org.apache.beam.sdk.options.Validation; import org.apache.beam.sdk.transforms.Combine; import org.apache.beam.sdk.transforms.DoFn; import org.apache.beam.sdk.transforms.PTransform; import org.apache.beam.sdk.transforms.ParDo; import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.values.KV; import org.apache.beam.sdk.values.PCollection; /** * An example that reads the public 'Shakespeare' data, and for each word in * the dataset that is over a given length, generates a string containing the * list of play names in 
which that word appears, and saves this information * to a bigquery table. * * <p>Note: Before running this example, you must create a BigQuery dataset to contain your output * table. * * <p>To execute this pipeline locally, specify the BigQuery table for the output: * <pre>{@code * --output=YOUR_PROJECT_ID:DATASET_ID.TABLE_ID * }</pre> * * <p>To change the runner, specify: * <pre>{@code * --runner=YOUR_SELECTED_RUNNER * } * </pre> * See examples/java/README.md for instructions about how to configure different runners. * * <p>The BigQuery input table defaults to {@code publicdata:samples.shakespeare} and can * be overridden with {@code --input}. */ public class CombinePerKeyExamples { // Use the shakespeare public BigQuery sample private static final String SHAKESPEARE_TABLE = "publicdata:samples.shakespeare"; // We'll track words >= this word length across all plays in the table. private static final int MIN_WORD_LENGTH = 9; /** * Examines each row in the input table. If the word is greater than or equal to MIN_WORD_LENGTH, * outputs word, play_name. */ static class ExtractLargeWordsFn extends DoFn<TableRow, KV<String, String>> { private final Counter smallerWords = Metrics.counter(ExtractLargeWordsFn.class, "smallerWords"); @ProcessElement public void processElement(ProcessContext c){ TableRow row = c.element(); String playName = (String) row.get("corpus"); String word = (String) row.get("word"); if (word.length() >= MIN_WORD_LENGTH) { c.output(KV.of(word, playName)); } else { // Track how many smaller words we're not including. This information will be // visible in the Monitoring UI. smallerWords.inc(); } } } /** * Prepares the data for writing to BigQuery by building a TableRow object * containing a word with a string listing the plays in which it appeared. 
*/ static class FormatShakespeareOutputFn extends DoFn<KV<String, String>, TableRow> { @ProcessElement public void processElement(ProcessContext c) { TableRow row = new TableRow() .set("word", c.element().getKey()) .set("all_plays", c.element().getValue()); c.output(row); } } /** * Reads the public 'Shakespeare' data, and for each word in the dataset * over a given length, generates a string containing the list of play names * in which that word appears. It does this via the Combine.perKey * transform, with the ConcatWords combine function. * * <p>Combine.perKey is similar to a GroupByKey followed by a ParDo, but * has more restricted semantics that allow it to be executed more * efficiently. These records are then formatted as BQ table rows. */ static class PlaysForWord extends PTransform<PCollection<TableRow>, PCollection<TableRow>> { @Override public PCollection<TableRow> expand(PCollection<TableRow> rows) { // row... => <word, play_name> ... PCollection<KV<String, String>> words = rows.apply( ParDo.of(new ExtractLargeWordsFn())); // word, play_name => word, all_plays ... PCollection<KV<String, String>> wordAllPlays = words.apply(Combine.<String, String>perKey( new ConcatWords())); // <word, all_plays>... => row... PCollection<TableRow> results = wordAllPlays.apply( ParDo.of(new FormatShakespeareOutputFn())); return results; } } /** * A 'combine function' used with the Combine.perKey transform. Builds a * comma-separated string of all input items. So, it will build a string * containing all the different Shakespeare plays in which the given input * word has appeared. 
*/
  public static class ConcatWords implements SerializableFunction<Iterable<String>, String> {
    @Override
    public String apply(Iterable<String> input) {
      StringBuilder joined = new StringBuilder();
      String separator = "";
      for (String item : input) {
        // Skip empty entries so we never emit a leading, trailing, or doubled comma.
        if (item.isEmpty()) {
          continue;
        }
        joined.append(separator).append(item);
        separator = ",";
      }
      return joined.toString();
    }
  }

  /**
   * Options supported by {@link CombinePerKeyExamples}.
   *
   * <p>Inherits standard configuration options.
   */
  private interface Options extends PipelineOptions {
    @Description("Table to read from, specified as <project_id>:<dataset_id>.<table_id>")
    @Default.String(SHAKESPEARE_TABLE)
    String getInput();

    void setInput(String value);

    @Description(
        "Table to write to, specified as <project_id>:<dataset_id>.<table_id>. "
            + "The dataset_id must already exist")
    @Validation.Required
    String getOutput();

    void setOutput(String value);
  }

  public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline p = Pipeline.create(options);

    // Build the table schema for the output table.
    List<TableFieldSchema> fields = new ArrayList<>();
    fields.add(new TableFieldSchema().setName("word").setType("STRING"));
    fields.add(new TableFieldSchema().setName("all_plays").setType("STRING"));
    TableSchema schema = new TableSchema().setFields(fields);

    p.apply(BigQueryIO.read().from(options.getInput()))
        .apply(new PlaysForWord())
        .apply(
            BigQueryIO.writeTableRows()
                .to(options.getOutput())
                .withSchema(schema)
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));

    p.run().waitUntilFinish();
  }
}