/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.gcp.bigquery;

import com.google.api.services.bigquery.model.TableRow;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.Reshuffle;
import org.apache.beam.sdk.transforms.windowing.DefaultTrigger;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

/**
 * This transform takes in key-value pairs of {@link TableRow} entries and the {@link
 * TableDestination} each entry should be written to. The BigQuery streaming-write service is used
 * to stream these writes to the appropriate table.
 *
 * <p>This transform assumes that all destination tables already exist by the time it sees a write
 * for that table.
 */
public class StreamingWriteTables
    extends PTransform<PCollection<KV<TableDestination, TableRow>>, WriteResult> {
  private BigQueryServices bigQueryServices;

  public StreamingWriteTables() {
    this(new BigQueryServicesImpl());
  }

  private StreamingWriteTables(BigQueryServices bigQueryServices) {
    this.bigQueryServices = bigQueryServices;
  }

  StreamingWriteTables withTestServices(BigQueryServices bigQueryServices) {
    return new StreamingWriteTables(bigQueryServices);
  }

  @Override
  public WriteResult expand(PCollection<KV<TableDestination, TableRow>> input) {
    // A naive implementation would be to simply stream data directly to BigQuery.
    // However, this could occasionally lead to duplicated data, e.g., when
    // a VM that runs this code is restarted and the code is re-run.
    //
    // The above risk is mitigated in this implementation by relying on
    // BigQuery's built-in best-effort de-dup mechanism.
    //
    // To use this mechanism, each input TableRow is tagged with a generated
    // unique id, which is then passed to BigQuery and used to ignore duplicates.
    //
    // We create 50 keys per BigQuery table to generate output on. This is few enough that we
    // get good batching into BigQuery's insert calls, and enough that we can max out the
    // streaming insert quota.
    PCollection<KV<ShardedKey<String>, TableRowInfo>> tagged =
        input
            .apply("ShardTableWrites", ParDo.of(new GenerateShardedTable(50)))
            .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowJsonCoder.of()))
            .apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds()));

    // To prevent having the same TableRow processed more than once with regenerated
    // different unique ids, this implementation relies on "checkpointing", which is
    // achieved as a side effect of having StreamingWriteFn immediately follow a GBK,
    // performed by Reshuffle.
    tagged
        .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of()))
        .apply(Reshuffle.<ShardedKey<String>, TableRowInfo>of())
        // Put in the global window to ensure that DynamicDestinations side inputs are accessed
        // correctly.
        .apply(
            "GlobalWindow",
            Window.<KV<ShardedKey<String>, TableRowInfo>>into(new GlobalWindows())
                .triggering(DefaultTrigger.of())
                .discardingFiredPanes())
        .apply("StreamingWrite", ParDo.of(new StreamingWriteFn(bigQueryServices)));
    return WriteResult.in(input.getPipeline());
  }
}
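
// Editor's addition: an illustrative usage sketch, not part of the original Beam source. It shows
// how a pipeline might hand KV<TableDestination, TableRow> pairs to StreamingWriteTables, as
// described in the class Javadoc above. The table spec "my-project:my_dataset.my_table", the
// field names, and this helper class itself are assumptions made up for the example;
// fully-qualified names are used so the sketch does not disturb the import list above.
class StreamingWriteTablesUsageSketch {
  static WriteResult streamSingleRow(org.apache.beam.sdk.Pipeline pipeline) {
    // The destination table is assumed to exist already, as StreamingWriteTables requires.
    TableDestination destination =
        new TableDestination("my-project:my_dataset.my_table", null);
    TableRow row = new TableRow().set("user", "alice").set("score", 42);

    return pipeline
        .apply(
            "CreateKeyedRows",
            org.apache.beam.sdk.transforms.Create.of(KV.of(destination, row))
                .withCoder(KvCoder.of(TableDestinationCoder.of(), TableRowJsonCoder.of())))
        // StreamingWriteTables shards each table's rows, tags them with unique ids, reshuffles
        // as a checkpoint, and streams the rows via the BigQuery streaming-insert service.
        .apply("StreamToBigQuery", new StreamingWriteTables());
  }
}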