/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.gcp.bigquery;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createJobIdToken;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.getExtractJobId;
import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.resolveTempLocation;

import com.google.api.client.json.JsonFactory;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfigurationQuery;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.auto.value.AutoValue;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Predicates;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import org.apache.beam.sdk.PipelineRunner;
import org.apache.beam.sdk.coders.CannotProvideCoderException;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.VoidCoder;
import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
import org.apache.beam.sdk.io.BoundedSource;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MoveOptions;
import org.apache.beam.sdk.io.fs.ResolveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonTableRefToTableRef;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.TableRefToJson;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.TableSchemaToJsonSchema;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.TableSpecToTableRef;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.JobService;
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinationsHelpers.ConstantSchemaDestinations;
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinationsHelpers.SchemaFromViewDestinations;
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinationsHelpers.TableFunctionDestinations;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.util.Transport;
import org.apache.beam.sdk.util.gcsfs.GcsPath;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PBegin;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollection.IsBounded;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.ValueInSingleWindow;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * {@link PTransform}s for reading and writing
 * <a href="https://developers.google.com/bigquery/">BigQuery</a> tables.
 *
 * <h3>Table References</h3>
 *
 * <p>A fully-qualified BigQuery table name consists of three components:
 * <ul>
 *   <li>{@code projectId}: the Cloud project id (defaults to
 *       {@link GcpOptions#getProject()}).
 *   <li>{@code datasetId}: the BigQuery dataset id, unique within a project.
 *   <li>{@code tableId}: a table id, unique within a dataset.
 * </ul>
 *
 * <p>BigQuery table references are stored as a {@link TableReference}, which comes from the
 * <a href="https://cloud.google.com/bigquery/client-libraries">BigQuery Java Client API</a>.
 * Tables can be referred to as Strings, with or without the {@code projectId}. A helper function
 * is provided ({@link BigQueryHelpers#parseTableSpec(String)}) that parses the following string
 * forms into a {@link TableReference}:
 *
 * <ul>
 *   <li>[{@code project_id}]:[{@code dataset_id}].[{@code table_id}]
 *   <li>[{@code dataset_id}].[{@code table_id}]
 * </ul>
 *
 * <h3>Reading</h3>
 *
 * <p>To read from a BigQuery table, apply a {@link BigQueryIO.Read} transformation.
 * This produces a {@link PCollection} of {@link TableRow TableRows} as output:
 *
 * <pre>{@code
 * PCollection<TableRow> weatherData = pipeline.apply(
 *     BigQueryIO.read().from("clouddataflow-readonly:samples.weather_stations"));
 * }</pre>
 *
 * <p>See {@link TableRow} for more information on the {@link TableRow} object.
 *
 * <p>Users may provide a query to read from rather than reading all of a BigQuery table. If
 * specified, the result obtained by executing the specified query will be used as the data of the
 * input transform.
 *
 * <pre>{@code
 * PCollection<TableRow> meanTemperatureData = pipeline.apply(
 *     BigQueryIO.read().fromQuery("SELECT year, mean_temp FROM [samples.weather_stations]"));
 * }</pre>
 *
 * <p>When creating a BigQuery input transform, users should provide either a query or a table.
 * Pipeline construction will fail with a validation error if neither or both are specified.
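 *
 * <p>By default, queries run in BigQuery's legacy SQL dialect. A sketch of opting into Standard
 * SQL instead via {@link BigQueryIO.Read#usingStandardSql}; the query itself is illustrative:
 *
 * <pre>{@code
 * PCollection<TableRow> standardSqlData = pipeline.apply(
 *     BigQueryIO.read()
 *         .fromQuery("SELECT year, mean_temp FROM `samples.weather_stations`")
 *         .usingStandardSql());
 * }</pre>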
 *
 * <h3>Writing</h3>
 *
 * <p>To write to a BigQuery table, apply a {@link BigQueryIO.Write} transformation.
 * This consumes either a {@link PCollection} of {@link TableRow TableRows} as input when using
 * {@link BigQueryIO#writeTableRows()} or of a user-defined type when using
 * {@link BigQueryIO#write()}. When using a user-defined type, a function must be provided to
 * turn this type into a {@link TableRow} using
 * {@link BigQueryIO.Write#withFormatFunction(SerializableFunction)}.
 *
 * <pre>{@code
 * PCollection<TableRow> quotes = ...
 *
 * List<TableFieldSchema> fields = new ArrayList<>();
 * fields.add(new TableFieldSchema().setName("source").setType("STRING"));
 * fields.add(new TableFieldSchema().setName("quote").setType("STRING"));
 * TableSchema schema = new TableSchema().setFields(fields);
 *
 * quotes.apply(BigQueryIO.writeTableRows()
 *     .to("my-project:output.output_table")
 *     .withSchema(schema)
 *     .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
 * }</pre>
 *
 * <p>See {@link BigQueryIO.Write} for details on how to specify if a write should append to an
 * existing table, replace the table, or verify that the table is empty. Note that the dataset
 * being written to must already exist. Unbounded PCollections can only be written using
 * {@link Write.WriteDisposition#WRITE_EMPTY} or {@link Write.WriteDisposition#WRITE_APPEND}.
 *
 * <h3>Sharding BigQuery output tables</h3>
 *
 * <p>A common use case is to dynamically generate BigQuery table names based on the current
 * window or the current value. To support this, {@link BigQueryIO.Write#to(SerializableFunction)}
 * accepts a function mapping the current element to a table destination. For example, here's code
 * that outputs daily tables to BigQuery:
 *
 * <pre>{@code
 * PCollection<TableRow> quotes = ...
 * quotes.apply(Window.<TableRow>into(CalendarWindows.days(1)))
 *       .apply(BigQueryIO.writeTableRows()
 *           .withSchema(schema)
 *           .to(new SerializableFunction<ValueInSingleWindow<TableRow>, TableDestination>() {
 *             public TableDestination apply(ValueInSingleWindow<TableRow> value) {
 *               // The cast below is safe because CalendarWindows.days(1) produces IntervalWindows.
 *               String dayString = DateTimeFormat.forPattern("yyyy_MM_dd")
 *                   .withZone(DateTimeZone.UTC)
 *                   .print(((IntervalWindow) value.getWindow()).start());
 *               return new TableDestination(
 *                   "my-project:output.output_table_" + dayString, null);
 *             }
 *           }));
 * }</pre>
 *
 * <p>Note that this also allows the table to be a function of the element as well as the current
 * pane, in the case of triggered windows. In this case it might be convenient to call {@link
 * BigQueryIO#write()} directly instead of using the {@link BigQueryIO#writeTableRows()} helper.
 * This will allow the mapping function to access the element of the user-defined type. In this
 * case, a formatting function must be specified using {@link BigQueryIO.Write#withFormatFunction}
 * to convert each element into a {@link TableRow} object.
 *
 * <p>Per-table schemas can also be provided using {@link BigQueryIO.Write#withSchemaFromView}.
 * This allows the schemas to be calculated based on a previous pipeline stage or statically via a
 * {@link org.apache.beam.sdk.transforms.Create} transform. This method expects to receive a
 * map-valued {@link PCollectionView}, mapping table specifications (project:dataset.table-id) to
 * JSON-formatted {@link TableSchema} objects. All destination tables must be present in this map,
 * or the pipeline will fail to create tables. Care should be taken if the map value is based on a
 * triggered aggregation over an unbounded {@link PCollection}; the side input will contain the
 * entire history of all table schemas ever generated, which might blow up memory usage. This
 * method can also be useful when writing to a single table, as it allows a previous stage to
 * calculate the schema (possibly based on the full collection of records being written to
 * BigQuery).
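 *
 * <p>For instance, a minimal sketch of {@link BigQueryIO.Write#withSchemaFromView}, assuming
 * {@code jsonSchema} holds a JSON-serialized {@link TableSchema} string and the table name is
 * illustrative:
 *
 * <pre>{@code
 * PCollectionView<Map<String, String>> schemaView = pipeline
 *     .apply(Create.of(KV.of("my-project:my_dataset.my_table", jsonSchema)))
 *     .apply(View.<String, String>asMap());
 *
 * quotes.apply(BigQueryIO.writeTableRows()
 *     .to("my-project:my_dataset.my_table")
 *     .withSchemaFromView(schemaView));
 * }</pre>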
 *
 * <p>For the most general form of dynamic table destinations and schemas, look at
 * {@link BigQueryIO.Write#to(DynamicDestinations)}.
 *
 * <h3>Permissions</h3>
 *
 * <p>Permission requirements depend on the {@link PipelineRunner} that is used to execute the
 * pipeline. Please refer to the documentation of corresponding {@link PipelineRunner}s for more
 * details.
 *
 * <p>Please see <a href="https://cloud.google.com/bigquery/access-control">BigQuery Access
 * Control</a> for security and permission related information specific to BigQuery.
 */
public class BigQueryIO {
  private static final Logger LOG = LoggerFactory.getLogger(BigQueryIO.class);

  /**
   * Singleton instance of the JSON factory used to read and write JSON formatted rows.
   */
  static final JsonFactory JSON_FACTORY = Transport.getJsonFactory();

  /**
   * Project IDs must contain 6-63 lowercase letters, digits, or dashes.
   * IDs must start with a letter and may not end with a dash.
   * This regex isn't exact - this allows for patterns that would be rejected by
   * the service, but this is sufficient for basic parsing of table references.
   */
  private static final String PROJECT_ID_REGEXP = "[a-z][-a-z0-9:.]{4,61}[a-z0-9]";

  /**
   * Regular expression that matches Dataset IDs.
   */
  private static final String DATASET_REGEXP = "[-\\w.]{1,1024}";

  /**
   * Regular expression that matches Table IDs.
   */
  private static final String TABLE_REGEXP = "[-\\w$@]{1,1024}";

  /**
   * Matches table specifications in the form {@code "[project_id]:[dataset_id].[table_id]"} or
   * {@code "[dataset_id].[table_id]"}.
   */
  private static final String DATASET_TABLE_REGEXP =
      String.format(
          "((?<PROJECT>%s):)?(?<DATASET>%s)\\.(?<TABLE>%s)",
          PROJECT_ID_REGEXP, DATASET_REGEXP, TABLE_REGEXP);

  static final Pattern TABLE_SPEC = Pattern.compile(DATASET_TABLE_REGEXP);

  /**
   * A formatting function that maps a TableRow to itself. This allows sending a
   * {@code PCollection<TableRow>} directly to BigQueryIO.Write.
   */
  static final SerializableFunction<TableRow, TableRow> IDENTITY_FORMATTER =
      new SerializableFunction<TableRow, TableRow>() {
        @Override
        public TableRow apply(TableRow input) {
          return input;
        }
      };

  /**
   * A {@link PTransform} that reads from a BigQuery table and returns a
   * {@link PCollection} of {@link TableRow TableRows} containing each of the rows of the table.
   *
   * <p>Each {@link TableRow} contains values indexed by column name. Here is a
   * sample processing function that processes a "line" column from rows:
   *
   * <pre>{@code
   * static class ExtractWordsFn extends DoFn<TableRow, String> {
   *   @ProcessElement
   *   public void processElement(ProcessContext c) {
   *     // Get the "line" field of the TableRow object, split it into words, and emit them.
   *     TableRow row = c.element();
   *     String[] words = row.get("line").toString().split("[^a-zA-Z']+");
   *     for (String word : words) {
   *       if (!word.isEmpty()) {
   *         c.output(word);
   *       }
   *     }
   *   }
   * }
   * }</pre>
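   *
   * <p>A sketch of wiring the function above into a pipeline; the table name is illustrative:
   *
   * <pre>{@code
   * PCollection<String> words = pipeline
   *     .apply(BigQueryIO.read().from("my-project:my_dataset.my_table"))
   *     .apply(ParDo.of(new ExtractWordsFn()));
   * }</pre>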
   */
  public static Read read() {
    return new AutoValue_BigQueryIO_Read.Builder()
        .setValidate(true)
        .setBigQueryServices(new BigQueryServicesImpl())
        .build();
  }

  /** Implementation of {@link #read}. */
  @AutoValue
  public abstract static class Read extends PTransform<PBegin, PCollection<TableRow>> {
    @Nullable abstract ValueProvider<String> getJsonTableRef();
    @Nullable abstract ValueProvider<String> getQuery();
    abstract boolean getValidate();
    @Nullable abstract Boolean getFlattenResults();
    @Nullable abstract Boolean getUseLegacySql();
    abstract BigQueryServices getBigQueryServices();

    abstract Builder toBuilder();

    @AutoValue.Builder
    abstract static class Builder {
      abstract Builder setJsonTableRef(ValueProvider<String> jsonTableRef);
      abstract Builder setQuery(ValueProvider<String> query);
      abstract Builder setValidate(boolean validate);
      abstract Builder setFlattenResults(Boolean flattenResults);
      abstract Builder setUseLegacySql(Boolean useLegacySql);
      abstract Builder setBigQueryServices(BigQueryServices bigQueryServices);

      abstract Read build();
    }

    /** Ensures that methods of the from() / fromQuery() family are called at most once. */
    private void ensureFromNotCalledYet() {
      checkState(
          getJsonTableRef() == null && getQuery() == null, "from() or fromQuery() already called");
    }

    /**
     * Reads a BigQuery table specified as {@code "[project_id]:[dataset_id].[table_id]"} or
     * {@code "[dataset_id].[table_id]"} for tables within the current project.
     */
    public Read from(String tableSpec) {
      return from(StaticValueProvider.of(tableSpec));
    }

    /** Same as {@code from(String)}, but with a {@link ValueProvider}. */
    public Read from(ValueProvider<String> tableSpec) {
      ensureFromNotCalledYet();
      return toBuilder()
          .setJsonTableRef(
              NestedValueProvider.of(
                  NestedValueProvider.of(tableSpec, new TableSpecToTableRef()),
                  new TableRefToJson()))
          .build();
    }

    /**
     * Reads results received after executing the given query.
     *
     * <p>By default, the query results will be flattened -- see "flattenResults" in the <a
     * href="https://cloud.google.com/bigquery/docs/reference/v2/jobs">Jobs documentation</a> for
     * more information. To disable flattening, use
     * {@link BigQueryIO.Read#withoutResultFlattening}.
     *
     * <p>By default, the query will use BigQuery's legacy SQL dialect. To use the BigQuery
     * Standard SQL dialect, use {@link BigQueryIO.Read#usingStandardSql}.
     */
    public Read fromQuery(String query) {
      return fromQuery(StaticValueProvider.of(query));
    }

    /**
     * Same as {@code fromQuery(String)}, but with a {@link ValueProvider}.
     */
    public Read fromQuery(ValueProvider<String> query) {
      ensureFromNotCalledYet();
      return toBuilder().setQuery(query).setFlattenResults(true).setUseLegacySql(true).build();
    }

    /**
     * Read from table specified by a {@link TableReference}.
     */
    public Read from(TableReference table) {
      return from(StaticValueProvider.of(BigQueryHelpers.toTableSpec(table)));
    }

    private static final String QUERY_VALIDATION_FAILURE_ERROR =
        "Validation of query \"%1$s\" failed. If the query depends on an earlier stage of the"
            + " pipeline, this validation can be disabled using #withoutValidation.";

    /**
     * Disable validation that the table exists or the query succeeds prior to pipeline
     * submission. Basic validation (such as ensuring that a query or table is specified) still
     * occurs.
     */
    public Read withoutValidation() {
      return toBuilder().setValidate(false).build();
    }

    /**
     * Disable <a href="https://cloud.google.com/bigquery/docs/reference/v2/jobs">flattening of
     * query results</a>.
     *
     * <p>Only valid when a query is used ({@link #fromQuery}). Setting this option when reading
     * from a table will cause an error during validation.
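     *
     * <p>A sketch of disabling flattening on a query read; the query itself is illustrative:
     *
     * <pre>{@code
     * PCollection<TableRow> unflattened = pipeline.apply(
     *     BigQueryIO.read()
     *         .fromQuery("SELECT name, children.name FROM [my_dataset.people]")
     *         .withoutResultFlattening());
     * }</pre>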
     */
    public Read withoutResultFlattening() {
      return toBuilder().setFlattenResults(false).build();
    }

    /**
     * Enables BigQuery's Standard SQL dialect when reading from a query.
     *
     * <p>Only valid when a query is used ({@link #fromQuery}). Setting this option when reading
     * from a table will cause an error during validation.
     */
    public Read usingStandardSql() {
      return toBuilder().setUseLegacySql(false).build();
    }

    @VisibleForTesting
    Read withTestServices(BigQueryServices testServices) {
      return toBuilder().setBigQueryServices(testServices).build();
    }

    @Override
    public void validate(PipelineOptions options) {
      // Even if existence validation is disabled, we need to make sure that the BigQueryIO
      // read is properly specified.
      BigQueryOptions bqOptions = options.as(BigQueryOptions.class);

      String tempLocation = bqOptions.getTempLocation();
      checkArgument(
          !Strings.isNullOrEmpty(tempLocation),
          "BigQueryIO.Read needs a GCS temp location to store temp files.");
      if (getBigQueryServices() == null) {
        try {
          GcsPath.fromUri(tempLocation);
        } catch (IllegalArgumentException e) {
          throw new IllegalArgumentException(
              String.format(
                  "BigQuery temp location expected a valid 'gs://' path, but was given '%s'",
                  tempLocation),
              e);
        }
      }

      ValueProvider<TableReference> table = getTableProvider();

      checkState(
          table == null || getQuery() == null,
          "Invalid BigQueryIO.Read: table reference and query may not both be set");
      checkState(
          table != null || getQuery() != null,
          "Invalid BigQueryIO.Read: one of table reference and query must be set");

      if (table != null) {
        checkState(
            getFlattenResults() == null,
            "Invalid BigQueryIO.Read: Specifies a table with a result flattening"
                + " preference, which only applies to queries");
        checkState(
            getUseLegacySql() == null,
            "Invalid BigQueryIO.Read: Specifies a table with a SQL dialect"
                + " preference, which only applies to queries");
        if (table.isAccessible() && Strings.isNullOrEmpty(table.get().getProjectId())) {
          LOG.info(
              "Project of {} not set. The value of {}.getProject() at execution time will be used.",
              TableReference.class.getSimpleName(),
              BigQueryOptions.class.getSimpleName());
        }
      } else /* query != null */ {
        checkState(
            getFlattenResults() != null, "flattenResults should not be null if query is set");
        checkState(getUseLegacySql() != null, "useLegacySql should not be null if query is set");
      }

      // Note that a table or query check can fail if the table or dataset are created by
      // earlier stages of the pipeline or if a query depends on earlier stages of a pipeline.
      // For these cases the withoutValidation method can be used to disable the check.
      if (getValidate() && table != null && table.isAccessible()
          && table.get().getProjectId() != null) {
        checkState(table.isAccessible(), "Cannot call validate if table is dynamically set.");
        // Check for source table presence for early failure notification.
        DatasetService datasetService = getBigQueryServices().getDatasetService(bqOptions);
        BigQueryHelpers.verifyDatasetPresence(datasetService, table.get());
        BigQueryHelpers.verifyTablePresence(datasetService, table.get());
      } else if (getValidate() && getQuery() != null) {
        checkState(getQuery().isAccessible(), "Cannot call validate if query is dynamically set.");
        JobService jobService = getBigQueryServices().getJobService(bqOptions);
        try {
          jobService.dryRunQuery(
              bqOptions.getProject(),
              new JobConfigurationQuery()
                  .setQuery(getQuery().get())
                  .setFlattenResults(getFlattenResults())
                  .setUseLegacySql(getUseLegacySql()));
        } catch (Exception e) {
          throw new IllegalArgumentException(
              String.format(QUERY_VALIDATION_FAILURE_ERROR, getQuery().get()), e);
        }
      }
    }

    @Override
    public PCollection<TableRow> expand(PBegin input) {
      final String stepUuid = BigQueryHelpers.randomUUIDString();
      BoundedSource<TableRow> source;

      if (getQuery() != null
          && (!getQuery().isAccessible() || !Strings.isNullOrEmpty(getQuery().get()))) {
        source =
            BigQueryQuerySource.create(
                stepUuid, getQuery(), getFlattenResults(), getUseLegacySql(),
                getBigQueryServices());
      } else {
        source = BigQueryTableSource.create(stepUuid, getTableProvider(), getBigQueryServices());
      }

      PassThroughThenCleanup.CleanupOperation cleanupOperation =
          new PassThroughThenCleanup.CleanupOperation() {
            @Override
            void cleanup(PipelineOptions options) throws Exception {
              BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
              final String extractDestinationDir =
                  resolveTempLocation(
                      bqOptions.getTempLocation(), "BigQueryExtractTemp", stepUuid);

              JobReference jobRef =
                  new JobReference()
                      .setProjectId(bqOptions.getProject())
                      .setJobId(
                          getExtractJobId(createJobIdToken(bqOptions.getJobName(), stepUuid)));

              Job extractJob = getBigQueryServices().getJobService(bqOptions).getJob(jobRef);

              if (extractJob != null) {
                List<ResourceId> extractFiles =
                    getExtractFilePaths(extractDestinationDir, extractJob);
                if (extractFiles != null && !extractFiles.isEmpty()) {
                  FileSystems.delete(
                      extractFiles, MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
                }
              }
            }
          };

      return input.getPipeline()
          .apply(org.apache.beam.sdk.io.Read.from(source))
          .setCoder(getDefaultOutputCoder())
          .apply(new PassThroughThenCleanup<TableRow>(cleanupOperation));
    }

    @Override
    protected Coder<TableRow> getDefaultOutputCoder() {
      return TableRowJsonCoder.of();
    }

    @Override
    public void populateDisplayData(DisplayData.Builder builder) {
      super.populateDisplayData(builder);
      builder
          .addIfNotNull(
              DisplayData.item("table", BigQueryHelpers.displayTable(getTableProvider()))
                  .withLabel("Table"))
          .addIfNotNull(DisplayData.item("query", getQuery()).withLabel("Query"))
          .addIfNotNull(
              DisplayData.item("flattenResults", getFlattenResults())
                  .withLabel("Flatten Query Results"))
          .addIfNotNull(
              DisplayData.item("useLegacySql", getUseLegacySql())
                  .withLabel("Use Legacy SQL Dialect"))
          .addIfNotDefault(
              DisplayData.item("validation", getValidate()).withLabel("Validation Enabled"),
              true);
    }

    /**
     * Returns the table to read, or {@code null} if reading from a query instead.
     */
    @Nullable
    public ValueProvider<TableReference> getTableProvider() {
      return getJsonTableRef() == null
          ? null
          : NestedValueProvider.of(getJsonTableRef(), new JsonTableRefToTableRef());
    }

    /** Returns the table to read, or {@code null} if reading from a query instead. */
    @Nullable
    public TableReference getTable() {
      ValueProvider<TableReference> provider = getTableProvider();
      return provider == null ? null : provider.get();
    }
  }
  static String getExtractDestinationUri(String extractDestinationDir) {
    return String.format("%s/%s", extractDestinationDir, "*.avro");
  }

  static List<ResourceId> getExtractFilePaths(String extractDestinationDir, Job extractJob)
      throws IOException {
    JobStatistics jobStats = extractJob.getStatistics();
    List<Long> counts = jobStats.getExtract().getDestinationUriFileCounts();
    if (counts.size() != 1) {
      String errorMessage =
          (counts.size() == 0
              ? "No destination uri file count received."
              : String.format(
                  "More than one destination uri file count received. First two are %s, %s",
                  counts.get(0), counts.get(1)));
      throw new RuntimeException(errorMessage);
    }
    long filesCount = counts.get(0);

    ImmutableList.Builder<ResourceId> paths = ImmutableList.builder();
    ResourceId extractDestinationDirResourceId =
        FileSystems.matchNewResource(extractDestinationDir, true /* isDirectory */);
    for (long i = 0; i < filesCount; ++i) {
      ResourceId filePath =
          extractDestinationDirResourceId.resolve(
              String.format("%012d%s", i, ".avro"),
              ResolveOptions.StandardResolveOptions.RESOLVE_FILE);
      paths.add(filePath);
    }
    return paths.build();
  }

  /////////////////////////////////////////////////////////////////////////////

  /**
   * A {@link PTransform} that writes a {@link PCollection} to a BigQuery table. A formatting
   * function must be provided to convert each input element into a {@link TableRow} using
   * {@link Write#withFormatFunction(SerializableFunction)}.
   *
   * <p>In BigQuery, each table has an enclosing dataset. The dataset being written must already
   * exist.
   *
   * <p>By default, tables will be created if they do not exist, which corresponds to a
   * {@link Write.CreateDisposition#CREATE_IF_NEEDED} disposition that matches the default of
   * BigQuery's Jobs API. A schema must be provided (via {@link Write#withSchema(TableSchema)}),
   * or else the transform may fail at runtime with an {@link IllegalArgumentException}.
   *
   * <p>By default, writes require an empty table, which corresponds to
   * a {@link Write.WriteDisposition#WRITE_EMPTY} disposition that matches the default of
   * BigQuery's Jobs API.
   *
   * <p>Here is a sample transform that produces TableRow values containing "word" and "count"
   * columns:
   *
   * <pre>{@code
   * static class FormatCountsFn extends DoFn<KV<String, Long>, TableRow> {
   *   @ProcessElement
   *   public void processElement(ProcessContext c) {
   *     TableRow row = new TableRow()
   *         .set("word", c.element().getKey())
   *         .set("count", c.element().getValue().intValue());
   *     c.output(row);
   *   }
   * }
   * }</pre>
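   *
   * <p>A sketch of wiring the function above into a write; the table name and {@code schema} are
   * illustrative:
   *
   * <pre>{@code
   * wordCounts
   *     .apply(ParDo.of(new FormatCountsFn()))
   *     .apply(BigQueryIO.writeTableRows()
   *         .to("my-project:my_dataset.word_counts")
   *         .withSchema(schema));
   * }</pre>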
   */
  public static <T> Write<T> write() {
    return new AutoValue_BigQueryIO_Write.Builder<T>()
        .setValidate(true)
        .setBigQueryServices(new BigQueryServicesImpl())
        .setCreateDisposition(Write.CreateDisposition.CREATE_IF_NEEDED)
        .setWriteDisposition(Write.WriteDisposition.WRITE_EMPTY)
        .build();
  }

  /**
   * A {@link PTransform} that writes a {@link PCollection} containing {@link TableRow TableRows}
   * to a BigQuery table.
   */
  public static Write<TableRow> writeTableRows() {
    return BigQueryIO.<TableRow>write().withFormatFunction(IDENTITY_FORMATTER);
  }

  /** Implementation of {@link #write}. */
  @AutoValue
  public abstract static class Write<T> extends PTransform<PCollection<T>, WriteResult> {
    @Nullable abstract ValueProvider<String> getJsonTableRef();
    @Nullable
    abstract SerializableFunction<ValueInSingleWindow<T>, TableDestination> getTableFunction();
    @Nullable abstract SerializableFunction<T, TableRow> getFormatFunction();
    @Nullable abstract DynamicDestinations<T, ?> getDynamicDestinations();
    @Nullable abstract PCollectionView<Map<String, String>> getSchemaFromView();
    @Nullable abstract ValueProvider<String> getJsonSchema();
    abstract CreateDisposition getCreateDisposition();
    abstract WriteDisposition getWriteDisposition();
    /** Table description. Default is empty. */
    @Nullable abstract String getTableDescription();
    /** An option to indicate if table validation is desired. Default is true. */
    abstract boolean getValidate();
    abstract BigQueryServices getBigQueryServices();
    @Nullable abstract Integer getMaxFilesPerBundle();
    @Nullable abstract Long getMaxFileSize();

    abstract Builder<T> toBuilder();

    @AutoValue.Builder
    abstract static class Builder<T> {
      abstract Builder<T> setJsonTableRef(ValueProvider<String> jsonTableRef);
      abstract Builder<T> setTableFunction(
          SerializableFunction<ValueInSingleWindow<T>, TableDestination> tableFunction);
      abstract Builder<T> setFormatFunction(SerializableFunction<T, TableRow> formatFunction);
      abstract Builder<T> setDynamicDestinations(DynamicDestinations<T, ?> dynamicDestinations);
      abstract Builder<T> setSchemaFromView(PCollectionView<Map<String, String>> view);
      abstract Builder<T> setJsonSchema(ValueProvider<String> jsonSchema);
      abstract Builder<T> setCreateDisposition(CreateDisposition createDisposition);
      abstract Builder<T> setWriteDisposition(WriteDisposition writeDisposition);
      abstract Builder<T> setTableDescription(String tableDescription);
      abstract Builder<T> setValidate(boolean validate);
      abstract Builder<T> setBigQueryServices(BigQueryServices bigQueryServices);
      abstract Builder<T> setMaxFilesPerBundle(Integer maxFilesPerBundle);
      abstract Builder<T> setMaxFileSize(Long maxFileSize);

      abstract Write<T> build();
    }

    /**
     * An enumeration type for the BigQuery create disposition strings.
     *
     * @see <a
     *     href="https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.query.createDisposition">
     *     <code>configuration.query.createDisposition</code> in the BigQuery Jobs API</a>
     */
    public enum CreateDisposition {
      /**
       * Specifies that tables should not be created.
       *
       * <p>If the output table does not exist, the write fails.
       */
      CREATE_NEVER,

      /**
       * Specifies that tables should be created if needed. This is the default behavior.
       *
       * <p>Requires that a table schema is provided via {@link BigQueryIO.Write#withSchema}.
       * This precondition is checked before starting a job. The schema is not required to match
       * an existing table's schema.
       *
       * <p>When this transformation is executed, if the output table does not exist, the table
       * is created from the provided schema. Note that even if the table exists, it may be
       * recreated if necessary when paired with a {@link WriteDisposition#WRITE_TRUNCATE}.
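       *
       * <p>A sketch of pairing this disposition with an explicit schema; the table name and
       * {@code schema} are illustrative:
       *
       * <pre>{@code
       * rows.apply(BigQueryIO.writeTableRows()
       *     .to("my-project:my_dataset.new_table")
       *     .withSchema(schema)
       *     .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));
       * }</pre>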
       */
      CREATE_IF_NEEDED
    }

    /**
     * An enumeration type for the BigQuery write disposition strings.
     *
     * @see <a
     *     href="https://cloud.google.com/bigquery/docs/reference/v2/jobs#configuration.query.writeDisposition">
     *     <code>configuration.query.writeDisposition</code> in the BigQuery Jobs API</a>
     */
    public enum WriteDisposition {
      /**
       * Specifies that write should replace a table.
       *
       * <p>The replacement may occur in multiple steps - for instance by first removing the
       * existing table, then creating a replacement, then filling it in. This is not an atomic
       * operation, and external programs may see the table in any of these intermediate steps.
       */
      WRITE_TRUNCATE,

      /** Specifies that rows may be appended to an existing table. */
      WRITE_APPEND,

      /**
       * Specifies that the output table must be empty. This is the default behavior.
       *
       * <p>If the output table is not empty, the write fails at runtime.
       *
       * <p>This check may occur long before data is written, and does not guarantee exclusive
       * access to the table. If two programs are run concurrently, each specifying the same
       * output table and a {@link WriteDisposition} of {@link WriteDisposition#WRITE_EMPTY}, it
       * is possible for both to succeed.
       */
      WRITE_EMPTY
    }

    /**
     * Writes to the given table, specified in the format described in
     * {@link BigQueryHelpers#parseTableSpec}.
     */
    public Write<T> to(String tableSpec) {
      return to(StaticValueProvider.of(tableSpec));
    }

    /** Writes to the given table, specified as a {@link TableReference}. */
    public Write<T> to(TableReference table) {
      return to(StaticValueProvider.of(BigQueryHelpers.toTableSpec(table)));
    }

    /** Same as {@link #to(String)}, but with a {@link ValueProvider}. */
    public Write<T> to(ValueProvider<String> tableSpec) {
      return toBuilder()
          .setJsonTableRef(
              NestedValueProvider.of(
                  NestedValueProvider.of(tableSpec, new TableSpecToTableRef()),
                  new TableRefToJson()))
          .build();
    }

    /**
     * Writes to the table specified by the given table function. The table is a function of
     * {@link ValueInSingleWindow}, so can be determined by the value or by the window.
     */
    public Write<T> to(
        SerializableFunction<ValueInSingleWindow<T>, TableDestination> tableFunction) {
      return toBuilder().setTableFunction(tableFunction).build();
    }

    /**
     * Writes to the table and schema specified by the {@link DynamicDestinations} object.
     */
    public Write<T> to(DynamicDestinations<T, ?> dynamicDestinations) {
      return toBuilder().setDynamicDestinations(dynamicDestinations).build();
    }

    /**
     * Formats the user's type into a {@link TableRow} to be written to BigQuery.
     */
    public Write<T> withFormatFunction(SerializableFunction<T, TableRow> formatFunction) {
      return toBuilder().setFormatFunction(formatFunction).build();
    }

    /**
     * Uses the specified schema for rows to be written.
     *
     * <p>The schema is <i>required</i> only if writing to a table that does not already exist,
     * and {@link CreateDisposition} is set to {@link CreateDisposition#CREATE_IF_NEEDED}.
     */
    public Write<T> withSchema(TableSchema schema) {
      return withJsonSchema(StaticValueProvider.of(BigQueryHelpers.toJsonString(schema)));
    }

    /** Same as {@link #withSchema(TableSchema)} but using a deferred {@link ValueProvider}. */
    public Write<T> withSchema(ValueProvider<TableSchema> schema) {
      return withJsonSchema(NestedValueProvider.of(schema, new TableSchemaToJsonSchema()));
    }

    /**
     * Similar to {@link #withSchema(TableSchema)} but takes in a JSON-serialized
     * {@link TableSchema}.
     */
    public Write<T> withJsonSchema(String jsonSchema) {
      return withJsonSchema(StaticValueProvider.of(jsonSchema));
    }

    /**
     * Same as {@link #withJsonSchema(String)} but using a deferred {@link ValueProvider}.
     */
    public Write<T> withJsonSchema(ValueProvider<String> jsonSchema) {
      return toBuilder().setJsonSchema(jsonSchema).build();
    }

    /**
     * Allows the schemas for each table to be computed within the pipeline itself.
     *
     * <p>The input is a map-valued {@link PCollectionView} mapping string tablespecs to
     * JSON-formatted {@link TableSchema}s. Tablespecs must be in the same format as taken by
     * {@link #to(String)}.
     */
    public Write<T> withSchemaFromView(PCollectionView<Map<String, String>> view) {
      return toBuilder().setSchemaFromView(view).build();
    }

    /** Specifies whether the table should be created if it does not exist. */
    public Write<T> withCreateDisposition(CreateDisposition createDisposition) {
      return toBuilder().setCreateDisposition(createDisposition).build();
    }

    /** Specifies what to do with existing data in the table, in case the table already exists. */
    public Write<T> withWriteDisposition(WriteDisposition writeDisposition) {
      return toBuilder().setWriteDisposition(writeDisposition).build();
    }

    /** Specifies the table description. */
    public Write<T> withTableDescription(String tableDescription) {
      return toBuilder().setTableDescription(tableDescription).build();
    }

    /** Disables BigQuery table validation. */
    public Write<T> withoutValidation() {
      return toBuilder().setValidate(false).build();
    }

    @VisibleForTesting
    Write<T> withTestServices(BigQueryServices testServices) {
      return toBuilder().setBigQueryServices(testServices).build();
    }

    @VisibleForTesting
    Write<T> withMaxFilesPerBundle(int maxFilesPerBundle) {
      return toBuilder().setMaxFilesPerBundle(maxFilesPerBundle).build();
    }

    @VisibleForTesting
    Write<T> withMaxFileSize(long maxFileSize) {
      return toBuilder().setMaxFileSize(maxFileSize).build();
    }

    @Override
    public void validate(PipelineOptions pipelineOptions) {
      BigQueryOptions options = pipelineOptions.as(BigQueryOptions.class);

      // The user specified a table.
      if (getJsonTableRef() != null && getJsonTableRef().isAccessible() && getValidate()) {
        TableReference table = getTableWithDefaultProject(options).get();
        DatasetService datasetService = getBigQueryServices().getDatasetService(options);
        // Check for destination table presence and emptiness for early failure notification.
        // Note that a presence check can fail when the table or dataset is created by an earlier
        // stage of the pipeline. For these cases the #withoutValidation method can be used to
        // disable the check.
        BigQueryHelpers.verifyDatasetPresence(datasetService, table);
        if (getCreateDisposition() == BigQueryIO.Write.CreateDisposition.CREATE_NEVER) {
          BigQueryHelpers.verifyTablePresence(datasetService, table);
        }
        if (getWriteDisposition() == BigQueryIO.Write.WriteDisposition.WRITE_EMPTY) {
          BigQueryHelpers.verifyTableNotExistOrEmpty(datasetService, table);
        }
      }
    }

    @Override
    public WriteResult expand(PCollection<T> input) {
      // We must have a destination to write to!
      checkState(
          getTableFunction() != null
              || getJsonTableRef() != null
              || getDynamicDestinations() != null,
          "must set the table reference of a BigQueryIO.Write transform");

      checkArgument(
          getFormatFunction() != null,
          "A function must be provided to convert type into a TableRow. "
              + "Use BigQueryIO.Write.withFormatFunction to provide a formatting function.");

      // Require a schema if creating one or more tables.
      checkArgument(
          getCreateDisposition() != CreateDisposition.CREATE_IF_NEEDED
              || getJsonSchema() != null
              || getDynamicDestinations() != null
              || getSchemaFromView() != null,
          "CreateDisposition is CREATE_IF_NEEDED, however no schema was provided.");

      List<?> allToArgs =
          Lists.newArrayList(getJsonTableRef(), getTableFunction(), getDynamicDestinations());
      checkArgument(
          1 == Iterables.size(Iterables.filter(allToArgs, Predicates.notNull())),
          "Exactly one of jsonTableRef, tableFunction, or dynamicDestinations must be set");

      List<?> allSchemaArgs =
          Lists.newArrayList(getJsonSchema(), getSchemaFromView(), getDynamicDestinations());
      checkArgument(
          2 > Iterables.size(Iterables.filter(allSchemaArgs, Predicates.notNull())),
          "No more than one of jsonSchema, schemaFromView, or dynamicDestinations may be set");

      DynamicDestinations<T, ?> dynamicDestinations = getDynamicDestinations();
      if (dynamicDestinations == null) {
        if (getJsonTableRef() != null) {
          dynamicDestinations =
              DynamicDestinationsHelpers.ConstantTableDestinations.fromJsonTableRef(
                  getJsonTableRef(), getTableDescription());
        } else if (getTableFunction() != null) {
          dynamicDestinations = new TableFunctionDestinations(getTableFunction());
        }
        // Wrap with a DynamicDestinations class that will provide a schema. There might be no
        // schema provided if the create disposition is CREATE_NEVER.
        if (getJsonSchema() != null) {
          dynamicDestinations =
              new ConstantSchemaDestinations(dynamicDestinations, getJsonSchema());
        } else if (getSchemaFromView() != null) {
          dynamicDestinations =
              new SchemaFromViewDestinations(dynamicDestinations, getSchemaFromView());
        }
      }
      return expandTyped(input, dynamicDestinations);
    }

    private <DestinationT> WriteResult expandTyped(
        PCollection<T> input, DynamicDestinations<T, DestinationT> dynamicDestinations) {
      Coder<DestinationT> destinationCoder = null;
      try {
        destinationCoder =
            dynamicDestinations.getDestinationCoderWithDefault(
                input.getPipeline().getCoderRegistry());
      } catch (CannotProvideCoderException e) {
        throw new RuntimeException(e);
      }

      PCollection<KV<DestinationT, TableRow>> rowsWithDestination =
          input
              .apply("PrepareWrite", new PrepareWrite<>(dynamicDestinations, getFormatFunction()))
              .setCoder(KvCoder.of(destinationCoder, TableRowJsonCoder.of()));

      // When writing an Unbounded PCollection, we use StreamingInserts and BigQuery's streaming
      // import API.
      if (input.isBounded() == IsBounded.UNBOUNDED) {
        checkArgument(
            getWriteDisposition() != WriteDisposition.WRITE_TRUNCATE,
            "WriteDisposition.WRITE_TRUNCATE is not supported for an unbounded PCollection.");
        StreamingInserts<DestinationT> streamingInserts =
            new StreamingInserts<>(getCreateDisposition(), dynamicDestinations);
        streamingInserts.setTestServices(getBigQueryServices());
        return rowsWithDestination.apply(streamingInserts);
      } else {
        BatchLoads<DestinationT> batchLoads =
            new BatchLoads<>(
                getWriteDisposition(),
                getCreateDisposition(),
                getJsonTableRef() != null,
                dynamicDestinations,
                destinationCoder);
        batchLoads.setTestServices(getBigQueryServices());
        if (getMaxFilesPerBundle() != null) {
          batchLoads.setMaxNumWritersPerBundle(getMaxFilesPerBundle());
        }
        if (getMaxFileSize() != null) {
          batchLoads.setMaxFileSize(getMaxFileSize());
        }
        return rowsWithDestination.apply(batchLoads);
      }
    }

    @Override
    protected Coder<Void> getDefaultOutputCoder() {
      return VoidCoder.of();
    }

    @Override
    public void populateDisplayData(DisplayData.Builder builder) {
      super.populateDisplayData(builder);

      builder.addIfNotNull(
          DisplayData.item("table", getJsonTableRef()).withLabel("Table Reference"));
      if (getJsonSchema() != null) {
        builder.addIfNotNull(
            DisplayData.item("schema", getJsonSchema()).withLabel("Table Schema"));
      } else {
        builder.add(
            DisplayData.item("schema", "Custom Schema Function").withLabel("Table Schema"));
      }

      if (getTableFunction() != null) {
        builder.add(
            DisplayData.item("tableFn", getTableFunction().getClass())
                .withLabel("Table Reference Function"));
      }

      builder
          .add(
              DisplayData.item("createDisposition", getCreateDisposition().toString())
                  .withLabel("Table CreateDisposition"))
          .add(
              DisplayData.item("writeDisposition", getWriteDisposition().toString())
                  .withLabel("Table WriteDisposition"))
          .addIfNotDefault(
              DisplayData.item("validation", getValidate()).withLabel("Validation Enabled"),
              true)
          .addIfNotNull(
              DisplayData.item("tableDescription", getTableDescription())
                  .withLabel("Table Description"));
    }

    /**
     * Returns the table to write, or {@code null} if writing with {@code tableFunction}.
     *
     * <p>If the table's project is not specified, use the executing project.
     */
    @Nullable
    ValueProvider<TableReference> getTableWithDefaultProject(BigQueryOptions bqOptions) {
      ValueProvider<TableReference> table = getTable();
      if (table == null) {
        return table;
      }
      if (!table.isAccessible()) {
        LOG.info(
            "Using a dynamic value for table input. This must contain a project"
                + " in the table reference: {}",
            table);
        return table;
      }
      if (Strings.isNullOrEmpty(table.get().getProjectId())) {
        // If user does not specify a project we assume the table to be located in
        // the default project.
        TableReference tableRef = table.get();
        tableRef.setProjectId(bqOptions.getProject());
        return NestedValueProvider.of(
            StaticValueProvider.of(BigQueryHelpers.toJsonString(tableRef)),
            new JsonTableRefToTableRef());
      }
      return table;
    }

    /** Returns the table reference, or {@code null}. */
    @Nullable
    public ValueProvider<TableReference> getTable() {
      return getJsonTableRef() == null
          ? null
          : NestedValueProvider.of(getJsonTableRef(), new JsonTableRefToTableRef());
    }
  }

  /**
   * Clear the cached map of created tables. Used for testing.
   */
  @VisibleForTesting
  static void clearCreatedTables() {
    CreateTables.clearCreatedTables();
  }

  /////////////////////////////////////////////////////////////////////////////

  /** Disallow construction of utility class. */
  private BigQueryIO() {}
}