BigQueryTableRowIterator.java example

Explorer
beam-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.gcp.bigquery;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

import com.google.api.client.googleapis.services.AbstractGoogleClientRequest;
import com.google.api.client.util.BackOff;
import com.google.api.client.util.BackOffUtils;
import com.google.api.client.util.ClassInfo;
import com.google.api.client.util.Data;
import com.google.api.client.util.Sleeper;
import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.model.Dataset;
import com.google.api.services.bigquery.model.DatasetReference;
import com.google.api.services.bigquery.model.ErrorProto;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfiguration;
import com.google.api.services.bigquery.model.JobConfigurationQuery;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableCell;
import com.google.api.services.bigquery.model.TableDataList;
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.Uninterruptibles;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;

import org.apache.beam.sdk.util.BackOffAdapter;
import org.apache.beam.sdk.util.FluentBackoff;
import org.joda.time.Duration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Iterates over all rows in a table.
 */
class BigQueryTableRowIterator implements AutoCloseable {
  private static final Logger LOG = LoggerFactory.getLogger(BigQueryTableRowIterator.class);

  @Nullable private TableReference ref;
  @Nullable private final String projectId;
  @Nullable private TableSchema schema;
  @Nullable private JobConfigurationQuery queryConfig;
  private final Bigquery client;
  private String pageToken;
  private Iterator<TableRow> iteratorOverCurrentBatch;
  private TableRow current;
  // Set true when the final page is seen from the service.
  private boolean lastPage = false;

  // The maximum number of times a BigQuery request will be retried
  private static final int MAX_RETRIES = 3;
  // Initial wait time for the backoff implementation
  private static final Duration INITIAL_BACKOFF_TIME = Duration.standardSeconds(1);

  // After sending a query to BQ service we will be polling the BQ service to check the status with
  // following interval to check the status of query execution job
  private static final Duration QUERY_COMPLETION_POLL_TIME = Duration.standardSeconds(1);

  // Temporary dataset used to store query results.
  private String temporaryDatasetId = null;
  // Temporary table used to store query results.
  private String temporaryTableId = null;

  private BigQueryTableRowIterator(
      @Nullable TableReference ref, @Nullable JobConfigurationQuery queryConfig,
      @Nullable String projectId, Bigquery client) {
    this.ref = ref;
    this.queryConfig = queryConfig;
    this.projectId = projectId;
    this.client = checkNotNull(client, "client");
  }

  /**
   * Constructs a {@code BigQueryTableRowIterator} that reads from the specified table.
   */
  static BigQueryTableRowIterator fromTable(TableReference ref, Bigquery client) {
    checkNotNull(ref, "ref");
    checkNotNull(client, "client");
    return new BigQueryTableRowIterator(ref, /* queryConfig */null, ref.getProjectId(), client);
  }

  /**
   * Constructs a {@code BigQueryTableRowIterator} that reads from the results of executing the
   * specified query in the specified project.
   */
  static BigQueryTableRowIterator fromQuery(
      JobConfigurationQuery queryConfig, String projectId, Bigquery client) {
    checkNotNull(queryConfig, "queryConfig");
    checkNotNull(projectId, "projectId");
    checkNotNull(client, "client");
    return new BigQueryTableRowIterator(/* ref */null, queryConfig, projectId, client);
  }

  /**
   * Opens the table for read.
   * @throws IOException on failure
   */
  void open() throws IOException, InterruptedException {
    if (queryConfig != null) {
      ref = executeQueryAndWaitForCompletion();
    }
    // Get table schema.
    schema = getTable(ref).getSchema();
  }

  boolean advance() throws IOException, InterruptedException {
    while (true) {
      if (iteratorOverCurrentBatch != null && iteratorOverCurrentBatch.hasNext()) {
        // Embed schema information into the raw row, so that values have an
        // associated key.
        current = getTypedTableRow(schema.getFields(), iteratorOverCurrentBatch.next());
        return true;
      }
      if (lastPage) {
        return false;
      }

      Bigquery.Tabledata.List list =
          client.tabledata().list(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());
      if (pageToken != null) {
        list.setPageToken(pageToken);
      }

      TableDataList result = executeWithBackOff(
          list,
          String.format(
              "Error reading from BigQuery table %s of dataset %s.",
              ref.getTableId(), ref.getDatasetId()));

      pageToken = result.getPageToken();
      iteratorOverCurrentBatch =
          result.getRows() != null
              ? result.getRows().iterator()
              : Collections.<TableRow>emptyIterator();

      // The server may return a page token indefinitely on a zero-length table.
      if (pageToken == null || result.getTotalRows() != null && result.getTotalRows() == 0) {
        lastPage = true;
      }
    }
  }

  TableRow getCurrent() {
    if (current == null) {
      throw new NoSuchElementException();
    }
    return current;
  }

  /**
   * Adjusts a field returned from the BigQuery API to match what we will receive when running
   * BigQuery's export-to-GCS and parallel read, which is the efficient parallel implementation
   * used for batch jobs executed on the Beam Runners that perform initial splitting.
   *
   * <p>The following is the relationship between BigQuery schema and Java types:
   *
   * <ul>
   *   <li>Nulls are {@code null}.
   *   <li>Repeated fields are {@code List} of objects.
   *   <li>Record columns are {@link TableRow} objects.
   *   <li>{@code BOOLEAN} columns are JSON booleans, hence Java {@code Boolean} objects.
   *   <li>{@code FLOAT} columns are JSON floats, hence Java {@code Double} objects.
   *   <li>{@code TIMESTAMP} columns are {@code String} objects that are of the format
   *       {@code yyyy-MM-dd HH:mm:ss[.SSSSSS] UTC}, where the {@code .SSSSSS} has no trailing
   *       zeros and can be 1 to 6 digits long.
   *   <li>Every other atomic type is a {@code String}.
   * </ul>
   *
   * <p>Note that integers are encoded as strings to match BigQuery's exported JSON format.
   *
   * <p>Finally, values are stored in the {@link TableRow} as {"field name": value} pairs
   * and are not accessible through the {@link TableRow#getF} function.
   */
  @Nullable private Object getTypedCellValue(TableFieldSchema fieldSchema, Object v) {
    if (Data.isNull(v)) {
      return null;
    }

    if (Objects.equals(fieldSchema.getMode(), "REPEATED")) {
      TableFieldSchema elementSchema = fieldSchema.clone().setMode("REQUIRED");
      @SuppressWarnings("unchecked")
      List<Map<String, Object>> rawCells = (List<Map<String, Object>>) v;
      ImmutableList.Builder<Object> values = ImmutableList.builder();
      for (Map<String, Object> element : rawCells) {
        values.add(getTypedCellValue(elementSchema, element.get("v")));
      }
      return values.build();
    }

    if (fieldSchema.getType().equals("RECORD")) {
      @SuppressWarnings("unchecked")
      Map<String, Object> typedV = (Map<String, Object>) v;
      return getTypedTableRow(fieldSchema.getFields(), typedV);
    }

    if (fieldSchema.getType().equals("FLOAT")) {
      return Double.parseDouble((String) v);
    }

    if (fieldSchema.getType().equals("BOOLEAN")) {
      return Boolean.parseBoolean((String) v);
    }

    if (fieldSchema.getType().equals("TIMESTAMP")) {
      return BigQueryAvroUtils.formatTimestamp((String) v);
    }

    // Returns the original value for:
    // 1. String, 2. base64 encoded BYTES, 3. DATE, DATETIME, TIME strings.
    return v;
  }

  /**
   * A list of the field names that cannot be used in BigQuery tables processed by Apache Beam,
   * because they are reserved keywords in {@link TableRow}.
   */
  // TODO: This limitation is unfortunate. We need to give users a way to use BigQueryIO that does
  // not indirect through our broken use of {@link TableRow}.
  //     See discussion: https://github.com/GoogleCloudPlatform/DataflowJavaSDK/pull/41
  private static final Collection<String> RESERVED_FIELD_NAMES =
      ClassInfo.of(TableRow.class).getNames();

  /**
   * Converts a row returned from the BigQuery JSON API as a {@code Map<String, Object>} into a
   * Java {@link TableRow} with nested {@link TableCell TableCells}. The {@code Object} values in
   * the cells are converted to Java types according to the provided field schemas.
   *
   * <p>See {@link #getTypedCellValue(TableFieldSchema, Object)} for details on how BigQuery
   * types are mapped to Java types.
   */
  private TableRow getTypedTableRow(List<TableFieldSchema> fields, Map<String, Object> rawRow) {
    // If rawRow is a TableRow, use it. If not, create a new one.
    TableRow row;
    List<? extends Map<String, Object>> cells;
    if (rawRow instanceof TableRow) {
      // Since rawRow is a TableRow it already has TableCell objects in setF. We do not need to do
      // any type conversion, but extract the cells for cell-wise processing below.
      row = (TableRow) rawRow;
      cells = row.getF();
      // Clear the cells from the row, so that row.getF() will return null. This matches the
      // behavior of rows produced by the BigQuery export API used on the service.
      row.setF(null);
    } else {
      row = new TableRow();

      // Since rawRow is a Map<String, Object> we use Map.get("f") instead of TableRow.getF() to
      // get its cells. Similarly, when rawCell is a Map<String, Object> instead of a TableCell,
      // we will use Map.get("v") instead of TableCell.getV() get its value.
      @SuppressWarnings("unchecked")
      List<? extends Map<String, Object>> rawCells =
          (List<? extends Map<String, Object>>) rawRow.get("f");
      cells = rawCells;
    }

    checkState(cells.size() == fields.size(),
        "Expected that the row has the same number of cells %s as fields in the schema %s",
        cells.size(), fields.size());

    // Loop through all the fields in the row, normalizing their types with the TableFieldSchema
    // and storing the normalized values by field name in the Map<String, Object> that
    // underlies the TableRow.
    Iterator<? extends Map<String, Object>> cellIt = cells.iterator();
    Iterator<TableFieldSchema> fieldIt = fields.iterator();
    while (cellIt.hasNext()) {
      Map<String, Object> cell = cellIt.next();
      TableFieldSchema fieldSchema = fieldIt.next();

      // Convert the object in this cell to the Java type corresponding to its type in the schema.
      Object convertedValue = getTypedCellValue(fieldSchema, cell.get("v"));

      String fieldName = fieldSchema.getName();
      checkArgument(!RESERVED_FIELD_NAMES.contains(fieldName),
          "BigQueryIO does not support records with columns named %s", fieldName);

      if (convertedValue == null) {
        // BigQuery does not include null values when the export operation (to JSON) is used.
        // To match that behavior, BigQueryTableRowiterator, and the DirectRunner,
        // intentionally omits columns with null values.
        continue;
      }

      row.set(fieldName, convertedValue);
    }
    return row;
  }

  // Get the BiqQuery table.
  private Table getTable(TableReference ref) throws IOException, InterruptedException {
    Bigquery.Tables.Get get =
        client.tables().get(ref.getProjectId(), ref.getDatasetId(), ref.getTableId());

    return executeWithBackOff(
        get,
        String.format(
            "Error opening BigQuery table %s of dataset %s.",
            ref.getTableId(),
            ref.getDatasetId()));
  }

  // Create a new BigQuery dataset
  private void createDataset(String datasetId, @Nullable String location)
      throws IOException, InterruptedException {
    Dataset dataset = new Dataset();
    DatasetReference reference = new DatasetReference();
    reference.setProjectId(projectId);
    reference.setDatasetId(datasetId);
    dataset.setDatasetReference(reference);
    if (location != null) {
      dataset.setLocation(location);
    }

    executeWithBackOff(
        client.datasets().insert(projectId, dataset),
        String.format(
            "Error when trying to create the temporary dataset %s in project %s.",
            datasetId, projectId));
  }

  // Delete the given table that is available in the given dataset.
  private void deleteTable(String datasetId, String tableId)
      throws IOException, InterruptedException {
    executeWithBackOff(
        client.tables().delete(projectId, datasetId, tableId),
        String.format(
            "Error when trying to delete the temporary table %s in dataset %s of project %s. "
            + "Manual deletion may be required.",
            tableId, datasetId, projectId));
  }

  // Delete the given dataset. This will fail if the given dataset has any tables.
  private void deleteDataset(String datasetId) throws IOException, InterruptedException {
    executeWithBackOff(
        client.datasets().delete(projectId, datasetId),
        String.format(
            "Error when trying to delete the temporary dataset %s in project %s. "
            + "Manual deletion may be required.",
            datasetId, projectId));
  }

  /**
   * Executes the specified query and returns a reference to the temporary BigQuery table created
   * to hold the results.
   *
   * @throws IOException if the query fails.
   */
  private TableReference executeQueryAndWaitForCompletion()
      throws IOException, InterruptedException {
    checkState(projectId != null, "Unable to execute a query without a configured project id");
    checkState(queryConfig != null, "Unable to execute a query without a configured query");
    // Dry run query to get source table location
    Job dryRunJob = new Job()
        .setConfiguration(new JobConfiguration()
            .setQuery(queryConfig)
            .setDryRun(true));
    JobStatistics jobStats = executeWithBackOff(
        client.jobs().insert(projectId, dryRunJob),
        String.format("Error when trying to dry run query %s.",
            queryConfig.toPrettyString())).getStatistics();

    // Let BigQuery to pick default location if the query does not read any tables.
    String location = null;
    @Nullable List<TableReference> tables = jobStats.getQuery().getReferencedTables();
    if (tables != null && !tables.isEmpty()) {
      Table table = getTable(tables.get(0));
      location = table.getLocation();
    }

    // Create a temporary dataset to store results.
    // Starting dataset name with an "_" so that it is hidden.
    Random rnd = new Random(System.currentTimeMillis());
    temporaryDatasetId = "_beam_temporary_dataset_" + rnd.nextInt(1000000);
    temporaryTableId = "beam_temporary_table_" + rnd.nextInt(1000000);

    createDataset(temporaryDatasetId, location);
    Job job = new Job();
    JobConfiguration config = new JobConfiguration();
    config.setQuery(queryConfig);
    job.setConfiguration(config);

    TableReference destinationTable = new TableReference();
    destinationTable.setProjectId(projectId);
    destinationTable.setDatasetId(temporaryDatasetId);
    destinationTable.setTableId(temporaryTableId);
    queryConfig.setDestinationTable(destinationTable);
    queryConfig.setAllowLargeResults(true);

    Job queryJob = executeWithBackOff(
        client.jobs().insert(projectId, job),
        String.format("Error when trying to execute the job for query %s.",
            queryConfig.toPrettyString()));
    JobReference jobId = queryJob.getJobReference();

    while (true) {
      Job pollJob = executeWithBackOff(
          client.jobs().get(projectId, jobId.getJobId()),
          String.format("Error when trying to get status of the job for query %s.",
              queryConfig.toPrettyString()));
      JobStatus status = pollJob.getStatus();
      if (status.getState().equals("DONE")) {
        // Job is DONE, but did not necessarily succeed.
        ErrorProto error = status.getErrorResult();
        if (error == null) {
          return pollJob.getConfiguration().getQuery().getDestinationTable();
        } else {
          // There will be no temporary table to delete, so null out the reference.
          temporaryTableId = null;
          throw new IOException(String.format(
              "Executing query %s failed: %s", queryConfig.toPrettyString(), error.getMessage()));
        }
      }
      Uninterruptibles.sleepUninterruptibly(
          QUERY_COMPLETION_POLL_TIME.getMillis(), TimeUnit.MILLISECONDS);
    }
  }

  // Execute a BQ request with exponential backoff and return the result.
  // client - BQ request to be executed
  // error - Formatted message to log if when a request fails. Takes exception message as a
  // formatter parameter.
  private static <T> T executeWithBackOff(AbstractGoogleClientRequest<T> client, String error)
      throws IOException, InterruptedException {
    Sleeper sleeper = Sleeper.DEFAULT;
    BackOff backOff =
        BackOffAdapter.toGcpBackOff(
            FluentBackoff.DEFAULT
                .withMaxRetries(MAX_RETRIES).withInitialBackoff(INITIAL_BACKOFF_TIME).backoff());

    T result = null;
    while (true) {
      try {
        result = client.execute();
        break;
      } catch (IOException e) {
        LOG.error("{}", error, e);
        if (!BackOffUtils.next(sleeper, backOff)) {
          String errorMessage = String.format(
              "%s Failing to execute job after %d attempts.", error, MAX_RETRIES + 1);
          LOG.error("{}", errorMessage, e);
          throw new IOException(errorMessage, e);
        }
      }
    }
    return result;
  }

  @Override
  public void close() {
    // Prevent any further requests.
    lastPage = true;

    try {
      // Deleting temporary table and dataset that gets generated when executing a query.
      if (temporaryDatasetId != null) {
        if (temporaryTableId != null) {
          deleteTable(temporaryDatasetId, temporaryTableId);
        }
        deleteDataset(temporaryDatasetId);
      }
    } catch (IOException | InterruptedException e) {
      if (e instanceof InterruptedException) {
        Thread.currentThread().interrupt();
      }
      throw new RuntimeException(e);
    }

  }
}