FakeJobService.java example

Explorer
beam-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.beam.sdk.io.gcp.bigquery;

import static com.google.common.base.Preconditions.checkArgument;

import com.google.api.client.json.JsonFactory;
import com.google.api.client.util.BackOff;
import com.google.api.client.util.BackOffUtils;
import com.google.api.client.util.Sleeper;
import com.google.api.services.bigquery.model.ErrorProto;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfiguration;
import com.google.api.services.bigquery.model.JobConfigurationExtract;
import com.google.api.services.bigquery.model.JobConfigurationLoad;
import com.google.api.services.bigquery.model.JobConfigurationQuery;
import com.google.api.services.bigquery.model.JobConfigurationTableCopy;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.JobStatistics;
import com.google.api.services.bigquery.model.JobStatistics4;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.Serializable;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ThreadLocalRandom;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.Coder.Context;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MoveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.JobService;
import org.apache.beam.sdk.util.BackOffAdapter;
import org.apache.beam.sdk.util.FluentBackoff;
import org.apache.beam.sdk.util.MimeTypes;
import org.apache.beam.sdk.util.Transport;
import org.joda.time.Duration;

/**
 * A fake implementation of BigQuery's job service.
 */
class FakeJobService implements JobService, Serializable {
  static final JsonFactory JSON_FACTORY = Transport.getJsonFactory();
  // Whenever a job is started, the first 2 calls to GetJob will report the job as pending,
  // the next 2 will return the job as running, and only then will the job report as done.
  private static final int GET_JOBS_TRANSITION_INTERVAL = 2;

  private FakeDatasetService datasetService;

  private static class JobInfo {
    Job job;
    int getJobCount = 0;

    JobInfo(Job job) {
      this.job = job;
    }
  }

  private static final com.google.common.collect.Table<String, String, JobInfo> allJobs =
      HashBasedTable.create();
  private static int numExtractJobCalls = 0;

  private static final com.google.common.collect.Table<String, String, List<ResourceId>>
      filesForLoadJobs = HashBasedTable.create();
  private static final com.google.common.collect.Table<String, String, JobStatistics>
      dryRunQueryResults = HashBasedTable.create();

  FakeJobService() {
    this.datasetService = new FakeDatasetService();
  }

  @Override
  public void startLoadJob(JobReference jobRef, JobConfigurationLoad loadConfig)
      throws InterruptedException, IOException {
    synchronized (allJobs) {
      Job job = new Job();
      job.setJobReference(jobRef);
      job.setConfiguration(new JobConfiguration().setLoad(loadConfig));
      job.setKind(" bigquery#job");
      job.setStatus(new JobStatus().setState("PENDING"));

      // Copy the files to a new location for import, as the temporary files will be deleted by
      // the caller.
      if (loadConfig.getSourceUris().size() > 0) {
        ImmutableList.Builder<ResourceId> sourceFiles = ImmutableList.builder();
        ImmutableList.Builder<ResourceId> loadFiles = ImmutableList.builder();
        for (String filename : loadConfig.getSourceUris()) {
          sourceFiles.add(FileSystems.matchNewResource(filename, false /* isDirectory */));
          loadFiles.add(FileSystems.matchNewResource(
              filename + ThreadLocalRandom.current().nextInt(), false /* isDirectory */));
        }

        FileSystems.copy(sourceFiles.build(), loadFiles.build(),
            MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
        filesForLoadJobs.put(jobRef.getProjectId(), jobRef.getJobId(), loadFiles.build());
      }

      allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job));
    }
  }

  @Override
  public void startExtractJob(JobReference jobRef, JobConfigurationExtract extractConfig)
      throws InterruptedException, IOException {
    checkArgument(extractConfig.getDestinationFormat().equals("AVRO"),
        "Only extract to AVRO is supported");
    synchronized (allJobs) {
      ++numExtractJobCalls;

      Job job = new Job();
      job.setJobReference(jobRef);
      job.setConfiguration(new JobConfiguration().setExtract(extractConfig));
      job.setKind(" bigquery#job");
      job.setStatus(new JobStatus().setState("PENDING"));
      allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job));
    }
  }

  public int getNumExtractJobCalls() {
    synchronized (allJobs) {
      return numExtractJobCalls;
    }
  }

  @Override
  public void startQueryJob(JobReference jobRef, JobConfigurationQuery query)
      throws IOException, InterruptedException {
    synchronized (allJobs) {
      Job job = new Job();
      job.setJobReference(jobRef);
      job.setConfiguration(new JobConfiguration().setQuery(query));
      job.setKind(" bigquery#job");
      job.setStatus(new JobStatus().setState("PENDING"));
      allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job));
    }
  }

  @Override
  public void startCopyJob(JobReference jobRef, JobConfigurationTableCopy copyConfig)
      throws IOException, InterruptedException {
    synchronized (allJobs) {
      Job job = new Job();
      job.setJobReference(jobRef);
      job.setConfiguration(new JobConfiguration().setCopy(copyConfig));
      job.setKind(" bigquery#job");
      job.setStatus(new JobStatus().setState("PENDING"));
      allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job));
    }
  }

  @Override
  public Job pollJob(JobReference jobRef, int maxAttempts)
      throws InterruptedException {
    BackOff backoff =
        BackOffAdapter.toGcpBackOff(
            FluentBackoff.DEFAULT
                .withMaxRetries(maxAttempts)
                .withInitialBackoff(Duration.millis(10))
                .withMaxBackoff(Duration.standardSeconds(1))
                .backoff());
    Sleeper sleeper = Sleeper.DEFAULT;
    try {
      do {
        Job job = getJob(jobRef);
        if (job != null) {
          JobStatus status = job.getStatus();
          if (status != null && status.getState() != null
              && (status.getState().equals("DONE") || status.getState().equals("FAILED"))) {
            return job;
          }
        }
      } while (BackOffUtils.next(sleeper, backoff));
    } catch (IOException e) {
      return null;
    }
    return null;
  }

  public void expectDryRunQuery(String projectId, String query, JobStatistics result) {
    synchronized (dryRunQueryResults) {
      dryRunQueryResults.put(projectId, query, result);
    }
  }

  @Override
  public JobStatistics dryRunQuery(String projectId, JobConfigurationQuery query)
      throws InterruptedException, IOException {
    synchronized (dryRunQueryResults) {
      JobStatistics result = dryRunQueryResults.get(projectId, query.getQuery());
      if (result != null) {
        return result;
      }
    }
    throw new UnsupportedOperationException();
  }

  @Override
  public Job getJob(JobReference jobRef) throws InterruptedException {
    try {
      synchronized (allJobs) {
        JobInfo job = allJobs.get(jobRef.getProjectId(), jobRef.getJobId());
        if (job == null) {
          return null;
        }
        try {
          ++job.getJobCount;
          if (job.getJobCount == GET_JOBS_TRANSITION_INTERVAL + 1) {
            job.job.getStatus().setState("RUNNING");
          } else if (job.getJobCount == 2 * GET_JOBS_TRANSITION_INTERVAL + 1) {
            job.job.setStatus(runJob(job.job));
          }
        } catch (Exception e) {
          job.job.getStatus().setState("FAILED").setErrorResult(
              new ErrorProto().setMessage(
                  String.format("Job %s failed: %s", job.job.getConfiguration(), e.toString())));
        }
        return JSON_FACTORY.fromString(JSON_FACTORY.toString(job.job), Job.class);
      }
    } catch (IOException e) {
      return null;
    }
  }

  private JobStatus runJob(Job job) throws InterruptedException, IOException {
    if (job.getConfiguration().getLoad() != null) {
      return runLoadJob(job.getJobReference(), job.getConfiguration().getLoad());
    } else if (job.getConfiguration().getCopy() != null) {
      return runCopyJob(job.getConfiguration().getCopy());
    } else if (job.getConfiguration().getExtract() != null) {
      return runExtractJob(job, job.getConfiguration().getExtract());
    } else if (job.getConfiguration().getQuery() != null) {
      return runQueryJob(job.getConfiguration().getQuery());
    }
    return new JobStatus().setState("DONE");
  }

  private boolean validateDispositions(Table table, CreateDisposition createDisposition,
                                       WriteDisposition writeDisposition)
      throws InterruptedException, IOException {
    if (table == null) {
      if (createDisposition == CreateDisposition.CREATE_NEVER) {
        return false;
      }
    } else if (writeDisposition == WriteDisposition.WRITE_TRUNCATE) {
      datasetService.deleteTable(table.getTableReference());
    } else if (writeDisposition == WriteDisposition.WRITE_EMPTY) {
      List<TableRow> allRows = datasetService.getAllRows(table.getTableReference().getProjectId(),
          table.getTableReference().getDatasetId(), table.getTableReference().getTableId());
      if (!allRows.isEmpty()) {
        return false;
      }
    }
    return true;
  }

  private JobStatus runLoadJob(JobReference jobRef, JobConfigurationLoad load)
      throws InterruptedException, IOException {
    TableReference destination = load.getDestinationTable();
    TableSchema schema = load.getSchema();
    List<ResourceId> sourceFiles = filesForLoadJobs.get(jobRef.getProjectId(), jobRef.getJobId());
    WriteDisposition writeDisposition = WriteDisposition.valueOf(load.getWriteDisposition());
    CreateDisposition createDisposition = CreateDisposition.valueOf(load.getCreateDisposition());
    checkArgument(load.getSourceFormat().equals("NEWLINE_DELIMITED_JSON"));
    Table existingTable = datasetService.getTable(destination);
    if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
      return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
    }

    datasetService.createTable(new Table().setTableReference(destination).setSchema(schema));

    List<TableRow> rows = Lists.newArrayList();
    for (ResourceId filename : sourceFiles) {
      rows.addAll(readRows(filename.toString()));
    }
    datasetService.insertAll(destination, rows, null);
    return new JobStatus().setState("DONE");
  }

  private JobStatus runCopyJob(JobConfigurationTableCopy copy)
      throws InterruptedException, IOException {
    List<TableReference> sources = copy.getSourceTables();
    TableReference destination = copy.getDestinationTable();
    WriteDisposition writeDisposition = WriteDisposition.valueOf(copy.getWriteDisposition());
    CreateDisposition createDisposition = CreateDisposition.valueOf(copy.getCreateDisposition());
    Table existingTable = datasetService.getTable(destination);
    if (!validateDispositions(existingTable, createDisposition, writeDisposition)) {
      return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto());
    }

    List<TableRow> allRows = Lists.newArrayList();
    for (TableReference source : sources) {
      allRows.addAll(datasetService.getAllRows(
          source.getProjectId(), source.getDatasetId(), source.getTableId()));
    }
    datasetService.createTable(new Table().setTableReference(destination));
    datasetService.insertAll(destination, allRows, null);
    return new JobStatus().setState("DONE");
  }

  private JobStatus runExtractJob(Job job, JobConfigurationExtract extract)
      throws InterruptedException, IOException {
    TableReference sourceTable = extract.getSourceTable();

    List<TableRow> rows = datasetService.getAllRows(
        sourceTable.getProjectId(), sourceTable.getDatasetId(), sourceTable.getTableId());
    TableSchema schema = datasetService.getTable(sourceTable).getSchema();
    List<Long> destinationFileCounts = Lists.newArrayList();
    for (String destination : extract.getDestinationUris()) {
      destinationFileCounts.add(writeRows(sourceTable.getTableId(), rows, schema, destination));
    }
    job.setStatistics(new JobStatistics().setExtract(
        new JobStatistics4().setDestinationUriFileCounts(destinationFileCounts)));
    return new JobStatus().setState("DONE");
  }

  private JobStatus runQueryJob(JobConfigurationQuery query)
      throws IOException, InterruptedException  {
    List<TableRow> rows = FakeBigQueryServices.rowsFromEncodedQuery(query.getQuery());
    datasetService.createTable(new Table().setTableReference(query.getDestinationTable()));
    datasetService.insertAll(query.getDestinationTable(), rows, null);
    return new JobStatus().setState("DONE");
  }

  private List<TableRow> readRows(String filename) throws IOException {
    Coder<TableRow> coder = TableRowJsonCoder.of();
    List<TableRow> tableRows = Lists.newArrayList();
    try (BufferedReader reader = new BufferedReader(new FileReader(filename))) {
      String line;
      while ((line = reader.readLine()) != null) {
        TableRow tableRow = coder.decode(
            new ByteArrayInputStream(line.getBytes(StandardCharsets.UTF_8)), Context.OUTER);
        tableRows.add(tableRow);
      }
    }
    return tableRows;
  }

  private long writeRows(String tableId, List<TableRow> rows, TableSchema schema,
                         String destinationPattern) throws IOException {
    Schema avroSchema = BigQueryAvroUtils.toGenericAvroSchema(tableId, schema.getFields());
    List<TableRow> rowsToWrite = Lists.newArrayList();
    int shard = 0;
    for (int i = 0; i < rows.size(); ++i) {
      rowsToWrite.add(rows.get(i));
      if (rowsToWrite.size() == 5) {
        writeRowsHelper(rowsToWrite, avroSchema, destinationPattern, shard++);
        rowsToWrite.clear();
      }
    }
    if (!rowsToWrite.isEmpty()) {
      writeRowsHelper(rowsToWrite, avroSchema, destinationPattern, shard++);
    }
    return shard;
  }

  private void writeRowsHelper(List<TableRow> rows, Schema avroSchema,
                               String destinationPattern, int shard) throws IOException {
    String filename = destinationPattern.replace("*", String.format("%012d", shard));
    try (WritableByteChannel channel = FileSystems.create(
        FileSystems.matchNewResource(filename, false /* isDirectory */), MimeTypes.BINARY);
         DataFileWriter<GenericRecord> tableRowWriter =
             new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(avroSchema))
                 .create(avroSchema, Channels.newOutputStream(channel))) {
      for (Map<String, Object> record : rows) {
        GenericRecordBuilder genericRecordBuilder = new GenericRecordBuilder(avroSchema);
        for (Map.Entry<String, Object> field : record.entrySet()) {
          genericRecordBuilder.set(field.getKey(), field.getValue());
        }
        tableRowWriter.append(genericRecordBuilder.build());
      }
    } catch (IOException e) {
      throw new IllegalStateException(
          String.format("Could not create destination for extract job %s", filename), e);
    }
  }
}