/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.io.gcp.bigquery; import static com.google.common.base.Preconditions.checkArgument; import com.google.api.client.json.JsonFactory; import com.google.api.client.util.BackOff; import com.google.api.client.util.BackOffUtils; import com.google.api.client.util.Sleeper; import com.google.api.services.bigquery.model.ErrorProto; import com.google.api.services.bigquery.model.Job; import com.google.api.services.bigquery.model.JobConfiguration; import com.google.api.services.bigquery.model.JobConfigurationExtract; import com.google.api.services.bigquery.model.JobConfigurationLoad; import com.google.api.services.bigquery.model.JobConfigurationQuery; import com.google.api.services.bigquery.model.JobConfigurationTableCopy; import com.google.api.services.bigquery.model.JobReference; import com.google.api.services.bigquery.model.JobStatistics; import com.google.api.services.bigquery.model.JobStatistics4; import com.google.api.services.bigquery.model.JobStatus; import com.google.api.services.bigquery.model.Table; import com.google.api.services.bigquery.model.TableReference; import com.google.api.services.bigquery.model.TableRow; import com.google.api.services.bigquery.model.TableSchema; import com.google.common.collect.HashBasedTable; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.FileReader; import java.io.IOException; import java.io.Serializable; import java.nio.channels.Channels; import java.nio.channels.WritableByteChannel; import java.nio.charset.StandardCharsets; import java.util.List; import java.util.Map; import java.util.concurrent.ThreadLocalRandom; import org.apache.avro.Schema; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.GenericRecordBuilder; import org.apache.beam.sdk.coders.Coder; import org.apache.beam.sdk.coders.Coder.Context; import org.apache.beam.sdk.io.FileSystems; import org.apache.beam.sdk.io.fs.MoveOptions; import org.apache.beam.sdk.io.fs.ResourceId; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition; import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.JobService; import org.apache.beam.sdk.util.BackOffAdapter; import org.apache.beam.sdk.util.FluentBackoff; import org.apache.beam.sdk.util.MimeTypes; import org.apache.beam.sdk.util.Transport; import org.joda.time.Duration; /** * A fake implementation of BigQuery's job service. */ class FakeJobService implements JobService, Serializable { static final JsonFactory JSON_FACTORY = Transport.getJsonFactory(); // Whenever a job is started, the first 2 calls to GetJob will report the job as pending, // the next 2 will return the job as running, and only then will the job report as done. private static final int GET_JOBS_TRANSITION_INTERVAL = 2; private FakeDatasetService datasetService; private static class JobInfo { Job job; int getJobCount = 0; JobInfo(Job job) { this.job = job; } } private static final com.google.common.collect.Table<String, String, JobInfo> allJobs = HashBasedTable.create(); private static int numExtractJobCalls = 0; private static final com.google.common.collect.Table<String, String, List<ResourceId>> filesForLoadJobs = HashBasedTable.create(); private static final com.google.common.collect.Table<String, String, JobStatistics> dryRunQueryResults = HashBasedTable.create(); FakeJobService() { this.datasetService = new FakeDatasetService(); } @Override public void startLoadJob(JobReference jobRef, JobConfigurationLoad loadConfig) throws InterruptedException, IOException { synchronized (allJobs) { Job job = new Job(); job.setJobReference(jobRef); job.setConfiguration(new JobConfiguration().setLoad(loadConfig)); job.setKind(" bigquery#job"); job.setStatus(new JobStatus().setState("PENDING")); // Copy the files to a new location for import, as the temporary files will be deleted by // the caller. if (loadConfig.getSourceUris().size() > 0) { ImmutableList.Builder<ResourceId> sourceFiles = ImmutableList.builder(); ImmutableList.Builder<ResourceId> loadFiles = ImmutableList.builder(); for (String filename : loadConfig.getSourceUris()) { sourceFiles.add(FileSystems.matchNewResource(filename, false /* isDirectory */)); loadFiles.add(FileSystems.matchNewResource( filename + ThreadLocalRandom.current().nextInt(), false /* isDirectory */)); } FileSystems.copy(sourceFiles.build(), loadFiles.build(), MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES); filesForLoadJobs.put(jobRef.getProjectId(), jobRef.getJobId(), loadFiles.build()); } allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job)); } } @Override public void startExtractJob(JobReference jobRef, JobConfigurationExtract extractConfig) throws InterruptedException, IOException { checkArgument(extractConfig.getDestinationFormat().equals("AVRO"), "Only extract to AVRO is supported"); synchronized (allJobs) { ++numExtractJobCalls; Job job = new Job(); job.setJobReference(jobRef); job.setConfiguration(new JobConfiguration().setExtract(extractConfig)); job.setKind(" bigquery#job"); job.setStatus(new JobStatus().setState("PENDING")); allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job)); } } public int getNumExtractJobCalls() { synchronized (allJobs) { return numExtractJobCalls; } } @Override public void startQueryJob(JobReference jobRef, JobConfigurationQuery query) throws IOException, InterruptedException { synchronized (allJobs) { Job job = new Job(); job.setJobReference(jobRef); job.setConfiguration(new JobConfiguration().setQuery(query)); job.setKind(" bigquery#job"); job.setStatus(new JobStatus().setState("PENDING")); allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job)); } } @Override public void startCopyJob(JobReference jobRef, JobConfigurationTableCopy copyConfig) throws IOException, InterruptedException { synchronized (allJobs) { Job job = new Job(); job.setJobReference(jobRef); job.setConfiguration(new JobConfiguration().setCopy(copyConfig)); job.setKind(" bigquery#job"); job.setStatus(new JobStatus().setState("PENDING")); allJobs.put(jobRef.getProjectId(), jobRef.getJobId(), new JobInfo(job)); } } @Override public Job pollJob(JobReference jobRef, int maxAttempts) throws InterruptedException { BackOff backoff = BackOffAdapter.toGcpBackOff( FluentBackoff.DEFAULT .withMaxRetries(maxAttempts) .withInitialBackoff(Duration.millis(10)) .withMaxBackoff(Duration.standardSeconds(1)) .backoff()); Sleeper sleeper = Sleeper.DEFAULT; try { do { Job job = getJob(jobRef); if (job != null) { JobStatus status = job.getStatus(); if (status != null && status.getState() != null && (status.getState().equals("DONE") || status.getState().equals("FAILED"))) { return job; } } } while (BackOffUtils.next(sleeper, backoff)); } catch (IOException e) { return null; } return null; } public void expectDryRunQuery(String projectId, String query, JobStatistics result) { synchronized (dryRunQueryResults) { dryRunQueryResults.put(projectId, query, result); } } @Override public JobStatistics dryRunQuery(String projectId, JobConfigurationQuery query) throws InterruptedException, IOException { synchronized (dryRunQueryResults) { JobStatistics result = dryRunQueryResults.get(projectId, query.getQuery()); if (result != null) { return result; } } throw new UnsupportedOperationException(); } @Override public Job getJob(JobReference jobRef) throws InterruptedException { try { synchronized (allJobs) { JobInfo job = allJobs.get(jobRef.getProjectId(), jobRef.getJobId()); if (job == null) { return null; } try { ++job.getJobCount; if (job.getJobCount == GET_JOBS_TRANSITION_INTERVAL + 1) { job.job.getStatus().setState("RUNNING"); } else if (job.getJobCount == 2 * GET_JOBS_TRANSITION_INTERVAL + 1) { job.job.setStatus(runJob(job.job)); } } catch (Exception e) { job.job.getStatus().setState("FAILED").setErrorResult( new ErrorProto().setMessage( String.format("Job %s failed: %s", job.job.getConfiguration(), e.toString()))); } return JSON_FACTORY.fromString(JSON_FACTORY.toString(job.job), Job.class); } } catch (IOException e) { return null; } } private JobStatus runJob(Job job) throws InterruptedException, IOException { if (job.getConfiguration().getLoad() != null) { return runLoadJob(job.getJobReference(), job.getConfiguration().getLoad()); } else if (job.getConfiguration().getCopy() != null) { return runCopyJob(job.getConfiguration().getCopy()); } else if (job.getConfiguration().getExtract() != null) { return runExtractJob(job, job.getConfiguration().getExtract()); } else if (job.getConfiguration().getQuery() != null) { return runQueryJob(job.getConfiguration().getQuery()); } return new JobStatus().setState("DONE"); } private boolean validateDispositions(Table table, CreateDisposition createDisposition, WriteDisposition writeDisposition) throws InterruptedException, IOException { if (table == null) { if (createDisposition == CreateDisposition.CREATE_NEVER) { return false; } } else if (writeDisposition == WriteDisposition.WRITE_TRUNCATE) { datasetService.deleteTable(table.getTableReference()); } else if (writeDisposition == WriteDisposition.WRITE_EMPTY) { List<TableRow> allRows = datasetService.getAllRows(table.getTableReference().getProjectId(), table.getTableReference().getDatasetId(), table.getTableReference().getTableId()); if (!allRows.isEmpty()) { return false; } } return true; } private JobStatus runLoadJob(JobReference jobRef, JobConfigurationLoad load) throws InterruptedException, IOException { TableReference destination = load.getDestinationTable(); TableSchema schema = load.getSchema(); List<ResourceId> sourceFiles = filesForLoadJobs.get(jobRef.getProjectId(), jobRef.getJobId()); WriteDisposition writeDisposition = WriteDisposition.valueOf(load.getWriteDisposition()); CreateDisposition createDisposition = CreateDisposition.valueOf(load.getCreateDisposition()); checkArgument(load.getSourceFormat().equals("NEWLINE_DELIMITED_JSON")); Table existingTable = datasetService.getTable(destination); if (!validateDispositions(existingTable, createDisposition, writeDisposition)) { return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto()); } datasetService.createTable(new Table().setTableReference(destination).setSchema(schema)); List<TableRow> rows = Lists.newArrayList(); for (ResourceId filename : sourceFiles) { rows.addAll(readRows(filename.toString())); } datasetService.insertAll(destination, rows, null); return new JobStatus().setState("DONE"); } private JobStatus runCopyJob(JobConfigurationTableCopy copy) throws InterruptedException, IOException { List<TableReference> sources = copy.getSourceTables(); TableReference destination = copy.getDestinationTable(); WriteDisposition writeDisposition = WriteDisposition.valueOf(copy.getWriteDisposition()); CreateDisposition createDisposition = CreateDisposition.valueOf(copy.getCreateDisposition()); Table existingTable = datasetService.getTable(destination); if (!validateDispositions(existingTable, createDisposition, writeDisposition)) { return new JobStatus().setState("FAILED").setErrorResult(new ErrorProto()); } List<TableRow> allRows = Lists.newArrayList(); for (TableReference source : sources) { allRows.addAll(datasetService.getAllRows( source.getProjectId(), source.getDatasetId(), source.getTableId())); } datasetService.createTable(new Table().setTableReference(destination)); datasetService.insertAll(destination, allRows, null); return new JobStatus().setState("DONE"); } private JobStatus runExtractJob(Job job, JobConfigurationExtract extract) throws InterruptedException, IOException { TableReference sourceTable = extract.getSourceTable(); List<TableRow> rows = datasetService.getAllRows( sourceTable.getProjectId(), sourceTable.getDatasetId(), sourceTable.getTableId()); TableSchema schema = datasetService.getTable(sourceTable).getSchema(); List<Long> destinationFileCounts = Lists.newArrayList(); for (String destination : extract.getDestinationUris()) { destinationFileCounts.add(writeRows(sourceTable.getTableId(), rows, schema, destination)); } job.setStatistics(new JobStatistics().setExtract( new JobStatistics4().setDestinationUriFileCounts(destinationFileCounts))); return new JobStatus().setState("DONE"); } private JobStatus runQueryJob(JobConfigurationQuery query) throws IOException, InterruptedException { List<TableRow> rows = FakeBigQueryServices.rowsFromEncodedQuery(query.getQuery()); datasetService.createTable(new Table().setTableReference(query.getDestinationTable())); datasetService.insertAll(query.getDestinationTable(), rows, null); return new JobStatus().setState("DONE"); } private List<TableRow> readRows(String filename) throws IOException { Coder<TableRow> coder = TableRowJsonCoder.of(); List<TableRow> tableRows = Lists.newArrayList(); try (BufferedReader reader = new BufferedReader(new FileReader(filename))) { String line; while ((line = reader.readLine()) != null) { TableRow tableRow = coder.decode( new ByteArrayInputStream(line.getBytes(StandardCharsets.UTF_8)), Context.OUTER); tableRows.add(tableRow); } } return tableRows; } private long writeRows(String tableId, List<TableRow> rows, TableSchema schema, String destinationPattern) throws IOException { Schema avroSchema = BigQueryAvroUtils.toGenericAvroSchema(tableId, schema.getFields()); List<TableRow> rowsToWrite = Lists.newArrayList(); int shard = 0; for (int i = 0; i < rows.size(); ++i) { rowsToWrite.add(rows.get(i)); if (rowsToWrite.size() == 5) { writeRowsHelper(rowsToWrite, avroSchema, destinationPattern, shard++); rowsToWrite.clear(); } } if (!rowsToWrite.isEmpty()) { writeRowsHelper(rowsToWrite, avroSchema, destinationPattern, shard++); } return shard; } private void writeRowsHelper(List<TableRow> rows, Schema avroSchema, String destinationPattern, int shard) throws IOException { String filename = destinationPattern.replace("*", String.format("%012d", shard)); try (WritableByteChannel channel = FileSystems.create( FileSystems.matchNewResource(filename, false /* isDirectory */), MimeTypes.BINARY); DataFileWriter<GenericRecord> tableRowWriter = new DataFileWriter<>(new GenericDatumWriter<GenericRecord>(avroSchema)) .create(avroSchema, Channels.newOutputStream(channel))) { for (Map<String, Object> record : rows) { GenericRecordBuilder genericRecordBuilder = new GenericRecordBuilder(avroSchema); for (Map.Entry<String, Object> field : record.entrySet()) { genericRecordBuilder.set(field.getKey(), field.getValue()); } tableRowWriter.append(genericRecordBuilder.build()); } } catch (IOException e) { throw new IllegalStateException( String.format("Could not create destination for extract job %s", filename), e); } } }