/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io.gcp.bigquery;

import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobConfigurationLoad;
import com.google.api.services.bigquery.model.JobReference;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.MoveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.Status;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.JobService;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollectionView;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Writes partitions to BigQuery tables.
*
 * <p>Each input element holds the list of files for one partition of a table. These files are
 * loaded into a temporary table (or into the final table if there is only one partition). The
 * output is a {@link KV} mapping the final table to the temporary table containing that
 * partition's data, serialized as a JSON {@link TableReference}.
*
 * <p>In the case where all of the data fits into a single load job, this transform loads the
 * data directly into the final table, skipping temporary tables. In this case, the output
 * {@link KV} maps the final table to itself.
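 *
 * <p>A rough wiring sketch, for orientation only; the view and service names below are
 * illustrative, not part of this class:
 *
 * <pre>{@code
 * PCollection<KV<ShardedKey<DestinationT>, List<String>>> partitions = ...;
 * PCollection<KV<TableDestination, String>> tempTables =
 *     partitions.apply(
 *         ParDo.of(
 *                 new WriteTables<>(
 *                     false /* singlePartition */,
 *                     bigQueryServices,
 *                     jobIdTokenView,
 *                     schemasView,
 *                     WriteDisposition.WRITE_EMPTY,
 *                     CreateDisposition.CREATE_IF_NEEDED,
 *                     dynamicDestinations))
 *             .withSideInputs(jobIdTokenView, schemasView));
 * }</pre>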
*/
class WriteTables<DestinationT>
extends DoFn<KV<ShardedKey<DestinationT>, List<String>>, KV<TableDestination, String>> {
  private static final Logger LOG = LoggerFactory.getLogger(WriteTables.class);

private final boolean singlePartition;
private final BigQueryServices bqServices;
private final PCollectionView<String> jobIdToken;
private final PCollectionView<Map<DestinationT, String>> schemasView;
private final WriteDisposition writeDisposition;
private final CreateDisposition createDisposition;
  private final DynamicDestinations<?, DestinationT> dynamicDestinations;

public WriteTables(
boolean singlePartition,
BigQueryServices bqServices,
PCollectionView<String> jobIdToken,
PCollectionView<Map<DestinationT, String>> schemasView,
WriteDisposition writeDisposition,
CreateDisposition createDisposition,
DynamicDestinations<?, DestinationT> dynamicDestinations) {
this.singlePartition = singlePartition;
this.bqServices = bqServices;
this.jobIdToken = jobIdToken;
this.schemasView = schemasView;
this.writeDisposition = writeDisposition;
this.createDisposition = createDisposition;
this.dynamicDestinations = dynamicDestinations;
  }

@ProcessElement
public void processElement(ProcessContext c) throws Exception {
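    // Make side inputs available to dynamicDestinations before calling getTable() below.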
dynamicDestinations.setSideInputAccessorFromProcessContext(c);
DestinationT destination = c.element().getKey().getKey();
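    // The schema for each destination is precomputed and passed in as a JSON string via a side
    // input.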
TableSchema tableSchema =
BigQueryHelpers.fromJsonString(
c.sideInput(schemasView).get(destination), TableSchema.class);
TableDestination tableDestination = dynamicDestinations.getTable(destination);
TableReference tableReference = tableDestination.getTableReference();
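    // If the destination did not specify a project, fall back to the pipeline's default project.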
if (Strings.isNullOrEmpty(tableReference.getProjectId())) {
tableReference.setProjectId(
c.getPipelineOptions().as(BigQueryOptions.class).getProject());
tableDestination = new TableDestination(
tableReference, tableDestination.getTableDescription());
}
Integer partition = c.element().getKey().getShardNumber();
List<String> partitionFiles = Lists.newArrayList(c.element().getValue());
String jobIdPrefix =
BigQueryHelpers.createJobId(c.sideInput(jobIdToken), tableDestination, partition);
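    // With multiple partitions, each partition is loaded into its own temporary table, named
    // after the deterministic load-job id; a downstream step copies the temporary tables into
    // the final table.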
if (!singlePartition) {
tableReference.setTableId(jobIdPrefix);
}
load(
bqServices.getJobService(c.getPipelineOptions().as(BigQueryOptions.class)),
bqServices.getDatasetService(c.getPipelineOptions().as(BigQueryOptions.class)),
jobIdPrefix,
tableReference,
tableSchema,
partitionFiles,
writeDisposition,
createDisposition,
tableDestination.getTableDescription());
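    // Map the final table to the table that was actually loaded; with a single partition the
    // two are the same.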
c.output(KV.of(tableDestination, BigQueryHelpers.toJsonString(tableReference)));
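    // load() only returns on success, so the staged files are no longer needed.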
removeTemporaryFiles(partitionFiles);
}
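
  /**
   * Runs a BigQuery load job for {@code gcsUris} into {@code ref}, retrying with a fresh job id
   * until the job succeeds or {@link BatchLoads#MAX_RETRY_JOBS} attempts have failed.
   */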
private void load(
JobService jobService,
DatasetService datasetService,
String jobIdPrefix,
TableReference ref,
@Nullable TableSchema schema,
List<String> gcsUris,
WriteDisposition writeDisposition,
CreateDisposition createDisposition,
@Nullable String tableDescription)
throws InterruptedException, IOException {
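    // The staged files are newline-delimited JSON TableRows, so declare that source format.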
JobConfigurationLoad loadConfig =
new JobConfigurationLoad()
.setDestinationTable(ref)
.setSchema(schema)
.setSourceUris(gcsUris)
.setWriteDisposition(writeDisposition.name())
.setCreateDisposition(createDisposition.name())
.setSourceFormat("NEWLINE_DELIMITED_JSON");
String projectId = ref.getProjectId();
Job lastFailedLoadJob = null;
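    // Job ids are deterministic (prefix plus attempt number), so a retried bundle references the
    // jobs it already submitted instead of starting duplicate loads.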
for (int i = 0; i < BatchLoads.MAX_RETRY_JOBS; ++i) {
String jobId = jobIdPrefix + "-" + i;
JobReference jobRef = new JobReference().setProjectId(projectId).setJobId(jobId);
jobService.startLoadJob(jobRef, loadConfig);
Job loadJob = jobService.pollJob(jobRef, BatchLoads.LOAD_JOB_POLL_MAX_RETRIES);
Status jobStatus = BigQueryHelpers.parseStatus(loadJob);
switch (jobStatus) {
case SUCCEEDED:
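          // Propagate the table description, which the load job itself does not set.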
if (tableDescription != null) {
datasetService.patchTableDescription(ref, tableDescription);
}
return;
case UNKNOWN:
throw new RuntimeException(
String.format(
"UNKNOWN status of load job [%s]: %s.",
jobId, BigQueryHelpers.jobToPrettyString(loadJob)));
case FAILED:
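          // Remember the failed job for the error message and retry under the next job id.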
lastFailedLoadJob = loadJob;
continue;
default:
throw new IllegalStateException(
String.format(
"Unexpected status [%s] of load job: %s.",
jobStatus, BigQueryHelpers.jobToPrettyString(loadJob)));
}
}
throw new RuntimeException(
String.format(
"Failed to create load job with id prefix %s, "
+ "reached max retries: %d, last failed load job: %s.",
jobIdPrefix,
BatchLoads.MAX_RETRY_JOBS,
BigQueryHelpers.jobToPrettyString(lastFailedLoadJob)));
}
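
  /** Deletes the given staged files, ignoring any that have already been removed. */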
static void removeTemporaryFiles(Collection<String> files) throws IOException {
ImmutableList.Builder<ResourceId> fileResources = ImmutableList.builder();
    for (String file : files) {
      fileResources.add(FileSystems.matchNewResource(file, false /* isDirectory */));
}
FileSystems.delete(fileResources.build(), MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES);
}
}