/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io.gcp.bigquery;
import static com.google.common.base.Preconditions.checkState;
import com.google.api.services.bigquery.model.Job;
import com.google.api.services.bigquery.model.JobStatus;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.hash.Hashing;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.regex.Matcher;
import javax.annotation.Nullable;
import org.apache.beam.sdk.io.FileSystems;
import org.apache.beam.sdk.io.fs.ResolveOptions;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryServices.DatasetService;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.transforms.SerializableFunction;
/** A set of helper functions and classes used by {@link BigQueryIO}. */
public class BigQueryHelpers {
// Error template: %1$s is the missing resource kind ("dataset" or "table"),
// %2$s is the table spec the validation was run against.
private static final String RESOURCE_NOT_FOUND_ERROR =
"BigQuery %1$s not found for table \"%2$s\" . Please create the %1$s before pipeline"
+ " execution. If the %1$s is created by an earlier stage of the pipeline, this"
+ " validation can be disabled using #withoutValidation.";
// Error template used when the presence check itself failed (lookup error rather than a
// definite not-found). Same positional arguments as RESOURCE_NOT_FOUND_ERROR.
private static final String UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR =
"Unable to confirm BigQuery %1$s presence for table \"%2$s\". If the %1$s is created by"
+ " an earlier stage of the pipeline, this validation can be disabled using"
+ " #withoutValidation.";
/** Status of a BigQuery job or request. */
enum Status {
/** The job completed with no error result and no errors (see {@link #parseStatus}). */
SUCCEEDED,
/** The job reported an error result or a non-empty error list. */
FAILED,
/** No job information was available to classify. */
UNKNOWN,
}
/**
 * Return a displayable string representation for a {@link TableReference}.
 *
 * @param table provider of the table reference; may be null
 * @return a provider of the table-spec string, or null when {@code table} is null
 */
@Nullable
static ValueProvider<String> displayTable(@Nullable ValueProvider<TableReference> table) {
  if (table == null) {
    return null;
  }
  return NestedValueProvider.of(table, new TableRefToTableSpec());
}
/**
 * Returns a canonical string representation of the {@link TableReference}:
 * {@code [project_id]:[dataset_id].[table_id]}, omitting the project prefix when unset.
 */
public static String toTableSpec(TableReference ref) {
  StringBuilder spec = new StringBuilder();
  String project = ref.getProjectId();
  if (project != null) {
    spec.append(project).append(':');
  }
  spec.append(ref.getDatasetId()).append('.').append(ref.getTableId());
  return spec.toString();
}
/**
 * Returns the list stored under {@code key} in {@code map}, first creating and inserting an
 * empty {@link ArrayList} if no mapping (or a null mapping) exists.
 *
 * @param map the map to read from and possibly mutate
 * @param key the key to look up
 * @return the existing or newly created list; never null
 */
static <K, V> List<V> getOrCreateMapListValue(Map<K, List<V>> map, K key) {
  // computeIfAbsent does the lookup and conditional insert in a single map operation.
  return map.computeIfAbsent(key, k -> new ArrayList<V>());
}
/**
 * Parse a table specification in the form {@code "[project_id]:[dataset_id].[table_id]"} or
 * {@code "[dataset_id].[table_id]"}.
 *
 * <p>If the project id is omitted, the default project id is used.
 *
 * @throws IllegalArgumentException if {@code tableSpec} does not match the expected format
 */
public static TableReference parseTableSpec(String tableSpec) {
  Matcher matcher = BigQueryIO.TABLE_SPEC.matcher(tableSpec);
  if (!matcher.matches()) {
    throw new IllegalArgumentException(
        "Table reference is not in [project_id]:[dataset_id].[table_id] format: " + tableSpec);
  }
  // PROJECT may be absent; the other two named groups are required by the pattern.
  return new TableReference()
      .setProjectId(matcher.group("PROJECT"))
      .setDatasetId(matcher.group("DATASET"))
      .setTableId(matcher.group("TABLE"));
}
/** Renders {@code job} via {@link Job#toPrettyString}, or the literal {@code "null"} when absent. */
static String jobToPrettyString(@Nullable Job job) throws IOException {
  if (job == null) {
    return "null";
  }
  return job.toPrettyString();
}
/** Renders {@code status} via {@link JobStatus#toPrettyString}, with a fixed message for null. */
static String statusToPrettyString(@Nullable JobStatus status) throws IOException {
  if (status == null) {
    return "Unknown status: null.";
  }
  return status.toPrettyString();
}
/**
 * Classifies {@code job} as SUCCEEDED, FAILED, or UNKNOWN.
 *
 * <p>A job is FAILED when its status carries an error result or a non-empty error list;
 * UNKNOWN when the job — or its status — is unavailable; SUCCEEDED otherwise.
 */
static Status parseStatus(@Nullable Job job) {
  // Guard getStatus() too: a job whose status has not been populated yet would otherwise NPE.
  if (job == null || job.getStatus() == null) {
    return Status.UNKNOWN;
  }
  JobStatus status = job.getStatus();
  if (status.getErrorResult() != null
      || (status.getErrors() != null && !status.getErrors().isEmpty())) {
    return Status.FAILED;
  }
  return Status.SUCCEEDED;
}
/**
 * Serializes {@code item} to a JSON string via {@link BigQueryIO#JSON_FACTORY}; returns null
 * for null input.
 *
 * @throws RuntimeException wrapping the {@link IOException} if serialization fails
 */
@VisibleForTesting
static String toJsonString(Object item) {
  if (item == null) {
    return null;
  }
  try {
    return BigQueryIO.JSON_FACTORY.toString(item);
  } catch (IOException e) {
    String typeName = item.getClass().getSimpleName();
    throw new RuntimeException(
        String.format("Cannot serialize %s to a JSON string.", typeName), e);
  }
}
/**
 * Deserializes {@code json} into an instance of {@code clazz} via
 * {@link BigQueryIO#JSON_FACTORY}; returns null for null input.
 *
 * @throws RuntimeException wrapping the {@link IOException} if parsing fails
 */
@VisibleForTesting
static <T> T fromJsonString(String json, Class<T> clazz) {
  if (json == null) {
    return null;
  }
  try {
    return BigQueryIO.JSON_FACTORY.fromString(json, clazz);
  } catch (IOException e) {
    String message = String.format("Cannot deserialize %s from a JSON string: %s.", clazz, json);
    throw new RuntimeException(message, e);
  }
}
/**
 * Returns a randomUUID string.
 *
 * <p>{@code '-'} is removed because BigQuery doesn't allow it in dataset id.
 */
static String randomUUIDString() {
  // replace(CharSequence, CharSequence) is a literal substitution — no need for
  // replaceAll's regex machinery here.
  return UUID.randomUUID().toString().replace("-", "");
}
/**
 * Verifies that {@code tableRef} either does not exist or exists with zero rows.
 *
 * @throws IllegalStateException if the table exists and is non-empty
 * @throws RuntimeException if existence/emptiness could not be determined
 */
static void verifyTableNotExistOrEmpty(DatasetService datasetService, TableReference tableRef) {
  try {
    if (datasetService.getTable(tableRef) != null) {
      checkState(
          datasetService.isTableEmpty(tableRef),
          "BigQuery table is not empty: %s.",
          toTableSpec(tableRef));
    }
  } catch (InterruptedException e) {
    // Restore the interrupt flag before surfacing the failure.
    Thread.currentThread().interrupt();
    throw new RuntimeException(
        "unable to confirm BigQuery table emptiness for table " + toTableSpec(tableRef), e);
  } catch (IOException e) {
    throw new RuntimeException(
        "unable to confirm BigQuery table emptiness for table " + toTableSpec(tableRef), e);
  }
}
/**
 * Verifies that the dataset referenced by {@code table} exists.
 *
 * @throws IllegalArgumentException if the dataset is definitively absent
 * @throws RuntimeException if presence could not be confirmed
 */
static void verifyDatasetPresence(DatasetService datasetService, TableReference table) {
  try {
    datasetService.getDataset(table.getProjectId(), table.getDatasetId());
  } catch (Exception e) {
    // A "not found" IOException is a definitive answer; report it as a usage error.
    boolean notFound =
        (e instanceof IOException) && new ApiErrorExtractor().itemNotFound((IOException) e);
    if (notFound) {
      throw new IllegalArgumentException(
          String.format(RESOURCE_NOT_FOUND_ERROR, "dataset", toTableSpec(table)), e);
    }
    if (e instanceof RuntimeException) {
      throw (RuntimeException) e;
    }
    // Anything else means we simply could not confirm presence.
    throw new RuntimeException(
        String.format(
            UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR, "dataset", toTableSpec(table)),
        e);
  }
}
/**
 * Verifies that {@code table} exists.
 *
 * @throws IllegalArgumentException if the table is definitively absent
 * @throws RuntimeException if presence could not be confirmed
 */
static void verifyTablePresence(DatasetService datasetService, TableReference table) {
  try {
    datasetService.getTable(table);
  } catch (Exception e) {
    // A "not found" IOException is a definitive answer; report it as a usage error.
    boolean notFound =
        (e instanceof IOException) && new ApiErrorExtractor().itemNotFound((IOException) e);
    if (notFound) {
      throw new IllegalArgumentException(
          String.format(RESOURCE_NOT_FOUND_ERROR, "table", toTableSpec(table)), e);
    }
    if (e instanceof RuntimeException) {
      throw (RuntimeException) e;
    }
    // Anything else means we simply could not confirm presence.
    throw new RuntimeException(
        String.format(
            UNABLE_TO_CONFIRM_PRESENCE_OF_RESOURCE_ERROR, "table", toTableSpec(table)),
        e);
  }
}
/**
 * Create a unique job id for a table load.
 *
 * <p>The id combines the caller's prefix with a hash of the destination so that each
 * partition of each table gets a distinct job id.
 *
 * @param partition partition index; a negative value means "no partition suffix"
 */
static String createJobId(String prefix, TableDestination tableDestination, int partition) {
  String destinationHash =
      Hashing.murmur3_128().hashUnencodedChars(tableDestination.toString()).toString();
  if (partition < 0) {
    return String.format("%s_%s", prefix, destinationHash);
  }
  return String.format("%s_%s_%05d", prefix, destinationHash, partition);
}
/** {@link SerializableFunction} that parses a JSON string into a {@link TableSchema}. */
@VisibleForTesting
static class JsonSchemaToTableSchema implements SerializableFunction<String, TableSchema> {
  @Override
  public TableSchema apply(String from) {
    TableSchema schema = fromJsonString(from, TableSchema.class);
    return schema;
  }
}
/** {@link SerializableFunction} that serializes a {@link TableSchema} to its JSON string. */
static class TableSchemaToJsonSchema implements SerializableFunction<TableSchema, String> {
  @Override
  public String apply(TableSchema from) {
    String json = toJsonString(from);
    return json;
  }
}
/** {@link SerializableFunction} that parses a JSON string into a {@link TableReference}. */
static class JsonTableRefToTableRef implements SerializableFunction<String, TableReference> {
  @Override
  public TableReference apply(String from) {
    TableReference ref = fromJsonString(from, TableReference.class);
    return ref;
  }
}
/** {@link SerializableFunction} that converts a JSON {@link TableReference} to a table spec. */
static class JsonTableRefToTableSpec implements SerializableFunction<String, String> {
  @Override
  public String apply(String from) {
    TableReference ref = fromJsonString(from, TableReference.class);
    return toTableSpec(ref);
  }
}
/** {@link SerializableFunction} form of {@link #toTableSpec}. */
static class TableRefToTableSpec implements SerializableFunction<TableReference, String> {
  @Override
  public String apply(TableReference from) {
    String spec = toTableSpec(from);
    return spec;
  }
}
/** {@link SerializableFunction} that serializes a {@link TableReference} to its JSON string. */
static class TableRefToJson implements SerializableFunction<TableReference, String> {
  @Override
  public String apply(TableReference from) {
    String json = toJsonString(from);
    return json;
  }
}
/** {@link SerializableFunction} form of {@link #parseTableSpec}. */
@VisibleForTesting
static class TableSpecToTableRef implements SerializableFunction<String, TableReference> {
  @Override
  public TableReference apply(String from) {
    TableReference ref = parseTableSpec(from);
    return ref;
  }
}
/**
 * Builds the job-id token shared by all BigQuery jobs issued for this pipeline step.
 *
 * <p>{@code '-'} is stripped from the job name (step uuids produced by
 * {@link #randomUUIDString} already contain none).
 */
static String createJobIdToken(String jobName, String stepUuid) {
  // replace() is a literal substitution; replaceAll's regex engine is unnecessary here.
  return String.format("beam_job_%s_%s", stepUuid, jobName.replace("-", ""));
}
/** Returns the id used for the extract job derived from {@code jobIdToken}. */
static String getExtractJobId(String jobIdToken) {
  return jobIdToken + "-extract";
}
/**
 * Builds the {@link TableReference} for the temporary table that holds query results,
 * derived from {@code jobUuid} inside a matching temporary dataset.
 */
static TableReference createTempTableReference(String projectId, String jobUuid) {
  return new TableReference()
      .setProjectId(projectId)
      .setDatasetId("temp_dataset_" + jobUuid)
      .setTableId("temp_table_" + jobUuid);
}
/**
 * Resolves a temp-directory path unique to this BigQuery operation and step:
 * {@code tempLocationDir/bigQueryOperationName/stepUuid/}.
 */
static String resolveTempLocation(
    String tempLocationDir, String bigQueryOperationName, String stepUuid) {
  ResolveOptions.StandardResolveOptions asDirectory =
      ResolveOptions.StandardResolveOptions.RESOLVE_DIRECTORY;
  return FileSystems.matchNewResource(tempLocationDir, true)
      .resolve(bigQueryOperationName, asDirectory)
      .resolve(stepUuid, asDirectory)
      .toString();
}
}