/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.beam.sdk.io.gcp.bigquery;

import static org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.resolveTempLocation;

import com.google.api.services.bigquery.model.TableRow;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ThreadLocalRandom;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.StructuredCoder;
import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.TupleTag;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Writes each bundle of {@link TableRow} elements out to a separate file using {@link
 * TableRowWriter}. Elements destined for different destinations are written to separate files.
 *
 * <p>The transform will not write an element to a file if it is already writing to {@link
 * #maxNumWritersPerBundle} files and the element is destined for a new destination. In this case,
 * the element is spilled into the output, and the {@link WriteGroupedRecordsToFiles} transform
 * takes care of writing it to a file.
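 *
 * <p>A minimal usage sketch, assuming a {@code String} destination type; {@code rows} (a {@code
 * PCollection<KV<String, TableRow>>}) and {@code stepUuid} are illustrative names, not defined in
 * this file:
 *
 * <pre>{@code
 * TupleTag<Result<String>> writtenFilesTag = new TupleTag<Result<String>>() {};
 * TupleTag<KV<ShardedKey<String>, TableRow>> unwrittenRecordsTag =
 *     new TupleTag<KV<ShardedKey<String>, TableRow>>() {};
 * PCollectionTuple results =
 *     rows.apply(
 *         ParDo.of(
 *                 new WriteBundlesToFiles<String>(
 *                     stepUuid, unwrittenRecordsTag, 20, 64 * (1L << 20)))
 *             .withOutputTags(writtenFilesTag, TupleTagList.of(unwrittenRecordsTag)));
 * results
 *     .get(writtenFilesTag)
 *     .setCoder(WriteBundlesToFiles.ResultCoder.of(StringUtf8Coder.of()));
 * }</pre>
 */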
class WriteBundlesToFiles<DestinationT>
extends DoFn<KV<DestinationT, TableRow>, Result<DestinationT>> {
private static final Logger LOG = LoggerFactory.getLogger(WriteBundlesToFiles.class);
// When we spill records, shard the output keys to prevent hotspots. Experiments running up to
// 10TB of data have shown a sharding of 10 to be a good choice.
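  // For example, records spilled for a destination D are output under keys (D, 0) .. (D, 9),
  // spreading D's records over up to 10 keys instead of a single hot one.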
private static final int SPILLED_RECORD_SHARDING_FACTOR = 10;
  // Map from destination to a writer for that destination.
private transient Map<DestinationT, TableRowWriter> writers;
private transient Map<DestinationT, BoundedWindow> writerWindows;
private final String stepUuid;
  private final TupleTag<KV<ShardedKey<DestinationT>, TableRow>> unwrittenRecordsTag;
private int maxNumWritersPerBundle;
private long maxFileSize;

  /**
   * The result of the {@link WriteBundlesToFiles} transform. Corresponds to a single output file,
   * and encapsulates the table it is destined for as well as the file's byte size.
   */
public static final class Result<DestinationT> implements Serializable {
private static final long serialVersionUID = 1L;
public final String filename;
public final Long fileByteSize;
public final DestinationT destination;

    public Result(String filename, Long fileByteSize, DestinationT destination) {
this.filename = filename;
this.fileByteSize = fileByteSize;
this.destination = destination;
}
}

  /**
   * A {@link Coder} for the {@link Result} class.
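   *
   * <p>An illustrative round trip through this coder via {@code
   * org.apache.beam.sdk.util.CoderUtils}; the {@code String} destination coder and the example
   * values are assumptions, not taken from this file:
   *
   * <pre>{@code
   * ResultCoder<String> coder = ResultCoder.of(StringUtf8Coder.of());
   * Result<String> original = new Result<>("gs://bucket/temp/file-0", 1024L, "my-destination");
   * Result<String> copy =
   *     CoderUtils.decodeFromByteArray(coder, CoderUtils.encodeToByteArray(coder, original));
   * }</pre>
   */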
public static class ResultCoder<DestinationT> extends StructuredCoder<Result<DestinationT>> {
private static final StringUtf8Coder stringCoder = StringUtf8Coder.of();
private static final VarLongCoder longCoder = VarLongCoder.of();
private final Coder<DestinationT> destinationCoder;

    public static <DestinationT> ResultCoder<DestinationT> of(
        Coder<DestinationT> destinationCoder) {
return new ResultCoder<>(destinationCoder);
}

    ResultCoder(Coder<DestinationT> destinationCoder) {
this.destinationCoder = destinationCoder;
}

    @Override
    public void encode(Result<DestinationT> value, OutputStream outStream) throws IOException {
if (value == null) {
throw new CoderException("cannot encode a null value");
}
stringCoder.encode(value.filename, outStream);
longCoder.encode(value.fileByteSize, outStream);
destinationCoder.encode(value.destination, outStream);
}

    @Override
public Result<DestinationT> decode(InputStream inStream) throws IOException {
String filename = stringCoder.decode(inStream);
long fileByteSize = longCoder.decode(inStream);
DestinationT destination = destinationCoder.decode(inStream);
return new Result<>(filename, fileByteSize, destination);
}

    @Override
public List<? extends Coder<?>> getCoderArguments() {
return Collections.singletonList(destinationCoder);
}

    @Override
    public void verifyDeterministic() throws Coder.NonDeterministicException {
      // StringUtf8Coder and VarLongCoder are deterministic, so determinism depends entirely on
      // the destination coder.
      destinationCoder.verifyDeterministic();
    }
}

  WriteBundlesToFiles(
      String stepUuid,
      TupleTag<KV<ShardedKey<DestinationT>, TableRow>> unwrittenRecordsTag,
int maxNumWritersPerBundle,
long maxFileSize) {
this.stepUuid = stepUuid;
    this.unwrittenRecordsTag = unwrittenRecordsTag;
this.maxNumWritersPerBundle = maxNumWritersPerBundle;
this.maxFileSize = maxFileSize;
}

  @StartBundle
public void startBundle() {
// This must be done for each bundle, as by default the {@link DoFn} might be reused between
// bundles.
this.writers = Maps.newHashMap();
this.writerWindows = Maps.newHashMap();
}

  TableRowWriter createAndInsertWriter(
      DestinationT destination, String tempFilePrefix, BoundedWindow window) throws Exception {
TableRowWriter writer = new TableRowWriter(tempFilePrefix);
writers.put(destination, writer);
writerWindows.put(destination, window);
return writer;
}

  @ProcessElement
public void processElement(ProcessContext c, BoundedWindow window) throws Exception {
String tempFilePrefix = resolveTempLocation(
c.getPipelineOptions().getTempLocation(), "BigQueryWriteTemp", stepUuid);
DestinationT destination = c.element().getKey();
TableRowWriter writer;
if (writers.containsKey(destination)) {
writer = writers.get(destination);
} else {
// Only create a new writer if we have fewer than maxNumWritersPerBundle already in this
// bundle.
      if (writers.size() < maxNumWritersPerBundle) {
writer = createAndInsertWriter(destination, tempFilePrefix, window);
} else {
        // We already have the maximum number of writers open in this bundle, so "spill" this
        // record into the output. It will be grouped and written to a file by the
        // WriteGroupedRecordsToFiles transform in a subsequent stage.
        c.output(
            unwrittenRecordsTag,
            KV.of(
                ShardedKey.of(
                    destination,
                    ThreadLocalRandom.current().nextInt(SPILLED_RECORD_SHARDING_FACTOR)),
                c.element().getValue()));
return;
}
}
if (writer.getByteSize() > maxFileSize) {
      // File is too big. Close it, output the completed file, and open a new writer.
writer.close();
TableRowWriter.Result result = writer.getResult();
c.output(new Result<>(result.resourceId.toString(), result.byteSize, destination));
writer = createAndInsertWriter(destination, tempFilePrefix, window);
}
try {
writer.write(c.element().getValue());
} catch (Exception e) {
      // Discard the write result and close the writer.
try {
writer.close();
// The writer does not need to be reset, as this DoFn cannot be reused.
} catch (Exception closeException) {
// Do not mask the exception that caused the write to fail.
e.addSuppressed(closeException);
}
throw e;
}
}

  @FinishBundle
public void finishBundle(FinishBundleContext c) throws Exception {
List<Exception> exceptionList = Lists.newArrayList();
for (TableRowWriter writer : writers.values()) {
try {
writer.close();
} catch (Exception e) {
exceptionList.add(e);
}
}
if (!exceptionList.isEmpty()) {
Exception e = new IOException("Failed to close some writers");
for (Exception thrown : exceptionList) {
e.addSuppressed(thrown);
}
throw e;
}
    // All writers closed successfully; output one Result per destination, in the window in which
    // that destination's writer was created. Any failure here should fail the bundle, so these
    // exceptions are deliberately not swallowed.
    for (Map.Entry<DestinationT, TableRowWriter> entry : writers.entrySet()) {
      DestinationT destination = entry.getKey();
      TableRowWriter writer = entry.getValue();
      TableRowWriter.Result result = writer.getResult();
      BoundedWindow window = writerWindows.get(destination);
      c.output(
          new Result<>(result.resourceId.toString(), result.byteSize, destination),
          window.maxTimestamp(),
          window);
    }
writers.clear();
}
}