WritePartition.java example

Explorer
beam-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.beam.sdk.io.gcp.bigquery;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import java.util.List;
import java.util.Map;
import org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TupleTag;

/**
 * Groups the temporary files produced upstream into load partitions, one set of partitions per
 * destination.
 *
 * <p>A partition stops receiving files once adding another file would take it past
 * {@code BatchLoads.MAX_NUM_FILES} files or {@code BatchLoads.MAX_SIZE_BYTES} bytes; the file is
 * then placed in a freshly opened partition. Every output element is a {@link KV} whose key is a
 * {@code ShardedKey} built from the destination and the partition's position (starting at 1), and
 * whose value is the list of filenames assigned to that partition.
 */
class WritePartition<DestinationT>
    extends DoFn<Void, KV<ShardedKey<DestinationT>, List<String>>> {
  private final boolean singletonTable;
  private final PCollectionView<String> tempFilePrefix;
  private final PCollectionView<Iterable<WriteBundlesToFiles.Result<DestinationT>>> results;
  private TupleTag<KV<ShardedKey<DestinationT>, List<String>>> multiPartitionsTag;
  private TupleTag<KV<ShardedKey<DestinationT>, List<String>>> singlePartitionTag;

  /** Running totals and filenames for one partition of a single destination. */
  private static class PartitionData {
    private final List<String> filenames = Lists.newArrayList();
    private int fileCount = 0;
    private long totalBytes = 0;

    /** Returns the filenames recorded so far, in the order they were added. */
    List<String> getFilenames() {
      return filenames;
    }

    /** Records one more file of the given size in this partition. */
    void add(String filename, long numBytes) {
      filenames.add(filename);
      fileCount += 1;
      totalBytes += numBytes;
    }

    /**
     * Reports whether {@code numFiles} additional files totalling {@code numBytes} bytes would
     * still fit within both the file-count and the byte-size limits of a partition.
     */
    boolean canAccept(int numFiles, long numBytes) {
      int filesAfterAdd = fileCount + numFiles;
      long bytesAfterAdd = totalBytes + numBytes;
      return filesAfterAdd <= BatchLoads.MAX_NUM_FILES
          && bytesAfterAdd <= BatchLoads.MAX_SIZE_BYTES;
    }
  }

  /**
   * Creates the partitioning function.
   *
   * @param singletonTable whether the user specified a constant output table
   * @param tempFilePrefix side input holding the prefix used when an empty file must be written
   * @param results side input holding the files written by the previous stage
   * @param multiPartitionsTag output tag used for destinations that need more than one partition
   * @param singlePartitionTag output tag used for destinations that fit in exactly one partition
   */
  WritePartition(
      boolean singletonTable,
      PCollectionView<String> tempFilePrefix,
      PCollectionView<Iterable<WriteBundlesToFiles.Result<DestinationT>>> results,
      TupleTag<KV<ShardedKey<DestinationT>, List<String>>> multiPartitionsTag,
      TupleTag<KV<ShardedKey<DestinationT>, List<String>>> singlePartitionTag) {
    this.singletonTable = singletonTable;
    this.tempFilePrefix = tempFilePrefix;
    this.results = results;
    this.singlePartitionTag = singlePartitionTag;
    this.multiPartitionsTag = multiPartitionsTag;
  }

  /**
   * Reads every file result from the side input, assigns each file to a partition of its
   * destination, and emits one element per partition on the appropriate output tag.
   */
  @ProcessElement
  public void processElement(ProcessContext c) throws Exception {
    List<WriteBundlesToFiles.Result<DestinationT>> fileResults =
        Lists.newArrayList(c.sideInput(this.results));

    // Nothing was written but the user asked for one fixed table: fabricate a single empty file
    // so that an empty table of that name is still produced.
    if (singletonTable && fileResults.isEmpty()) {
      fileResults.add(writeEmptyFile(c));
    }

    // Every destination starts with one empty partition; files are appended to the most recent
    // partition until it is full, at which point a new one is opened.
    Map<DestinationT, List<PartitionData>> byDestination = Maps.newHashMap();
    for (WriteBundlesToFiles.Result<DestinationT> fileResult : fileResults) {
      List<PartitionData> partitions = byDestination.get(fileResult.destination);
      if (partitions == null) {
        partitions = Lists.newArrayList();
        partitions.add(new PartitionData());
        byDestination.put(fileResult.destination, partitions);
      }

      PartitionData current = partitions.get(partitions.size() - 1);
      if (!current.canAccept(1, fileResult.fileByteSize)) {
        // The newest partition cannot take this file; open another one for it.
        current = new PartitionData();
        partitions.add(current);
      }
      current.add(fileResult.filename, fileResult.fileByteSize);
    }

    // Hand the computed partitions to the next stage.
    for (Map.Entry<DestinationT, List<PartitionData>> entry : byDestination.entrySet()) {
      List<PartitionData> partitions = entry.getValue();

      // A destination that fits in one partition is loaded straight into the final table, so it
      // goes out on a dedicated tag that lets the enclosing transform skip the rename step.
      TupleTag<KV<ShardedKey<DestinationT>, List<String>>> outputTag;
      if (partitions.size() == 1) {
        outputTag = singlePartitionTag;
      } else {
        outputTag = multiPartitionsTag;
      }

      int shardNumber = 1;
      for (PartitionData partition : partitions) {
        c.output(
            outputTag, KV.of(ShardedKey.of(entry.getKey(), shardNumber), partition.getFilenames()));
        shardNumber++;
      }
    }
  }

  /**
   * Opens and immediately closes a {@link TableRowWriter} under the temp-file prefix, and wraps
   * the resulting file in a {@link Result}. The destination is left null; the constant
   * DynamicDestinations class resolves it to the singleton output table.
   */
  private Result<DestinationT> writeEmptyFile(ProcessContext c) throws Exception {
    String prefix = c.sideInput(this.tempFilePrefix);
    TableRowWriter emptyWriter = new TableRowWriter(prefix);
    emptyWriter.close();
    TableRowWriter.Result written = emptyWriter.getResult();
    return new Result<DestinationT>(written.resourceId.toString(), written.byteSize, null);
  }
}