/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.ThreadLocalRandom;
import javax.annotation.Nullable;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.VoidCoder;
import org.apache.beam.sdk.io.FileBasedSink.FileResult;
import org.apache.beam.sdk.io.FileBasedSink.FileResultCoder;
import org.apache.beam.sdk.io.FileBasedSink.WriteOperation;
import org.apache.beam.sdk.io.FileBasedSink.Writer;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.transforms.WithKeys;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.transforms.windowing.DefaultTrigger;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.PaneInfo;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollection.IsBounded;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.PDone;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A {@link PTransform} that writes to a {@link FileBasedSink}. A write begins with a sequential
 * global initialization of a sink, followed by a parallel write, and ends with a sequential
 * finalization of the write. The output of a write is {@link PDone}.
 *
 * <p>By default, every bundle in the input {@link PCollection} will be processed by a
 * {@link WriteOperation}, so the number of outputs will vary based on runner behavior, though at
 * least 1 output will always be produced. The exact parallelism of the write stage can be
 * controlled using {@link WriteFiles#withNumShards}, typically used to control how many files
 * are produced or to globally limit the number of workers connecting to an external service.
 * However, this option can often hurt performance: it adds an additional {@link GroupByKey}
 * to the pipeline.
 *
 * <p>Example usage with runner-determined sharding:
 *
 * <pre>{@code p.apply(WriteFiles.to(new MySink(...)));}</pre>
 *
 * <p>Example usage with a fixed number of shards:
 *
 * <pre>{@code p.apply(WriteFiles.to(new MySink(...)).withNumShards(3));}</pre>
 */
@Experimental(Experimental.Kind.SOURCE_SINK)
public class WriteFiles<T> extends PTransform<PCollection<T>, PDone> {
  private static final Logger LOG = LoggerFactory.getLogger(WriteFiles.class);

  static final int UNKNOWN_SHARDNUM = -1;
  private FileBasedSink<T> sink;
  private WriteOperation<T> writeOperation;
  // This allows the number of shards to be dynamically computed based on the input
  // PCollection.
  @Nullable
  private final PTransform<PCollection<T>, PCollectionView<Integer>> computeNumShards;
  // We don't use a side input for static sharding, as we want this value to be updatable
  // when a pipeline is updated.
  @Nullable
  private final ValueProvider<Integer> numShardsProvider;
  private boolean windowedWrites;

  /**
   * Creates a {@link WriteFiles} transform that writes to the given {@link FileBasedSink},
   * letting the runner control how many different shards are produced.
   */
  public static <T> WriteFiles<T> to(FileBasedSink<T> sink) {
    checkNotNull(sink, "sink");
    return new WriteFiles<>(sink, null /* runner-determined sharding */, null, false);
  }

  private WriteFiles(
      FileBasedSink<T> sink,
      @Nullable PTransform<PCollection<T>, PCollectionView<Integer>> computeNumShards,
      @Nullable ValueProvider<Integer> numShardsProvider,
      boolean windowedWrites) {
    this.sink = sink;
    this.computeNumShards = computeNumShards;
    this.numShardsProvider = numShardsProvider;
    this.windowedWrites = windowedWrites;
  }

  @Override
  public PDone expand(PCollection<T> input) {
    if (input.isBounded() == IsBounded.UNBOUNDED) {
      checkArgument(windowedWrites,
          "Must use windowed writes when applying %s to an unbounded PCollection",
          WriteFiles.class.getSimpleName());
      // The reason for this is https://issues.apache.org/jira/browse/BEAM-1438
      // and similar behavior in other runners.
      checkArgument(
          computeNumShards != null || numShardsProvider != null,
          "When applying %s to an unbounded PCollection, "
              + "must specify number of output shards explicitly",
          WriteFiles.class.getSimpleName());
    }
    this.writeOperation = sink.createWriteOperation();
    this.writeOperation.setWindowedWrites(windowedWrites);
    return createWrite(input);
  }

  @Override
  public void validate(PipelineOptions options) {
    sink.validate(options);
  }

  @Override
  public void populateDisplayData(DisplayData.Builder builder) {
    super.populateDisplayData(builder);
    builder
        .add(DisplayData.item("sink", sink.getClass()).withLabel("WriteFiles Sink"))
        .include("sink", sink);
    if (getSharding() != null) {
      builder.include("sharding", getSharding());
    } else if (getNumShards() != null) {
      String numShards = getNumShards().isAccessible()
          ? getNumShards().get().toString() : getNumShards().toString();
      builder.add(DisplayData.item("numShards", numShards)
          .withLabel("Fixed Number of Shards"));
    }
  }

  /**
   * Returns the {@link FileBasedSink} associated with this PTransform.
   */
  public FileBasedSink<T> getSink() {
    return sink;
  }

  /**
   * Gets the {@link PTransform} that will be used to determine sharding. This can be either a
   * static number of shards (following a call to {@link #withNumShards(int)}), dynamic (set by
   * {@link #withSharding(PTransform)}), or runner-determined (by
   * {@link #withRunnerDeterminedSharding()}).
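   *
   * <p>For illustration only, assuming a hypothetical {@code MySink} and a user-defined
   * {@code PickShardCount} transform producing a {@code PCollectionView<Integer>}, the three
   * modes could be configured as:
   *
   * <pre>{@code
   * WriteFiles.to(new MySink(...)).withNumShards(10);                   // static
   * WriteFiles.to(new MySink(...)).withSharding(new PickShardCount());  // dynamic
   * WriteFiles.to(new MySink(...)).withRunnerDeterminedSharding();      // runner-determined
   * }</pre>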
   */
  @Nullable
  public PTransform<PCollection<T>, PCollectionView<Integer>> getSharding() {
    return computeNumShards;
  }

  public ValueProvider<Integer> getNumShards() {
    return numShardsProvider;
  }

  /**
   * Returns a new {@link WriteFiles} that will write to the current {@link FileBasedSink} using
   * the specified number of shards.
   *
   * <p>This option should be used sparingly as it can hurt performance. See {@link WriteFiles}
   * for more information.
   *
   * <p>A value less than or equal to 0 will be equivalent to the default behavior of
   * runner-determined sharding.
   */
  public WriteFiles<T> withNumShards(int numShards) {
    if (numShards > 0) {
      return withNumShards(StaticValueProvider.of(numShards));
    }
    return withRunnerDeterminedSharding();
  }

  /**
   * Returns a new {@link WriteFiles} that will write to the current {@link FileBasedSink} using
   * the number of shards specified by the given {@link ValueProvider}.
   *
   * <p>This option should be used sparingly as it can hurt performance. See {@link WriteFiles}
   * for more information.
   */
  public WriteFiles<T> withNumShards(ValueProvider<Integer> numShardsProvider) {
    return new WriteFiles<>(sink, null, numShardsProvider, windowedWrites);
  }

  /**
   * Returns a new {@link WriteFiles} that will write to the current {@link FileBasedSink} using
   * the specified {@link PTransform} to compute the number of shards.
   *
   * <p>This option should be used sparingly as it can hurt performance. See {@link WriteFiles}
   * for more information.
   */
  public WriteFiles<T> withSharding(PTransform<PCollection<T>, PCollectionView<Integer>> sharding) {
    checkNotNull(
        sharding, "Cannot provide null sharding. Use withRunnerDeterminedSharding() instead");
    return new WriteFiles<>(sink, sharding, null, windowedWrites);
  }

  /**
   * Returns a new {@link WriteFiles} that will write to the current {@link FileBasedSink} with
   * runner-determined sharding.
   */
  public WriteFiles<T> withRunnerDeterminedSharding() {
    return new WriteFiles<>(sink, null, null, windowedWrites);
  }

  /**
   * Returns a new {@link WriteFiles} that preserves windowing on its input.
   *
   * <p>If this option is not specified, windowing and triggering are replaced by
   * {@link GlobalWindows} and {@link DefaultTrigger}.
   *
   * <p>If there is no data for a window, no output shards will be generated for that window.
   * If a window triggers multiple times, then more than a single output shard might be
   * generated for that window; it's up to the sink implementation to keep these output shards
   * unique.
   *
   * <p>This option can only be used if {@link #withNumShards(int)} is also set to a
   * positive value.
   */
  public WriteFiles<T> withWindowedWrites() {
    return new WriteFiles<>(sink, computeNumShards, numShardsProvider, true);
  }

  /**
   * Writes all the elements in a bundle using a {@link Writer} produced by the
   * {@link WriteOperation} associated with the {@link FileBasedSink}, with windowed writes
   * enabled.
   */
  private class WriteWindowedBundles extends DoFn<T, FileResult> {
    private Map<KV<BoundedWindow, PaneInfo>, Writer<T>> windowedWriters;

    @StartBundle
    public void startBundle(StartBundleContext c) {
      // Reset state in case of reuse. We need to make sure that each bundle gets unique writers.
      windowedWriters = Maps.newHashMap();
    }

    @ProcessElement
    public void processElement(ProcessContext c, BoundedWindow window) throws Exception {
      PaneInfo paneInfo = c.pane();
      Writer<T> writer;
      // If we are doing windowed writes, we need to ensure that we have separate files for
      // data in different windows/panes.
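      // The writer map is keyed by the (window, pane) pair, so each distinct window/pane
      // combination seen by this bundle lazily gets its own writer; every writer opened here
      // is closed in finishBundle.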
      KV<BoundedWindow, PaneInfo> key = KV.of(window, paneInfo);
      writer = windowedWriters.get(key);
      if (writer == null) {
        String uuid = UUID.randomUUID().toString();
        LOG.info(
            "Opening writer {} for write operation {}, window {} pane {}",
            uuid, writeOperation, window, paneInfo);
        writer = writeOperation.createWriter();
        writer.openWindowed(uuid, window, paneInfo, UNKNOWN_SHARDNUM);
        windowedWriters.put(key, writer);
        LOG.debug("Done opening writer");
      }
      writeOrClose(writer, c.element());
    }

    @FinishBundle
    public void finishBundle(FinishBundleContext c) throws Exception {
      for (Map.Entry<KV<BoundedWindow, PaneInfo>, Writer<T>> entry : windowedWriters.entrySet()) {
        FileResult result = entry.getValue().close();
        BoundedWindow window = entry.getKey().getKey();
        c.output(result, window.maxTimestamp(), window);
      }
    }

    @Override
    public void populateDisplayData(DisplayData.Builder builder) {
      builder.delegate(WriteFiles.this);
    }
  }

  /**
   * Writes all the elements in a bundle using a {@link Writer} produced by the
   * {@link WriteOperation} associated with the {@link FileBasedSink}, with windowed writes
   * disabled.
   */
  private class WriteUnwindowedBundles extends DoFn<T, FileResult> {
    // Writer that will write the records in this bundle. Lazily
    // initialized in processElement.
    private Writer<T> writer = null;
    private BoundedWindow window = null;

    @StartBundle
    public void startBundle(StartBundleContext c) {
      // Reset state in case of reuse. We need to make sure that each bundle gets unique writers.
      writer = null;
    }

    @ProcessElement
    public void processElement(ProcessContext c, BoundedWindow window) throws Exception {
      // Cache a single writer for the bundle.
      if (writer == null) {
        LOG.info("Opening writer for write operation {}", writeOperation);
        writer = writeOperation.createWriter();
        writer.openUnwindowed(UUID.randomUUID().toString(), UNKNOWN_SHARDNUM);
        LOG.debug("Done opening writer");
      }
      this.window = window;
      writeOrClose(this.writer, c.element());
    }

    @FinishBundle
    public void finishBundle(FinishBundleContext c) throws Exception {
      if (writer == null) {
        return;
      }
      FileResult result = writer.close();
      c.output(result, window.maxTimestamp(), window);
    }

    @Override
    public void populateDisplayData(DisplayData.Builder builder) {
      builder.delegate(WriteFiles.this);
    }
  }

  /**
   * Like {@link WriteWindowedBundles} and {@link WriteUnwindowedBundles}, but where the elements
   * for each shard have been collected into a single iterable.
   */
  private class WriteShardedBundles extends DoFn<KV<Integer, Iterable<T>>, FileResult> {
    @ProcessElement
    public void processElement(ProcessContext c, BoundedWindow window) throws Exception {
      // In a sharded write, a single input element represents one shard. We can open and close
      // the writer in each call to processElement.
      LOG.info("Opening writer for write operation {}", writeOperation);
      Writer<T> writer = writeOperation.createWriter();
      if (windowedWrites) {
        writer.openWindowed(UUID.randomUUID().toString(), window, c.pane(), c.element().getKey());
      } else {
        writer.openUnwindowed(UUID.randomUUID().toString(), UNKNOWN_SHARDNUM);
      }
      LOG.debug("Done opening writer");
      try {
        for (T t : c.element().getValue()) {
          writeOrClose(writer, t);
        }
        // Close the writer; if this throws let the error propagate.
        FileResult result = writer.close();
        c.output(result);
      } catch (Exception e) {
        // If anything goes wrong, make sure to delete the temporary file.
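        // cleanup() is expected to remove the partially written temporary output; the exception
        // is then rethrown so the bundle fails instead of being silently dropped.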
        writer.cleanup();
        throw e;
      }
    }

    @Override
    public void populateDisplayData(DisplayData.Builder builder) {
      builder.delegate(WriteFiles.this);
    }
  }

  private static <T> void writeOrClose(Writer<T> writer, T t) throws Exception {
    try {
      writer.write(t);
    } catch (Exception e) {
      try {
        writer.close();
      } catch (Exception closeException) {
        if (closeException instanceof InterruptedException) {
          // Do not silently ignore interrupted state.
          Thread.currentThread().interrupt();
        }
        // Do not mask the exception that caused the write to fail.
        e.addSuppressed(closeException);
      }
      throw e;
    }
  }

  private static class ApplyShardingKey<T> extends DoFn<T, KV<Integer, T>> {
    private final PCollectionView<Integer> numShardsView;
    private final ValueProvider<Integer> numShardsProvider;
    private int shardNumber;

    ApplyShardingKey(PCollectionView<Integer> numShardsView,
        ValueProvider<Integer> numShardsProvider) {
      this.numShardsView = numShardsView;
      this.numShardsProvider = numShardsProvider;
      shardNumber = UNKNOWN_SHARDNUM;
    }

    @ProcessElement
    public void processElement(ProcessContext context) {
      final int shardCount;
      if (numShardsView != null) {
        shardCount = context.sideInput(numShardsView);
      } else {
        checkNotNull(numShardsProvider);
        shardCount = numShardsProvider.get();
      }
      checkArgument(
          shardCount > 0,
          "Must have a positive number of shards specified for non-runner-determined sharding."
              + " Got %s",
          shardCount);
      if (shardNumber == UNKNOWN_SHARDNUM) {
        // We want to desynchronize the first record sharding key for each instance of
        // ApplyShardingKey, so that records in a small PCollection are statistically balanced
        // across shards.
        shardNumber = ThreadLocalRandom.current().nextInt(shardCount);
      } else {
        shardNumber = (shardNumber + 1) % shardCount;
      }
      context.output(KV.of(shardNumber, context.element()));
    }
  }

  /**
   * A write is performed as a sequence of three {@link ParDo}s.
   *
   * <p>The first, do-once ParDo performs the global initialization of the sink, producing a
   * singleton collection containing the {@link WriteOperation}. This singleton collection is
   * then used as a side input to a ParDo over the PCollection of elements to write. In this
   * bundle-writing phase, {@link WriteOperation#createWriter} is called to obtain a
   * {@link Writer}, which is opened when the first element of a bundle is processed and closed
   * in {@link DoFn.FinishBundle}; the {@link Writer#write} method is called for every element
   * in the bundle. The output of this ParDo is a PCollection of <i>writer result</i> objects
   * (see {@link FileBasedSink} for a description of writer results), one for each bundle.
   *
   * <p>The final do-once ParDo uses a singleton collection as input and the collection of writer
   * results as a side-input. In this ParDo, {@link WriteOperation#finalize} is called
   * to finalize the write.
   *
   * <p>If the write of any element in the PCollection fails, {@link Writer#close} will be
   * called before the exception that caused the write to fail is propagated and the write result
   * will be discarded.
   *
   * <p>Since the {@link WriteOperation} is serialized after the initialization ParDo and
   * deserialized in the bundle-writing and finalization phases, any state change to the
   * WriteOperation object that occurs during initialization is visible in the latter
   * phases. However, the WriteOperation is not serialized after the bundle-writing
   * phase. This is why implementations should guarantee that
   * {@link WriteOperation#createWriter} does not mutate the WriteOperation.
   */
  private PDone createWrite(PCollection<T> input) {
    Pipeline p = input.getPipeline();

    if (!windowedWrites) {
      // Re-window the data into the global window and remove any existing triggers.
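      // The non-windowed finalize step below consumes the write results as an Iterable side
      // input, which relies on all results landing in a single pane of the global window.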
      input =
          input.apply(
              Window.<T>into(new GlobalWindows())
                  .triggering(DefaultTrigger.of())
                  .discardingFiredPanes());
    }

    // Perform the per-bundle writes as a ParDo on the input PCollection (with the
    // WriteOperation as a side input) and collect the results of the writes in a
    // PCollection. There is a dependency between this ParDo and the first (the
    // WriteOperation PCollection as a side input), so this will happen after the
    // initial ParDo.
    PCollection<FileResult> results;
    final PCollectionView<Integer> numShardsView;
    Coder<BoundedWindow> shardedWindowCoder =
        (Coder<BoundedWindow>) input.getWindowingStrategy().getWindowFn().windowCoder();
    if (computeNumShards == null && numShardsProvider == null) {
      numShardsView = null;
      results = input.apply(
          "WriteBundles",
          ParDo.of(windowedWrites ? new WriteWindowedBundles() : new WriteUnwindowedBundles()));
    } else {
      List<PCollectionView<?>> sideInputs = Lists.newArrayList();
      if (computeNumShards != null) {
        numShardsView = input.apply(computeNumShards);
        sideInputs.add(numShardsView);
      } else {
        numShardsView = null;
      }

      PCollection<KV<Integer, Iterable<T>>> sharded =
          input
              .apply("ApplyShardLabel", ParDo.of(
                  new ApplyShardingKey<T>(numShardsView,
                      (numShardsView != null) ? null : numShardsProvider))
                  .withSideInputs(sideInputs))
              .apply("GroupIntoShards", GroupByKey.<Integer, T>create());
      shardedWindowCoder =
          (Coder<BoundedWindow>) sharded.getWindowingStrategy().getWindowFn().windowCoder();

      results = sharded.apply("WriteShardedBundles", ParDo.of(new WriteShardedBundles()));
    }
    results.setCoder(FileResultCoder.of(shardedWindowCoder));

    if (windowedWrites) {
      // When processing streaming windowed writes, results will arrive multiple times. This
      // means we can't share the below implementation that turns the results into a side input,
      // as new data arriving into a side input does not trigger the listening DoFn. Instead
      // we aggregate the result set using a singleton GroupByKey, so the DoFn will be triggered
      // whenever new data arrives.
      PCollection<KV<Void, FileResult>> keyedResults =
          results.apply("AttachSingletonKey", WithKeys.<Void, FileResult>of((Void) null));
      keyedResults.setCoder(KvCoder.of(VoidCoder.of(), FileResultCoder.of(shardedWindowCoder)));

      // Is the continuation trigger sufficient?
      keyedResults
          .apply("FinalizeGroupByKey", GroupByKey.<Void, FileResult>create())
          .apply("Finalize", ParDo.of(new DoFn<KV<Void, Iterable<FileResult>>, Integer>() {
            @ProcessElement
            public void processElement(ProcessContext c) throws Exception {
              LOG.info("Finalizing write operation {}.", writeOperation);
              List<FileResult> results = Lists.newArrayList(c.element().getValue());
              writeOperation.finalize(results);
              LOG.debug("Done finalizing write operation");
            }
          }));
    } else {
      final PCollectionView<Iterable<FileResult>> resultsView =
          results.apply(View.<FileResult>asIterable());
      ImmutableList.Builder<PCollectionView<?>> sideInputs =
          ImmutableList.<PCollectionView<?>>builder().add(resultsView);
      if (numShardsView != null) {
        sideInputs.add(numShardsView);
      }

      // Finalize the write in another do-once ParDo on the singleton collection containing the
      // Writer. The results from the per-bundle writes are given as an Iterable side input.
      // The WriteOperation's state is the same as after its initialization in the first
      // do-once ParDo. There is a dependency between this ParDo and the parallel write (the
      // writer results collection as a side input), so it will happen after the parallel write.
      // For the non-windowed case, we guarantee that if no data is written but the user has
      // set numShards, then all shards will be written out as empty files. For this reason we
      // use a side input here.
      PCollection<Void> singletonCollection = p.apply(Create.of((Void) null));
      singletonCollection
          .apply("Finalize", ParDo.of(new DoFn<Void, Integer>() {
            @ProcessElement
            public void processElement(ProcessContext c) throws Exception {
              LOG.info("Finalizing write operation {}.", writeOperation);
              List<FileResult> results = Lists.newArrayList(c.sideInput(resultsView));
              LOG.debug("Side input initialized to finalize write operation {}.", writeOperation);

              // We must always output at least 1 shard, and honor user-specified numShards if
              // set.
              int minShardsNeeded;
              if (numShardsView != null) {
                minShardsNeeded = c.sideInput(numShardsView);
              } else if (numShardsProvider != null) {
                minShardsNeeded = numShardsProvider.get();
              } else {
                minShardsNeeded = 1;
              }
              int extraShardsNeeded = minShardsNeeded - results.size();
              if (extraShardsNeeded > 0) {
                LOG.info(
                    "Creating {} empty output shards in addition to {} written for a total of {}.",
                    extraShardsNeeded, results.size(), minShardsNeeded);
                for (int i = 0; i < extraShardsNeeded; ++i) {
                  Writer<T> writer = writeOperation.createWriter();
                  writer.openUnwindowed(UUID.randomUUID().toString(), UNKNOWN_SHARDNUM);
                  FileResult emptyWrite = writer.close();
                  results.add(emptyWrite);
                }
                LOG.debug("Done creating extra shards.");
              }
              writeOperation.finalize(results);
              LOG.debug("Done finalizing write operation {}", writeOperation);
            }
          }).withSideInputs(sideInputs.build()));
    }
    return PDone.in(input.getPipeline());
  }
}
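
// A minimal usage sketch, not part of this class: expand() above requires that an unbounded
// input combine withWindowedWrites() with an explicit shard count, so a streaming pipeline with
// a hypothetical windowed FileBasedSink "MySink" over hypothetical "MyRecord" elements might
// look like:
//
//   unboundedRecords
//       .apply(Window.<MyRecord>into(FixedWindows.of(Duration.standardMinutes(10))))
//       .apply(WriteFiles.to(new MySink(...))
//           .withWindowedWrites()
//           .withNumShards(5));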