FileBasedSink.java example

Explorer
beam-master
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io;

import static com.google.common.base.MoreObjects.firstNonNull;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;
import static com.google.common.base.Verify.verifyNotNull;
import static org.apache.beam.sdk.io.WriteFiles.UNKNOWN_SHARDNUM;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.MoreObjects;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Ordering;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.nio.channels.Channels;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicLong;
import java.util.zip.GZIPOutputStream;
import javax.annotation.Nullable;
import org.apache.beam.sdk.annotations.Experimental;
import org.apache.beam.sdk.annotations.Experimental.Kind;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.coders.NullableCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.StructuredCoder;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy.Context;
import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy.WindowedContext;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.io.fs.MoveOptions.StandardMoveOptions;
import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions;
import org.apache.beam.sdk.io.fs.ResourceId;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.ValueProvider;
import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider;
import org.apache.beam.sdk.options.ValueProvider.StaticValueProvider;
import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.transforms.display.DisplayData;
import org.apache.beam.sdk.transforms.display.HasDisplayData;
import org.apache.beam.sdk.transforms.windowing.BoundedWindow;
import org.apache.beam.sdk.transforms.windowing.PaneInfo;
import org.apache.beam.sdk.transforms.windowing.PaneInfo.PaneInfoCoder;
import org.apache.beam.sdk.util.MimeTypes;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.commons.compress.compressors.deflate.DeflateCompressorOutputStream;
import org.joda.time.Instant;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Abstract class for file-based output. An implementation of FileBasedSink writes file-based
 * output and defines the format of output files (how values are written, headers/footers, MIME
 * type, etc.).
 *
 * <p>At pipeline construction time, the methods of FileBasedSink are called to validate the sink
 * and to create a {@link WriteOperation} that manages the process of writing to the sink.
 *
 * <p>The process of writing to file-based sink is as follows:
 * <ol>
 * <li>An optional subclass-defined initialization,
 * <li>a parallel write of bundles to temporary files, and finally,
 * <li>these temporary files are renamed with final output filenames.
 * </ol>
 *
 * <p>In order to ensure fault-tolerance, a bundle may be executed multiple times (e.g., in the
 * event of failure/retry or for redundancy). However, exactly one of these executions will have its
 * result passed to the finalize method. Each call to {@link Writer#openWindowed}
 * or {@link Writer#openUnwindowed} is passed a unique <i>bundle id</i> when it is called
 * by the WriteFiles transform, so even redundant or retried bundles will have a unique way of
 * identifying their output.
 *
 * <p>The bundle id should be used to guarantee that a bundle's output is unique. This uniqueness
 * guarantee is important; if a bundle is to be output to a file, for example, the name of the file
 * will encode the unique bundle id to avoid conflicts with other writers.
 *
 * <p>{@link FileBasedSink} can take a custom {@link FilenamePolicy} object to determine output
 * filenames, and this policy object can be used to write windowed or triggered
 * PCollections into separate files per window pane. This allows file output from unbounded
 * PCollections, and also works for bounded PCollections.
 *
 * <p>Supported file systems are those registered with {@link FileSystems}.
 *
 * @param <T> the type of values written to the sink.
 */
@Experimental(Kind.FILESYSTEM)
public abstract class FileBasedSink<T> implements Serializable, HasDisplayData {
  private static final Logger LOG = LoggerFactory.getLogger(FileBasedSink.class);

  /**
   * Directly supported file output compression types.
   *
   * <p>Each constant is a {@link WritableByteChannelFactory}: its {@code create} method wraps the
   * supplied channel in the corresponding compressing stream, while {@link #getFilenameSuffix()}
   * and {@link #getMimeType()} report the suffix and MIME type the constant was declared with.
   */
  public enum CompressionType implements WritableByteChannelFactory {
    /**
     * No compression, or any other transformation, will be used.
     */
    UNCOMPRESSED("", null) {
      @Override
      public WritableByteChannel create(WritableByteChannel channel) throws IOException {
        // Pass-through: the supplied channel is handed back untouched.
        return channel;
      }
    },
    /**
     * Provides GZip output transformation.
     */
    GZIP(".gz", MimeTypes.BINARY) {
      @Override
      public WritableByteChannel create(WritableByteChannel channel) throws IOException {
        // The second constructor argument enables syncFlush on the GZIP stream.
        return Channels.newChannel(new GZIPOutputStream(Channels.newOutputStream(channel), true));
      }
    },
    /**
     * Provides BZip2 output transformation.
     */
    BZIP2(".bz2", MimeTypes.BINARY) {
      @Override
      public WritableByteChannel create(WritableByteChannel channel) throws IOException {
        return Channels
            .newChannel(new BZip2CompressorOutputStream(Channels.newOutputStream(channel)));
      }
    },
    /**
     * Provides deflate output transformation.
     */
    DEFLATE(".deflate", MimeTypes.BINARY) {
      @Override
      public WritableByteChannel create(WritableByteChannel channel) throws IOException {
        return Channels
            .newChannel(new DeflateCompressorOutputStream(Channels.newOutputStream(channel)));
      }
    };

    // Enum constants are shared singletons, so their state is fixed at construction time.
    private final String filenameSuffix;
    @Nullable private final String mimeType;

    /**
     * @param suffix the filename suffix reported by {@link #getFilenameSuffix()}
     * @param mimeType the MIME type reported by {@link #getMimeType()}, or {@code null} for none
     */
    CompressionType(String suffix, @Nullable String mimeType) {
      this.filenameSuffix = suffix;
      this.mimeType = mimeType;
    }

    /** Returns the filename suffix this constant was declared with (empty for uncompressed). */
    @Override
    public String getFilenameSuffix() {
      return filenameSuffix;
    }

    /** Returns the MIME type this constant was declared with, or {@code null} if it has none. */
    @Override
    @Nullable public String getMimeType() {
      return mimeType;
    }
  }

  /**
   * Converts a user-provided output filename prefix into a {@link ResourceId} for writing output
   * files. See {@link TextIO.Write#to(String)} for an example use case.
   *
   * <p>Typically, the input prefix will be something like {@code /tmp/foo/bar}, and the user would
   * like output files to be named as {@code /tmp/foo/bar-0-of-3.txt}. The prefix is therefore first
   * interpreted as a file {@link ResourceId} path.
   *
   * <p>That interpretation may fail, for example if the user gives a prefix that is a directory,
   * e.g. {@code /}, {@code gs://my-bucket}, or {@code c://}. In that case the prefix is
   * interpreted as a directory {@link ResourceId} instead.
   *
   * @param outputPrefix the user-provided output prefix
   * @return a file resource when the prefix can be interpreted as one, otherwise a directory
   *     resource
   */
  @Experimental(Kind.FILESYSTEM)
  public static ResourceId convertToFileResourceIfPossible(String outputPrefix) {
    ResourceId resource;
    try {
      // First choice: treat the prefix as a file.
      resource = FileSystems.matchNewResource(outputPrefix, false /* isDirectory */);
    } catch (Exception ignored) {
      // Any failure of the file interpretation falls back to a directory resource.
      resource = FileSystems.matchNewResource(outputPrefix, true /* isDirectory */);
    }
    return resource;
  }

  /**
   * The {@link WritableByteChannelFactory} that is used to wrap the raw data output to the
   * underlying channel. When the two-argument constructor is used this is
   * {@link CompressionType#UNCOMPRESSED}, i.e. the output is not compressed. Its filename suffix
   * is also what {@link #getExtension()} reports.
   */
  private final WritableByteChannelFactory writableByteChannelFactory;

  /**
   * A naming policy for output files.
   *
   * <p>Implementations map a base output directory, a context object describing the shard (and,
   * for windowed writes, the window and pane) and an extension to the {@link ResourceId} of the
   * file to create.
   */
  public abstract static class FilenamePolicy implements Serializable {
    /**
     * Context used for generating a name based on shard number, and num shards.
     * The policy must produce unique filenames for unique {@link Context} objects.
     *
     * <p>Be careful about adding fields to this as existing strategies will not notice the new
     * fields, and may not produce unique filenames.
     */
    public static class Context {
      // Plain value holder: all state is fixed at construction time.
      private final int shardNumber;
      private final int numShards;

      /**
       * @param shardNumber the number of the shard being named
       * @param numShards the total number of shards
       */
      public Context(int shardNumber, int numShards) {
        this.shardNumber = shardNumber;
        this.numShards = numShards;
      }

      /** Returns the shard number passed to the constructor. */
      public int getShardNumber() {
        return shardNumber;
      }

      /** Returns the number of shards passed to the constructor. */
      public int getNumShards() {
        return numShards;
      }
    }

    /**
     * Context used for generating a name based on window, pane, shard number, and num shards.
     * The policy must produce unique filenames for unique {@link WindowedContext} objects.
     *
     * <p>Be careful about adding fields to this as existing strategies will not notice the new
     * fields, and may not produce unique filenames.
     */
    public static class WindowedContext {
      // Plain value holder: all state is fixed at construction time.
      private final int shardNumber;
      private final int numShards;
      private final BoundedWindow window;
      private final PaneInfo paneInfo;

      /**
       * @param window the window the output belongs to
       * @param paneInfo the pane the output belongs to
       * @param shardNumber the number of the shard being named
       * @param numShards the total number of shards
       */
      public WindowedContext(
          BoundedWindow window,
          PaneInfo paneInfo,
          int shardNumber,
          int numShards) {
        this.window = window;
        this.paneInfo = paneInfo;
        this.shardNumber = shardNumber;
        this.numShards = numShards;
      }

      /** Returns the window passed to the constructor. */
      public BoundedWindow getWindow() {
        return window;
      }

      /** Returns the pane information passed to the constructor. */
      public PaneInfo getPaneInfo() {
        return paneInfo;
      }

      /** Returns the shard number passed to the constructor. */
      public int getShardNumber() {
        return shardNumber;
      }

      /** Returns the number of shards passed to the constructor. */
      public int getNumShards() {
        return numShards;
      }
    }

    /**
     * When a sink has requested windowed or triggered output, this method will be invoked to return
     * the file {@link ResourceId resource} to be created given the base output directory and a
     * (possibly empty) extension from {@link FileBasedSink} configuration
     * (e.g., {@link CompressionType}).
     *
     * <p>The {@link WindowedContext} object gives access to the window and pane,
     * as well as sharding information. The policy must return unique and consistent filenames
     * for different windows and panes.
     *
     * @param outputDirectory the base output directory
     * @param c the window, pane and sharding information for the file
     * @param extension the (possibly empty) extension to apply
     * @return the resource of the file to create
     */
    @Experimental(Kind.FILESYSTEM)
    public abstract ResourceId windowedFilename(
        ResourceId outputDirectory, WindowedContext c, String extension);

    /**
     * When a sink has not requested windowed or triggered output, this method will be invoked to
     * return the file {@link ResourceId resource} to be created given the base output directory and
     * a (possibly empty) extension applied by additional {@link FileBasedSink} configuration
     * (e.g., {@link CompressionType}).
     *
     * <p>The {@link Context} object only provides sharding information, which is used by the policy
     * to generate unique and consistent filenames.
     *
     * @param outputDirectory the base output directory
     * @param c the sharding information for the file
     * @param extension the (possibly empty) extension to apply
     * @return the resource of the file to create; declared {@link Nullable}
     */
    @Experimental(Kind.FILESYSTEM)
    @Nullable public abstract ResourceId unwindowedFilename(
        ResourceId outputDirectory, Context c, String extension);

    /**
     * Populates the display data. The default implementation adds nothing; subclasses may
     * override it to register their own items on {@code builder}.
     */
    public void populateDisplayData(DisplayData.Builder builder) {
    }
  }

  /** The policy used to generate names of files to be produced. */
  private final FilenamePolicy filenamePolicy;
  /**
   * The directory to which files will be written. The constructor wraps the supplied provider so
   * that this one yields the current directory of the supplied resource.
   */
  private final ValueProvider<ResourceId> baseOutputDirectoryProvider;

  /**
   * Construct a {@link FileBasedSink} with the given filename policy, producing uncompressed files.
   *
   * <p>Delegates to the three-argument constructor with {@link CompressionType#UNCOMPRESSED}.
   *
   * @param baseOutputDirectoryProvider provider of the resource whose current directory becomes
   *     the base output directory
   * @param filenamePolicy the policy used to name output files
   */
  @Experimental(Kind.FILESYSTEM)
  public FileBasedSink(
      ValueProvider<ResourceId> baseOutputDirectoryProvider, FilenamePolicy filenamePolicy) {
    this(baseOutputDirectoryProvider, filenamePolicy, CompressionType.UNCOMPRESSED);
  }

  /** Maps a resource to its current directory, as reported by the resource itself. */
  private static class ExtractDirectory implements SerializableFunction<ResourceId, ResourceId> {
    @Override
    public ResourceId apply(ResourceId input) {
      ResourceId directory = input.getCurrentDirectory();
      return directory;
    }
  }

  /**
   * Construct a {@link FileBasedSink} with the given filename policy and output channel type.
   */
  @Experimental(Kind.FILESYSTEM)
  public FileBasedSink(
      ValueProvider<ResourceId> baseOutputDirectoryProvider,
      FilenamePolicy filenamePolicy,
      WritableByteChannelFactory writableByteChannelFactory) {
    this.baseOutputDirectoryProvider =
        NestedValueProvider.of(baseOutputDirectoryProvider, new ExtractDirectory());
    this.filenamePolicy = filenamePolicy;
    this.writableByteChannelFactory = writableByteChannelFactory;
  }

  /**
   * Returns the base directory inside which files will be written according to the configured
   * {@link FilenamePolicy}.
   *
   * @return the provider built by the constructor, which yields the current directory of the
   *     resource originally supplied
   */
  @Experimental(Kind.FILESYSTEM)
  public ValueProvider<ResourceId> getBaseOutputDirectoryProvider() {
    return baseOutputDirectoryProvider;
  }

  /**
   * Returns the policy by which files will be named inside of the base output directory. Note that
   * the {@link FilenamePolicy} may itself specify one or more inner directories before each output
   * file, say when writing windowed outputs in a {@code output/YYYY/MM/DD/file.txt} format.
   *
   * @return the policy passed to the constructor
   */
  @Experimental(Kind.FILESYSTEM)
  public final FilenamePolicy getFilenamePolicy() {
    return filenamePolicy;
  }

  /**
   * Validation hook for the sink. The default implementation does nothing; subclasses may override
   * it to check their configuration against {@code options}.
   *
   * <p>NOTE(review): presumably invoked at pipeline construction time; confirm against the caller.
   *
   * @param options the pipeline options to validate against
   */
  public void validate(PipelineOptions options) {}

  /**
   * Return a subclass of {@link WriteOperation} that will manage the write
   * to the sink.
   *
   * @return the write operation for this sink
   */
  public abstract WriteOperation<T> createWriteOperation();

  /**
   * Populates the display data for this sink by delegating to the configured
   * {@link FilenamePolicy}.
   *
   * @param builder the builder that receives the display data
   */
  public void populateDisplayData(DisplayData.Builder builder) {
    FilenamePolicy policy = getFilenamePolicy();
    policy.populateDisplayData(builder);
  }

  /**
   * Abstract operation that manages the process of writing to {@link FileBasedSink}.
   *
   * <p>The primary responsibility of the WriteOperation is the management of output
   * files. During a write, {@link Writer}s write bundles to temporary file
   * locations. After the bundles have been written,
   * <ol>
   * <li>{@link WriteOperation#finalize} is given a list of the temporary
   * files containing the output bundles.
   * <li>During finalize, these temporary files are copied to final output locations and named
   * according to a file naming template.
   * <li>Finally, any temporary files that were created during the write are removed.
   * </ol>
   *
   * <p>Subclass implementations of WriteOperation must implement
   * {@link WriteOperation#createWriter} to return a concrete
   * FileBasedSinkWriter.
   *
   * <h2>Temporary and Output File Naming:</h2> During the write, bundles are written to temporary
   * files using the tempDirectory that can be provided via the constructor of
   * WriteOperation. These temporary files will be named
   * {@code {tempDirectory}/{bundleId}}, where bundleId is the unique id of the bundle.
   * For example, if tempDirectory is "gs://my-bucket/my_temp_output", the output for a
   * bundle with bundle id 15723 will be "gs://my-bucket/my_temp_output/15723".
   *
   * <p>Final output files are written to baseOutputFilename with the format
   * {@code {baseOutputFilename}-0000i-of-0000n.{extension}} where n is the total number of bundles
   * written and extension is the file extension. Both baseOutputFilename and extension are required
   * constructor arguments.
   *
   * <p>Subclass implementations can change the file naming template by supplying a value for
   * fileNamingTemplate.
   *
   * <p>Note that in the case of permanent failure of a bundle's write, no clean up of temporary
   * files will occur.
   *
   * <p>If there are no elements in the PCollection being written, no output will be generated.
   *
   * @param <T> the type of values written to the sink.
   */
  public abstract static class WriteOperation<T> implements Serializable {
    /**
     * The Sink that this WriteOperation will write to.
     */
    protected final FileBasedSink<T> sink;

    /**
     * Directory for temporary output files. Held as a {@link ValueProvider}, so the concrete
     * directory is only resolved when {@code get()} is called on it.
     */
    protected final ValueProvider<ResourceId> tempDirectory;

    /**
     * Whether windowed writes are being used. Initialized to {@code false} by the constructor and
     * changed through {@link #setWindowedWrites}.
     */
    @Experimental(Kind.FILESYSTEM)
    protected boolean windowedWrites;

    /**
     * Builds the resource for a temporary file by resolving {@code filename} as a file inside
     * {@code tempDirectory}.
     *
     * @param tempDirectory the directory that holds temporary files
     * @param filename the name of the temporary file
     * @return the resolved temporary file resource
     */
    @Experimental(Kind.FILESYSTEM)
    protected static ResourceId buildTemporaryFilename(ResourceId tempDirectory, String filename)
        throws IOException {
      ResourceId temporaryFile =
          tempDirectory.resolve(filename, StandardResolveOptions.RESOLVE_FILE);
      return temporaryFile;
    }

    /**
     * Constructs a WriteOperation using the default strategy for generating a temporary
     * directory from the sink's base output directory.
     *
     * <p>The default is a uniquely named directory resolved inside the base output directory:
     * {@code .temp-beam-<timestamp>-<id>}, where the timestamp is captured when the directory
     * builder is created and the id comes from a process-wide counter. For example, if the base
     * output directory is {@code /path/to/}, the temporary directory will look like
     * {@code /path/to/.temp-beam-<timestamp>-<id>/}.
     *
     * @param sink the FileBasedSink that will be used to configure this write operation.
     */
    public WriteOperation(FileBasedSink<T> sink) {
      this(sink, NestedValueProvider.of(
          sink.getBaseOutputDirectoryProvider(), new TemporaryDirectoryBuilder()));
    }

    /**
     * Derives the default temporary directory from the base output directory.
     *
     * <p>The directory name has the form {@code .temp-beam-<timestamp>-<id>}. Both the timestamp
     * and the id are captured when the builder is instantiated, so every call to {@link #apply}
     * on the same (possibly deserialized) instance produces the same name.
     */
    private static class TemporaryDirectoryBuilder
        implements SerializableFunction<ResourceId, ResourceId> {
      private static final AtomicLong TEMP_COUNT = new AtomicLong(0);
      // Joda-Time pattern letters are case-sensitive: "dd" is day of month, whereas "DD" is day
      // of year and would render e.g. 19 July as "07-200" instead of "07-19".
      private static final DateTimeFormatter TEMPDIR_TIMESTAMP =
          DateTimeFormat.forPattern("yyyy-MM-dd_HH-mm-ss");
      // The intent of the code is to have a consistent value of tempDirectory across
      // all workers, which wouldn't happen if now() was called inline.
      private final String timestamp = Instant.now().toString(TEMPDIR_TIMESTAMP);
      // Multiple different sinks may be used in the same output directory; use tempId to create a
      // separate temp directory for each.
      private final Long tempId = TEMP_COUNT.getAndIncrement();

      /**
       * Resolves the temporary directory inside the given base output directory.
       *
       * @param baseOutputDirectory the directory in which the temporary directory is placed
       * @return the temporary directory resource
       */
      @Override
      public ResourceId apply(ResourceId baseOutputDirectory) {
        // Temp directory has a timestamp and a unique ID
        String tempDirName = String.format(".temp-beam-%s-%s", timestamp, tempId);
        return baseOutputDirectory.resolve(tempDirName, StandardResolveOptions.RESOLVE_DIRECTORY);
      }
    }

    /**
     * Create a new WriteOperation that writes temporary files to an explicitly given directory.
     *
     * <p>The directory is wrapped in a {@link StaticValueProvider} and passed on to the private
     * constructor.
     *
     * @param sink the FileBasedSink that will be used to configure this write operation.
     * @param tempDirectory the base directory to be used for temporary output files.
     */
    @Experimental(Kind.FILESYSTEM)
    public WriteOperation(FileBasedSink<T> sink, ResourceId tempDirectory) {
      this(sink, StaticValueProvider.of(tempDirectory));
    }

    /**
     * Common constructor that both public constructors delegate to.
     *
     * @param sink the FileBasedSink that will be used to configure this write operation.
     * @param tempDirectory provider of the directory used for temporary output files.
     */
    private WriteOperation(
        FileBasedSink<T> sink, ValueProvider<ResourceId> tempDirectory) {
      // Windowed writes are off until setWindowedWrites is called.
      this.windowedWrites = false;
      this.tempDirectory = tempDirectory;
      this.sink = sink;
    }

    /**
     * Clients must implement to return a subclass of {@link Writer}. This
     * method must not mutate the state of the object.
     *
     * @return a new writer for this write operation
     * @throws Exception if the writer cannot be created
     */
    public abstract Writer<T> createWriter() throws Exception;

    /**
     * Indicates that the operation will be performing windowed writes.
     *
     * <p>The flag is read by {@link #finalize}: when it is set, finalize does not ask
     * {@link #removeTemporaryFiles} to match the whole temporary directory.
     *
     * @param windowedWrites whether windowed writes are being used
     */
    public void setWindowedWrites(boolean windowedWrites) {
      this.windowedWrites = windowedWrites;
    }

    /**
     * Finalizes the write: temporary output files are copied to their final locations and the
     * temporary files are then removed.
     *
     * <p>Subclasses may override finalization to customize it (e.g., to initiate some operation on
     * output bundles, merge them, etc.). {@code writerResults} contains the filenames of written
     * bundles.
     *
     * <p>An overriding implementation must be idempotent, because finalization may be executed
     * multiple times in the case of failure or for redundancy. Making it atomic is a best
     * practice.
     *
     * @param writerResults the results of writes (FileResult).
     */
    public void finalize(Iterable<FileResult> writerResults) throws Exception {
      // Work out where each temporary file belongs, then copy them all across.
      Map<ResourceId, ResourceId> tempToFinal = buildOutputFilenames(writerResults);
      copyToOutputFiles(tempToFinal);

      // Clean-up covers the entire temporary directory rather than just the files named in
      // writerResults: writerResults only lists successfully completed bundles, and the leftovers
      // of failed ones should go too. Because of GCS eventual consistency, matching files in the
      // temp directory is itself imperfect and may fail to delete some files.
      //
      // With windows or triggers, files are generated incrementally, so removing the entire
      // directory during finalize would be incorrect; in that case only the known files go.
      boolean cleanWholeTempDirectory = !windowedWrites;
      Set<ResourceId> writtenTempFiles = tempToFinal.keySet();
      removeTemporaryFiles(writtenTempFiles, cleanWholeTempDirectory);
    }

    /**
     * Maps the temporary file of every result to the final file it should be copied to.
     *
     * <p>The results must either all carry a shard number or all lack one. When they lack one,
     * shard numbers are assigned in order of temporary filename. The destination of each result
     * is then obtained from the sink's {@link FilenamePolicy}, base output directory and
     * extension.
     *
     * @param writerResults the results of the completed writes
     * @return a map from temporary file to final output file
     * @throws IllegalArgumentException if results with and without a shard number are mixed
     * @throws IllegalStateException if two results map to the same output file
     */
    @Experimental(Kind.FILESYSTEM)
    protected final Map<ResourceId, ResourceId> buildOutputFilenames(
        Iterable<FileResult> writerResults) {
      int shardCount = Iterables.size(writerResults);
      Map<ResourceId, ResourceId> tempToFinal = new HashMap<>();

      FilenamePolicy namingPolicy = getSink().getFilenamePolicy();
      ResourceId outputDirectory = getSink().getBaseOutputDirectoryProvider().get();

      // Either all results have a shard number set (if the sink is configured with a fixed
      // number of shards), or they all don't (otherwise). With no results at all, the shard
      // numbers count as set.
      boolean sawAnyResult = false;
      boolean shardsPreassigned = true;
      for (FileResult current : writerResults) {
        boolean currentHasShard = current.getShard() != UNKNOWN_SHARDNUM;
        if (sawAnyResult) {
          checkArgument(
              shardsPreassigned == currentHasShard,
              "Found a mix of files with and without shard number set: %s",
              current);
        } else {
          shardsPreassigned = currentHasShard;
          sawAnyResult = true;
        }
      }

      List<FileResult> numberedResults;
      if (shardsPreassigned) {
        numberedResults = Lists.newArrayList(writerResults);
      } else {
        // Sort files for idempotence. Sort by temporary filename.
        // Note that this codepath should not be used when processing triggered windows. In the
        // case of triggers, the list of FileResult objects in the Finalize iterable is not
        // deterministic, and might change over retries. This breaks the assumption below that
        // sorting the FileResult objects provides idempotency.
        Comparator<FileResult> byTempFilename =
            new Comparator<FileResult>() {
              @Override
              public int compare(FileResult left, FileResult right) {
                return left.getTempFilename().toString()
                    .compareTo(right.getTempFilename().toString());
              }
            };
        List<FileResult> ordered = Ordering.from(byTempFilename).sortedCopy(writerResults);
        numberedResults = Lists.newArrayList();
        int nextShard = 0;
        for (FileResult unnumbered : ordered) {
          numberedResults.add(unnumbered.withShard(nextShard));
          nextShard++;
        }
      }

      String extension;
      for (FileResult numbered : numberedResults) {
        checkArgument(
            numbered.getShard() != UNKNOWN_SHARDNUM,
            "Should have set shard number on %s",
            numbered);
        extension = getSink().getExtension();
        ResourceId destination =
            numbered.getDestinationFile(namingPolicy, outputDirectory, shardCount, extension);
        tempToFinal.put(numbered.getTempFilename(), destination);
      }

      // Every temporary file must end up with its own, distinct output file.
      Set<ResourceId> distinctDestinations = new HashSet<>(tempToFinal.values());
      checkState(distinctDestinations.size() == tempToFinal.size(),
         "Only generated %s distinct file names for %s files.",
         distinctDestinations.size(), tempToFinal.size());

      return tempToFinal;
    }

    /**
     * Copies every temporary file to its final output file.
     *
     * <p>Can be called from subclasses that override {@link WriteOperation#finalize}.
     *
     * <p>Each key of {@code filenames} is a source and the associated value is the destination it
     * is copied to. All pairs are handed to the file system in a single copy call, and sources
     * that are missing are ignored. An empty map only produces a log message.
     *
     * @param filenames map from temporary file to final output file.
     */
    @VisibleForTesting
    @Experimental(Kind.FILESYSTEM)
    final void copyToOutputFiles(Map<ResourceId, ResourceId> filenames)
        throws IOException {
      int fileCount = filenames.size();
      if (fileCount <= 0) {
        LOG.info("No output files to write.");
        return;
      }

      LOG.debug("Copying {} files.", fileCount);
      List<ResourceId> sources = new ArrayList<>(filenames.size());
      List<ResourceId> destinations = new ArrayList<>(filenames.size());
      for (Map.Entry<ResourceId, ResourceId> pair : filenames.entrySet()) {
        sources.add(pair.getKey());
        destinations.add(pair.getValue());
      }
      // During a failure case, files may have been deleted in an earlier step. Thus
      // we ignore missing files here.
      FileSystems.copy(sources, destinations, StandardMoveOptions.IGNORE_MISSING_FILES);
    }

    /**
     * Removes temporary output files. Uses the temporary directory to find files to remove.
     *
     * <p>Can be called from subclasses that override {@link WriteOperation#finalize}.
     * <b>Note:</b>If finalize is overridden and does <b>not</b> rename or otherwise finalize
     * temporary files, this method will remove them.
     *
     * <p>Matching the temporary directory and deleting the directory itself are best-effort: a
     * failure of either is logged, together with its cause, and does not fail the call. Deleting
     * the collected files is not best-effort. Deletion of the temporary directory itself is
     * attempted regardless of {@code shouldRemoveTemporaryDirectory}.
     *
     * @param knownFiles temporary files known to exist; these are always deleted
     * @param shouldRemoveTemporaryDirectory whether to additionally match the contents of the
     *     temporary directory and delete whatever is found
     * @throws IOException if deleting the collected temporary files fails
     */
    @VisibleForTesting
    @Experimental(Kind.FILESYSTEM)
    final void removeTemporaryFiles(
        Set<ResourceId> knownFiles, boolean shouldRemoveTemporaryDirectory) throws IOException {
      ResourceId tempDir = tempDirectory.get();
      LOG.debug("Removing temporary bundle output files in {}.", tempDir);

      // To partially mitigate the effects of filesystems with eventually-consistent
      // directory matching APIs, we remove not only files that the filesystem says exist
      // in the directory (which may be incomplete), but also files that are known to exist
      // (produced by successfully completed bundles).

      // This may still fail to remove temporary outputs of some failed bundles, but at least
      // the common case (where all bundles succeed) is guaranteed to be fully addressed.
      Set<ResourceId> matches = new HashSet<>();
      // TODO: Windows OS cannot resolve and match '*' in the path;
      // ignore the exception for now to avoid failing the pipeline.
      if (shouldRemoveTemporaryDirectory) {
        try {
          MatchResult singleMatch = Iterables.getOnlyElement(
              FileSystems.match(Collections.singletonList(tempDir.toString() + "*")));
          for (Metadata matchResult : singleMatch.metadata()) {
            matches.add(matchResult.resourceId());
          }
        } catch (Exception e) {
          // Best-effort: keep going, but pass the exception to the logger so the cause is not lost.
          LOG.warn("Failed to match temporary files under: [{}].", tempDir, e);
        }
      }
      Set<ResourceId> allMatches = new HashSet<>(matches);
      allMatches.addAll(knownFiles);
      LOG.debug(
          "Removing {} temporary files found under {} ({} matched glob, {} known files)",
          allMatches.size(),
          tempDir,
          matches.size(),
          allMatches.size() - matches.size());
      FileSystems.delete(allMatches, StandardMoveOptions.IGNORE_MISSING_FILES);

      // Deletion of the temporary directory might fail, if not all temporary files are removed.
      try {
        FileSystems.delete(
            Collections.singletonList(tempDir), StandardMoveOptions.IGNORE_MISSING_FILES);
      } catch (Exception e) {
        // Best-effort as well; the exception is logged so the reason for the leftover is visible.
        LOG.warn("Failed to remove temporary directory: [{}].", tempDir, e);
      }
    }

    /**
     * Returns the FileBasedSink for this write operation.
     *
     * @return the sink passed to the constructor
     */
    public FileBasedSink<T> getSink() {
      return sink;
    }

    /**
     * Describes this operation as {@code SimpleClassName{tempDirectory=..., windowedWrites=...}}.
     *
     * <p>The temporary directory is printed as the resolved resource when the provider is
     * accessible, and as the provider's own string form otherwise.
     */
    @Override
    public String toString() {
      String shownTempDirectory;
      if (tempDirectory.isAccessible()) {
        shownTempDirectory = tempDirectory.get().toString();
      } else {
        shownTempDirectory = tempDirectory.toString();
      }
      return String.format(
          "%s{tempDirectory=%s, windowedWrites=%s}",
          getClass().getSimpleName(), shownTempDirectory, windowedWrites);
    }
  }

  /**
   * Returns the extension that will be written to the produced files.
   *
   * <p>The extension is the filename suffix of the configured {@link WritableByteChannelFactory}.
   * A missing or empty suffix yields the empty string, and a non-empty suffix is given a leading
   * {@code "."} when it does not already start with one.
   */
  protected final String getExtension() {
    String suffix = writableByteChannelFactory.getFilenameSuffix();
    if (suffix == null || suffix.isEmpty()) {
      return "";
    }
    if (suffix.startsWith(".")) {
      return suffix;
    }
    return "." + suffix;
  }

  /**
   * Abstract writer that writes a bundle to a {@link FileBasedSink}. Subclass
   * implementations provide a method that can write a single value to a
   * {@link WritableByteChannel}.
   *
   * <p>Subclass implementations may also override methods that write headers and footers before and
   * after the values in a bundle, respectively, as well as provide a MIME type for the output
   * channel.
   *
   * <p>Multiple {@link Writer} instances may be created on the same worker, and therefore
   * any access to static members or methods should be thread safe.
   *
   * @param <T> the type of values to write.
   */
  public abstract static class Writer<T> {
    private static final Logger LOG = LoggerFactory.getLogger(Writer.class);

    /** The write operation this writer belongs to; checked to be non-null by the constructor. */
    private final WriteOperation<T> writeOperation;

    /** Unique id for this output bundle. */
    private String id;

    // NOTE(review): window and paneInfo are presumably only populated for windowed writes --
    // confirm against the open methods.
    private BoundedWindow window;
    private PaneInfo paneInfo;
    // Starts at -1. NOTE(review): presumably the same value as UNKNOWN_SHARDNUM -- confirm.
    private int shard = -1;

    /** The output file for this bundle. May be null if opening failed. */
    private @Nullable ResourceId outputFile;

    /**
     * The channel to write to.
     */
    private WritableByteChannel channel;

    /**
     * The MIME type used in the creation of the output channel (if the file system supports it).
     *
     * <p>This is the default for the sink, but it may be overridden by a supplied
     * {@link WritableByteChannelFactory}. For example, {@link TextIO.Write} uses
     * {@link MimeTypes#TEXT} by default but if {@link CompressionType#BZIP2} is set then
     * the MIME type will be overridden to {@link MimeTypes#BINARY}.
     */
    private final String mimeType;

    /**
     * Construct a new {@link Writer} that will produce files of the given MIME type.
     */
    public Writer(WriteOperation<T> writeOperation, String mimeType) {
      checkNotNull(writeOperation);
      this.writeOperation = writeOperation;
      this.mimeType = mimeType;
    }

    /**
     * Called with the channel that a subclass will write its header, footer, and values to.
     * Subclasses should either keep a reference to the channel provided or create and keep a
     * reference to an appropriate object that they will use to write to it.
     *
     * <p>Called before any subsequent calls to writeHeader, writeFooter, and write.
     */
    protected abstract void prepareWrite(WritableByteChannel channel) throws Exception;

    /**
     * Writes header at the beginning of output files. Nothing by default; subclasses may override.
     */
    protected void writeHeader() throws Exception {}

    /**
     * Writes footer at the end of output files. Nothing by default; subclasses may override.
     */
    protected void writeFooter() throws Exception {}

    /**
     * Called after all calls to {@link #writeHeader}, {@link #write} and {@link #writeFooter}.
     * If any resources opened in the write processes need to be flushed, flush them here.
     */
    protected void finishWrite() throws Exception {}

    /**
     * Performs bundle initialization. For example, creates a temporary file for writing or
     * initializes any state that will be used across calls to {@link Writer#write}.
     *
     * <p>The unique id that is given to open should be used to ensure that the writer's output does
     * not interfere with the output of other Writers, as a bundle may be executed many times for
     * fault tolerance.
     *
     * <p>The window and paneInfo arguments are populated when windowed writes are requested. shard
     * id populated for the case of static sharding. In cases where the runner is dynamically
     * picking sharding, shard might be set to -1.
     */
    public final void openWindowed(String uId, BoundedWindow window, PaneInfo paneInfo, int shard)
        throws Exception {
      if (!getWriteOperation().windowedWrites) {
        throw new IllegalStateException("openWindowed called a non-windowed sink.");
      }
      open(uId, window, paneInfo, shard);
    }

    /**
     * Called for each value in the bundle.
     */
    public abstract void write(T value) throws Exception;

    /**
     * Similar to {@link #openWindowed} however for the case where unwindowed writes were
     * requested.
     */
    public final void openUnwindowed(String uId, int shard) throws Exception {
      if (getWriteOperation().windowedWrites) {
        throw new IllegalStateException("openUnwindowed called a windowed sink.");
      }
      open(uId, null, null, shard);
    }

    // Helper function to close a channel, on exception cases.
    // Always throws prior exception, with any new closing exception suppressed.
    private static void closeChannelAndThrow(
        WritableByteChannel channel, ResourceId filename, Exception prior) throws Exception {
      try {
        channel.close();
      } catch (Exception e) {
        LOG.error("Closing channel for {} failed.", filename, e);
        prior.addSuppressed(e);
        throw prior;
      }
    }

    private void open(String uId,
                      @Nullable BoundedWindow window,
                      @Nullable PaneInfo paneInfo,
                      int shard) throws Exception {
      this.id = uId;
      this.window = window;
      this.paneInfo = paneInfo;
      this.shard = shard;
      ResourceId tempDirectory = getWriteOperation().tempDirectory.get();
      outputFile = tempDirectory.resolve(id, StandardResolveOptions.RESOLVE_FILE);
      verifyNotNull(
          outputFile, "FileSystems are not allowed to return null from resolve: %s", tempDirectory);

      final WritableByteChannelFactory factory =
          getWriteOperation().getSink().writableByteChannelFactory;
      // The factory may force a MIME type or it may return null, indicating to use the sink's MIME.
      String channelMimeType = firstNonNull(factory.getMimeType(), mimeType);
      LOG.debug("Opening {} for write with MIME type {}.", outputFile, channelMimeType);
      WritableByteChannel tempChannel = FileSystems.create(outputFile, channelMimeType);
      try {
        channel = factory.create(tempChannel);
      } catch (Exception e) {
        // If we have opened the underlying channel but fail to open the compression channel,
        // we should still close the underlying channel.
        closeChannelAndThrow(tempChannel, outputFile, e);
      }

      // The caller shouldn't have to close() this Writer if it fails to open(), so close
      // the channel if prepareWrite() or writeHeader() fails.
      String step = "";
      try {
        LOG.debug("Preparing write to {}.", outputFile);
        prepareWrite(channel);

        LOG.debug("Writing header to {}.", outputFile);
        writeHeader();
      } catch (Exception e) {
        LOG.error("Beginning write to {} failed, closing channel.", step, outputFile, e);
        closeChannelAndThrow(channel, outputFile, e);
      }

      LOG.debug("Starting write of bundle {} to {}.", this.id, outputFile);
    }

    public final void cleanup() throws Exception {
      if (outputFile != null) {
        // outputFile may be null if open() was not called or failed.
        FileSystems.delete(
            Collections.singletonList(outputFile), StandardMoveOptions.IGNORE_MISSING_FILES);
      }
    }

    /** Closes the channel and returns the bundle result. */
    public final FileResult close() throws Exception {
      checkState(outputFile != null, "FileResult.close cannot be called with a null outputFile");

      LOG.debug("Writing footer to {}.", outputFile);
      try {
        writeFooter();
      } catch (Exception e) {
        LOG.error("Writing footer to {} failed, closing channel.", outputFile, e);
        closeChannelAndThrow(channel, outputFile, e);
      }

      LOG.debug("Finishing write to {}.", outputFile);
      try {
        finishWrite();
      } catch (Exception e) {
        LOG.error("Finishing write to {} failed, closing channel.", outputFile, e);
        closeChannelAndThrow(channel, outputFile, e);
      }

      checkState(
          channel.isOpen(),
          "Channel %s to %s should only be closed by its owner: %s", channel, outputFile);

      LOG.debug("Closing channel to {}.", outputFile);
      try {
        channel.close();
      } catch (Exception e) {
        throw new IOException(String.format("Failed closing channel to %s", outputFile), e);
      }

      FileResult result = new FileResult(outputFile, shard, window, paneInfo);
      LOG.debug("Result for bundle {}: {}", this.id, outputFile);
      return result;
    }

    /**
     * Return the WriteOperation that this Writer belongs to.
     */
    public WriteOperation<T> getWriteOperation() {
      return writeOperation;
    }
  }

  /**
   * Result of a single bundle write. Holds the temporary file the bundle was written to, the shard
   * number, and the window and pane the bundle belongs to. The final output filename is not
   * stored; it is computed on demand by {@link #getDestinationFile}.
   *
   * <p>The window and pane may be null: {@link Writer#openUnwindowed} produces results without
   * them, and {@link #getDestinationFile} treats a null window as an unwindowed write.
   */
  public static final class FileResult {
    private final ResourceId tempFilename;
    private final int shard;
    private final BoundedWindow window;
    private final PaneInfo paneInfo;

    /**
     * Creates a result for the given temporary file.
     *
     * @param tempFilename the temporary file the bundle was written to
     * @param shard the shard number of the bundle
     * @param window the window of the bundle, or null for unwindowed writes
     * @param paneInfo the pane of the bundle, or null for unwindowed writes
     */
    @Experimental(Kind.FILESYSTEM)
    public FileResult(ResourceId tempFilename, int shard, BoundedWindow window, PaneInfo paneInfo) {
      this.tempFilename = tempFilename;
      this.shard = shard;
      this.window = window;
      this.paneInfo = paneInfo;
    }

    /** Returns the temporary file the bundle was written to. */
    @Experimental(Kind.FILESYSTEM)
    public ResourceId getTempFilename() {
      return tempFilename;
    }

    /** Returns the shard number of this result. */
    public int getShard() {
      return shard;
    }

    /** Returns a copy of this result that differs only in its shard number. */
    public FileResult withShard(int shard) {
      return new FileResult(tempFilename, shard, window, paneInfo);
    }

    /** Returns the window of this result; may be null. */
    public BoundedWindow getWindow() {
      return window;
    }

    /** Returns the pane of this result; may be null. */
    public PaneInfo getPaneInfo() {
      return paneInfo;
    }

    /**
     * Asks the given policy for the final destination of this result.
     *
     * <p>When a window is present, {@link FilenamePolicy#windowedFilename} is used with a
     * {@link WindowedContext}; otherwise {@link FilenamePolicy#unwindowedFilename} is used with a
     * {@link Context}.
     *
     * @param policy the policy that names the output file
     * @param outputDirectory the directory passed through to the policy
     * @param numShards the total number of shards; must be positive
     * @param extension the extension passed through to the policy
     * @throws IllegalArgumentException if the shard number of this result is
     *     {@code UNKNOWN_SHARDNUM} or {@code numShards} is not positive
     */
    @Experimental(Kind.FILESYSTEM)
    public ResourceId getDestinationFile(FilenamePolicy policy, ResourceId outputDirectory,
                                         int numShards, String extension) {
      checkArgument(
          getShard() != UNKNOWN_SHARDNUM,
          "Cannot compute a destination file for a result with an unknown shard number: %s",
          this);
      checkArgument(numShards > 0, "numShards must be positive, but was %s", numShards);
      if (getWindow() != null) {
        return policy.windowedFilename(outputDirectory, new WindowedContext(
            getWindow(), getPaneInfo(), getShard(), numShards), extension);
      } else {
        return policy.unwindowedFilename(outputDirectory, new Context(getShard(), numShards),
            extension);
      }
    }

    @Override
    public String toString() {
      return MoreObjects.toStringHelper(FileResult.class)
          .add("tempFilename", tempFilename)
          .add("shard", shard)
          .add("window", window)
          .add("paneInfo", paneInfo)
          .toString();
    }
  }

  /**
   * A coder for {@link FileResult} objects.
   *
   * <p>The encoded form is, in order: the temporary filename as a UTF-8 string, the window, the
   * pane info, and the shard number. The window and the pane info are both encoded through a
   * {@link NullableCoder}.
   */
  public static final class FileResultCoder extends StructuredCoder<FileResult> {
    private static final Coder<String> FILENAME_CODER = StringUtf8Coder.of();
    private static final Coder<Integer> SHARD_CODER = VarIntCoder.of();
    private static final Coder<PaneInfo> PANE_INFO_CODER = NullableCoder.of(PaneInfoCoder.INSTANCE);

    /** The supplied window coder, wrapped in a {@link NullableCoder}. */
    private final Coder<BoundedWindow> windowCoder;

    protected FileResultCoder(Coder<BoundedWindow> windowCoder) {
      this.windowCoder = NullableCoder.of(windowCoder);
    }

    /** Creates a coder that encodes windows with the given window coder. */
    public static FileResultCoder of(Coder<BoundedWindow> windowCoder) {
      return new FileResultCoder(windowCoder);
    }

    @Override
    public List<? extends Coder<?>> getCoderArguments() {
      List<Coder<BoundedWindow>> arguments = Arrays.asList(windowCoder);
      return arguments;
    }

    /**
     * Writes the temporary filename, window, pane info and shard of {@code value}, in that order.
     *
     * @throws CoderException if {@code value} is null
     */
    @Override
    public void encode(FileResult value, OutputStream outStream) throws IOException {
      if (value == null) {
        throw new CoderException("cannot encode a null value");
      }
      String tempPath = value.getTempFilename().toString();
      FILENAME_CODER.encode(tempPath, outStream);

      BoundedWindow resultWindow = value.getWindow();
      windowCoder.encode(resultWindow, outStream);

      PaneInfo resultPane = value.getPaneInfo();
      PANE_INFO_CODER.encode(resultPane, outStream);

      SHARD_CODER.encode(value.getShard(), outStream);
    }

    /**
     * Reads the fields in the order written by {@link #encode} and rebuilds the
     * {@link FileResult}, resolving the filename as a non-directory resource.
     */
    @Override
    public FileResult decode(InputStream inStream) throws IOException {
      String tempPath = FILENAME_CODER.decode(inStream);
      BoundedWindow decodedWindow = windowCoder.decode(inStream);
      PaneInfo decodedPane = PANE_INFO_CODER.decode(inStream);
      int decodedShard = SHARD_CODER.decode(inStream);

      ResourceId tempResource = FileSystems.matchNewResource(tempPath, false /* isDirectory */);
      return new FileResult(tempResource, decodedShard, decodedWindow, decodedPane);
    }

    /** Checks each component coder for determinism, in encoding order. */
    @Override
    public void verifyDeterministic() throws NonDeterministicException {
      for (Coder<?> component :
          Arrays.<Coder<?>>asList(FILENAME_CODER, windowCoder, PANE_INFO_CODER, SHARD_CODER)) {
        component.verifyDeterministic();
      }
    }
  }

  /**
   * Implementations create instances of {@link WritableByteChannel} used by {@link FileBasedSink}
   * and related classes to allow <em>decorating</em>, or otherwise transforming, the raw data that
   * would normally be written directly to the {@link WritableByteChannel} passed into
   * {@link WritableByteChannelFactory#create(WritableByteChannel)}.
   *
   * <p>Subclasses should override {@link #toString()} with something meaningful, as it is used when
   * building {@link DisplayData}.
   */
  public interface WritableByteChannelFactory extends Serializable {
    /**
     * Wraps the given channel and returns the channel that output should be written to.
     *
     * <p>{@link Writer} calls this once per output file, passing the channel it has just created
     * for that file, and closes the passed-in channel itself if this method throws.
     *
     * @param channel the {@link WritableByteChannel} to wrap
     * @return the {@link WritableByteChannel} to be used during output
     * @throws IOException presumably if the wrapping channel cannot be set up; the exact
     *     conditions are implementation specific
     */
    WritableByteChannel create(WritableByteChannel channel) throws IOException;

    /**
     * Returns the MIME type that should be used for the files that will hold the output data. May
     * return {@code null} if this {@code WritableByteChannelFactory} does not meaningfully change
     * the MIME type (e.g., for {@link CompressionType#UNCOMPRESSED}).
     *
     * <p>When {@code null} is returned, {@link Writer} falls back to the MIME type it was
     * constructed with.
     *
     * @see MimeTypes
     * @see <a href=
     *      'http://www.iana.org/assignments/media-types/media-types.xhtml'>http://www.iana.org/assignments/media-types/media-types.xhtml</a>
     */
    @Nullable
    String getMimeType();

    /**
     * Returns an optional filename suffix, e.g., ".gz" is returned by
     * {@link CompressionType#GZIP}.
     *
     * <p>{@link FileBasedSink#getExtension()} treats {@code null} as the empty string and
     * prepends a {@code "."} to a non-empty suffix that does not already start with one.
     *
     * @return the filename suffix, or {@code null} if there is none
     */
    @Nullable
    String getFilenameSuffix();
  }
}