/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.io; import static com.google.common.base.MoreObjects.firstNonNull; import com.google.common.annotations.VisibleForTesting; import java.text.DecimalFormat; import java.util.Arrays; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.annotation.Nullable; import org.apache.beam.sdk.io.FileBasedSink.FilenamePolicy; import org.apache.beam.sdk.io.fs.ResolveOptions.StandardResolveOptions; import org.apache.beam.sdk.io.fs.ResourceId; import org.apache.beam.sdk.options.ValueProvider; import org.apache.beam.sdk.options.ValueProvider.NestedValueProvider; import org.apache.beam.sdk.transforms.SerializableFunction; import org.apache.beam.sdk.transforms.display.DisplayData; /** * A default {@link FilenamePolicy} for unwindowed files. This policy is constructed using three * parameters that together define the output name of a sharded file, in conjunction with the number * of shards and index of the particular file, using {@link #constructName}. * * <p>Most users of unwindowed files will use this {@link DefaultFilenamePolicy}. For more advanced * uses in generating different files for each window and other sharding controls, see the * {@code WriteOneFilePerWindow} example pipeline. */ public final class DefaultFilenamePolicy extends FilenamePolicy { /** The default sharding name template used in {@link #constructUsingStandardParameters}. */ public static final String DEFAULT_SHARD_TEMPLATE = ShardNameTemplate.INDEX_OF_MAX; // Pattern that matches shard placeholders within a shard template. private static final Pattern SHARD_FORMAT_RE = Pattern.compile("(S+|N+)"); /** * Constructs a new {@link DefaultFilenamePolicy}. * * @see DefaultFilenamePolicy for more information on the arguments to this function. */ @VisibleForTesting DefaultFilenamePolicy(ValueProvider<String> prefix, String shardTemplate, String suffix) { this.prefix = prefix; this.shardTemplate = shardTemplate; this.suffix = suffix; } /** * A helper function to construct a {@link DefaultFilenamePolicy} using the standard filename * parameters, namely a provided {@link ResourceId} for the output prefix, and possibly-null * shard name template and suffix. * * <p>Any filename component of the provided resource will be used as the filename prefix. * * <p>If provided, the shard name template will be used; otherwise {@link #DEFAULT_SHARD_TEMPLATE} * will be used. * * <p>If provided, the suffix will be used; otherwise the files will have an empty suffix. */ public static DefaultFilenamePolicy constructUsingStandardParameters( ValueProvider<ResourceId> outputPrefix, @Nullable String shardTemplate, @Nullable String filenameSuffix) { return new DefaultFilenamePolicy( NestedValueProvider.of(outputPrefix, new ExtractFilename()), firstNonNull(shardTemplate, DEFAULT_SHARD_TEMPLATE), firstNonNull(filenameSuffix, "")); } private final ValueProvider<String> prefix; private final String shardTemplate; private final String suffix; /** * Constructs a fully qualified name from components. * * <p>The name is built from a prefix, shard template (with shard numbers * applied), and a suffix. All components are required, but may be empty * strings. * * <p>Within a shard template, repeating sequences of the letters "S" or "N" * are replaced with the shard number, or number of shards respectively. The * numbers are formatted with leading zeros to match the length of the * repeated sequence of letters. * * <p>For example, if prefix = "output", shardTemplate = "-SSS-of-NNN", and * suffix = ".txt", with shardNum = 1 and numShards = 100, the following is * produced: "output-001-of-100.txt". */ public static String constructName( String prefix, String shardTemplate, String suffix, int shardNum, int numShards) { // Matcher API works with StringBuffer, rather than StringBuilder. StringBuffer sb = new StringBuffer(); sb.append(prefix); Matcher m = SHARD_FORMAT_RE.matcher(shardTemplate); while (m.find()) { boolean isShardNum = (m.group(1).charAt(0) == 'S'); char[] zeros = new char[m.end() - m.start()]; Arrays.fill(zeros, '0'); DecimalFormat df = new DecimalFormat(String.valueOf(zeros)); String formatted = df.format(isShardNum ? shardNum : numShards); m.appendReplacement(sb, formatted); } m.appendTail(sb); sb.append(suffix); return sb.toString(); } @Override @Nullable public ResourceId unwindowedFilename(ResourceId outputDirectory, Context context, String extension) { String filename = constructName( prefix.get(), shardTemplate, suffix, context.getShardNumber(), context.getNumShards()) + extension; return outputDirectory.resolve(filename, StandardResolveOptions.RESOLVE_FILE); } @Override public ResourceId windowedFilename(ResourceId outputDirectory, WindowedContext c, String extension) { throw new UnsupportedOperationException("There is no default policy for windowed file" + " output. Please provide an explicit FilenamePolicy to generate filenames."); } @Override public void populateDisplayData(DisplayData.Builder builder) { String filenamePattern; if (prefix.isAccessible()) { filenamePattern = String.format("%s%s%s", prefix.get(), shardTemplate, suffix); } else { filenamePattern = String.format("%s%s%s", prefix, shardTemplate, suffix); } builder.add( DisplayData.item("filenamePattern", filenamePattern) .withLabel("Filename Pattern")); } private static class ExtractFilename implements SerializableFunction<ResourceId, String> { @Override public String apply(ResourceId input) { if (input.isDirectory()) { return ""; } else { return firstNonNull(input.getFilename(), ""); } } } }