/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.io; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Verify.verify; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Function; import com.google.common.base.Joiner; import com.google.common.base.Predicate; import com.google.common.collect.FluentIterable; import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Multimap; import com.google.common.collect.Ordering; import com.google.common.collect.Sets; import com.google.common.collect.TreeMultimap; import java.io.FileNotFoundException; import java.io.IOException; import java.nio.channels.ReadableByteChannel; import java.nio.channels.WritableByteChannel; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.ServiceLoader; import java.util.Set; import java.util.concurrent.atomic.AtomicReference; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.annotation.Nonnull; import org.apache.beam.sdk.annotations.Experimental; import org.apache.beam.sdk.annotations.Experimental.Kind; import org.apache.beam.sdk.annotations.Internal; import org.apache.beam.sdk.io.fs.CreateOptions; import org.apache.beam.sdk.io.fs.CreateOptions.StandardCreateOptions; import org.apache.beam.sdk.io.fs.MatchResult; import org.apache.beam.sdk.io.fs.MatchResult.Metadata; import org.apache.beam.sdk.io.fs.MatchResult.Status; import org.apache.beam.sdk.io.fs.MoveOptions; import org.apache.beam.sdk.io.fs.ResourceId; import org.apache.beam.sdk.options.PipelineOptions; import org.apache.beam.sdk.util.common.ReflectHelpers; import org.apache.beam.sdk.values.KV; /** * Clients facing {@link FileSystem} utility. */ @Experimental(Kind.FILESYSTEM) public class FileSystems { public static final String DEFAULT_SCHEME = "default"; private static final Pattern FILE_SCHEME_PATTERN = Pattern.compile("(?<scheme>[a-zA-Z][-a-zA-Z0-9+.]*):.*"); private static final AtomicReference<Map<String, FileSystem>> SCHEME_TO_FILESYSTEM = new AtomicReference<Map<String, FileSystem>>( ImmutableMap.<String, FileSystem>of("file", new LocalFileSystem())); /********************************** METHODS FOR CLIENT **********************************/ /** * This is the entry point to convert user-provided specs to {@link ResourceId ResourceIds}. * Callers should use {@link #match} to resolve users specs ambiguities before * calling other methods. * * <p>Implementation handles the following ambiguities of a user-provided spec: * <ol> * <li>{@code spec} could be a glob or a uri. {@link #match} should be able to tell and * choose efficient implementations. * <li>The user-provided {@code spec} might refer to files or directories. It is common that * users that wish to indicate a directory will omit the trailing path delimiter, such as * {@code "/tmp/dir"} in Linux. The {@link FileSystem} should be able to recognize a directory * with the trailing path delimiter omitted, but should always return a correct {@link ResourceId} * (e.g., {@code "/tmp/dir/"} inside the returned {@link MatchResult}. * </ol> * * <p>All {@link FileSystem} implementations should support glob in the final hierarchical path * component of {@link ResourceId}. This allows SDK libraries to construct file system agnostic * spec. {@link FileSystem FileSystems} can support additional patterns for user-provided specs. * * @return {@code List<MatchResult>} in the same order of the input specs. * * @throws IllegalArgumentException if specs are invalid -- empty or have different schemes. * @throws IOException if all specs failed to match due to issues like: * network connection, authorization. * Exception for individual spec is deferred until callers retrieve * metadata with {@link MatchResult#metadata()}. */ public static List<MatchResult> match(List<String> specs) throws IOException { return getFileSystemInternal(getOnlyScheme(specs)).match(specs); } /** * Like {@link #match(List)}, but for a single resource specification. * * <p>The function {@link #match(List)} is preferred when matching multiple patterns, as it allows * for bulk API calls to remote filesystems. */ public static MatchResult match(String spec) throws IOException { List<MatchResult> matches = match(Collections.singletonList(spec)); verify( matches.size() == 1, "FileSystem implementation for %s did not return exactly one MatchResult: %s", spec, matches); return matches.get(0); } /** * Returns the {@link Metadata} for a single file resource. Expects a resource specification * {@code spec} that matches a single result. * * @param spec a resource specification that matches exactly one result. * @return the {@link Metadata} for the specified resource. * @throws FileNotFoundException if the file resource is not found. * @throws IOException in the event of an error in the inner call to {@link #match}, * or if the given spec does not match exactly 1 result. */ public static Metadata matchSingleFileSpec(String spec) throws IOException { List<MatchResult> matches = FileSystems.match(Collections.singletonList(spec)); MatchResult matchResult = Iterables.getOnlyElement(matches); if (matchResult.status() == Status.NOT_FOUND) { throw new FileNotFoundException(String.format("File spec %s not found", spec)); } else if (matchResult.status() != Status.OK) { throw new IOException( String.format("Error matching file spec %s: status %s", spec, matchResult.status())); } else { List<Metadata> metadata = matchResult.metadata(); if (metadata.size() != 1) { throw new IOException( String.format( "Expecting spec %s to match exactly one file, but matched %s: %s", spec, metadata.size(), metadata)); } return metadata.get(0); } } /** * Returns {@link MatchResult MatchResults} for the given {@link ResourceId resourceIds}. * * @param resourceIds {@link ResourceId resourceIds} that might be derived from {@link #match}, * {@link ResourceId#resolve}, or {@link ResourceId#getCurrentDirectory()}. * * @throws IOException if all {@code resourceIds} failed to match due to issues like: * network connection, authorization. * Exception for individual {@link ResourceId} need to be deferred until callers retrieve * metadata with {@link MatchResult#metadata()}. */ public static List<MatchResult> matchResources(List<ResourceId> resourceIds) throws IOException { return match(FluentIterable .from(resourceIds) .transform(new Function<ResourceId, String>() { @Override public String apply(@Nonnull ResourceId resourceId) { return resourceId.toString(); }}) .toList()); } /** * Returns a write channel for the given {@link ResourceId}. * * <p>The resource is not expanded; it is used verbatim. * * @param resourceId the reference of the file-like resource to create * @param mimeType the mine type of the file-like resource to create */ public static WritableByteChannel create(ResourceId resourceId, String mimeType) throws IOException { return create(resourceId, StandardCreateOptions.builder().setMimeType(mimeType).build()); } /** * Returns a write channel for the given {@link ResourceId} with {@link CreateOptions}. * * <p>The resource is not expanded; it is used verbatim. * * @param resourceId the reference of the file-like resource to create * @param createOptions the configuration of the create operation */ public static WritableByteChannel create(ResourceId resourceId, CreateOptions createOptions) throws IOException { return getFileSystemInternal(resourceId.getScheme()).create(resourceId, createOptions); } /** * Returns a read channel for the given {@link ResourceId}. * * <p>The resource is not expanded; it is used verbatim. * * <p>If seeking is supported, then this returns a * {@link java.nio.channels.SeekableByteChannel}. * * @param resourceId the reference of the file-like resource to open */ public static ReadableByteChannel open(ResourceId resourceId) throws IOException { return getFileSystemInternal(resourceId.getScheme()).open(resourceId); } /** * Copies a {@link List} of file-like resources from one location to another. * * <p>The number of source resources must equal the number of destination resources. * Destination resources will be created recursively. * * <p>{@code srcResourceIds} and {@code destResourceIds} must have the same scheme. * * <p>It doesn't support copying globs. * * @param srcResourceIds the references of the source resources * @param destResourceIds the references of the destination resources */ public static void copy( List<ResourceId> srcResourceIds, List<ResourceId> destResourceIds, MoveOptions... moveOptions) throws IOException { validateSrcDestLists(srcResourceIds, destResourceIds); if (srcResourceIds.isEmpty()) { // Short-circuit. return; } List<ResourceId> srcToCopy = srcResourceIds; List<ResourceId> destToCopy = destResourceIds; if (Sets.newHashSet(moveOptions).contains( MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES)) { KV<List<ResourceId>, List<ResourceId>> existings = filterMissingFiles(srcResourceIds, destResourceIds); srcToCopy = existings.getKey(); destToCopy = existings.getValue(); } if (srcToCopy.isEmpty()) { return; } getFileSystemInternal(srcToCopy.iterator().next().getScheme()) .copy(srcToCopy, destToCopy); } /** * Renames a {@link List} of file-like resources from one location to another. * * <p>The number of source resources must equal the number of destination resources. * Destination resources will be created recursively. * * <p>{@code srcResourceIds} and {@code destResourceIds} must have the same scheme. * * <p>It doesn't support renaming globs. * * @param srcResourceIds the references of the source resources * @param destResourceIds the references of the destination resources */ public static void rename( List<ResourceId> srcResourceIds, List<ResourceId> destResourceIds, MoveOptions... moveOptions) throws IOException { validateSrcDestLists(srcResourceIds, destResourceIds); if (srcResourceIds.isEmpty()) { // Short-circuit. return; } List<ResourceId> srcToRename = srcResourceIds; List<ResourceId> destToRename = destResourceIds; if (Sets.newHashSet(moveOptions).contains( MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES)) { KV<List<ResourceId>, List<ResourceId>> existings = filterMissingFiles(srcResourceIds, destResourceIds); srcToRename = existings.getKey(); destToRename = existings.getValue(); } if (srcToRename.isEmpty()) { return; } getFileSystemInternal(srcToRename.iterator().next().getScheme()) .rename(srcToRename, destToRename); } /** * Deletes a collection of resources. * * <p>It is allowed but not recommended to delete directories recursively. * Callers depends on {@link FileSystems} and uses {@code DeleteOptions}. * * <p>{@code resourceIds} must have the same scheme. * * @param resourceIds the references of the resources to delete. */ public static void delete( Collection<ResourceId> resourceIds, MoveOptions... moveOptions) throws IOException { if (resourceIds.isEmpty()) { // Short-circuit. return; } Collection<ResourceId> resourceIdsToDelete; if (Sets.newHashSet(moveOptions).contains( MoveOptions.StandardMoveOptions.IGNORE_MISSING_FILES)) { resourceIdsToDelete = FluentIterable .from(matchResources(Lists.newArrayList(resourceIds))) .filter(new Predicate<MatchResult>() { @Override public boolean apply(@Nonnull MatchResult matchResult) { return !matchResult.status().equals(MatchResult.Status.NOT_FOUND); }}) .transformAndConcat(new Function<MatchResult, Iterable<Metadata>>() { @Nonnull @Override public Iterable<Metadata> apply(@Nonnull MatchResult input) { try { return Lists.newArrayList(input.metadata()); } catch (IOException e) { throw new RuntimeException( String.format("Failed to get metadata from MatchResult: %s.", input), e); } }}) .transform(new Function<Metadata, ResourceId>() { @Nonnull @Override public ResourceId apply(@Nonnull Metadata input) { return input.resourceId(); }}) .toList(); } else { resourceIdsToDelete = resourceIds; } if (resourceIdsToDelete.isEmpty()) { return; } getFileSystemInternal(resourceIdsToDelete.iterator().next().getScheme()) .delete(resourceIdsToDelete); } private static KV<List<ResourceId>, List<ResourceId>> filterMissingFiles( List<ResourceId> srcResourceIds, List<ResourceId> destResourceIds) throws IOException { validateSrcDestLists(srcResourceIds, destResourceIds); if (srcResourceIds.isEmpty()) { // Short-circuit. return KV.of(Collections.<ResourceId>emptyList(), Collections.<ResourceId>emptyList()); } List<ResourceId> srcToHandle = new ArrayList<>(); List<ResourceId> destToHandle = new ArrayList<>(); List<MatchResult> matchResults = matchResources(srcResourceIds); for (int i = 0; i < matchResults.size(); ++i) { if (!matchResults.get(i).status().equals(Status.NOT_FOUND)) { srcToHandle.add(srcResourceIds.get(i)); destToHandle.add(destResourceIds.get(i)); } } return KV.of(srcToHandle, destToHandle); } private static void validateSrcDestLists( List<ResourceId> srcResourceIds, List<ResourceId> destResourceIds) { checkArgument( srcResourceIds.size() == destResourceIds.size(), "Number of source resource ids %s must equal number of destination resource ids %s", srcResourceIds.size(), destResourceIds.size()); if (srcResourceIds.isEmpty()) { // nothing more to validate. return; } Set<String> schemes = FluentIterable.from(srcResourceIds) .append(destResourceIds) .transform(new Function<ResourceId, String>() { @Override public String apply(@Nonnull ResourceId resourceId) { return resourceId.getScheme(); }}) .toSet(); checkArgument( schemes.size() == 1, String.format( "Expect srcResourceIds and destResourceIds have the same scheme, but received %s.", Joiner.on(", ").join(schemes))); } private static String getOnlyScheme(List<String> specs) { checkArgument(!specs.isEmpty(), "Expect specs are not empty."); Set<String> schemes = FluentIterable.from(specs) .transform(new Function<String, String>() { @Override public String apply(String spec) { return parseScheme(spec); }}) .toSet(); return Iterables.getOnlyElement(schemes); } private static String parseScheme(String spec) { // The spec is almost, but not quite, a URI. In particular, // the reserved characters '[', ']', and '?' have meanings that differ // from their use in the URI spec. ('*' is not reserved). // Here, we just need the scheme, which is so circumscribed as to be // very easy to extract with a regex. Matcher matcher = FILE_SCHEME_PATTERN.matcher(spec); if (!matcher.matches()) { return "file"; } else { return matcher.group("scheme").toLowerCase(); } } /** * Internal method to get {@link FileSystem} for {@code scheme}. */ @VisibleForTesting static FileSystem getFileSystemInternal(String scheme) { String lowerCaseScheme = scheme.toLowerCase(); Map<String, FileSystem> schemeToFileSystem = SCHEME_TO_FILESYSTEM.get(); FileSystem rval = schemeToFileSystem.get(lowerCaseScheme); if (rval != null) { return rval; } rval = schemeToFileSystem.get(DEFAULT_SCHEME); if (rval != null) { return rval; } throw new IllegalStateException("Unable to find registrar for " + scheme); } /********************************** METHODS FOR REGISTRATION **********************************/ /** * Sets the default configuration in workers. * * <p>It will be used in {@link FileSystemRegistrar FileSystemRegistrars} for all schemes. * * <p>This is expected only to be used by runners after {@code Pipeline.run}, or in tests. */ @Internal public static void setDefaultPipelineOptions(PipelineOptions options) { checkNotNull(options, "options"); Set<FileSystemRegistrar> registrars = Sets.newTreeSet(ReflectHelpers.ObjectsClassComparator.INSTANCE); registrars.addAll(Lists.newArrayList( ServiceLoader.load(FileSystemRegistrar.class, ReflectHelpers.findClassLoader()))); SCHEME_TO_FILESYSTEM.set(verifySchemesAreUnique(options, registrars)); } @VisibleForTesting static Map<String, FileSystem> verifySchemesAreUnique( PipelineOptions options, Set<FileSystemRegistrar> registrars) { Multimap<String, FileSystem> fileSystemsBySchemes = TreeMultimap.create(Ordering.<String>natural(), Ordering.arbitrary()); for (FileSystemRegistrar registrar : registrars) { for (FileSystem fileSystem : registrar.fromOptions(options)) { fileSystemsBySchemes.put(fileSystem.getScheme(), fileSystem); } } for (Entry<String, Collection<FileSystem>> entry : fileSystemsBySchemes.asMap().entrySet()) { if (entry.getValue().size() > 1) { String conflictingFileSystems = Joiner.on(", ").join( FluentIterable.from(entry.getValue()) .transform(new Function<FileSystem, String>() { @Override public String apply(@Nonnull FileSystem input) { return input.getClass().getName(); }}) .toSortedList(Ordering.<String>natural())); throw new IllegalStateException(String.format( "Scheme: [%s] has conflicting filesystems: [%s]", entry.getKey(), conflictingFileSystems)); } } ImmutableMap.Builder<String, FileSystem> schemeToFileSystem = ImmutableMap.builder(); for (Entry<String, FileSystem> entry : fileSystemsBySchemes.entries()) { schemeToFileSystem.put(entry.getKey(), entry.getValue()); } return schemeToFileSystem.build(); } /** * Returns a new {@link ResourceId} that represents the named resource of a type corresponding * to the resource type. * * <p>The supplied {@code singleResourceSpec} is expected to be in a proper format, including * any necessary escaping, for the underlying {@link FileSystem}. * * <p>This function may throw an {@link IllegalArgumentException} if given an invalid argument, * such as when the specified {@code singleResourceSpec} is not a valid resource name. */ public static ResourceId matchNewResource(String singleResourceSpec, boolean isDirectory) { return getFileSystemInternal(parseScheme(singleResourceSpec)) .matchNewResource(singleResourceSpec, isDirectory); } }