/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.beam.sdk.util.gcsfs; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Strings.isNullOrEmpty; import com.google.api.services.storage.model.StorageObject; import java.io.File; import java.io.IOException; import java.io.Serializable; import java.net.URI; import java.net.URISyntaxException; import java.nio.file.FileSystem; import java.nio.file.LinkOption; import java.nio.file.Path; import java.nio.file.WatchEvent; import java.nio.file.WatchKey; import java.nio.file.WatchService; import java.util.Iterator; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.annotation.Nonnull; import javax.annotation.Nullable; /** * Implements the Java NIO {@link Path} API for Google Cloud Storage paths. * * <p>GcsPath uses a slash ('/') as a directory separator. Below is * a summary of how slashes are treated: * <ul> * <li> A GCS bucket may not contain a slash. An object may contain zero or * more slashes. * <li> A trailing slash always indicates a directory, which is compliant * with POSIX.1-2008. * <li> Slashes separate components of a path. Empty components are allowed, * these are represented as repeated slashes. An empty component always * refers to a directory, and always ends in a slash. * <li> {@link #getParent()}} always returns a path ending in a slash, as the * parent of a GcsPath is always a directory. * <li> Use {@link #resolve(String)} to append elements to a GcsPath -- this * applies the rules consistently and is highly recommended over any * custom string concatenation. * </ul> * * <p>GcsPath treats all GCS objects and buckets as belonging to the same * filesystem, so the root of a GcsPath is the GcsPath bucket="", object="". * * <p>Relative paths are not associated with any bucket. This matches common * treatment of Path in which relative paths can be constructed from one * filesystem and appended to another filesystem. * * @see <a href= * "http://docs.oracle.com/javase/tutorial/essential/io/pathOps.html" * >Java Tutorials: Path Operations</a> */ public class GcsPath implements Path, Serializable { public static final String SCHEME = "gs"; /** * Creates a GcsPath from a URI. * * <p>The URI must be in the form {@code gs://[bucket]/[path]}, and may not * contain a port, user info, a query, or a fragment. */ public static GcsPath fromUri(URI uri) { checkArgument(uri.getScheme().equalsIgnoreCase(SCHEME), "URI: %s is not a GCS URI", uri); checkArgument(uri.getPort() == -1, "GCS URI may not specify port: %s (%i)", uri, uri.getPort()); checkArgument( isNullOrEmpty(uri.getUserInfo()), "GCS URI may not specify userInfo: %s (%s)", uri, uri.getUserInfo()); checkArgument( isNullOrEmpty(uri.getQuery()), "GCS URI may not specify query: %s (%s)", uri, uri.getQuery()); checkArgument( isNullOrEmpty(uri.getFragment()), "GCS URI may not specify fragment: %s (%s)", uri, uri.getFragment()); return fromUri(uri.toString()); } /** * Pattern that is used to parse a GCS URL. * * <p>This is used to separate the components. Verification is handled * separately. */ public static final Pattern GCS_URI = Pattern.compile("(?<SCHEME>[^:]+)://(?<BUCKET>[^/]+)(/(?<OBJECT>.*))?"); /** * Creates a GcsPath from a URI in string form. * * <p>This does not use URI parsing, which means it may accept patterns that * the URI parser would not accept. */ public static GcsPath fromUri(String uri) { Matcher m = GCS_URI.matcher(uri); checkArgument(m.matches(), "Invalid GCS URI: %s", uri); checkArgument(m.group("SCHEME").equalsIgnoreCase(SCHEME), "URI: %s is not a GCS URI", uri); return new GcsPath(null, m.group("BUCKET"), m.group("OBJECT")); } /** * Pattern that is used to parse a GCS resource name. */ private static final Pattern GCS_RESOURCE_NAME = Pattern.compile("storage.googleapis.com/(?<BUCKET>[^/]+)(/(?<OBJECT>.*))?"); /** * Creates a GcsPath from a OnePlatform resource name in string form. */ public static GcsPath fromResourceName(String name) { Matcher m = GCS_RESOURCE_NAME.matcher(name); checkArgument(m.matches(), "Invalid GCS resource name: %s", name); return new GcsPath(null, m.group("BUCKET"), m.group("OBJECT")); } /** * Creates a GcsPath from a {@linkplain StorageObject}. */ public static GcsPath fromObject(StorageObject object) { return new GcsPath(null, object.getBucket(), object.getName()); } /** * Creates a GcsPath from bucket and object components. * * <p>A GcsPath without a bucket name is treated as a relative path, which * is a path component with no linkage to the root element. This is similar * to a Unix path that does not begin with the root marker (a slash). * GCS has different naming constraints and APIs for working with buckets and * objects, so these two concepts are kept separate to avoid accidental * attempts to treat objects as buckets, or vice versa, as much as possible. * * <p>A GcsPath without an object name is a bucket reference. * A bucket is always a directory, which could be used to lookup or add * files to a bucket, but could not be opened as a file. * * <p>A GcsPath containing neither bucket or object names is treated as * the root of the GCS filesystem. A listing on the root element would return * the buckets available to the user. * * <p>If {@code null} is passed as either parameter, it is converted to an * empty string internally for consistency. There is no distinction between * an empty string and a {@code null}, as neither are allowed by GCS. * * @param bucket a GCS bucket name, or none ({@code null} or an empty string) * if the object is not associated with a bucket * (e.g. relative paths or the root node). * @param object a GCS object path, or none ({@code null} or an empty string) * for no object. */ public static GcsPath fromComponents(@Nullable String bucket, @Nullable String object) { return new GcsPath(null, bucket, object); } @Nullable private transient FileSystem fs; @Nonnull private final String bucket; @Nonnull private final String object; /** * Constructs a GcsPath. * * @param fs the associated FileSystem, if any * @param bucket the associated bucket, or none ({@code null} or an empty * string) for a relative path component * @param object the object, which is a fully-qualified object name if bucket * was also provided, or none ({@code null} or an empty string) * for no object * @throws java.lang.IllegalArgumentException if the bucket of object names * are invalid. */ public GcsPath(@Nullable FileSystem fs, @Nullable String bucket, @Nullable String object) { if (bucket == null) { bucket = ""; } checkArgument(!bucket.contains("/"), "GCS bucket may not contain a slash"); checkArgument(bucket.isEmpty() || bucket.matches("[a-z0-9][-_a-z0-9.]+[a-z0-9]"), "GCS bucket names must contain only lowercase letters, numbers, " + "dashes (-), underscores (_), and dots (.). Bucket names " + "must start and end with a number or letter. " + "See https://developers.google.com/storage/docs/bucketnaming " + "for more details. Bucket name: " + bucket); if (object == null) { object = ""; } checkArgument( object.indexOf('\n') < 0 && object.indexOf('\r') < 0, "GCS object names must not contain Carriage Return or " + "Line Feed characters."); this.fs = fs; this.bucket = bucket; this.object = object; } /** * Returns the bucket name associated with this GCS path, or an empty string * if this is a relative path component. */ public String getBucket() { return bucket; } /** * Returns the object name associated with this GCS path, or an empty string * if no object is specified. */ public String getObject() { return object; } public void setFileSystem(FileSystem fs) { this.fs = fs; } @Override public FileSystem getFileSystem() { return fs; } // Absolute paths are those that have a bucket and the root path. @Override public boolean isAbsolute() { return !bucket.isEmpty() || object.isEmpty(); } @Override public GcsPath getRoot() { return new GcsPath(fs, "", ""); } @Override public GcsPath getFileName() { int nameCount = getNameCount(); if (nameCount < 2) { throw new UnsupportedOperationException( "Can't get filename from root path in the bucket: " + this); } return getName(nameCount - 1); } /** * Returns the <em>parent path</em>, or {@code null} if this path does not * have a parent. * * <p>Returns a path that ends in '/', as the parent path always refers to * a directory. */ @Override public GcsPath getParent() { if (bucket.isEmpty() && object.isEmpty()) { // The root path has no parent, by definition. return null; } if (object.isEmpty()) { // A GCS bucket. All buckets come from a common root. return getRoot(); } // Skip last character, in case it is a trailing slash. int i = object.lastIndexOf('/', object.length() - 2); if (i <= 0) { if (bucket.isEmpty()) { // Relative paths are not attached to the root node. return null; } return new GcsPath(fs, bucket, ""); } // Retain trailing slash. return new GcsPath(fs, bucket, object.substring(0, i + 1)); } @Override public int getNameCount() { int count = bucket.isEmpty() ? 0 : 1; if (object.isEmpty()) { return count; } // Add another for each separator found. int index = -1; while ((index = object.indexOf('/', index + 1)) != -1) { count++; } return object.endsWith("/") ? count : count + 1; } @Override public GcsPath getName(int count) { checkArgument(count >= 0); Iterator<Path> iterator = iterator(); for (int i = 0; i < count; ++i) { checkArgument(iterator.hasNext()); iterator.next(); } checkArgument(iterator.hasNext()); return (GcsPath) iterator.next(); } @Override public GcsPath subpath(int beginIndex, int endIndex) { checkArgument(beginIndex >= 0); checkArgument(endIndex > beginIndex); Iterator<Path> iterator = iterator(); for (int i = 0; i < beginIndex; ++i) { checkArgument(iterator.hasNext()); iterator.next(); } GcsPath path = null; while (beginIndex < endIndex) { checkArgument(iterator.hasNext()); if (path == null) { path = (GcsPath) iterator.next(); } else { path = path.resolve(iterator.next()); } ++beginIndex; } return path; } @Override public boolean startsWith(Path other) { if (other instanceof GcsPath) { GcsPath gcsPath = (GcsPath) other; return startsWith(gcsPath.bucketAndObject()); } else { return startsWith(other.toString()); } } @Override public boolean startsWith(String prefix) { return bucketAndObject().startsWith(prefix); } @Override public boolean endsWith(Path other) { if (other instanceof GcsPath) { GcsPath gcsPath = (GcsPath) other; return endsWith(gcsPath.bucketAndObject()); } else { return endsWith(other.toString()); } } @Override public boolean endsWith(String suffix) { return bucketAndObject().endsWith(suffix); } // TODO: support "." and ".." path components? @Override public GcsPath normalize() { return this; } @Override public GcsPath resolve(Path other) { if (other instanceof GcsPath) { GcsPath path = (GcsPath) other; if (path.isAbsolute()) { return path; } else { return resolve(path.getObject()); } } else { return resolve(other.toString()); } } @Override public GcsPath resolve(String other) { if (bucket.isEmpty() && object.isEmpty()) { // Resolve on a root path is equivalent to looking up a bucket and object. other = SCHEME + "://" + other; } if (other.startsWith(SCHEME + "://")) { GcsPath path = GcsPath.fromUri(other); path.setFileSystem(getFileSystem()); return path; } if (other.isEmpty()) { // An empty component MUST refer to a directory. other = "/"; } if (object.isEmpty()) { return new GcsPath(fs, bucket, other); } else if (object.endsWith("/")) { return new GcsPath(fs, bucket, object + other); } else { return new GcsPath(fs, bucket, object + "/" + other); } } @Override public Path resolveSibling(Path other) { throw new UnsupportedOperationException(); } @Override public Path resolveSibling(String other) { if (getNameCount() < 2) { throw new UnsupportedOperationException("Can't resolve the sibling of a root path: " + this); } GcsPath parent = getParent(); return (parent == null) ? fromUri(other) : parent.resolve(other); } @Override public Path relativize(Path other) { throw new UnsupportedOperationException(); } @Override public GcsPath toAbsolutePath() { return this; } @Override public GcsPath toRealPath(LinkOption... options) throws IOException { return this; } @Override public File toFile() { throw new UnsupportedOperationException(); } @Override public WatchKey register(WatchService watcher, WatchEvent.Kind<?>[] events, WatchEvent.Modifier... modifiers) throws IOException { throw new UnsupportedOperationException(); } @Override public WatchKey register(WatchService watcher, WatchEvent.Kind<?>... events) throws IOException { throw new UnsupportedOperationException(); } @Override public Iterator<Path> iterator() { return new NameIterator(fs, !bucket.isEmpty(), bucketAndObject()); } private static class NameIterator implements Iterator<Path> { private final FileSystem fs; private boolean fullPath; private String name; NameIterator(FileSystem fs, boolean fullPath, String name) { this.fs = fs; this.fullPath = fullPath; this.name = name; } @Override public boolean hasNext() { return !isNullOrEmpty(name); } @Override public GcsPath next() { int i = name.indexOf('/'); String component; if (i >= 0) { component = name.substring(0, i); name = name.substring(i + 1); } else { component = name; name = null; } if (fullPath) { fullPath = false; return new GcsPath(fs, component, ""); } else { // Relative paths have no bucket. return new GcsPath(fs, "", component); } } @Override public void remove() { throw new UnsupportedOperationException(); } } @Override public int compareTo(Path other) { if (!(other instanceof GcsPath)) { throw new ClassCastException(); } GcsPath path = (GcsPath) other; int b = bucket.compareTo(path.bucket); if (b != 0) { return b; } // Compare a component at a time, so that the separator char doesn't // get compared against component contents. Eg, "a/b" < "a-1/b". Iterator<Path> left = iterator(); Iterator<Path> right = path.iterator(); while (left.hasNext() && right.hasNext()) { String leftStr = left.next().toString(); String rightStr = right.next().toString(); int c = leftStr.compareTo(rightStr); if (c != 0) { return c; } } if (!left.hasNext() && !right.hasNext()) { return 0; } else { return left.hasNext() ? 1 : -1; } } @Override public boolean equals(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } GcsPath paths = (GcsPath) o; return bucket.equals(paths.bucket) && object.equals(paths.object); } @Override public int hashCode() { int result = bucket.hashCode(); result = 31 * result + object.hashCode(); return result; } @Override public String toString() { if (!isAbsolute()) { return object; } StringBuilder sb = new StringBuilder(); sb.append(SCHEME) .append("://"); if (!bucket.isEmpty()) { sb.append(bucket) .append('/'); } sb.append(object); return sb.toString(); } // TODO: Consider using resource names for all GCS paths used by the SDK. public String toResourceName() { StringBuilder sb = new StringBuilder(); sb.append("storage.googleapis.com/"); if (!bucket.isEmpty()) { sb.append(bucket).append('/'); } sb.append(object); return sb.toString(); } @Override public URI toUri() { try { return new URI(SCHEME, "//" + bucketAndObject(), null); } catch (URISyntaxException e) { throw new RuntimeException("Unable to create URI for GCS path " + this); } } private String bucketAndObject() { if (bucket.isEmpty()) { return object; } else { return bucket + "/" + object; } } }