/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.beam.sdk.io.hdfs;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.ImmutableList;
import java.io.IOException;
import java.net.URI;
import java.nio.ByteBuffer;
import java.nio.channels.Channels;
import java.nio.channels.ReadableByteChannel;
import java.nio.channels.SeekableByteChannel;
import java.nio.channels.WritableByteChannel;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.beam.sdk.io.FileSystem;
import org.apache.beam.sdk.io.fs.CreateOptions;
import org.apache.beam.sdk.io.fs.MatchResult;
import org.apache.beam.sdk.io.fs.MatchResult.Metadata;
import org.apache.beam.sdk.io.fs.MatchResult.Status;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

/**
 * Adapts {@link org.apache.hadoop.fs.FileSystem} connectors to be used as
 * Apache Beam {@link FileSystem FileSystems}.
 *
 * <p>The following Hadoop FileSystems are known to be unsupported:
 * <ul>
 *   <li>FTPFileSystem: Missing seek support within FTPInputStream</li>
 * </ul>
 *
 * <p>This implementation assumes that the underlying Hadoop
 * {@link org.apache.hadoop.fs.FileSystem} is seek-efficient when reading.
 * The following {@link FSInputStream} implementations (as of Hadoop 2.7.1)
 * provide their own seek implementations:
 * <ul>
 *   <li>HarFsInputStream</li>
 *   <li>S3InputStream</li>
 *   <li>DFSInputStream</li>
 *   <li>SwiftNativeInputStream</li>
 *   <li>NativeS3FsInputStream</li>
 *   <li>LocalFSFileInputStream</li>
 *   <li>NativeAzureFsInputStream</li>
 *   <li>S3AInputStream</li>
 * </ul>
 */
class HadoopFileSystem extends FileSystem<HadoopResourceId> {

  @VisibleForTesting
  final org.apache.hadoop.fs.FileSystem fileSystem;
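
  /**
   * Creates a {@code HadoopFileSystem} for the file system described by {@code configuration}.
   * {@code FileSystem.newInstance} is used, which returns a dedicated Hadoop FileSystem object
   * rather than one served from Hadoop's shared FileSystem cache.
   */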
  HadoopFileSystem(Configuration configuration) throws IOException {
    this.fileSystem = org.apache.hadoop.fs.FileSystem.newInstance(configuration);
  }

  @Override
  protected List<MatchResult> match(List<String> specs) {
    ImmutableList.Builder<MatchResult> resultsBuilder = ImmutableList.builder();
    for (String spec : specs) {
      try {
        FileStatus[] fileStatuses = fileSystem.globStatus(new Path(spec));
        if (fileStatuses == null) {
          // globStatus returns null when the spec is not a glob and the path does not exist;
          // report NOT_FOUND instead of failing with a NullPointerException below.
          resultsBuilder.add(MatchResult.create(Status.NOT_FOUND, ImmutableList.<Metadata>of()));
          continue;
        }
        List<Metadata> metadata = new ArrayList<>();
        for (FileStatus fileStatus : fileStatuses) {
          if (fileStatus.isFile()) {
            URI uri = dropEmptyAuthority(fileStatus.getPath().toUri().toString());
            metadata.add(Metadata.builder()
                .setResourceId(new HadoopResourceId(uri))
                .setIsReadSeekEfficient(true)
                .setSizeBytes(fileStatus.getLen())
                .build());
          }
        }
        resultsBuilder.add(MatchResult.create(Status.OK, metadata));
      } catch (IOException e) {
        resultsBuilder.add(MatchResult.create(Status.ERROR, e));
      }
    }
    return resultsBuilder.build();
  }

  @Override
  protected WritableByteChannel create(HadoopResourceId resourceId, CreateOptions createOptions)
      throws IOException {
    return Channels.newChannel(fileSystem.create(resourceId.toPath()));
  }

  @Override
  protected ReadableByteChannel open(HadoopResourceId resourceId) throws IOException {
    FileStatus fileStatus = fileSystem.getFileStatus(resourceId.toPath());
    return new HadoopSeekableByteChannel(fileStatus, fileSystem.open(resourceId.toPath()));
  }

  @Override
  protected void copy(
      List<HadoopResourceId> srcResourceIds,
      List<HadoopResourceId> destResourceIds) throws IOException {
    for (int i = 0; i < srcResourceIds.size(); ++i) {
      // Unfortunately Hadoop FileSystems don't expose a native copy operation, so we are forced
      // to use the inefficient implementation in FileUtil, which copies all the bytes through
      // the local machine.
      //
      // Hadoop's FileSystem does define a concat method, but only DistributedFileSystem appears
      // to implement it, and that implementation deletes the sources afterwards, which is not
      // what we want. All the other FileSystem implementations we looked at throw
      // UnsupportedOperationException from concat.
      FileUtil.copy(
          fileSystem,
          srcResourceIds.get(i).toPath(),
          fileSystem,
          destResourceIds.get(i).toPath(),
          false,
          true,
          fileSystem.getConf());
    }
  }

  @Override
  protected void rename(
      List<HadoopResourceId> srcResourceIds,
      List<HadoopResourceId> destResourceIds) throws IOException {
    for (int i = 0; i < srcResourceIds.size(); ++i) {
      fileSystem.rename(
          srcResourceIds.get(i).toPath(),
          destResourceIds.get(i).toPath());
    }
  }

  @Override
  protected void delete(Collection<HadoopResourceId> resourceIds) throws IOException {
    for (HadoopResourceId resourceId : resourceIds) {
      fileSystem.delete(resourceId.toPath(), false);
    }
  }

  @Override
  protected HadoopResourceId matchNewResource(String singleResourceSpec, boolean isDirectory) {
    if (singleResourceSpec.endsWith("/") && !isDirectory) {
      throw new IllegalArgumentException(String.format(
          "Expected file path but received directory path %s", singleResourceSpec));
    }
    return !singleResourceSpec.endsWith("/") && isDirectory
        ? new HadoopResourceId(dropEmptyAuthority(singleResourceSpec + "/"))
        : new HadoopResourceId(dropEmptyAuthority(singleResourceSpec));
  }

  @Override
  protected String getScheme() {
    return fileSystem.getScheme();
  }

  /** An adapter around {@link FSDataInputStream} that implements {@link SeekableByteChannel}. */
  private static class HadoopSeekableByteChannel implements SeekableByteChannel {
    private final FileStatus fileStatus;
    private final FSDataInputStream inputStream;
    private boolean closed;

    private HadoopSeekableByteChannel(FileStatus fileStatus, FSDataInputStream inputStream) {
      this.fileStatus = fileStatus;
      this.inputStream = inputStream;
      this.closed = false;
    }

    @Override
    public int read(ByteBuffer dst) throws IOException {
      if (closed) {
        throw new IOException("Channel is closed");
      }
      // Note: FSDataInputStream#read(ByteBuffer) requires the underlying stream to implement
      // ByteBufferReadable.
      return inputStream.read(dst);
    }

    @Override
    public int write(ByteBuffer src) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public long position() throws IOException {
      if (closed) {
        throw new IOException("Channel is closed");
      }
      return inputStream.getPos();
    }

    @Override
    public SeekableByteChannel position(long newPosition) throws IOException {
      if (closed) {
        throw new IOException("Channel is closed");
      }
      inputStream.seek(newPosition);
      return this;
    }

    @Override
    public long size() throws IOException {
      if (closed) {
        throw new IOException("Channel is closed");
      }
      return fileStatus.getLen();
    }

    @Override
    public SeekableByteChannel truncate(long size) throws IOException {
      throw new UnsupportedOperationException();
    }

    @Override
    public boolean isOpen() {
      return !closed;
    }

    @Override
    public void close() throws IOException {
      closed = true;
      inputStream.close();
    }
  }
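
  /**
   * Rewrites a URI of the form {@code scheme:///path} (an explicitly empty authority) to
   * {@code scheme:/path}; URIs without an empty authority are returned unchanged. Presumably
   * this is done so that resource ids built from Hadoop paths and from user-supplied specs
   * normalize to the same form.
   */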
  private static URI dropEmptyAuthority(String uriStr) {
    URI uri = URI.create(uriStr);
    String prefix = uri.getScheme() + ":///";
    if (uriStr.startsWith(prefix)) {
      return URI.create(uri.getScheme() + ":/" + uriStr.substring(prefix.length()));
    } else {
      return uri;
    }
  }
}