/*
 * Copyright © 2014-2016 Cask Data, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not
 * use this file except in compliance with the License. You may obtain a copy of
 * the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations under
 * the License.
 */

package co.cask.cdap.common.io;

import co.cask.cdap.common.lang.FunctionWithException;
import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;
import com.google.common.base.Throwables;
import com.google.common.collect.Iterators;
import com.google.common.io.Closeables;
import com.google.common.io.InputSupplier;
import com.google.common.io.OutputSupplier;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.twill.filesystem.FileContextLocationFactory;
import org.apache.twill.filesystem.HDFSLocationFactory;
import org.apache.twill.filesystem.LocalLocationFactory;
import org.apache.twill.filesystem.Location;
import org.apache.twill.filesystem.LocationFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.lang.reflect.Method;
import java.net.URI;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.security.PrivilegedExceptionAction;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedList;
import javax.annotation.Nullable;

/**
 * Utility class to help interaction with {@link Location}.
 */
public final class Locations {

  private static final Logger LOG = LoggerFactory.getLogger(Locations.class);

  // For converting local file into Location.
  private static final LocalLocationFactory LOCAL_LOCATION_FACTORY = new LocalLocationFactory();

  // For converting FileStatus to LocationStatus
  private static final FunctionWithException<FileStatus, LocationStatus, IOException> FILE_STATUS_TO_LOCATION_STATUS =
    new FunctionWithException<FileStatus, LocationStatus, IOException>() {
      @Override
      public LocationStatus apply(FileStatus status) throws IOException {
        return new LocationStatus(status.getPath().toUri(), status.getLen(), status.isDirectory());
      }
    };

  // For converting Location to LocationStatus
  private static final FunctionWithException<Location, LocationStatus, IOException> LOCATION_TO_LOCATION_STATUS =
    new FunctionWithException<Location, LocationStatus, IOException>() {
      @Override
      public LocationStatus apply(Location location) throws IOException {
        return new LocationStatus(location.toURI(), location.length(), location.isDirectory());
      }
    };

  public static final Comparator<Location> LOCATION_COMPARATOR = new Comparator<Location>() {
    @Override
    public int compare(Location o1, Location o2) {
      return o1.toURI().compareTo(o2.toURI());
    }
  };
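
  // Illustrative usage of LOCATION_COMPARATOR (a sketch only; assumes a List<Location> named
  // "locations" is in scope):
  //
  //   java.util.Collections.sort(locations, Locations.LOCATION_COMPARATOR);  // orders by URI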

  /**
   * Creates a new {@link InputSupplier} that can provide {@link SeekableInputStream} of the given path.
   *
   * @param fs The {@link org.apache.hadoop.fs.FileSystem} for the given path.
   * @param path The path to create {@link co.cask.cdap.common.io.SeekableInputStream} when requested.
   * @return An {@link InputSupplier}.
   */
  public static InputSupplier<? extends SeekableInputStream> newInputSupplier(final FileSystem fs, final Path path) {
    return new InputSupplier<SeekableInputStream>() {
      @Override
      public SeekableInputStream getInput() throws IOException {
        FSDataInputStream input = fs.open(path);
        try {
          return new DFSSeekableInputStream(input, createDFSStreamSizeProvider(fs, path, input));
        } catch (Throwable t) {
          Closeables.closeQuietly(input);
          Throwables.propagateIfInstanceOf(t, IOException.class);
          throw new IOException(t);
        }
      }
    };
  }

  /**
   * Creates a new {@link InputSupplier} that can provide {@link SeekableInputStream} from the given location.
   *
   * @param location Location for the input stream.
   * @return An {@link InputSupplier}.
   */
  public static InputSupplier<? extends SeekableInputStream> newInputSupplier(final Location location) {
    return new InputSupplier<SeekableInputStream>() {
      @Override
      public SeekableInputStream getInput() throws IOException {
        InputStream input = location.getInputStream();
        try {
          if (input instanceof FileInputStream) {
            return new FileSeekableInputStream((FileInputStream) input);
          }
          if (input instanceof FSDataInputStream) {
            FSDataInputStream dataInput = (FSDataInputStream) input;
            LocationFactory locationFactory = location.getLocationFactory();
            FileSystem fs = null;
            if (locationFactory instanceof HDFSLocationFactory) {
              fs = ((HDFSLocationFactory) locationFactory).getFileSystem();
            } else if (locationFactory instanceof FileContextLocationFactory) {
              final FileContextLocationFactory lf = (FileContextLocationFactory) locationFactory;
              fs = lf.getFileContext().getUgi().doAs(new PrivilegedExceptionAction<FileSystem>() {
                @Override
                public FileSystem run() throws IOException {
                  return FileSystem.get(lf.getConfiguration());
                }
              });
            }
            if (fs != null) {
              return new DFSSeekableInputStream(dataInput,
                                                createDFSStreamSizeProvider(fs, new Path(location.toURI()),
                                                                            dataInput));
            }
            // This shouldn't happen
            return new DFSSeekableInputStream(dataInput, new StreamSizeProvider() {
              @Override
              public long size() throws IOException {
                // The assumption is that if the FS is not HDFS, the location length tells the stream size
                return location.length();
              }
            });
          }
          throw new IOException("Failed to create SeekableInputStream from location " + location);
        } catch (Throwable t) {
          Closeables.closeQuietly(input);
          Throwables.propagateIfInstanceOf(t, IOException.class);
          throw new IOException(t);
        }
      }
    };
  }
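
  // Illustrative usage of the suppliers above (a sketch only; assumes "location" refers to an
  // existing file large enough that seeking to offset 128 is valid):
  //
  //   try (SeekableInputStream in = Locations.newInputSupplier(location).getInput()) {
  //     in.seek(128L);                       // jump to byte offset 128
  //     int firstByteAfterSeek = in.read();  // read resumes from the new position
  //   }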

  /**
   * Does some processing on the locations contained in the {@code startLocation}, using the {@code processor}. If
   * this location is a directory, all the locations contained in it will also be processed. If the {@code recursive}
   * flag is set to true, those locations that are directories will also be processed recursively. If the
   * {@code startLocation} is not a directory, this method will return the result of the processing of that location.
   *
   * @param startLocation location to start the processing from
   * @param recursive {@code true} if this method should be called on the directory {@link Location}s found from
   *                  {@code startLocation}. If the {@code startLocation} is a directory, all the locations under it
   *                  will be processed, regardless of the value of {@code recursive}
   * @param processor used to process locations. If the {@link Processor#process} method returns false on any
   *                  {@link Location} object processed, this method will return the current result of the processor.
   * @param <R> Type of the return value
   * @throws IOException if the locations could not be read
   */
  public static <R> R processLocations(Location startLocation, boolean recursive,
                                       Processor<LocationStatus, R> processor) throws IOException {
    boolean topLevel = true;
    LocationFactory lf = startLocation.getLocationFactory();
    LinkedList<LocationStatus> statusStack = new LinkedList<>();
    statusStack.push(getLocationStatus(startLocation));
    while (!statusStack.isEmpty()) {
      LocationStatus status = statusStack.poll();
      if (!processor.process(status)) {
        return processor.getResult();
      }
      if (status.isDir() && (topLevel || recursive)) {
        topLevel = false;
        RemoteIterator<LocationStatus> itor = listLocationStatus(lf.create(status.getUri()));
        while (itor.hasNext()) {
          statusStack.add(0, itor.next());
        }
      }
    }
    return processor.getResult();
  }

  /**
   * Tries to create a hard link to the given {@link Location} if it is on the local file system. If creating
   * the hard link fails, or if the location is not local, the location's content is copied to the given target path.
   *
   * @param location location to hard link or copy from
   * @param targetPath the target file path
   * @return the target path
   * @throws IOException if copying failed
   */
  public static File linkOrCopy(Location location, File targetPath) throws IOException {
    URI uri = location.toURI();
    if ("file".equals(uri.getScheme())) {
      try {
        Files.createLink(targetPath.toPath(), Paths.get(uri));
        return targetPath;
      } catch (Exception e) {
        // Ignore. Fall back to copying.
      }
    }
    try (InputStream is = location.getInputStream()) {
      Files.copy(is, targetPath.toPath());
    }
    return targetPath;
  }

  /**
   * Returns a {@link LocationStatus} describing the status of the given {@link Location}.
   */
  private static LocationStatus getLocationStatus(Location location) throws IOException {
    LocationFactory lf = location.getLocationFactory();
    if (lf instanceof HDFSLocationFactory) {
      return FILE_STATUS_TO_LOCATION_STATUS.apply(
        ((HDFSLocationFactory) lf).getFileSystem().getFileLinkStatus(new Path(location.toURI())));
    }
    if (lf instanceof FileContextLocationFactory) {
      return FILE_STATUS_TO_LOCATION_STATUS.apply(
        ((FileContextLocationFactory) lf).getFileContext().getFileLinkStatus(new Path(location.toURI())));
    }
    return LOCATION_TO_LOCATION_STATUS.apply(location);
  }

  /**
   * Returns a {@link RemoteIterator} of {@link LocationStatus} under a directory
   * represented by the given {@link Location}.
   */
  private static RemoteIterator<LocationStatus> listLocationStatus(Location location) throws IOException {
    LocationFactory lf = location.getLocationFactory();
    if (lf instanceof HDFSLocationFactory) {
      FileStatus[] fileStatuses = ((HDFSLocationFactory) lf).getFileSystem().listStatus(new Path(location.toURI()));
      return transform(asRemoteIterator(Iterators.forArray(fileStatuses)), FILE_STATUS_TO_LOCATION_STATUS);
    }
    if (lf instanceof FileContextLocationFactory) {
      FileContext fc = ((FileContextLocationFactory) lf).getFileContext();
      return transform(fc.listStatus(new Path(location.toURI())), FILE_STATUS_TO_LOCATION_STATUS);
    }
    return transform(asRemoteIterator(location.list().iterator()), LOCATION_TO_LOCATION_STATUS);
  }

  /**
   * Converts an {@link Iterator} into a {@link RemoteIterator}.
   */
  private static <E> RemoteIterator<E> asRemoteIterator(final Iterator<? extends E> itor) {
    return new RemoteIterator<E>() {
      @Override
      public boolean hasNext() throws IOException {
        return itor.hasNext();
      }

      @Override
      public E next() throws IOException {
        return itor.next();
      }
    };
  }
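
  // Illustrative usage of processLocations: summing the sizes of all files under a directory.
  // A sketch only; "dir" is assumed to be a directory Location, and LocationStatus is assumed to
  // expose the length passed to its constructor via a getLength() accessor.
  //
  //   long totalBytes = Locations.processLocations(dir, true, new Processor<LocationStatus, Long>() {
  //     private long size;
  //
  //     @Override
  //     public boolean process(LocationStatus status) {
  //       if (!status.isDir()) {
  //         size += status.getLength();
  //       }
  //       return true;  // returning false would stop the traversal early
  //     }
  //
  //     @Override
  //     public Long getResult() {
  //       return size;
  //     }
  //   });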

  /**
   * Transforms a {@link RemoteIterator} using a {@link FunctionWithException}.
   */
  private static <F, T> RemoteIterator<T> transform(final RemoteIterator<F> itor,
                                                    final FunctionWithException<F, T, IOException> transform) {
    return new RemoteIterator<T>() {
      @Override
      public boolean hasNext() throws IOException {
        return itor.hasNext();
      }

      @Override
      public T next() throws IOException {
        return transform.apply(itor.next());
      }
    };
  }

  /**
   * Creates a new {@link OutputSupplier} that can provide an {@link OutputStream} for the given location.
   *
   * @param location Location for the output.
   * @return An {@link OutputSupplier}.
   */
  public static OutputSupplier<? extends OutputStream> newOutputSupplier(final Location location) {
    return new OutputSupplier<OutputStream>() {
      @Override
      public OutputStream getOutput() throws IOException {
        return location.getOutputStream();
      }
    };
  }

  /**
   * Creates a {@link Location} instance which represents the parent of the given location.
   *
   * @param location location to extract the parent from.
   * @return an instance representing the parent location or {@code null} if there is no parent.
   */
  @Nullable
  public static Location getParent(Location location) {
    URI source = location.toURI();

    // If it is root, return null
    if ("/".equals(source.getPath())) {
      return null;
    }

    URI resolvedParent = URI.create(source.toString() + "/..").normalize();
    // NOTE: if there is a trailing slash at the end, rename(), getName() and other operations on files
    // do not work in MapR, so we remove the trailing slash (if any) at the end.
    if (resolvedParent.toString().endsWith("/")) {
      String parent = resolvedParent.toString();
      resolvedParent = URI.create(parent.substring(0, parent.length() - 1));
    }
    return location.getLocationFactory().create(resolvedParent);
  }

  /**
   * Creates the directory represented by the location if it does not exist.
   *
   * @param location the location for the directory.
   * @throws IOException If the location cannot be created
   */
  public static void mkdirsIfNotExists(Location location) throws IOException {
    // Need to check && mkdir && check to deal with race condition
    if (!location.isDirectory() && !location.mkdirs() && !location.isDirectory()) {
      throw new IOException("Failed to create directory at " + location);
    }
  }

  public static void deleteQuietly(Location location) {
    deleteQuietly(location, false);
  }

  public static void deleteQuietly(Location location, boolean recursive) {
    try {
      location.delete(recursive);
    } catch (IOException e) {
      LOG.error("IOException while deleting location {}", location, e);
    }
  }

  /**
   * Deletes the content of the given location, but keeps the location itself.
   */
  public static void deleteContent(Location location) {
    try {
      for (Location child : location.list()) {
        deleteQuietly(child, true);
      }
    } catch (IOException e) {
      LOG.error("IOException while deleting content of {}", location, e);
    }
  }

  /**
   * Converts the given file into a local {@link Location}.
   */
  public static Location toLocation(File file) {
    return LOCAL_LOCATION_FACTORY.create(file.getAbsoluteFile().toURI());
  }
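
  // Illustrative usage of the directory helpers above (a sketch only; the "staging" directory
  // name is hypothetical):
  //
  //   Location dir = Locations.toLocation(new File(System.getProperty("java.io.tmpdir"), "staging"));
  //   Locations.mkdirsIfNotExists(dir);  // no-op if the directory already exists
  //   Locations.deleteContent(dir);      // empties the directory but keeps it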

  /**
   * Creates a {@link StreamSizeProvider} for determining the size of the given {@link FSDataInputStream}.
   */
  private static StreamSizeProvider createDFSStreamSizeProvider(final FileSystem fs,
                                                                final Path path, FSDataInputStream input) {
    // This is the default provider to use. It will try to determine if the file is closed and, if so,
    // return its size.
    final StreamSizeProvider defaultSizeProvider = new StreamSizeProvider() {
      @Override
      public long size() throws IOException {
        if (fs instanceof DistributedFileSystem) {
          if (((DistributedFileSystem) fs).isFileClosed(path)) {
            return fs.getFileStatus(path).getLen();
          } else {
            return -1L;
          }
        }
        // If the underlying file system is not a DistributedFileSystem, just assume the file length
        // tells the size
        return fs.getFileStatus(path).getLen();
      }
    };

    // This supplier abstracts out the logic for getting the DFSInputStream#getFileLength method using reflection.
    // Reflection is used to avoid a ClassLoading error if the DFSInputStream class is moved or the method renamed.
    final InputStream wrappedStream = input.getWrappedStream();
    final Supplier<Method> getFileLengthMethodSupplier = Suppliers.memoize(new Supplier<Method>() {
      @Override
      public Method get() {
        try {
          // This is a hack to get to the underlying DFSInputStream.
          // Need to revisit it when we need to support a different distributed file system.
          Class<? extends InputStream> cls = wrappedStream.getClass();
          String expectedName = "org.apache.hadoop.hdfs.DFSInputStream";
          if (!cls.getName().equals(expectedName)) {
            throw new Exception("Expected wrapper class to be " + expectedName + ", but got " + cls.getName());
          }

          Method getFileLengthMethod = cls.getMethod("getFileLength");
          if (!getFileLengthMethod.isAccessible()) {
            getFileLengthMethod.setAccessible(true);
          }
          return getFileLengthMethod;
        } catch (Exception e) {
          throw Throwables.propagate(e);
        }
      }
    });

    return new StreamSizeProvider() {
      @Override
      public long size() throws IOException {
        // Try to determine the size using the default provider
        long size = defaultSizeProvider.size();
        if (size >= 0) {
          return size;
        }
        try {
          // If the default provider cannot determine the length, use the DFSInputStream#getFileLength method
          return (Long) getFileLengthMethodSupplier.get().invoke(wrappedStream);
        } catch (Throwable t) {
          LOG.warn("Unable to get actual file length from DFS input.", t);
          return size;
        }
      }
    };
  }

  private Locations() {
  }
}