/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package gobblin.util; import java.io.FileNotFoundException; import java.io.IOException; import java.util.Comparator; import java.util.List; import org.apache.commons.lang.ArrayUtils; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.google.common.collect.Lists; import com.google.common.primitives.Longs; /** * Utility class for listing files on a {@link FileSystem}. * * @see FileSystem */ public class FileListUtils { private static final Logger LOG = LoggerFactory.getLogger(FileListUtils.class); public static final Comparator<FileStatus> LATEST_MOD_TIME_ORDER = new Comparator<FileStatus>() { @Override public int compare(FileStatus file1, FileStatus file2) { return Longs.compare(Long.valueOf(file2.getModificationTime()), Long.valueOf(file1.getModificationTime())); } }; public static final PathFilter NO_OP_PATH_FILTER = new PathFilter() { @Override public boolean accept(Path path) { return true; } }; public static List<FileStatus> listFilesRecursively(FileSystem fs, Path path) throws IOException { return listFilesRecursively(fs, path, NO_OP_PATH_FILTER); } public static List<FileStatus> listFilesRecursively(FileSystem fs, Iterable<Path> paths) throws IOException { List<FileStatus> results = Lists.newArrayList(); for (Path path : paths) { results.addAll(listFilesRecursively(fs, path)); } return results; } /** * Helper method to list out all files under a specified path. The specified {@link PathFilter} is treated as a file * filter, that is it is only applied to file {@link Path}s. */ public static List<FileStatus> listFilesRecursively(FileSystem fs, Path path, PathFilter fileFilter) throws IOException { return listFilesRecursivelyHelper(fs, Lists.<FileStatus> newArrayList(), fs.getFileStatus(path), fileFilter, false); } /** * Helper method to list out all files under a specified path. If applyFilterToDirectories is false, the supplied * {@link PathFilter} will only be applied to files. */ public static List<FileStatus> listFilesRecursively(FileSystem fs, Path path, PathFilter fileFilter, boolean applyFilterToDirectories) throws IOException { return listFilesRecursivelyHelper(fs, Lists.<FileStatus> newArrayList(), fs.getFileStatus(path), fileFilter, applyFilterToDirectories); } private static List<FileStatus> listFilesRecursivelyHelper(FileSystem fs, List<FileStatus> files, FileStatus fileStatus, PathFilter fileFilter, boolean applyFilterToDirectories) throws FileNotFoundException, IOException { if (fileStatus.isDirectory()) { for (FileStatus status : fs.listStatus(fileStatus.getPath(), applyFilterToDirectories ? fileFilter : NO_OP_PATH_FILTER)) { if (fileStatus.isDirectory()) { listFilesRecursivelyHelper(fs, files, status, fileFilter, applyFilterToDirectories); } else { files.add(fileStatus); } } } else if (fileFilter.accept(fileStatus.getPath())) { files.add(fileStatus); } return files; } /** * Method to list out all files, or directory if no file exists, under a specified path. */ public static List<FileStatus> listMostNestedPathRecursively(FileSystem fs, Path path) throws IOException { return listMostNestedPathRecursively(fs, path, NO_OP_PATH_FILTER); } public static List<FileStatus> listMostNestedPathRecursively(FileSystem fs, Iterable<Path> paths) throws IOException { List<FileStatus> results = Lists.newArrayList(); for (Path path : paths) { results.addAll(listMostNestedPathRecursively(fs, path)); } return results; } /** * Method to list out all files, or directory if no file exists, under a specified path. * The specified {@link PathFilter} is treated as a file filter, that is it is only applied to file {@link Path}s. */ public static List<FileStatus> listMostNestedPathRecursively(FileSystem fs, Path path, PathFilter fileFilter) throws IOException { return listMostNestedPathRecursivelyHelper(fs, Lists.<FileStatus> newArrayList(), fs.getFileStatus(path), fileFilter); } private static List<FileStatus> listMostNestedPathRecursivelyHelper(FileSystem fs, List<FileStatus> files, FileStatus fileStatus, PathFilter fileFilter) throws IOException { if (fileStatus.isDirectory()) { FileStatus[] curFileStatus = fs.listStatus(fileStatus.getPath()); if (ArrayUtils.isEmpty(curFileStatus)) { files.add(fileStatus); } else { for (FileStatus status : curFileStatus) { listMostNestedPathRecursivelyHelper(fs, files, status, fileFilter); } } } else if (fileFilter.accept(fileStatus.getPath())) { files.add(fileStatus); } return files; } /** * Helper method to list out all paths under a specified path. If the {@link org.apache.hadoop.fs.FileSystem} is * unable to list the contents of a relevant directory, will log an error and skip. */ public static List<FileStatus> listPathsRecursively(FileSystem fs, Path path, PathFilter fileFilter) throws IOException { return listPathsRecursivelyHelper(fs, Lists.<FileStatus> newArrayList(), fs.getFileStatus(path), fileFilter); } private static List<FileStatus> listPathsRecursivelyHelper(FileSystem fs, List<FileStatus> files, FileStatus fileStatus, PathFilter fileFilter) { if (fileFilter.accept(fileStatus.getPath())) { files.add(fileStatus); } if (fileStatus.isDirectory()) { try { for (FileStatus status : fs.listStatus(fileStatus.getPath())) { listPathsRecursivelyHelper(fs, files, status, fileFilter); } } catch (IOException ioe) { LOG.error("Could not list contents of path " + fileStatus.getPath()); } } return files; } }