package hip.util; import com.google.common.base.Function; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; import javax.annotation.Nullable; import java.io.IOException; import java.util.Arrays; import java.util.List; /** * Utilities to help with HDFS activities. */ public class HdfsIoUtils { public static final PathFilter hiddenFileFilter = new PathFilter() { public boolean accept(Path p) { String name = p.getName(); return !name.startsWith("_") && !name.startsWith("."); } }; /** * Glob-able mapping of String paths to Path objects, and in the process * expanding path patterns allowing for inputs such as "/some/path*". * * @param conf Hadoop config * @param paths paths to convert and glob * @return globbed paths * @throws IOException if something goes wrong */ public static Iterable<Path> stringsToPaths(Configuration conf, Iterable<String> paths) throws IOException { Iterable<Path> pathIterable = Iterables.transform(paths, new Function<String, Path>() { @Nullable @Override public Path apply(String path) { return new Path(path); } }); List<Path> result = Lists.newArrayList(); for (Path p : pathIterable) { FileSystem fs = p.getFileSystem(conf); FileStatus[] matches = fs.globStatus(p, hiddenFileFilter); if (matches != null && matches.length > 0) { for (FileStatus globStat : matches) { if (globStat.isDirectory()) { RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath()); while (iter.hasNext()) { LocatedFileStatus stat = iter.next(); if (hiddenFileFilter.accept(stat.getPath())) { if (stat.isDirectory()) { addInputPathRecursively(result, fs, stat.getPath(), hiddenFileFilter); } else { result.add(stat.getPath()); } } } } else { result.add(globStat.getPath()); } } } } return result; } /** * Add files in the input path recursively into the results. * * @param result The List to store all files. * @param fs The FileSystem. * @param path The input path. * @param inputFilter The input filter that can be used to filter files/dirs. * @throws IOException if something goes wrong */ public static void addInputPathRecursively(List<Path> result, FileSystem fs, Path path, PathFilter inputFilter) throws IOException { RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(path); while (iter.hasNext()) { LocatedFileStatus stat = iter.next(); if (inputFilter.accept(stat.getPath())) { if (stat.isDirectory()) { addInputPathRecursively(result, fs, stat.getPath(), inputFilter); } else { result.add(stat.getPath()); } } } } public static Iterable<Path> stringsToPaths(Configuration conf, String... paths) throws IOException { return stringsToPaths(conf, Arrays.asList(paths)); } }