/**
* Copyright 2013 Netflix, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.netflix.aegisthus.tools;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReentrantLock;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import com.google.common.collect.Lists;
/**
* A small tool for recursing directories in Hadoop.
*/
public class DirectoryWalker implements Cloneable {
private static class Cache implements Runnable {
public static Cache with(DirectoryWalker dw) {
return new Cache(dw);
}
private DirectoryWalker dw;
private List<PartitionInfo> files;
private PartitionInfo partitionInfo;
protected Cache(DirectoryWalker dw) {
this.dw = dw;
}
public Cache cache(PartitionInfo partitionInfo) {
this.partitionInfo = partitionInfo;
return this;
}
public Cache into(List<PartitionInfo> files) {
this.files = files;
return this;
}
public void run() {
try {
List<PartitionInfo> files = null;
if (partitionInfo.getStatus(dw.conf) != null) {
files = Lists.newArrayList(dw.add(partitionInfo).partitionInfo());
if (files.size() > 0) {
LOG.info(String.format("%s : % 4d file(s)", partitionInfo
.getStatus(dw.conf)
.getPath()
.toUri()
.toString(), files.size()));
}
}
try {
dw.lock.lock();
this.files.addAll(files);
} finally {
dw.lock.unlock();
}
} catch (IOException e) {
LOG.warn(String.format("%s : doesn't exist", partitionInfo
.getStatus(dw.conf)
.getPath()
.toUri()
.toString()));
throw new RuntimeException(e);
}
}
}
public static class PartitionInfo {
private String location;
private String partitionName;
private FileStatus status;
public PartitionInfo(FileStatus status) {
this.status = status;
}
public PartitionInfo(String partitionName) {
this.partitionName = partitionName;
}
public PartitionInfo(String partitionName, FileStatus status) {
this.partitionName = partitionName;
this.status = status;
}
public PartitionInfo(String partitionName, String location) {
this.partitionName = partitionName;
this.setLocation(location);
}
public String getLocation() {
return location;
}
public String getPartitionName() {
return partitionName;
}
public FileStatus getStatus(Configuration conf) {
if (status == null && location != null) {
Path path = new Path(location);
try {
FileSystem fs = path.getFileSystem(conf);
status = fs.getFileStatus(path);
} catch (IOException e) {
throw new RuntimeException(e);
}
}
return status;
}
public void setLocation(String location) {
this.location = location;
}
public void setPartitionName(String partitionName) {
this.partitionName = partitionName;
}
public void setStatus(FileStatus status) {
this.status = status;
}
}
public static final Pattern batch = Pattern.compile("batch_?id=[0-9]+/?$");
public static final PathFilter HIDDEN_FILE_FILTER = new PathFilter() {
public boolean accept(Path p) {
String name = p.getName();
return !name.startsWith("_") && !name.startsWith(".");
}
};
private static final Log LOG = LogFactory.getLog(DirectoryWalker.class);
private static FileStatus[] filterBatch(FileStatus[] files, boolean batched) {
if (batched && files.length > 0 && batch.matcher(files[0].getPath().toString()).find()) {
FileStatus first = files[0];
FileStatus last = files[files.length - 1];
if (first.getPath().toUri().toString().compareTo(last.getPath().toUri().toString()) > 0) {
return new FileStatus[] { first };
}
return new FileStatus[] { last };
}
return files;
}
public static DirectoryWalker with(Configuration conf) {
return new DirectoryWalker(conf);
}
private String base = null;
private boolean batched = false;
private final Configuration conf;
private ExecutorService es = null;
private List<PartitionInfo> files = Lists.newArrayList();
public ChainedPathFilter filter = new ChainedPathFilter();
private FileSystem fs;
protected Lock lock = new ReentrantLock();
private boolean manifest = false;
private boolean omitHidden = true;
private boolean onlyOne = false;
private boolean recursive = true;
private boolean stopAdding = false;
private boolean threaded = false;
protected DirectoryWalker(Configuration conf) {
this.conf = conf;
}
public DirectoryWalker add(FileStatus status) throws IOException {
this.fs = status.getPath().getFileSystem(conf);
this.base = status.getPath().toUri().toString();
if (!base.endsWith("/")) {
base = base + "/";
}
process(new PartitionInfo(status));
return this;
}
public DirectoryWalker add(PartitionInfo partitionInfo) throws IOException {
this.base = partitionInfo.getStatus(conf).getPath().toUri().toString();
if (!base.endsWith("/")) {
base = base + "/";
}
this.fs = partitionInfo.getStatus(conf).getPath().getFileSystem(conf);
process(partitionInfo);
return this;
}
public DirectoryWalker add(Path path) throws IOException {
this.base = path.toUri().toString();
if (!base.endsWith("/")) {
base = base + "/";
}
this.fs = path.getFileSystem(conf);
process(new PartitionInfo(fs.getFileStatus(path)));
return this;
}
public DirectoryWalker add(String location) throws IOException {
if (stopAdding) {
return this;
}
Path path = new Path(location);
fs = path.getFileSystem(conf);
this.base = location;
if (!base.endsWith("/")) {
base = base + "/";
}
if (fs.exists(path)) {
FileStatus status = fs.getFileStatus(path);
process(new PartitionInfo(status));
} else {
LOG.warn(String.format("%s does not exist", location));
}
return this;
}
public DirectoryWalker addAll(List<String> locations) throws IOException {
if (stopAdding) {
return this;
}
for (String location : locations) {
if (threaded) {
// We do the thread here to get around having to do a
// getFileStatus on each string in the main thread,
// which is slow for a large number of partitions on s3.
try {
es.submit(Cache
.with((DirectoryWalker) this.clone())
.into(this.files)
.cache(new PartitionInfo(null, location)));
} catch (CloneNotSupportedException e) {
}
} else {
add(location);
}
}
return this;
}
public DirectoryWalker addAllPartitions(List<PartitionInfo> partitions) throws IOException {
if (stopAdding) {
return this;
}
for (PartitionInfo partitionInfo : partitions) {
process(partitionInfo);
}
return this;
}
public DirectoryWalker addAllStatuses(List<FileStatus> locations) throws IOException {
if (stopAdding) {
return this;
}
for (FileStatus status : locations) {
process(new PartitionInfo(status));
}
return this;
}
private void awaitTermination() {
if (threaded && !es.isShutdown()) {
es.shutdown();
try {
es.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS);
} catch (InterruptedException e) {
}
}
}
public DirectoryWalker batched(boolean batched) {
this.batched = batched;
return this;
}
protected boolean cache(PartitionInfo partitionInfo) throws IOException {
FileStatus status = partitionInfo.getStatus(conf);
if (manifest && status.isDir()) {
Path manifest = new Path(status.getPath(), "_manifest/_manifest");
if (fs.exists(manifest)) {
LOG.info(String.format("Using manifest for partition %s", partitionInfo.partitionName));
BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(manifest)));
String file;
while ((file = br.readLine()) != null) {
files.add(new PartitionInfo(partitionInfo.getPartitionName(), fs.getFileStatus(new Path(file))));
}
return true;
}
}
if (status.isDir()) {
if (filter.accept(status.getPath())) {
for (FileStatus file : filterBatch(fs.listStatus(status.getPath()), batched)) {
if (!omitHidden || HIDDEN_FILE_FILTER.accept(file.getPath())) {
PartitionInfo child = new PartitionInfo(partitionInfo.getPartitionName(), file);
if ((recursive || !file.isDir()) && !cache(child)) {
return false;
}
}
}
}
} else {
files.add(partitionInfo);
stopAdding = onlyOne;
return !onlyOne;
}
return true;
}
public DirectoryWalker clearFilter() {
filter.clear();
return this;
}
@Override
protected Object clone() throws CloneNotSupportedException {
DirectoryWalker dw = DirectoryWalker.with(conf);
dw.onlyOne = this.onlyOne;
dw.omitHidden = this.omitHidden;
dw.stopAdding = this.stopAdding;
dw.batched = this.batched;
dw.recursive = this.recursive;
dw.filter = this.filter;
dw.lock = this.lock;
dw.manifest = this.manifest;
return dw;
}
public DirectoryWalker filter(PathFilter filter) {
this.filter.add(filter);
return this;
}
public DirectoryWalker manifest(boolean manifest) {
this.manifest = manifest;
return this;
}
public DirectoryWalker omitHidden() {
omitHidden = true;
return this;
}
public DirectoryWalker omitHidden(boolean omit) {
omitHidden = omit;
return this;
}
/**
* This will stop the walk of the directory when we find a single file that
* matches the filter
*/
public DirectoryWalker onlyOne() {
onlyOne = true;
return this;
}
public DirectoryWalker onlyOne(boolean onlyOne) {
this.onlyOne = onlyOne;
return this;
}
public List<PartitionInfo> partitionInfo() {
awaitTermination();
return files;
}
public Iterable<Path> paths() {
awaitTermination();
List<Path> paths = Lists.newArrayList();
for (PartitionInfo partitionInfo : files) {
paths.add(partitionInfo.getStatus(conf).getPath());
}
return paths;
}
public Iterable<String> pathsString() {
awaitTermination();
List<String> paths = Lists.newArrayList();
for (PartitionInfo partitionInfo : files) {
paths.add(partitionInfo.getStatus(conf).getPath().toUri().getPath());
}
return paths;
}
protected boolean process(PartitionInfo partitionInfo) throws IOException {
if (threaded) {
try {
es.submit(Cache.with((DirectoryWalker) this.clone()).into(this.files).cache(partitionInfo));
} catch (CloneNotSupportedException e) {
}
return true;
} else {
return cache(partitionInfo);
}
}
public DirectoryWalker recursive(boolean recursive) {
this.recursive = recursive;
return this;
}
/**
* Example:<br/>
* DirectoryWalker.with(conf).add("/my/path").relativePathStrings();<br/>
* will return the relative paths of all the files under /my/path
*
* @return the relative paths in a directory.
*/
public Iterable<String> relativePathStrings() {
awaitTermination();
if (base == null) {
throw new RuntimeException("relativePathStrings will only work with a single base using add");
}
List<String> paths = Lists.newArrayList();
for (PartitionInfo partitionInfo : files) {
FileStatus status = partitionInfo.getStatus(conf);
String path = status.getPath().toUri().toString();
if (!path.contains(base)) {
throw new RuntimeException("relativePathStrings will only work with a single base using add");
}
LOG.info(path);
LOG.info(base);
path = path.substring(path.indexOf(base) + base.length());
try {
path = URLDecoder.decode(path, "UTF-8");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
LOG.info(path);
paths.add(path);
}
return paths;
}
public Iterable<FileStatus> statuses() {
awaitTermination();
List<FileStatus> statuses = Lists.newArrayList();
for (PartitionInfo partitionInfo : files) {
statuses.add(partitionInfo.getStatus(conf));
}
return statuses;
}
public DirectoryWalker threaded() {
return threaded(25);
}
public DirectoryWalker threaded(int threads) {
this.es = Executors.newFixedThreadPool(threads);
this.threaded = true;
return this;
}
}