/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.hadoop.raid; import java.io.FileNotFoundException; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.LinkedList; import java.util.List; import java.util.Stack; import java.util.concurrent.Executor; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; import java.util.concurrent.Semaphore; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.util.StringUtils; /** * Implements depth-first traversal using a Stack object. The traversal * can be stopped at any time and the state of traversal is saved. */ public class DirectoryTraversal { public static final Log LOG = LogFactory.getLog("org.apache.hadoop.raid.DirectoryTraversal"); private FileSystem fs; private List<FileStatus> paths; private int pathIdx = 0; // Next path to process. private Stack<Node> stack = new Stack<Node>(); private ExecutorService executor; private int numThreads; /** * A FileFilter object can be used to choose files during directory traversal. */ public interface FileFilter { /** * @return a boolean value indicating if the file passes the filter. */ boolean check(FileStatus f) throws IOException; } /** * Represents a directory node in directory traversal. */ static class Node { private FileStatus path; // Path that this node represents. private FileStatus[] elements; // Elements in the node. private int idx = 0; public Node(FileStatus path, FileStatus[] elements) { this.path = path; this.elements = elements; } public boolean hasNext() { return idx < elements.length; } public FileStatus next() { return elements[idx++]; } public FileStatus path() { return this.path; } } /** * Constructor. * @param fs The filesystem to use. * @param startPaths A list of paths that need to be traversed */ public DirectoryTraversal(FileSystem fs, List<FileStatus> startPaths) { this(fs, startPaths, 1); } public DirectoryTraversal( FileSystem fs, List<FileStatus> startPaths, int numThreads) { this.fs = fs; paths = startPaths; pathIdx = 0; this.numThreads = numThreads; executor = Executors.newFixedThreadPool(numThreads); } public List<FileStatus> getFilteredFiles(FileFilter filter, int limit) { List<FileStatus> filtered = new ArrayList<FileStatus>(); // We need this semaphore to block when the number of running workitems // is equal to the number of threads. FixedThreadPool limits the number // of threads, but not the queue size. This way we will limit the memory // usage. Semaphore slots = new Semaphore(numThreads); while (true) { synchronized(filtered) { if (filtered.size() >= limit) break; } FilterFileWorkItem work = null; try { Node next = getNextDirectoryNode(); if (next == null) { break; } work = new FilterFileWorkItem(filter, next, filtered, slots); slots.acquire(); } catch (InterruptedException ie) { break; } catch (IOException e) { break; } executor.execute(work); } try { // Wait for all submitted items to finish. slots.acquire(numThreads); // If this traversal is finished, shutdown the executor. if (doneTraversal()) { executor.shutdown(); executor.awaitTermination(1, TimeUnit.HOURS); } } catch (InterruptedException ie) { } return filtered; } class FilterFileWorkItem implements Runnable { FileFilter filter; Node dir; List<FileStatus> filtered; Semaphore slots; FilterFileWorkItem(FileFilter filter, Node dir, List<FileStatus> filtered, Semaphore slots) { this.slots = slots; this.filter = filter; this.dir = dir; this.filtered = filtered; } @SuppressWarnings("deprecation") public void run() { try { LOG.info("Initiating file filtering for " + dir.path.getPath()); for (FileStatus f: dir.elements) { if (!f.isFile()) { continue; } if (filter.check(f)) { synchronized(filtered) { filtered.add(f); } } } } catch (Exception e) { LOG.error("Error in directory traversal: " + StringUtils.stringifyException(e)); } finally { slots.release(); } } } /** * Return the next file. * @throws IOException */ public FileStatus getNextFile() throws IOException { // Check if traversal is done. while (!doneTraversal()) { // If traversal is not done, check if the stack is not empty. while (!stack.isEmpty()) { // If the stack is not empty, look at the top node. Node node = stack.peek(); // Check if the top node has an element. if (node.hasNext()) { FileStatus element = node.next(); // Is the next element a directory. if (!element.isDir()) { // It is a file, return it. return element; } // Next element is a directory, push it on to the stack and // continue try { pushNewNode(element); } catch (FileNotFoundException e) { // Ignore and move to the next element. } continue; } else { // Top node has no next element, pop it and continue. stack.pop(); continue; } } // If the stack is empty, do we have more paths? while (!paths.isEmpty()) { FileStatus next = paths.remove(0); pathIdx++; if (!next.isDir()) { return next; } try { pushNewNode(next); } catch (FileNotFoundException e) { continue; } break; } } return null; } /** * Gets the next directory in the tree. The algorithm returns deeper directories * first. * @return A FileStatus representing the directory. * @throws IOException */ public FileStatus getNextDirectory() throws IOException { Node dirNode = getNextDirectoryNode(); if (dirNode != null) { return dirNode.path; } return null; } private Node getNextDirectoryNode() throws IOException { // Check if traversal is done. while (!doneTraversal()) { // If traversal is not done, check if the stack is not empty. while (!stack.isEmpty()) { // If the stack is not empty, look at the top node. Node node = stack.peek(); // Check if the top node has an element. if (node.hasNext()) { FileStatus element = node.next(); // Is the next element a directory. if (element.isDir()) { // Next element is a directory, push it on to the stack and // continue try { pushNewNode(element); } catch (FileNotFoundException e) { // Ignore and move to the next element. } continue; } } else { stack.pop(); return node; } } // If the stack is empty, do we have more paths? while (!paths.isEmpty()) { FileStatus next = paths.remove(0); pathIdx++; if (next.isDir()) { try { pushNewNode(next); } catch (FileNotFoundException e) { continue; } break; } } } return null; } private void pushNewNode(FileStatus stat) throws IOException { if (!stat.isDir()) { return; } Path p = stat.getPath(); FileStatus[] elements = fs.listStatus(p); Node newNode = new Node(stat, (elements == null? new FileStatus[0]: elements)); stack.push(newNode); } public boolean doneTraversal() { return paths.isEmpty() && stack.isEmpty(); } }