package org.icij.extract.queue; import org.icij.kaxxa.concurrent.SealableLatch; import org.icij.kaxxa.events.Notifiable; import org.icij.kaxxa.concurrent.ExecutorProxy; import org.icij.extract.document.Document; import org.icij.extract.document.DocumentFactory; import org.icij.kaxxa.io.file.DosHiddenFileMatcher; import org.icij.kaxxa.io.file.PosixHiddenFileMatcher; import org.icij.kaxxa.io.file.SystemFileMatcher; import org.icij.task.Options; import org.icij.task.annotation.Option; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.Set; import java.util.List; import java.util.ArrayList; import java.util.EnumSet; import java.util.ArrayDeque; import java.io.IOException; import java.util.concurrent.*; import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.Files; import java.nio.file.SimpleFileVisitor; import java.nio.file.FileVisitOption; import java.nio.file.FileVisitResult; import java.nio.file.PathMatcher; import java.nio.file.FileSystem; import java.nio.file.attribute.BasicFileAttributes; /** * Scanner for scanning the directory tree starting at a given path. * * Each time {@link #scan} is called, the job is put in an unbounded queue and executed in serial. This makes sense as * it's usually the file system which is a bottleneck and not the CPU, so parallelization won't help. * * The {@link #scan} method is non-blocking, which is useful for creating parallelized producer-consumer setups, where * files are processed as they're scanned. * * Encountered documents are put in a given queue. This is a classic producer, putting elements in a queue which are * then extracted by a consumer. * * The queue should be bounded, to avoid the scanner filling up memory, but the bound should be high enough to create a * significant buffer between the scanner and the consumer. * * Documents are pushed into the queue synchronously and if the queue is bounded, only when a space becomes available. * * This implementation is thread-safe. * * @since 1.0.0-beta */ @Option(name = "includeHiddenFiles", description = "Don't ignore hidden files. On DOS file systems, this" + " means all files or directories with the \"hidden\" file attribute. On all other file systems, this means " + "all file or directories starting with a dot. Hidden files are ignored by default.") @Option(name = "includeOSFiles", description = "Include files and directories generated by common " + "operating systems. This includes \"Thumbs.db\" and \".DS_Store\". The list is not determined by the current " + "operating system. OS-generated files are ignored by default.") @Option(name = "includePattern", description = "Glob pattern for matching files e.g. \"**/*.{tif,pdf}\". " + "Files not matching the pattern will be ignored.", parameter = "pattern") @Option(name = "excludePattern", description = "Glob pattern for excluding files and directories. Files " + "and directories matching the pattern will be ignored.", parameter = "pattern") @Option(name = "followSymlinks", description = "Follow symbolic links, which are not followed by default.") @Option(name = "maxDepth", description = "The maximum depth to which the scanner will recurse.", parameter = "integer") public class Scanner extends ExecutorProxy { private static final Logger logger = LoggerFactory.getLogger(Scanner.class); protected final BlockingQueue<Document> queue; private final ArrayDeque<String> includeGlobs = new ArrayDeque<>(); private final ArrayDeque<String> excludeGlobs = new ArrayDeque<>(); private final DocumentFactory factory; private final SealableLatch latch; private final Notifiable notifiable; private long queued = 0; private int maxDepth = Integer.MAX_VALUE; private boolean followLinks = false; private boolean ignoreHiddenFiles = false; private boolean ignoreSystemFiles = true; /** * @see Scanner(BlockingQueue, SealableLatch, Notifiable) */ public Scanner(final DocumentFactory factory, final BlockingQueue<Document> queue) { this(factory, queue, null, null); } /** * @see Scanner(BlockingQueue, SealableLatch, Notifiable) */ public Scanner(final DocumentFactory factory, final BlockingQueue<Document> queue, final SealableLatch latch) { this(factory, queue, latch, null); } /** * Creates a {@code Scanner} that sends all results straight to the underlying {@link BlockingQueue<Document>} on a * single thread. * * @param queue results from the scanner will be put on this queue * @param latch signalled when a document is queued * @param notifiable receives notifications when new file documents are queued */ public Scanner(final DocumentFactory factory, final BlockingQueue<Document> queue, final SealableLatch latch, final Notifiable notifiable) { super(Executors.newSingleThreadExecutor()); this.factory = factory; this.queue = queue; this.notifiable = notifiable; this.latch = latch; } /** * Configure the scanner with the given options. * * @param options options for configuring the scanner * @return the scanner */ public Scanner configure(final Options<String> options) { options.get("includeOSFiles").parse().asBoolean().ifPresent(this::ignoreSystemFiles); options.get("includeHiddenFiles").parse().asBoolean().ifPresent(this::ignoreHiddenFiles); options.get("followSymlinks").parse().asBoolean().ifPresent(this::followSymLinks); options.get("includePattern").values().forEach(this::include); options.get("excludePattern").values().forEach(this::exclude); options.get("maxDepth").parse().asInteger().ifPresent(this::setMaxDepth); return this; } /** * Add a glob pattern for including files. Files not matching the pattern will be ignored. * * @param pattern the glob pattern */ public void include(final String pattern) { includeGlobs.add("glob:" + pattern); } /** * Add a glob pattern for excluding files and directories. * * @param pattern the glob pattern */ public void exclude(final String pattern) { excludeGlobs.add("glob:" + pattern); } /** * Set whether symlinks should be followed. * * @param followLinks whether to follow symlinks */ public void followSymLinks(final boolean followLinks) { this.followLinks = followLinks; } /** * Check whether symlinks will be followed. * * @return whether symlinks will be followed */ public boolean followSymLinks() { return followLinks; } /** * Set whether hidden files should be ignored. * * File names starting with a dot will always be ignored if set to {@literal true}, but DOS hidden files will * only be ignored if the filesystem supports the DOS hidden fileattribute. * * @param ignoreHiddenFiles whether to ignore hidden files */ public void ignoreHiddenFiles(final boolean ignoreHiddenFiles) { this.ignoreHiddenFiles = ignoreHiddenFiles; } /** * Check whether hidden files will be ignored. * * @return whether hidden files will be ignored */ public boolean ignoreHiddenFiles() { return ignoreHiddenFiles; } /** * Set whether system files should be ignored. * * @param ignoreSystemFiles whether to ignore system files */ public void ignoreSystemFiles(final boolean ignoreSystemFiles) { this.ignoreSystemFiles = ignoreSystemFiles; } /** * Check whether system files will be ignored. * * @return whether system files are ignore */ public boolean ignoreSystemFiles() { return ignoreSystemFiles; } /** * Set the maximum depth to recurse when scanning. * * @param maxDepth maximum depth */ public void setMaxDepth(final int maxDepth) { this.maxDepth = maxDepth; } /** * Get the currently set maximum depth to recurse when scanning. * * @return maximum depth */ public int getMaxDepth() { return maxDepth; } /** * Get the latch. * * @return The latch or null if none is set. */ public SealableLatch getLatch() { return latch; } /** * @return The total number of queued documents over the lifetime of this scanner. */ public long queued() { return queued; } /** * Queue a scanning job. * * Jobs are put in an unbounded queue and executed in serial, in a separate thread. * This method doesn't block. Call {@link #awaitTermination(long, TimeUnit)} to block. * * @param path the path to scan * @return A {@link Future} that can be used to wait on the result or cancel. */ public Future<Path> scan(final Path path) { final FileSystem fileSystem = path.getFileSystem(); final ScannerVisitor visitor = new ScannerVisitor(path); // In order to make hidden-file-ignoring logic more predictable, always ignore file names starting with a // dot, but only ignore DOS hidden files if the file system supports that attribute. if (ignoreHiddenFiles) { visitor.exclude(new PosixHiddenFileMatcher()); if (fileSystem.supportedFileAttributeViews().contains("dos")) { visitor.exclude(new DosHiddenFileMatcher()); } } if (ignoreSystemFiles) { visitor.exclude(new SystemFileMatcher()); } for (String excludeGlob : excludeGlobs) { visitor.exclude(fileSystem.getPathMatcher(excludeGlob)); } for (String includeGlob : includeGlobs) { visitor.include(fileSystem.getPathMatcher(includeGlob)); } logger.info(String.format("Queuing scan of: \"%s\".", path)); return executor.submit(visitor); } /** * Submit all of the given paths to the scanner for execution, returning a list of {@link Future} objects * representing those tasks. * * @see #scan(Path) * @return a {@link Future} for each path scanned */ public List<Future<Path>> scan(final Path[] paths) { final List<Future<Path>> futures = new ArrayList<>(); for (Path path : paths) futures.add(scan(path)); return futures; } /** * @see #scan(Path[]) */ public List<Future<Path>> scan(final String[] paths) { final Path[] _paths = new Path[paths.length]; for (int i = 0; i < paths.length; i++) _paths[i] = Paths.get(paths[i]); return scan(_paths); } /** * @see Scanner#scan(Path) */ public Future<Path> scan(final String path) { return scan(Paths.get(path)); } private class ScannerVisitor extends SimpleFileVisitor<Path> implements Callable<Path> { private final ArrayDeque<PathMatcher> includeMatchers = new ArrayDeque<>(); private final ArrayDeque<PathMatcher> excludeMatchers = new ArrayDeque<>(); private final Path path; /** * Instantiate a new task for scanning the given path. * * @param path the path to scan */ ScannerVisitor(final Path path) { this.path = path; } /** * Recursively walks the file tree of a directory. When walking is finished or stopped by an exception, the * latch is sealed and signalled. * * @return the path at which scanning started */ @Override public Path call() throws Exception { final Set<FileVisitOption> options; if (followLinks) { options = EnumSet.of(FileVisitOption.FOLLOW_LINKS); } else { options = EnumSet.noneOf(FileVisitOption.class); } logger.info(String.format("Starting scan of: \"%s\".", path)); try { Files.walkFileTree(path, options, maxDepth, this); } catch (IOException e) { logger.error(String.format("Error while scanning path: \"%s\".", path), e); throw e; } finally { if (null != latch){ latch.seal(); latch.signal(); } } logger.info(String.format("Completed scan of: \"%s\".", path)); return path; } /** * Queue a result from the scanner. Blocks until a queue slot is available. * * @throws InterruptedException if interrupted while waiting for a queue slot */ void queue(final Path file) throws InterruptedException { final Document document = factory.create(file); queue.put(document); queued++; if (null != latch) { latch.signal(); } if (null != notifiable) { notifiable.notifyListeners(file); } } /** * Add a path matcher for files to exclude. * * @param matcher the matcher */ void exclude(final PathMatcher matcher) { excludeMatchers.add(matcher); } /** * Add a path matcher for files to include. * * @param matcher the matcher */ void include(final PathMatcher matcher) { includeMatchers.add(matcher); } /** * Check whether a path should be excluded. * * @param path path to check * @return whether the path should be excluded */ boolean shouldExclude(final Path path) { return matches(path, excludeMatchers); } /** * Check whether a path should be included. * * @param path path to check * @return whether the path should be included */ boolean shouldInclude(final Path path) { return includeMatchers.size() == 0 || matches(path, includeMatchers); } /** * Check whether a path matches any of the given matchers. * * @param path path to check * @return whether the path matches */ private boolean matches(final Path path, final ArrayDeque<PathMatcher> matchers) { for (PathMatcher matcher : matchers) { if (matcher.matches(path)) { return true; } } return false; } @Override public FileVisitResult preVisitDirectory(final Path directory, final BasicFileAttributes attributes) throws IOException { if (Thread.currentThread().isInterrupted()) { logger.warn("Scanner interrupted. Terminating job."); return FileVisitResult.TERMINATE; } if (shouldExclude(directory)) { return FileVisitResult.SKIP_SUBTREE; } logger.info(String.format("Entering directory: \"%s\".", directory)); return FileVisitResult.CONTINUE; } @Override public FileVisitResult visitFile(final Path file, final BasicFileAttributes attributes) throws IOException { if (Thread.currentThread().isInterrupted()) { logger.warn("Scanner interrupted. Terminating job."); return FileVisitResult.TERMINATE; } // From the documentation: // "When following links, and the attributes of the target cannot be read, then this method attempts to // parse the BasicFileAttributes of the link." if (attributes.isSymbolicLink()) { if (followLinks) { logger.warn(String.format("Unable to read attributes of symlink target: \"%s\". Skipping.", file)); } return FileVisitResult.CONTINUE; } // Only skip the file if all of the include matchers return false. if (!shouldInclude(file)) { return FileVisitResult.CONTINUE; } if (shouldExclude(file)) { return FileVisitResult.CONTINUE; } try { queue(file); } catch (InterruptedException e) { Thread.currentThread().interrupt(); logger.warn("Interrupted. Terminating scanner."); return FileVisitResult.TERMINATE; } catch (Exception e) { logger.error(String.format("Exception while queuing file: \"%s\".", file), e); throw e; } return FileVisitResult.CONTINUE; } @Override public FileVisitResult visitFileFailed(final Path file, final IOException e) throws IOException { // If the file or directory was going to be excluded anyway, suppress the exception. // Don't re-throw the error. Scanning must be robust. Just log it. if (!shouldExclude(file)) { logger.error(String.format("Unable to read attributes of file: \"%s\".", file), e); } return FileVisitResult.CONTINUE; } } }