//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.collectionreaders;
import static java.nio.file.StandardWatchEventKinds.ENTRY_CREATE;
import static java.nio.file.StandardWatchEventKinds.ENTRY_DELETE;
import static java.nio.file.StandardWatchEventKinds.ENTRY_MODIFY;
import static java.nio.file.StandardWatchEventKinds.OVERFLOW;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.FileSystems;
import java.nio.file.FileVisitResult;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.WatchEvent;
import java.nio.file.WatchKey;
import java.nio.file.WatchService;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import uk.gov.dstl.baleen.core.utils.BaleenDefaults;
import uk.gov.dstl.baleen.exceptions.InvalidParameterException;
import uk.gov.dstl.baleen.uima.BaleenCollectionReader;
import uk.gov.dstl.baleen.uima.IContentExtractor;
/**
* Inspect a folder for unprocessed files, and process them through the pipeline.
* Currently, the list of previously processed files is held in memory and so will be lost if the server is restarted.
* This can be avoided by using the MoveSourceConsumer (for example), and removing the files after processing
*
* @baleen.javadoc
*/
public class FolderReader extends BaleenCollectionReader {
/**
* A list of folders to watch
*
* @baleen.config <i>Current directory</i>
*/
public static final String PARAM_FOLDERS = "folders";
@ConfigurationParameter(name = PARAM_FOLDERS, defaultValue = {})
private String[] folders;
/**
* Should files be reprocessed when modified?
*
* @baleen.config false
*/
public static final String PARAM_REPROCESS_ON_MODIFY = "reprocess";
@ConfigurationParameter(name = PARAM_REPROCESS_ON_MODIFY, defaultValue="false")
private boolean reprocessOnModify = false;
/**
* Should folders be processed recursively (i.e. should we watch subfolders too)?
*
* @baleen.config true
*/
public static final String PARAM_RECURSIVE = "recursive";
@ConfigurationParameter(name = PARAM_RECURSIVE, defaultValue="true")
private boolean recursive = true;
/**
* The content extractor to use to extract content from files
*
* @baleen.config Value of BaleenDefaults.DEFAULT_CONTENT_EXTRACTOR
*/
public static final String PARAM_CONTENT_EXTRACTOR = "contentExtractor";
@ConfigurationParameter(name = PARAM_CONTENT_EXTRACTOR, defaultValue=BaleenDefaults.DEFAULT_CONTENT_EXTRACTOR)
private String contentExtractor;
/**
* A list of patterns that the filename must match.
* This can be used for specifying allowed file extensions (e.g. .*\\.txt would accept all text files).
*
* Files are accepted if they match any of the specified patterns.
* If no patterns are specified, then all files will be accepted.
*
* Patterns are treated as case insensitive.
*
* @baleen.config
*/
public static final String PARAM_ACCEPTED_PATTERNS = "acceptedFilenames";
@ConfigurationParameter(name = PARAM_ACCEPTED_PATTERNS, defaultValue = {})
private String[] acceptedFilenames;
private List<Pattern> acceptedFilenamesSet = new ArrayList<>();
private WatchService watcher;
private Map<WatchKey, Path> watchKeys = new HashMap<>();
private List<Path> queue = new ArrayList<>();
private IContentExtractor extractor;
@Override
public void doInitialize(UimaContext context) throws ResourceInitializationException {
if(folders == null || folders.length == 0){
folders = new String[1];
folders[0] = System.getProperty("user.dir");
}
for(String pattern : acceptedFilenames){
try{
Pattern p = Pattern.compile(pattern, Pattern.CASE_INSENSITIVE);
acceptedFilenamesSet.add(p);
}catch(PatternSyntaxException pse){
getMonitor().warn("Could not compile pattern '{}', it will not be included in the set of accepted filenames", pattern, pse);
}
}
try{
extractor = getContentExtractor(contentExtractor);
}catch(InvalidParameterException ipe){
throw new ResourceInitializationException(ipe);
}
extractor.initialize(context, getConfigParameters(context));
try{
watcher = FileSystems.getDefault().newWatchService();
}catch(IOException ioe){
throw new ResourceInitializationException(ioe);
}
registerFolders();
}
private void registerFolders() {
for(String folder : folders){
try{
Path p = Paths.get(folder);
p = p.toRealPath();
if(recursive){
Files.walkFileTree(p, new SimpleFileVisitor<Path>(){
@Override
public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attr) throws IOException{
registerDirectory(dir);
return FileVisitResult.CONTINUE;
}
});
}else{
registerDirectory(p);
}
addFilesFromDir(p.toFile());
}catch(IOException ioe){
getMonitor().warn("Could not find or register folder '{}' or it's subfolders - folder will be skipped", folder,ioe);
}
}
}
private void registerDirectory(Path path) throws IOException{
WatchKey key;
if(reprocessOnModify){
key = path.register(watcher, ENTRY_CREATE, ENTRY_MODIFY, ENTRY_DELETE);
}else{
key = path.register(watcher, ENTRY_CREATE, ENTRY_DELETE);
}
watchKeys.put(key, path);
getMonitor().counter("directories").inc();
}
private void addFilesFromDir(File dir){
File[] files = dir.listFiles();
if(files == null){
return;
}
for (int i = 0; i < files.length; i++) {
if(!files[i].isDirectory()){
addFile(files[i].toPath());
getMonitor().counter("files").inc();
}else if (recursive) {
addFilesFromDir(files[i]);
}
}
}
@Override
public void doGetNext(JCas jCas) throws IOException, CollectionException {
if(queue.isEmpty()){
getMonitor().error("No documents on the queue - this method should not have been called");
throw new CollectionException();
}
Path path = queue.remove(0);
getMonitor().info("Processing file {}", path.toString());
try(
InputStream is = new FileInputStream(path.toFile());
){
extractor.processStream(is, path.toString(), jCas);
}
}
@Override
public void doClose() throws IOException {
if(watcher != null) {
watcher.close();
watcher = null;
}
watchKeys.clear();
queue.clear();
if(extractor != null) {
extractor.destroy();
extractor = null;
}
}
/**
* Every time doHasNext() is called, check the WatchService for new events and add all new events to the queue.
* Then return true if there are files on the queue, or false otherwise.
*
* If the event indicates that a file has been deleted, ensure it is removed from the queue.
*/
@Override
public boolean doHasNext() throws IOException, CollectionException {
WatchKey key;
while((key = watcher.poll()) != null){
for(WatchEvent<?> event : key.pollEvents()){
processEvent(key, event);
getMonitor().meter("events").mark();
}
key.reset();
}
return !queue.isEmpty();
}
private void processEvent(WatchKey key, WatchEvent<?> event) {
@SuppressWarnings("unchecked")
WatchEvent<Path> pathEvent = (WatchEvent<Path>) event;
if(event.kind() == OVERFLOW){
getMonitor().warn("OVERFLOW event received - some files may be missing from the queue");
}else if(event.kind() == ENTRY_DELETE){
getMonitor().debug("ENTRY_DELETE event received - file '{}' will be removed from queue", pathEvent.context());
try{
Path dir = watchKeys.get(key);
if(dir != null){
Path resolved = dir.resolve(pathEvent.context());
queue.remove(resolved);
}else{
getMonitor().warn("WatchKey not found - file '{}' will not be removed from the queue", pathEvent.context());
}
}catch(Exception ioe){
getMonitor().warn("An error occurred - file '{}' will not be removed from the queue", pathEvent.context(), ioe);
}
queue.remove(pathEvent.context());
}else{
getMonitor().debug(event.kind().name() + " event received - file '{}' will be added to the queue", pathEvent.context());
try{
Path dir = watchKeys.get(key);
if(dir != null){
Path resolved = dir.resolve(pathEvent.context());
if (resolved.toFile().isDirectory()) {
if (recursive) {
addFilesFromDir(resolved.toFile());
registerDirectory(resolved);
}
} else {
addFile(resolved);
}
}else{
getMonitor().warn("WatchKey not found - file '{}' will not be added to the queue", pathEvent.context());
}
}catch(Exception ioe){
getMonitor().warn("An error occurred - file '{}' will not be added to the queue", pathEvent.context(), ioe);
}
}
}
private void addFile(Path path){
if(acceptedFilenamesSet.isEmpty()){
queue.add(path);
}else{
for(Pattern p : acceptedFilenamesSet){
Matcher m = p.matcher(path.getFileName().toString());
if(m.matches()){
queue.add(path);
return;
}
}
}
}
}