/**
* villemos solutions [space^] (http://www.villemos.com)
* Probe. Send. Act. Emergent solution.
* Copyright 2011 Gert Villemos
* All Rights Reserved.
*
* Released under the Apache license, version 2.0 (do what ever
* you want, just dont claim ownership).
*
* NOTICE: All information contained herein is, and remains
* the property of villemos solutions, and its suppliers
* if any. The intellectual and technical concepts contained
* herein are proprietary to villemos solutions
* and its suppliers and may be covered by European and Foreign Patents,
* patents in process, and are protected by trade secret or copyright law.
*
* Dissemination of this information or reproduction of this material
* is strictly forbidden unless prior written permission is obtained
* from villemos solutions.
*
* And it wouldn't be nice either.
*
*/
package com.villemos.ispace.aperture;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Set;
import org.semanticdesktop.aperture.accessor.DataAccessor;
import org.semanticdesktop.aperture.accessor.DataAccessorFactory;
import org.semanticdesktop.aperture.accessor.DataObject;
import org.semanticdesktop.aperture.accessor.RDFContainerFactory;
import org.semanticdesktop.aperture.accessor.UrlNotFoundException;
import org.semanticdesktop.aperture.crawler.ExitCode;
import org.semanticdesktop.aperture.crawler.filesystem.FileSystemCrawler;
import org.semanticdesktop.aperture.datasource.DataSource;
import org.semanticdesktop.aperture.datasource.filesystem.FileSystemDataSource;
import org.semanticdesktop.aperture.util.OSUtils;
import org.semanticdesktop.aperture.vocabulary.NIE;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* A Crawler implementation for crawling file system sources modeled by a FileSystemDataSource.
*/
public class ExtendedFileSystemCrawler extends FileSystemCrawler {
// Fallback values applied in crawlObjects() when the data source returns null for a setting.
private static final boolean DEFAULT_IGNORE_HIDDEN_FILES = true;
private static final boolean DEFAULT_FOLLOW_SYMBOLIC_LINKS = false;
private static final boolean DEFAULT_SUPPRESS_PARENT_CHILD_LINKS = false;
private static final int DEFAULT_MAX_DEPTH = Integer.MAX_VALUE;
private static final long DEFAULT_MAX_SIZE = Long.MAX_VALUE;
// Logger named after the runtime class of this crawler instance.
private Logger logger = LoggerFactory.getLogger(getClass());
// When true, hidden files and folders found inside a crawled folder are skipped.
// Set to the inverse of the data source's "include hidden resources" value.
private boolean ignoreHiddenFiles;
// When false, any file whose absolute path differs from its canonical path is skipped.
private boolean followSymbolicLinks;
// When true, a "suppressParentChildLinks" entry is added to the accessor parameters.
private boolean suppressParentChildLinks;
// Files whose length exceeds this number of bytes are logged and skipped.
private long maximumSize;
// Factory for "file" scheme accessors; assigned by getAccessorFactory().
private DataAccessorFactory accessorFactory;
// Parameter map handed to the DataAccessor for every file; only non-null while crawlObjects() runs.
private HashMap params;
// Canonical form of the configured root folder.
private File root;
// The data source being crawled, cast to its file system specific type.
private FileSystemDataSource source;
/**
 * Crawls the file tree configured in the FileSystemDataSource of this crawler.
 *
 * Reads the root folder, maximum depth, maximum size, hidden-resource, symbolic-link and
 * parent/child-link settings from the data source (falling back to the DEFAULT_* constants when a
 * setting is null) and then walks the tree starting at the canonical root folder.
 *
 * @return the result of reportFatalErrorCause when the data source is missing or of the wrong type,
 *         the root folder is not configured, does not exist or cannot be canonicalized; otherwise
 *         ExitCode.COMPLETED when the whole tree was crawled or ExitCode.STOP_REQUESTED when the
 *         crawl was aborted
 * @throws IllegalStateException if no accessor factory for the "file" scheme can be obtained
 */
protected ExitCode crawlObjects() {
    // fetch the source and its configuration
    DataSource dataSource = getDataSource();
    if (!(dataSource instanceof FileSystemDataSource)) {
        // instanceof is also false for null, so never dereference the data source blindly here
        String actualType = dataSource == null ? "null" : dataSource.getClass().getName();
        return reportFatalErrorCause("wrong data source type, should be " + FileSystemDataSource.class.getName()
                + " is: " + actualType);
    }
    source = (FileSystemDataSource) dataSource;

    // determine the root file
    String rootFolder = source.getRootFolder();
    if (rootFolder == null) {
        // treat this as an error rather than an "empty source" to prevent information loss when e.g. a
        // network drive is temporarily unavailable
        return reportFatalErrorCause("rootFolder property missing");
    }
    root = new File(rootFolder);
    if (!root.exists()) {
        return reportFatalErrorCause("root folder does not exist: '" + root + "'");
    }

    // Resolve the root folder to its canonical form. crawlFileTree canonicalizes every file as well, but
    // there it serves to detect symbolic links. Doing it once up front for the root allows a root folder
    // whose own path contains a symbolic link: such links are always allowed, whereas links encountered
    // while descending into the tree are only followed when followSymbolicLinks is set.
    try {
        root = root.getCanonicalFile();
    }
    catch (IOException e) {
        return reportFatalErrorCause("unable to determine canonical file of root folder " + root, e);
    }

    // determine the maximum depth
    Integer configuredDepth = source.getMaximumDepth();
    int maxDepth = configuredDepth == null ? DEFAULT_MAX_DEPTH : configuredDepth.intValue();

    // determine the maximum byte size
    Long configuredSize = source.getMaximumSize();
    maximumSize = configuredSize == null ? DEFAULT_MAX_SIZE : configuredSize.longValue();

    // determine whether we should crawl hidden files and directories
    Boolean includeHidden = source.getIncludeHiddenResources();
    ignoreHiddenFiles = includeHidden == null ? DEFAULT_IGNORE_HIDDEN_FILES : !includeHidden.booleanValue();

    // determine whether we should crawl symbolic links
    Boolean followLinks = source.getFollowSymbolicLinks();
    followSymbolicLinks = followLinks == null ? DEFAULT_FOLLOW_SYMBOLIC_LINKS : followLinks.booleanValue();

    // determine whether we should suppress the parent->child hasPart triples from the output
    Boolean suppressLinks = source.getSuppressParentChildLinks();
    suppressParentChildLinks = suppressLinks == null ? DEFAULT_SUPPRESS_PARENT_CHILD_LINKS
            : suppressLinks.booleanValue();

    // init some other params
    params = new HashMap(2);
    try {
        getAccessorFactory();

        // crawl the file tree and translate the outcome into an exit code
        boolean crawlCompleted = crawlFileTree(root, maxDepth);
        return crawlCompleted ? ExitCode.COMPLETED : ExitCode.STOP_REQUESTED;
    }
    finally {
        // clean-up, also when the accessor lookup or the crawl itself throws
        params = null;
    }
}
/**
 * Looks up the DataAccessorFactory registered for the "file" scheme and stores the first one
 * found in the accessorFactory field.
 *
 * @throws IllegalStateException when the DataAccessorRegistry has not been set, or when it holds
 *         no factory for the "file" scheme
 */
private void getAccessorFactory() {
    if (accessorRegistry == null) {
        throw new IllegalStateException("DataAccessorRegistry not set");
    }

    Set candidates = accessorRegistry.get("file");
    if (candidates == null || candidates.isEmpty()) {
        throw new IllegalStateException("Could not retrieve a file data accessor");
    }

    // any registered factory will do, take the first one the set yields
    accessorFactory = (DataAccessorFactory) candidates.iterator().next();
}
/**
 * Crawls a File tree.
 *
 * Regular files are reported when they are inside the domain boundaries, readable and not larger
 * than maximumSize. Directories are reported when they are inside the domain boundaries and their
 * content is crawled with a depth reduced by one, unless the directory is a Mac OS X bundle or the
 * depth budget is exhausted.
 *
 * @param file the file or directory to crawl
 * @param depth the number of directory levels that may still be descended into; nothing is
 *        reported for a negative value
 * @return true if the path has been crawled completely, false if the crawl was aborted.
 */
private boolean crawlFileTree(File file, int depth) {
    // resolve the file to its canonical form
    try {
        // determine absolute and canonical paths
        String absolutePath = file.getAbsolutePath();
        String canonicalPath = file.getCanonicalPath();

        // optionally skip symbolic links
        if (!followSymbolicLinks && !absolutePath.equals(canonicalPath)) {
            return true;
        }

        // create the canonical File
        file = new File(canonicalPath);
    }
    catch (IOException e) {
        logger.warn("unable to resolve file to its canocical form, continuing with original file: "
                + file, e);
    }

    if (depth < 0) {
        // depth budget exhausted, nothing to report for this path
        return true;
    }

    if (file.isFile()) {
        // convert to a URL once; it is needed for the domain check and for every log message
        String url = file.toURI().toString();
        boolean withinDomain = inDomain(url);
        boolean readable = file.canRead();
        boolean smallerThanMax = file.length() <= maximumSize;

        if (!withinDomain) {
            logger.info("File " + url + " is outside the domain boundaries for this data source. Skipping.");
        }
        else if (!readable) {
            logger.info("Can't read file " + url + ". Skipping.");
        }
        else if (!smallerThanMax) {
            logger.info("File " + url + " exceeds the maximum size specified for this data source. Skipping.");
        }
        else {
            // report the File
            crawlSingleFile(file);
        }

        // by definition we've completed this subtree
        return true;
    }

    if (file.isDirectory()) {
        // evaluate the domain check once and reuse it for reporting and for descending
        String url = file.toURI().toString();
        boolean withinDomain = inDomain(url);

        // report the Folder itself
        if (withinDomain) {
            crawlSingleFile(file);
        }
        else {
            logger.info("Directory " + url + " is not in domain. Skipping.");
        }

        // Dont crawl into MacOSX bundles.
        if (OSUtils.isMac() && OSUtils.isMacOSXBundle(file)) {
            return true;
        }

        // report nested Files (if the folder itself is in the domain)
        if (depth > 0 && withinDomain) {
            return filterThroughFolderContent(file, depth);
        }
        return true;
    }

    // Unknown path type (is this possible?)
    return true;
}
/**
 * Crawls the content of a folder by passing a CrawlerFileFilter to File.listFiles. The filter
 * crawls every entry it is offered and rejects all of them, so the array returned by listFiles
 * is of no interest and is discarded.
 *
 * @param file the folder whose content is crawled
 * @param depth the depth of the folder itself; nested entries are crawled with depth - 1
 * @return true if the folder content has been crawled completely, false if the crawl was aborted
 */
private boolean filterThroughFolderContent(File file, int depth) {
    CrawlerFileFilter crawlingFilter = new CrawlerFileFilter(depth);
    file.listFiles(crawlingFilter);
    boolean completed = crawlingFilter.getResult();
    return completed;
}
/**
 * Crawls the content of a folder by listing all entries up front and visiting them one by one.
 * Alternative to filterThroughFolderContent.
 *
 * @param file the folder whose content is crawled
 * @param depth the depth of the folder itself; nested entries are crawled with depth - 1
 * @return true if every entry has been visited, false if a stop was requested before an entry
 *         could be visited or if crawling a nested entry was aborted
 */
private boolean iterateOverFolderContent(File file, int depth) {
    File[] children = file.listFiles();
    if (children == null) {
        // This happens on certain "special" directories, although the
        // API documentation doesn't mention it, see java bug #4803836.
        return true;
    }

    for (File child : children) {
        // a stop request with entries still pending means the scan is incomplete
        if (stopRequested) {
            return false;
        }

        boolean skipHidden = ignoreHiddenFiles && child.isHidden();
        if (!skipHidden && !crawlFileTree(child, depth - 1)) {
            return false;
        }
    }

    // every entry has been handled, so the scan of this folder is complete
    return true;
}
/**
 * Crawls a single File and reports it as new, modified or unmodified.
 *
 * The DataObject obtained for the file, if any, is disposed together with its metadata before
 * this method returns. Access problems are logged as warnings and not propagated.
 *
 * @param file the file or folder to report
 */
private void crawlSingleFile(File file) {
    // the file URI doubles as the identifier of the resource
    String url = file.toURI().toString();

    // register that we're processing this file
    reportAccessingObject(url);

    // this lookup has to happen before the accessor is applied
    boolean knownObject = accessData != null && accessData.isKnownId(url);

    // container factory and accessor used to build the DataObject
    RDFContainerFactory containerFactory = getRDFContainerFactory(url);
    DataAccessor accessor = accessorFactory.get();

    // TODO: the root folder may need an "addParent" = FALSE parameter once the addParent issue is resolved
    params.put("file", file);
    if (suppressParentChildLinks) {
        params.put("suppressParentChildLinks", Boolean.TRUE);
    }

    DataObject dataObject = null;
    try {
        dataObject = accessor.getDataObjectIfModified(url, source, accessData, params,
                containerFactory);

        if (dataObject != null) {
            // mark the root folder as such in its metadata
            if (file.equals(root)) {
                dataObject.getMetadata().add(NIE.rootElementOf, source.getID());
            }

            // a previously seen id means the object changed, otherwise it is new
            if (knownObject) {
                reportModifiedDataObject(dataObject);
            }
            else {
                reportNewDataObject(dataObject);
            }
        }
        else {
            // no DataObject means the resource was not modified
            reportUnmodifiedDataObject(url);
        }
    }
    catch (UrlNotFoundException e) {
        logger.warn("unable to access " + url, e);
    }
    catch (IOException e) {
        logger.warn("I/O error while processing " + url, e);
    }
    finally {
        if (dataObject != null) {
            dataObject.getMetadata().dispose();
            dataObject.dispose();
        }
    }
}
/**
 * FileFilter that is used as a visitor: every file offered by File.listFiles is crawled from
 * within accept, and accept itself rejects every file. Whether the visited folder content was
 * crawled completely is available afterwards through getResult.
 */
private class CrawlerFileFilter implements FileFilter {

    /** Depth of the folder being listed; nested files are crawled with depth - 1. */
    private final int depth;

    /** True as long as everything offered so far has been crawled completely. */
    private boolean result = true;

    public CrawlerFileFilter(int depth) {
        this.depth = depth;
    }

    /**
     * Crawls the given file unless a stop was requested, an earlier subtree was left incomplete
     * or the file is hidden and hidden files are ignored.
     *
     * @return always false; this value only tells listFiles not to collect the file and says
     *         nothing about whether the crawl was completed
     */
    public boolean accept(File nestedFile) {
        if (!stopRequested && result) {
            // a skipped hidden file leaves the result untouched: the subtree still counts as completed
            boolean skipHidden = ignoreHiddenFiles && nestedFile.isHidden();
            if (!skipHidden) {
                result = crawlFileTree(nestedFile, depth - 1);
            }
        }
        else {
            // listFiles cannot be interrupted half-way, so after a stop request or an incomplete
            // subtree every remaining file is passed over and the folder is marked as not completed
            result = false;
        }
        return false;
    }

    /**
     * @return true if all files offered to this filter have been crawled completely
     */
    public boolean getResult() {
        return result;
    }
}
// private static org.apache.log4j.Logger Logger = org.apache.log4j.Logger.getLogger(ExtendedFileSystemCrawler.class);
//
// protected DataAccessorFactory accessorFactory;
//
// protected FileSystemDataSource source;
//
// protected long maxSize = 5000000;
//
// private File root;
//
// // protected ExtendedCrawlerHandler handler = new ExtendedCrawlerHandler();
//
//
// protected ExitCode crawlObjects() {
// // fetch the source and its configuration
// DataSource dataSource = getDataSource();
// if (!(dataSource instanceof FileSystemDataSource)) {
// return reportFatalErrorCause("wrong data source type, should be " + FileSystemDataSource.class.getName()
// + " is: " + dataSource.getClass().getName());
// }
//
// source = (FileSystemDataSource)dataSource;
//
// // determine the root file
// String rootFolder = source.getRootFolder();
// if (rootFolder == null) {
// // treat this as an error rather than an "empty source" to prevent information loss when e.g. a
// // network drive is temporarily unavailable
// return reportFatalErrorCause("rootFolder property missing");
// }
// root = new File(rootFolder);
// if (!root.exists()) {
// return reportFatalErrorCause("root folder does not exist: '" + root + "'");
// }
//
// // Resolve the root folder to its canonical form. Canonicalization is also done in CrawlFileTree for
// // every single accessed File but this is part of a procedure to determine whether the file is a
// // symbolic link. Doing it here one extra time for the root folder allows the specification of a root
// // folder whose path includes a symbolic link. Without this extra step, the "follow symbolic links"
// // setting could make crawling of such a source impossible. Therefore, symbolic links in the path of
// // the root folder are always allowed, symbolic links that are encountered later on when descending in
// // the file tree are optionally crawled
// try {
// root = root.getCanonicalFile();
// }
// catch (IOException e) {
// return reportFatalErrorCause("unable to determine canonical file of root folder " + root, e);
// }
//
// // determine the maximum depth
// Integer i = source.getMaximumDepth();
// int maxDepth = i == null ? DEFAULT_MAX_DEPTH : i.intValue();
//
// // determine the maximum byte size
// Long l = source.getMaximumSize();
// maximumSize = l == null ? DEFAULT_MAX_SIZE : l.longValue();
//
// // determine whether we should crawl hidden files and directories
// Boolean b = source.getIncludeHiddenResources();
// ignoreHiddenFiles = b == null ? DEFAULT_IGNORE_HIDDEN_FILES : !b.booleanValue();
//
// // determine whether we should crawl symbolic links
// b = source.getFollowSymbolicLinks();
// followSymbolicLinks = b == null ? DEFAULT_FOLLOW_SYMBOLIC_LINKS : b.booleanValue();
//
// // determine whether we should suppress the parent->child hasPart triples from the output
// b = source.getSuppressParentChildLinks();
// suppressParentChildLinks = b == null ? DEFAULT_SUPPRESS_PARENT_CHILD_LINKS : b.booleanValue();
//
// // init some other params
// params = new HashMap(2);
// getAccessorFactory();
//
// // crawl the file tree
// boolean crawlCompleted = crawlFileTree(root, maxDepth);
//
// // clean-up
// params = null;
//
// // determine the exit code
// return crawlCompleted ? ExitCode.COMPLETED : ExitCode.STOP_REQUESTED;
// }
// protected ExitCode crawlObjects() {
//
// DataObject dataObject = null;
// try {
// // create an identifier for the file
// // String url = file.toURI().toString();
// String url = UUID.randomUUID().toString();
//
// // register that we're processing this file
// reportAccessingObject(url);
//
// // fetch a RDFContainer from the handler (note: is done for every
// RDFContainerFactory containerFactory = getRDFContainerFactory(url);
//
// // dataObject = getAccessorFactory().get().getDataObject(url, source, null, containerFactory);
// reportNewDataObject(dataObject);
// }
// catch (Exception e) {
// Logger.error("Courght exception while processing object" + e);
// e.printStackTrace();
// }
// catch (Error r) {
// Logger.error("Courght error while processing object " + r);
// r.printStackTrace();
// }
// finally {
// if (dataObject != null) {
// dataObject.dispose();
// }
// }
//
// return ExitCode.COMPLETED;
// }
}