ExtendedFileSystemCrawler.java example

Explorer

iSpace-master
- base
  - DirectoryAssembler
    - src
      - main
        java
        com
        villemos
        ispace
        assembler
        DirectoryAssemblerComponent.java
        DirectoryAssemblerEndpoint.java
        DirectoryAssemblerProducer.java
        DocumentRetriever.java
        helper
        Buffer.java
        LanguageDetector.java
        ReferenceIdBodyEnricher.java
  - api
    - src
      - main
        java
        com
        villemos
        ispace
        api
        Acronym.java
        EntityTypes.java
        Facet.java
        ICallback.java
        IData.java
        IMetaData.java
        RegularExpression.java
        ResultSet.java
        SolrOptions.java
        Statistics.java
        Suggestion.java
        Synonym.java
        Taxonomy.java
        TaxonomyFields.java
  - api-camel
    - src
      - main
        java
        com
        villemos
        ispace
        apicamel
        Data.java
        MetaData.java
  - api-rmi
    - src
      - main
        java
        com
        villemos
        ispace
        apirmi
        Data.java
        MetaData.java
  - consolidator
    - src
      - main
        java
        com
        villemos
        ispace
        consolidator
        ReferenceBooster.java
        SynonymConsolidator.java
  - core
    - src
      - main
        java
        com
        villemos
        ispace
        Starter.java
        core
        caches
        AcronymCache.java
        IAcronymManager.java
        ISynonymManager.java
        ITaxonomyManager.java
        search
        AbstractExpander.java
        AbstractExpanderTest.java
        AcronymExpander.java
        AutoCompletionProxy.java
        ICallback.java
        QueueExpansionTest.java
        RepositoryProxy.java
        SynonymExpander.java
        SynonymExpanderTest.java
        TaxonomyExpander.java
        TaxonomyExpanderTest.java
        ZombieExpander.java
        utilities
        EndpointConfigurer.java
        IoFieldSetter.java
        StringComparison.java
        log
        RmiAppender.java
  - databasecrawler
    - src
      - main
        java
        com
        villemos
        ispace
        databasecrawler
        SqlResultsetMapper.java
  - documentcrawler
    - src
      - main
        java
        com
        villemos
        ispace
        aperture
        DocumentProcessor.java
        ExtendedCrawlerHandler.java
        ExtendedFileSystemCrawler.java
        InformationObject.java
        enricher
        MicrosoftPropertyReader.java
        processor
        IProcessor.java
  - enricher
    - src
      - main
        java
        com
        villemos
        ispace
        enricher
        AcronymDetector.java
        ConstantFields.java
        DateFilter.java
        ExpressionBasedTransformer.java
        PatternBasedEnricher.java
        RegularExpressionBuffer.java
        SynonymBuffer.java
        SynonymConsolidator.java
        TaxonomyBuffer.java
  - excell
    - src
      - main
        java
        com
        villemos
        ispace
        excell
        DefaultSheetFormatter.java
        DefaultWorkbookFormatter.java
        ExcellComponent.java
        ExcellConsumer.java
        ExcellEndpoint.java
        ExcellFileConsumer.java
        ExcellProducer.java
        ISheetFormatter.java
        IWorkbookFormatter.java
        TemplateBasedSheetFormatter.java
        TemplateBasedWorkbookFormatter.java
      - test
        java
        com
        villemos
        ispace
        excell
        ExcellTest.java
        NamingTest.java
        TestClass.java
        TestClassSpecialization.java
        TestClassUnrelated.java
        TestDateFormat.java
  - httpcrawler
    - src
      - main
        java
        com
        villemos
        ispace
        httpcrawler
        EasyX509TrustManager.java
        HttpAccessor.java
        HttpClientConfigurer.java
        HttpCrawlerComponent.java
        HttpCrawlerConsumer.java
        HttpCrawlerEndpoint.java
        InformationObject.java
  - ktree
    - src
      - main
        java
        com
        villemos
        ispace
        ktree
        KtreeAccessor.java
        KtreeCrawlerComponent.java
        KtreeCrawlerConsumer.java
        KtreeCrawlerEndpoint.java
        KtreeCrawlerProducer.java
        folder
        Folder.java
        Item.java
        Items.java
        Result.java
        metadata
        MetadataField.java
        MetadataItem.java
        session
        Session.java
      - test
        java
        com
        villemos
        ispace
        ktree
        MetadataTest.java
  - solr
    - src
      - main
        java
        com
        villemos
        ispace
        solr
        SolrComponent.java
        SolrConsumer.java
        SolrEndpoint.java
        SolrProducer.java
        Utilities.java
  - system-test
    - src
      - test
        java
        com
        villemos
        ispace
        solr
        RetrievalBuffer.java
        SolrTest.java
  - testclient
    - src
      - main
        java
        com
        villemos
        ispace
        testclient
        DataPrinter.java
        TestClientComponent.java
        TestClientConsumer.java
        TestClientEndpoint.java
  - tool-ktree-crawler
    - src
      - main
        java
        com
        villemos
        ispace
        ktreecrawler
        MiddleBean.java
        StatusHolder.java
  - webster
    - src
      - main
        java
        com
        villemos
        ispace
        webster
        WebsterComponent.java
        WebsterEndpoint.java
        WebsterProducer.java
- solutions
  - oam
    - src
      - main
        java
        com
        logica
        oam
        ktree
        RequestChecker.java
        enricher
        AccessUrlEnricher.java
        DomainDetector.java
        ReferenceIdEnricher.java
        ReleaseIdEnricher.java
        SetFolderUrl.java
        VersionEnricher.java
        filter
        DateFilter.java
        ElementFilter.java
        ReleaseFilter.java
        statistics
        ExtractorStatistics.java
        Statistic.java
        StatisticsCollector.java
        transformer
        DuplicateMerger.java
        DuplicateSimplifier.java
        NewApplicationSplitter.java
        OldApplicationSplitter.java
        ReferenceIdSelector.java
        types
        AlphabeticSorter.java
        ApplicationData.java
        ReleaseComperator.java

/**
 * villemos solutions [space^] (http://www.villemos.com) 
 * Probe. Send. Act. Emergent solution. 
 * Copyright 2011 Gert Villemos
 * All Rights Reserved.
 * 
 * Released under the Apache license, version 2.0 (do what ever
 * you want, just dont claim ownership).
 * 
 * NOTICE:  All information contained herein is, and remains
 * the property of villemos solutions, and its suppliers
 * if any. The intellectual and technical concepts contained
 * herein are proprietary to villemos solutions
 * and its suppliers and may be covered by European and Foreign Patents,
 * patents in process, and are protected by trade secret or copyright law.
 * 
 * Dissemination of this information or reproduction of this material
 * is strictly forbidden unless prior written permission is obtained
 * from villemos solutions.
 * 
 * And it wouldn't be nice either.
 * 
 */
package com.villemos.ispace.aperture;

import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Set;


import org.semanticdesktop.aperture.accessor.DataAccessor;
import org.semanticdesktop.aperture.accessor.DataAccessorFactory;
import org.semanticdesktop.aperture.accessor.DataObject;
import org.semanticdesktop.aperture.accessor.RDFContainerFactory;
import org.semanticdesktop.aperture.accessor.UrlNotFoundException;
import org.semanticdesktop.aperture.crawler.ExitCode;
import org.semanticdesktop.aperture.crawler.filesystem.FileSystemCrawler;
import org.semanticdesktop.aperture.datasource.DataSource;
import org.semanticdesktop.aperture.datasource.filesystem.FileSystemDataSource;
import org.semanticdesktop.aperture.util.OSUtils;
import org.semanticdesktop.aperture.vocabulary.NIE;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A Crawler implementation for crawling file system sources modeled by a FileSystemDataSource.
 */
public class ExtendedFileSystemCrawler extends FileSystemCrawler {

    /** Used when the data source does not say whether hidden resources should be included. */
    private static final boolean DEFAULT_IGNORE_HIDDEN_FILES = true;

    /** Used when the data source does not say whether symbolic links should be followed. */
    private static final boolean DEFAULT_FOLLOW_SYMBOLIC_LINKS = false;
    
    /** Used when the data source does not say whether parent-child links should be suppressed. */
    private static final boolean DEFAULT_SUPPRESS_PARENT_CHILD_LINKS = false;

    /** Used when the data source specifies no maximum depth. */
    private static final int DEFAULT_MAX_DEPTH = Integer.MAX_VALUE;

    /** Used when the data source specifies no maximum file size. */
    private static final long DEFAULT_MAX_SIZE = Long.MAX_VALUE;

    /** Per-instance logger, named after the runtime class of this crawler. */
    private Logger logger = LoggerFactory.getLogger(getClass());

    /** When true, hidden entries found while listing a folder are not crawled. */
    private boolean ignoreHiddenFiles;

    /** When false, entries whose absolute and canonical paths differ are skipped. */
    private boolean followSymbolicLinks;
    
    /** When true, the "suppressParentChildLinks" flag is put into the accessor parameters. */
    private boolean suppressParentChildLinks;

    /** Files whose length exceeds this value are skipped. */
    private long maximumSize;

    /** Factory for the "file" scheme, looked up from the accessor registry at the start of a crawl. */
    private DataAccessorFactory accessorFactory;

    /** Parameter map handed to the accessor; created when a crawl starts and set to null afterwards. */
    private HashMap params;

    /** Canonical form of the configured root folder. */
    private File root;
    
    /** The data source being crawled, set at the start of crawlObjects(). */
    private FileSystemDataSource source;

    /**
     * Crawls the folder tree configured on the current data source.
     * 
     * The data source must be a FileSystemDataSource with an existing root folder; otherwise a fatal
     * error is reported. Optional settings that are absent on the source (maximum depth, maximum size,
     * hidden resources, symbolic links, parent-child link suppression) fall back to the DEFAULT_*
     * constants of this class.
     * 
     * @return the value of reportFatalErrorCause(..) when the configuration is unusable, otherwise
     *         ExitCode.COMPLETED when the whole tree was crawled and ExitCode.STOP_REQUESTED when the
     *         crawl ended prematurely
     * @throws IllegalStateException when no accessor factory for the file scheme can be obtained (see
     *         getAccessorFactory())
     */
    protected ExitCode crawlObjects() {
        // fetch the source and its configuration
        DataSource dataSource = getDataSource();
        if (!(dataSource instanceof FileSystemDataSource)) {
            // instanceof is also false for null, so guard the getClass() call to report the
            // configuration problem instead of dying with a NullPointerException
            String actualType = dataSource == null ? "null" : dataSource.getClass().getName();
            return reportFatalErrorCause("wrong data source type, should be " + FileSystemDataSource.class.getName()
                    + " is: " + actualType);
        }

        source = (FileSystemDataSource) dataSource;

        // determine the root file
        String rootFolder = source.getRootFolder();
        if (rootFolder == null) {
            // treat this as an error rather than an "empty source" to prevent information loss when e.g. a
            // network drive is temporarily unavailable
            return reportFatalErrorCause("rootFolder property missing");
        }
        root = new File(rootFolder);
        if (!root.exists()) {
            return reportFatalErrorCause("root folder does not exist: '" + root + "'");
        }

        // Resolve the root folder to its canonical form. Canonicalization is also done in crawlFileTree for
        // every single accessed File but this is part of a procedure to determine whether the file is a
        // symbolic link. Doing it here one extra time for the root folder allows the specification of a root
        // folder whose path includes a symbolic link. Without this extra step, the "follow symbolic links"
        // setting could make crawling of such a source impossible. Therefore, symbolic links in the path of
        // the root folder are always allowed, symbolic links that are encountered later on when descending in
        // the file tree are optionally crawled
        try {
            root = root.getCanonicalFile();
        }
        catch (IOException e) {
            return reportFatalErrorCause("unable to determine canonical file of root folder " + root, e);
        }

        // determine the maximum depth
        Integer configuredDepth = source.getMaximumDepth();
        int maxDepth = configuredDepth == null ? DEFAULT_MAX_DEPTH : configuredDepth.intValue();

        // determine the maximum byte size
        Long configuredSize = source.getMaximumSize();
        maximumSize = configuredSize == null ? DEFAULT_MAX_SIZE : configuredSize.longValue();

        // determine whether we should crawl hidden files and directories
        // (the source says "include", the crawler stores "ignore", hence the negation)
        Boolean flag = source.getIncludeHiddenResources();
        ignoreHiddenFiles = flag == null ? DEFAULT_IGNORE_HIDDEN_FILES : !flag.booleanValue();

        // determine whether we should crawl symbolic links
        flag = source.getFollowSymbolicLinks();
        followSymbolicLinks = flag == null ? DEFAULT_FOLLOW_SYMBOLIC_LINKS : flag.booleanValue();

        // determine whether we should suppress the parent->child hasPart triples from the output
        flag = source.getSuppressParentChildLinks();
        suppressParentChildLinks = flag == null ? DEFAULT_SUPPRESS_PARENT_CHILD_LINKS : flag.booleanValue();

        // init some other params
        params = new HashMap(2);

        boolean crawlCompleted;
        try {
            getAccessorFactory();

            // crawl the file tree
            crawlCompleted = crawlFileTree(root, maxDepth);
        }
        finally {
            // clean-up, also when the accessor lookup or the crawl itself throws, so that no stale
            // parameter map (which references the last visited File) outlives this crawl
            params = null;
        }

        // determine the exit code
        return crawlCompleted ? ExitCode.COMPLETED : ExitCode.STOP_REQUESTED;
    }

    /**
     * Looks up the DataAccessorFactory registered for the "file" scheme and stores it in the
     * accessorFactory field. Despite its name this method returns nothing.
     * 
     * @throws IllegalStateException when the DataAccessorRegistry has not been set or holds no factory
     *         for the "file" scheme
     */
    private void getAccessorFactory() {
        if (accessorRegistry == null) {
            throw new IllegalStateException("DataAccessorRegistry not set");
        }

        Set fileFactories = accessorRegistry.get("file");

        boolean noneRegistered = fileFactories == null || fileFactories.isEmpty();
        if (noneRegistered) {
            throw new IllegalStateException("Could not retrieve a file data accessor");
        }

        // any registered factory will do, take the first one the set yields
        accessorFactory = (DataAccessorFactory) fileFactories.iterator().next();
    }

    /**
     * Crawls the given file or folder and, for folders, everything below it down to the given depth.
     * 
     * An entry whose absolute path differs from its canonical path is taken to be a symbolic link and
     * is skipped unless symbolic links are to be followed. Files are reported only when they are in
     * the domain of the data source, readable and not larger than the maximum size. Folders are
     * reported when in domain and descended into only when depth is greater than zero.
     * 
     * @param file the file or folder to crawl
     * @param depth the number of folder levels that may still be descended; nothing is crawled when
     *        negative
     * @return true if the path has been crawled completely (or skipped), false if the crawl was
     *         aborted
     */
    private boolean crawlFileTree(File file, int depth) {
        File target = file;

        // switch to the canonical form of the entry, detecting symbolic links on the way
        try {
            String absolute = target.getAbsolutePath();
            String canonical = target.getCanonicalPath();

            boolean pathsDiffer = !absolute.equals(canonical);
            if (pathsDiffer && !followSymbolicLinks) {
                // symbolic link that must not be followed: nothing to do, subtree counts as done
                return true;
            }

            target = new File(canonical);
        }
        catch (IOException e) {
            logger.warn("unable to resolve file to its canocical form, continuing with original file: "
                    + target, e);
        }

        if (target.isFile() && depth >= 0) {
            String fileUri = target.toURI().toString();
            boolean withinDomain = inDomain(fileUri);
            boolean readable = target.canRead();
            boolean withinSizeLimit = target.length() <= maximumSize;

            // the first failing criterion determines the message
            if (!withinDomain) {
                logger.info("File " + fileUri + " is outside the domain boundaries for this data source. Skipping.");
            }
            else if (!readable) {
                logger.info("Can't read file " + fileUri + ". Skipping.");
            }
            else if (!withinSizeLimit) {
                logger.info("File " + fileUri + " exceeds the maximum size specified for this data source. Skipping.");
            }
            else {
                crawlSingleFile(target);
            }

            // a plain file has no subtree, so we are done by definition
            return true;
        }

        if (target.isDirectory() && depth >= 0) {
            // the folder itself comes first
            if (inDomain(target.toURI().toString())) {
                crawlSingleFile(target);
            }
            else {
                logger.info("Directory " + target.toURI() + " is not in domain. Skipping.");
            }

            // MacOSX bundles are folders on disk but are not descended into
            if (OSUtils.isMac() && OSUtils.isMacOSXBundle(target)) {
                return true;
            }

            // the content is only visited when there is depth left and the folder is in the domain
            boolean descend = depth > 0 && inDomain(target.toURI().toString());
            if (!descend) {
                return true;
            }
            return filterThroughFolderContent(target, depth);
        }

        // neither file nor folder, or depth < 0
        return true;
    }
    
    /**
     * Crawls the content of a folder by listing it through a CrawlerFileFilter, which crawls every
     * entry it is asked about instead of actually filtering.
     * 
     * @param file the folder whose content is to be crawled
     * @param depth the depth of the folder itself; entries are crawled with depth - 1
     * @return the result collected by the filter: true if everything was crawled, false if the crawl
     *         was aborted
     */
    private boolean filterThroughFolderContent(File file, int depth) {
        CrawlerFileFilter crawlingFilter = new CrawlerFileFilter(depth);

        // only the side effects of the filter matter, the listing itself is discarded
        file.listFiles(crawlingFilter);

        boolean completed = crawlingFilter.getResult();
        return completed;
    }

    /**
     * Crawls the content of a folder by walking over the array returned by File.listFiles().
     * 
     * @param file the folder whose content is to be crawled
     * @param depth the depth of the folder itself; entries are crawled with depth - 1
     * @return true if all entries were visited (or the folder could not be listed), false if a stop
     *         was requested before an entry was visited or a nested crawl was aborted
     */
    private boolean iterateOverFolderContent(File file, int depth) {
        File[] children = file.listFiles();

        // listFiles() yields null on certain "special" directories, although the
        // API documentation doesn't mention it, see java bug #4803836.
        if (children == null) {
            return true;
        }

        for (File child : children) {
            // a pending stop request means this entry, and so the folder, stays incomplete
            if (stopRequested) {
                return false;
            }

            boolean skipAsHidden = ignoreHiddenFiles && child.isHidden();
            if (skipAsHidden) {
                continue;
            }

            if (!crawlFileTree(child, depth - 1)) {
                return false;
            }
        }

        // every entry has been handled
        return true;
    }

    /**
     * Processes a single file or folder: announces it, asks the file accessor for a DataObject and
     * reports the object as unmodified, modified or new.
     * 
     * A UrlNotFoundException or IOException raised while doing so is logged as a warning and not
     * propagated. A DataObject that was obtained is disposed, after its metadata, before this method
     * returns.
     * 
     * @param file the file or folder to process
     */
    private void crawlSingleFile(File file) {
        // the URI of the file serves as its identifier
        String url = file.toURI().toString();

        // announce that this object is being processed
        reportAccessingObject(url);

        // has to be determined before the accessor is applied
        boolean knownObject = accessData != null && accessData.isKnownId(url);

        // a container factory is requested for every single object
        RDFContainerFactory containerFactory = getRDFContainerFactory(url);
        DataAccessor accessor = accessorFactory.get();

        // parameters for the accessor
        params.put("file", file);
        // TODO return here after resolving the addParent issue (the idea was to put
        // "addParent" = Boolean.FALSE into the params when file equals root)
        if (suppressParentChildLinks) {
            params.put("suppressParentChildLinks", Boolean.TRUE);
        }

        DataObject dataObject = null;
        try {
            dataObject = accessor.getDataObjectIfModified(url, source, accessData, params, containerFactory);

            // no object means nothing has changed since the previous crawl
            if (dataObject == null) {
                reportUnmodifiedDataObject(url);
                return;
            }

            // flag the root folder as such in its metadata
            if (file.equals(root)) {
                dataObject.getMetadata().add(NIE.rootElementOf, source.getID());
            }

            // an object we have seen before has changed, anything else is new
            if (knownObject) {
                reportModifiedDataObject(dataObject);
            }
            else {
                reportNewDataObject(dataObject);
            }
        }
        catch (UrlNotFoundException e) {
            logger.warn("unable to access " + url, e);
        }
        catch (IOException e) {
            logger.warn("I/O error while processing " + url, e);
        }
        finally {
            // release the object once it has been reported (or reporting has failed)
            if (dataObject != null) {
                dataObject.getMetadata().dispose();
                dataObject.dispose();
            }
        }
    }
    
/**
 * A FileFilter that never accepts anything. It is passed to File.listFiles(FileFilter) only so that
 * accept(..) gets called for every entry of a folder; each call crawls the entry it is given. The
 * overall outcome is available through getResult() afterwards.
 */
private class CrawlerFileFilter implements FileFilter {

        /** Depth of the folder being listed; entries are crawled with depth - 1. */
        private int depth;

        /** True as long as every entry seen so far has been crawled completely or skipped as hidden. */
        private boolean result;

        public CrawlerFileFilter(int depth) {
            this.result = true;
            this.depth = depth;
        }

        /**
         * Crawls the given entry unless the crawl has been stopped or the entry is hidden and hidden
         * entries are ignored.
         * 
         * @return always false; the return value says nothing about the outcome of the crawl
         */
        public boolean accept(File nestedFile) {
            // listFiles cannot be interrupted half-way, so after a stop request or an incomplete
            // subtree every remaining entry is merely waved through without being crawled
            boolean aborted = stopRequested || !result;

            if (aborted) {
                // this entry stays uncrawled, hence the folder as a whole is incomplete
                result = false;
            }
            else if (!ignoreHiddenFiles || !nestedFile.isHidden()) {
                // a skipped hidden entry leaves the result untouched, anything else determines it
                result = crawlFileTree(nestedFile, depth - 1);
            }

            return false;
        }

        /**
         * @return true if all entries were crawled completely, false if the crawl was aborted
         */
        public boolean getResult() {
            return result;
        }
    }

	
	
	
	
	
	
//	private static org.apache.log4j.Logger Logger = org.apache.log4j.Logger.getLogger(ExtendedFileSystemCrawler.class);
//
//	protected DataAccessorFactory accessorFactory;
//
//	protected FileSystemDataSource source;
//
//	protected long maxSize = 5000000;
//
//    private File root;
//	
//	// protected ExtendedCrawlerHandler handler = new ExtendedCrawlerHandler();
//	
//	
//    protected ExitCode crawlObjects() {        
//        // fetch the source and its configuration
//        DataSource dataSource = getDataSource();
//        if (!(dataSource instanceof FileSystemDataSource)) {
//            return reportFatalErrorCause("wrong data source type, should be " + FileSystemDataSource.class.getName()
//                    + " is: " + dataSource.getClass().getName());
//        }
//        
//        source = (FileSystemDataSource)dataSource;
//        
//        // determine the root file
//        String rootFolder = source.getRootFolder();
//        if (rootFolder == null) {
//            // treat this as an error rather than an "empty source" to prevent information loss when e.g. a
//            // network drive is temporarily unavailable
//            return reportFatalErrorCause("rootFolder property missing");
//        }
//        root = new File(rootFolder);
//        if (!root.exists()) {
//            return reportFatalErrorCause("root folder does not exist: '" + root + "'");
//        }
//
//        // Resolve the root folder to its canonical form. Canonicalization is also done in CrawlFileTree for
//        // every single accessed File but this is part of a procedure to determine whether the file is a
//        // symbolic link. Doing it here one extra time for the root folder allows the specification of a root
//        // folder whose path includes a symbolic link. Without this extra step, the "follow symbolic links"
//        // setting could make crawling of such a source impossible. Therefore, symbolic links in the path of
//        // the root folder are always allowed, symbolic links that are encountered later on when descending in
//        // the file tree are optionally crawled
//        try {
//            root = root.getCanonicalFile();
//        }
//        catch (IOException e) {
//            return reportFatalErrorCause("unable to determine canonical file of root folder " + root, e);
//        }
//
//        // determine the maximum depth
//        Integer i = source.getMaximumDepth();
//        int maxDepth = i == null ? DEFAULT_MAX_DEPTH : i.intValue();
//
//        // determine the maximum byte size
//        Long l = source.getMaximumSize();
//        maximumSize = l == null ? DEFAULT_MAX_SIZE : l.longValue();
//
//        // determine whether we should crawl hidden files and directories
//        Boolean b = source.getIncludeHiddenResources();
//        ignoreHiddenFiles = b == null ? DEFAULT_IGNORE_HIDDEN_FILES : !b.booleanValue();
//
//        // determine whether we should crawl symbolic links
//        b = source.getFollowSymbolicLinks();
//        followSymbolicLinks = b == null ? DEFAULT_FOLLOW_SYMBOLIC_LINKS : b.booleanValue();
//
//        // determine whether we should suppress the parent->child hasPart triples from the output
//        b = source.getSuppressParentChildLinks();
//        suppressParentChildLinks = b == null ? DEFAULT_SUPPRESS_PARENT_CHILD_LINKS : b.booleanValue();
//        
//        // init some other params
//        params = new HashMap(2);
//        getAccessorFactory();
//
//        // crawl the file tree
//        boolean crawlCompleted = crawlFileTree(root, maxDepth);
//
//        // clean-up
//        params = null;
//
//        // determine the exit code
//        return crawlCompleted ? ExitCode.COMPLETED : ExitCode.STOP_REQUESTED;
//    }

	
//	protected ExitCode crawlObjects() {
//
//		DataObject dataObject = null;
//		try {
//			// create an identifier for the file
//			// String url = file.toURI().toString();
//			String url = UUID.randomUUID().toString();
//
//			// register that we're processing this file
//			reportAccessingObject(url);
//
//			// fetch a RDFContainer from the handler (note: is done for every
//			RDFContainerFactory containerFactory = getRDFContainerFactory(url);
//
//			// dataObject = getAccessorFactory().get().getDataObject(url, source, null, containerFactory);
//			reportNewDataObject(dataObject);
//		}
//		catch (Exception e) {
//			Logger.error("Courght exception while processing object" + e);
//			e.printStackTrace();
//		} 
//		catch (Error r) {
//			Logger.error("Courght error while processing object " + r);
//			r.printStackTrace();			
//		}
//		finally {
//			if (dataObject != null) {
//				dataObject.dispose();
//			}
//		}			
//		
//		return ExitCode.COMPLETED;
//	}
}