package org.jbake.app;

import com.orientechnologies.orient.core.record.impl.ODocument;
import org.apache.commons.configuration.CompositeConfiguration;
import org.apache.commons.io.FilenameUtils;
import org.jbake.app.ConfigUtil.Keys;
import org.jbake.app.Crawler.Attributes.Status;
import org.jbake.model.DocumentAttributes;
import org.jbake.model.DocumentStatus;
import org.jbake.model.DocumentTypes;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.util.Arrays;
import java.util.Date;
import java.util.Map;

import static java.io.File.separator;

/**
 * Crawls a file system looking for content.
 *
 * @author Jonathan Bullock <a href="mailto:jonbullock@gmail.com">jonbullock@gmail.com</a>
 */
public class Crawler {

    /**
     * Well-known keys of the parsed-document attribute map.
     */
    public interface Attributes {

        /**
         * Possible values of the {@link Attributes#STATUS} property.
         *
         * @author ndx
         */
        interface Status {
            String PUBLISHED_DATE = "published-date";
            String PUBLISHED = "published";
            String DRAFT = "draft";
        }

        String DATE = "date";
        String STATUS = "status";
        String TYPE = "type";
        String TITLE = "title";
        String URI = "uri";
        String FILE = "file";
        String TAGS = "tags";
        String TAG = "tag";
        String ROOTPATH = "rootpath";
        String ID = "id";
        String NO_EXTENSION_URI = "noExtensionUri";
        String ALLTAGS = "alltags";
        String PUBLISHED_DATE = "published_date";
        String BODY = "body";
    }

    private static final Logger LOGGER = LoggerFactory.getLogger(Crawler.class);
    private CompositeConfiguration config;
    private Parser parser;
    private final ContentStore db;
    private String contentPath;

    /**
     * Creates new instance of Crawler.
     *
     * @param db     Database instance for content
     * @param source Base directory where content directory is located
     * @param config Project configuration
     */
    public Crawler(ContentStore db, File source, CompositeConfiguration config) {
        this.db = db;
        this.config = config;
        this.contentPath = FilenameUtils.concat(source.getAbsolutePath(), config.getString(ConfigUtil.Keys.CONTENT_FOLDER));
        this.parser = new Parser(config, contentPath);
    }

    /**
     * Crawl all files and folders looking for content.
     *
     * @param path Folder to start from
     */
    public void crawl(File path) {
        File[] contents = path.listFiles(FileUtil.getFileFilter());
        if (contents != null) {
            Arrays.sort(contents);
            for (File sourceFile : contents) {
                if (sourceFile.isFile()) {
                    StringBuilder sb = new StringBuilder();
                    sb.append("Processing [").append(sourceFile.getPath()).append("]... ");
                    String sha1 = buildHash(sourceFile);
                    String uri = buildURI(sourceFile);
                    boolean process = true;
                    DocumentStatus status = DocumentStatus.NEW;
                    for (String docType : DocumentTypes.getDocumentTypes()) {
                        status = findDocumentStatus(docType, uri, sha1);
                        switch (status) {
                            case UPDATED:
                                sb.append(" : modified ");
                                // Stale copy in the store: remove so it is re-crawled below.
                                db.deleteContent(docType, uri);
                                break;
                            case IDENTICAL:
                                sb.append(" : same ");
                                process = false;
                                break;
                        }
                        if (!process) {
                            break;
                        }
                    }
                    if (DocumentStatus.NEW == status) {
                        sb.append(" : new ");
                    }
                    if (process) { // new or updated
                        crawlSourceFile(sourceFile, sha1, uri);
                    }
                    LOGGER.info(sb.toString());
                }
                if (sourceFile.isDirectory()) {
                    crawl(sourceFile);
                }
            }
        }
    }

    /**
     * Computes the SHA-1 digest of a source file, used for change detection.
     *
     * @param sourceFile file to hash
     * @return hex SHA-1 of the file contents, or the empty string if hashing fails
     */
    private String buildHash(final File sourceFile) {
        String sha1;
        try {
            sha1 = FileUtil.sha1(sourceFile);
        } catch (Exception e) {
            // Log instead of printStackTrace(); an empty hash forces the file
            // to be treated as changed on the next crawl.
            LOGGER.error("unable to build sha1 hash for source file '{}'", sourceFile, e);
            sha1 = "";
        }
        return sha1;
    }

    /**
     * Derives the destination URI for a source file, relative to the content folder.
     *
     * @param sourceFile source file being crawled
     * @return URI without a leading slash, using the configured output extension
     */
    private String buildURI(final File sourceFile) {
        String uri = FileUtil.asPath(sourceFile.getPath()).replace(FileUtil.asPath(contentPath), "");
        boolean noExtensionUri = config.getBoolean(Keys.URI_NO_EXTENSION);
        String noExtensionUriPrefix = config.getString(Keys.URI_NO_EXTENSION_PREFIX);
        if (noExtensionUri && noExtensionUriPrefix != null && noExtensionUriPrefix.length() > 0
                && uri.startsWith(noExtensionUriPrefix)) {
            // convert URI from xxx.html to xxx/index.html
            uri = "/" + FilenameUtils.getPath(uri) + FilenameUtils.getBaseName(uri)
                    + "/index" + config.getString(Keys.OUTPUT_EXTENSION);
        } else {
            // Replace the source extension with the configured output extension.
            // NOTE(review): previously this branch was skipped when no-extension
            // URIs were enabled but the prefix did not match, leaving the source
            // extension (e.g. ".md") in the URI.
            int extensionIndex = uri.lastIndexOf(".");
            if (extensionIndex >= 0) {
                uri = uri.substring(0, extensionIndex) + config.getString(Keys.OUTPUT_EXTENSION);
            }
        }
        // strip off leading / to enable generating non-root based sites
        if (uri.startsWith("/")) {
            uri = uri.substring(1);
        }
        return uri;
    }

    /**
     * Parses a single source file and stores its contents in the database.
     *
     * @param sourceFile source file to parse
     * @param sha1       precomputed hash of the file
     * @param uri        destination URI for the document
     */
    private void crawlSourceFile(final File sourceFile, final String sha1, final String uri) {
        Map<String, Object> fileContents = parser.processFile(sourceFile);
        if (fileContents != null) {
            fileContents.put(Attributes.ROOTPATH, getPathToRoot(sourceFile));
            fileContents.put(String.valueOf(DocumentAttributes.SHA1), sha1);
            fileContents.put(String.valueOf(DocumentAttributes.RENDERED), false);
            if (fileContents.get(Attributes.TAGS) != null) {
                // store them as a String[]
                String[] tags = (String[]) fileContents.get(Attributes.TAGS);
                fileContents.put(Attributes.TAGS, tags);
            }
            fileContents.put(Attributes.FILE, sourceFile.getPath());
            fileContents.put(String.valueOf(DocumentAttributes.SOURCE_URI), uri);
            fileContents.put(Attributes.URI, uri);

            String documentType = (String) fileContents.get(Attributes.TYPE);
            if (fileContents.get(Attributes.STATUS).equals(Status.PUBLISHED_DATE)) {
                // "published-date" documents become "published" once their date has passed.
                if (fileContents.get(Attributes.DATE) != null && (fileContents.get(Attributes.DATE) instanceof Date)) {
                    if (new Date().after((Date) fileContents.get(Attributes.DATE))) {
                        fileContents.put(Attributes.STATUS, Status.PUBLISHED);
                    }
                }
            }
            if (config.getBoolean(Keys.URI_NO_EXTENSION)) {
                fileContents.put(Attributes.NO_EXTENSION_URI, uri.replace("/index.html", "/"));
            }
            ODocument doc = new ODocument(documentType);
            doc.fields(fileContents);
            // The map is keyed by String.valueOf(...) everywhere; looking this up
            // with the raw enum (as before) always returned null, so "cached" was
            // unconditionally true.
            Object cachedValue = fileContents.get(String.valueOf(DocumentAttributes.CACHED));
            boolean cached = cachedValue != null ? Boolean.valueOf((String) cachedValue) : true;
            doc.field(String.valueOf(DocumentAttributes.CACHED), cached);
            doc.save();
        } else {
            LOGGER.warn("{} has an invalid header, it has been ignored!", sourceFile);
        }
    }

    /**
     * Builds a relative path ("../../") from a source file back to the content root.
     *
     * @param sourceFile file below the content folder
     * @return one "../" per directory level between the file and the content root
     */
    public String getPathToRoot(File sourceFile) {
        File rootPath = new File(contentPath);
        File parentPath = sourceFile.getParentFile();
        int parentCount = 0;
        // Null guard: a file outside the content root would otherwise NPE when
        // getParentFile() runs off the top of the filesystem.
        while (parentPath != null && !parentPath.equals(rootPath)) {
            parentPath = parentPath.getParentFile();
            parentCount++;
        }
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < parentCount; i++) {
            sb.append("../");
        }
        return sb.toString();
    }

    /**
     * Determines whether a document is new, updated, or identical to the stored copy.
     *
     * @param docType document type to look up
     * @param uri     source URI of the document
     * @param sha1    current hash of the source file
     * @return {@link DocumentStatus#NEW}, {@link DocumentStatus#UPDATED} or
     *         {@link DocumentStatus#IDENTICAL}
     */
    private DocumentStatus findDocumentStatus(String docType, String uri, String sha1) {
        DocumentList match = db.getDocumentStatus(docType, uri);
        if (!match.isEmpty()) {
            Map<String, Object> entries = match.get(0);
            String oldHash = (String) entries.get(String.valueOf(DocumentAttributes.SHA1));
            // A document also counts as updated when it was stored but never rendered.
            if (!(oldHash.equals(sha1)) || Boolean.FALSE.equals(entries.get(String.valueOf(DocumentAttributes.RENDERED)))) {
                return DocumentStatus.UPDATED;
            } else {
                return DocumentStatus.IDENTICAL;
            }
        } else {
            return DocumentStatus.NEW;
        }
    }
}