CrawlerContext.java example

Explorer

leech-master
- src
  - main
    - java
      - de
        dfki
        km
        leech
        Leech.java
        SubDataEntityContentHandler.java
        config
        CrawlerContext.java
        DirectoryCrawlerContext.java
        HtmlCrawlerContext.java
        ImapCrawlerContext.java
        LeechConfig.java
        detect
        DatasourceMediaTypes.java
        DirectoryDatasourceDetector.java
        ImapDatasourceDetector.java
        LeechDefaultDetector.java
        io
        FileURLStreamProvider.java
        HttpURLStreamProvider.java
        ImapURLStreamProvider.java
        ShiftInitInputStream.java
        URLStreamProvider.java
        lucene
        LeechDefaultFieldConfig.java
        ToLuceneContentHandler.java
        metadata
        LeechMetadata.java
        parser
        CrawlerParser.java
        DirectoryCrawlerParser.java
        HtmlCrawlerParser.java
        ImapCrawlerParser.java
        NonRecursiveCrawlerParser.java
        SambaCrawlerParser.java
        UrlListCrawlerParser.java
        filter
        RegExpPattern.java
        SubstringPattern.java
        URLFilter.java
        URLFilterPattern.java
        URLFilteringParser.java
        incremental
        IncrementalCrawlingHistory.java
        IncrementalCrawlingParser.java
        rss
        FeedParser2.java
        wikipedia
        WikipediaDumpParser.java
        sax
        CrawlReportContentHandler.java
        DataSinkContentHandler.java
        DataSinkContentHandlerAdapter.java
        DataSinkContentHandlerDecorator.java
        PrintlnContentHandler.java
        solr
        ToSolrContentHandler.java
        util
        CookieManager.java
        ExceptionUtils.java
        IndexPostprocessor.java
        LeechException.java
        LuceneIndexCreator.java
        OSUtils.java
        SolrIndexCreator.java
        TikaUtils.java
        UrlUtil.java
        ValueHolder.java
        certificates
        CertificateIgnoringSocketFactory.java
        CertificateStore.java
        Decision.java
        PersistentCertificateStore.java
        RootCertificateStore.java
        SessionCertificateStore.java
        StandardTrustManager.java
        TrustDecider.java

/*
    Leech - crawling capabilities for Apache Tika
    
    Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

    Contact us by mail: christian.reuschling@dfki.de
*/

package de.dfki.km.leech.config;



import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import de.dfki.km.leech.util.CookieManager;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;

import de.dfki.km.leech.parser.filter.URLFilter;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;



/**
 * A class to give a context / configuration for the crawling process. In the CrawlerContext class you find all configuration issues that are common
 * for all crawling parser implementations. There also exist context implementations with configurations that are special for a specific
 * CrawlerParser, e.g. {@link DirectoryCrawlerContext}. For them, have a look at the other classes of this package. Aspects such as incremental
 * indexing, or even stopping a running crawling process can be set here. An Object of this class can be given to the ParseContext.<br>
 * Examples:<br>
 * <code>
 * CrawlerContext crawlerContext = new CrawlerContext().setIncrementalCrawlingHistoryPath("./history/forResourceDir");<br><br>
 * URLFilter boundaries = new URLFilter().addIncludePattern(new SubstringPattern("www.leech.de", SubstringPattern.STARTS_WITH));<br>
 * crawlerContext.setURLFilter(boundaries);<br>
 * <br>
 * ParseContext parseContext = new ParseContext();<br>
 * parseContext.set(CrawlerContext.class, crawlerContext);<br>
 * </code> or for convenience<br>
 * <code>
 * ParseContext parseContext = crawlerContext.createParseContext()<br>
 * </code>
 * 
 * 
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public class CrawlerContext
{

    /** Whether entities that have a history entry but were not touched during the crawl are considered as removed. Default: true. */
    protected Boolean m_bCheckForRemovedEntities = true;

    /** Whether cycle detection is enabled. Default: true. */
    protected Boolean m_bDetectCycles = true;

    /** Whether the whole crawl is interrupted when processing one data entity raises an Exception. Default: false. */
    protected Boolean m_bInterruptIfException = false;

    /**
     * The stop flag. It is volatile because it is written by the thread calling {@link #requestStop()} and polled by the crawler via
     * {@link #stopRequested()}. The Boolean.TRUE object stored here while a stop is pending also serves as the monitor for the wait/notify
     * handshake between both sides.
     */
    protected volatile Boolean m_bStopRequested = false;

    /** Whether the crawling process is verbose. Default: false. */
    protected Boolean m_bVerbose = false;

    /** The content handler that is used during the crawl, may be null. */
    protected ContentHandler m_contentHandler;

    /** The maximum depth of recursive calls. Default: Integer.MAX_VALUE. */
    protected int m_crawlingDepth = Integer.MAX_VALUE;

    /** The lazily created history object for {@link #m_strIncrementalCrawlingHistoryPath}, see {@link #getIncrementalCrawlingHistory()}. */
    protected IncrementalCrawlingHistory m_incrementalCrawlingHistory;

    /** The class name of the content handler that should be instantiated on every recursive call, may be null. */
    protected String m_strContentHandlerClassName;

    /** The path to the incremental crawling history, null or empty if no history should be used. */
    protected String m_strIncrementalCrawlingHistoryPath;

    /** The URL filter / domain boundaries of the crawl. The default is a freshly created URLFilter without any pattern. */
    protected URLFilter m_urlFilter = new URLFilter();

    /** The cookie manager of this context, created together with the context. */
    protected CookieManager m_cookieManager = new CookieManager();

    /** The configured user agent, null if none was set. */
    protected String m_userAgent = null;

    /** The configured user headers, null if none were set. */
    protected Map<String, String> m_userHeaders = null;

    /**
     * Creates a new ParseContext Object with an entry with this {@link CrawlerContext} configuration. This method is only for convenience.
     * 
     * @return the created ParseContext Object.
     */
    public ParseContext createParseContext()
    {
        ParseContext parseContext = new ParseContext();
        parseContext.set(CrawlerContext.class, this);

        return parseContext;
    }

    /**
     * Gets whether or not the crawler should check for removed entities after the crawl. All entities that are not 'touched' during a crawl but
     * have an entry inside the history will be considered as removed. This works well in the case you watch e.g. a directory periodically, and all
     * new differences should be reflected. Nevertheless, there could be the situation where you have an existing history (e.g. for a directory) and
     * want to add a single entity (e.g. file) with a Leech call, to add and process it only in the case it was modified or new with respect to its
     * potential history entry. In this case you don't want to flag all other existing entries beside this one file as removed. In this situation,
     * you can disable the 'removed entity check'.
     * 
     * @return true: all files that were processed during a former crawl and have a history entry, but were not processed/touched during this crawl
     *         will be marked as removed. False otherwise.
     */
    public Boolean getCheckForRemovedEntities()
    {
        return m_bCheckForRemovedEntities;
    }

    /**
     * Gets the contentHandler that will be used during the crawl - and thus for every recursive call.
     * 
     * @return the contentHandler that will be used during the crawl - and thus for every recursive call.
     */
    public ContentHandler getContentHandler()
    {
        return m_contentHandler;
    }

    /**
     * Gets the class name for the content handler that should be instantiated on every recursive call during the crawl. In the case it is null or no
     * CrawlerConfig is used at all, Leech will reuse the given contenthandler, which is only possible if this instance is in a reusable state after a
     * parse operation (e.g. non re-initialised writers, etc.). Make sure that this is the case by e.g. clearing the internal states inside the
     * endDocument() method.
     * 
     * @return the class name for the content handler that should be instantiated on every recursive call during the crawl
     */
    public String getContentHandlerClassName()
    {
        return m_strContentHandlerClassName;
    }



    /**
     * Gets the maximum depth of recursive calls the crawling process will follow. Default is Integer.MAX_VALUE
     * 
     * @return the maximum depth of recursive calls the crawling process will follow. Default is Integer.MAX_VALUE
     */
    public int getCrawlingDepth()
    {
        return m_crawlingDepth;
    }



    /**
     * Gets whether the crawlers should detect cycles during the crawl or not. Cycle detection might be not necessary e.g. when you crawl a file
     * system directory without following symbolic links. Nevertheless you could run into a hard link cycle. Cycle detection is important when you
     * e.g. crawl websites, where links easily can result into cyclic structures. If cycle detection is enabled, Leech simply enables a temporary
     * incremental crawling history for this crawl, that will be removed after the crawl. This also means that when you index incrementally by
     * specifying an incremental crawling history, cycle detection is given anyway - no further history will be created by enabling cycle detection
     * with this CrawlerContext object. The default is the enabled cycle detection - which is more or less a no-brainer, unless you have really hard
     * performance constraints.
     * 
     * @return true in the case cycle detection is enabled, false otherwise. Note that if you specify an incremental crawling history, cycle
     *         detection is given anyway.
     */
    public Boolean getDetectCycles()
    {
        return m_bDetectCycles;
    }



    /**
     * Gets an IncrementalCrawlingHistory Object for the configured IncrementalCrawlingHistoryPath. At first invocation, the history Object will be
     * created and then reused for every further invocation. A path that is null or empty means - as documented for
     * {@link #setIncrementalCrawlingHistoryPath(String)} - that no crawling history is used.
     * 
     * @return the IncrementalCrawlingHistory Object for the configured IncrementalCrawlingHistoryPath, null in the case no path (null or empty) is
     *         configured.
     */
    public IncrementalCrawlingHistory getIncrementalCrawlingHistory()
    {
        // null AND empty both mean 'no history' according to the contract of the path property
        if(m_strIncrementalCrawlingHistoryPath == null || m_strIncrementalCrawlingHistoryPath.isEmpty()) return null;

        if(m_incrementalCrawlingHistory == null) m_incrementalCrawlingHistory = new IncrementalCrawlingHistory(m_strIncrementalCrawlingHistoryPath);


        return m_incrementalCrawlingHistory;
    }



    /**
     * Gets a path to the incremental crawling history. In the case a path is specified, the crawlers will use incremental parsing, which means that
     * they check whether a data entity is new, modified or deleted. Time consuming extraction will only be performed in the new- and modified case.
     * A DataSinkContentHandler will take care that deleted entities will also be deleted from the data sink. In the case an entity has not changed
     * at all, no extraction of the data will be performed. In the case the path is specified as null or empty, no crawling history will be used -
     * everything will be simply extracted.
     * 
     * @return the path to the crawling history that is used - null or empty in the case no crawling history will be used (which is the default)
     */
    public String getIncrementalCrawlingHistoryPath()
    {
        return m_strIncrementalCrawlingHistoryPath;
    }



    /**
     * Gets whether the whole crawling process will be interrupted in the case of an Exception while processing one data entity or not
     * 
     * @return true: the whole crawling process will be interrupted in the case of an exception, false otherwise. The default is to not interrupt.
     */
    public Boolean getInterruptIfException()
    {
        return m_bInterruptIfException;
    }



    /**
     * Gets the domain boundaries to constrain the data entities that should be considered during this crawl
     * 
     * @return the domain boundaries to constrain the data entities that should be considered during this crawl. The default is a domain boundary
     *         that skips nothing.
     */
    public URLFilter getURLFilter()
    {
        return m_urlFilter;
    }



    /**
     * Gets whether the crawling process is verbose or not
     * 
     * @return true: verbosity on, false otherwise
     */
    public Boolean getVerbose()
    {
        return m_bVerbose;
    }



    /**
     * Request to stop the crawling process. The method will wait until the crawling process is stopped (by performing a wait() on the return value of
     * stopRequested()). The currently running crawler will call a notify when finished. If a stop request is already pending, the method returns
     * immediately without waiting.<br>
     * In the case the waiting thread is interrupted, the error is logged, the stop request stays pending and the interrupt status of the thread is
     * restored, so the caller is able to react on it.<br>
     * Note: there is no further condition the waiting side could re-check, thus a spurious wakeup is indistinguishable from the crawlers notify.
     */
    public void requestStop()
    {
        if(Boolean.TRUE.equals(m_bStopRequested)) return;

        // Boxing 'true' always results in the Boolean.TRUE object, so this is exactly the object stopRequested() returns while a stop is pending -
        // the crawlers perform their notify() on it. We hold it in a final local because the field is reassigned inside the synchronized block.
        final Boolean stopMonitor = Boolean.TRUE;

        synchronized (stopMonitor)
        {
            // somebody else was faster and already waits for the crawler
            if(Boolean.TRUE.equals(m_bStopRequested)) return;

            // The flag is set while we own the monitor: a crawler that sees the request can not enter its own synchronized block (and notify)
            // before wait() releases the monitor - the notification can not get lost anymore.
            m_bStopRequested = stopMonitor;

            try
            {
                stopMonitor.wait();

                m_bStopRequested = false;
            }
            catch (InterruptedException e)
            {
                Logger.getLogger(CrawlerContext.class.getName()).log(Level.SEVERE, "Error", e);

                // don't swallow the interruption
                Thread.currentThread().interrupt();
            }
        }
    }




    /**
     * Sets whether or not the crawler should check for removed entities after the crawl. All entities that are not 'touched' during a crawl but
     * have an entry inside the history will be considered as removed. This works well in the case you watch e.g. a directory periodically, and all
     * new differences should be reflected. Nevertheless, there could be the situation where you have an existing history (e.g. for a directory) and
     * want to add a single entity (e.g. file) with a Leech call, to add and process it only in the case it was modified or new with respect to its
     * potential history entry. In this case you don't want to flag all other existing entries beside this one file as removed. In this situation,
     * you can disable the 'removed entity check' with this method.
     * 
     * @param checkForRemovedEntities true: all files that were processed during a former crawl and have a history entry, but were not
     *            processed/touched during this crawl will be marked as removed. False otherwise.
     * 
     * @return this
     */
    public CrawlerContext setCheckForRemovedEntities(Boolean checkForRemovedEntities)
    {
        m_bCheckForRemovedEntities = checkForRemovedEntities;

        return this;
    }



    /**
     * Sets the contentHandler that will be used during the crawl - and thus for every recursive call. You can specify the contentHandler either here,
     * inside the CrawlerContext, or simply use one of the Leech methods with a contentHandler parameter. These methods simply invoke this
     * setContentHandler method. In the case the ContentHandlerClassName was set, this contentHandler object will be ignored.
     * 
     * @param contentHandler the ContentHandler that should be used during the crawl
     * 
     * @return this for convenience
     */
    public CrawlerContext setContentHandler(ContentHandler contentHandler)
    {
        m_contentHandler = contentHandler;

        return this;
    }



    /**
     * Specifies the class name for the content handler that should be instantiated on every recursive call during the crawl. In the case it is null
     * or no CrawlerConfig is used at all, Leech will reuse the given contenthandler object also specified inside setContentHandler, which is only
     * possible if this object is in a reusable state after a parse operation (e.g. non re-initialised writers, etc.). Make sure that this is the case
     * by e.g. clearing the internal states inside the endDocument() method.
     * 
     * @param strContentHandlerClassName the class name for the content handler that should be instantiated on every recursive call during the crawl
     * 
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setContentHandlerClassName(String strContentHandlerClassName)
    {
        m_strContentHandlerClassName = strContentHandlerClassName;

        return this;
    }



    /**
     * Sets the maximum depth of recursive calls the crawling process will follow. Default is Integer.MAX_VALUE
     * 
     * @param crawlingDepth the maximum depth of recursive calls the crawling process will follow. Default is Integer.MAX_VALUE
     * 
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setCrawlingDepth(int crawlingDepth)
    {
        m_crawlingDepth = crawlingDepth;

        return this;
    }



    /**
     * Sets whether the crawlers should detect cycles during the crawl or not. Cycle detection might be not necessary e.g. when you crawl a file
     * system directory without following symbolic links. Nevertheless you could run into a hard link cycle. Cycle detection is important when you
     * e.g. crawl websites, where links easily can result into cyclic structures. If cycle detection is enabled, Leech simply enables a temporary
     * incremental crawling history for this crawl, that will be removed after the crawl. This also means that when you index incrementally by
     * specifying an incremental crawling history, cycle detection is given anyway - no further history will be created by enabling cycle detection
     * with this method. The default is the enabled cycle detection.<br>
     * Note that - in contrast to the other setters of this class - this method returns nothing, so it can not be chained.
     * 
     * @param detectCycles true in the case you want to enable cycle detection (default), false otherwise. In the case you specified incremental
     *            indexing, cycle detection is given anyway. (but you don't have to disable it with this method for performance reasons).
     */
    public void setDetectCycles(Boolean detectCycles)
    {
        m_bDetectCycles = detectCycles;
    }



    /**
     * Sets a path to the incremental crawling history. In the case a path is specified, the crawlers will use incremental parsing, which means that
     * they check whether a data entity is new, modified or deleted. Time consuming extraction will only be performed in the new- and modified case.
     * A DataSinkContentHandler will take care that deleted entities will also be deleted from the data sink. In the case an entity has not changed
     * at all, no extraction of the data will be performed. In the case the path is specified as null or empty, no crawling history will be used -
     * everything will be simply extracted.
     * 
     * @param strIncrementalCrawlingHistoryPath the path to the crawling history that should be used - null or empty in the case no crawling history
     *            should be used (which is the default)
     * 
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setIncrementalCrawlingHistoryPath(String strIncrementalCrawlingHistoryPath)
    {
        m_strIncrementalCrawlingHistoryPath = strIncrementalCrawlingHistoryPath;

        return this;
    }



    /**
     * In the case there is an Exception when one data entity is processed, the whole crawling process will be interrupted or not. The default is to
     * not interrupt.
     * 
     * @param bInterruptIfException true: the whole crawling process will be interrupted in the case of an exception, false otherwise.
     * 
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setInterruptIfException(Boolean bInterruptIfException)
    {
        m_bInterruptIfException = bInterruptIfException;

        return this;
    }



    /**
     * URLFilter uses patterns (regular expressions or substring checks) to determine whether a URL/source string belongs to a datasource domain or
     * not. Use these patterns in case you want to constrain the crawling process to some root directories, web domains, ... or if you want to
     * exclude some specific directories/files/links, etc. <br>
     * Example:<br>
     * <code>
     * URLFilter boundaries = new URLFilter().addIncludePattern(new SubstringPattern("www.leech.de", SubstringPattern.STARTS_WITH));
     * </code> <br>
     * or <br>
     * <code>
     * URLFilter boundaries = new URLFilter().addExcludePattern(new SubstringPattern("liquorice", SubstringPattern.CONTAINS));
     * </code>
     * 
     * 
     * @param urlFilter the domain boundaries to constrain the data entities that should be considered during this crawl
     * 
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setURLFilter(URLFilter urlFilter)
    {
        m_urlFilter = urlFilter;

        return this;
    }



    /**
     * Sets the crawling process to verbose. Some messages as skipped entities will be shown additionally
     * 
     * @param bVerbose true: verbosity on, false otherwise
     * 
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setVerbose(Boolean bVerbose)
    {
        m_bVerbose = bVerbose;

        return this;
    }



    /**
     * Used to check by a crawler or other parser implementation whether a stop was requested or not. In the case a stop was requested, the parser
     * implementation has to call a notify() or notifyAll() to the returned Boolean Object when finished, to 'wake up' the waiting
     * {@link #requestStop()} method.
     * 
     * @return true in the case a stop was requested. Don't forget to call a notify() on the returned Object in order to 'wake up' the waiting
     *         {@link #requestStop()} method.
     */
    public Boolean stopRequested()
    {
        return m_bStopRequested;
    }

    /**
     * Gets the cookie manager of this context. The instance is created together with the context and there is no setter to exchange it.
     * 
     * @return the cookie manager of this context
     */
    public CookieManager getCookieManager()
    {
        return m_cookieManager;
    }

    /**
     * Sets the user agent. How the value is used is not decided inside this class - presumably the web crawling parts send it with their requests.
     * 
     * @param userAgent the user agent, null for none (which is the default)
     * 
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setUserAgent(String userAgent)
    {
        m_userAgent = userAgent;

        return this;
    }

    /**
     * Gets the user agent that was set with {@link #setUserAgent(String)}.
     * 
     * @return the configured user agent, null in the case none was set (which is the default)
     */
    public String getUserAgent()
    {
        return m_userAgent;
    }

    /**
     * Sets the user headers. The given map is stored as it is (no copy), later modifications of the map are visible through this context. How the
     * entries are used is not decided inside this class - presumably they are sent as additional request headers.
     * 
     * @param userHeaders the user headers, null for none (which is the default)
     * 
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setUserHeaders(Map<String, String> userHeaders)
    {
        m_userHeaders = userHeaders;

        return this;
    }

    /**
     * Gets the user headers that were set with {@link #setUserHeaders(Map)}.
     * 
     * @return the configured user headers (the very same map object that was set), null in the case none were set (which is the default)
     */
    public Map<String, String> getUserHeaders()
    {
        return m_userHeaders;
    }
}