/*
 * Leech - crawling capabilities for Apache Tika
 *
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the
 * Free Software Foundation, either version 3 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 *
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech.config;

import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;

import de.dfki.km.leech.util.CookieManager;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;

import de.dfki.km.leech.parser.filter.URLFilter;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;

/**
 * A class to give a context / configuration for the crawling process. In the CrawlerContext class you find all configuration issues that are common
 * for all crawling parser implementations. There exist also context implementations with configurations that are special for a specific
 * CrawlerParser, e.g. {@link DirectoryCrawlerContext}. For them, have a look at the other classes of this package. Aspects such as incremental
 * indexing, or even stopping a running crawling process can be set here. An Object of this class can be given to the ParseContext.<br>
 * Examples:<br>
 * <code>
 * CrawlerContext crawlerContext = new CrawlerContext().setIncrementalCrawlingHistoryPath("./history/forResourceDir");<br><br>
 * URLFilter boundaries = new URLFilter().addIncludePattern(new SubstringPattern("www.leech.de", SubstringPattern.STARTS_WITH));<br>
 * crawlerContext.setURLFilter(boundaries);<br>
 * <br>
 * ParseContext parseContext = new ParseContext();<br>
 * parseContext.set(CrawlerContext.class, crawlerContext);<br>
 * </code> or for convenience<br>
 * <code>
 * ParseContext parseContext = crawlerContext.createParseContext()<br>
 * </code>
 *
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public class CrawlerContext {

    protected Boolean m_bCheckForRemovedEntities = true;

    protected Boolean m_bDetectCycles = true;

    protected Boolean m_bInterruptIfException = false;

    // Written by the thread calling requestStop() and read by the crawling thread, hence volatile.
    protected volatile Boolean m_bStopRequested = false;

    protected Boolean m_bVerbose = false;

    protected ContentHandler m_contentHandler;

    protected int m_crawlingDepth = Integer.MAX_VALUE;

    // Created lazily by getIncrementalCrawlingHistory().
    protected IncrementalCrawlingHistory m_incrementalCrawlingHistory;

    protected String m_strContentHandlerClassName;

    protected String m_strIncrementalCrawlingHistoryPath;

    protected URLFilter m_urlFilter = new URLFilter();

    protected CookieManager m_cookieManager = new CookieManager();

    protected String m_userAgent = null;

    protected Map<String, String> m_userHeaders = null;

    /**
     * Creates a new ParseContext Object with an entry with this {@link CrawlerContext} configuration. This method is only for convenience.
     *
     * @return the created ParseContext Object.
     */
    public ParseContext createParseContext() {
        ParseContext parseContext = new ParseContext();
        parseContext.set(CrawlerContext.class, this);

        return parseContext;
    }

    /**
     * Gets whether or not the crawler should check for removed entities after the crawl. All entities that are not 'touched' during a crawl but have
     * an entry inside the history will be considered as removed. This works well in the case you watch e.g. a directory periodically, and all new
     * differences should be reflected. Nevertheless, there could be the situation where you have an existing history (e.g. for a directory) and want
     * to add a single entity (e.g. file) with a Leech call, to add and process it only in the case it was modified or new with respect to its
     * potential history entry. In this case you don't want to flag all other existing entries beside this one file as removed. In this situation, you
     * can disable the 'removed entity check'.
     *
     * @return true: all files that were processed during a former crawl and have a history entry, but were not processed/touched during this crawl
     *         will be marked as removed. False otherwise.
     */
    public Boolean getCheckForRemovedEntities() {
        return m_bCheckForRemovedEntities;
    }

    /**
     * Gets the contentHandler that will be used during the crawl - and thus for every recursive call.
     *
     * @return the contentHandler that will be used during the crawl - and thus for every recursive call.
     */
    public ContentHandler getContentHandler() {
        return m_contentHandler;
    }

    /**
     * Gets the class name for the content handler that should be instantiated on every recursive call during the crawl. In the case it is null or no
     * CrawlerConfig is used at all, Leech will reuse the given contenthandler, which is only possible if this instance is in a reusable state after a
     * parse operation (e.g. non re-initialised writers, etc.). Make sure that this is the case by e.g. clearing the internal states inside the
     * endDocument() method.
     *
     * @return the class name for the content handler that should be instantiated on every recursive call during the crawl
     */
    public String getContentHandlerClassName() {
        return m_strContentHandlerClassName;
    }

    /**
     * Gets the maximum depth of recursive calls the crawling process will follow. Default is Integer.MAX_VALUE
     *
     * @return the maximum depth of recursive calls the crawling process will follow. Default is Integer.MAX_VALUE
     */
    public int getCrawlingDepth() {
        return m_crawlingDepth;
    }

    /**
     * Gets whether the crawlers should detect cycles during the crawl or not. Cycle detection might be not necessary e.g. when you crawl a file
     * system directory without following symbolic links. Nevertheless you could run into a hard link cycle. Cycle detection is important when you
     * e.g. crawl websites, where links easily can result into cyclic structures. If cycle detection is enabled, Leech simply enables a temporary
     * incremental crawling history for this crawl, that will be removed after the crawl. This also means that when you index incrementally by
     * specifying an incremental crawling history, cycle detection is given anyway - no further history will be created by enabling cycle detection
     * with this CrawlerContext object. The default is the enabled cycle detection - which is more or less a no-brainer, unless you have really hard
     * performance constraints.
     *
     * @return true in the case cycle detection is enabled, false otherwise. Note that if you specify an incremental crawling history, cycle detection
     *         is given anyway.
     */
    public Boolean getDetectCycles() {
        return m_bDetectCycles;
    }

    /**
     * Gets an IncrementalCrawlingHistory Object for the configured IncrementalCrawlingHistoryPath. At first invocation, the history Object will be
     * created and then reused on later invocations.
     *
     * @return the IncrementalCrawlingHistory Object for the configured IncrementalCrawlingHistoryPath, null in the case no path is configured (the
     *         path is null or empty).
     */
    public IncrementalCrawlingHistory getIncrementalCrawlingHistory() {
        // The path contract documents both null and empty as 'no crawling history'.
        if (m_strIncrementalCrawlingHistoryPath == null || m_strIncrementalCrawlingHistoryPath.isEmpty()) {
            return null;
        }

        if (m_incrementalCrawlingHistory == null) {
            m_incrementalCrawlingHistory = new IncrementalCrawlingHistory(m_strIncrementalCrawlingHistoryPath);
        }

        return m_incrementalCrawlingHistory;
    }

    /**
     * Gets a path to the incremental crawling history. In the case a path is specified, the crawlers will use incremental parsing, which means that
     * they check whether a data entity is new, modified or deleted. Time consuming extraction will only be performed in the new- and modified case. A
     * DataSinkContentHandler will take care that deleted entities will also be deleted from the data sink. In the case an entity has not changed at
     * all, no extraction of the data will be performed. In the case the path is specified as null or empty, no crawling history will be used -
     * everything will be simply extracted.
     *
     * @return the path to the crawling history that is used - null or empty in the case no crawling history will be used (which is the default)
     */
    public String getIncrementalCrawlingHistoryPath() {
        return m_strIncrementalCrawlingHistoryPath;
    }

    /**
     * Gets whether the whole crawling process will be interrupted in the case of an Exception while processing one data entity or not
     *
     * @return true: the whole crawling process will be interrupted in the case of an exception, false otherwise. The default is to not interrupt.
     */
    public Boolean getInterruptIfException() {
        return m_bInterruptIfException;
    }

    /**
     * Gets the domain boundaries to constrain the data entities that should be considered during this crawl
     *
     * @return the domain boundaries to constrain the data entities that should be considered during this crawl. The default is a domainboundary that
     *         skips nothing.
     */
    public URLFilter getURLFilter() {
        return m_urlFilter;
    }

    /**
     * Gets whether the crawling process is verbose or not
     *
     * @return true: verbosity on, false otherwise
     */
    public Boolean getVerbose() {
        return m_bVerbose;
    }

    /**
     * Request to stop the crawling process. The method will wait until the crawling process is stopped (by performing a wait() on the return value of
     * stopRequested()). The currently running crawler will call a notify when finished. If a stop was already requested, the method returns
     * immediately.
     * <p>
     * The stop flag is raised while the monitor is already held, so a crawler that observes the flag can not deliver its notify before this method
     * has started waiting. If the waiting thread is interrupted, the interrupt status is restored, the error is logged, and the stop flag stays
     * raised.
     * <p>
     * NOTE(review): the monitor is the shared {@link Boolean#TRUE} instance (this is what {@link #stopRequested()} hands out to the crawler), so it
     * is shared JVM-wide, and the wait is not guarded against spurious wake-ups. Changing this needs a coordinated change in the crawler
     * implementations.
     */
    public void requestStop() {
        // Autoboxing 'true' always yields Boolean.TRUE; pin it in a local so that the object we lock, publish and wait on is guaranteed to be the same
        // one, even if the field is reassigned concurrently.
        final Boolean stopMonitor = Boolean.TRUE;

        synchronized (stopMonitor) {
            if (m_bStopRequested) {
                return;
            }

            m_bStopRequested = stopMonitor;

            try {
                stopMonitor.wait();
                m_bStopRequested = false;
            } catch (InterruptedException e) {
                // Keep the interrupt visible for the code further up the call stack.
                Thread.currentThread().interrupt();
                Logger.getLogger(CrawlerContext.class.getName()).log(Level.SEVERE, "Error", e);
            }
        }
    }

    /**
     * Sets whether or not the crawler should check for removed entities after the crawl. All entities that are not 'touched' during a crawl but have
     * an entry inside the history will be considered as removed. This works well in the case you watch e.g. a directory periodically, and all new
     * differences should be reflected. Nevertheless, there could be the situation where you have an existing history (e.g. for a directory) and want
     * to add a single entity (e.g. file) with a Leech call, to add and process it only in the case it was modified or new with respect to its
     * potential history entry. In this case you don't want to flag all other existing entries beside this one file as removed. In this situation, you
     * can disable the 'removed entity check' with this method.
     *
     * @param checkForRemovedEntities true: all files that were processed during a former crawl and have a history entry, but were not
     *            processed/touched during this crawl will be marked as removed. False otherwise.
     *
     * @return this
     */
    public CrawlerContext setCheckForRemovedEntities(Boolean checkForRemovedEntities) {
        m_bCheckForRemovedEntities = checkForRemovedEntities;

        return this;
    }

    /**
     * Sets the contentHandler that will be used during the crawl - and thus for every recursive call. You can specify the contentHandler either here,
     * inside the CrawlerContext, or simply use one of the Leech methods with a contentHandler parameter. These methods simply invoke this
     * setContentHandler method. In the case the ContentHandlerClassName was set, this contentHandler object will be ignored.
     *
     * @param contentHandler the ContentHandler that should be used during the crawl
     *
     * @return this for convenience
     */
    public CrawlerContext setContentHandler(ContentHandler contentHandler) {
        m_contentHandler = contentHandler;

        return this;
    }

    /**
     * Specifies the class name for the content handler that should be instantiated on every recursive call during the crawl. In the case it is null
     * or no CrawlerConfig is used at all, Leech will reuse the given contenthandler object also specified inside setContentHandler, which is only
     * possible if this object is in a reusable state after a parse operation (e.g. non re-initialised writers, etc.). Make sure that this is the case
     * by e.g. clearing the internal states inside the endDocument() method.
     *
     * @param strContentHandlerClassName the class name for the content handler that should be instantiated on every recursive call during the crawl
     *
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setContentHandlerClassName(String strContentHandlerClassName) {
        m_strContentHandlerClassName = strContentHandlerClassName;

        return this;
    }

    /**
     * Sets the maximum depth of recursive calls the crawling process will follow. Default is Integer.MAX_VALUE
     *
     * @param crawlingDepth the maximum depth of recursive calls the crawling process will follow. Default is Integer.MAX_VALUE
     *
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setCrawlingDepth(int crawlingDepth) {
        m_crawlingDepth = crawlingDepth;

        return this;
    }

    /**
     * Sets whether the crawlers should detect cycles during the crawl or not. Cycle detection might be not necessary e.g. when you crawl a file
     * system directory without following symbolic links. Nevertheless you could run into a hard link cycle. Cycle detection is important when you
     * e.g. crawl websites, where links easily can result into cyclic structures. If cycle detection is enabled, Leech simply enables a temporary
     * incremental crawling history for this crawl, that will be removed after the crawl. This also means that when you index incrementally by
     * specifying an incremental crawling history, cycle detection is given anyway - no further history will be created by enabling cycle detection
     * with this method. The default is the enabled cycle detection.
     *
     * @param detectCycles true in the case you want to enable cycle detection (default), false otherwise. In the case you specified incremental
     *            indexing, cycle detection is given anyway. (but you don't have to disable it with this method for performance reasons).
     */
    public void setDetectCycles(Boolean detectCycles) {
        m_bDetectCycles = detectCycles;
    }

    /**
     * Sets a path to the incremental crawling history. In the case a path is specified, the crawlers will use incremental parsing, which means that
     * they check whether a data entity is new, modified or deleted. Time consuming extraction will only be performed in the new- and modified case. A
     * DataSinkContentHandler will take care that deleted entities will also be deleted from the data sink. In the case an entity has not changed at
     * all, no extraction of the data will be performed. In the case the path is specified as null or empty, no crawling history will be used -
     * everything will be simply extracted.
     * <p>
     * NOTE(review): a history Object that was already created by {@link #getIncrementalCrawlingHistory()} is kept as it is; a path changed
     * afterwards does not replace it.
     *
     * @param strIncrementalCrawlingHistoryPath the path to the crawling history that should be used - null or empty in the case no crawling history
     *            should be used (which is the default)
     *
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setIncrementalCrawlingHistoryPath(String strIncrementalCrawlingHistoryPath) {
        m_strIncrementalCrawlingHistoryPath = strIncrementalCrawlingHistoryPath;

        return this;
    }

    /**
     * In the case there is an Exception when one data entity is processed, the whole crawling process will be interrupted or not. The default is to
     * not interrupt.
     *
     * @param bInterruptIfException true: the whole crawling process will be interrupted in the case of an exception, false otherwise.
     *
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setInterruptIfException(Boolean bInterruptIfException) {
        m_bInterruptIfException = bInterruptIfException;

        return this;
    }

    /**
     * URLFilter uses patterns (regular expressions or substrings checks) to determine whether a URL/source string belongs to a datasource domain or
     * not. Use these patterns in case you want to constrain the crawling process to some root directories, web domains, ... or if you want to
     * exclude some specific directories/files/links, etc. <br>
     * Example:<br>
     * <code>
     * URLFilter boundaries = new URLFilter().addIncludePattern(new SubstringPattern("www.leech.de", SubstringPattern.STARTS_WITH));
     * </code> <br>
     * or <br>
     * <code>
     * URLFilter boundaries = new URLFilter().addExcludePattern(new SubstringPattern("liquorice", SubstringPattern.CONTAINS));
     * </code>
     *
     * @param urlFilter the domain boundaries to constrain the data entities that should be considered during this crawl
     *
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setURLFilter(URLFilter urlFilter) {
        m_urlFilter = urlFilter;

        return this;
    }

    /**
     * Sets the crawling process to verbose. Some messages such as skipped entities will be shown additionally
     *
     * @param bVerbose true: verbosity on, false otherwise
     *
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setVerbose(Boolean bVerbose) {
        m_bVerbose = bVerbose;

        return this;
    }

    /**
     * Used to check by a crawler or other parser implementation whether a stop was requested or not. In the case a stop was requested, the parser
     * implementation has to call a notify() or notifyAll() on the returned Boolean Object when finished, to 'wake up' the waiting
     * {@link #requestStop()} method. As with every notify(), the caller has to hold the monitor of the returned Object (synchronized block) while
     * notifying.
     *
     * @return true in the case a stop was requested. Don't forget to call a notify() on the returned Object in order to 'wake up' the waiting
     *         {@link #requestStop()} method.
     */
    public Boolean stopRequested() {
        return m_bStopRequested;
    }

    /**
     * Gets the cookie manager held by this context. A fresh CookieManager instance is created together with the context.
     *
     * @return the cookie manager of this context
     */
    public CookieManager getCookieManager() {
        return m_cookieManager;
    }

    /**
     * Sets the user agent string stored in this context.
     *
     * @param userAgent the user agent string, may be null
     *
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setUserAgent(String userAgent) {
        m_userAgent = userAgent;

        return this;
    }

    /**
     * Gets the user agent string stored in this context.
     *
     * @return the user agent string, null in the case none was set (which is the default)
     */
    public String getUserAgent() {
        return m_userAgent;
    }

    /**
     * Sets the user defined headers stored in this context. The given map is stored as it is, no copy is made.
     *
     * @param userHeaders the user defined headers as name/value pairs, may be null
     *
     * @return this with the new entry. For convenience.
     */
    public CrawlerContext setUserHeaders(Map<String, String> userHeaders) {
        m_userHeaders = userHeaders;

        return this;
    }

    /**
     * Gets the user defined headers stored in this context.
     *
     * @return the user defined headers as name/value pairs, null in the case none were set (which is the default)
     */
    public Map<String, String> getUserHeaders() {
        return m_userHeaders;
    }
}