/*
* Leech - crawling capabilities for Apache Tika
*
* Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
*
* This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
* either version 3 of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contact us by mail: christian.reuschling@dfki.de
*/
package de.dfki.km.leech;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URL;
import java.rmi.server.UID;
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.mail.URLName;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import de.dfki.inquisition.text.StringUtils;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.config.DirectoryCrawlerContext;
import de.dfki.km.leech.config.LeechConfig;
import de.dfki.km.leech.io.URLStreamProvider;
import de.dfki.km.leech.parser.DirectoryCrawlerParser;
import de.dfki.km.leech.parser.filter.URLFilteringParser;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser;
import de.dfki.km.leech.sax.CrawlReportContentHandler;
import de.dfki.km.leech.sax.DataSinkContentHandler;
import de.dfki.km.leech.sax.PrintlnContentHandler;
import de.dfki.km.leech.sax.PrintlnContentHandler.Verbosity;
import de.dfki.km.leech.util.ExceptionUtils;
import de.dfki.km.leech.util.UrlUtil;
/**
* This is the main class, the entry point. Feel free to select one of the plenty of parse-methods<br>
* <br>
* Crawling in Leech will be performed by using a ContentHandler, that is invoked for every data entity that is recognized during the crawl. Leech offers a special
* {@link DataSinkContentHandler} with abstract methods you can implement to store the data into your data store, e.g. a Lucene index. By using
* {@link DataSinkContentHandler}, it is easy for you to perform incremental indexing, which means that during a crawl, Leech remembers which data entities were crawled, and
* at subsequent crawls only those data entities that are new or have changed will be parsed again. Further, you will get the information which data entities were removed
* since the last crawl. <br>
* To configure a crawl, there exist several **Context classes for passing into the ParseContext. E.g. a CrawlerContext Object will be used for all configuration
* parameters that are common for all types of possible datasources. There you can pass things like crawling depth or the path to an incremental crawling history. You
* also can request stopping a running crawling process.<br>
* There exist some more specialised **Context classes to adjust the crawling of specific data sources. For example, {@link DirectoryCrawlerContext} lets you choose
* whether symbolic links should be followed or not during crawling a file system. Look into the package de.dfki.km.leech.config for all Context classes offered by Leech.<br>
* <br>
* Now some examples as a starting point.<br>
* To enable incremental indexing during a crawl, pass a {@link CrawlerContext} instance with a path to the history into the ParseContext parameter of a Leech.parse(..)
* method. Leech will create a new history in the case there is no existing one under the given path. {@link PrintlnContentHandler} is an implementation of
* {@link DataSinkContentHandler} which simply writes all content including metadata to stdout:<br>
* <br>
* <code>
* Leech leech = new Leech();<br>
* CrawlerContext crawlerContext = new CrawlerContext().setIncrementalCrawlingHistoryPath("./history/forResourceDir");<br>
* leech.parse(new File("resource"), new PrintlnContentHandler(), crawlerContext.createParseContext());<br>
* </code> <br>
* To request stopping a crawl (should be invoked in a different thread than leech.parse(..)):<br>
* <br>
* <code>
* crawlerContext.requestStop()
* </code>
*
*
* @author Christian Reuschling, Dipl.Ing.(BA)
*/
public class Leech extends Tika
{
/**
 * Command line entry point for quick tests. Logs a usage text first, then crawls every source given as argument, one after another, with a
 * {@link CrawlReportContentHandler} that wraps a {@link PrintlnContentHandler} with full verbosity.
 *
 * @param args the sources to crawl, either URL strings or file paths
 *
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public static void main(String[] args) throws IOException, SAXException, TikaException
{
    final Logger logger = Logger.getLogger(Leech.class.getName());
    final String strUsage = "Usage: leech <source2crawl_1> <source2crawl_2> ... <source2crawl_N>\n\n"
            + "A source can be an URL for file://, http://, imap:// or -maybe in future- other urls (e.g. for databases, webDAV, etc...).\n"
            + "In the case the string is no correct url string, the method will use the string as file path and then generates an\n"
            + "according URL. Examples: 'file://myDataDir', 'file://bla.pdf', 'http://reuschling.github.com/leech/',\n"
            + "'imap://usr:pswd@myImapServer.de:993/inbox', 'imaps://usr:pswd@myImapServer.de:993/inbox;uid=22'\n\n"
            + "This executable crawls all data and simply shows the metadata on the screen. Because leech is designed to be used as a\n"
            + "java library, this exec is for quick testing purposes.\n\n";
    logger.info(strUsage);

    Leech leech = new Leech();
    CrawlerContext crawlerContext = new CrawlerContext();
    PrintlnContentHandler printlnHandler = new PrintlnContentHandler(Verbosity.all);
    CrawlReportContentHandler reportContentHandler = new CrawlReportContentHandler(printlnHandler).setCyclicReportPrintln(7000);

    // every source gets its own, freshly created ParseContext
    for (int i = 0; i < args.length; i++)
    {
        String strSource2Crawl = args[i];
        logger.info("Will start crawling " + strSource2Crawl + '\n');
        leech.parse(strSource2Crawl, reportContentHandler, crawlerContext.createParseContext());
    }
}
/**
 * Creates a Leech instance that is set up with the configuration returned by {@link LeechConfig#getDefaultLeechConfig()}.
 */
public Leech()
{
super(LeechConfig.getDefaultLeechConfig());
}
/**
 * Creates a Leech instance that is set up with the given configuration.
 *
 * @param leechConfig the configuration that is handed over to the Tika super class
 */
public Leech(LeechConfig leechConfig)
{
super(leechConfig);
}
/**
 * Detects the media type of the given file. The file is converted into a {@link URLName}, the work is done by {@link #detect(URLName)}.
 *
 * @param file the file to inspect
 *
 * @return whatever {@link #detect(URLName)} returns for the URL of the file
 *
 * @throws IOException
 */
@Override
public String detect(File file) throws IOException
{
    URL fileUrl = file.toURI().toURL();
    URLName fileUrlName = new URLName(fileUrl);
    return detect(fileUrlName);
}
/**
 * Not supported, use {@link #detect(URLName)} instead.
 *
 * @param url ignored
 *
 * @return never returns normally
 *
 * @throws UnsupportedOperationException always
 */
@Override
public String detect(URL url)
{
    final String strMessage =
            "The java.net.URL class methods are not supported because our mechanism supporting new protocols and the according stream creation differ.\n"
                    + "Use the according URLName method instead";
    throw new UnsupportedOperationException(strMessage);
}
/**
 * Detects the media type of the data behind the given URL. The stream and the first metadata entries are created by the {@link URLStreamProvider} that is
 * responsible for the URL. Any error that occurs during this is logged with level SEVERE and results in a <code>null</code> return value.
 *
 * @param url the URL of the data to inspect
 *
 * @return the detected media type, or <code>null</code> in the case an error occurred
 *
 * @throws IOException in the case closing the data stream fails
 */
public String detect(URLName url) throws IOException
{
    InputStream dataStream = null;
    String strMediaType;
    try
    {
        Metadata firstMetadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, null, null);
        dataStream = URLStreamProvider.getURLStreamProvider(url).getStream(url, firstMetadata, null);
        strMediaType = detect(dataStream, firstMetadata);
    }
    catch (Throwable e)
    {
        Logger.getLogger(Leech.class.getName()).log(Level.SEVERE, "Error", e);
        strMediaType = null;
    }
    finally
    {
        if(dataStream != null)
        {
            dataStream.close();
        }
    }
    return strMediaType;
}
/**
 * Determines the content handler that is configured inside the {@link CrawlerContext} of the given ParseContext. A configured handler class name has
 * priority: in this case a new handler instance is created with the default constructor of this class. In the case this instantiation fails, the error is
 * logged and the handler object configured inside the CrawlerContext is used as fallback.
 *
 * @param context the parse context, which must contain a CrawlerContext entry
 *
 * @return the content handler to use, never <code>null</code>
 *
 * @throws IllegalStateException in the case the context has no CrawlerContext, or no content handler could be determined
 */
protected ContentHandler getContentHandler(ParseContext context)
{
    CrawlerContext crawlerContext = context.get(CrawlerContext.class);
    if(crawlerContext == null)
    {
        throw new IllegalStateException(
                "no crawlerContext was set. Set a CrawlerContext with a configured handler or use another method with directly specifying a handler.");
    }

    ContentHandler handler2use4recursiveCall = crawlerContext.getContentHandler();
    String strHandlerClassName = crawlerContext.getContentHandlerClassName();
    if(!StringUtils.nullOrWhitespace(strHandlerClassName))
    {
        try
        {
            // Class.newInstance() is deprecated, the explicit default constructor lookup is the recommended replacement
            handler2use4recursiveCall = (ContentHandler) Class.forName(strHandlerClassName).getDeclaredConstructor().newInstance();
        }
        catch (Throwable e)
        {
            // best effort: we log under our own logger name and fall back to the configured handler object, if there is one
            Logger.getLogger(Leech.class.getName()).log(Level.SEVERE,
                    "Error during the instantiation of the configured content handler " + strHandlerClassName, e);
        }
    }

    if(handler2use4recursiveCall == null)
    {
        throw new IllegalStateException("no contentHandler was set. Have a look into the class CrawlerContext");
    }
    return handler2use4recursiveCall;
}
/**
 * Returns the parser of the Tika super class, decorated first with an {@link IncrementalCrawlingParser} and then with an {@link URLFilteringParser}.
 *
 * @return the decorated parser, newly wrapped at every invocation
 */
@Override
public Parser getParser()
{
    Parser tikaParser = super.getParser();
    IncrementalCrawlingParser incrementalParser = new IncrementalCrawlingParser(tikaParser);
    return new URLFilteringParser(incrementalParser);
}
/**
 * Parses the given file. The file is converted into a {@link URLName}, the work is done by {@link #parse(URLName)}.
 *
 * @param file the file to parse
 *
 * @return whatever {@link #parse(URLName)} returns for the URL of the file
 *
 * @throws IOException
 */
@Override
public Reader parse(File file) throws IOException
{
    URL fileUrl = file.toURI().toURL();
    URLName fileUrlName = new URLName(fileUrl);
    return parse(fileUrlName);
}
/**
 * Parses a directory or a file and reports the extracted data to the given callback content handler. We recommend to use an own implementation of
 * {@link DataSinkContentHandler}. In the case you want to use another ContentHandler, be aware that this object is re-used at every recursive invocation. So
 * make sure that this is possible, and that all internal members (e.g. writers) are re-initialized at a new invocation (maybe clear them inside endDocument()
 * or inside startDocument()). In the case the handler does not have any critical internal state, there should be no problems at all.<br>
 * <br>
 * In the case the handler is a {@link DataSinkContentHandler}, its metadata object is used as initial metadata, and its crawlFinished() method is invoked at
 * the end. Errors that occur during the crawl are handed over to ExceptionUtils.handleException(..).
 *
 * @param file the file you want to crawl/extract content from
 * @param handler the handler that should handle the extracted data
 *
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public void parse(File file, ContentHandler handler) throws IOException, SAXException, TikaException
{
    ParseContext context = new ParseContext();
    context.set(Parser.class, super.getParser());
    context.set(CrawlerContext.class, new CrawlerContext().setContentHandler(handler));

    Metadata metadata = new Metadata();
    InputStream stream = null;
    if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

    try
    {
        URLName url = new URLName(file.toURI().toURL());
        metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
        stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);
        getParser().parse(stream, handler, metadata, context);
    }
    catch (Throwable e)
    {
        ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
    }
    finally
    {
        try
        {
            if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
        }
        finally
        {
            // the stream must be released even in the case crawlFinished() fails
            if(stream != null) stream.close();
        }
    }
}
/**
 * Parses a directory or a file and reports the extracted data to the given callback content handler. We recommend to use an own implementation of
 * {@link DataSinkContentHandler}. In the case you want to use another ContentHandler, be aware that this object is re-used at every recursive invocation. So
 * make sure that this is possible, and that all internal members (e.g. writers) are re-initialized at a new invocation (maybe clear them inside endDocument()
 * or inside startDocument()). In the case the handler does not have any critical internal state, there should be no problems at all. In the case you have a
 * critical handler with a default constructor, you can also set the class name inside the CrawlerContext object inside the ParseContext. In this case, a new
 * handler object will be created at every recursive call.<br>
 * <br>
 * In the case the handler is a {@link DataSinkContentHandler}, its metadata object is used as initial metadata, and its crawlFinished() method is invoked at
 * the end. Errors that occur during the crawl are handed over to ExceptionUtils.handleException(..).
 *
 * @param file the file you want to crawl/extract content from
 * @param handler the handler that should handle the extracted data. It will be set as content handler inside the CrawlerContext.
 * @param context the parsing context to use. An entry with the configured parser will be added by the method, further a new CrawlerContext in the case
 *            there is none. You can pass in a CrawlerContext instance to e.g. enable incremental crawling.
 *
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public void parse(File file, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException
{
    context.set(Parser.class, super.getParser());

    CrawlerContext crawlerContext = context.get(CrawlerContext.class);
    if(crawlerContext == null)
    {
        crawlerContext = new CrawlerContext();
        context.set(CrawlerContext.class, crawlerContext);
    }
    crawlerContext.setContentHandler(handler);

    Metadata metadata = new Metadata();
    InputStream stream = null;
    if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

    try
    {
        URLName url = new URLName(file.toURI().toURL());
        metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
        stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);
        getParser().parse(stream, handler, metadata, context);
    }
    catch (Throwable e)
    {
        ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
    }
    finally
    {
        try
        {
            if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
        }
        finally
        {
            // the stream must be released even in the case crawlFinished() fails
            if(stream != null) stream.close();
        }
    }
}
/**
 * Parses a directory or a file by specifying a ParseContext config only. The content handler is taken out of the CrawlerContext inside the ParseContext: you
 * can either configure a handler class name, then the handler will be newly instantiated with the default constructor, or a content handler object for
 * reuse.<br>
 * <br>
 * In the case the handler is a {@link DataSinkContentHandler}, its metadata object is used as initial metadata, and its crawlFinished() method is invoked at
 * the end. Errors that occur during the crawl are handed over to ExceptionUtils.handleException(..), together with the configured CrawlerContext.
 *
 * @param file the file you want to crawl/extract content from
 * @param context the parsing context to use. An entry with the configured parser will be added by the method. It must contain a CrawlerContext instance
 *            with a configured content handler; there you can also e.g. enable incremental crawling.
 *
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 * @throws IllegalStateException in the case the context offers no CrawlerContext or no content handler
 */
public void parse(File file, ParseContext context) throws IOException, SAXException, TikaException
{
    context.set(Parser.class, super.getParser());

    Metadata metadata = new Metadata();
    InputStream stream = null;
    // throws an IllegalStateException in the case there is no CrawlerContext, thus the context entry is non-null from here on
    ContentHandler handler = getContentHandler(context);
    if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

    try
    {
        URLName url = new URLName(file.toURI().toURL());
        metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
        stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);
        getParser().parse(stream, handler, metadata, context);
    }
    catch (Throwable e)
    {
        // hand over the configured CrawlerContext instead of a fresh default one, as all the other parse methods do
        ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
    }
    finally
    {
        try
        {
            if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
        }
        finally
        {
            // the stream must be released even in the case crawlFinished() fails
            if(stream != null) stream.close();
        }
    }
}
/**
 * Parses the source behind the given string with a callback content handler. The string is converted into an URL by UrlUtil.sourceString2URL(..), the work
 * is done by {@link #parse(URLName, ContentHandler)} - look there for hints regarding the handler.
 *
 * @param strSourceString the URL string you want to crawl/extract content from. This can either be a file://, http:// or - in future - other urls (e.g. for
 *            databases, imap, webDAV, etc...). In the case the string is no correct url string, the method will use the string as file path and then
 *            generates an according URL. Examples: 'file://myDataDir', 'file://bla.pdf', 'http://reuschling.github.com/leech/',
 *            'imap://usr:pswd@myImapServer.de:993/inbox', 'imaps://usr:pswd@myImapServer.de:993/inbox;uid=22'
 * @param handler the handler that should handle the extracted data
 *
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public void parse(String strSourceString, ContentHandler handler) throws IOException, SAXException, TikaException
{
    URLName sourceUrl = UrlUtil.sourceString2URL(strSourceString);
    parse(sourceUrl, handler);
}
/**
 * This overridden method doesn't only look at the name: it tries to generate an URL out of the given name and delegates to {@link #detect(URLName)}, so the
 * underlying data is used if possible. Any error is logged with level SEVERE and results in a <code>null</code> return value.
 *
 * @param name an URL string or a file path
 *
 * @return the detected media type, or <code>null</code> in the case an error occurred
 */
@Override
public String detect(String name)
{
    String strMediaType = null;
    try
    {
        URLName sourceUrl = UrlUtil.sourceString2URL(name);
        strMediaType = detect(sourceUrl);
    }
    catch (Throwable e)
    {
        Logger.getLogger(Leech.class.getName()).log(Level.SEVERE, "Error", e);
    }
    return strMediaType;
}
/**
 * Parses the source behind the given string with a callback content handler and a ParseContext config. The string is converted into an URL by
 * UrlUtil.sourceString2URL(..), the work is done by {@link #parse(URLName, ContentHandler, ParseContext)} - look there for hints regarding the handler.
 *
 * @param strSourceString the URL string you want to crawl/extract content from. This can either be a file://, http://, imap:// or - in future - other urls
 *            (e.g. for databases, webDAV, etc...). In the case the string is no correct url string, the method will use the string as file path and then
 *            generates an according URL. Examples: 'file://myDataDir', 'file://bla.pdf', 'http://reuschling.github.com/leech/',
 *            'imap://usr:pswd@myImapServer.de:993/inbox', 'imaps://usr:pswd@myImapServer.de:993/inbox;uid=22'
 * @param handler the handler that should handle the extracted data
 * @param context the parsing context to use. You can pass in a CrawlerContext instance to e.g. enable incremental crawling.
 *
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public void parse(String strSourceString, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException
{
    URLName sourceUrl = UrlUtil.sourceString2URL(strSourceString);
    parse(sourceUrl, handler, context);
}
/**
 * Parses all the sources behind the given strings with a callback content handler and a ParseContext config. Each string is converted into an URL by
 * UrlUtil.sourceString2URL(..), the work is done by {@link #parse(URLName[], ContentHandler, ParseContext)} - look there for hints regarding the handler.
 *
 * @param lSourceStrings the URL strings you want to crawl/extract content from. This can either be a file://, http://, imap:// or - in future - other urls
 *            (e.g. for databases, webDAV, etc...). In the case a string is no correct url string, the method will use the string as file path and then
 *            generates an according URL. Examples: 'file://myDataDir', 'file://bla.pdf', 'http://reuschling.github.com/leech/',
 *            'imap://usr:pswd@myImapServer.de:993/inbox', 'imaps://usr:pswd@myImapServer.de:993/inbox;uid=22'
 * @param handler the handler that should handle the extracted data
 * @param context the parsing context to use. You can pass in a CrawlerContext instance to e.g. enable incremental crawling.
 *
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public void parse(String[] lSourceStrings, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException
{
    URLName[] sourceUrls = new URLName[lSourceStrings.length];
    for (int i = 0; i < lSourceStrings.length; i++)
    {
        sourceUrls[i] = UrlUtil.sourceString2URL(lSourceStrings[i]);
    }
    parse(sourceUrls, handler, context);
}
/**
 * Parses the source behind the given string by specifying a ParseContext config only. The string is converted into an URL by
 * UrlUtil.sourceString2URL(..), the work is done by {@link #parse(URLName, ParseContext)}, which takes the content handler out of the CrawlerContext inside
 * the ParseContext.
 *
 * @param strSourceString the URL string you want to crawl/extract content from. This can either be a file://, http:// or - in future - other urls (e.g. for
 *            databases, imap, webDAV, etc...). In the case the string is no correct url string, the method will use the string as file path and then
 *            generates an according URL. Examples: 'file://myDataDir', 'file://bla.pdf', 'http://reuschling.github.com/leech/',
 *            'imap://usr:pswd@myImapServer.de:993/inbox', 'imaps://usr:pswd@myImapServer.de:993/inbox;uid=22'
 * @param context the parsing context to use. You can pass in a CrawlerContext instance to e.g. set the contentHandler for recursive crawls or enable
 *            incremental crawling.
 *
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public void parse(String strSourceString, ParseContext context) throws IOException, SAXException, TikaException
{
    URLName sourceUrl = UrlUtil.sourceString2URL(strSourceString);
    parse(sourceUrl, context);
}
/**
 * Not supported, use {@link #parse(URLName)} instead.
 *
 * @param url ignored
 *
 * @return never returns normally
 *
 * @throws UnsupportedOperationException always
 */
@Override
public Reader parse(URL url) throws IOException
{
    final String strMessage =
            "The java.net.URL class methods are not supported because our mechanism supporting new protocols and the according stream creation differ.\n"
                    + "Use the according URLName method instead";
    throw new UnsupportedOperationException(strMessage);
}
/**
 * Parses the data behind the given URL and returns the extracted text as Reader. The URL is normalized first, the stream and the first metadata entries are
 * created by the {@link URLStreamProvider} that is responsible for the URL. Any error that occurs during this is logged with level SEVERE and results in a
 * <code>null</code> return value. In this case, an already opened stream is closed, as no Reader that could take over the stream gets returned.
 *
 * @param url the URL of the data to parse
 *
 * @return the Reader created by the inherited parse(InputStream, Metadata) method, or <code>null</code> in the case an error occurred
 *
 * @throws IOException
 */
public Reader parse(URLName url) throws IOException
{
    url = UrlUtil.normalizeURL(url);

    InputStream stream = null;
    try
    {
        Metadata metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, null, null);
        stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, null);
        return parse(stream, metadata);
    }
    catch (Throwable e)
    {
        Logger.getLogger(Leech.class.getName()).log(Level.SEVERE, "Error", e);

        // nobody else will ever see this stream, so we have to release it here
        if(stream != null)
        {
            try
            {
                stream.close();
            }
            catch (IOException closeException)
            {
                Logger.getLogger(Leech.class.getName()).log(Level.WARNING, "Error while closing the stream", closeException);
            }
        }
        return null;
    }
}
/**
 * Parses the data behind an URL and reports the extracted data to the given callback content handler. We recommend to use an own implementation of
 * {@link DataSinkContentHandler}. In the case you want to use another ContentHandler, be aware that this object is re-used at every recursive invocation. So
 * make sure that this is possible, and that all internal members (e.g. writers) are re-initialized at a new invocation (maybe clear them inside endDocument()
 * or inside startDocument()). In the case the handler does not have any critical internal state, there should be no problems at all.<br>
 * <br>
 * The URL is normalized first. In the case the handler is a {@link DataSinkContentHandler}, its metadata object is used as initial metadata, and its
 * crawlFinished() method is invoked at the end. Errors that occur during the crawl are handed over to ExceptionUtils.handleException(..).
 *
 * @param url the URL you want to crawl/extract content from. This can either be a file://, http:// or - in future - other urls (e.g. for databases, imap,
 *            webDAV, etc...)
 * @param handler the handler that should handle the extracted data
 *
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public void parse(URLName url, ContentHandler handler) throws IOException, SAXException, TikaException
{
    url = UrlUtil.normalizeURL(url);

    ParseContext context = new ParseContext();
    context.set(Parser.class, super.getParser());
    context.set(CrawlerContext.class, new CrawlerContext().setContentHandler(handler));

    Metadata metadata = new Metadata();
    InputStream stream = null;
    if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

    try
    {
        metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
        stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);
        getParser().parse(stream, handler, metadata, context);
    }
    catch (Throwable e)
    {
        ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
    }
    finally
    {
        try
        {
            if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
        }
        finally
        {
            // the stream must be released even in the case crawlFinished() fails
            if(stream != null) stream.close();
        }
    }
}
/**
 * Parses the data behind an URL and reports the extracted data to the given callback content handler. We recommend to use an own implementation of
 * {@link DataSinkContentHandler}. In the case you want to use another ContentHandler, be aware that this object is re-used at every recursive invocation. So
 * make sure that this is possible, and that all internal members (e.g. writers) are re-initialized at a new invocation (maybe clear them inside endDocument()
 * or inside startDocument()). In the case the handler does not have any critical internal state, there should be no problems at all. In the case you have a
 * critical handler with a default constructor, you can also set the class name inside the CrawlerContext object inside the ParseContext. In this case, a new
 * handler object will be created at every recursive call.<br>
 * <br>
 * The URL is normalized first. In the case the handler is a {@link DataSinkContentHandler}, its metadata object is used as initial metadata, and its
 * crawlFinished() method is invoked at the end. Errors that occur during the crawl are handed over to ExceptionUtils.handleException(..).
 *
 * @param url the URL you want to crawl/extract content from. This can either be a file://, http:// or - in future - other urls (e.g. for databases, imap,
 *            webDAV, etc...)
 * @param handler the handler that should handle the extracted data. It will be set as content handler inside the CrawlerContext.
 * @param context the parsing context to use. An entry with the configured parser will be added by the method, further a new CrawlerContext in the case
 *            there is none. You can pass in a CrawlerContext instance to e.g. enable incremental crawling.
 *
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public void parse(URLName url, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException
{
    url = UrlUtil.normalizeURL(url);
    context.set(Parser.class, super.getParser());

    CrawlerContext crawlerContext = context.get(CrawlerContext.class);
    if(crawlerContext == null)
    {
        crawlerContext = new CrawlerContext();
        context.set(CrawlerContext.class, crawlerContext);
    }
    crawlerContext.setContentHandler(handler);

    Metadata metadata = new Metadata();
    InputStream stream = null;
    if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

    try
    {
        metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
        stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);
        getParser().parse(stream, handler, metadata, context);
    }
    catch (Throwable e)
    {
        ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
    }
    finally
    {
        try
        {
            if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
        }
        finally
        {
            // the stream must be released even in the case crawlFinished() fails
            if(stream != null) stream.close();
        }
    }
}
/**
 * Parses the data behind several URLs and reports the extracted data to the given callback content handler. We recommend to use an own implementation of
 * {@link DataSinkContentHandler}. In the case you want to use another ContentHandler, be aware that this object is re-used at every recursive invocation. So
 * make sure that this is possible, and that all internal members (e.g. writers) are re-initialized at a new invocation (maybe clear them inside endDocument()
 * or inside startDocument()). In the case the handler does not have any critical internal state, there should be no problems at all. In the case you have a
 * critical handler with a default constructor, you can also set the class name inside the CrawlerContext object inside the ParseContext. In this case, a new
 * handler object will be created at every recursive call.<br>
 * <br>
 * The normalized URLs are written line by line into an in-memory stream, which is parsed with the content type 'application/leechUrlList'. Because this
 * list is an additional, artificial recursion level, the configured crawling depth is raised by one for the duration of this call. The URL array of the
 * caller is not modified.
 *
 * @param urls the URLs you want to crawl/extract content from. This can either be a file://, http:// or - in future - other urls (e.g. for databases, imap,
 *            webDAV, etc...)
 * @param handler the handler that should handle the extracted data. It will be set as content handler inside the CrawlerContext.
 * @param context the parsing context to use. An entry with the configured parser will be added by the method, further a new CrawlerContext in the case
 *            there is none. You can pass in a CrawlerContext instance to e.g. enable incremental crawling.
 *
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
public void parse(URLName[] urls, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException
{
    // we normalize into an own array, the one of the caller stays untouched
    URLName[] normalizedUrls = new URLName[urls.length];
    for (int i = 0; i < urls.length; i++)
        normalizedUrls[i] = UrlUtil.normalizeURL(urls[i]);

    context.set(Parser.class, super.getParser());

    CrawlerContext crawlerContext = context.get(CrawlerContext.class);
    if(crawlerContext == null)
    {
        crawlerContext = new CrawlerContext();
        context.set(CrawlerContext.class, crawlerContext);
    }
    crawlerContext.setContentHandler(handler);

    Metadata metadata = new Metadata();
    InputStream stream = null;
    if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

    int iCrawlingDepth = 0;
    boolean bCrawlingDepthRaised = false;
    try
    {
        String strUid = new UID().toString();
        metadata.add(Metadata.RESOURCE_NAME_KEY, "leechUrlList " + strUid);
        metadata.add(DublinCore.SOURCE, strUid + "_leechUrlList.urlList");
        metadata.add(IncrementalCrawlingHistory.dataEntityId, strUid + "_leechUrlList.urlList");
        metadata.add(IncrementalCrawlingHistory.dataEntityContentFingerprint, strUid + "_leechUrlList.urlList");
        metadata.add(Metadata.CONTENT_TYPE, "application/leechUrlList");

        StringBuilder strbSources = new StringBuilder();
        for (URLName url : normalizedUrls)
            strbSources.append(url.toString()).append("\n");

        // NOTE(review): the platform default charset is used here. Presumably the parser of the url list decodes the stream the same way - confirm before
        // switching to an explicit charset.
        stream = new ByteArrayInputStream(strbSources.toString().getBytes());

        // we have to be careful: when we pass a list and work with the url list parser, the first recursion level must not be counted
        iCrawlingDepth = crawlerContext.getCrawlingDepth();
        if(iCrawlingDepth < Integer.MAX_VALUE)
        {
            crawlerContext.setCrawlingDepth(iCrawlingDepth + 1);
            bCrawlingDepthRaised = true;
        }

        getParser().parse(stream, handler, metadata, context);
    }
    catch (Throwable e)
    {
        ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
    }
    finally
    {
        try
        {
            if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
        }
        finally
        {
            // give the CrawlerContext of the caller its configured depth back, otherwise the depth would grow with every invocation
            if(bCrawlingDepthRaised) crawlerContext.setCrawlingDepth(iCrawlingDepth);
            if(stream != null) stream.close();
        }
    }
}
/**
 * Parses the data behind an URL by specifying a ParseContext config only. The content handler is taken out of the CrawlerContext inside the ParseContext:
 * you can either configure a handler class name, then the handler will be newly instantiated with the default constructor, or a content handler object for
 * reuse.<br>
 * <br>
 * The URL is normalized first. In the case the handler is a {@link DataSinkContentHandler}, its metadata object is used as initial metadata, and its
 * crawlFinished() method is invoked at the end. Errors that occur during the crawl are handed over to ExceptionUtils.handleException(..).
 *
 * @param url the URL you want to crawl/extract content from. This can either be a file://, http:// or - in future - other urls (e.g. for databases, imap,
 *            webDAV, etc...)
 * @param context the parsing context to use. An entry with the configured parser will be added by the method. It must contain a CrawlerContext instance
 *            with a configured content handler; there you can also e.g. enable incremental crawling.
 *
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 * @throws IllegalStateException in the case the context offers no CrawlerContext or no content handler
 */
public void parse(URLName url, ParseContext context) throws IOException, SAXException, TikaException
{
    url = UrlUtil.normalizeURL(url);
    context.set(Parser.class, super.getParser());

    Metadata metadata = new Metadata();
    InputStream stream = null;
    ContentHandler handler = getContentHandler(context);
    if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

    try
    {
        metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
        stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);
        getParser().parse(stream, handler, metadata, context);
    }
    catch (Throwable e)
    {
        ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
    }
    finally
    {
        try
        {
            if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
        }
        finally
        {
            // the stream must be released even in the case crawlFinished() fails
            if(stream != null) stream.close();
        }
    }
}
/**
 * Extracts the text of the given file. The file is converted into a {@link URLName}, the work is done by {@link #parseToString(URLName)}.
 *
 * @param file the file to parse
 *
 * @return whatever {@link #parseToString(URLName)} returns for the URL of the file
 *
 * @throws IOException
 * @throws TikaException
 */
@Override
public String parseToString(File file) throws IOException, TikaException
{
    URL fileUrl = file.toURI().toURL();
    URLName fileUrlName = new URLName(fileUrl);
    return parseToString(fileUrlName);
}
/**
 * Not supported, use {@link #parseToString(URLName)} instead.
 *
 * @param url ignored
 *
 * @return never returns normally
 *
 * @throws UnsupportedOperationException always
 */
@Override
public String parseToString(URL url) throws IOException, TikaException
{
    final String strMessage =
            "The java.net.URL class methods are not supported because our mechanism supporting new protocols and the according stream creation differ.\n"
                    + "Use the according URLName method instead";
    throw new UnsupportedOperationException(strMessage);
}
/**
 * Extracts the text of the data behind the given URL. The URL is normalized first, the stream and the first metadata entries are created by the
 * {@link URLStreamProvider} that is responsible for the URL. Everything that is thrown during this is wrapped into a TikaException.<br>
 * NOTE(review): the stream is not closed inside this method - presumably the inherited parseToString(InputStream, Metadata) takes care of this, confirm.
 *
 * @param url the URL of the data to parse
 *
 * @return the text returned by the inherited parseToString(InputStream, Metadata) method
 *
 * @throws IOException
 * @throws TikaException in the case anything goes wrong, with the original error as cause
 */
public String parseToString(URLName url) throws IOException, TikaException
{
    final URLName normalizedUrl = UrlUtil.normalizeURL(url);
    try
    {
        Metadata firstMetadata = URLStreamProvider.getURLStreamProvider(normalizedUrl).addFirstMetadata(normalizedUrl, null, null);
        InputStream dataStream = URLStreamProvider.getURLStreamProvider(normalizedUrl).getStream(normalizedUrl, firstMetadata, null);
        return parseToString(dataStream, firstMetadata);
    }
    catch (Throwable e)
    {
        String strMessage = "Error while parsing " + normalizedUrl.getFile();
        throw new TikaException(strMessage, e);
    }
}
}