Leech.java example

Explorer
leech-master
- src
  - main
    - java
      - de
        dfki
        km
        leech
        Leech.java
        SubDataEntityContentHandler.java
        config
        CrawlerContext.java
        DirectoryCrawlerContext.java
        HtmlCrawlerContext.java
        ImapCrawlerContext.java
        LeechConfig.java
        detect
        DatasourceMediaTypes.java
        DirectoryDatasourceDetector.java
        ImapDatasourceDetector.java
        LeechDefaultDetector.java
        io
        FileURLStreamProvider.java
        HttpURLStreamProvider.java
        ImapURLStreamProvider.java
        ShiftInitInputStream.java
        URLStreamProvider.java
        lucene
        LeechDefaultFieldConfig.java
        ToLuceneContentHandler.java
        metadata
        LeechMetadata.java
        parser
        CrawlerParser.java
        DirectoryCrawlerParser.java
        HtmlCrawlerParser.java
        ImapCrawlerParser.java
        NonRecursiveCrawlerParser.java
        SambaCrawlerParser.java
        UrlListCrawlerParser.java
        filter
        RegExpPattern.java
        SubstringPattern.java
        URLFilter.java
        URLFilterPattern.java
        URLFilteringParser.java
        incremental
        IncrementalCrawlingHistory.java
        IncrementalCrawlingParser.java
        rss
        FeedParser2.java
        wikipedia
        WikipediaDumpParser.java
        sax
        CrawlReportContentHandler.java
        DataSinkContentHandler.java
        DataSinkContentHandlerAdapter.java
        DataSinkContentHandlerDecorator.java
        PrintlnContentHandler.java
        solr
        ToSolrContentHandler.java
        util
        CookieManager.java
        ExceptionUtils.java
        IndexPostprocessor.java
        LeechException.java
        LuceneIndexCreator.java
        OSUtils.java
        SolrIndexCreator.java
        TikaUtils.java
        UrlUtil.java
        ValueHolder.java
        certificates
        CertificateIgnoringSocketFactory.java
        CertificateStore.java
        Decision.java
        PersistentCertificateStore.java
        RootCertificateStore.java
        SessionCertificateStore.java
        StandardTrustManager.java
        TrustDecider.java
/*
 * Leech - crawling capabilities for Apache Tika
 * 
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
 * either version 3 of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech;



import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URL;
import java.rmi.server.UID;
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.mail.URLName;

import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import de.dfki.inquisition.text.StringUtils;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.config.DirectoryCrawlerContext;
import de.dfki.km.leech.config.LeechConfig;
import de.dfki.km.leech.io.URLStreamProvider;
import de.dfki.km.leech.parser.DirectoryCrawlerParser;
import de.dfki.km.leech.parser.filter.URLFilteringParser;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser;
import de.dfki.km.leech.sax.CrawlReportContentHandler;
import de.dfki.km.leech.sax.DataSinkContentHandler;
import de.dfki.km.leech.sax.PrintlnContentHandler;
import de.dfki.km.leech.sax.PrintlnContentHandler.Verbosity;
import de.dfki.km.leech.util.ExceptionUtils;
import de.dfki.km.leech.util.UrlUtil;



/**
 * This is the main class, the entry point. Feel free to select one of the many parse methods.<br>
 * <br>
 * Crawling in Leech will be performed by using a ContentHandler, that is invoked for every data entity that is recognized during the crawl. Leech offers a special
 * {@link DataSinkContentHandler} with abstract methods you can implement to store the data into your data store, e.g. a Lucene index. By using
 * {@link DataSinkContentHandler}, it is easy for you to perform incremental indexing, which means that during a crawl, Leech records which data entities were crawled, and
 * at subsequent crawls only those data entities that are new or have changed will be parsed again. Further, you will get the information which data entities were removed
 * since the last crawl. <br>
 * To configure a crawl, there exist several **Context classes for passing into the ParseContext. E.g. a CrawlerContext Object will be used for all configuration
 * parameters that are common for all types of possible datasources. There you can pass things like crawling depth or the path to an incremental crawling history. You
 * also can request stopping a running crawling process.<br>
 * There exist some more specialised **Context classes to adjust the crawling of specific data sources. For example, {@link DirectoryCrawlerContext} lets you choose
 * whether symbolic links should be followed or not during crawling a file system. Look into the package de.dfki.km.leech.config for all Context classes offered by Leech.<br>
 * <br>
 * Now some examples as a starting point.<br>
 * To enable incremental indexing during a crawl, pass a {@link CrawlerContext} instance with a path to the history into the ParseContext parameter of a Leech.parse(..)
 * method. Leech will create a new history in the case there is no existing one under the given path. {@link PrintlnContentHandler} is an implementation of
 * {@link DataSinkContentHandler} which simply writes all content, including metadata, to stdout:<br>
 * <br>
 * <code>
 * Leech leech = new Leech();<br>
 * CrawlerContext crawlerContext = new CrawlerContext().setIncrementalCrawlingHistoryPath("./history/forResourceDir");<br>
 * leech.parse(new File("resource"), new PrintlnContentHandler(), crawlerContext.createParseContext());<br>
 * </code> <br>
 * To request stopping a crawl (should be invoked in a different thread than leech.parse(..)):<br>
 * <br>
 * <code>
 * crawlerContext.requestStop()
 * </code>
 * 
 * 
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */



public class Leech extends Tika
{


    /**
     * Command line entry point for quick testing purposes. Logs a usage message and then crawls every given source one after another. The crawled data is handed
     * over to a {@link CrawlReportContentHandler} that wraps a {@link PrintlnContentHandler} with full verbosity.
     * 
     * @param args the sources to crawl, each one is passed to {@link #parse(String, ContentHandler, ParseContext)}
     * 
     * @throws IOException passed on from the invoked parse method
     * @throws SAXException passed on from the invoked parse method
     * @throws TikaException passed on from the invoked parse method
     */
    public static void main(String[] args) throws IOException, SAXException, TikaException
    {
        final Logger logger = Logger.getLogger(Leech.class.getName());

        final String strUsage =
                "Usage: leech <source2crawl_1> <source2crawl_2> ... <source2crawl_N>\n\n"
                        + "A source can be an URL for file://, http://, imap:// or -maybe in future- other urls (e.g. for databases, webDAV, etc...).\n"
                        + "In the case the string is no correct url string, the method will use the string as file path and then generates an\n"
                        + "according URL. Examples: 'file://myDataDir', 'file://bla.pdf', 'http://reuschling.github.com/leech/',\n"
                        + "'imap://usr:pswd@myImapServer.de:993/inbox', 'imaps://usr:pswd@myImapServer.de:993/inbox;uid=22'\n\n"
                        + "This executable crawls all data and simply shows the metadata on the screen. Because leech is designed to be used as a\n"
                        + "java library, this exec is for quick testing purposes.\n\n";
        logger.info(strUsage);

        Leech leech = new Leech();
        CrawlerContext crawlerContext = new CrawlerContext();

        // one report handler instance is shared by all sources that are crawled
        PrintlnContentHandler printlnHandler = new PrintlnContentHandler(Verbosity.all);
        CrawlReportContentHandler reportContentHandler = new CrawlReportContentHandler(printlnHandler).setCyclicReportPrintln(7000);

        for (int i = 0; i < args.length; i++)
        {
            String strSource2Crawl = args[i];

            logger.info("Will start crawling " + strSource2Crawl + '\n');
            // every source gets its own ParseContext, created out of the shared crawler context
            leech.parse(strSource2Crawl, reportContentHandler, crawlerContext.createParseContext());
        }
    }



    /**
     * Creates a Leech instance that is configured with the configuration returned by {@link LeechConfig#getDefaultLeechConfig()}.
     */
    public Leech()
    {
        super(LeechConfig.getDefaultLeechConfig());
    }

    
    
    /**
     * Creates a Leech instance with a custom configuration. The configuration is handed over unchanged to the constructor of the superclass.
     * 
     * @param leechConfig the configuration this instance should work with
     */
    public Leech(LeechConfig leechConfig)
    {
        super(leechConfig);
    }


    /**
     * Detects the media type of the given file. The file is converted into a {@link URLName}, the detection itself is performed by {@link #detect(URLName)}.
     * 
     * @param file the file whose media type should be detected
     * 
     * @return the result of {@link #detect(URLName)}, which is null in the case an error occurred during detection
     * 
     * @throws IOException if the file can not be converted into an URL, or passed on from {@link #detect(URLName)}
     */
    @Override
    public String detect(File file) throws IOException
    {
        final URL fileUrl = file.toURI().toURL();
        final URLName fileUrlName = new URLName(fileUrl);

        return detect(fileUrlName);
    }




    /**
     * Not supported. Leech works with {@link URLName} instead of java.net.URL, use {@link #detect(URLName)} instead.
     * 
     * @param url ignored
     * 
     * @return never returns normally
     * 
     * @throws UnsupportedOperationException always
     */
    @Override
    public String detect(URL url)
    {
        final String strMessage =
                "The java.net.URL class methods are not supported because our mechanism supporting new protocols and the according stream creation differ.\n"
                        + "Use the according URLName method instead";

        throw new UnsupportedOperationException(strMessage);
    }



    /**
     * Detects the media type of the data entity behind the given URL. The initial metadata and the stream are requested from the {@link URLStreamProvider} that is
     * registered for the URL, afterwards the detection of the superclass is invoked on both. The stream is closed in any case.
     * 
     * @param url the URL of the data entity whose media type should be detected
     * 
     * @return the detected media type, or null in the case an error occurred. The error is logged with level SEVERE.
     * 
     * @throws IOException if closing the stream fails
     */
    public String detect(URLName url) throws IOException
    {
        InputStream entityStream = null;

        try
        {
            final Metadata firstMetadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, null, null);
            entityStream = URLStreamProvider.getURLStreamProvider(url).getStream(url, firstMetadata, null);

            final String strMediaType = detect(entityStream, firstMetadata);

            return strMediaType;
        }
        catch (Throwable t)
        {
            // best effort: every kind of problem is only logged, the caller receives null
            Logger.getLogger(Leech.class.getName()).log(Level.SEVERE, "Error", t);

            return null;
        }
        finally
        {
            if(entityStream != null)
            {
                entityStream.close();
            }
        }
    }



    /**
     * Determines the content handler that is configured inside the {@link CrawlerContext} of the given ParseContext. In the case a content handler class name is
     * configured, a new instance of this class is created with its default constructor and returned. Otherwise - or if the instantiation fails, which is logged with
     * level SEVERE - the content handler object configured inside the CrawlerContext is returned.
     * 
     * @param context the parse context, which must contain a {@link CrawlerContext} entry
     * 
     * @return the content handler to use, never null
     * 
     * @throws IllegalStateException if the context has no CrawlerContext entry, or if no content handler could be determined
     */
    protected ContentHandler getContentHandler(ParseContext context)
    {
        CrawlerContext crawlerContext = context.get(CrawlerContext.class);

        if(crawlerContext == null)
        {
            throw new IllegalStateException(
                    "no crawlerContext was set. Set a CrawlerContext with a configured handler or use another method with directly specifying a handler.");
        }

        ContentHandler handler2use4recursiveCall = crawlerContext.getContentHandler();

        String strHandlerClassName = crawlerContext.getContentHandlerClassName();

        if(!StringUtils.nullOrWhitespace(strHandlerClassName))
        {
            try
            {
                // Class.newInstance() is deprecated - the no-arg constructor is invoked explicitly instead
                handler2use4recursiveCall = (ContentHandler) Class.forName(strHandlerClassName).getDeclaredConstructor().newInstance();
            }
            catch (Throwable e)
            {
                // we fall back to the handler object from the crawler context, if there is any. The error is logged under the logger of this class.
                Logger.getLogger(Leech.class.getName()).log(Level.SEVERE,
                        "Error during the instantiation of the configured content handler " + strHandlerClassName, e);
            }
        }

        if(handler2use4recursiveCall == null)
        {
            throw new IllegalStateException("no contentHandler was set. Have a look into the class CrawlerContext");
        }

        return handler2use4recursiveCall;
    }







    /**
     * Returns the parser of the superclass, decorated for crawling: it is wrapped into an {@link IncrementalCrawlingParser}, which in turn is wrapped into an
     * {@link URLFilteringParser}. New wrapper objects are created at every invocation.
     * 
     * @return the decorated parser
     */
    @Override
    public Parser getParser()
    {
        final Parser baseParser = super.getParser();
        final IncrementalCrawlingParser incrementalParser = new IncrementalCrawlingParser(baseParser);

        return new URLFilteringParser(incrementalParser);
    }



    /**
     * Parses the given file. The file is converted into a {@link URLName}, the parsing itself is performed by {@link #parse(URLName)}.
     * 
     * @param file the file you want to extract content from
     * 
     * @return the result of {@link #parse(URLName)}, which is null in the case an error occurred
     * 
     * @throws IOException if the file can not be converted into an URL, or passed on from {@link #parse(URLName)}
     */
    @Override
    public Reader parse(File file) throws IOException
    {
        final URL fileUrl = file.toURI().toURL();
        final URLName fileUrlName = new URLName(fileUrl);

        return parse(fileUrlName);
    }



    /**
     * Parses a directory or a file with a callback content handler. We recommend using your own implementation of {@link DataSinkContentHandler}. In the case you
     * want to use another ContentHandler, be aware that this object is re-used at every recursive invocation. So make sure that this is possible, and that all
     * internal members (e.g. writers, etc.) are re-initialized at a new invocation (maybe clear them inside endDocument(), or inside startDocument()). In the case
     * the handler does not have any internal states that are critical, there should be no problems at all.<br>
     * <br>
     * The method creates its own ParseContext with a new {@link CrawlerContext} that references the given handler. If the handler is a
     * {@link DataSinkContentHandler}, its metadata object is used for the crawl, and its crawlFinished() method is invoked at the end, whether the crawl succeeded or
     * not. The stream of the file is closed afterwards, also in the case crawlFinished() fails. Errors that occur during the crawl are handed over to
     * ExceptionUtils.handleException(..).
     * 
     * @param file the file you want to crawl/extract content from
     * @param handler the handler that should handle the extracted data
     * 
     * @throws IOException if closing the stream fails, or possibly passed on by the error handling
     * @throws SAXException possibly passed on by the error handling (presumably depending on the crawler configuration)
     * @throws TikaException possibly passed on by the error handling (presumably depending on the crawler configuration)
     */
    public void parse(File file, ContentHandler handler) throws IOException, SAXException, TikaException
    {
        ParseContext context = new ParseContext();
        context.set(Parser.class, super.getParser());

        context.set(CrawlerContext.class, new CrawlerContext().setContentHandler(handler));

        Metadata metadata = new Metadata();
        InputStream stream = null;

        if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

        try
        {
            URLName url = new URLName(file.toURI().toURL());

            metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
            stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);

            getParser().parse(stream, handler, metadata, context);

        }
        catch (Throwable e)
        {
            ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
        }
        finally
        {
            try
            {
                if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
            }
            finally
            {
                // nested finally: the stream must not leak in the case crawlFinished() throws
                if(stream != null) stream.close();
            }
        }
    }



    /**
     * Parses a directory or a file with a callback content handler. We recommend using your own implementation of {@link DataSinkContentHandler}. In the case you
     * want to use another ContentHandler, be aware that this object is re-used at every recursive invocation. So make sure that this is possible, and that all
     * internal members (e.g. writers, etc.) are re-initialized at a new invocation (maybe clear them inside endDocument(), or inside startDocument()). In the case
     * the handler does not have any internal states that are critical, there should be no problems at all. In the case you have a critical handler with a default
     * constructor, you can also set the class name inside the CrawlerContext object inside the ParseContext. In this case, a new handler object will be created at
     * every recursive call.<br>
     * <br>
     * The given handler is set into the {@link CrawlerContext} of the context; a new CrawlerContext is created and added in the case the context has none. If the
     * handler is a {@link DataSinkContentHandler}, its metadata object is used for the crawl, and its crawlFinished() method is invoked at the end, whether the crawl
     * succeeded or not. The stream of the file is closed afterwards, also in the case crawlFinished() fails. Errors that occur during the crawl are handed over to
     * ExceptionUtils.handleException(..).
     * 
     * @param file the file you want to crawl/extract content from
     * @param handler the handler that should handle the extracted data
     * @param context the parsing context to use. An entry with the configured parser will be added by the method. You can pass in a CrawlerContext instance to e.g.
     *            enable incremental crawling.
     * 
     * @throws IOException if closing the stream fails, or possibly passed on by the error handling
     * @throws SAXException possibly passed on by the error handling (presumably depending on the crawler configuration)
     * @throws TikaException possibly passed on by the error handling (presumably depending on the crawler configuration)
     */
    public void parse(File file, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException
    {
        context.set(Parser.class, super.getParser());

        CrawlerContext crawlerContext = context.get(CrawlerContext.class);
        if(crawlerContext == null)
        {
            crawlerContext = new CrawlerContext();
            context.set(CrawlerContext.class, crawlerContext);
        }
        crawlerContext.setContentHandler(handler);

        Metadata metadata = new Metadata();
        InputStream stream = null;

        if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

        try
        {
            URLName url = new URLName(file.toURI().toURL());

            metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
            stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);

            getParser().parse(stream, handler, metadata, context);

        }
        catch (Throwable e)
        {
            ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
        }
        finally
        {
            try
            {
                if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
            }
            finally
            {
                // nested finally: the stream must not leak in the case crawlFinished() throws
                if(stream != null) stream.close();
            }
        }
    }





    /**
     * Parses a directory or a file by specifying a ParseContext config. The content handler is determined with {@link #getContentHandler(ParseContext)}, thus the
     * context must contain a {@link CrawlerContext} with either a content handler class name (the handler will be newly instantiated with the default constructor)
     * or a content handler object for reuse.<br>
     * <br>
     * If the handler is a {@link DataSinkContentHandler}, its metadata object is used for the crawl, and its crawlFinished() method is invoked at the end, whether the
     * crawl succeeded or not. The stream of the file is closed afterwards, also in the case crawlFinished() fails. Errors that occur during the crawl are handed over
     * to ExceptionUtils.handleException(..), together with the CrawlerContext of the given context.
     * 
     * @param file the file you want to crawl/extract content from
     * @param context the parsing context to use. An entry with the configured parser will be added by the method. You can pass in a CrawlerContext instance to e.g.
     *            set the contentHandler for recursive crawls or enable incremental crawling.
     * 
     * @throws IOException if closing the stream fails, or possibly passed on by the error handling
     * @throws SAXException possibly passed on by the error handling (presumably depending on the crawler configuration)
     * @throws TikaException possibly passed on by the error handling (presumably depending on the crawler configuration)
     * @throws IllegalStateException if the context has no CrawlerContext, or if no content handler is configured inside
     */
    public void parse(File file, ParseContext context) throws IOException, SAXException, TikaException
    {
        context.set(Parser.class, super.getParser());

        Metadata metadata = new Metadata();
        InputStream stream = null;

        // throws an IllegalStateException in the case there is no CrawlerContext inside the context
        ContentHandler handler = getContentHandler(context);
        if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

        try
        {
            URLName url = new URLName(file.toURI().toURL());

            metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
            stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);


            getParser().parse(stream, handler, metadata, context);

        }
        catch (Throwable e)
        {
            // as in the other parse methods, the error handling gets the crawler context the caller has configured, not a fresh default one
            ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
        }
        finally
        {
            try
            {
                if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
            }
            finally
            {
                // nested finally: the stream must not leak in the case crawlFinished() throws
                if(stream != null) stream.close();
            }
        }
    }







    /**
     * Parses the source behind the given source string with a callback content handler. The string is converted into a {@link URLName} by
     * UrlUtil.sourceString2URL(..), the crawl itself is performed by {@link #parse(URLName, ContentHandler)}. We recommend using your own implementation of
     * {@link DataSinkContentHandler}. In the case you want to use another ContentHandler, be aware that this object is re-used at every recursive invocation, so make
     * sure that all internal members (e.g. writers, etc.) are re-initialized at a new invocation.
     * 
     * @param strSourceString the URL string you want to crawl/extract content from. This can either be a file://, http:// or - in future - other urls (e.g. for
     *            databases, imap, webDAV, etc...). In the case the string is no correct url string, the method will use the string as file path and then generates an
     *            according URL. Examples: 'file://myDataDir', 'file://bla.pdf', 'http://reuschling.github.com/leech/',
     *            'imap://usr:pswd@myImapServer.de:993/inbox', 'imaps://usr:pswd@myImapServer.de:993/inbox;uid=22'
     * @param handler the handler that should handle the extracted data
     * 
     * @throws IOException passed on from {@link #parse(URLName, ContentHandler)}
     * @throws SAXException passed on from {@link #parse(URLName, ContentHandler)}
     * @throws TikaException passed on from {@link #parse(URLName, ContentHandler)}
     */
    public void parse(String strSourceString, ContentHandler handler) throws IOException, SAXException, TikaException
    {
        final URLName sourceUrl = UrlUtil.sourceString2URL(strSourceString);

        parse(sourceUrl, handler);
    }






    /**
     * This overridden method does not only use the name, but tries to generate an URL out of the given name and uses the underlying data if possible. The detection
     * itself is performed by {@link #detect(URLName)}.
     * 
     * @param name the source string, which is converted into an URL by UrlUtil.sourceString2URL(..)
     * 
     * @return the detected media type, or null in the case an error occurred. The error is logged with level SEVERE.
     */
    @Override
    public String detect(String name)
    {
        String strMediaType = null;

        try
        {
            strMediaType = detect(UrlUtil.sourceString2URL(name));
        }
        catch (Throwable t)
        {
            // best effort: the problem is only logged, the result stays null
            Logger.getLogger(Leech.class.getName()).log(Level.SEVERE, "Error", t);
        }

        return strMediaType;
    }



    /**
     * Parses the source behind the given source string with a callback content handler and a ParseContext config. The string is converted into a {@link URLName} by
     * UrlUtil.sourceString2URL(..), the crawl itself is performed by {@link #parse(URLName, ContentHandler, ParseContext)}. We recommend using your own
     * implementation of {@link DataSinkContentHandler}. In the case you want to use another ContentHandler, be aware that this object is re-used at every recursive
     * invocation, so make sure that all internal members (e.g. writers, etc.) are re-initialized at a new invocation. In the case you have a critical handler with a
     * default constructor, you can also set the class name inside the CrawlerContext object inside the ParseContext. In this case, a new handler object will be
     * created at every recursive call.
     * 
     * @param strSourceString the URL string you want to crawl/extract content from. This can either be a file://, http://, imap:// or - in future - other urls (e.g.
     *            for databases, webDAV, etc...). In the case the string is no correct url string, the method will use the string as file path and then generates an
     *            according URL. Examples: 'file://myDataDir', 'file://bla.pdf', 'http://reuschling.github.com/leech/',
     *            'imap://usr:pswd@myImapServer.de:993/inbox', 'imaps://usr:pswd@myImapServer.de:993/inbox;uid=22'
     * @param handler the handler that should handle the extracted data
     * @param context the parsing context to use. You can pass in a CrawlerContext instance to e.g. enable incremental crawling.
     * 
     * @throws IOException passed on from {@link #parse(URLName, ContentHandler, ParseContext)}
     * @throws SAXException passed on from {@link #parse(URLName, ContentHandler, ParseContext)}
     * @throws TikaException passed on from {@link #parse(URLName, ContentHandler, ParseContext)}
     */
    public void parse(String strSourceString, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException
    {
        final URLName sourceUrl = UrlUtil.sourceString2URL(strSourceString);

        parse(sourceUrl, handler, context);
    }



    /**
     * Parses the sources behind the given source strings with a callback content handler and a ParseContext config. Every string is converted into a {@link URLName}
     * by UrlUtil.sourceString2URL(..), in the given order; the crawl itself is performed by the parse method for URLName arrays. We recommend using your own
     * implementation of {@link DataSinkContentHandler}. In the case you want to use another ContentHandler, be aware that this object is re-used at every recursive
     * invocation, so make sure that all internal members (e.g. writers, etc.) are re-initialized at a new invocation. In the case you have a critical handler with a
     * default constructor, you can also set the class name inside the CrawlerContext object inside the ParseContext. In this case, a new handler object will be
     * created at every recursive call.
     * 
     * @param lSourceStrings the URL strings you want to crawl/extract content from. This can either be a file://, http://, imap:// or - in future - other urls (e.g.
     *            for databases, webDAV, etc...). In the case the string is no correct url string, the method will use the string as file path and then generates an
     *            according URL. Examples: 'file://myDataDir', 'file://bla.pdf', 'http://reuschling.github.com/leech/',
     *            'imap://usr:pswd@myImapServer.de:993/inbox', 'imaps://usr:pswd@myImapServer.de:993/inbox;uid=22'
     * @param handler the handler that should handle the extracted data
     * @param context the parsing context to use. You can pass in a CrawlerContext instance to e.g. enable incremental crawling.
     * 
     * @throws IOException passed on from the invoked parse method
     * @throws SAXException passed on from the invoked parse method
     * @throws TikaException passed on from the invoked parse method
     */
    public void parse(String[] lSourceStrings, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException
    {
        final URLName[] sourceUrls = new URLName[lSourceStrings.length];

        for (int i = 0; i < lSourceStrings.length; i++)
        {
            sourceUrls[i] = UrlUtil.sourceString2URL(lSourceStrings[i]);
        }

        parse(sourceUrls, handler, context);
    }



    /**
     * Parses the source behind the given source string by specifying a ParseContext config. The string is converted into a {@link URLName} by
     * UrlUtil.sourceString2URL(..), the crawl itself is performed by the parse method for an URLName and a ParseContext. You can pass in a CrawlerContext instance
     * to e.g. set the ContentHandler for recursive crawls. This one will be newly instantiated with the default constructor for every recursive call.
     * 
     * @param strSourceString the URL string you want to crawl/extract content from. This can either be a file://, http:// or - in future - other urls (e.g. for
     *            databases, imap, webDAV, etc...). In the case the string is no correct url string, the method will use the string as file path and then generates an
     *            according URL. Examples: 'file://myDataDir', 'file://bla.pdf', 'http://reuschling.github.com/leech/',
     *            'imap://usr:pswd@myImapServer.de:993/inbox', 'imaps://usr:pswd@myImapServer.de:993/inbox;uid=22'
     * @param context the parsing context to use. You can pass in a CrawlerContext instance to e.g. set the contentHandler for recursive crawls or enable incremental
     *            crawling.
     * 
     * @throws IOException passed on from the invoked parse method
     * @throws SAXException passed on from the invoked parse method
     * @throws TikaException passed on from the invoked parse method
     */
    public void parse(String strSourceString, ParseContext context) throws IOException, SAXException, TikaException
    {
        final URLName sourceUrl = UrlUtil.sourceString2URL(strSourceString);

        parse(sourceUrl, context);
    }



    /**
     * Not supported. Leech works with {@link URLName} instead of java.net.URL, use {@link #parse(URLName)} instead.
     * 
     * @param url ignored
     * 
     * @return never returns normally
     * 
     * @throws UnsupportedOperationException always
     */
    @Override
    public Reader parse(URL url) throws IOException
    {
        final String strMessage =
                "The java.net.URL class methods are not supported because our mechanism supporting new protocols and the according stream creation differ.\n"
                        + "Use the according URLName method instead";

        throw new UnsupportedOperationException(strMessage);
    }



    /**
     * Parses the data entity behind the given URL. The URL is normalized first, then the initial metadata and the stream are requested from the
     * {@link URLStreamProvider} that is registered for the URL. Both are handed over to the parse(InputStream, Metadata) method of the superclass.<br>
     * <br>
     * In the case an error occurs after the stream was opened, the stream is closed by this method, because no reader was returned that could take care of it.
     * 
     * @param url the URL you want to extract content from
     * 
     * @return the reader created by parse(InputStream, Metadata), or null in the case an error occurred. The error is logged with level SEVERE.
     * 
     * @throws IOException declared for compatibility; errors inside the method body are logged, not thrown
     */
    public Reader parse(URLName url) throws IOException
    {
        url = UrlUtil.normalizeURL(url);


        InputStream stream = null;
        try
        {
            Metadata metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, null, null);
            stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, null);


            return parse(stream, metadata);

        }
        catch (Throwable e)
        {
            Logger.getLogger(Leech.class.getName()).log(Level.SEVERE, "Error", e);

            // nobody else will close the stream in the error case, so we must not leak it
            if(stream != null)
            {
                try
                {
                    stream.close();
                }
                catch (IOException closeException)
                {
                    Logger.getLogger(Leech.class.getName()).log(Level.WARNING, "Error while closing the stream", closeException);
                }
            }

            return null;
        }
    }



    /**
     * Parses the data behind an URL with a callback content handler. We recommend using your own implementation of {@link DataSinkContentHandler}. In the case you
     * want to use another ContentHandler, be aware that this object is re-used at every recursive invocation. So make sure that this is possible, and that all
     * internal members (e.g. writers, etc.) are re-initialized at a new invocation (maybe clear them inside endDocument(), or inside startDocument()). In the case
     * the handler does not have any internal states that are critical, there should be no problems at all.<br>
     * <br>
     * The URL is normalized first. The method creates its own ParseContext with a new {@link CrawlerContext} that references the given handler. If the handler is a
     * {@link DataSinkContentHandler}, its metadata object is used for the crawl, and its crawlFinished() method is invoked at the end, whether the crawl succeeded or
     * not. The stream is closed afterwards, also in the case crawlFinished() fails. Errors that occur during the crawl are handed over to
     * ExceptionUtils.handleException(..).
     * 
     * @param url the URL you want to crawl/extract content from. This can either be a file://, http:// or - in future - other urls (e.g. for databases, imap, webDAV,
     *            etc...)
     * @param handler the handler that should handle the extracted data
     * 
     * @throws IOException if closing the stream fails, or possibly passed on by the error handling
     * @throws SAXException possibly passed on by the error handling (presumably depending on the crawler configuration)
     * @throws TikaException possibly passed on by the error handling (presumably depending on the crawler configuration)
     */
    public void parse(URLName url, ContentHandler handler) throws IOException, SAXException, TikaException
    {
        url = UrlUtil.normalizeURL(url);

        ParseContext context = new ParseContext();
        context.set(Parser.class, super.getParser());

        context.set(CrawlerContext.class, new CrawlerContext().setContentHandler(handler));

        Metadata metadata = new Metadata();
        InputStream stream = null;

        if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

        try
        {

            metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
            stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);

            getParser().parse(stream, handler, metadata, context);

        }
        catch (Throwable e)
        {
            ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
        }
        finally
        {
            try
            {
                if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
            }
            finally
            {
                // nested finally: the stream must not leak in the case crawlFinished() throws
                if(stream != null) stream.close();
            }
        }

    }



    /**
     * Parses the data behind an URL with a callback content handler and a ParseContext config. We recommend using your own implementation of
     * {@link DataSinkContentHandler}. In the case you want to use another ContentHandler, be aware that this object is re-used at every recursive invocation. So make
     * sure that this is possible, and that all internal members (e.g. writers, etc.) are re-initialized at a new invocation (maybe clear them inside endDocument(),
     * or inside startDocument()). In the case the handler does not have any internal states that are critical, there should be no problems at all. In the case you
     * have a critical handler with a default constructor, you can also set the class name inside the CrawlerContext object inside the ParseContext. In this case, a
     * new handler object will be created at every recursive call.<br>
     * <br>
     * The URL is normalized first. The given handler is set into the {@link CrawlerContext} of the context; a new CrawlerContext is created and added in the case the
     * context has none. If the handler is a {@link DataSinkContentHandler}, its metadata object is used for the crawl, and its crawlFinished() method is invoked at
     * the end, whether the crawl succeeded or not. The stream is closed afterwards, also in the case crawlFinished() fails. Errors that occur during the crawl are
     * handed over to ExceptionUtils.handleException(..).
     * 
     * @param url the URL you want to crawl/extract content from. This can either be a file://, http:// or - in future - other urls (e.g. for databases, imap, webDAV,
     *            etc...)
     * @param handler the handler that should handle the extracted data
     * @param context the parsing context to use. An entry with the configured parser will be added by the method. You can pass in a CrawlerContext instance to e.g.
     *            enable incremental crawling.
     * 
     * @throws IOException if closing the stream fails, or possibly passed on by the error handling
     * @throws SAXException possibly passed on by the error handling (presumably depending on the crawler configuration)
     * @throws TikaException possibly passed on by the error handling (presumably depending on the crawler configuration)
     */
    public void parse(URLName url, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException
    {
        url = UrlUtil.normalizeURL(url);

        context.set(Parser.class, super.getParser());

        CrawlerContext crawlerContext = context.get(CrawlerContext.class);
        if(crawlerContext == null)
        {
            crawlerContext = new CrawlerContext();
            context.set(CrawlerContext.class, crawlerContext);
        }
        crawlerContext.setContentHandler(handler);

        Metadata metadata = new Metadata();
        InputStream stream = null;

        if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

        try
        {
            metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
            stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);


            getParser().parse(stream, handler, metadata, context);

        }
        catch (Throwable e)
        {
            ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
        }
        finally
        {
            try
            {
                if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
            }
            finally
            {
                // nested finally: the stream must not leak in the case crawlFinished() throws
                if(stream != null) stream.close();
            }
        }

    }



    /**
     * Parses a list of URLs with a callback-contenthandler. The URLs are written - one per line - into an in-memory stream that is handed to the configured parser
     * together with the content type 'application/leechUrlList'. We recommend to use an own implementation of DataSinkContentHandler. In the case you want to use
     * another ContentHandler, be aware that this Object is re-used at every recursive invocation. So make sure that this is possible, and all internal members (e.g.
     * writers, etc.) are re-initialized at the new invocation (maybe clear them inside endDocument(), or inside startDocument()). In the case the handler does not have
     * any internal states that are critical, there should be no problems at all. In the case you have a critical handler with a default constructor, you can also set the
     * class name inside the CrawlerContext object inside ParseContext. In this case, a new handler object will be created at every recursive call.
     * 
     * @param urls the URLs you want to crawl/extract content from. These can either be file://, http:// or - in future - other urls (e.g. for databases, imap, webDAV,
     *            etc...). The passed array itself is not modified.
     * @param handler the handler that should handle the extracted data
     * @param context the parsing context to use. An entry with the configured parser will be added by the method. You can pass in a CrawlerContext instance to e.g. set
     *            the contentHandler for recursive crawls or enable incremental crawling.
     * 
     * @throws IOException
     * @throws SAXException
     * @throws TikaException
     */
    public void parse(URLName[] urls, ContentHandler handler, ParseContext context) throws IOException, SAXException, TikaException
    {
        // normalize into a local copy, so that the array of the caller stays untouched
        URLName[] normalizedUrls = new URLName[urls.length];
        for (int i = 0; i < urls.length; i++)
            normalizedUrls[i] = UrlUtil.normalizeURL(urls[i]);

        // add an entry with the configured parser to the ParseContext
        context.set(Parser.class, super.getParser());

        // make sure a CrawlerContext exists inside the ParseContext and hand the callback handler over to it
        CrawlerContext crawlerContext = context.get(CrawlerContext.class);
        if(crawlerContext == null)
        {
            crawlerContext = new CrawlerContext();
            context.set(CrawlerContext.class, crawlerContext);
        }
        crawlerContext.setContentHandler(handler);

        Metadata metadata = new Metadata();
        InputStream stream = null;

        // a DataSinkContentHandler supplies its own Metadata object, which replaces the fresh one
        if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

        try
        {
            // the URL list has no real source, so a unique id is generated to build its name, source, entity id and fingerprint
            String strUid = new UID().toString();
            metadata.add(Metadata.RESOURCE_NAME_KEY, "leechUrlList " + strUid);
            metadata.add(DublinCore.SOURCE, strUid + "_leechUrlList.urlList");
            metadata.add(IncrementalCrawlingHistory.dataEntityId, strUid + "_leechUrlList.urlList");
            metadata.add(IncrementalCrawlingHistory.dataEntityContentFingerprint, strUid + "_leechUrlList.urlList");
            metadata.add(Metadata.CONTENT_TYPE, "application/leechUrlList");

            // one URL per line
            StringBuilder strbSources = new StringBuilder();
            for (URLName url : normalizedUrls)
                strbSources.append(url.toString()).append("\n");

            // NOTE(review): getBytes() encodes with the platform default charset; the consumer of this stream presumably decodes with the same default - confirm
            // before switching to an explicit charset
            stream = new ByteArrayInputStream(strbSources.toString().getBytes());


            // we have to be careful: when a list is passed in and we work with the URL list parser, the first recursion level must not be counted
            // NOTE(review): the increased depth is not reset afterwards, so re-using the same CrawlerContext for a further call raises it again - confirm this is intended
            int iCrawlingDepth = crawlerContext.getCrawlingDepth();
            if(iCrawlingDepth < Integer.MAX_VALUE) crawlerContext.setCrawlingDepth(iCrawlingDepth + 1);

            getParser().parse(stream, handler, metadata, context);
        }
        catch (Throwable e)
        {
            // every failure is delegated to the central exception handling
            ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
        }
        finally
        {
            try
            {
                if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
            }
            finally
            {
                // close the stream even when crawlFinished() fails
                if(stream != null) stream.close();
            }
        }

    }



    /**
     * Parse a URL by specifying a ParseContext config. You can pass in a CrawlerContext instance to e.g. set the ContentHandler for recursive crawls. This one will be
     * newly instantiated with the default constructor for every recursive call.
     * 
     * @param url the URL you want to crawl/extract content from. This can either be a file://, http:// or - in future - other urls (e.g. for databases, imap, webDAV,
     *            etc...)
     * @param context the parsing context to use. An entry with the configured parser will be added by the method. You can pass in a CrawlerContext instance to e.g. set
     *            the contentHandler for recursive crawls or enable incremental crawling.
     * 
     * @throws IOException
     * @throws SAXException
     * @throws TikaException
     */
    public void parse(URLName url, ParseContext context) throws IOException, SAXException, TikaException
    {
        url = UrlUtil.normalizeURL(url);

        // add an entry with the configured parser to the ParseContext
        context.set(Parser.class, super.getParser());

        Metadata metadata = new Metadata();
        InputStream stream = null;

        // the handler is not passed in by the caller but obtained via the ParseContext
        ContentHandler handler = getContentHandler(context);

        // a DataSinkContentHandler supplies its own Metadata object, which replaces the fresh one
        if(handler instanceof DataSinkContentHandler) metadata = ((DataSinkContentHandler) handler).getMetaData();

        try
        {
            // the stream provider matching the URL contributes the initial metadata and opens the content stream
            metadata = URLStreamProvider.getURLStreamProvider(url).addFirstMetadata(url, metadata, context);
            stream = URLStreamProvider.getURLStreamProvider(url).getStream(url, metadata, context);

            getParser().parse(stream, handler, metadata, context);
        }
        catch (Throwable e)
        {
            // every failure is delegated to the central exception handling
            ExceptionUtils.handleException(e, null, metadata, context.get(CrawlerContext.class), context, 0, handler);
        }
        finally
        {
            try
            {
                if(handler instanceof DataSinkContentHandler) ((DataSinkContentHandler) handler).crawlFinished();
            }
            finally
            {
                // close the stream even when crawlFinished() fails - otherwise the stream would leak
                if(stream != null) stream.close();
            }
        }

    }



    /**
     * Extracts the text of a file by converting it into a URLName and delegating to {@link #parseToString(URLName)}.
     * 
     * @param file the file to extract the text from
     * 
     * @return the result of {@link #parseToString(URLName)} for the URL of the file
     * 
     * @throws IOException
     * @throws TikaException
     */
    @Override
    public String parseToString(File file) throws IOException, TikaException
    {
        URL fileUrl = file.toURI().toURL();

        return parseToString(new URLName(fileUrl));
    }



    /**
     * Not supported - this method always fails. Use {@link #parseToString(URLName)} instead.
     * 
     * @param url ignored
     * 
     * @return never returns normally
     * 
     * @throws UnsupportedOperationException always
     */
    @Override
    public String parseToString(URL url) throws IOException, TikaException
    {
        String strMessage = "The java.net.URL class methods are not supported because our mechanism supporting new protocols and the according stream creation differ.\n"
                + "Use the according URLName method instead";

        throw new UnsupportedOperationException(strMessage);
    }




    /**
     * Extracts the text of the data entity behind the given URL. The URL is normalized, the stream provider matching the URL supplies the initial metadata and the
     * content stream, and both are handed over to parseToString(InputStream, Metadata).
     * 
     * @param url the URL you want to extract the text from
     * 
     * @return the result of parseToString(InputStream, Metadata) for the stream of the URL
     * 
     * @throws IOException
     * @throws TikaException wraps everything that is thrown while the metadata or the stream is created, or while the stream is parsed
     */
    public String parseToString(URLName url) throws IOException, TikaException
    {
        final URLName normalizedUrl = UrlUtil.normalizeURL(url);

        try
        {
            // neither an existing Metadata object nor a ParseContext is available here, so null is passed for both
            Metadata firstMetadata = URLStreamProvider.getURLStreamProvider(normalizedUrl).addFirstMetadata(normalizedUrl, null, null);
            InputStream contentStream = URLStreamProvider.getURLStreamProvider(normalizedUrl).getStream(normalizedUrl, firstMetadata, null);

            // NOTE(review): the stream is not closed in this method; presumably parseToString(InputStream, Metadata) closes it - confirm
            return parseToString(contentStream, firstMetadata);
        }
        catch (Throwable e)
        {
            throw new TikaException("Error while parsing " + normalizedUrl.getFile(), e);
        }
    }







}