CrawlerParser.java example

Explorer

leech-master
- src
  - main
    - java
      - de
        dfki
        km
        leech
        Leech.java
        SubDataEntityContentHandler.java
        config
        CrawlerContext.java
        DirectoryCrawlerContext.java
        HtmlCrawlerContext.java
        ImapCrawlerContext.java
        LeechConfig.java
        detect
        DatasourceMediaTypes.java
        DirectoryDatasourceDetector.java
        ImapDatasourceDetector.java
        LeechDefaultDetector.java
        io
        FileURLStreamProvider.java
        HttpURLStreamProvider.java
        ImapURLStreamProvider.java
        ShiftInitInputStream.java
        URLStreamProvider.java
        lucene
        LeechDefaultFieldConfig.java
        ToLuceneContentHandler.java
        metadata
        LeechMetadata.java
        parser
        CrawlerParser.java
        DirectoryCrawlerParser.java
        HtmlCrawlerParser.java
        ImapCrawlerParser.java
        NonRecursiveCrawlerParser.java
        SambaCrawlerParser.java
        UrlListCrawlerParser.java
        filter
        RegExpPattern.java
        SubstringPattern.java
        URLFilter.java
        URLFilterPattern.java
        URLFilteringParser.java
        incremental
        IncrementalCrawlingHistory.java
        IncrementalCrawlingParser.java
        rss
        FeedParser2.java
        wikipedia
        WikipediaDumpParser.java
        sax
        CrawlReportContentHandler.java
        DataSinkContentHandler.java
        DataSinkContentHandlerAdapter.java
        DataSinkContentHandlerDecorator.java
        PrintlnContentHandler.java
        solr
        ToSolrContentHandler.java
        util
        CookieManager.java
        ExceptionUtils.java
        IndexPostprocessor.java
        LeechException.java
        LuceneIndexCreator.java
        OSUtils.java
        SolrIndexCreator.java
        TikaUtils.java
        UrlUtil.java
        ValueHolder.java
        certificates
        CertificateIgnoringSocketFactory.java
        CertificateStore.java
        Decision.java
        PersistentCertificateStore.java
        RootCertificateStore.java
        SessionCertificateStore.java
        StandardTrustManager.java
        TrustDecider.java

/*
 * Leech - crawling capabilities for Apache Tika
 * 
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
 * either version 3 of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech.parser;



import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.LinkedList;

import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import de.dfki.inquisition.collections.MultiValueHashMap;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser;
import de.dfki.km.leech.sax.DataSinkContentHandler;
import de.dfki.km.leech.util.ExceptionUtils;
import de.dfki.km.leech.util.TikaUtils;



/**
 * This is the upper class for all crawling parsers. If you want to write a crawling parser, implement this class. CrawlerParser will first invoke
 * {@link #processCurrentDataEntity(InputStream, Metadata, ContentHandler, ParseContext)} to process the input stream and push it to a ContentHandler, simply the
 * standard Tika parsing way (this step is skipped when the metadata flags the entity as {@link IncrementalCrawlingParser#UNMODIFIED}). Next, it will call
 * {@link #getSubDataEntitiesInformation(InputStream, ContentHandler, Metadata, ParseContext)} to determine all succeeding sub data entities from this data entity
 * (skipped when the configured crawling depth would be exceeded). CrawlerParser then iterates over these entries and gives them to
 * {@link #processSubDataEntity(MultiValueHashMap, Metadata, ContentHandler, ParseContext)} in order to further process the sub data entities individually. This is the
 * recursive call, which starts the whole parsing process again with a new entity. <br>
 * <br>
 * The crawling process can be configured with specific context classes, have a look into the 'config' package, especially {@link CrawlerContext}.
 * 
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public abstract class CrawlerParser implements Parser
{

    private static final long serialVersionUID = -6707880965147815349L;

    /** Metadata key under which the crawling depth of the currently processed data entity is stored (as a decimal string). */
    public static final String CURRENT_CRAWLING_DEPTH = "currentCrawlingDepth";

    /** Key for the sub data entity information map, identifying the data entity in error messages. */
    public static final String SOURCEID = "sourceId";



    /**
     * Gets information about all data entities that should be (sub)crawled by this crawler instance. This e.g. could be all files and directories inside the current
     * directory. You can return arbitrary information about a data entity - it will be offered as-is at the invocation of
     * {@link #processSubDataEntity(MultiValueHashMap, Metadata, ContentHandler, ParseContext)} in order to deal with it. <br>
     * <br>
     * To consider constraints given from the user for Url/datasource string filtering, use the potential CrawlerContext Object inside the ParseContext and use the
     * URLFilter. Same is for the stop request, which is also offered by the CrawlerContext. Leech deals automatically with stop requests and data entity filtering, but
     * you can enhance the performance when you filter subentities early in this class. This is because otherwise there will be a stream initialization or established
     * connection before filtering. <br>
     * <br>
     * While creating the information Map for a (sub) data entity, it is recommended to put at least one key entry with CrawlerParser.SOURCEID for use in potential error
     * messages, to identify a problematic data entity. In the case you do so, you can simply throw all Exceptions inside your implementation of processSubDataEntity, the
     * super class will deal with it.<br>
     * <br>
     * 
     * @param stream the stream-parameter from the Parser.parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) invocation
     * @param handler the handler-parameter from the Parser.parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) invocation
     * @param metadata a copy of the metadata-parameter from the Parser.parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
     *            invocation
     * @param context the context-parameter from the Parser.parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) invocation
     * 
     * @return an iterator with all information about a data entity that should be crawled, that is enough to deal with it inside the other method implementations
     * 
     * @throws Exception on any failure; {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} rethrows it as (or wrapped into) a TikaException
     */
    protected abstract Iterator<MultiValueHashMap<String, Object>> getSubDataEntitiesInformation(InputStream stream, ContentHandler handler, Metadata metadata,
            ParseContext context) throws Exception;



    /**
     * Crawls one data entity: processes the entity itself, then iterates over its sub data entities and hands each one to
     * {@link #processSubDataEntity(MultiValueHashMap, Metadata, ContentHandler, ParseContext)}. <br>
     * <br>
     * The current crawling depth is read from the metadata entry {@link #CURRENT_CRAWLING_DEPTH} (0 if absent). The passed metadata object is cleared and refilled for
     * every sub data entity, and cleared once more at the end of a depth-0 invocation. Exceptions thrown while processing a single sub data entity are passed to
     * {@link ExceptionUtils#handleException} instead of aborting the loop directly.
     * 
     * @param stream the stream of the data entity to crawl
     * @param handler the content handler, passed on to {@link #getSubDataEntitiesInformation(InputStream, ContentHandler, Metadata, ParseContext)}
     * @param metadata the metadata of the data entity; its content is modified during the crawl
     * @param context the parse context; a {@link CrawlerContext} is looked up in it, a default one is used if none is present
     * 
     * @throws TikaException if anything fails outside the per-sub-entity error handling; other exception types (including IOException, SAXException and runtime
     *             exceptions) raised inside the crawl are wrapped into a TikaException
     * @throws IOException if closing the temporary stream fails
     * @throws SAXException declared by the Parser interface
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException
    {

        CrawlerContext crawlerContext = context.get(CrawlerContext.class);
        if(crawlerContext == null) crawlerContext = new CrawlerContext();

        // read the source up front - the metadata content gets cleared during the crawl, but we need it for the error message
        String strSourceURL = metadata.get(Metadata.SOURCE);
        int iCurrentCrawlingDepth = 0;
        TikaInputStream tmpStream = null;

        try
        {

            String strDepth = metadata.get(CURRENT_CRAWLING_DEPTH);
            // a malformed depth raises a NumberFormatException, which ends up wrapped into a TikaException below
            if(strDepth != null) iCurrentCrawlingDepth = Integer.parseInt(strDepth);



            // ## the entity currently being crawled (the potential container, possibly with content of its own)

            String strDataEntityModState = metadata.get(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE);

            if(!IncrementalCrawlingParser.UNMODIFIED.equals(strDataEntityModState))
            {
                // it is probably better to take up TikaInputStream's temp-file offer here - the disk will presumably be faster than an average
                // internet connection, even when writing. If there is no file behind the stream (e.g. for an http connection), Tika will
                // automatically create a temporary file.
                // NOTE(review): the inner TikaInputStream.get(stream) wrapper is never closed here - presumably the caller owns and closes
                // 'stream'; confirm that no temporary resources are leaked when 'stream' is not already a TikaInputStream.
                tmpStream = TikaInputStream.get((TikaInputStream.get(stream).getPath()));

                ContentHandler handler2use4currentEntity = TikaUtils.createContentHandler4SubCrawl(crawlerContext);

                processCurrentDataEntity(tmpStream, metadata, handler2use4currentEntity, context);
            }



            // ## the sub entities - only processed if the maximum crawling depth has not been reached yet

            Iterator<MultiValueHashMap<String, Object>> subDataEntitiesInformation;

            if(iCurrentCrawlingDepth + 1 > crawlerContext.getCrawlingDepth())
                subDataEntitiesInformation = new LinkedList<MultiValueHashMap<String, Object>>().iterator();
            else
            {
                // we copy the metadata here so that the loop can use the original object (the iterator may only be fed by another thread while
                // the loop is running, and the metadata object should still be valid then - but we change its content inside the loop)
                subDataEntitiesInformation = getSubDataEntitiesInformation(stream, handler, TikaUtils.copyMetadata(metadata), context);
            }



            int iEntityIndex = 0;
            while (subDataEntitiesInformation.hasNext() && !crawlerContext.stopRequested())
            {

                MultiValueHashMap<String, Object> subDataEntityInfo = subDataEntitiesInformation.next();

                // for each entity we check whether we have to create a new handler
                ContentHandler handler2use4recursiveCall = TikaUtils.createContentHandler4SubCrawl(crawlerContext);
                try
                {
                    // clear the content of the metadata object: we want to keep the reference (in case a handler holds it too), but the content
                    // must be filled anew for the sub entity
                    TikaUtils.clearMetadata(metadata);

                    // store the current depth as well, so that crawling can be cut off if necessary
                    metadata.set(CURRENT_CRAWLING_DEPTH, String.valueOf(iCurrentCrawlingDepth + 1));

                    processSubDataEntity(subDataEntityInfo, metadata, handler2use4recursiveCall, context);
                }
                catch (Throwable e)
                {
                    // a failing sub entity is reported to the central exception handling, identified by its SOURCEID entry if present
                    Object sourceId = subDataEntityInfo.getFirst(SOURCEID);

                    ExceptionUtils.handleException(e, sourceId == null ? "noSourceId" : sourceId.toString(), metadata, crawlerContext, context, iCurrentCrawlingDepth,
                            handler2use4recursiveCall);
                }


                iEntityIndex++;

                if(iEntityIndex % 10000 == 0)
                {
                    // twice is full gc
                    System.gc();
                    System.gc();
                }

            }



            if(iCurrentCrawlingDepth != 0) return;

            // finally clear the metadata as well, in case it gets reused for another Leech invocation
            TikaUtils.clearMetadata(metadata);

        }
        catch (Exception e)
        {
            if(e instanceof TikaException) throw (TikaException) e;
            throw new TikaException("Error while crawling '" + strSourceURL + "'", e);
        }
        finally
        {
            try
            {
                if(tmpStream != null) tmpStream.close();
            }
            finally
            {
                // also interrupt properly here if a stop was requested. This runs in its own finally block so that a failing close() above
                // cannot skip the notification and leave a waiting thread blocked.
                // NOTE(review): the monitor is whatever object stopRequested() returns - if that is an auto-boxed boolean, this locks on the
                // shared Boolean.TRUE/FALSE instances. Verify against CrawlerContext and the waiting side before changing it.
                Boolean bStopRequested = crawlerContext.stopRequested();
                synchronized (bStopRequested)
                {
                    if(bStopRequested && iCurrentCrawlingDepth == 0) bStopRequested.notifyAll();
                }
            }
        }

    }



    /**
     * Processes the current data entity that should be parsed. This method extracts the content by e.g. delegating the stream to a specific Parser in order to push the
     * content to the ContentHandler, whereby the {@link #getSubDataEntitiesInformation(InputStream, ContentHandler, Metadata, ParseContext)} method extracts all the
     * links to other data entities out of this content, for further processing them individually. <br>
     * <br>
     * For example, the {@link HtmlCrawlerParser} simply delegates the parameters to a Tika HtmlParser Object. This method is only invoked when the data entity is not
     * flagged as unmodified (this info is inside the metadata, possibly generated by {@link IncrementalCrawlingParser}).
     * 
     * @param stream a 'cloned' stream from the stream given from the parse method
     * @param metadata the metadata given from the parse method
     * @param handler the origin content handler instance from the parse method, OR an instance created newly at every data entity as configured inside CrawlerContext
     * @param context the ParseContext Object given from the parse method
     * 
     * @throws Exception on any failure; {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} rethrows it as (or wrapped into) a TikaException
     */
    protected abstract void processCurrentDataEntity(InputStream stream, Metadata metadata, ContentHandler handler, ParseContext context) throws Exception;



    /**
     * Processes a sub data entity from this parsed 'container' data entity - this can be the recursive call, in the case you have other complex data types behind your
     * sub data entities, which needs further parsing again. In this case, normally you invoke some kind of Leech.parse(...) method here, e.g.<br>
     * <br>
     * <code>
     * Parser parser = m_leech.getParser();<br>
     * parser.parse(stream, handler2use4recursiveCall, metadata, context);<br>
     * <br>
     * </code> In the other case, you have all the information yet, ready for the final handler. In this case, you can send it directly, without further processing: <br>
     * <br>
     * <code>
     *  SubDataEntityContentHandler subHandler = new SubDataEntityContentHandler(handler, metadata, strBody);<br>
     *   if(ignoreHistory)<br>
     *           subHandler.triggerSubDataEntityHandling();<br>
     *   else<br>
     *         subHandler.triggerSubDataEntityHandling(context);<br>
     *  </code> <br>
     * The stream and possible additional metadata entries you get (or create) out of information inside the subDataEntityInformation. Make sure that you reuse the
     * metadata Object for the case that the handler has also an internal metadata member that must be the same object (as inside {@link DataSinkContentHandler})
     * 
     * @param subDataEntityInformation one entry out of the formerly returned iterator from
     *            {@link #getSubDataEntitiesInformation(InputStream, ContentHandler, Metadata, ParseContext)}
     * @param metadata2use4recursiveCall the metadata object that should be used for handling / recursive calls
     * @param handler2use4recursiveCall the origin content handler instance from the root crawl invocation, OR an instance created newly at every data entity as
     *            configured inside CrawlerContext
     * @param context the origin ParseContext instance given from the parse method
     * 
     * @throws Exception on any failure; {@link #parse(InputStream, ContentHandler, Metadata, ParseContext)} catches it and passes it to the central exception
     *             handling together with the {@link #SOURCEID} entry of the sub data entity
     */
    protected abstract void processSubDataEntity(MultiValueHashMap<String, Object> subDataEntityInformation, Metadata metadata2use4recursiveCall,
            ContentHandler handler2use4recursiveCall, ParseContext context) throws Exception;

}