DataSinkContentHandler.java example

Explorer

leech-master
- src
  - main
    - java
      - de
        dfki
        km
        leech
        Leech.java
        SubDataEntityContentHandler.java
        config
        CrawlerContext.java
        DirectoryCrawlerContext.java
        HtmlCrawlerContext.java
        ImapCrawlerContext.java
        LeechConfig.java
        detect
        DatasourceMediaTypes.java
        DirectoryDatasourceDetector.java
        ImapDatasourceDetector.java
        LeechDefaultDetector.java
        io
        FileURLStreamProvider.java
        HttpURLStreamProvider.java
        ImapURLStreamProvider.java
        ShiftInitInputStream.java
        URLStreamProvider.java
        lucene
        LeechDefaultFieldConfig.java
        ToLuceneContentHandler.java
        metadata
        LeechMetadata.java
        parser
        CrawlerParser.java
        DirectoryCrawlerParser.java
        HtmlCrawlerParser.java
        ImapCrawlerParser.java
        NonRecursiveCrawlerParser.java
        SambaCrawlerParser.java
        UrlListCrawlerParser.java
        filter
        RegExpPattern.java
        SubstringPattern.java
        URLFilter.java
        URLFilterPattern.java
        URLFilteringParser.java
        incremental
        IncrementalCrawlingHistory.java
        IncrementalCrawlingParser.java
        rss
        FeedParser2.java
        wikipedia
        WikipediaDumpParser.java
        sax
        CrawlReportContentHandler.java
        DataSinkContentHandler.java
        DataSinkContentHandlerAdapter.java
        DataSinkContentHandlerDecorator.java
        PrintlnContentHandler.java
        solr
        ToSolrContentHandler.java
        util
        CookieManager.java
        ExceptionUtils.java
        IndexPostprocessor.java
        LeechException.java
        LuceneIndexCreator.java
        OSUtils.java
        SolrIndexCreator.java
        TikaUtils.java
        UrlUtil.java
        ValueHolder.java
        certificates
        CertificateIgnoringSocketFactory.java
        CertificateStore.java
        Decision.java
        PersistentCertificateStore.java
        RootCertificateStore.java
        SessionCertificateStore.java
        StandardTrustManager.java
        TrustDecider.java

/*
 * Leech - crawling capabilities for Apache Tika
 * 
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
 * either version 3 of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech.sax;




import java.io.StringWriter;

import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerDecorator;
import org.apache.tika.sax.WriteOutContentHandler;
import org.xml.sax.SAXException;

import de.dfki.km.leech.parser.CrawlerParser;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser;
import de.dfki.km.leech.util.UrlUtil;



/**
 * A ContentHandler implementation to store data with a hook/push-interface. This is for crawling datasources recursively. Implement the processData methods and process
 * your data as you wish.<br>
 * <br>
 * This handler deals with the data entity modification state entries inside the metadata offered from {@link IncrementalCrawlingParser} and {@link CrawlerParser} (in
 * case of an error).
 * 
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public abstract class DataSinkContentHandler extends ContentHandlerDecorator
{

    protected int m_iWriteLimit = -1;

    protected Metadata m_metadata = new Metadata();

    protected StringWriter m_writer;



    /**
     * Creates a new {@link DataSinkContentHandler}.<br>
     * CAUTION:Note that the internal metadata object has to be the same than the one given to the parser that works with this contentHandler. Use
     * {@link #setMetaData(Metadata)} or one of the Leech methods with the DataSinkContentHandlers. In the second case Leech will make sure that the metadata objects will
     * be set correctly.
     */
    public DataSinkContentHandler()
    {
    }



    /**
     * Creates a content handler that writes XHTML body character events to an internal string buffer, and forwards it together with the metadata object to a
     * callback/processing method.<br>
     * CAUTION:Note that the internal metadata object has to be the same than the one given to the parser that works with this contentHandler. Use
     * {@link #setMetaData(Metadata)} or one of the Leech methods with the DataSinkContentHandlers. In the second case Leech will make sure that the metadata objects will
     * be set correctly.
     * <p>
     * <p>
     * The internal string buffer is bounded at the given number of characters. If this write limit is reached, then a {@link SAXException} is thrown.
     * 
     * @param writeLimit maximum number of characters to include in the string, or -1 to disable the write limit
     */
    public DataSinkContentHandler(int writeLimit)
    {
        m_iWriteLimit = writeLimit;
    }



    /**
     * Creates a content handler that writes XHTML body character events to an internal string buffer, and forwards it together with the metadata object to a
     * callback/processing method.
     * <p>
     * The internal string buffer is bounded at 6 * 1024 * 1024 characters. If this write limit is reached, then a {@link SAXException} is thrown.
     * 
     * @param metadata the metadata object given to the parser object that works with this ContentHandler. This is to forward this reference to the processing method, so
     *            make sure that both objects holds the same object
     */
    public DataSinkContentHandler(Metadata metadata)
    {
        m_metadata = metadata;
    }



    /**
     * Creates a content handler that writes XHTML body character events to an internal string buffer, and forwards it together with the metadata object to a
     * callback/processing method.
     * <p>
     * <p>
     * The internal string buffer is bounded at the given number of characters. If this write limit is reached, then a {@link SAXException} is thrown.
     * 
     * @param writeLimit maximum number of characters to include in the string, or -1 to disable the write limit
     * @param metadata the metadata object given to the parser object that works with this ContentHandler. This is to forward this reference to the processing method, so
     *            make sure that both objects holds the same object
     */
    public DataSinkContentHandler(Metadata metadata, int writeLimit)
    {
        m_iWriteLimit = writeLimit;
        m_metadata = metadata;
    }



    /**
     * This method will be invoked by the leech class at the end of the parse method. You can perform some shutdown stuff after the crawl if you implement this method.
     */
    public abstract void crawlFinished();



    @Override
    public void endDocument() throws SAXException
    {
        super.endDocument();


        String strDataEntitiyModState = m_metadata.get(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE);

        // wir entfernen die Dinge, die wir gar nicht drin haben wollen
        m_metadata.remove(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE);
        m_metadata.remove(CrawlerParser.CURRENT_CRAWLING_DEPTH);

        // und passen auf, daß nicht noch Passwörter in einer URL stehen
        String strBadAttName = IncrementalCrawlingHistory.dataEntityId;
        String[] straUrlsWithPwd = m_metadata.getValues(strBadAttName);
        m_metadata.remove(strBadAttName);
        for (String strPossiblePwdUrlString : straUrlsWithPwd)
            m_metadata.add(strBadAttName, UrlUtil.urlNameWithoutPassword(strPossiblePwdUrlString));

        strBadAttName = Metadata.SOURCE;
        straUrlsWithPwd = m_metadata.getValues(strBadAttName);
        m_metadata.remove(strBadAttName);
        for (String strPossiblePwdUrlString : straUrlsWithPwd)
            m_metadata.add(strBadAttName, UrlUtil.urlNameWithoutPassword(strPossiblePwdUrlString));

        strBadAttName = Metadata.RESOURCE_NAME_KEY;
        straUrlsWithPwd = m_metadata.getValues(strBadAttName);
        m_metadata.remove(strBadAttName);
        for (String strPossiblePwdUrlString : straUrlsWithPwd)
            m_metadata.add(strBadAttName, UrlUtil.urlNameWithoutPassword(strPossiblePwdUrlString));




        if(IncrementalCrawlingParser.MODIFIED.equals(strDataEntitiyModState))
        {
            processModifiedData(m_metadata, this.toString());
        }
        else if(IncrementalCrawlingParser.REMOVED.equals(strDataEntitiyModState))
        {
            // these are set because of the dummy stream
            m_metadata.remove(HttpHeaders.CONTENT_ENCODING);
            m_metadata.remove(HttpHeaders.CONTENT_TYPE);
            processRemovedData(m_metadata);
        }
        else if(IncrementalCrawlingParser.ERROR.equals(strDataEntitiyModState))
        {
            processErrorData(m_metadata);
        }
        else if(IncrementalCrawlingParser.UNMODIFIED.equals(strDataEntitiyModState))
        {
            // these are set because of the dummy stream
            m_metadata.remove(HttpHeaders.CONTENT_ENCODING);
            m_metadata.remove(HttpHeaders.CONTENT_TYPE);
            processUnmodifiedData(m_metadata);
        }
        else if(IncrementalCrawlingParser.PROCESSED.equals(strDataEntitiyModState))
        {
            // these are set because of the dummy stream
            m_metadata.remove(HttpHeaders.CONTENT_ENCODING);
            m_metadata.remove(HttpHeaders.CONTENT_TYPE);
            processProcessedData(m_metadata);
        }
        else
            processNewData(m_metadata, this.toString());



        // da wir diesen handler über die rekursiven Aufrufe wiederverwenden möchten, setzen wir hier die members zurück. Das metadata-Object wird im
        // CrawlerParser zurückgesetzt
        if(m_writer != null) m_writer.getBuffer().delete(0, m_writer.getBuffer().length());

    }



    /**
     * This is invoked if we have an entity that was crawled at another crawl in the past, according to the crawling history, and was not modified, according to the
     * dataEntityContentFingerprint.
     * 
     * @param metadata some metadata (at least an identifying Id) to deal with the entity
     */
    abstract public void processUnmodifiedData(Metadata metadata);



    /**
     * This is invoked if we have an entity that was processed in this crawl yet. This is if we have somehow a double entry, or if we have cycles, e.g. during a web
     * crawl, where we sometimes come back to a link we started from.
     * 
     * @param metadata some metadata (at least an identifying Id) to deal with the entity
     */
    abstract public void processProcessedData(Metadata metadata);



    public Metadata getMetaData()
    {
        return m_metadata;
    }



    protected void init()
    {
        m_writer = new StringWriter();
        setContentHandler(new BodyContentHandler(new WriteOutContentHandler(m_writer, m_iWriteLimit)));
    }



    /**
     * Will be invoked in the case a data entity causes an error during indexing.
     * 
     * @param metadata some metadata (at least an identifying Id) to deal with the error entity
     */
    public abstract void processErrorData(Metadata metadata);



    /**
     * Will be invoked in the case a data entity was modified since the last crawl.
     * 
     * @param metadata the metadata of the data entity
     * @param strFulltext the full body text of the data entity
     */
    public abstract void processModifiedData(Metadata metadata, String strFulltext);



    /**
     * Will be invoked in the case a new data entity was found.
     * 
     * @param metadata the metadata of the data entity
     * @param strFulltext the full body text of the data entity
     */
    public abstract void processNewData(Metadata metadata, String strFulltext);



    /**
     * Will be invoked in the case a data entity was removed since the last crawl.
     * 
     * @param metadata some metadata (at least an identifying Id) to deal with the removed entity
     */
    public abstract void processRemovedData(Metadata metadata);




    public void setMetaData(Metadata metadata)
    {
        m_metadata = metadata;
    }




    @Override
    public void startDocument() throws SAXException
    {
        init();
        super.startDocument();
    }

}