IncrementalCrawlingParser.java example

Explorer

leech-master
- src
  - main
    - java
      - de
        dfki
        km
        leech
        Leech.java
        SubDataEntityContentHandler.java
        config
        CrawlerContext.java
        DirectoryCrawlerContext.java
        HtmlCrawlerContext.java
        ImapCrawlerContext.java
        LeechConfig.java
        detect
        DatasourceMediaTypes.java
        DirectoryDatasourceDetector.java
        ImapDatasourceDetector.java
        LeechDefaultDetector.java
        io
        FileURLStreamProvider.java
        HttpURLStreamProvider.java
        ImapURLStreamProvider.java
        ShiftInitInputStream.java
        URLStreamProvider.java
        lucene
        LeechDefaultFieldConfig.java
        ToLuceneContentHandler.java
        metadata
        LeechMetadata.java
        parser
        CrawlerParser.java
        DirectoryCrawlerParser.java
        HtmlCrawlerParser.java
        ImapCrawlerParser.java
        NonRecursiveCrawlerParser.java
        SambaCrawlerParser.java
        UrlListCrawlerParser.java
        filter
        RegExpPattern.java
        SubstringPattern.java
        URLFilter.java
        URLFilterPattern.java
        URLFilteringParser.java
        incremental
        IncrementalCrawlingHistory.java
        IncrementalCrawlingParser.java
        rss
        FeedParser2.java
        wikipedia
        WikipediaDumpParser.java
        sax
        CrawlReportContentHandler.java
        DataSinkContentHandler.java
        DataSinkContentHandlerAdapter.java
        DataSinkContentHandlerDecorator.java
        PrintlnContentHandler.java
        solr
        ToSolrContentHandler.java
        util
        CookieManager.java
        ExceptionUtils.java
        IndexPostprocessor.java
        LeechException.java
        LuceneIndexCreator.java
        OSUtils.java
        SolrIndexCreator.java
        TikaUtils.java
        UrlUtil.java
        ValueHolder.java
        certificates
        CertificateIgnoringSocketFactory.java
        CertificateStore.java
        Decision.java
        PersistentCertificateStore.java
        RootCertificateStore.java
        SessionCertificateStore.java
        StandardTrustManager.java
        TrustDecider.java

/*
 * Leech - crawling capabilities for Apache Tika
 * 
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
 * either version 3 of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech.parser.incremental;



import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.channels.FileLock;
import java.nio.channels.OverlappingFileLockException;
import java.util.Iterator;
import java.util.UUID;

import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.parser.CrawlerParser;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory.Exist;
import de.dfki.km.leech.util.TikaUtils;



/**
 * A Parser decorator which enables incremental indexing during the crawl. For this, {@link IncrementalCrawlingParser} needs two entries inside the metadata given to
 * the parse method:
 * <ul>
 * <li>{@link IncrementalCrawlingHistory}.dataEntityId: an identifier for a data entity that is independent from the content of this entity. It is only for identifying
 * the occurrence, not to check whether it has changed (e.g. a filename)</li>
 * <li>{@link IncrementalCrawlingHistory}.dataEntityContentFingerprint: some fingerprint/identifier that gives the hint whether the content of the data entity has
 * changed, e.g. the modified date of a file</li>
 * </ul>
 * These entries depend on the type of the datasource, which is considered when creating the InputStream for the parse method. Thus, both metadata entries are set in
 * {@link TikaUtils} during stream creation.<br>
 * <br>
 * Dependent on these entries this decorator writes a data entity modification state (new, modified, unmodified, processed, removed) into the metadata before
 * delegating to the wrapped parser. In the case of a cycle during a crawl (when a data entity comes a second time during a crawl), nothing will be delegated.
 * 
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public class IncrementalCrawlingParser extends ParserDecorator
{

    /** Metadata key under which the modification state of a data entity is stored. */
    static public final String DATA_ENTITY_MODIFICATION_STATE = "dataEntitiyModificationState";

    /** State: the entity is known to the history, but its content fingerprint differs. */
    static public final String MODIFIED = "modified";

    /** State: the entity is not known to the history (or no history is in use). */
    static public final String NEW = "new";

    /** State: the entity was already handled during the current crawl (cycle or duplicate). */
    static public final String PROCESSED = "processed";

    /** State: the entity was reported by the history as removed when the crawl finished. */
    static public final String REMOVED = "removed";

    /** State constant for errors. It is not set anywhere inside this class. */
    static public final String ERROR = "error";

    private static final long serialVersionUID = 3823147926764040243L;

    /** State: the entity is known to the history with the same content fingerprint. */
    static public final String UNMODIFIED = "unmodified";



    /** Only used for its detector, to find out the media type of the incoming stream. */
    protected Leech m_leech = new Leech();



    /**
     * Creates a new incremental crawling decorator around the given parser.
     * 
     * @param parser the parser to wrap. {@link #parse} casts it to {@link CompositeParser}, so it has to be one
     */
    public IncrementalCrawlingParser(Parser parser)
    {
        super(parser);
    }



    /**
     * Writes the modification state of the data entity into the metadata, then delegates to the wrapped parser - or to {@link EmptyParser} in the case the entity was
     * processed during this crawl yet, or is unmodified and not handled by a {@link CrawlerParser}. At crawling depth 0 the method additionally starts and finishes
     * the crawl inside the history, emits an (empty) parse with state {@link #REMOVED} for every removed data entity, closes the history and removes a temporary
     * history that was created only for cycle detection.
     * 
     * @param stream the stream of the data entity
     * @param handler the handler that receives the parse events
     * @param metadata the metadata of the data entity. The modification state will be written into it. During the handling of removed entities its content is
     *            cleared and refilled
     * @param context the parse context. In the case it holds no {@link CrawlerContext}, a default one will be created and set
     * 
     * @throws IOException if releasing the temporary lock or closing its stream fails during the final cleanup
     * @throws SAXException declared by the overridden method. Exceptions out of the crawl itself are wrapped, see below
     * @throws TikaException for every exception that occurs during the crawl. Exceptions that are no TikaException are wrapped into one
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException
    {

        // we do all the incremental history bookkeeping here, then we parse the entity as usual. The respective crawler parsers possibly also
        // evaluate the entries inside the metadata. At the end, we close the history.


        IncrementalCrawlingHistory crawlingHistory = null;
        boolean bIsTmpHistory = false;
        FileLock tmpLock = null;
        FileOutputStream fosTmpLock = null;
        int iCurrentCrawlingDepth = 0;

        try
        {
            CrawlerContext crawlerContext = context.get(CrawlerContext.class);
            if(crawlerContext == null)
            {
                crawlerContext = new CrawlerContext();
                context.set(CrawlerContext.class, crawlerContext);
            }


            // we need the current crawling depth to determine when a complete crawl is finished. A CrawlerParser updates this info inside the
            // metadata
            String strDepth = metadata.get(CrawlerParser.CURRENT_CRAWLING_DEPTH);
            if(strDepth != null) iCurrentCrawlingDepth = Integer.parseInt(strDepth);



            // ## the crawling history
            crawlingHistory = crawlerContext.getIncrementalCrawlingHistory();
            if(crawlingHistory == null && crawlerContext.getDetectCycles() && iCurrentCrawlingDepth == 0)
            {
                // we create a temporary crawling history, which will be deleted again at the end of the crawl
                File fTmpHistory = new File(getLeechTmpDir(), UUID.randomUUID().toString().replaceAll("\\W", "_"));
                fTmpHistory.mkdirs();
                // we also create a lock file. It controls whether the temporary directory can be deleted later or not
                fosTmpLock = new FileOutputStream(new File(fTmpHistory, "lock"));
                tmpLock = fosTmpLock.getChannel().tryLock();

                crawlerContext.setIncrementalCrawlingHistoryPath(fTmpHistory.getAbsolutePath());
                crawlingHistory = crawlerContext.getIncrementalCrawlingHistory();

                bIsTmpHistory = true;
            }


            if(iCurrentCrawlingDepth == 0 && crawlingHistory != null) crawlingHistory.crawlStarted();



            // ## content and history

            boolean bProcessEntity = performHistoryStuff(crawlingHistory, metadata);

            if(bProcessEntity)
            {
                String strDataEntityModState = metadata.get(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE);
                // a wrapped parser that is no CompositeParser ends up as ClassCastException here, which is wrapped into a TikaException below
                MediaType type = m_leech.getDetector().detect(stream, metadata);
                Parser finalParser4Type = TikaUtils.getParser4Type((CompositeParser) getWrappedParser(), type, context);

                // if the entity is unmodified, we only want to go on in the case it is handled by a CrawlerParser - linked content is possibly
                // modified nevertheless
                boolean bUnmodified = IncrementalCrawlingParser.UNMODIFIED.equals(strDataEntityModState);

                if(!bUnmodified || finalParser4Type instanceof CrawlerParser)
                    getWrappedParser().parse(stream, handler, metadata, context);
                else
                    // the entity is unmodified, there is nothing to do
                    parseAsEmpty(handler, metadata, context);
            }
            else
            {
                // the entity was handled in this run yet - a cycle or a simple duplicate
                parseAsEmpty(handler, metadata, context);
            }



            // ## removed entities and finishing stuff

            if(iCurrentCrawlingDepth != 0 || crawlingHistory == null) return;



            Iterator<String> itRemovedDataEntitiesIDs = crawlingHistory.crawlFinished();
            while (itRemovedDataEntitiesIDs.hasNext() && !crawlerContext.stopRequested() && crawlerContext.getCheckForRemovedEntities())
            {
                // if the handler stays the same, this metadata object has to remain the same as possibly known by the given handler. This is the
                // case for our DataSinkContentHandler
                ContentHandler handler4RemovedData = TikaUtils.createContentHandler4SubCrawl(crawlerContext);
                // we clear the content of the metadata object: we want to keep the reference (in the case a handler also holds it), but the
                // content should be filled anew for the sub object
                TikaUtils.clearMetadata(metadata);


                metadata.set(DATA_ENTITY_MODIFICATION_STATE, REMOVED);

                String strDataEntityId2Remove = itRemovedDataEntitiesIDs.next();
                metadata.set(IncrementalCrawlingHistory.dataEntityId, strDataEntityId2Remove);

                parseAsEmpty(handler4RemovedData, metadata, context);
            }

        }
        catch (Exception e)
        {
            String strUrlOrSource = metadata.get(Metadata.SOURCE);
            if(strUrlOrSource == null) strUrlOrSource = metadata.get(Metadata.RESOURCE_NAME_KEY);
            if(strUrlOrSource == null) strUrlOrSource = metadata.get(IncrementalCrawlingHistory.dataEntityId);
            if(strUrlOrSource == null) strUrlOrSource = "no entity id known in metadata";

            if(e instanceof TikaException) throw (TikaException) e;
            throw new TikaException("Error while crawling " + strUrlOrSource, e);
        }
        finally
        {
            // every cleanup step sits in its own finally, thus a failing step can not prevent the following ones
            try
            {
                if(crawlingHistory != null && iCurrentCrawlingDepth == 0) crawlingHistory.closeLuceneStuff();
            }
            finally
            {
                try
                {
                    if(tmpLock != null) tmpLock.release();
                }
                finally
                {
                    try
                    {
                        if(fosTmpLock != null) fosTmpLock.close();
                    }
                    finally
                    {
                        // now all tmp directories that are not locked will be deleted. With this, we also remove possible leftovers of former
                        // crawls
                        if(crawlingHistory != null && iCurrentCrawlingDepth == 0 && bIsTmpHistory) deleteUnlockedTmpHistories();
                    }
                }
            }
        }

    }



    /**
     * Gives the directory where the temporary crawling histories are created.
     * 
     * @return the directory 'leechTmp' below the directory named by the system property 'java.io.tmpdir'
     */
    private static File getLeechTmpDir()
    {
        return new File(new File(System.getProperty("java.io.tmpdir")).getAbsolutePath() + "/leechTmp");
    }



    /**
     * Sends the events of an empty document to the handler, by parsing a dummy stream with {@link EmptyParser}.
     * 
     * @param handler the handler that should receive the events
     * @param metadata the metadata that will be passed to the parser
     * @param context the parse context that will be passed to the parser
     */
    private static void parseAsEmpty(ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException
    {
        InputStream dummyStream = new ByteArrayInputStream("leech sucks - hopefully :)".getBytes("UTF-8"));

        EmptyParser.INSTANCE.parse(dummyStream, handler, metadata, context);
    }



    /**
     * Deletes all temporary history directories whose lock file is not locked. This is a best effort cleanup: a directory that can not be checked or deleted is
     * skipped, it must not let the finished crawl fail.
     */
    private static void deleteUnlockedTmpHistories()
    {
        // listFiles() gives null in the case the directory doesn't exist (anymore) or can not be read
        File[] historyDirs = getLeechTmpDir().listFiles();
        if(historyDirs == null) return;

        for (File historyDir : historyDirs)
        {
            if(!historyDir.isDirectory()) continue;

            try
            {
                deleteIfUnlocked(historyDir);
            }
            catch (IOException ignored)
            {
                // e.g. the directory belongs to another user or was removed in the meantime - a later crawl will try again
            }
        }
    }



    /**
     * Deletes the files inside the given temporary history directory and the directory itself, but only in the case the lock file inside can be locked.
     * 
     * @param historyDir the temporary history directory to check and delete
     * 
     * @throws IOException if the lock file can not be opened, locked, released or closed
     */
    private static void deleteIfUnlocked(File historyDir) throws IOException
    {
        FileOutputStream fosLock = new FileOutputStream(historyDir.getAbsolutePath() + "/lock");
        try
        {
            FileLock lock;
            try
            {
                lock = fosLock.getChannel().tryLock();
            }
            catch (OverlappingFileLockException e)
            {
                // the lock is held inside this virtual machine - another crawl still uses this directory
                return;
            }

            // null: the lock is held by another program
            if(lock == null) return;

            try
            {
                // unlocked - we can delete the directory
                File[] subFiles = historyDir.listFiles();
                if(subFiles != null)
                {
                    for (File fSubFile : subFiles)
                        fSubFile.delete();
                }
                historyDir.delete();
            }
            finally
            {
                lock.release();
            }
        }
        finally
        {
            fosLock.close();
        }
    }



    /**
     * Performs the entries into the incremental crawling history and puts the data entity modification state into the metadata object. In the case this data entity
     * was processed during this crawl yet (when we have a cycle or double entry), the method will return false, which means that it doesn't have to be processed
     * again.
     * 
     * @param crawlingHistory the crawling history. Can be null, in this case the data entity will be flagged as NEW in any case
     * @param metadata the metadata of the data entity. The method will put the data entity modification state into it
     * 
     * @return true: process the data entity (it was not processed formerly in this crawl), false otherwise (it was processed during this crawl, we have a cycle)
     * 
     * @throws Exception if an access to the crawling history fails
     */
    public static boolean performHistoryStuff(IncrementalCrawlingHistory crawlingHistory, Metadata metadata) throws Exception
    {

        // without a history, everything is new
        if(crawlingHistory == null)
        {
            metadata.set(DATA_ENTITY_MODIFICATION_STATE, NEW);

            return true;
        }


        // we want to index incrementally - did we see this entity before?
        String strDataEntityId = metadata.get(IncrementalCrawlingHistory.dataEntityId);
        String strMasterDataEntityId = metadata.get(IncrementalCrawlingHistory.masterDataEntityId);

        Exist exist = crawlingHistory.exists(strDataEntityId);

        // if we processed it in this crawl yet, we do nothing at all - and also don't follow any links further. In this case we have a cycle.
        if(exist.equals(Exist.YES_PROCESSED))
        {
            metadata.set(DATA_ENTITY_MODIFICATION_STATE, PROCESSED);

            return false;
        }


        String strDataEntityContentFingerprint = metadata.get(IncrementalCrawlingHistory.dataEntityContentFingerprint);

        if(exist.equals(Exist.NOT))
        {
            metadata.set(DATA_ENTITY_MODIFICATION_STATE, NEW);

            crawlingHistory.addDataEntity(strDataEntityId, strDataEntityContentFingerprint, strMasterDataEntityId);

            return true;
        }


        // it was there before - did it change?
        if(crawlingHistory.existsWithContent(strDataEntityId, strDataEntityContentFingerprint))
        {
            // not changed - we remember that it was still part of this crawl
            metadata.set(DATA_ENTITY_MODIFICATION_STATE, UNMODIFIED);

            crawlingHistory.updateDataEntityLastCrawledTime(strDataEntityId);

            return true;
        }


        // changed
        metadata.set(DATA_ENTITY_MODIFICATION_STATE, MODIFIED);

        crawlingHistory.updateDataEntity(strDataEntityId, strDataEntityContentFingerprint, strMasterDataEntityId);

        return true;
    }

}