IncrementalCrawlingHistory.java example

Explorer
leech-master
- src
  - main
    - java
      - de
        dfki
        km
        leech
        Leech.java
        SubDataEntityContentHandler.java
        config
        CrawlerContext.java
        DirectoryCrawlerContext.java
        HtmlCrawlerContext.java
        ImapCrawlerContext.java
        LeechConfig.java
        detect
        DatasourceMediaTypes.java
        DirectoryDatasourceDetector.java
        ImapDatasourceDetector.java
        LeechDefaultDetector.java
        io
        FileURLStreamProvider.java
        HttpURLStreamProvider.java
        ImapURLStreamProvider.java
        ShiftInitInputStream.java
        URLStreamProvider.java
        lucene
        LeechDefaultFieldConfig.java
        ToLuceneContentHandler.java
        metadata
        LeechMetadata.java
        parser
        CrawlerParser.java
        DirectoryCrawlerParser.java
        HtmlCrawlerParser.java
        ImapCrawlerParser.java
        NonRecursiveCrawlerParser.java
        SambaCrawlerParser.java
        UrlListCrawlerParser.java
        filter
        RegExpPattern.java
        SubstringPattern.java
        URLFilter.java
        URLFilterPattern.java
        URLFilteringParser.java
        incremental
        IncrementalCrawlingHistory.java
        IncrementalCrawlingParser.java
        rss
        FeedParser2.java
        wikipedia
        WikipediaDumpParser.java
        sax
        CrawlReportContentHandler.java
        DataSinkContentHandler.java
        DataSinkContentHandlerAdapter.java
        DataSinkContentHandlerDecorator.java
        PrintlnContentHandler.java
        solr
        ToSolrContentHandler.java
        util
        CookieManager.java
        ExceptionUtils.java
        IndexPostprocessor.java
        LeechException.java
        LuceneIndexCreator.java
        OSUtils.java
        SolrIndexCreator.java
        TikaUtils.java
        UrlUtil.java
        ValueHolder.java
        certificates
        CertificateIgnoringSocketFactory.java
        CertificateStore.java
        Decision.java
        PersistentCertificateStore.java
        RootCertificateStore.java
        SessionCertificateStore.java
        StandardTrustManager.java
        TrustDecider.java
/*
 * Leech - crawling capabilities for Apache Tika
 * 
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
 * either version 3 of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech.parser.incremental;



import java.io.IOException;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Bits;

import de.dfki.inquisition.text.StringUtils;
import de.dfki.km.leech.config.CrawlerContext;



/**
 * A persistent history database, remarking everything that was processed during a crawl. This history makes it possible to fulfill incremental crawling, where you can
 * quickly check whether a data entity found during the crawl is new or modified with respect to the last crawl. Further, all data entities that was removed since the
 * last crawl can be determined for final synchronization.<br>
 * To check whether a file is new or modified, IncrementalCrawlingHistory needs two pieces of information: a 'data entity exists ID', which is an identifier for a data
 * entity that is independent of the content of this entity. It is only for identifying the existence, not to check whether it has changed. A 'data entity content
 * fingerprint' gives the hint whether the content of the data entity has changed. This can be e.g. the modified date of a file, or a mail header hash.<br>
 * To determine the data entities that were removed since the last crawl, IncrementalCrawlingHistory remarks the crawl starting time, and updates a 'last crawled/checked
 * time' entry for every data entity. When the crawl is finished, every data entity which 'last crawled/checked time' is before the remarked crawl starting time is
 * considered as outdated and thus as removed.<br>
 * This is an easy, intuitive, general approach that should work for almost all possible data entities. Other approaches stores e.g. parent/child relationships of data
 * entities, maintain resulting relationship lists, and infer whether an entity was deleted or not. These approaches have the advantage that you can determine, in some
 * cases, immediately by crawling a container data source whether a data entity was deleted or not, before the recursive call. Nevertheless, where this is easy in e.g.
 * file system data sources, in other scenarios as web crawlers this is much more complicated, where a link can be potentially part of several 'container websites'.<br>
 * The timestamp approach we chose is much easier and works in all scenarios under the same conditions, but has 2 disadvantages compared to the other approaches:<br>
 * <ul>
 * <li>You have to update every data entity history entry on every crawl with the new 'last crawled/checked time', even if the entity has not changed at all.</li>
 * <li>The information which data entities were removed can be determined only at the end of a crawl, for the whole history.</li>
 * </ul>
 * <br>
 * We realized this crawling history with an underlying Lucene index.
 * 
 * <br>
 * To enable incremental indexing during a crawl, create a CrawlerContext with the path to the history and pass its ParseContext to the Leech.parse(..) method:<br>
 * <code>
 * Leech leech = new Leech();<br>
 * Metadata metadata = new Metadata();<br>
 * {@link CrawlerContext} crawlerContext = new {@link CrawlerContext}().setIncrementalCrawlingHistoryPath("./history/forResourceDir");<br>
 * leech.parse(new File("resource"), new PrintlnContentHandler(metadata), crawlerContext.createParseContext());<br>
 * </code> <br>
 * Make sure that you always use the according history for a specific crawling source - this is a 1:1 relationship, you can't mix. Otherwise, everything found in the
 * new source will be considered as new, and everything from the old source as deleted.
 * 
 * @author Christian Reuschling, Dipl.Ing.(BA)
 */
public class IncrementalCrawlingHistory
{


    /**
     * Iterator over the IDs of all data entities whose 'last crawled/checked time' lies before the crawl starting time. The IDs are fetched from the index in batches;
     * every ID handed out by {@link #next()} is deleted from the history. When no outdated entry is left, the Lucene objects of the enclosing history are closed.
     */
    protected class CrawlFinishedIterator implements Iterator<String>
    {

        /** The IDs found by the last search that were not handed out yet */
        protected LinkedList<String> m_llQueuedOutdatedIDs = new LinkedList<String>();

        /** Matches all entries with a last crawled time before the crawl starting time. Null as soon as the iteration is finished. */
        protected Query m_query = null;



        /**
         * @throws IllegalStateException in the case no crawl starting time was remarked before
         */
        protected CrawlFinishedIterator() throws IOException
        {

            if(m_lCrawlStartingTime == null) throw new IllegalStateException("No crawl starting time found. Did you invoke crawlStarted?");


            m_query = LongPoint.newRangeQuery(lastCrawledTime, 0L, m_lCrawlStartingTime - 1);
        }



        /**
         * Checks whether there are still outdated entities. A new search is only performed in the case the queue filled by the last search is empty. When this
         * method returns false because nothing is left, all Lucene objects are closed - further invocations will simply return false again.
         */
        @Override
        public boolean hasNext()
        {
            try
            {

                // the iteration was finished before
                if(m_query == null) return false;

                // there is still something left from the last search
                if(!m_llQueuedOutdatedIDs.isEmpty()) return true;

                // the index is already closed (e.g. a former iterator was walked to its end) - there is nothing we can report anymore
                if(m_indexReader == null || m_indexSearcher == null)
                {
                    m_query = null;

                    return false;
                }


                // the queue is empty, so we ask the index for the next batch
                refreshIndexReaderz();
                TopDocs topDocs = m_indexSearcher.search(m_query, 5000);

                Bits liveDocs = MultiFields.getLiveDocs(m_indexReader);

                for (ScoreDoc scoreDoc : topDocs.scoreDocs)
                {
                    // skip deleted documents
                    if(liveDocs != null && !liveDocs.get(scoreDoc.doc)) continue;

                    Document doc4Queue = m_indexReader.document(scoreDoc.doc, Collections.singleton(dataEntityId));

                    m_llQueuedOutdatedIDs.add(doc4Queue.get(dataEntityId));
                }


                if(!m_llQueuedOutdatedIDs.isEmpty()) return true;


                // the queue is still empty, so we are finished. We mark this before closing, to make sure that a repeated invocation never touches the
                // closed Lucene objects again
                m_query = null;
                closeLuceneStuff();

                return false;

            }
            catch (IOException e)
            {
                Logger.getLogger(IncrementalCrawlingHistory.CrawlFinishedIterator.class.getName()).log(Level.SEVERE, "Error", e);

                return false;
            }
        }



        /**
         * Returns the next outdated data entity ID and deletes the according entry from the history. Works also without a preceding {@link #hasNext()} call.
         *
         * @return the next outdated ID, or null in the case the entry could not be deleted from the index (the error is logged, the ID stays queued)
         *
         * @throws java.util.NoSuchElementException in the case there is no outdated entity left
         */
        @Override
        public String next()
        {
            // hasNext() fills the queue if necessary. This check stays outside of the try block, otherwise the catch below would swallow the exception
            if(!hasNext()) throw new java.util.NoSuchElementException("No outdated data entity left");

            try
            {
                // first delete the entry from the index, then take the ID out of the queue and return it
                m_indexWriter.deleteDocuments(new Term(dataEntityId, m_llQueuedOutdatedIDs.getFirst()));

                return m_llQueuedOutdatedIDs.poll();

            }
            catch (Exception e)
            {
                Logger.getLogger(IncrementalCrawlingHistory.CrawlFinishedIterator.class.getName()).log(Level.SEVERE, "Error", e);
            }

            return null;
        }



        /**
         * Not supported - the entries are deleted implicitly by {@link #next()}.
         */
        @Override
        public void remove()
        {
            throw new UnsupportedOperationException();
        }


    }




    /**
     * The possible answers to the question whether a data entity is known to the history. The 'processed' distinction makes it possible to detect cycles during a
     * crawl.
     * 
     * @author Christian Reuschling, Dipl.Ing.(BA)
     */
    public enum Exist
    {
        /** The data entity has no entry inside the history at all. */
        NOT,

        /** There is an entry, and the data entity was already processed during the current crawl - normally another processing is unnecessary. */
        YES_PROCESSED,

        /** There is an entry, but the data entity was not processed during the current crawl so far. */
        YES_UNPROCESSED
    }


    /** Name of the index field that holds the content fingerprint of a data entity */
    static public final String dataEntityContentFingerprint = "dataEntityContentFingerprint";

    /** Name of the index field that holds the ID of a data entity */
    static public final String dataEntityId = "dataEntityId";

    /** Name of the optional index field that holds the ID of the master data entity of an entry */
    static public final String masterDataEntityId = "masterDataEntityId";

    /** Name of the index field that holds the 'last crawled/checked time' - written both as searchable point value and as stored value */
    static public final String lastCrawledTime = "lastCrawledTime";

    /** Reader on the history index, opened from {@link #m_indexWriter}. Null as long as the Lucene objects are not opened */
    protected DirectoryReader m_indexReader = null;

    /** Searcher working on {@link #m_indexReader}. Null as long as the Lucene objects are not opened */
    protected IndexSearcher m_indexSearcher = null;

    /** Writer on the history index. Null as long as the Lucene objects are not opened */
    protected IndexWriter m_indexWriter = null;



    /** The time {@link #crawlStarted()} was invoked, in milliseconds. Null as long as no crawl was started */
    protected Long m_lCrawlStartingTime = null;



    /** The file system path of the history index */
    protected final String m_strHistoryPath;



    /**
     * Creates a history object for the given path. The underlying index is not opened here. A JVM shutdown hook is registered that closes the Lucene objects of
     * this history.
     * 
     * @param strHistoryPath the file system path of the history index
     */
    public IncrementalCrawlingHistory(String strHistoryPath)
    {
        m_strHistoryPath = strHistoryPath;


        Runnable closeOnShutdown = new Runnable()
        {
            @Override
            public void run()
            {
                try
                {
                    closeLuceneStuff();
                }
                catch (IOException e)
                {
                    Logger.getLogger(IncrementalCrawlingHistory.class.getName()).log(Level.SEVERE, "Error", e);
                }
            }
        };

        Thread shutdownHook = new Thread(closeOnShutdown, "IncrementalCrawlingHistory shutdown hook for " + strHistoryPath);

        Runtime.getRuntime().addShutdownHook(shutdownHook);
    }



    /**
     * Remarks a new data entity without a master entity, together with the current time as 'last crawled/checked time'. Convenience variant of
     * {@link #addDataEntity(String, String, String)}.
     * 
     * @param strDataEntityId an identifier for a data entity that is independent of its content. It only identifies the occurrence, e.g. a filename
     * @param strDataEntityContentFingerprint a fingerprint that gives the hint whether the content of the data entity has changed, e.g. the modified date of a file
     * 
     * @throws IOException
     * @throws CorruptIndexException
     */
    public void addDataEntity(String strDataEntityId, String strDataEntityContentFingerprint) throws CorruptIndexException, IOException
    {
        final String strNoMasterDataEntityId = null;

        addDataEntity(strDataEntityId, strDataEntityContentFingerprint, strNoMasterDataEntityId);
    }



    /**
     * Remarks a new data entity, together with the current time as 'last crawled/checked time'. A possibly existing entry with the same ID is not removed - use
     * {@link #updateDataEntity(String, String, String)} for this.
     * 
     * @param strDataEntityId an identifier for a data entity that is independent of the content of this entity. It is only for identifying the occurrence, not to
     *            check whether it has changed (e.g. a filename)
     * @param strDataEntityContentFingerprint some fingerprint/identifier that gives the hint whether the content of the data entity has changed, e.g. the modified
     *            date of a file
     * @param strMasterDataEntityId optional: an EntityId of another data entity that is our 'master', which means that when the master is updated with
     *            {@link #updateDataEntityLastCrawledTime(String)}, all associated slaves will also be updated. This is e.g. for the case when you are in a second run
     *            for RSS-File indexing, and leech recognizes that this file didn't change. Now we don't want to go unnecessarily into the file and mark each entry
     *            on its own. We know no subentry has changed, and can immediately mark them as processed with {@link #updateDataEntityLastCrawledTime(String)} on
     *            the master dataEntityId, which is the one from the RSS file. Leave it null or empty in the case you don't need to use it.
     * 
     * @throws IOException
     * @throws CorruptIndexException
     */
    public void addDataEntity(String strDataEntityId, String strDataEntityContentFingerprint, String strMasterDataEntityId) throws CorruptIndexException,
            IOException
    {

        // the time is taken only once, so the searchable point value and the stored value of an entry can never differ
        long lCurrentTime = System.currentTimeMillis();

        Document doc = new Document();

        doc.add(new StringField(dataEntityId, strDataEntityId, Store.YES));
        doc.add(new StringField(dataEntityContentFingerprint, strDataEntityContentFingerprint, Store.YES));
        // the point field is for the range search on outdated entries, the stored field for reading the value back
        doc.add(new LongPoint(lastCrawledTime, lCurrentTime));
        doc.add(new StoredField(lastCrawledTime, lCurrentTime));
        if(!StringUtils.nullOrWhitespace(strMasterDataEntityId)) doc.add(new StringField(masterDataEntityId, strMasterDataEntityId, Store.YES));

        m_indexWriter.addDocument(doc);

    }



    /**
     * Closes the reader and the writer of the underlying index if they are open. The writer commits all pending changes before it is closed. The writer is
     * committed and closed even in the case closing the reader fails.
     * 
     * @throws IOException in the case closing the reader, or committing/closing the writer fails
     */
    public void closeLuceneStuff() throws IOException
    {

        // the searcher only wraps the reader, there is nothing to close
        m_indexSearcher = null;

        try
        {
            if(m_indexReader != null) m_indexReader.close();
        }
        finally
        {
            m_indexReader = null;

            // this must also happen when the reader could not be closed, otherwise the writer would stay open with uncommitted changes
            if(m_indexWriter != null)
            {
                m_indexWriter.commit();
                m_indexWriter.close();
                m_indexWriter = null;
            }
        }

    }



    /**
     * Returns an iterator over all DataEntityIds with a 'last crawled/checked time' before the 'crawl starting time'. These entities were not seen during this crawl
     * anymore, and thus can be considered as removed.<br>
     * While iterating, the returned entries are deleted from the history, so the iterator is meant to be walked exactly once. Working with an iterator ensures that
     * also huge lists of deleted entities can be handled without problematic memory consumption.<br>
     * Remark: The writer and reader instance for the underlying lucene index will be closed when you walk the iterator to the end, all data will be committed before.
     * 
     * @return the iterator over all outdated DataEntityIds, or null in the case the iterator could not be created because of an IOException (the error is logged)
     * 
     * @throws IllegalStateException in the case {@link #crawlStarted()} was not invoked before
     */
    public Iterator<String> crawlFinished()
    {

        Iterator<String> itOutdatedIds = null;

        try
        {
            itOutdatedIds = new CrawlFinishedIterator();
        }
        catch (IOException e)
        {
            Logger.getLogger(IncrementalCrawlingHistory.class.getName()).log(Level.SEVERE, "Error", e);
        }

        return itOutdatedIds;

    }



    /**
     * Tells the history that a new crawl begins. The Lucene objects are opened if necessary, afterwards the current time is remarked as 'crawl starting time'.
     * 
     * @throws IOException
     * @throws LockObtainFailedException
     * @throws CorruptIndexException
     */
    public void crawlStarted() throws CorruptIndexException, LockObtainFailedException, IOException
    {

        openLuceneStuff();

        // the crawl starting time is needed later in crawlFinished(), to determine the outdated entities
        long lNow = System.currentTimeMillis();

        m_lCrawlStartingTime = Long.valueOf(lNow);

    }




    /**
     * Checks whether an ID exists inside the incremental crawling history or not. During the crawl, this is to identify quickly whether a data entity is completely new
     * or not.
     * 
     * @param strDataEntityId an identifier for a data entity that is independent of the content of this entity. It is only for identifying the occurrence, not to
     *            check whether it has changed (e.g. a filename)
     * 
     * @return Exist.NOT in the case the data entity has no entry inside the history at all. Exist.YES_PROCESSED in the case the 'last crawled/checked time' of the
     *         entry is not before the crawl starting time, which means it was processed in this run already - this is to detect cycles. Exist.YES_UNPROCESSED
     *         otherwise.
     * 
     * @throws IOException
     * @throws IllegalStateException in the case an entry was found, but no crawl starting time was remarked with {@link #crawlStarted()}
     */
    public Exist exists(String strDataEntityId) throws IOException
    {

        Long lDataEntityLastCrawledTime = getDataEntityLastCrawledTime(strDataEntityId);


        if(lDataEntityLastCrawledTime == null) return Exist.NOT;

        // without this check, unboxing the missing starting time would end in a NullPointerException without any hint
        if(m_lCrawlStartingTime == null) throw new IllegalStateException("No crawl starting time found. Did you invoke crawlStarted?");

        if(lDataEntityLastCrawledTime >= m_lCrawlStartingTime) return Exist.YES_PROCESSED;

        return Exist.YES_UNPROCESSED;
    }



    /**
     * Checks whether the history contains an entry with the given ID that also has the given content fingerprint. During the crawl, this tells quickly whether a
     * data entity has changed its content. This only makes sense if the fingerprint can be created cheaply, at best without extracting the content - e.g. the
     * modified date of a file, or the time attribute of an email.
     * 
     * @param strDataEntityId an identifier for a data entity that is independent of its content (e.g. a filename)
     * @param strDataEntityContentFingerprint a fingerprint that gives the hint whether the content of the data entity has changed
     * 
     * @return true in the case an entry with this ID and exactly this content fingerprint exists. False otherwise, also for a null or whitespace-only ID
     * 
     * @throws IOException
     */
    public boolean existsWithContent(String strDataEntityId, String strDataEntityContentFingerprint) throws IOException
    {
        if(StringUtils.nullOrWhitespace(strDataEntityId)) return false;

        // both terms have to match
        TermQuery idQuery = new TermQuery(new Term(dataEntityId, strDataEntityId));
        TermQuery fingerprintQuery = new TermQuery(new Term(dataEntityContentFingerprint, strDataEntityContentFingerprint));

        BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder();
        queryBuilder.add(idQuery, Occur.MUST);
        queryBuilder.add(fingerprintQuery, Occur.MUST);
        BooleanQuery query = queryBuilder.build();

        // we only need to know whether there is a hit at all
        TotalHitCountCollector hitCounter = new TotalHitCountCollector();

        refreshIndexReaderz();
        m_indexSearcher.search(query, hitCounter);

        return hitCounter.getTotalHits() > 0;
    }



    /**
     * Looks up the content fingerprint that is stored for a data entity.
     * 
     * @param strDataEntityId an identifier for a data entity that is independent of its content (e.g. a filename)
     * 
     * @return the content fingerprint stored for this data entity, null in the case the ID is null/whitespace-only or no entry was found
     * 
     * @throws IOException
     */
    public String getDataEntityContentFingerprint(String strDataEntityId) throws IOException
    {
        if(StringUtils.nullOrWhitespace(strDataEntityId)) return null;

        Term idTerm = new Term(dataEntityId, strDataEntityId);

        refreshIndexReaderz();
        TopDocs hits = m_indexSearcher.search(new TermQuery(idTerm), 1);

        if(hits.totalHits == 0) return null;

        // only the fingerprint field is loaded from the first hit
        int iDocNumber = hits.scoreDocs[0].doc;
        Document entry = m_indexReader.document(iDocNumber, Collections.singleton(dataEntityContentFingerprint));

        return entry.get(dataEntityContentFingerprint);
    }



    /**
     * Looks up the 'last crawled/checked time' that is stored for a data entity. This can be used e.g. to determine whether a data entity was already processed
     * during the current crawl, which is a hint for a cycle.
     * 
     * @param strDataEntityId an identifier for a data entity that is independent of its content (e.g. a filename)
     * 
     * @return the last crawled time stored for this data entity, null in the case the ID is null/whitespace-only or no entry was found
     * 
     * @throws IOException
     */
    public Long getDataEntityLastCrawledTime(String strDataEntityId) throws IOException
    {
        if(StringUtils.nullOrWhitespace(strDataEntityId)) return null;

        Term idTerm = new Term(dataEntityId, strDataEntityId);

        refreshIndexReaderz();
        TopDocs hits = m_indexSearcher.search(new TermQuery(idTerm), 1);

        if(hits.totalHits == 0) return null;

        // only the time field is loaded from the first hit, its value is read as String and parsed
        int iDocNumber = hits.scoreDocs[0].doc;
        Document entry = m_indexReader.document(iDocNumber, Collections.singleton(lastCrawledTime));
        String strLastCrawledTime = entry.get(lastCrawledTime);

        return Long.valueOf(strLastCrawledTime);
    }



    /**
     * Returns the file system path this history was created with.
     * 
     * @return the path to this history, as passed to the constructor
     */
    public String getHistoryPath()
    {
        return this.m_strHistoryPath;
    }




    /**
     * Creates the writer, the reader and the searcher on the history index - each of them only in the case it does not exist yet. The index is created if it does not
     * exist, otherwise new entries are appended. The reader is opened from the writer.
     * 
     * @throws CorruptIndexException
     * @throws LockObtainFailedException
     * @throws IOException
     */
    public void openLuceneStuff() throws CorruptIndexException, LockObtainFailedException, IOException
    {
        if(m_indexWriter == null)
        {
            // all our fields are identifiers that must not be split, thus the KeywordAnalyzer
            IndexWriterConfig writerConfig = new IndexWriterConfig(new KeywordAnalyzer());
            writerConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);

            SimpleFSDirectory historyDirectory = new SimpleFSDirectory(Paths.get(m_strHistoryPath));

            m_indexWriter = new IndexWriter(historyDirectory, writerConfig);
        }

        if(m_indexReader == null)
        {
            m_indexReader = DirectoryReader.open(m_indexWriter, true, true);
        }

        if(m_indexSearcher == null)
        {
            m_indexSearcher = new IndexSearcher(m_indexReader);
        }
    }



    /**
     * Makes sure that the reader and the searcher see the current state of the index. In the case a newer reader is available, the old reader is closed and replaced,
     * and a new searcher is created on the new reader. An IOException is logged and not passed to the caller.
     */
    protected void refreshIndexReaderz()
    {
        try
        {
            DirectoryReader changedReader = DirectoryReader.openIfChanged(m_indexReader);

            // no new reader - we keep the current one
            if(changedReader == null) return;

            m_indexReader.close();
            m_indexReader = changedReader;
            m_indexSearcher = new IndexSearcher(changedReader);

        }
        catch (IOException e)
        {
            Logger.getLogger(IncrementalCrawlingHistory.class.getName()).log(Level.SEVERE, "Error", e);
        }
    }




    /**
     * Replaces the entry of a data entity without a master entity - same as addDataEntity, but a former entry with the same ID is removed before the new one is
     * stored. Convenience variant of {@link #updateDataEntity(String, String, String)}.
     * 
     * @param strDataEntityId an identifier for a data entity that is independent of its content. It only identifies the occurrence, e.g. a filename
     * @param strDataEntityContentFingerprint a fingerprint that gives the hint whether the content of the data entity has changed, e.g. the modified date of a file
     * 
     * @throws IOException
     * @throws CorruptIndexException
     */
    public void updateDataEntity(String strDataEntityId, String strDataEntityContentFingerprint) throws CorruptIndexException, IOException
    {
        final String strNoMasterDataEntityId = null;

        updateDataEntity(strDataEntityId, strDataEntityContentFingerprint, strNoMasterDataEntityId);
    }



    /**
     * Updates a whole data entity - same as addDataEntity, but removes a former entry with the same ID before storing the new one. The current time is remarked as
     * 'last crawled/checked time'.
     * 
     * @param strDataEntityId an identifier for a data entity that is independent of the content of this entity. It is only for identifying the occurrence, not to
     *            check whether it has changed (e.g. a filename)
     * @param strDataEntityContentFingerprint some fingerprint/identifier that gives the hint whether the content of the data entity has changed, e.g. the modified
     *            date of a file
     * @param strMasterDataEntityId optional: an EntityId of another data entity that is our 'master', which means that when the master is updated with
     *            {@link #updateDataEntityLastCrawledTime(String)}, all associated slaves will also be updated. This is e.g. for the case when you are in a second run
     *            for RSS-File indexing, and leech recognizes that this file didn't change. Now we don't want to go unnecessarily into the file and mark each entry
     *            on its own. We know no subentry has changed, and can immediately mark them as processed with {@link #updateDataEntityLastCrawledTime(String)} on
     *            the master dataEntityId, which is the one from the RSS file. Leave it null or empty in the case you don't need to use it.
     * 
     * @throws IOException
     * @throws CorruptIndexException
     */
    public void updateDataEntity(String strDataEntityId, String strDataEntityContentFingerprint, String strMasterDataEntityId) throws CorruptIndexException,
            IOException
    {

        Term termId = new Term(dataEntityId, strDataEntityId);

        // the time is taken only once, so the searchable point value and the stored value of an entry can never differ
        long lCurrentTime = System.currentTimeMillis();


        Document doc = new Document();

        doc.add(new StringField(dataEntityId, strDataEntityId, Store.YES));
        doc.add(new StringField(dataEntityContentFingerprint, strDataEntityContentFingerprint, Store.YES));
        // the point field is for the range search on outdated entries, the stored field for reading the value back
        doc.add(new LongPoint(lastCrawledTime, lCurrentTime));
        doc.add(new StoredField(lastCrawledTime, lCurrentTime));
        if(!StringUtils.nullOrWhitespace(strMasterDataEntityId))
            doc.add(new StringField(masterDataEntityId, strMasterDataEntityId, Store.YES));


        // deletes all entries with this ID and adds the new one
        m_indexWriter.updateDocument(termId, doc);

    }



    /**
     * Sets the 'last crawled/checked time' entry of a data entity to the current time. In the case this data entity is a master entity, all slave entries are
     * updated to the same time as well. An entry becomes a slave by specifying its master with {@link #addDataEntity(String, String, String)} or
     * {@link #updateDataEntity(String, String, String)}.
     * 
     * @param strDataEntityId the data entity which is finally checked/crawled
     * 
     * @throws IOException
     * @throws CorruptIndexException
     * @throws IllegalStateException in the case there is no entry with the given ID
     */
    public void updateDataEntityLastCrawledTime(String strDataEntityId) throws CorruptIndexException, IOException
    {

        Term termId = new Term(dataEntityId, strDataEntityId);

        refreshIndexReaderz();
        TopDocs topDocs = m_indexSearcher.search(new TermQuery(termId), 1);

        if(topDocs.totalHits == 0) throw new IllegalStateException("there has to be an data entry with Id " + strDataEntityId + " for updating. Nothing was found.");


        long lCurrentTime = System.currentTimeMillis();

        // we load the whole entry, exchange the time fields, and write it back
        Document doc = m_indexReader.document(topDocs.scoreDocs[0].doc);

        doc.removeFields(lastCrawledTime);
        doc.add(new LongPoint(lastCrawledTime, lCurrentTime));
        doc.add(new StoredField(lastCrawledTime, lCurrentTime));

        m_indexWriter.updateDocument(termId, doc);




        // in the case the entity is a master entity, all associated slaves have to be updated as well

        Term termMasterId = new Term(masterDataEntityId, strDataEntityId);

        TopDocs topSlaveDocs = m_indexSearcher.search(new TermQuery(termMasterId), Integer.MAX_VALUE);

        for (ScoreDoc slaveScoreDoc : topSlaveDocs.scoreDocs)
        {

            Document slaveDoc = m_indexReader.document(slaveScoreDoc.doc);

            slaveDoc.removeFields(lastCrawledTime);
            slaveDoc.add(new LongPoint(lastCrawledTime, lCurrentTime));
            slaveDoc.add(new StoredField(lastCrawledTime, lCurrentTime));

            // each slave is replaced under its OWN ID with its OWN refreshed document. Using the master term here would delete all slaves of the master at
            // once, and writing the master document would store a copy of the master instead of the slave
            Term termSlaveId = new Term(dataEntityId, slaveDoc.get(dataEntityId));

            m_indexWriter.updateDocument(termSlaveId, slaveDoc);
        }

    }






}