ToLuceneContentHandler.java example

Explorer
leech-master
- src
  - main
    - java
      - de
        dfki
        km
        leech
        Leech.java
        SubDataEntityContentHandler.java
        config
        CrawlerContext.java
        DirectoryCrawlerContext.java
        HtmlCrawlerContext.java
        ImapCrawlerContext.java
        LeechConfig.java
        detect
        DatasourceMediaTypes.java
        DirectoryDatasourceDetector.java
        ImapDatasourceDetector.java
        LeechDefaultDetector.java
        io
        FileURLStreamProvider.java
        HttpURLStreamProvider.java
        ImapURLStreamProvider.java
        ShiftInitInputStream.java
        URLStreamProvider.java
        lucene
        LeechDefaultFieldConfig.java
        ToLuceneContentHandler.java
        metadata
        LeechMetadata.java
        parser
        CrawlerParser.java
        DirectoryCrawlerParser.java
        HtmlCrawlerParser.java
        ImapCrawlerParser.java
        NonRecursiveCrawlerParser.java
        SambaCrawlerParser.java
        UrlListCrawlerParser.java
        filter
        RegExpPattern.java
        SubstringPattern.java
        URLFilter.java
        URLFilterPattern.java
        URLFilteringParser.java
        incremental
        IncrementalCrawlingHistory.java
        IncrementalCrawlingParser.java
        rss
        FeedParser2.java
        wikipedia
        WikipediaDumpParser.java
        sax
        CrawlReportContentHandler.java
        DataSinkContentHandler.java
        DataSinkContentHandlerAdapter.java
        DataSinkContentHandlerDecorator.java
        PrintlnContentHandler.java
        solr
        ToSolrContentHandler.java
        util
        CookieManager.java
        ExceptionUtils.java
        IndexPostprocessor.java
        LeechException.java
        LuceneIndexCreator.java
        OSUtils.java
        SolrIndexCreator.java
        TikaUtils.java
        UrlUtil.java
        ValueHolder.java
        certificates
        CertificateIgnoringSocketFactory.java
        CertificateStore.java
        Decision.java
        PersistentCertificateStore.java
        RootCertificateStore.java
        SessionCertificateStore.java
        StandardTrustManager.java
        TrustDecider.java
/*
 * Leech - crawling capabilities for Apache Tika
 * 
 * Copyright (C) 2012 DFKI GmbH, Author: Christian Reuschling
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation,
 * either version 3 of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 * PARTICULAR PURPOSE. See the GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>.
 * 
 * Contact us by mail: christian.reuschling@dfki.de
 */

package de.dfki.km.leech.lucene;



import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.rmi.server.UID;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.UUID;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.metadata.Metadata;

import de.dfki.inquisition.collections.MultiValueHashMap;
import de.dfki.inquisition.file.FileUtils;
import de.dfki.inquisition.lucene.FieldConfig;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.metadata.LeechMetadata;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.sax.DataSinkContentHandler;



/**
 * This is a content handler that stores crawled data in a Lucene index. You can configure the field types and the analyzers that should be used.
 * Further, block indexing with {@link IndexWriter#addDocuments(java.util.Collection, Analyzer)} is supported; you can enable it with
 * {@link ToLuceneContentHandler#setBlockIndexing(boolean)}. If it is enabled, {@link ToLuceneContentHandler} checks whether the metadata contains a
 * {@link LeechMetadata#childId} or a {@link LeechMetadata#parentId} key. Documents with a {@link LeechMetadata#childId} entry are treated as parent documents, documents
 * with a {@link LeechMetadata#parentId} entry as children. {@link ToLuceneContentHandler} collects the child documents as they arrive at a processXXX method and writes
 * them as one block as soon as a succeeding parent document arrives. If a non-parent document arrives instead, all collected documents are indexed normally, not as a
 * block.
 * 
 * @author Christian Reuschling, Dipl.Ing.(BA)
 * 
 */
public class ToLuceneContentHandler extends DataSinkContentHandler
{


    /**
     * Worker that drains {@link #m_addDocsQueue}: every dequeued list is written into the index writer returned by {@code getCurrentWriter()} - a single document with
     * {@code addDocument}, several documents as one block with {@code addDocuments}. The loop ends when an {@link InterruptThreadList} marker is dequeued or the thread
     * is interrupted. In every case the worker finally waits on {@link #m_cyclicBarrier4DocConsumerThreads}.
     */
    protected class DocConsumer implements Runnable
    {


        @Override
        public void run()
        {
            Logger logger = Logger.getLogger(ToLuceneContentHandler.DocConsumer.class.getName());

            try
            {
                while (true)
                {
                    List<Document> llDocs = m_addDocsQueue.take();

                    // the marker list signals that this worker should terminate
                    if(llDocs instanceof InterruptThreadList)
                    {
                        break;
                    }

                    try
                    {
                        if(llDocs.size() == 1)
                        {
                            getCurrentWriter().addDocument(llDocs.get(0));
                        }
                        else if(llDocs.size() > 1)
                        {
                            getCurrentWriter().addDocuments(llDocs);
                        }
                    }
                    catch (Exception e)
                    {
                        // a failing document must not stop the worker - but the cause is logged together with the message, otherwise the failure can
                        // not be diagnosed
                        logger.log(Level.WARNING,
                                "Error during writing a document to the index (lucene exception while addDocument) - will ignore it. This is a hint to a lucene bug."
                                        + llDocs, e);
                    }

                }
            }
            catch (InterruptedException e)
            {
                // NOP - the interrupt flag is deliberately not restored here: CyclicBarrier.await() in the finally block below would otherwise fail
                // immediately and break the barrier for all other waiting threads
            }
            catch (Exception e)
            {
                logger.log(Level.SEVERE, "Error", e);
            }
            finally
            {
                try
                {
                    m_cyclicBarrier4DocConsumerThreads.await();
                }
                catch (Exception e2)
                {
                    logger.log(Level.SEVERE, "Error", e2);
                }
            }


        }
    }




    /**
     * Marker list type used as termination signal for the consumer threads: {@link DocConsumer#run()} leaves its loop when it takes an instance of this class from the
     * queue, and {@link #crawlFinished()} enqueues one instance per consumer thread.
     */
    protected class InterruptThreadList extends LinkedList<Document>
    {
        // declared explicitly because the LinkedList superclass is serializable
        private static final long serialVersionUID = 196832081918659203L;
    }



    /**
     * Bounded hand-over queue (capacity 23) between the processNewXXX methods, which put document lists into it, and the {@link DocConsumer} threads, which take them out
     * and write them into the index.
     */
    protected final BlockingQueue<List<Document>> m_addDocsQueue = new LinkedBlockingQueue<List<Document>>(23);



    /** Whether block indexing is enabled, see {@link #setBlockIndexing(boolean)}. Enabled by default. */
    protected boolean m_bBlockIndexing = true;



    /**
     * Barrier on which every {@link DocConsumer} waits when its run loop has ended, and on which {@link #crawlFinished()} waits after enqueuing the termination markers.
     * It is not created in this part of the file - presumably it is set up together with the consumer threads (TODO confirm).
     */
    protected CyclicBarrier m_cyclicBarrier4DocConsumerThreads;







    /** The field configuration used to create the Lucene fields. The constructors replace this default instance with the given one. */
    protected FieldConfig m_fieldConfig = new FieldConfig();



    /** The field names / metadata keys that will not be stored in the index, see {@link #setFieldNames2Ignore(HashSet)}. */
    protected HashSet<String> m_hsAttNamesNot2Store = new HashSet<String>();





    /** Field name to regular expression constraints, see {@link #setIgnoreAllDocsWithout(Map)}. Null or empty means that every document is accepted. */
    protected Map<String, String> m_hsFieldName2FieldValueConstraint;

    /** The field copy map (source field name to target field names), see {@link #setFieldCopyMap(MultiValueHashMap)}. */
    protected MultiValueHashMap<String, String> m_hsSource2TargetFieldnames = new MultiValueHashMap<String, String>();

    /** Attribute value pairs that are added to every created document, see {@link #setStaticAttributeValuePairs(MultiValueHashMap)}. */
    protected MultiValueHashMap<String, String> m_hsStaticAttValuePairs = new MultiValueHashMap<String, String>();


    /** The field aggregation map (target field name to prioritized source field names), see {@link #setFieldAggregationMap(MultiValueHashMap)}. */
    protected MultiValueHashMap<String, String> m_hsTarget2SourcesFieldnames = new MultiValueHashMap<String, String>();



    /**
     * Paths of the temporary indices that {@link #crawlFinished()} merges into the initial index and deletes afterwards. The set is filled outside this part of the file
     * - presumably when a new split index is created (TODO confirm).
     */
    protected HashSet<String> m_hsTmpLuceneWriterPaths2Merge = new HashSet<String>();


    /** The writer given at construction time. Target of the final merge and of all update and delete operations. */
    protected IndexWriter m_initialLuceneWriter;


    /** The SplitAndMerge threshold, see {@link #setSplitAndMergeIndex(int)}. The default -1 disables SplitAndMerge. */
    protected int m_iSplitIndexDocumentCount = -1;



    /** The consumer threads. {@link #crawlFinished()} enqueues one termination marker per entry and clears the list afterwards. */
    protected LinkedList<Thread> m_llConsumerThreads = new LinkedList<Thread>();




    /** Writers that {@link #crawlFinished()} closes before the merge. The list is filled outside this part of the file. */
    protected LinkedList<IndexWriter> m_llIndexWriter2Close = new LinkedList<IndexWriter>();



    /** Child documents buffered during block indexing until the next parent document arrives. */
    protected LinkedList<Document> m_llLastChildDocuments = new LinkedList<Document>();



    /** The writer currently in use. Initially the given writer, reassigned from {@code getCurrentWriter()} by the processNewXXX methods. */
    protected IndexWriter m_luceneWriter;



    /**
     * Creates a handler that writes into the given index writer, using the default superclass constructor.
     * 
     * @param fieldConfig the field configuration used to create the Lucene fields
     * @param luceneWriter the index writer to write into. It is also remembered as the initial writer
     * 
     * @throws Exception as declared - propagated from the initialization calls
     */
    public ToLuceneContentHandler(FieldConfig fieldConfig, IndexWriter luceneWriter) throws Exception
    {
        super();

        this.m_fieldConfig = fieldConfig;
        this.m_initialLuceneWriter = luceneWriter;
        this.m_luceneWriter = luceneWriter;

        init();
    }



    /**
     * Creates a handler that writes into the given index writer.
     * 
     * @param writeLimit passed unchanged to the superclass constructor
     * @param fieldConfig the field configuration used to create the Lucene fields
     * @param luceneWriter the index writer to write into. It is also remembered as the initial writer
     * 
     * @throws Exception as declared - propagated from the initialization calls
     */
    public ToLuceneContentHandler(int writeLimit, FieldConfig fieldConfig, IndexWriter luceneWriter) throws Exception
    {
        super(writeLimit);

        this.m_fieldConfig = fieldConfig;
        this.m_initialLuceneWriter = luceneWriter;
        this.m_luceneWriter = luceneWriter;

        init();
    }



    /**
     * Creates a handler that writes into the given index writer.
     * 
     * @param metadata passed unchanged to the superclass constructor
     * @param fieldConfig the field configuration used to create the Lucene fields
     * @param luceneWriter the index writer to write into. It is also remembered as the initial writer
     * 
     * @throws Exception as declared - propagated from the initialization calls
     */
    public ToLuceneContentHandler(Metadata metadata, FieldConfig fieldConfig, IndexWriter luceneWriter) throws Exception
    {
        super(metadata);

        this.m_fieldConfig = fieldConfig;
        this.m_initialLuceneWriter = luceneWriter;
        this.m_luceneWriter = luceneWriter;

        init();
    }



    /**
     * Creates a handler that writes into the given index writer.
     * 
     * @param metadata passed unchanged to the superclass constructor
     * @param writeLimit passed unchanged to the superclass constructor
     * @param fieldConfig the field configuration used to create the Lucene fields
     * @param luceneWriter the index writer to write into. It is also remembered as the initial writer
     * 
     * @throws Exception as declared - propagated from the initialization calls
     */
    public ToLuceneContentHandler(Metadata metadata, int writeLimit, FieldConfig fieldConfig, IndexWriter luceneWriter) throws Exception
    {
        super(metadata, writeLimit);

        this.m_fieldConfig = fieldConfig;
        this.m_initialLuceneWriter = luceneWriter;
        this.m_luceneWriter = luceneWriter;

        init();
    }



    /**
     * Finishes the indexing work of this handler. This method has to be invoked at the end of every crawl, not only if SplitAndMerge is enabled:
     * <ol>
     * <li>Child documents that are still buffered for block indexing (no parent document followed them) are handed to the consumer threads as single documents, so
     * they don't get lost.</li>
     * <li>One termination marker per consumer thread is enqueued, then this method waits on the barrier until the consumer threads have finished.</li>
     * <li>Only if SplitAndMerge is enabled: the temporary index writers are closed, all temporary indices are merged into the initial indexWriter index and committed,
     * and the temporary index directories are deleted.</li>
     * </ol>
     * Errors are logged and not propagated. If the waiting thread gets interrupted, its interrupt flag is restored.
     */
    @Override
    public void crawlFinished()
    {
        Logger logger = Logger.getLogger(ToLuceneContentHandler.class.getName());

        // the directories we open for merging - they are released in the finally block, whatever happens
        LinkedList<Directory> llIndicesDirs2Merge = new LinkedList<Directory>();

        try
        {

            // child docs still waiting for their parent would be dropped silently - we write them as single documents, as it is done for orphans
            // during the crawl. Only if consumers exist, otherwise nobody would drain the bounded queue
            if(!m_llConsumerThreads.isEmpty())
            {
                for (Document pendingChildDoc : m_llLastChildDocuments)
                    m_addDocsQueue.put(Collections.singletonList(pendingChildDoc));

                m_llLastChildDocuments.clear();
            }

            for (int i = 0; i < m_llConsumerThreads.size(); i++)
                m_addDocsQueue.put(new InterruptThreadList());

            m_cyclicBarrier4DocConsumerThreads.await();

            m_llConsumerThreads.clear();

            if(getSplitAndMergeIndex() <= 0) return;

            // now we merge all temporary indices into the original one

            // the temporary writers still have to be closed - we do this now. The last one is not yet part of the list
            if(m_luceneWriter != m_initialLuceneWriter)
            {
                for (IndexWriter writer2close : m_llIndexWriter2Close)
                    writer2close.close();
                m_luceneWriter.close();
            }

            for (String strTmpPath : m_hsTmpLuceneWriterPaths2Merge)
                llIndicesDirs2Merge.add(new SimpleFSDirectory(Paths.get(strTmpPath)));

            if(llIndicesDirs2Merge.size() == 0) return;

            logger.info("Will merge " + llIndicesDirs2Merge.size() + " temporary indices to the final one.");


            m_initialLuceneWriter.addIndexes(llIndicesDirs2Merge.toArray(new Directory[0]));

            m_initialLuceneWriter.commit();

            // release the merged directories before their files are deleted
            while (!llIndicesDirs2Merge.isEmpty())
                llIndicesDirs2Merge.removeFirst().close();

            for (String strTmpPath : m_hsTmpLuceneWriterPaths2Merge)
                FileUtils.deleteDirectory(new File(strTmpPath));


        }
        catch (InterruptedException e)
        {
            // keep the interruption visible for the caller
            Thread.currentThread().interrupt();
            logger.log(Level.SEVERE, "Error", e);
        }
        catch (Exception e)
        {
            logger.log(Level.SEVERE, "Error", e);
        }
        finally
        {
            // only non-empty if something went wrong before the regular close above
            for (Directory dir2close : llIndicesDirs2Merge)
            {
                try
                {
                    dir2close.close();
                }
                catch (IOException e)
                {
                    logger.log(Level.WARNING, "Could not close temporary index directory " + dir2close, e);
                }
            }
        }

    }



    /**
     * Gets whether block indexing is enabled, see {@link #setBlockIndexing(boolean)}.
     * 
     * @return true if block indexing is enabled, false otherwise
     */
    public boolean getBlockIndexing()
    {
        return this.m_bBlockIndexing;
    }



    /**
     * Gets the field aggregation map. An entry generates a new field (the key) whose value is copied from another, existing metadata entry. The values of an entry are
     * the candidate source attributes in order of priority - the first one that has a value wins and appears as the new attribute.
     * 
     * @return the current field aggregation map
     */
    public MultiValueHashMap<String, String> getFieldAggregationMap()
    {
        return this.m_hsTarget2SourcesFieldnames;
    }



    /**
     * Gets the field configuration that is used to create the Lucene fields.
     * 
     * @return the field config
     */
    public FieldConfig getFieldConfig()
    {
        return this.m_fieldConfig;
    }



    /**
     * Gets the field copy mappings. The content of every metadata key that is a key of this map is additionally copied into the fields named by the corresponding
     * values. To rename an attribute, specify a mapping for it and ignore the source field name with {@link #setFieldNames2Ignore(HashSet)}.
     * 
     * @return the current field mappings
     */
    public MultiValueHashMap<String, String> getFieldCopyMap()
    {
        return this.m_hsSource2TargetFieldnames;
    }



    /**
     * Gets the field names / metadata keys that are excluded from the lucene index.
     * 
     * @return the set of field names / metadata keys that will NOT be stored into the lucene index
     */
    public HashSet<String> getFields2Ignore()
    {
        return this.m_hsAttNamesNot2Store;
    }



    /**
     * Gets the document constraints: all documents without at least one of these fieldname-value pairs will be ignored. The field values can be specified as regular
     * expressions.
     * 
     * @return the fieldname-value pairs, of which at least one has to match for a document to be written into the index. Can be null if no constraints were set
     */
    public Map<String, String> getIgnoreAllDocsWithout()
    {
        return this.m_hsFieldName2FieldValueConstraint;
    }



    /**
     * Gets the SplitAndMerge threshold. If split and merge is enabled, {@link ToLuceneContentHandler} will check at each {@link #processNewData(Metadata, String)}
     * invocation whether the current indexWriter has more than iSplitIndexDocumentCount documents. If so, {@link ToLuceneContentHandler} will create an entirely new
     * index for writing, until this one also gets 'overfilled'. When your crawl is finished, {@link Leech} invokes {@link ToLuceneContentHandler#crawlFinished()}, which
     * merges all temporary indices into the initial indexWriter object. This is for performance reasons, because writing into a Lucene index tends to get slow after a
     * certain size. Splitting and merging afterwards is faster.
     * 
     * @return the document count at which a new index will be created. Values of zero or below mean that SplitAndMerge is disabled
     */
    public int getSplitAndMergeIndex()
    {
        return this.m_iSplitIndexDocumentCount;
    }



    /**
     * Gets the attribute value pairs that will be added to every crawled document.
     * 
     * @return the current static attribute value pairs
     */
    public MultiValueHashMap<String, String> getStaticAttributeValuePairs()
    {
        return this.m_hsStaticAttValuePairs;
    }



    /**
     * Intentionally empty - this handler takes no action in this callback.
     * 
     * @param metadata not used
     */
    @Override
    public void processErrorData(Metadata metadata)
    {
        // NOP
    }



    /**
     * Replaces an already indexed document: a new Lucene document is created from the given data and written with {@code updateDocument} into the initial index writer,
     * addressed by the {@link IncrementalCrawlingHistory#dataEntityId} value of the metadata. Nothing is written if the document doesn't fulfil the constraints given
     * with {@link #setIgnoreAllDocsWithout(Map)}. Errors are logged and not propagated.
     * 
     * @param metadata the metadata of the modified data entity
     * @param strFulltext the fulltext of the modified data entity
     */
    @Override
    public void processModifiedData(Metadata metadata, String strFulltext)
    {
        try
        {
            // here we modify an already existing document
            Document updatedDoc = createAndFillLuceneDocument(metadata, strFulltext);

            if(updatedDoc != null)
            {
                String strEntityId = metadata.get(IncrementalCrawlingHistory.dataEntityId);
                Term entityTerm = new Term(IncrementalCrawlingHistory.dataEntityId, strEntityId);

                // TODO: what happens here with block-indexed documents?
                m_initialLuceneWriter.updateDocument(entityTerm, updatedDoc);
            }
        }
        catch (Exception e)
        {
            Logger logger = Logger.getLogger(ToLuceneContentHandler.class.getName());
            logger.log(Level.SEVERE, "Error during writing into the index", e);
        }
    }



    /**
     * Creates a Lucene document out of the given data and hands it over to the consumer threads, which write it into the current index writer. Nothing is written if
     * the document doesn't fulfil the constraints given with {@link #setIgnoreAllDocsWithout(Map)}.
     * <p>
     * If block indexing is enabled, the metadata decides how the document is handled: a document with a {@link LeechMetadata#parentId} is a child and will be buffered.
     * A document with a {@link LeechMetadata#childId} is a parent, it will be written as the last document of a block together with all buffered children. For any
     * other document, the buffered children and the document itself are written as single documents. In the last two cases the buffer is emptied afterwards.
     * <p>
     * Errors are logged and not propagated. If the calling thread gets interrupted while it waits for free queue capacity, its interrupt flag is restored.
     * 
     * @param metadata the metadata of the new data entity
     * @param strFulltext the fulltext of the new data entity
     */
    @Override
    public void processNewData(Metadata metadata, String strFulltext)
    {

        try
        {
            if(m_initialLuceneWriter == null) throw new IllegalStateException("Lucene writer was not specified");

            m_luceneWriter = getCurrentWriter();

            ensureConsumerThreadsRunning();


            Document doc = createAndFillLuceneDocument(metadata, strFulltext);
            if(doc == null) return;


            if(getBlockIndexing())
            {

                if(metadata.get(LeechMetadata.parentId) != null)
                {
                    // we have a child doc (it has a reference to its parent). We simply remember it until the parent appears
                    m_llLastChildDocuments.add(doc);
                }
                else if(metadata.get(LeechMetadata.childId) != null)
                {
                    // we have a parent doc (a parent has at least one childId) - we write it as block, together with the children collected so far.
                    // The parent doc is the last one. The queue gets its own copy of the list, because the buffer will be emptied right afterwards
                    m_llLastChildDocuments.add(doc);

                    m_addDocsQueue.put(new LinkedList<Document>(m_llLastChildDocuments));

                    m_llLastChildDocuments.clear();
                }
                else
                {
                    // we have neither a child nor a parent id - all remembered child docs are written as single documents
                    for (Document orphanDoc : m_llLastChildDocuments)
                        m_addDocsQueue.put(Collections.singletonList(orphanDoc));

                    // the orphans are on their way now - without emptying the buffer they would be written again with every following document
                    m_llLastChildDocuments.clear();

                    m_addDocsQueue.put(Collections.singletonList(doc));
                }

            }
            else
            {
                m_addDocsQueue.put(Collections.singletonList(doc));
            }

        }
        catch (InterruptedException e)
        {
            // keep the interruption visible for the caller
            Thread.currentThread().interrupt();
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
        }
        catch (Exception e)
        {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
        }

    }



    /**
     * Hands an already created Lucene document over to the consumer threads, which write it into the current index writer. A null document is ignored.
     * <p>
     * If block indexing is enabled, the fields of the document decide how it is handled: a document with a {@link LeechMetadata#parentId} field is a child and will be
     * buffered. A document with a {@link LeechMetadata#childId} field is a parent, it will be written as the last document of a block together with all buffered
     * children. For any other document, the buffered children and the document itself are written as single documents. In the last two cases the buffer is emptied
     * afterwards.
     * <p>
     * Errors are logged and not propagated. If the calling thread gets interrupted while it waits for free queue capacity, its interrupt flag is restored.
     * 
     * @param doc the document to index
     */
    public void processNewDocument(Document doc)
    {

        try
        {
            if(m_initialLuceneWriter == null) throw new IllegalStateException("Lucene writer was not specified");

            m_luceneWriter = getCurrentWriter();

            ensureConsumerThreadsRunning();

            if(doc == null) return;


            if(getBlockIndexing())
            {

                if(doc.get(LeechMetadata.parentId) != null)
                {
                    // we have a child doc (it has a reference to its parent). We simply remember it until the parent appears
                    m_llLastChildDocuments.add(doc);
                }
                else if(doc.get(LeechMetadata.childId) != null)
                {
                    // we have a parent doc (a parent has at least one childId) - we write it as block, together with the children collected so far.
                    // The parent doc is the last one. The queue gets its own copy of the list, because the buffer will be emptied right afterwards
                    m_llLastChildDocuments.add(doc);

                    m_addDocsQueue.put(new LinkedList<Document>(m_llLastChildDocuments));

                    m_llLastChildDocuments.clear();
                }
                else
                {
                    // we have neither a child nor a parent id - all remembered child docs are written as single documents
                    for (Document orphanDoc : m_llLastChildDocuments)
                        m_addDocsQueue.put(Collections.singletonList(orphanDoc));

                    // the orphans are on their way now - without emptying the buffer they would be written again with every following document
                    m_llLastChildDocuments.clear();

                    m_addDocsQueue.put(Collections.singletonList(doc));
                }

            }
            else
            {
                m_addDocsQueue.put(Collections.singletonList(doc));
            }

        }
        catch (InterruptedException e)
        {
            // keep the interruption visible for the caller
            Thread.currentThread().interrupt();
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
        }
        catch (Exception e)
        {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", e);
        }

    }



    /**
     * Intentionally empty - this handler takes no action in this callback.
     * 
     * @param metadata not used
     */
    @Override
    public void processProcessedData(Metadata metadata)
    {
        // NOP
    }



    /**
     * Deletes all documents from the initial index writer whose {@link IncrementalCrawlingHistory#dataEntityId} field has the value given inside the metadata. Errors
     * are logged and not propagated.
     * 
     * @param metadata the metadata of the removed data entity. Only the {@link IncrementalCrawlingHistory#dataEntityId} entry is read
     */
    @Override
    public void processRemovedData(Metadata metadata)
    {
        // we can make use of the incremental ids here, they possibly are still inside the metadata :) :)
        try
        {
            String strEntityId = metadata.get(IncrementalCrawlingHistory.dataEntityId);
            Term entityTerm = new Term(IncrementalCrawlingHistory.dataEntityId, strEntityId);

            // TODO: what happens here with block-indexed documents?
            m_initialLuceneWriter.deleteDocuments(entityTerm);
        }
        catch (Exception e)
        {
            Logger logger = Logger.getLogger(ToLuceneContentHandler.class.getName());
            logger.log(Level.SEVERE, "Error during writing into the index", e);
        }
    }




    /**
     * Intentionally empty - this handler takes no action in this callback.
     * 
     * @param metadata not used
     */
    @Override
    public void processUnmodifiedData(Metadata metadata)
    {
        // NOP
    }



    /**
     * Enables or disables block indexing with {@link IndexWriter#addDocuments(java.util.Collection, Analyzer)}. If it is enabled, {@link ToLuceneContentHandler}
     * checks whether the metadata contains a {@link LeechMetadata#childId} or a {@link LeechMetadata#parentId} key. Documents with a {@link LeechMetadata#childId} entry
     * are treated as parent documents, documents with a {@link LeechMetadata#parentId} entry as children. {@link ToLuceneContentHandler} collects the child documents as
     * they arrive at a processXXX method and writes them as one block as soon as a succeeding parent document arrives. If a non-parent document arrives instead, all
     * collected documents are indexed normally, not as a block.
     * 
     * @param blockIndexing true if block indexing should be enabled, false otherwise
     */
    public void setBlockIndexing(boolean blockIndexing)
    {
        m_bBlockIndexing = blockIndexing;
    }



    /**
     * Sets the field aggregation map. An entry generates a new field (the key) whose value is copied from another, existing metadata entry. The values of an entry are
     * the candidate source attributes in order of priority - the first one that has a value wins and appears as the new attribute.
     * 
     * @param hsTarget2SourcesFieldnames the field aggregation map. Keys: target field names, values: the prioritized source field names
     */
    public void setFieldAggregationMap(MultiValueHashMap<String, String> hsTarget2SourcesFieldnames)
    {
        this.m_hsTarget2SourcesFieldnames = hsTarget2SourcesFieldnames;
    }






    /**
     * Sets the field copy mappings. The content of every metadata key that is a key of the given map is additionally copied into the fields named by the corresponding
     * values. To rename an attribute, specify a mapping for it and ignore the source field name with {@link #setFieldNames2Ignore(HashSet)}.
     * 
     * @param hsSource2TargetFieldnames keys: source field names, given as metadata keys. values: target field names - the content will also appear under these fields
     *            inside a lucene document
     */
    public void setFieldCopyMap(MultiValueHashMap<String, String> hsSource2TargetFieldnames)
    {
        this.m_hsSource2TargetFieldnames = hsSource2TargetFieldnames;
    }



    /**
     * Sets the field names / metadata keys that are excluded from the lucene index. The excluded names can still be used as source inside
     * {@link #setFieldCopyMap(MultiValueHashMap)} - this way an attribute value is 'moved' into another attribute (or into several ones).
     * 
     * @param hsAttNamesNot2Store the set of attribute/field names that will not be stored into the lucene index
     */
    public void setFieldNames2Ignore(HashSet<String> hsAttNamesNot2Store)
    {
        this.m_hsAttNamesNot2Store = hsAttNamesNot2Store;
    }



    /**
     * Sets the document constraints: all documents without at least one of the given fieldname-value pairs will be ignored. The field values can be specified as
     * regular expressions. If this is set to null or to an empty map, all documents will be accepted.
     * 
     * @param hsFieldName2FieldValue the fieldname-value pairs, of which at least one has to match for a document to be written into the index
     * 
     * @return this
     */
    public ToLuceneContentHandler setIgnoreAllDocsWithout(Map<String, String> hsFieldName2FieldValue)
    {
        this.m_hsFieldName2FieldValueConstraint = hsFieldName2FieldValue;
        return this;
    }




    /**
     * Sets the SplitAndMerge threshold. If split and merge is enabled, {@link ToLuceneContentHandler} will check at each {@link #processNewData(Metadata, String)}
     * invocation whether the current indexWriter has more than iSplitIndexDocumentCount documents. If so, {@link ToLuceneContentHandler} will create an entirely new
     * index for writing, until this one also gets 'overfilled'. When your crawl is finished, invoking {@link ToLuceneContentHandler#crawlFinished()} merges all
     * temporary indices into the initial indexWriter object. The {@link Leech} class performs this invocation automatically. This is for performance reasons, because
     * writing into a Lucene index tends to get slow after a certain size. Splitting and merging afterwards is faster. Update: this behaviour depends on the Lucene
     * version used, currently it seems not to be a problem. Thus, this functionality is disabled by default.
     * 
     * @param iSplitIndexDocumentCount the document count at which a new index will be created. A good size is 500 000 (a gut feeling, if it is necessary at all). -1
     *            if you want to disable SplitAndMerge, which is the default.
     * 
     * @return this
     */
    public ToLuceneContentHandler setSplitAndMergeIndex(int iSplitIndexDocumentCount)
    {
        this.m_iSplitIndexDocumentCount = iSplitIndexDocumentCount;
        return this;
    }



    /**
     * Sets the attribute value pairs that will be added to every crawled document.
     * 
     * @param hsStaticAttValuePairs a multi value map containing the additional attribute value pairs
     * 
     * @return this
     */
    public ToLuceneContentHandler setStaticAttributeValuePairs(MultiValueHashMap<String, String> hsStaticAttValuePairs)
    {
        this.m_hsStaticAttValuePairs = hsStaticAttValuePairs;
        return this;
    }



    /**
     * Adds one field per static attribute value pair (see {@link #getStaticAttributeValuePairs()}) to the given document. A pair for which the field config creates no
     * field is skipped with a warning.
     * 
     * @param doc the document that will get the static fields
     * 
     * @throws Exception as declared - propagated from the field creation
     */
    protected void addStaticAttValuePairs(Document doc) throws Exception
    {
        for (Entry<String, String> staticPair : getStaticAttributeValuePairs().entryList())
        {
            String strName = staticPair.getKey();
            String strValue = staticPair.getValue();

            IndexableField staticField = m_fieldConfig.createField(strName, strValue);

            if(staticField == null)
            {
                Logger.getLogger(ToLuceneContentHandler.class.getName()).warning(
                        "Could not create lucene field for " + strName + ":" + strValue + ". Will ignore it.");
                continue;
            }

            doc.add(staticField);
        }
    }




    /**
     * Creates a new Lucene document out of the given metadata and fulltext. The document gets, in this order:
     * <ol>
     * <li>a generated {@link LeechMetadata#id} field, if the metadata has no id entry</li>
     * <li>the fulltext as {@link LeechMetadata#body} field, plus its copies according to {@link #getFieldCopyMap()}</li>
     * <li>one field per metadata value, plus the copies according to {@link #getFieldCopyMap()}</li>
     * <li>the static attribute value pairs</li>
     * <li>the aggregated fields according to {@link #getFieldAggregationMap()}, for every target field the document doesn't have yet</li>
     * </ol>
     * Field names inside {@link #getFields2Ignore()} are skipped. A value for which the field config creates no field is skipped with a warning.
     * <p>
     * Returns null if the document should be ignored according to the given constraints (given with {@link #setIgnoreAllDocsWithout(Map)})
     * 
     * @param metadata the metadata of the data entity
     * @param strFulltext the fulltext of the data entity. It is given as String and not as reader because no field can be created out of a reader here
     * 
     * @return the created document, or null if the document should be ignored according to the given constraints (given with
     *         {@link #setIgnoreAllDocsWithout(Map)})
     * 
     * @throws Exception as declared - propagated from the field creation
     */
    protected Document createAndFillLuceneDocument(Metadata metadata, String strFulltext) throws Exception
    {
        Document doc = new Document();


        // a unique id has to be there
        if(metadata.getValues(LeechMetadata.id).length == 0) addFieldOrWarn(doc, LeechMetadata.id, new UID().toString());

        if(!getFields2Ignore().contains(LeechMetadata.body)) addFieldOrWarn(doc, LeechMetadata.body, strFulltext);
        // the copies
        for (String strFieldCopy : getFieldCopyMap().get(LeechMetadata.body))
            if(!getFields2Ignore().contains(strFieldCopy)) addFieldOrWarn(doc, strFieldCopy, strFulltext);


        // the remaining metadata
        for (String strFieldName : metadata.names())
        {
            if(!getFields2Ignore().contains(strFieldName))
            {
                for (String strValue : metadata.getValues(strFieldName))
                    addFieldOrWarn(doc, strFieldName, strValue);
            }

            // the copies - they are also created if the source field itself is ignored
            for (String strFieldCopy : getFieldCopyMap().get(strFieldName))
            {
                if(getFields2Ignore().contains(strFieldCopy)) continue;

                for (String strValue : metadata.getValues(strFieldName))
                    addFieldOrWarn(doc, strFieldCopy, strValue);
            }
        }

        // the static attribute value pairs
        addStaticAttValuePairs(doc);

        // and now we aggregate
        for (String strTargetAtt : getFieldAggregationMap().keySet())
        {
            // if the target attribute already exists inside the doc, we don't aggregate anything
            if(doc.get(strTargetAtt) != null) continue;

            Collection<String> colSourceAtts = getFieldAggregationMap().get(strTargetAtt);

            for (String strSourceAtt : colSourceAtts)
            {
                String strNewValue = metadata.get(strSourceAtt);
                if(strNewValue == null) strNewValue = getStaticAttributeValuePairs().getFirst(strSourceAtt);

                if(strNewValue != null)
                {
                    // the first source attribute with a value wins
                    addFieldOrWarn(doc, strTargetAtt, strNewValue);

                    break;
                }
            }
        }



        // if a doc doesn't fulfil our constraints, we ignore it by returning null
        if(m_hsFieldName2FieldValueConstraint == null || m_hsFieldName2FieldValueConstraint.size() == 0) return doc;

        for (Entry<String, String> fieldname2fieldValRegEx : m_hsFieldName2FieldValueConstraint.entrySet())
        {
            String strRegEx = fieldname2fieldValRegEx.getValue();

            for (IndexableField fieldable : doc.getFields(fieldname2fieldValRegEx.getKey()))
            {
                String strVal = fieldable.stringValue();

                // a field without a string value can not match - formerly this ended in a NullPointerException
                if(strVal != null && strVal.matches(strRegEx))
                {
                    // we have a match
                    return doc;
                }
            }
        }


        return null;
    }



    /**
     * Creates a field with the field config and adds it to the document. If no field could be created, a warning is logged and the value is skipped. Values longer than
     * 256 characters are abbreviated inside the warning, so that a failing fulltext field can not flood the log.
     */
    private void addFieldOrWarn(Document doc, String strFieldName, String strValue) throws Exception
    {
        IndexableField field = m_fieldConfig.createField(strFieldName, strValue);

        if(field != null)
        {
            doc.add(field);
            return;
        }

        String strValue4Log = strValue;
        if(strValue4Log != null && strValue4Log.length() > 256) strValue4Log = strValue4Log.substring(0, 256) + "...";

        Logger.getLogger(ToLuceneContentHandler.class.getName()).warning(
                "Could not create lucene field for " + strFieldName + ":" + strValue4Log + ". Will ignore it.");
    }







    /**
     * Starts the document consumer threads, unless consumer threads are already registered.
     * <p>
     * The thread count is half of the available processors (rounded), with a lower bound of one. The cyclic barrier is
     * re-created for one party more than there are consumer threads (presumably the extra party is the thread that
     * waits for the consumers to finish - confirm against the barrier's await call sites). Each thread runs a new
     * {@code DocConsumer}, is flagged as daemon, registered in the consumer thread list and then started.
     */
    protected void ensureConsumerThreadsRunning()
    {
        // consumers have been set up before - nothing to do
        if(m_llConsumerThreads.size() > 0) return;

        final int iAvailableProcessors = Runtime.getRuntime().availableProcessors();
        final int iConsumerCount = Math.max(1, (int) Math.round(iAvailableProcessors / 2d));

        m_cyclicBarrier4DocConsumerThreads = new CyclicBarrier(iConsumerCount + 1);

        int iThreadIndex = 0;
        while (iThreadIndex < iConsumerCount)
        {
            Thread docConsumerThread = new Thread(new DocConsumer(), "ToLuceneContentHandlerDocConsumer " + iThreadIndex);
            docConsumerThread.setDaemon(true);
            m_llConsumerThreads.add(docConsumerThread);

            docConsumerThread.start();

            iThreadIndex++;
        }
    }



    /**
     * Returns the index writer that documents should currently be written to, rolling over to a fresh temporary index
     * when the current one has reached the configured split size.
     * <p>
     * If {@code getSplitAndMergeIndex()} is zero or negative, the initial writer is always returned. Otherwise the
     * current writer is returned as long as its {@code maxDoc()} is below that threshold. Once the threshold is
     * reached, a new writer is created (open mode {@code CREATE}, same analyzer as the initial writer) on a new
     * directory:
     * <ul>
     * <li>if the initial writer works on an {@code FSDirectory}: the absolute path of that directory, suffixed with
     * {@code _<n>}, where n is the number of temporary index paths registered so far plus one</li>
     * <li>otherwise: {@code <java.io.tmpdir>/leechTmp/<random uuid, non-word chars replaced by '_'>}</li>
     * </ul>
     * The path of the new index is recorded in the set of temporary index paths to merge. The writer that is being
     * superseded - unless it is the initial writer - is recorded in the list of writers to close, independent of the
     * directory type, and only after the new writer was created successfully.
     *
     * @return the writer to use for the next documents
     *
     * @throws CorruptIndexException declared for callers; may be raised by the lucene writer creation
     * @throws LockObtainFailedException declared for callers; may be raised by the lucene writer creation
     * @throws IOException if the new temporary index can not be created
     */
    synchronized protected IndexWriter getCurrentWriter() throws CorruptIndexException, LockObtainFailedException, IOException
    {
        // splitting is disabled - everything goes into the initial index
        if(getSplitAndMergeIndex() <= 0) return m_initialLuceneWriter;

        // the current index still has room
        if(m_luceneWriter.maxDoc() < getSplitAndMergeIndex()) return m_luceneWriter;


        Directory directory = m_initialLuceneWriter.getDirectory();

        Path fOurTmpDir;
        if(directory instanceof FSDirectory)
        {
            // sibling directory next to the initial index, numbered by the count of temporary indices so far
            String strTmpPath = ((FSDirectory) directory).getDirectory().toAbsolutePath().toString();
            strTmpPath += "_" + (m_hsTmpLuceneWriterPaths2Merge.size() + 1);
            fOurTmpDir = Paths.get(strTmpPath);
        }
        else
        {
            // the initial index has no file system location, so we need something temporary
            String strUniqueDirName = UUID.randomUUID().toString().replaceAll("\\W", "_");
            fOurTmpDir = Paths.get(System.getProperty("java.io.tmpdir")).toAbsolutePath().resolve("leechTmp").resolve(strUniqueDirName);
        }

        Logger.getLogger(ToLuceneContentHandler.class.getName()).info(
                "Current index exceeds " + m_iSplitIndexDocumentCount + " documents. Will create another temporary one under " + fOurTmpDir);


        @SuppressWarnings("deprecation")
        IndexWriterConfig config = new IndexWriterConfig(m_initialLuceneWriter.getConfig().getAnalyzer());
        config.setOpenMode(OpenMode.CREATE);

        // create the new writer first: if this fails, the current writer stays current and is not registered twice
        IndexWriter newLuceneWriter = new IndexWriter(new SimpleFSDirectory(fOurTmpDir), config);

        // the superseded temporary writer has to be remembered for closing, no matter which directory type the initial
        // index has (formerly this only happened for FSDirectory, so other temporary writers were never registered)
        if(m_luceneWriter != m_initialLuceneWriter) m_llIndexWriter2Close.add(m_luceneWriter);

        m_luceneWriter = newLuceneWriter;
        m_hsTmpLuceneWriterPaths2Merge.add(fOurTmpDir.toAbsolutePath().toString());

        return m_luceneWriter;
    }



    /**
     * Logs the directory of the current lucene writer as the crawl target and makes sure that the document consumer
     * threads are running.
     */
    @Override
    protected void init()
    {
        final Logger logger = Logger.getLogger(ToLuceneContentHandler.class.getName());
        final String strTargetDirectory = m_luceneWriter.getDirectory().toString();

        logger.info("Will write crawled data into " + strTargetDirectory);

        ensureConsumerThreadsRunning();
    }







}