/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cocoon.transformation;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import java.util.Stack;

import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.context.Context;
import org.apache.avalon.framework.context.ContextException;
import org.apache.avalon.framework.context.Contextualizable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.cocoon.Constants;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.caching.CacheableProcessingComponent;
import org.apache.cocoon.components.search.LuceneCocoonHelper;
import org.apache.cocoon.components.search.LuceneXMLIndexer;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.commons.lang.BooleanUtils;
import org.apache.excalibur.source.SourceValidity;
import org.apache.excalibur.source.impl.validity.NOPValidity;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * <p style="font-weight: bold;">A Lucene index creation transformer.</p>
 * <p>This transformer reads a document with elements in the namespace
 * <code>http://apache.org/cocoon/lucene/1.0</code> and creates a new Lucene index,
 * or updates an existing one.</p>
 * <p>It has several parameters which can be set in the sitemap component configuration,
 * as parameters of the transformation step in the pipeline, or as attributes of the root
 * element of the source XML document. The source document overrides the transformation
 * parameters, which in turn override any configuration parameters.</p>
 * <dl>
 * <dt style="font-weight: bold;">directory</dt>
 * <dd><p>Location of the directory where index files are stored.
 * This path is relative to the Cocoon work directory.</p></dd>
 * <dt style="font-weight: bold;">create</dt>
 * <dd><p>This attribute controls whether the index is recreated.</p>
 * <ul><li><p>If <code>create</code>="false" and the index already exists then the index will be updated.
 * Any documents which had already been indexed will be removed from the index and reinserted.</p></li>
 * <li><p>If the index does not exist then it will be created even if <code>create</code>="false".</p></li>
 * <li><p>If <code>create</code>="true" then any existing index will be destroyed and a new index created.
 * If you are rebuilding your entire index then you should set <code>create</code>="true", because the
 * indexer does not need to remove old documents from the index, so it will be faster.</p></li></ul>
 * </dd>
 * <dt style="font-weight: bold;">max-field-length</dt>
 * <dd><p>Maximum number of terms to index in a field (as far as the index is concerned,
 * the document is effectively truncated at this point). The default value, 10,000, may not be
 * sufficient for large documents.</p></dd>
 * <dt style="font-weight: bold;">analyzer</dt>
 * <dd><p>Class name of the Lucene text analyzer to use. Typically depends on the language of the
 * text being indexed. See the Lucene documentation for more information.</p></dd>
 * <dt style="font-weight: bold;">merge-factor</dt>
 * <dd><p>Determines how often segment indices are merged. See the Lucene documentation for more
 * information.</p></dd>
 * <dt style="font-weight: bold;">optimize-frequency</dt>
 * <dd><p>Determines how often the Lucene index will be optimized. When you have thousands of
 * documents, optimizing the index can become quite slow (e.g. 7 seconds for 9000 small documents
 * on a P4).</p>
 * <ul>
 * <li>1: always optimize (default)</li>
 * <li>0: never optimize</li>
 * <li>x: optimize on roughly one update in x. A random draw decides whether a given run optimizes,
 * so with x = 10 about every tenth run triggers an optimize.</li>
 * </ul>
 * </dd>
 * </dl>
 * <dl>
 * <dt style="font-weight: bold;">A simple example of the input:</dt>
 * <dd>
 * <pre><?xml version="1.0" encoding="UTF-8"?>
 * <lucene:index xmlns:lucene="http://apache.org/cocoon/lucene/1.0"
 *     merge-factor="20"
 *     create="false"
 *     directory="index"
 *     max-field-length="10000"
 *     optimize-frequency="1"
 *     analyzer="org.apache.lucene.analysis.standard.StandardAnalyzer">
 *   <lucene:document url="a.html">
 *     <documentTitle lucene:store="true">Doggerel</documentTitle>
 *     <body>The quick brown fox jumped over the lazy dog</body>
 *   </lucene:document>
 *   <lucene:document url="b.html">
 *     <documentTitle lucene:store="true">Lorem Ipsum</documentTitle>
 *     <body>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</body>
 *     <body>Nunc a mauris blandit ligula scelerisque tristique.</body>
 *   </lucene:document>
 * </lucene:index>
 * </pre>
 * </dd>
 * </dl>
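 * <dl>
 * <dt style="font-weight: bold;">A possible component declaration (illustrative sketch only:
 * the component name "lucene-index" is an assumption, not part of any shipped sitemap; the
 * values shown are simply the built-in defaults documented above):</dt>
 * <dd>
 * <pre><map:transformers>
 *   <map:transformer name="lucene-index"
 *       src="org.apache.cocoon.transformation.LuceneIndexTransformer">
 *     <analyzer-classname>org.apache.lucene.analysis.standard.StandardAnalyzer</analyzer-classname>
 *     <directory>index</directory>
 *     <merge-factor>20</merge-factor>
 *     <max-field-length>10000</max-field-length>
 *     <optimize-frequency>1</optimize-frequency>
 *   </map:transformer>
 * </map:transformers>
 * </pre>
 * <p>These values become the defaults described above; pipeline parameters and attributes of the
 * <code>lucene:index</code> element still override them.</p>
 * </dd>
 * </dl>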
 *
 * @version $Id$
 */
public class LuceneIndexTransformer extends AbstractTransformer
    implements CacheableProcessingComponent, Configurable, Contextualizable {

    public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
    public static final String ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname";
    public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
    public static final String DIRECTORY_CONFIG = "directory";
    public static final String DIRECTORY_PARAMETER = "directory";
    public static final String DIRECTORY_DEFAULT = "index";
    public static final String MERGE_FACTOR_CONFIG = "merge-factor";
    public static final String MERGE_FACTOR_PARAMETER = "merge-factor";
    public static final int MERGE_FACTOR_DEFAULT = 20;
    public static final String OPTIMIZE_FREQUENCY_CONFIG = "optimize-frequency";
    public static final String OPTIMIZE_FREQUENCY_PARAMETER = "optimize-frequency";
    // by default, optimizing will take place on every update (previous behaviour)
    public static final int OPTIMIZE_FREQUENCY_DEFAULT = 1;
    public static final String MAX_FIELD_LENGTH_CONFIG = "max-field-length";
    public static final String MAX_FIELD_LENGTH_PARAMETER = "max-field-length";
    public static final int MAX_FIELD_LENGTH_DEFAULT = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;

    public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
    public static final String LUCENE_QUERY_ELEMENT = "index";
    public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";
    public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";
    public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create";
    public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor";
    public static final String LUCENE_QUERY_MAX_FIELD_LENGTH_ATTRIBUTE = "max-field-length";
    public static final String LUCENE_QUERY_OPTIMIZE_FREQUENCY_CONFIG_ATTRIBUTE = "optimize-frequency";
    public static final String LUCENE_DOCUMENT_ELEMENT = "document";
    public static final String LUCENE_DOCUMENT_URL_ATTRIBUTE = "url";
    public static final String LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";
    public static final String LUCENE_ELEMENT_ATTR_STORE_VALUE = "store";
    public static final String LUCENE_ELAPSED_TIME_ATTRIBUTE = "elapsed-time";
    public static final String CDATA = "CDATA";

    // The 3 states of the state machine
    private static final int STATE_GROUND = 0;   // initial or "ground" state
    private static final int STATE_QUERY = 1;    // processing a lucene:index (Query) element
    private static final int STATE_DOCUMENT = 2; // processing a lucene:document element

    // Initialization time variables
    protected File workDir = null;

    // Declaration time parameters values (specified in sitemap component config)
    private IndexerConfiguration configureConfiguration;
    // Invocation time parameters values (specified in sitemap transform parameters)
    private IndexerConfiguration setupConfiguration;
    // Parameters specified in the input document
    private IndexerConfiguration queryConfiguration;

    // Runtime variables
    private int processing;
    private boolean createIndex = false;
    private IndexWriter writer;
    private StringBuffer bodyText;
    private Document bodyDocument;
    private String bodyDocumentURL;
    private Stack elementStack = new Stack();

    /**
     * Storage for the document element's attributes until the document has been
     * indexed, so that they can be copied to the output along with a boolean
     * <code>indexed</code> attribute.
     */
    private AttributesImpl documentAttributes;

    private long documentStartTime;

    /**
     * Class name of the Lucene text analyzer to use. Typically depends on the
     * language of the text being indexed. See the Lucene documentation for more
     * information.
     */
    private String analyzer = ANALYZER_CLASSNAME_DEFAULT;

    /**
     * Location of the directory where index files are stored. This path is
     * relative to the Cocoon work directory.
     */
    private String directory = DIRECTORY_DEFAULT;

    /**
     * Determines how often segment indices are merged. See the Lucene
     * documentation for more information.
     */
    private int mergeFactor = MERGE_FACTOR_DEFAULT;

    /**
     * Maximum number of terms to index in a field (as far as the index is
     * concerned, the document is effectively truncated at this point). The
     * default value, 10,000, may not be sufficient for large documents.
     */
    private int maxFieldLength = MAX_FIELD_LENGTH_DEFAULT;

    /**
     * Determines how often the Lucene index will be optimized.
     */
    private int optimizeFrequency = OPTIMIZE_FREQUENCY_DEFAULT;

    /** Builds the uid term used to identify a document in the index (derived from its URL). */
    private static String uid(String url) {
        return url.replace('/', '\u0000');
    }

    /**
     * Configure the transformer. The configuration parameters are stored as
     * general defaults, which may be overridden by parameters specified in the
     * sitemap pipeline, or by attributes of the query element(s) in the XML
     * input document.
     */
    public void configure(Configuration conf) throws ConfigurationException {
        this.configureConfiguration = new IndexerConfiguration(
                conf.getChild(ANALYZER_CLASSNAME_CONFIG).getValue(ANALYZER_CLASSNAME_DEFAULT),
                conf.getChild(DIRECTORY_CONFIG).getValue(DIRECTORY_DEFAULT),
                conf.getChild(MERGE_FACTOR_CONFIG).getValueAsInteger(MERGE_FACTOR_DEFAULT),
                conf.getChild(MAX_FIELD_LENGTH_CONFIG).getValueAsInteger(MAX_FIELD_LENGTH_DEFAULT),
                conf.getChild(OPTIMIZE_FREQUENCY_CONFIG).getValueAsInteger(OPTIMIZE_FREQUENCY_DEFAULT));
    }

    /**
     * Set up the transformer. Called when the pipeline is assembled. The
     * parameters are those specified as child elements of the
     * <code><map:transform></code> element in the sitemap. These
     * parameters are optional: if no parameters are specified here then the
     * defaults are supplied by the component configuration. Any parameters
     * specified here may be overridden by attributes of the lucene:index
     * element in the input document.
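     *
     * <p>For example (an illustrative sketch; the transformer type name "lucene-index" is an
     * assumption, see the component declaration example in the class documentation):</p>
     * <pre><map:transform type="lucene-index">
     *   <map:parameter name="directory" value="myindex"/>
     *   <map:parameter name="optimize-frequency" value="10"/>
     * </map:transform>
     * </pre>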
     */
    public void setup(SourceResolver resolver, Map objectModel, String src, Parameters parameters)
    throws ProcessingException, SAXException, IOException {
        setupConfiguration = new IndexerConfiguration(
                parameters.getParameter(ANALYZER_CLASSNAME_PARAMETER,
                        configureConfiguration.analyzerClassname),
                parameters.getParameter(DIRECTORY_PARAMETER,
                        configureConfiguration.indexDirectory),
                parameters.getParameterAsInteger(MERGE_FACTOR_PARAMETER,
                        configureConfiguration.indexerMergeFactor),
                parameters.getParameterAsInteger(MAX_FIELD_LENGTH_PARAMETER,
                        configureConfiguration.indexerMaxFieldLength),
                parameters.getParameterAsInteger(OPTIMIZE_FREQUENCY_PARAMETER,
                        configureConfiguration.indexerOptimizeFrequency));
    }

    /**
     * Contextualize this class.
     */
    public void contextualize(Context context) throws ContextException {
        this.workDir = (File) context.get(Constants.CONTEXT_WORK_DIR);
    }

    /**
     * @see org.apache.cocoon.xml.AbstractXMLProducer#recycle()
     */
    public void recycle() {
        this.processing = STATE_GROUND;
        if (this.writer != null) {
            try {
                this.writer.close();
            } catch (IOException ioe) {
                // ignore errors while closing the writer during recycling
            }
            this.writer = null;
        }
        this.bodyText = null;
        this.bodyDocument = null;
        this.bodyDocumentURL = null;
        this.elementStack.clear();
        super.recycle();
    }

    /**
     * Generate the unique key. This key must be unique inside the space of this
     * component.
     *
     * @return The generated key
     */
    public Serializable getKey() {
        return "1";
    }

    /**
     * Generate the validity object.
     *
     * @return The generated validity object or <code>null</code> if the
     *         component is currently not cacheable.
     */
    public SourceValidity getValidity() {
        return NOPValidity.SHARED_INSTANCE;
    }

    public void startDocument() throws SAXException {
        super.startDocument();
    }

    public void endDocument() throws SAXException {
        super.endDocument();
    }

    /**
     * Begin the scope of a prefix-URI Namespace mapping.
     *
     * @param prefix The Namespace prefix being declared.
     * @param uri    The Namespace URI the prefix is mapped to.
     */
    public void startPrefixMapping(String prefix, String uri) throws SAXException {
        if (processing == STATE_GROUND) {
            super.startPrefixMapping(prefix, uri);
        }
    }

    /**
     * End the scope of a prefix-URI mapping.
     *
     * @param prefix The prefix that was being mapped.
     */
    public void endPrefixMapping(String prefix) throws SAXException {
        if (processing == STATE_GROUND) {
            super.endPrefixMapping(prefix);
        }
    }

    public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
    throws SAXException {
        if (processing == STATE_GROUND) {
            if (LUCENE_URI.equals(namespaceURI) && LUCENE_QUERY_ELEMENT.equals(localName)) {
                String sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE);
                createIndex = BooleanUtils.toBoolean(sCreate);

                String analyzerClassname = atts.getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
                String indexDirectory = atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
                String mergeFactorStr = atts.getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);
                String maxFieldLengthStr = atts.getValue(LUCENE_QUERY_MAX_FIELD_LENGTH_ATTRIBUTE);
                String optimizeFrequencyStr = atts.getValue(LUCENE_QUERY_OPTIMIZE_FREQUENCY_CONFIG_ATTRIBUTE);

                queryConfiguration = new IndexerConfiguration(
                        analyzerClassname != null ? analyzerClassname
                                : setupConfiguration.analyzerClassname,
                        indexDirectory != null ? indexDirectory
                                : setupConfiguration.indexDirectory,
                        mergeFactorStr != null ? Integer.parseInt(mergeFactorStr)
                                : setupConfiguration.indexerMergeFactor,
                        maxFieldLengthStr != null ? Integer.parseInt(maxFieldLengthStr)
                                : setupConfiguration.indexerMaxFieldLength,
                        optimizeFrequencyStr != null ? Integer.parseInt(optimizeFrequencyStr)
                                : setupConfiguration.indexerOptimizeFrequency);

                if (!createIndex) {
                    // Not asked to create the index - but check if this is necessary anyway:
                    try {
                        IndexReader reader = openReader();
                        reader.close();
                    } catch (IOException ioe) {
                        // couldn't open the index - so recreate it
                        createIndex = true;
                    }
                }

                // propagate the lucene:index to the next stage in the pipeline
                super.startElement(namespaceURI, localName, qName, atts);
                processing = STATE_QUERY;
            } else {
                super.startElement(namespaceURI, localName, qName, atts);
            }
        } else if (processing == STATE_QUERY) {
            // processing a lucene:index - expecting a lucene:document
            if (LUCENE_URI.equals(namespaceURI) && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
                this.bodyDocumentURL = atts.getValue(LUCENE_DOCUMENT_URL_ATTRIBUTE);
                if (this.bodyDocumentURL == null) {
                    throw new SAXException("<lucene:document> must have @url attribute");
                }

                // Remember the time the document indexing began
                this.documentStartTime = System.currentTimeMillis();
                // remember these attributes so they can be passed on to the next stage in the pipeline,
                // when this document element is ended.
                this.documentAttributes = new AttributesImpl(atts);
                this.bodyText = new StringBuffer();
                this.bodyDocument = new Document();
                this.elementStack.clear();
                processing = STATE_DOCUMENT;
            } else {
                throw new SAXException("<lucene:index> element can contain only <lucene:document> elements!");
            }
        } else if (processing == STATE_DOCUMENT) {
            elementStack.push(new IndexHelperField(localName, new AttributesImpl(atts)));
        }
    }

    public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
        if (processing == STATE_QUERY) {
            if (LUCENE_URI.equals(namespaceURI) && LUCENE_QUERY_ELEMENT.equals(localName)) {
                if (needToOptimize()) {
                    // End query processing
                    try {
                        if (this.writer == null) {
                            openWriter();
                        }
                        this.writer.optimize();
                        this.writer.close();
                        this.writer = null;
                    } catch (IOException e) {
                        throw new SAXException(e);
                    }
                }
                // propagate the query element to the next stage in the pipeline
                super.endElement(namespaceURI, localName, qName);
                this.processing = STATE_GROUND;
            } else {
                throw new SAXException("</lucene:index> was expected!");
            }
        } else if (processing == STATE_DOCUMENT) {
            if (LUCENE_URI.equals(namespaceURI) && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
                // End document processing
                this.bodyDocument.add(Field.UnStored(LuceneXMLIndexer.BODY_FIELD, this.bodyText.toString()));
                this.bodyText = null;
                this.bodyDocument.add(Field.UnIndexed(LuceneXMLIndexer.URL_FIELD, this.bodyDocumentURL));
                // store: false, index: true, tokenize: false
                this.bodyDocument.add(new Field(LuceneXMLIndexer.UID_FIELD, uid(this.bodyDocumentURL),
                        false, true, false));
                try {
                    reindexDocument();
                } catch (IOException e) {
                    throw new SAXException(e);
                }
                this.bodyDocumentURL = null;

                // propagate the lucene:document element to the next stage in the pipeline
                long elapsedTime = System.currentTimeMillis() - this.documentStartTime;
                this.documentAttributes.addAttribute("", LUCENE_ELAPSED_TIME_ATTRIBUTE,
                        LUCENE_ELAPSED_TIME_ATTRIBUTE, CDATA, String.valueOf(elapsedTime));
                super.startElement(namespaceURI, localName, qName, this.documentAttributes);
                super.endElement(namespaceURI, localName, qName);
                this.processing = STATE_QUERY;
            } else {
                // End element processing
                IndexHelperField tos = (IndexHelperField) elementStack.pop();
                StringBuffer text = tos.getText();

                Attributes atts = tos.getAttributes();
                boolean attributesToText = atts.getIndex(LUCENE_URI, LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE) != -1;
                for (int i = 0; i < atts.getLength(); i++) {
                    // Ignore Lucene attributes
                    if (LUCENE_URI.equals(atts.getURI(i))) {
                        continue;
                    }

                    String atts_lname = atts.getLocalName(i);
                    String atts_value = atts.getValue(i);
                    bodyDocument.add(Field.UnStored(localName + "@" + atts_lname, atts_value));
                    if (attributesToText) {
                        text.append(atts_value);
                        text.append(' ');
                        bodyText.append(atts_value);
                        bodyText.append(' ');
                    }
                }

                boolean store = atts.getIndex(LUCENE_URI, LUCENE_ELEMENT_ATTR_STORE_VALUE) != -1;
                if (text != null && text.length() > 0) {
                    if (store) {
                        bodyDocument.add(Field.Text(localName, text.toString()));
                    } else {
                        bodyDocument.add(Field.UnStored(localName, text.toString()));
                    }
                }
            }
        } else {
            // All other tags
            super.endElement(namespaceURI, localName, qName);
        }
    }

    public void characters(char[] ch, int start, int length) throws SAXException {
        // Note: text runs of a single character are skipped here (length > 1).
        if (processing == STATE_DOCUMENT && ch.length > 0 && start >= 0 && length > 1
                && elementStack.size() > 0) {
            String text = new String(ch, start, length);
            ((IndexHelperField) elementStack.peek()).append(text);
            bodyText.append(text);
            bodyText.append(' ');
        } else if (processing == STATE_GROUND) {
            super.characters(ch, start, length);
        }
    }

    private void openWriter() throws IOException {
        File indexDirectory = new File(queryConfiguration.indexDirectory);
        if (!indexDirectory.isAbsolute()) {
            indexDirectory = new File(workDir, queryConfiguration.indexDirectory);
        }

        // If the index directory doesn't exist, then always create it.
        boolean indexExists = IndexReader.indexExists(indexDirectory);
        if (!indexExists) {
            createIndex = true;
        }

        // Get the index directory, creating it if necessary
        Directory directory = LuceneCocoonHelper.getDirectory(indexDirectory, createIndex);
        Analyzer analyzer = LuceneCocoonHelper.getAnalyzer(queryConfiguration.analyzerClassname);
        this.writer = new IndexWriter(directory, analyzer, createIndex);
        this.writer.mergeFactor = queryConfiguration.indexerMergeFactor;
        this.writer.maxFieldLength = queryConfiguration.indexerMaxFieldLength;
    }

    private IndexReader openReader() throws IOException {
        File indexDirectory = new File(queryConfiguration.indexDirectory);
        if (!indexDirectory.isAbsolute()) {
            indexDirectory = new File(workDir, queryConfiguration.indexDirectory);
        }

        Directory directory = LuceneCocoonHelper.getDirectory(indexDirectory, createIndex);
        IndexReader reader = IndexReader.open(directory);
        return reader;
    }

    private void reindexDocument() throws IOException {
        if (this.createIndex) {
            // The index is being created, so there's no need to delete the doc from an existing index.
            // This means we can keep a single IndexWriter open throughout the process.
            if (this.writer == null) {
                openWriter();
            }
            this.writer.addDocument(this.bodyDocument);
        } else {
            // This is an incremental reindex, so the document should be removed from the index
            // before adding it
            try {
                IndexReader reader = openReader();
                reader.delete(new Term(LuceneXMLIndexer.UID_FIELD, uid(this.bodyDocumentURL)));
                reader.close();
            } catch (IOException e) {
                /* ignore */
            }
            openWriter();
            this.writer.addDocument(this.bodyDocument);
            this.writer.close();
            this.writer = null;
        }
        this.bodyDocument = null;
    }

    private static class IndexHelperField {
        String localName;
        StringBuffer text;
        Attributes attributes;

        IndexHelperField(String localName, Attributes atts) {
            this.localName = localName;
            this.attributes = atts;
            this.text = new StringBuffer();
        }

        Attributes getAttributes() {
            return attributes;
        }

        StringBuffer getText() {
            return text;
        }

        void append(String text) {
            this.text.append(text);
        }

        void append(char[] str, int offset, int length) {
            this.text.append(str, offset, length);
        }
    }

    private static class IndexerConfiguration {
        String analyzerClassname;
        String indexDirectory;
        int indexerMergeFactor;
        int indexerMaxFieldLength;
        int indexerOptimizeFrequency;

        IndexerConfiguration(String analyzerClassname, String indexDirectory,
                int indexerMergeFactor, int indexerMaxFieldLength, int indexerOptimizeFrequency) {
            this.analyzerClassname = analyzerClassname;
            this.indexDirectory = indexDirectory;
            this.indexerMergeFactor = indexerMergeFactor;
            this.indexerMaxFieldLength = indexerMaxFieldLength;
            this.indexerOptimizeFrequency = indexerOptimizeFrequency;
        }
    }

    /**
     * Checks whether, based on the configuration (optimize-frequency option),
     * the Lucene index should be optimized. It uses a random number generator
     * to decide, because optimizing large indexes becomes quite slow.
     *
     * From the Lucene documentation: the IndexWriter class supports an
     * optimize() method that compacts the index and speeds up queries.
     * You may want to use this method after performing a complete indexing of
     * your document set, or after incremental updates of the index. If your
     * incremental updates add documents frequently, you may want to perform the
     * optimization only once in a while to avoid the extra overhead.
     *
     * @return true if we should optimize the index
     */
    private boolean needToOptimize() {
        int optimizeFrequency = queryConfiguration.indexerOptimizeFrequency;
        if (optimizeFrequency == 0) {
            return false;
        }
        if (optimizeFrequency == 1) {
            return true;
        }

        // use a random draw so that optimization happens with probability 1/optimizeFrequency
        int randomInt = 1 + (int) (Math.random() * optimizeFrequency);
        return randomInt == 1;
    }

    /**
     * @return the analyzer
     */
    public String getAnalyzer() {
        return analyzer;
    }

    /**
     * @param analyzer the analyzer to set
     */
    public void setAnalyzer(String analyzer) {
        this.analyzer = analyzer;
    }

    /**
     * @return the directory
     */
    public String getDirectory() {
        return directory;
    }

    /**
     * @param directory the directory to set
     */
    public void setDirectory(String directory) {
        this.directory = directory;
    }

    /**
     * @return the mergeFactor
     */
    public int getMergeFactor() {
        return mergeFactor;
    }

    /**
     * @param mergeFactor the mergeFactor to set
     */
    public void setMergeFactor(int mergeFactor) {
        this.mergeFactor = mergeFactor;
    }

    /**
     * @return the maxFieldLength
     */
    public int getMaxFieldLength() {
        return maxFieldLength;
    }

    /**
     * @param maxFieldLength the maxFieldLength to set
     */
    public void setMaxFieldLength(int maxFieldLength) {
        this.maxFieldLength = maxFieldLength;
    }

    /**
     * @return the optimizeFrequency
     */
    public int getOptimizeFrequency() {
        return optimizeFrequency;
    }

    /**
     * @param optimizeFrequency the optimizeFrequency to set
     */
    public void setOptimizeFrequency(int optimizeFrequency) {
        this.optimizeFrequency = optimizeFrequency;
    }
}