/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.cocoon.transformation;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import java.util.Stack;

import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.context.Context;
import org.apache.avalon.framework.context.ContextException;
import org.apache.avalon.framework.context.Contextualizable;
import org.apache.avalon.framework.parameters.Parameters;
import org.apache.cocoon.Constants;
import org.apache.cocoon.ProcessingException;
import org.apache.cocoon.caching.CacheableProcessingComponent;
import org.apache.cocoon.components.search.LuceneCocoonHelper;
import org.apache.cocoon.components.search.LuceneXMLIndexer;
import org.apache.cocoon.environment.SourceResolver;
import org.apache.commons.lang.BooleanUtils;
import org.apache.excalibur.source.SourceValidity;
import org.apache.excalibur.source.impl.validity.NOPValidity;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * <p style="font-weight: bold;">A Lucene index creation transformer.</p>
 * <p>This transformer reads a document with elements in the namespace
 * <code>http://apache.org/cocoon/lucene/1.0</code> and creates a new Lucene index,
 * or updates an existing one.</p>
 * <p>It has several parameters which can be set in the sitemap component configuration,
 * as parameters of the transformation step in the pipeline, or as attributes of the root
 * element of the source XML document. The source document overrides the transformation
 * parameters, which in turn override any configuration parameters.</p>
 * <dl>
 * <dt style="font-weight: bold;">directory</dt>
 * <dd><p>Location of the directory where index files are stored.
 * This path is relative to the Cocoon work directory.</p></dd>
 * <dt style="font-weight: bold;">create</dt>
 * <dd><p>This attribute controls whether the index is recreated.</p>
 * <ul><li><p>If <code>create</code>="false" and the index already exists then the index will be updated.
 * Any documents which had already been indexed will be removed from the index and reinserted.</p></li>
 * <li><p>If the index does not exist then it will be created even if <code>create</code>="false".</p></li>
 * <li><p>If <code>create</code>="true" then any existing index will be destroyed and a new index created.
 * If you are rebuilding your entire index then you should set <code>create</code>="true", because the
 * indexer does not need to remove old documents from the index, so it will be faster.</p></li></ul>
 * </dd>
 * <dt style="font-weight: bold;">max-field-length</dt>
 * <dd><p>Maximum number of terms to index in a field (as far as the index is concerned,
 * the document is effectively truncated at this point). The default value, 10,000, may not be
 * sufficient for large documents.</p></dd>
 * <dt style="font-weight: bold;">analyzer</dt>
 * <dd><p>Class name of the Lucene text analyzer to use. Typically depends on the language of the
 * text being indexed. See the Lucene documentation for more information.</p></dd>
 * <dt style="font-weight: bold;">merge-factor</dt>
 * <dd><p>Determines how often segment indices are merged. See the Lucene documentation for more
 * information.</p></dd>
 * <dt style="font-weight: bold;">optimize-frequency</dt>
 * <dd><p>Determines how often the Lucene index will be optimized. When you have thousands of
 * documents, optimizing the index can become quite slow (e.g. 7 seconds for 9000 small documents
 * on a P4).</p>
 * <ul>
 * <li>1: always optimize (default)</li>
 * <li>0: never optimize</li>
 * <li>x: optimize on roughly one update in x. A random draw decides whether a given run optimizes,
 * so with x = 10 about every tenth run triggers an optimize.</li>
 * </ul>
 * </dd>
 * </dl>
 * <dl>
 * <dt style="font-weight: bold;">A simple example of the input:</dt>
 * <dd>
 * <pre><?xml version="1.0" encoding="UTF-8"?>
 * <lucene:index xmlns:lucene="http://apache.org/cocoon/lucene/1.0"
 *     merge-factor="20"
 *     create="false"
 *     directory="index"
 *     max-field-length="10000"
 *     optimize-frequency="1"
 *     analyzer="org.apache.lucene.analysis.standard.StandardAnalyzer">
 *   <lucene:document url="a.html">
 *     <documentTitle lucene:store="true">Doggerel</documentTitle>
 *     <body>The quick brown fox jumped over the lazy dog</body>
 *   </lucene:document>
 *   <lucene:document url="b.html">
 *     <documentTitle lucene:store="true">Lorem Ipsum</documentTitle>
 *     <body>Lorem ipsum dolor sit amet, consectetuer adipiscing elit.</body>
 *     <body>Nunc a mauris blandit ligula scelerisque tristique.</body>
 *   </lucene:document>
 * </lucene:index>
 * </pre>
 * </dd>
 * </dl>
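 * <dl>
 * <dt style="font-weight: bold;">A possible component declaration (illustrative sketch only:
 * the component name "lucene-index" is an assumption, not part of any shipped sitemap; the
 * values shown are simply the built-in defaults documented above):</dt>
 * <dd>
 * <pre><map:transformers>
 *   <map:transformer name="lucene-index"
 *       src="org.apache.cocoon.transformation.LuceneIndexTransformer">
 *     <analyzer-classname>org.apache.lucene.analysis.standard.StandardAnalyzer</analyzer-classname>
 *     <directory>index</directory>
 *     <merge-factor>20</merge-factor>
 *     <max-field-length>10000</max-field-length>
 *     <optimize-frequency>1</optimize-frequency>
 *   </map:transformer>
 * </map:transformers>
 * </pre>
 * <p>These values become the defaults described above; pipeline parameters and attributes of the
 * <code>lucene:index</code> element still override them.</p>
 * </dd>
 * </dl>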
 *
 * @version $Id$
 */
public class LuceneIndexTransformer extends AbstractTransformer
    implements CacheableProcessingComponent, Configurable, Contextualizable {

    public static final String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname";
    public static final String ANALYZER_CLASSNAME_PARAMETER = "analyzer-classname";
    public static final String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer";
    public static final String DIRECTORY_CONFIG = "directory";
    public static final String DIRECTORY_PARAMETER = "directory";
    public static final String DIRECTORY_DEFAULT = "index";
    public static final String MERGE_FACTOR_CONFIG = "merge-factor";
    public static final String MERGE_FACTOR_PARAMETER = "merge-factor";
    public static final int MERGE_FACTOR_DEFAULT = 20;
    public static final String OPTIMIZE_FREQUENCY_CONFIG = "optimize-frequency";
    public static final String OPTIMIZE_FREQUENCY_PARAMETER = "optimize-frequency";
    // by default, optimizing will take place on every update (previous behaviour)
    public static final int OPTIMIZE_FREQUENCY_DEFAULT = 1;
    public static final String MAX_FIELD_LENGTH_CONFIG = "max-field-length";
    public static final String MAX_FIELD_LENGTH_PARAMETER = "max-field-length";
    public static final int MAX_FIELD_LENGTH_DEFAULT = IndexWriter.DEFAULT_MAX_FIELD_LENGTH;

    public static final String LUCENE_URI = "http://apache.org/cocoon/lucene/1.0";
    public static final String LUCENE_QUERY_ELEMENT = "index";
    public static final String LUCENE_QUERY_ANALYZER_ATTRIBUTE = "analyzer";
    public static final String LUCENE_QUERY_DIRECTORY_ATTRIBUTE = "directory";
    public static final String LUCENE_QUERY_CREATE_ATTRIBUTE = "create";
    public static final String LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE = "merge-factor";
    public static final String LUCENE_QUERY_MAX_FIELD_LENGTH_ATTRIBUTE = "max-field-length";
    public static final String LUCENE_QUERY_OPTIMIZE_FREQUENCY_CONFIG_ATTRIBUTE = "optimize-frequency";
    public static final String LUCENE_DOCUMENT_ELEMENT = "document";
    public static final String LUCENE_DOCUMENT_URL_ATTRIBUTE = "url";
    public static final String LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE = "text-attr";
    public static final String LUCENE_ELEMENT_ATTR_STORE_VALUE = "store";
    public static final String LUCENE_ELAPSED_TIME_ATTRIBUTE = "elapsed-time";
    public static final String CDATA = "CDATA";

    // The 3 states of the state machine
    private static final int STATE_GROUND = 0;   // initial or "ground" state
    private static final int STATE_QUERY = 1;    // processing a lucene:index (Query) element
    private static final int STATE_DOCUMENT = 2; // processing a lucene:document element

    // Initialization time variables
    protected File workDir = null;

    // Declaration time parameters values (specified in sitemap component config)
    private IndexerConfiguration configureConfiguration;
    // Invocation time parameters values (specified in sitemap transform parameters)
    private IndexerConfiguration setupConfiguration;
    // Parameters specified in the input document
    private IndexerConfiguration queryConfiguration;

    // Runtime variables
    private int processing;
    private boolean createIndex = false;
    private IndexWriter writer;
    private StringBuffer bodyText;
    private Document bodyDocument;
    private String bodyDocumentURL;
    private Stack elementStack = new Stack();

    /**
     * Storage for the document element's attributes until the document has been
     * indexed, so that they can be copied to the output along with a boolean
     * <code>indexed</code> attribute.
     */
    private AttributesImpl documentAttributes;

    private long documentStartTime;

    /**
     * Class name of the Lucene text analyzer to use. Typically depends on the
     * language of the text being indexed. See the Lucene documentation for more
     * information.
     */
    private String analyzer = ANALYZER_CLASSNAME_DEFAULT;

    /**
     * Location of the directory where index files are stored. This path is
     * relative to the Cocoon work directory.
     */
    private String directory = DIRECTORY_DEFAULT;

    /**
     * Determines how often segment indices are merged. See the Lucene
     * documentation for more information.
     */
    private int mergeFactor = MERGE_FACTOR_DEFAULT;

    /**
     * Maximum number of terms to index in a field (as far as the index is
     * concerned, the document is effectively truncated at this point). The
     * default value, 10,000, may not be sufficient for large documents.
     */
    private int maxFieldLength = MAX_FIELD_LENGTH_DEFAULT;

    /**
     * Determines how often the Lucene index will be optimized.
     */
    private int optimizeFrequency = OPTIMIZE_FREQUENCY_DEFAULT;

    /** Builds the uid term used to identify a document in the index (derived from its URL). */
    private static String uid(String url) {
        return url.replace('/', '\u0000');
    }

    /**
     * Configure the transformer. The configuration parameters are stored as
     * general defaults, which may be overridden by parameters specified in the
     * sitemap pipeline, or by attributes of the query element(s) in the XML
     * input document.
     */
    public void configure(Configuration conf) throws ConfigurationException {
        this.configureConfiguration = new IndexerConfiguration(
                conf.getChild(ANALYZER_CLASSNAME_CONFIG).getValue(ANALYZER_CLASSNAME_DEFAULT),
                conf.getChild(DIRECTORY_CONFIG).getValue(DIRECTORY_DEFAULT),
                conf.getChild(MERGE_FACTOR_CONFIG).getValueAsInteger(MERGE_FACTOR_DEFAULT),
                conf.getChild(MAX_FIELD_LENGTH_CONFIG).getValueAsInteger(MAX_FIELD_LENGTH_DEFAULT),
                conf.getChild(OPTIMIZE_FREQUENCY_CONFIG).getValueAsInteger(OPTIMIZE_FREQUENCY_DEFAULT));
    }

    /**
     * Set up the transformer. Called when the pipeline is assembled. The
     * parameters are those specified as child elements of the
     * <code><map:transform></code> element in the sitemap. These
     * parameters are optional: if no parameters are specified here then the
     * defaults are supplied by the component configuration. Any parameters
     * specified here may be overridden by attributes of the lucene:index
     * element in the input document.
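     *
     * <p>For example (an illustrative sketch; the transformer type name "lucene-index" is an
     * assumption, see the component declaration example in the class documentation):</p>
     * <pre><map:transform type="lucene-index">
     *   <map:parameter name="directory" value="myindex"/>
     *   <map:parameter name="optimize-frequency" value="10"/>
     * </map:transform>
     * </pre>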
     */
    public void setup(SourceResolver resolver, Map objectModel, String src, Parameters parameters)
    throws ProcessingException, SAXException, IOException {
        setupConfiguration = new IndexerConfiguration(
                parameters.getParameter(ANALYZER_CLASSNAME_PARAMETER,
                        configureConfiguration.analyzerClassname),
                parameters.getParameter(DIRECTORY_PARAMETER,
                        configureConfiguration.indexDirectory),
                parameters.getParameterAsInteger(MERGE_FACTOR_PARAMETER,
                        configureConfiguration.indexerMergeFactor),
                parameters.getParameterAsInteger(MAX_FIELD_LENGTH_PARAMETER,
                        configureConfiguration.indexerMaxFieldLength),
                parameters.getParameterAsInteger(OPTIMIZE_FREQUENCY_PARAMETER,
                        configureConfiguration.indexerOptimizeFrequency));
    }

    /**
     * Contextualize this class.
     */
    public void contextualize(Context context) throws ContextException {
        this.workDir = (File) context.get(Constants.CONTEXT_WORK_DIR);
    }

    /**
     * @see org.apache.cocoon.xml.AbstractXMLProducer#recycle()
     */
    public void recycle() {
        this.processing = STATE_GROUND;
        if (this.writer != null) {
            try {
                this.writer.close();
            } catch (IOException ioe) {
                // ignore errors while closing the writer during recycling
            }
            this.writer = null;
        }
        this.bodyText = null;
        this.bodyDocument = null;
        this.bodyDocumentURL = null;
        this.elementStack.clear();
        super.recycle();
    }

    /**
     * Generate the unique key. This key must be unique inside the space of this
     * component.
     *
     * @return The generated key
     */
    public Serializable getKey() {
        return "1";
    }

    /**
     * Generate the validity object.
     *
     * @return The generated validity object or <code>null</code> if the
     *         component is currently not cacheable.
     */
    public SourceValidity getValidity() {
        return NOPValidity.SHARED_INSTANCE;
    }

    public void startDocument() throws SAXException {
        super.startDocument();
    }

    public void endDocument() throws SAXException {
        super.endDocument();
    }

    /**
     * Begin the scope of a prefix-URI Namespace mapping.
     *
     * @param prefix The Namespace prefix being declared.
     * @param uri    The Namespace URI the prefix is mapped to.
     */
    public void startPrefixMapping(String prefix, String uri) throws SAXException {
        if (processing == STATE_GROUND) {
            super.startPrefixMapping(prefix, uri);
        }
    }

    /**
     * End the scope of a prefix-URI mapping.
     *
     * @param prefix The prefix that was being mapped.
     */
    public void endPrefixMapping(String prefix) throws SAXException {
        if (processing == STATE_GROUND) {
            super.endPrefixMapping(prefix);
        }
    }

    public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
    throws SAXException {
        if (processing == STATE_GROUND) {
            if (LUCENE_URI.equals(namespaceURI) && LUCENE_QUERY_ELEMENT.equals(localName)) {
                String sCreate = atts.getValue(LUCENE_QUERY_CREATE_ATTRIBUTE);
                createIndex = BooleanUtils.toBoolean(sCreate);

                String analyzerClassname = atts.getValue(LUCENE_QUERY_ANALYZER_ATTRIBUTE);
                String indexDirectory = atts.getValue(LUCENE_QUERY_DIRECTORY_ATTRIBUTE);
                String mergeFactorStr = atts.getValue(LUCENE_QUERY_MERGE_FACTOR_ATTRIBUTE);
                String maxFieldLengthStr = atts.getValue(LUCENE_QUERY_MAX_FIELD_LENGTH_ATTRIBUTE);
                String optimizeFrequencyStr = atts.getValue(LUCENE_QUERY_OPTIMIZE_FREQUENCY_CONFIG_ATTRIBUTE);

                queryConfiguration = new IndexerConfiguration(
                        analyzerClassname != null ? analyzerClassname
                                : setupConfiguration.analyzerClassname,
                        indexDirectory != null ? indexDirectory
                                : setupConfiguration.indexDirectory,
                        mergeFactorStr != null ? Integer.parseInt(mergeFactorStr)
                                : setupConfiguration.indexerMergeFactor,
                        maxFieldLengthStr != null ? Integer.parseInt(maxFieldLengthStr)
                                : setupConfiguration.indexerMaxFieldLength,
                        optimizeFrequencyStr != null ? Integer.parseInt(optimizeFrequencyStr)
                                : setupConfiguration.indexerOptimizeFrequency);

                if (!createIndex) {
                    // Not asked to create the index - but check if this is necessary anyway:
                    try {
                        IndexReader reader = openReader();
                        reader.close();
                    } catch (IOException ioe) {
                        // couldn't open the index - so recreate it
                        createIndex = true;
                    }
                }

                // propagate the lucene:index to the next stage in the pipeline
                super.startElement(namespaceURI, localName, qName, atts);
                processing = STATE_QUERY;
            } else {
                super.startElement(namespaceURI, localName, qName, atts);
            }
        } else if (processing == STATE_QUERY) {
            // processing a lucene:index - expecting a lucene:document
            if (LUCENE_URI.equals(namespaceURI) && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
                this.bodyDocumentURL = atts.getValue(LUCENE_DOCUMENT_URL_ATTRIBUTE);
                if (this.bodyDocumentURL == null) {
                    throw new SAXException("<lucene:document> must have @url attribute");
                }

                // Remember the time the document indexing began
                this.documentStartTime = System.currentTimeMillis();
                // remember these attributes so they can be passed on to the next stage in the pipeline,
                // when this document element is ended.
                this.documentAttributes = new AttributesImpl(atts);
                this.bodyText = new StringBuffer();
                this.bodyDocument = new Document();
                this.elementStack.clear();
                processing = STATE_DOCUMENT;
            } else {
                throw new SAXException("<lucene:index> element can contain only <lucene:document> elements!");
            }
        } else if (processing == STATE_DOCUMENT) {
            elementStack.push(new IndexHelperField(localName, new AttributesImpl(atts)));
        }
    }

    public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
        if (processing == STATE_QUERY) {
            if (LUCENE_URI.equals(namespaceURI) && LUCENE_QUERY_ELEMENT.equals(localName)) {
                if (needToOptimize()) {
                    // End query processing
                    try {
                        if (this.writer == null) {
                            openWriter();
                        }
                        this.writer.optimize();
                        this.writer.close();
                        this.writer = null;
                    } catch (IOException e) {
                        throw new SAXException(e);
                    }
                }
                // propagate the query element to the next stage in the pipeline
                super.endElement(namespaceURI, localName, qName);
                this.processing = STATE_GROUND;
            } else {
                throw new SAXException("</lucene:index> was expected!");
            }
        } else if (processing == STATE_DOCUMENT) {
            if (LUCENE_URI.equals(namespaceURI) && LUCENE_DOCUMENT_ELEMENT.equals(localName)) {
                // End document processing
                this.bodyDocument.add(Field.UnStored(LuceneXMLIndexer.BODY_FIELD, this.bodyText.toString()));
                this.bodyText = null;
                this.bodyDocument.add(Field.UnIndexed(LuceneXMLIndexer.URL_FIELD, this.bodyDocumentURL));
                // store: false, index: true, tokenize: false
                this.bodyDocument.add(new Field(LuceneXMLIndexer.UID_FIELD, uid(this.bodyDocumentURL),
                        false, true, false));
                try {
                    reindexDocument();
                } catch (IOException e) {
                    throw new SAXException(e);
                }
                this.bodyDocumentURL = null;

                // propagate the lucene:document element to the next stage in the pipeline
                long elapsedTime = System.currentTimeMillis() - this.documentStartTime;
                this.documentAttributes.addAttribute("", LUCENE_ELAPSED_TIME_ATTRIBUTE,
                        LUCENE_ELAPSED_TIME_ATTRIBUTE, CDATA, String.valueOf(elapsedTime));
                super.startElement(namespaceURI, localName, qName, this.documentAttributes);
                super.endElement(namespaceURI, localName, qName);
                this.processing = STATE_QUERY;
            } else {
                // End element processing
                IndexHelperField tos = (IndexHelperField) elementStack.pop();
                StringBuffer text = tos.getText();

                Attributes atts = tos.getAttributes();
                boolean attributesToText = atts.getIndex(LUCENE_URI, LUCENE_ELEMENT_ATTR_TO_TEXT_ATTRIBUTE) != -1;
                for (int i = 0; i < atts.getLength(); i++) {
                    // Ignore Lucene attributes
                    if (LUCENE_URI.equals(atts.getURI(i))) {
                        continue;
                    }

                    String atts_lname = atts.getLocalName(i);
                    String atts_value = atts.getValue(i);
                    bodyDocument.add(Field.UnStored(localName + "@" + atts_lname, atts_value));
                    if (attributesToText) {
                        text.append(atts_value);
                        text.append(' ');
                        bodyText.append(atts_value);
                        bodyText.append(' ');
                    }
                }

                boolean store = atts.getIndex(LUCENE_URI, LUCENE_ELEMENT_ATTR_STORE_VALUE) != -1;
                if (text != null && text.length() > 0) {
                    if (store) {
                        bodyDocument.add(Field.Text(localName, text.toString()));
                    } else {
                        bodyDocument.add(Field.UnStored(localName, text.toString()));
                    }
                }
            }
        } else {
            // All other tags
            super.endElement(namespaceURI, localName, qName);
        }
    }

    public void characters(char[] ch, int start, int length) throws SAXException {
        // Note: text runs of a single character are skipped here (length > 1).
        if (processing == STATE_DOCUMENT && ch.length > 0 && start >= 0 && length > 1
                && elementStack.size() > 0) {
            String text = new String(ch, start, length);
            ((IndexHelperField) elementStack.peek()).append(text);
            bodyText.append(text);
            bodyText.append(' ');
        } else if (processing == STATE_GROUND) {
            super.characters(ch, start, length);
        }
    }

    private void openWriter() throws IOException {
        File indexDirectory = new File(queryConfiguration.indexDirectory);
        if (!indexDirectory.isAbsolute()) {
            indexDirectory = new File(workDir, queryConfiguration.indexDirectory);
        }

        // If the index directory doesn't exist, then always create it.
        boolean indexExists = IndexReader.indexExists(indexDirectory);
        if (!indexExists) {
            createIndex = true;
        }

        // Get the index directory, creating it if necessary
        Directory directory = LuceneCocoonHelper.getDirectory(indexDirectory, createIndex);
        Analyzer analyzer = LuceneCocoonHelper.getAnalyzer(queryConfiguration.analyzerClassname);
        this.writer = new IndexWriter(directory, analyzer, createIndex);
        this.writer.mergeFactor = queryConfiguration.indexerMergeFactor;
        this.writer.maxFieldLength = queryConfiguration.indexerMaxFieldLength;
    }

    private IndexReader openReader() throws IOException {
        File indexDirectory = new File(queryConfiguration.indexDirectory);
        if (!indexDirectory.isAbsolute()) {
            indexDirectory = new File(workDir, queryConfiguration.indexDirectory);
        }

        Directory directory = LuceneCocoonHelper.getDirectory(indexDirectory, createIndex);
        IndexReader reader = IndexReader.open(directory);
        return reader;
    }

    private void reindexDocument() throws IOException {
        if (this.createIndex) {
            // The index is being created, so there's no need to delete the doc from an existing index.
            // This means we can keep a single IndexWriter open throughout the process.
            if (this.writer == null) {
                openWriter();
            }
            this.writer.addDocument(this.bodyDocument);
        } else {
            // This is an incremental reindex, so the document should be removed from the index
            // before adding it
            try {
                IndexReader reader = openReader();
                reader.delete(new Term(LuceneXMLIndexer.UID_FIELD, uid(this.bodyDocumentURL)));
                reader.close();
            } catch (IOException e) {
                /* ignore */
            }
            openWriter();
            this.writer.addDocument(this.bodyDocument);
            this.writer.close();
            this.writer = null;
        }
        this.bodyDocument = null;
    }

    private static class IndexHelperField {
        String localName;
        StringBuffer text;
        Attributes attributes;

        IndexHelperField(String localName, Attributes atts) {
            this.localName = localName;
            this.attributes = atts;
            this.text = new StringBuffer();
        }

        Attributes getAttributes() {
            return attributes;
        }

        StringBuffer getText() {
            return text;
        }

        void append(String text) {
            this.text.append(text);
        }

        void append(char[] str, int offset, int length) {
            this.text.append(str, offset, length);
        }
    }

    private static class IndexerConfiguration {
        String analyzerClassname;
        String indexDirectory;
        int indexerMergeFactor;
        int indexerMaxFieldLength;
        int indexerOptimizeFrequency;

        IndexerConfiguration(String analyzerClassname, String indexDirectory,
                int indexerMergeFactor, int indexerMaxFieldLength, int indexerOptimizeFrequency) {
            this.analyzerClassname = analyzerClassname;
            this.indexDirectory = indexDirectory;
            this.indexerMergeFactor = indexerMergeFactor;
            this.indexerMaxFieldLength = indexerMaxFieldLength;
            this.indexerOptimizeFrequency = indexerOptimizeFrequency;
        }
    }

    /**
     * Checks whether, based on the configuration (optimize-frequency option),
     * the Lucene index should be optimized. It uses a random number generator
     * to decide, because optimizing large indexes becomes quite slow.
     *
     * From the Lucene documentation: the IndexWriter class supports an
     * optimize() method that compacts the index and speeds up queries.
     * You may want to use this method after performing a complete indexing of
     * your document set, or after incremental updates of the index. If your
     * incremental updates add documents frequently, you may want to perform the
     * optimization only once in a while to avoid the extra overhead.
     *
     * @return true if we should optimize the index
     */
    private boolean needToOptimize() {
        int optimizeFrequency = queryConfiguration.indexerOptimizeFrequency;
        if (optimizeFrequency == 0) {
            return false;
        }
        if (optimizeFrequency == 1) {
            return true;
        }

        // use a random draw so that optimization happens with probability 1/optimizeFrequency
        int randomInt = 1 + (int) (Math.random() * optimizeFrequency);
        return randomInt == 1;
    }

    /**
     * @return the analyzer
     */
    public String getAnalyzer() {
        return analyzer;
    }

    /**
     * @param analyzer the analyzer to set
     */
    public void setAnalyzer(String analyzer) {
        this.analyzer = analyzer;
    }

    /**
     * @return the directory
     */
    public String getDirectory() {
        return directory;
    }

    /**
     * @param directory the directory to set
     */
    public void setDirectory(String directory) {
        this.directory = directory;
    }

    /**
     * @return the mergeFactor
     */
    public int getMergeFactor() {
        return mergeFactor;
    }

    /**
     * @param mergeFactor the mergeFactor to set
     */
    public void setMergeFactor(int mergeFactor) {
        this.mergeFactor = mergeFactor;
    }

    /**
     * @return the maxFieldLength
     */
    public int getMaxFieldLength() {
        return maxFieldLength;
    }

    /**
     * @param maxFieldLength the maxFieldLength to set
     */
    public void setMaxFieldLength(int maxFieldLength) {
        this.maxFieldLength = maxFieldLength;
    }

    /**
     * @return the optimizeFrequency
     */
    public int getOptimizeFrequency() {
        return optimizeFrequency;
    }

    /**
     * @param optimizeFrequency the optimizeFrequency to set
     */
    public void setOptimizeFrequency(int optimizeFrequency) {
        this.optimizeFrequency = optimizeFrequency;
    }
}