/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.cocoon.components.search; import org.apache.avalon.framework.activity.Disposable; import org.apache.avalon.framework.configuration.Configurable; import org.apache.avalon.framework.configuration.Configuration; import org.apache.avalon.framework.configuration.ConfigurationException; import org.apache.avalon.framework.logger.AbstractLogEnabled; import org.apache.avalon.framework.service.ServiceException; import org.apache.avalon.framework.service.ServiceManager; import org.apache.avalon.framework.service.Serviceable; import org.apache.cocoon.ProcessingException; import org.apache.cocoon.components.crawler.CocoonCrawler; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermEnum; import org.apache.lucene.store.Directory; import java.io.IOException; import java.net.URL; import java.util.Iterator; /** * A lucene indexer. * * <p> * XML documents are indexed using lucene. * Links to XML documents are supplied by * a crawler, requesting links of documents by specifying a cocoon-view, and * HTTP protocol. * </p> * * @author <a href="mailto:berni_huber@a1.net">Bernhard Huber</a> * @version CVS $Id$ */ public class SimpleLuceneCocoonIndexerImpl extends AbstractLogEnabled implements LuceneCocoonIndexer, Configurable, Serviceable, Disposable { /** * configuration tagname for specifying the analyzer class */ public final static String ANALYZER_CLASSNAME_CONFIG = "analyzer-classname"; /** * configuration default analyzer class */ public final static String ANALYZER_CLASSNAME_DEFAULT = "org.apache.lucene.analysis.standard.StandardAnalyzer"; /** * configuration tagname for specifying lucene's index directory */ public final static String DIRECTORY_CONFIG = "directory"; /** * configuration default directory, ie. no default. */ public final static String DIRECTORY_DEFAULT = null; /** * configuration tagname for specifying lucene's merge factor. */ public final static String MERGE_FACTOR_CONFIG = "merge-factor"; /** * configuration default value for * <a href="http://www.mail-archive.com/lucene-user@jakarta.apache.org/msg00373.html">lucene's merge factor</a>. */ public final static int MERGE_FACTOR_DEFAULT = 10; /** * The service manager for looking up components used. */ protected ServiceManager manager = null; /** The used lucene analyzer */ protected Analyzer analyzer; // private String analyzerClassnameDefault = ANALYZER_CLASSNAME_DEFAULT; /** The Lucene Merge Factor */ private int mergeFactor = MERGE_FACTOR_DEFAULT; /** * Sets the analyzer attribute of the SimpleLuceneCocoonIndexerImpl object * * @param analyzer * The new analyzer value */ public void setAnalyzer(Analyzer analyzer) { this.analyzer = analyzer; } /** * Configure this component. * * @param conf is the configuration * @exception ConfigurationException is thrown if configuring fails */ public void configure(Configuration conf) throws ConfigurationException { Configuration child; /* child = conf.getChild(ANALYZER_CLASSNAME_CONFIG, false); if (child != null) { // fix Bugzilla Bug 25277, use child.getValue // and in all following blocks String value = child.getValue(ANALYZER_CLASSNAME_DEFAULT); if (value != null) { analyzerClassnameDefault = value; } } */ child = conf.getChild(MERGE_FACTOR_CONFIG, false); if (child != null) { // fix Bugzilla Bug 25277, use child instead of conf int int_value = child.getValueAsInteger(MERGE_FACTOR_DEFAULT); mergeFactor = int_value; } } /** * Set the current <code>ServiceManager</code> instance used by this * <code>Serviceable</code>. * * @param manager used by this component * @exception ServiceException is never thrown */ public void service(ServiceManager manager) throws ServiceException { this.manager = manager; } /** * Dispose this component. */ public void dispose() { } /** * index content of base_url, index content of links from base_url. * * @param index * the lucene store to write the index to * @param create * if true create, or overwrite existing index, else update * existing index. * @param base_url * index content of base_url, and crawl through all its links * recursivly. * @exception ProcessingException * is thrown if indexing fails */ public void index(Directory index, boolean create, URL base_url) throws ProcessingException { IndexWriter writer = null; LuceneXMLIndexer lxi = null; CocoonCrawler cocoonCrawler = null; try { lxi = (LuceneXMLIndexer) manager.lookup(LuceneXMLIndexer.ROLE); writer = new IndexWriter(index, analyzer, create); writer.mergeFactor = this.mergeFactor; cocoonCrawler = (CocoonCrawler) manager.lookup(CocoonCrawler.ROLE); cocoonCrawler.crawl(base_url); Iterator cocoonCrawlerIterator = cocoonCrawler.iterator(); while (cocoonCrawlerIterator.hasNext()) { URL crawl_url = (URL) cocoonCrawlerIterator.next(); // result of fix Bugzilla Bug 25270, in SimpleCocoonCrawlerImpl // check if crawl_url is null if (crawl_url == null) { continue; } else if (!crawl_url.getHost().equals(base_url.getHost()) || crawl_url.getPort() != base_url.getPort()) { // skip urls using different host, or port than host, // or port of base url if (getLogger().isDebugEnabled()) { getLogger().debug("Skipping crawling URL " + crawl_url.toString() + " as base_url is " + base_url.toString()); } continue; } // build lucene documents from the content of the crawl_url Iterator i = lxi.build(crawl_url).iterator(); // add all built lucene documents while (i.hasNext()) { writer.addDocument((Document) i.next()); } } // optimize it writer.optimize(); } catch (IOException ioe) { throw new ProcessingException("IOException in index()", ioe); } catch (ServiceException se) { throw new ProcessingException("Could not lookup service in index()", se); } finally { if (writer != null) { try { writer.close(); } catch (IOException ioe) { } writer = null; } if (lxi != null) { manager.release(lxi); lxi = null; } if (cocoonCrawler != null) { manager.release(cocoonCrawler); cocoonCrawler = null; } } } /** * A document iterator deleting "old" documents form the index. * * TODO: use this class before indexing, in non-creating mode. */ static class DocumentDeletableIterator { private IndexReader reader; // existing index private TermEnum uidIter; // document id iterator /** * Constructor for the DocumentDeletableIterator object * * @param directory * Description of Parameter * @exception IOException * Description of Exception */ public DocumentDeletableIterator(Directory directory) throws IOException { reader = IndexReader.open(directory); // open existing index uidIter = reader.terms(new Term("uid", "")); // init uid iterator } /** * Description of the Method * * @exception IOException * Description of Exception */ public void deleteAllStaleDocuments() throws IOException { while (uidIter.term() != null && uidIter.term().field().equals("uid")) { reader.delete(uidIter.term()); uidIter.next(); } } /** * Description of the Method * * @param uid * Description of Parameter * @exception IOException * Description of Exception */ public void deleteModifiedDocuments(String uid) throws IOException { while (documentHasBeenModified(uidIter.term(), uid)) { reader.delete(uidIter.term()); uidIter.next(); } if (documentHasNotBeenModified(uidIter.term(), uid)) { uidIter.next(); } } /** * Description of the Method * * @exception Throwable * Description of Exception */ protected void finalize() throws Throwable { super.finalize(); if (uidIter != null) { uidIter.close(); // close uid iterator uidIter = null; } if (reader != null) { reader.close(); // close existing index reader = null; } } /** * Description of the Method * * @param term * Description of Parameter * @return Description of the Returned Value */ boolean documentIsDeletable(Term term) { return term != null && term.field().equals("uid"); } /** * Description of the Method * * @param term * Description of Parameter * @param uid * Description of Parameter * @return Description of the Returned Value */ boolean documentHasBeenModified(Term term, String uid) { return documentIsDeletable(term) && term.text().compareTo(uid) < 0; } /** * Description of the Method * * @param term * Description of Parameter * @param uid * Description of Parameter * @return Description of the Returned Value */ boolean documentHasNotBeenModified(Term term, String uid) { return documentIsDeletable(term) && term.text().compareTo(uid) == 0; } } }