/*
 * Copyright 2002-2005 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.springmodules.lucene.index.object.directory;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.springmodules.lucene.index.LuceneIndexAccessException;
import org.springmodules.lucene.index.LuceneIndexingException;
import org.springmodules.lucene.index.document.handler.DocumentHandler;
import org.springmodules.lucene.index.document.handler.DocumentHandlerManager;
import org.springmodules.lucene.index.document.handler.file.AbstractInputStreamDocumentHandler;
import org.springmodules.lucene.index.factory.IndexFactory;
import org.springmodules.lucene.index.factory.IndexWriterFactoryUtils;
import org.springmodules.lucene.index.factory.LuceneIndexWriter;
import org.springmodules.lucene.index.object.AbstractDocumentManagerIndexer;
import org.springmodules.lucene.util.IOUtils;

/**
 * <b>This is the central class in the lucene directory indexing package.</b>
 * It simplifies the use of Lucene to index a directory: the caller specifies
 * the base directory, and the registered document handlers define how every
 * file contained in this directory and its sub directories is indexed.
 * It helps to avoid common errors and to manage the index resources in a
 * flexible manner.
 * It executes the core Lucene workflow, leaving application code to focus on
 * the way to create Lucene documents from the content of a file.
 *
 * <p>This class is based on a DocumentHandlerManager instance that can be
 * injected with IoC. So several document handlers for different file types
 * can be registered declaratively or using the registration methods of this
 * class. By default, only the text handler is registered for files
 * with the "txt" extension.
 *
 * <p>This class is based on the IndexFactory abstraction which is a
 * factory to create an IndexWriter for the configured Directory. The same
 * writer is used for the whole indexing run of a base directory and it is
 * handed back to the IndexWriterFactoryUtils class at the end of the run.
 * So the indexer doesn't hold index resources between two runs, which
 * avoids some locking problems on the index. You can also apply
 * different strategies for managing index resources.
 *
 * <p>Can be used within a service implementation via direct instantiation
 * with an IndexFactory reference, or get prepared in an application context
 * and given to services as bean reference. Note: The IndexFactory should
 * always be configured as a bean in the application context, in the first case
 * given to the service directly, in the second case to the prepared template.
 *
 * @author Thierry Templier
 * @see org.springmodules.lucene.index.object.AbstractIndexer
 * @see org.springmodules.lucene.index.object.AbstractDocumentManagerIndexer
 * @see org.springmodules.lucene.index.factory.IndexFactory
 * @see org.springmodules.lucene.index.document.handler.DocumentHandlerManager
 * @see org.springmodules.lucene.index.support.file.ExtensionDocumentHandlerManager
 * @see org.springmodules.lucene.index.support.file.ExtensionDocumentHandlerManagerFactoryBean
 * @see org.springmodules.lucene.index.support.file.DocumentMatching
 * @see org.springmodules.lucene.index.document.handler.DocumentHandler
 * @see org.springmodules.lucene.index.object.directory.FileDocumentIndexingListener
 * @see org.springmodules.lucene.index.factory.IndexWriterFactoryUtils#getIndexWriter(IndexFactory)
 * @see org.springmodules.lucene.index.factory.IndexWriterFactoryUtils#releaseIndexWriter(IndexFactory, LuceneIndexWriter)
 */
public class DefaultDirectoryIndexer extends AbstractDocumentManagerIndexer implements DirectoryIndexer {

	/** Listeners (FileDocumentIndexingListener instances) notified during an indexing run. */
	private List listeners;

	/**
	 * Construct a new DirectoryIndexer, given an IndexFactory to obtain IndexWriter.
	 * No DocumentHandlerManager is supplied (<code>null</code> is passed on).
	 *
	 * @param indexFactory IndexFactory to obtain IndexWriter
	 */
	public DefaultDirectoryIndexer(IndexFactory indexFactory) {
		this(indexFactory, null);
	}

	/**
	 * Construct a new DirectoryIndexer, given an IndexFactory to obtain IndexWriter
	 * and a DocumentHandlerManager which has been configured with Spring.
	 *
	 * @param indexFactory IndexFactory to obtain IndexWriter
	 * @param documentHandlerManager DocumentHandlerManager which will be used by the indexer
	 */
	public DefaultDirectoryIndexer(IndexFactory indexFactory, DocumentHandlerManager documentHandlerManager) {
		setIndexFactory(indexFactory);
		init(documentHandlerManager);
	}

	/**
	 * Delegates the initialization to the parent class and creates the
	 * (initially empty) list of listeners.
	 *
	 * @param documentHandlerManager the manager handed over to the parent initialization
	 */
	protected void init(DocumentHandlerManager documentHandlerManager) {
		super.init(documentHandlerManager);
		this.listeners = new ArrayList();
	}

	/**
	 * This method is used to add a listener to be notified during the
	 * indexing execution. A <code>null</code> listener is ignored.
	 *
	 * @param listener the listener to add
	 */
	public void addListener(FileDocumentIndexingListener listener) {
		if( listener!=null ) {
			listeners.add(listener);
		}
	}

	/**
	 * This method is used to remove a specified listener.
	 * A <code>null</code> listener is ignored.
	 *
	 * @param listener the listener to remove
	 */
	public void removeListener(FileDocumentIndexingListener listener) {
		if( listener!=null ) {
			listeners.remove(listener);
		}
	}

	/**
	 * This method is used to get the list of listeners to notify
	 * during the indexing execution. The internal list itself is
	 * returned, not a copy.
	 *
	 * @return the list of listeners to notify
	 */
	public List getListeners() {
		return listeners;
	}

	/**
	 * This method is used to fire the "on before directory" event to
	 * every listener.
	 *
	 * <p>This event is fired before the indexing of a directory.
	 *
	 * @param file the directory which will be indexed
	 */
	protected void fireListenersOnBeforeDirectory(File file) {
		for(Iterator i = listeners.iterator(); i.hasNext();) {
			FileDocumentIndexingListener listener = (FileDocumentIndexingListener)i.next();
			listener.beforeIndexingDirectory(file);
		}
	}

	/**
	 * This method is used to fire the "on after directory" event to
	 * every listener.
	 *
	 * <p>This event is fired after the indexing of a directory, even
	 * if there have been errors during the indexing of its files.
	 *
	 * @param file the directory which has been indexed
	 */
	protected void fireListenersOnAfterDirectory(File file) {
		for(Iterator i = listeners.iterator(); i.hasNext();) {
			FileDocumentIndexingListener listener = (FileDocumentIndexingListener)i.next();
			listener.afterIndexingDirectory(file);
		}
	}

	/**
	 * This method is used to fire the "on before file" event to
	 * every listener.
	 *
	 * <p>This event is fired before the indexing of a file, whether
	 * or not its indexing succeeds afterwards.
	 *
	 * @param file the file which will be indexed
	 */
	protected void fireListenersOnBeforeFile(File file) {
		for(Iterator i = listeners.iterator(); i.hasNext();) {
			FileDocumentIndexingListener listener = (FileDocumentIndexingListener)i.next();
			listener.beforeIndexingFile(file);
		}
	}

	/**
	 * This method is used to fire the "on after file" event to
	 * every listener.
	 *
	 * <p>This event is fired after the indexing of a file. It is
	 * not fired if there is an indexing error.
	 *
	 * @param file the file which has been indexed
	 */
	protected void fireListenersOnAfterFile(File file) {
		for(Iterator i = listeners.iterator(); i.hasNext();) {
			FileDocumentIndexingListener listener = (FileDocumentIndexingListener)i.next();
			listener.afterIndexingFile(file);
		}
	}

	/**
	 * This method is used to fire the "on error file" event to
	 * every listener.
	 *
	 * <p>This event is fired if there is an indexing error.
	 *
	 * @param file the file on which the error occurred
	 * @param ex the exception raised while indexing the file
	 */
	protected void fireListenersOnErrorFile(File file,Exception ex) {
		for(Iterator i = listeners.iterator(); i.hasNext();) {
			FileDocumentIndexingListener listener = (FileDocumentIndexingListener)i.next();
			listener.onErrorIndexingFile(file,ex);
		}
	}

	/**
	 * This method is used to fire the "on no handler available" event to
	 * every listener.
	 *
	 * <p>This event is fired if there is no matching handler to index
	 * the file.
	 *
	 * @param file the file to index
	 */
	protected void fireListenersOnNoHandlerAvailable(File file) {
		for(Iterator i = listeners.iterator(); i.hasNext();) {
			FileDocumentIndexingListener listener = (FileDocumentIndexingListener)i.next();
			listener.onNotAvailableHandler(file);
		}
	}

	/**
	 * This method parses the directory, indexes every file it contains and
	 * calls itself recursively for all its sub directories in order
	 * to index them. Entries are processed in the order of their names.
	 *
	 * <p>This method also fires the events corresponding to the indexing:
	 * the "before directory" event at the beginning and the "after directory"
	 * event at the end. Both events are fired as a pair, including when the
	 * content of the directory cannot be listed.
	 *
	 * @param writer the writer used to index files
	 * @param dirToParse the base directory to index
	 * @throws IOException if thrown by a Lucene method, to be converted
	 * by the caller
	 * @see FileDocumentIndexingListener
	 */
	private void indexDirectory(LuceneIndexWriter writer, File dirToParse) throws IOException {
		fireListenersOnBeforeDirectory(dirToParse);

		// File.listFiles() returns null when the path is not a directory or
		// when an I/O error occurs. There is nothing to index in that case,
		// but the "after directory" event must still balance the "before" one.
		File[] files = dirToParse.listFiles();
		if( files!=null ) {
			// Sort by name so that the indexing order does not depend on the file system.
			Arrays.sort(files, new Comparator() {
				public int compare(Object o1, Object o2) {
					File f1 = (File) o1;
					File f2 = (File) o2;
					return f1.getName().compareTo(f2.getName());
				}
			});

			for(int cpt=0; cpt<files.length; cpt++) {
				File currentFile = files[cpt];
				if( currentFile.isDirectory() ) {
					indexDirectory(writer, currentFile);
				} else {
					indexFile(writer, currentFile);
				}
			}
		}

		fireListenersOnAfterDirectory(dirToParse);
	}

	/**
	 * This method builds the description of the file to pass as parameter
	 * to the handler (the absolute path of the file, stored under the
	 * {@link AbstractInputStreamDocumentHandler#FILENAME} key) and invokes
	 * the handler to get the document for the file.
	 *
	 * @param file the file to index
	 * @param inputStream the corresponding input stream
	 * @param handler the handler to use to index the file
	 * @return the document returned by the handler
	 * @throws Exception if thrown by the handler
	 */
	private Document doCallHandler(File file,FileInputStream inputStream,
										DocumentHandler handler) throws Exception {
		Map description = new HashMap();
		description.put(AbstractInputStreamDocumentHandler.FILENAME, file.getAbsolutePath());
		return handler.getDocument(description, inputStream);
	}

	/**
	 * This method indexes a file if there is a registered document handler
	 * which matches. The input stream opened on the file is always handed
	 * to {@link IOUtils#closeInputStream}, even if there are errors during
	 * the indexing.
	 *
	 * <p>This method also fires the events corresponding to the indexing.
	 * At the beginning, the beforeIndexingFile method is called
	 * on every listener. If there is no matching document handler, the
	 * onNotAvailableHandler method is called on every listener. If the
	 * indexing of the file is correctly done, the afterIndexingFile method
	 * is called; otherwise the error is logged and the onErrorIndexingFile
	 * method is called. An error on a file is never propagated, so that the
	 * remaining files can still be indexed.
	 *
	 * @param writer the writer used to index files
	 * @param file the file to index
	 * @throws IOException declared for the callers; errors raised while
	 * indexing the file are reported to the listeners instead
	 * @see #doCallHandler(File, FileInputStream, DocumentHandler)
	 * @see FileDocumentIndexingListener
	 */
	private void indexFile(LuceneIndexWriter writer, File file) throws IOException {
		fireListenersOnBeforeFile(file);

		DocumentHandler handler = doGetDocumentHandler(file);
		if( handler==null ) {
			fireListenersOnNoHandlerAvailable(file);
			return;
		}

		FileInputStream inputStream = null;
		try {
			inputStream = new FileInputStream(file);
			Document document = doCallHandler(file, inputStream, handler);
			if( document!=null ) {
				writer.addDocument(document);
			}
			fireListenersOnAfterFile(file);
		} catch(Exception ex) {
			// I/O errors and handler errors are treated the same way:
			// logged, then reported to the listeners.
			logger.error("Error during indexing the file "+file.getName(), ex);
			fireListenersOnErrorFile(file, ex);
		} finally {
			IOUtils.closeInputStream(inputStream);
		}
	}

	/**
	 * This method is used to intercept the exception raised if the
	 * handler corresponding to the file is not found.
	 *
	 * The returned document handler in this case is null.
	 *
	 * @param file the file to use
	 * @return the corresponding document handler, or <code>null</code>
	 * if the lookup failed
	 */
	private DocumentHandler doGetDocumentHandler(File file) {
		try {
			return getDocumentHandler(file.getPath());
		} catch(Exception ignored) {
			// Deliberately ignored: a failed lookup means "no handler" and
			// the caller reports it through the "no handler available" event.
			return null;
		}
	}

	/**
	 * This method is the entry point to index a directory recursively. It uses
	 * the registered document handlers to index every file.
	 *
	 * <p>In this case, the index will not be optimized.
	 *
	 * @param dirToParse the base directory to index recursively
	 * @see #index(String, boolean)
	 */
	public void index(String dirToParse) {
		index(dirToParse, false);
	}

	/**
	 * This method checks if the directory to parse exists.
	 *
	 * @param dirToParse the directory to check
	 * @return true if it exists, otherwise false
	 */
	private boolean checkBaseDirectory(String dirToParse) {
		File dir = new File(dirToParse);
		return dir.exists();
	}

	/**
	 * This method is the entry point to index a directory recursively. It uses
	 * the registered document handlers to index every file. If the given path
	 * denotes a plain file, only this file is indexed.
	 *
	 * <p>In this case, the index will be optimized at the end
	 * if the value of the optimizeIndex parameter is true.
	 *
	 * <p>If there is an error during the indexing of a file, the other files
	 * are still indexed. However the error is notified to the registered
	 * listeners.
	 *
	 * <p>This method gets a writer from the IndexWriterFactoryUtils
	 * class and hands it back at the end in every case.
	 *
	 * @param dirToParse the base directory to index recursively
	 * @param optimizeIndex if the index must be optimized after
	 * the indexing
	 * @throws LuceneIndexingException if the base path doesn't exist
	 * @throws LuceneIndexAccessException if an IOException is raised during
	 * the indexing or the optimization
	 * @see #indexDirectory(LuceneIndexWriter, File)
	 * @see #indexFile(LuceneIndexWriter, File)
	 * @see IndexWriterFactoryUtils#getIndexWriter(IndexFactory)
	 * @see IndexWriterFactoryUtils#releaseIndexWriter(IndexFactory, LuceneIndexWriter)
	 */
	public void index(String dirToParse, boolean optimizeIndex) {
		if( !checkBaseDirectory(dirToParse) ) {
			throw new LuceneIndexingException("The base directory doesn't exist!");
		}

		File base = new File(dirToParse);
		LuceneIndexWriter writer = IndexWriterFactoryUtils.getIndexWriter(getIndexFactory());
		try {
			//Indexing the directory
			if( base.isDirectory() ) {
				indexDirectory(writer, base);
			} else {
				indexFile(writer, base);
			}

			//Optimize the index
			if( optimizeIndex ) {
				writer.optimize();
			}
		} catch(IOException ex) {
			logger.error("Error during indexing the directory : "+dirToParse, ex);
			throw new LuceneIndexAccessException("Error during indexing the directory : "+dirToParse, ex);
		} catch(Exception ex) {
			// Unexpected failures are not propagated to the caller, but they
			// are reported through the logger instead of the standard error stream.
			logger.error("Error during indexing the directory : "+dirToParse, ex);
		} finally {
			IndexWriterFactoryUtils.releaseIndexWriter(getIndexFactory(), writer);
		}
	}
}