DefaultDirectoryIndexer.java example

Explorer
spring-modules-master
/*
 * Copyright 2002-2005 the original author or authors.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.springmodules.lucene.index.object.directory;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.springmodules.lucene.index.LuceneIndexAccessException;
import org.springmodules.lucene.index.LuceneIndexingException;
import org.springmodules.lucene.index.document.handler.DocumentHandler;
import org.springmodules.lucene.index.document.handler.DocumentHandlerManager;
import org.springmodules.lucene.index.document.handler.file.AbstractInputStreamDocumentHandler;
import org.springmodules.lucene.index.factory.IndexFactory;
import org.springmodules.lucene.index.factory.IndexWriterFactoryUtils;
import org.springmodules.lucene.index.factory.LuceneIndexWriter;
import org.springmodules.lucene.index.object.AbstractDocumentManagerIndexer;
import org.springmodules.lucene.util.IOUtils;

/**
 * <b>This is the central class in the lucene directory indexing package.</b>
 * It simplifies the use of Lucene to index a directory by specifying the base
 * directory and the way to index every file contained in this directory and
 * its sub directories.
 * It helps to avoid common errors and to manage these resources in a flexible
 * manner.
 * It executes the core Lucene workflow, leaving application code to focus on
 * the way to create Lucene documents from files.
 * 
 * <p>This class is based on a DocumentHandlerManager instance that can be
 * injected with IoC. So several document handlers for different file types
 * can be registered declaratively or using the registration methods of this
 * class. By default, only the text handler is registered for files
 * with the "txt" extension.
 *
 * <p>This class is based on the IndexFactory abstraction which is a
 * factory to create IndexWriter for the configured Directory. For the
 * execution and the indexation of the corresponding data, the indexer
 * uses the same IndexWriter. It calls the IndexWriterFactoryUtils
 * class to eventually release it. So the indexer doesn't need to always
 * hold resources during the indexation of every file and
 * this avoids some locking problems on the index. You can also apply
 * different strategies for managing index resources.
 *
 * <p>Can be used within a service implementation via direct instantiation
 * with an IndexFactory reference, or get prepared in an application context
 * and given to services as bean reference. Note: The IndexFactory should
 * always be configured as a bean in the application context, in the first case
 * given to the service directly, in the second case to the prepared template.
 * 
 * @author Thierry Templier
 * @see org.springmodules.lucene.index.object.AbstractIndexer
 * @see org.springmodules.lucene.index.object.AbstractDocumentManagerIndexer
 * @see org.springmodules.lucene.index.factory.IndexFactory
 * @see org.springmodules.lucene.index.document.handler.DocumentHandlerManager
 * @see org.springmodules.lucene.index.support.file.ExtensionDocumentHandlerManager
 * @see org.springmodules.lucene.index.support.file.ExtensionDocumentHandlerManagerFactoryBean
 * @see org.springmodules.lucene.index.support.file.DocumentMatching
 * @see org.springmodules.lucene.index.document.handler.DocumentHandler
 * @see org.springmodules.lucene.index.object.directory.FileDocumentIndexingListener
 * @see org.springmodules.lucene.index.factory.IndexWriterFactoryUtils#getIndexWriter(IndexFactory)
 * @see org.springmodules.lucene.index.factory.IndexWriterFactoryUtils#releaseIndexWriter
 */
public class DefaultDirectoryIndexer extends AbstractDocumentManagerIndexer implements DirectoryIndexer {

	/**
	 * Orders the entries of a directory by their simple name so that the
	 * traversal order (and therefore the order of the fired events) does
	 * not depend on the order in which the file system lists the entries.
	 * Shared by every call because it holds no state.
	 */
	private static final Comparator FILE_NAME_COMPARATOR = new Comparator() {
		public int compare(Object o1, Object o2) {
			File f1 = (File) o1;
			File f2 = (File) o2;
			return f1.getName().compareTo(f2.getName());
		}
	};

	/** The listeners (FileDocumentIndexingListener) notified during the indexing. */
	private List listeners;

	/**
	 * Construct a new DirectoryIndexer, given an IndexFactory to obtain IndexWriter.
	 * No DocumentHandlerManager is specified: null is passed to the initialization.
	 * 
	 * @param indexFactory IndexFactory to obtain IndexWriter
	 */
	public DefaultDirectoryIndexer(IndexFactory indexFactory) {
		this(indexFactory, null);
	}

	/**
	 * Construct a new DirectoryIndexer, given an IndexFactory to obtain IndexWriter
	 * and a DocumentHandlerManager which has been configured with Spring.
	 * 
	 * @param indexFactory IndexFactory to obtain IndexWriter
	 * @param documentHandlerManager DocumentHandlerManager which will be used by the indexer
	 */
	public DefaultDirectoryIndexer(IndexFactory indexFactory, DocumentHandlerManager documentHandlerManager) {
		setIndexFactory(indexFactory);
		init(documentHandlerManager);
	}

	/**
	 * Initialize the indexer: delegates to the parent initialization with
	 * the specified DocumentHandlerManager and then creates an empty list
	 * of listeners.
	 * 
	 * <p>Note: every call replaces the list of listeners with a new empty
	 * one, so listeners registered before the call are dropped.
	 * 
	 * @param documentHandlerManager DocumentHandlerManager passed to the parent initialization
	 */
	protected void init(DocumentHandlerManager documentHandlerManager) {
		super.init(documentHandlerManager);
		this.listeners = new ArrayList();
	}

	/**
	 * This method is used to add a listener to be notified during the
	 * indexing execution. A null listener is ignored.
	 * 
	 * @param listener the listener to add
	 */
	public void addListener(FileDocumentIndexingListener listener) {
		if( listener!=null ) {
			listeners.add(listener);
		}
	}

	/**
	 * This method is used to remove a specified listener. A null
	 * listener is ignored.
	 * 
	 * @param listener the listener to remove
	 */
	public void removeListener(FileDocumentIndexingListener listener) {
		if( listener!=null ) {
			listeners.remove(listener);
		}
	}

	/**
	 * This method is used to get the list of listeners to notify
	 * during the indexing execution.
	 * 
	 * <p>The returned list is the internal list itself and not a copy.
	 * 
	 * @return the list of listeners to notify
	 */
	public List getListeners() {
		return listeners;
	}

	/**
	 * This method is used to fire the "on before directory" event to
	 * every listener.
	 * 
	 * <p>This event will be fired before the indexing of a directory.
	 * 
	 * @param file the directory which will be indexed
	 */
	protected void fireListenersOnBeforeDirectory(File file) {
		for(Iterator i = listeners.iterator(); i.hasNext();) {
			FileDocumentIndexingListener listener = (FileDocumentIndexingListener)i.next();
			listener.beforeIndexingDirectory(file);
		}
	}

	/**
	 * This method is used to fire the "on after directory" event to
	 * every listener.
	 * 
	 * <p>This event will be fired after the indexing of a directory, even
	 * if there have been errors during the indexing of its files.
	 * 
	 * @param file the directory which has been indexed
	 */
	protected void fireListenersOnAfterDirectory(File file) {
		for(Iterator i = listeners.iterator(); i.hasNext();) {
			FileDocumentIndexingListener listener = (FileDocumentIndexingListener)i.next();
			listener.afterIndexingDirectory(file);
		}
	}

	/**
	 * This method is used to fire the "on before file" event to
	 * every listener.
	 * 
	 * <p>This event will be fired before the indexing of a file, even
	 * if there are errors during its indexing.
	 * 
	 * @param file the file which will be indexed
	 */
	protected void fireListenersOnBeforeFile(File file) {
		for(Iterator i = listeners.iterator(); i.hasNext();) {
			FileDocumentIndexingListener listener = (FileDocumentIndexingListener)i.next();
			listener.beforeIndexingFile(file);
		}
	}

	/**
	 * This method is used to fire the "on after file" event to
	 * every listener.
	 * 
	 * <p>This event will be fired after the indexing of a file. It will
	 * not happen if there is an indexing error.
	 * 
	 * @param file the file which has been indexed
	 */
	protected void fireListenersOnAfterFile(File file) {
		for(Iterator i = listeners.iterator(); i.hasNext();) {
			FileDocumentIndexingListener listener = (FileDocumentIndexingListener)i.next();
			listener.afterIndexingFile(file);
		}
	}

	/**
	 * This method is used to fire the "on error file" event to
	 * every listener.
	 * 
	 * <p>This event will be fired if there is an indexing error.
	 * 
	 * @param file the file on which the error occurs
	 * @param ex the exception raised during the indexing of the file
	 */
	protected void fireListenersOnErrorFile(File file,Exception ex) {
		for(Iterator i = listeners.iterator(); i.hasNext();) {
			FileDocumentIndexingListener listener = (FileDocumentIndexingListener)i.next();
			listener.onErrorIndexingFile(file,ex);
		}
	}

	/**
	 * This method is used to fire the "on no handler available" event to
	 * every listener.
	 * 
	 * <p>This event will be fired if there is no matching handler to index
	 * the file.
	 * 
	 * @param file the file to index
	 */
	protected void fireListenersOnNoHandlerAvailable(File file) {
		for(Iterator i = listeners.iterator(); i.hasNext();) {
			FileDocumentIndexingListener listener = (FileDocumentIndexingListener)i.next();
			listener.onNotAvailableHandler(file);
		}
	}

	/**
	 * This method parses the directory, indexes every file it contains
	 * and calls itself recursively for all its sub directories in order
	 * to index them. The entries are processed sorted by name.
	 * 
	 * <p>This method also fires the different events corresponding to the
	 * indexing. At the beginning, the beforeIndexingDirectory method is
	 * called on every listener. At the end, the afterIndexingDirectory
	 * method is called on every listener, even if the content of the
	 * directory can not be listed or if an exception is thrown, so that
	 * every "before" event is balanced by an "after" event.
	 * 
	 * @param writer the index writer used to index files
	 * @param dirToParse the base directory to index
	 * @throws IOException if thrown by a Lucene method, to be auto-converted
	 * to a LuceneIndexAccessException by the calling index method
	 * @see FileDocumentIndexingListener
	 */
	private void indexDirectory(LuceneIndexWriter writer, File dirToParse) throws IOException {
		fireListenersOnBeforeDirectory(dirToParse);
		try {
			File[] files = dirToParse.listFiles();
			if( files==null ) {
				// listFiles returns null when the path is not a directory
				// or when an I/O error occurs: there is nothing to index
				return;
			}

			Arrays.sort(files, FILE_NAME_COMPARATOR);

			for(int cpt=0; cpt<files.length; cpt++) {
				File currentFile = files[cpt];
				if (currentFile.isDirectory()) {
					indexDirectory(writer, currentFile);
				} else {
					indexFile(writer, currentFile);
				}
			}
		} finally {
			fireListenersOnAfterDirectory(dirToParse);
		}
	}

	/**
	 * This method defines the description of the file to pass as parameter
	 * to the handler used and invokes it to get the indexed document for
	 * the file. The description contains the absolute path of the file
	 * under the AbstractInputStreamDocumentHandler.FILENAME key.
	 *  
	 * @param file the file to index
	 * @param inputStream the corresponding input stream
	 * @param handler the handler to use to index the file
	 * @return the indexed document to add to the index
	 * @throws Exception if thrown by the handler
	 */
	private Document doCallHandler(File file,FileInputStream inputStream, DocumentHandler handler) throws Exception {
		Map description = new HashMap();
		description.put(AbstractInputStreamDocumentHandler.FILENAME, file.getAbsolutePath());
		return handler.getDocument(description, inputStream);
	}

	/**
	 * This method indexes a file if there is a registered document handler
	 * which matches. It correctly opens and closes the file even if there
	 * are errors during the indexing.
	 * 
	 * <p>This method also fires the different events corresponding to the
	 * indexing. At the beginning, the beforeIndexingFile method is called
	 * on every listener. If there is no matching document handler, the
	 * onNotAvailableHandler method is called on every listener. If the
	 * indexing of the file is correctly done, the afterIndexingFile method
	 * is called and the onErrorIndexingFile otherwise.
	 * 
	 * <p>Every exception raised during the indexing of the file (I/O or
	 * not) is logged and notified to the listeners but not propagated,
	 * so that the other files can still be indexed. A null document
	 * returned by the handler is not added to the index.
	 * 
	 * @param writer the index writer used to index files
	 * @param file the file to index
	 * @throws IOException declared for the callers; exceptions raised while
	 * reading or indexing the file are handled here
	 * @see #doCallHandler(File, FileInputStream, DocumentHandler)
	 * @see FileDocumentIndexingListener
	 */
	private void indexFile(LuceneIndexWriter writer, File file) throws IOException {
		fireListenersOnBeforeFile(file);
		DocumentHandler handler = doGetDocumentHandler(file);
		if( handler==null ) {
			fireListenersOnNoHandlerAvailable(file);
			return;
		}

		FileInputStream inputStream = null;
		try {
			inputStream = new FileInputStream(file);
			Document document = doCallHandler(file, inputStream, handler);
			if( document!=null ) {
				writer.addDocument(document);
			}
			fireListenersOnAfterFile(file);
		} catch(Exception ex) {
			// Same handling for I/O and other errors: log, notify and go on
			logger.error("Error during indexing the file "+file.getName(), ex);
			fireListenersOnErrorFile(file, ex);
		} finally {
			IOUtils.closeInputStream(inputStream);
		}
	}

	/**
	 * This method is used to intercept the exception if the
	 * handler corresponding to the file is not found.
	 * 
	 * <p>The returned document handler in this case is null. The
	 * intercepted exception is only logged at the debug level because
	 * the caller reports this situation to the listeners.
	 * 
	 * @param file the file to use
	 * @return the corresponding document handler or null if the lookup failed
	 */
	private DocumentHandler doGetDocumentHandler(File file) {
		try {
			return getDocumentHandler(file.getPath());
		} catch(Exception ex) {
			// Deliberately not propagated: a failing lookup means
			// "no handler" for the caller
			if( logger.isDebugEnabled() ) {
				logger.debug("No document handler available for the file "+file.getPath(), ex);
			}
			return null;
		}
	}

	/**
	 * This method is the entry point to index a directory recursively. It uses
	 * the registered document handlers to index every file.
	 * 
	 * <p>In this case, the index will not be optimized.
	 *  
	 * @param dirToParse the base directory to index recursively
	 * @see #index(String, boolean)
	 */
	public void index(String dirToParse) {
		index(dirToParse, false);
	}

	/**
	 * This method checks if the directory to parse exists.
	 * 
	 * @param dirToParse the directory to check, can be null
	 * @return true if it exists, otherwise false (also for a null path)
	 */
	private boolean checkBaseDirectory(String dirToParse) {
		if( dirToParse==null ) {
			// new File(null) would throw a NullPointerException
			return false;
		}
		return new File(dirToParse).exists();
	}

	/**
	 * This method is the entry point to index a directory recursively. It uses
	 * the registered document handlers to index every file. If the specified
	 * path is a file and not a directory, only this file is indexed.
	 * 
	 * <p>In this case, the index will be optimized at the end 
	 * if the value of the optimizeIndex parameter is true.
	 * 
	 * <p>If there is an error during the indexing of a file, the other files
	 * will still be indexed. However the error will be notified to the
	 * specified listeners.
	 *  
	 * <p>This method gets an index writer instance from the
	 * IndexWriterFactoryUtils class and releases it at the end in every case.
	 * 
	 * <p>An IOException is logged and converted to a
	 * LuceneIndexAccessException. Any other exception is logged but not
	 * propagated.
	 * 
	 * @param dirToParse the base directory to index recursively
	 * @param optimizeIndex if the index must be optimized after
	 * the indexing
	 * @throws LuceneIndexingException if the path is null or doesn't exist
	 * @throws LuceneIndexAccessException if an IOException occurs during
	 * the indexing or the optimization
	 * @see #indexDirectory(LuceneIndexWriter, File)
	 * @see #indexFile(LuceneIndexWriter, File)
	 * @see IndexWriterFactoryUtils#getIndexWriter(IndexFactory)
	 * @see IndexWriterFactoryUtils#releaseIndexWriter
	 */
	public void index(String dirToParse, boolean optimizeIndex) {
		if( !checkBaseDirectory(dirToParse) ) {
			throw new LuceneIndexingException("The base directory doesn't exist!");
		}

		File baseFile = new File(dirToParse);
		LuceneIndexWriter writer = IndexWriterFactoryUtils.getIndexWriter(getIndexFactory());
		try {
			//Indexing the directory (or the single file)
			if( baseFile.isDirectory() ) {
				indexDirectory(writer, baseFile);
			} else {
				indexFile(writer, baseFile);
			}
			//Optimize the index
			if( optimizeIndex ) {
				writer.optimize();
			}
		} catch(IOException ex) {
			logger.error("Error during indexing the directory : "+dirToParse, ex);
			throw new LuceneIndexAccessException("Error during indexing the directory : "+dirToParse, ex);
		} catch(Exception ex) {
			// NOTE(review): kept non-propagating for backward compatibility;
			// reported through the logger instead of the standard error stream.
			// Consider rethrowing so that callers can detect the failure.
			logger.error("Error during indexing the directory : "+dirToParse, ex);
		} finally {
			IndexWriterFactoryUtils.releaseIndexWriter(getIndexFactory(), writer);
		}
	}

}