package org.cdlib.xtf.textIndexer; /** * Copyright (c) 2004, Regents of the University of California * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the University of California nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. 
*/
import java.io.File;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.cdlib.xtf.textEngine.NativeFSDirectory;
import org.cdlib.xtf.util.Path;
import org.cdlib.xtf.util.Trace;

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

/**
 * This class purges "incomplete" documents from a Lucene index. <br><br>
 *
 * A "complete" document consists of all the overlapping text chunks for the
 * document plus a special docInfo chunk that provides summary information
 * about the rest of the chunks in the document. Since the summary chunk is
 * the last chunk written for a document, any early termination of the indexer
 * (due to errors, or user abort) will leave text chunks in the database
 * without the summary chunk, which is called an "incomplete" document. <br><br>
 *
 * Since the search engine relies on the summary chunk to correctly search
 * overlapping text chunks, the absence of the summary chunk will cause
 * problems. Consequently, this class is used to purge text chunks from the
 * index that do not have a corresponding summary chunk. <br><br>
 *
 * To use this class, simply instantiate a copy, and call the
 * {@link IdxTreeCleaner#processDir(File) processDir()}
 * method on a directory containing an index. Note that the directory passed
 * may also be a root directory with many index sub-directories if desired.
 */
public class IdxTreeCleaner 
{
  //////////////////////////////////////////////////////////////////////////////

  /**
   * Create an <code>IdxTreeCleaner</code> instance and call this method to
   * remove "incomplete" documents from an index directory or a root
   * directory containing multiple indices.
   * <br><br>
   *
   * @param dir The index database directory to clean. May be a directory
   *            containing a single index, or the root directory of a
   *            tree containing multiple indices.
   * <br><br>
   *
   * @throws Exception Passes back any exceptions generated by the
   *                   cleanIndex() function, which is called for
   *                   each index sub-directory found.
   * <br><br>
   *
   * @.notes This method also calls itself recursively to process
   *         potential index sub-directories below the passed
   *         directory. <br><br>
   *
   * For an explanation of "complete" and "incomplete" documents, see the
   * <code>IdxTreeCleaner</code> class description.
   */
  public void processDir(File dir) throws Exception 
  {
    // Non-directories can't hold an index; skip them.
    if (!dir.isDirectory())
      return;

    // If this directory itself contains an index, clean it directly.
    if (IndexReader.indexExists(dir)) {
      cleanIndex(dir);
      return;
    }

    // Otherwise, recurse into each entry looking for indexes below.
    // Note: list() returns null (rather than an empty array) on an I/O
    // error or if the directory vanished out from under us.
    String[] files = dir.list();
    if (files == null)
      return;

    for (int i = 0; i < files.length; i++)
      processDir(new File(dir, files[i]));
  } // processDir()

  //////////////////////////////////////////////////////////////////////////////

  /**
   * Performs the actual work of removing incomplete documents from an index.
   * <br><br>
   *
   * @param idxDirToClean The index database directory to clean. This directory
   *                      must contain a single Lucene index.
   * <br><br>
   *
   * @throws Exception Passes back any exceptions generated by Lucene
   *                   during the opening of, reading of, or writing to
   *                   the specified index. <br><br>
   *
   * For an explanation of "complete" and "incomplete" documents, see the
   * <code>IdxTreeCleaner</code> class description.
   */
  public void cleanIndex(File idxDirToClean) throws Exception 
  {
    IndexReader indexReader;

    // Tell what index we're working on...
    Trace.info("Index: [" + Path.normalizePath(idxDirToClean.toString()) + "] ");

    // Try to open the index for reading. If we fail and throw, skip the index.
    try {
      indexReader = IndexReader.open(NativeFSDirectory.getDirectory(idxDirToClean));
    }
    catch (Throwable t) {
      Trace.warning(
        "*** Warning: Unable to Open Index [" + idxDirToClean + "] for Cleaning.");
      return;
    }

    // Determine the number of chunks in the index, and which one is last.
    // NOTE(review): numDocs() excludes deleted docs, so 'lastChunk' may not be
    // the highest doc id when prior deletions exist — TODO confirm this
    // matches the indexer's write pattern.
    int chunkCount = indexReader.numDocs();
    int lastChunk = chunkCount - 1;

    // Start with no incomplete documents cleaned.
    int cleanCount = 0;

    // The last chunk in an index must be a docInfo chunk. If it is not, the
    // chunk is a partial write of an incomplete document and must be removed.
    //
    // In the case where the last chunk is marked as 'deleted', it could be
    // because either (1) the last completed document was deleted, or (2) the
    // last cleanIndex() pass didn't finish. We keep on going to make sure we
    // complete in the case of (2).
    //
    try {
      while (lastChunk > 0) 
      {
        // If deleted, keep going until we reach a non-deleted chunk.
        if (indexReader.isDeleted(lastChunk)) {
          lastChunk--;
          continue;
        }

        // Get the last chunk in the index.
        Document chunk = indexReader.document(lastChunk);

        // If this chunk is a docInfo chunk, the index ends in a complete
        // document, and we're done.
        //
        if (chunk.get("docInfo") != null)
          break;

        // Otherwise, it is a chunk from an incomplete document, so delete it.
        try {
          indexReader.deleteDocument(lastChunk);
        }
        catch (Exception e) {
          // Log the problem and pass the exception up the call chain
          // (the finally block below closes the reader).
          Trace.tab();
          Trace.error("*** Exception Purging Incomplete Document: " + e.getMessage());
          Trace.untab();
          throw e;
        }
        catch (Throwable t) {
          // Log the problem and pass the error up the call chain wrapped
          // in a RuntimeException (the finally block closes the reader).
          Trace.tab();
          Trace.error("*** Exception Purging Incomplete Document: " + t.getMessage());
          Trace.untab();
          throw new RuntimeException(t);
        }

        cleanCount++;
        lastChunk--;
      }
    }
    finally {
      // Always release the reader, even if document() or the delete failed —
      // the original closed it only on the two explicit re-throw paths,
      // leaking it if document() threw.
      indexReader.close();
    }

    // Now if the number of chunks encountered equals the number of chunks
    // cleaned, we can delete the whole index directory.
    //
    if (chunkCount == cleanCount) 
    {
      int deleteFailCount = 0;

      // First, we need to delete all the files in the index directory,
      // before we can delete the directory itself. Note: listFiles()
      // returns null on an I/O error.
      //
      File[] fileList = idxDirToClean.listFiles();

      if (fileList != null) {
        for (int j = 0; j < fileList.length; j++) 
        {
          // File.delete() signals failure by returning false — it does not
          // throw. Display a warning and track the failure count.
          //
          if (!fileList[j].delete()) {
            Trace.tab();
            Trace.warning(
              "*** Warning: Unable to Delete [ " + fileList[j].toString() + " ].");
            Trace.untab();
            deleteFailCount++;
          }
        } // for( int j = 0; j < fileList.length; j++ )
      }

      // If some files couldn't be deleted, there's no point in continuing,
      // so stop gracefully now.
      //
      if (deleteFailCount > 0) {
        if (deleteFailCount > 1)
          Trace.info(
            "Empty Index not deleted because " + deleteFailCount +
            " files could not " + "be removed from index directory.");
        else
          Trace.info(
            "Empty Index not deleted because " +
            "a file could not be removed from " + "index directory.");
        return;
      }

      // Now start with the index directory...
      File dir = idxDirToClean;

      // And delete it and all the empty parent directories above it.
      //
      for (;;) 
      {
        // If the current directory is not empty (or can't be listed),
        // we're done.
        File[] contents = dir.listFiles();
        if (contents == null || contents.length != 0)
          break;

        // Otherwise, hang on to the parent directory for the current
        // directory (null once we reach a filesystem root).
        //
        File parentDir = dir.getParentFile();

        // Try to delete the current directory. delete() returns false on
        // failure rather than throwing; if it fails, warn and end gracefully,
        // since we can't continue to delete parent directories if the
        // current one can't be deleted.
        //
        if (!dir.delete()) {
          Trace.tab();
          Trace.warning(
            "*** Warning: Unable to delete empty " + "index directory [" +
            dir.getAbsolutePath() + "].");
          Trace.untab();
          return;
        }

        // Stop once there is no parent left to consider.
        if (parentDir == null)
          break;

        // Then back up to the parent and repeat.
        dir = parentDir;
      } // for(;;)
    } // if( chunkCount == cleanCount )

    // The current index isn't empty, but if we deleted a document from it,
    // say so.
    //
    else if (cleanCount == 1)
      Trace.info(cleanCount + " Incomplete Document Fragment Purged.");

    // Likewise, if we deleted more than one document, say so.
    else if (cleanCount > 1)
      Trace.info(cleanCount + " Incomplete Document Fragments Purged.");

    // If we didn't delete any documents from the directory, say so.
    else
      Trace.info("No Incomplete Documents Found.");

    // If the entire index was deleted, say so.
    if (chunkCount == cleanCount)
      Trace.info("Empty Index Deleted.");
  } // cleanIndex()
} // class IdxTreeCleaner