package org.cdlib.xtf.textIndexer; /** * Copyright (c) 2004, Regents of the University of California * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the University of California nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * Acknowledgements: * * A significant amount of new and/or modified code in this module * was made possible by a grant from the Andrew W. Mellon Foundation, * as part of the Melvyl Recommender Project. */ import java.io.File; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermDocs; import org.apache.lucene.index.TermEnum; import org.cdlib.xtf.textEngine.IndexUtil; import org.cdlib.xtf.textEngine.NativeFSDirectory; import org.cdlib.xtf.util.Path; import org.cdlib.xtf.util.SubDirFilter; import org.cdlib.xtf.util.Trace; import java.io.IOException; //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// /** * This class provides a simple mechanism for removing documents from an index * when the source text no longer exists in the document library. <br><br> * * This class locates all the summary chunks for documents in an index, and * checks to see if the associated source text files exist. If a source text * file no longer exists for an indexed document, the summary and text chunks * for that document are removed from the index. <br><br> * * To use this class, simply instantiate a copy, and call the * {@link #cullIndex(File,IndexInfo,File,SubDirFilter) cullIndex()} * method on a directory containing an index. Note that the directory passed * may also be a root directory with many index sub-directories if desired. */ public class IdxTreeCuller { //////////////////////////////////////////////////////////////////////////// /** * Create an <code>IdxTreeCuller</code> instance and call this method to * remove documents from indices when the associated source text no longer * exists. <br><br> * * Performs the actual work of removing missing documents from an index. * <br><br> * * @param xtfHome The base directory relative to which file paths * are interpreted. * <br><br> * * @param idxInfo The index to cull. * <br><br> * * @param subDirFilter Sub-directory limitation, or null for all. * <br><br> * * @throws Exception Passes back any exceptions generated by Lucene * during the opening of, reading of, or writing to * the specified index. * <br><br> * */ public void cullIndex(File xtfHome, IndexInfo idxInfo, File srcRootFile, SubDirFilter subDirFilter) throws Exception { // Start with no Path fields encountered, and no documents culled. int docCount = 0; int cullCount = 0; IndexReader indexReader = null; TermEnum termEnum = null; try { // Try to open the index for reading. If we fail and // throw, skip the index. // String idxPath = Path.resolveRelOrAbs(xtfHome, idxInfo.indexPath); indexReader = IndexReader.open(NativeFSDirectory.getDirectory(idxPath)); termEnum = indexReader.terms(new Term("key", "")); do { Term term = termEnum.term(); if (term == null || !term.field().equals("key")) break; // Get the key, which contains the index name and the path from its // source directory. // String key = term.text(); assert key.indexOf(':') >= 0 : "Invalid index key - missing ':'"; String indexName = key.substring(0, key.indexOf(':')); String relPath = key.substring(key.indexOf(':') + 1); // Skip documents that aren't part of the index we want. if (!indexName.equals(idxInfo.indexName)) continue; // Create a reference to the source XML document. File currFile = new File(Path.resolveRelOrAbs(srcRootFile, relPath)); // If a subdirectory was specified, skip docs that aren't within it. if (subDirFilter != null && !subDirFilter.approve(currFile)) continue; // Track how many documents there are. docCount++; // If the source XML document doesn't exist... if (!currFile.exists()) { // In a non-optimized index, the document may still be in the term list // but actually have been deleted. // TermDocs docs = indexReader.termDocs(term); if (docs == null || !docs.next()) { docCount--; continue; } // Indicate which document we're looking at. Trace.tab(); Trace.info("[" + relPath + "] ... "); // Delete all chunks for the missing document. int nDel = indexReader.deleteDocuments(new Term("key", key)); // If no chunks were deleted, something's wrong, so bail. if (nDel == 0) { // Create an exception that we can throw. TextIndexerException e = new TextIndexerException( "*** Error: Unable to " + "delete chunks from index."); // Output an error message. Trace.tab(); Trace.error(e.getMessage()); Trace.untab(); // And throw the exception. throw e; } // Also delete the lazy file, if any. Might as well delete // empty parent directories as well. // File lazyFile = IndexUtil.calcLazyPath(xtfHome, idxInfo, currFile, false); if (lazyFile.canRead()) { if (!Path.deletePath(lazyFile.toString())) Trace.warning("Could not delete lazy-tree file"); } /////////////////////////// // Diagnostic Output // /////////////////////////// // Trace.tab(); Trace.debug("Deleted " + nDel + "Chunks."); Trace.untab(); // Track how many documents we've culled. cullCount++; // Output info. Trace.more(Trace.info, "Missing: Removed from Index."); Trace.untab(); } // if( !currFile.exists() ) } while (termEnum.next()); // Now if the number of documents encounted equals the number // of documents deleted, there's a good chance the index is // empty and we can delete the whole index directory. // boolean indexDeleted = false; if (docCount == cullCount) { boolean anyNotDeleted = false; for (int i = 1; i < indexReader.maxDoc(); i++) { if (!indexReader.isDeleted(i)) { anyNotDeleted = true; break; } } if (!anyNotDeleted) { deleteIndex(new File(Path.resolveRelOrAbs(xtfHome, idxInfo.indexPath))); indexDeleted = true; } } // if( docCount == cullCount ) // Close the term enumeration and reader. termEnum.close(); termEnum = null; indexReader.close(); indexReader = null; // The current index isn't empty, but if we deleted a // document from it, say so. // if (cullCount == 1) Trace.info(cullCount + " Missing Document Removed."); // Likewise, if we deleted more than one document, say so. else if (cullCount > 1) Trace.info(cullCount + " Missing Documents Removed."); // If we didn't delete any documents from the directory, say so. else Trace.info("No Missing Documents to Remove."); // If the entire index was deleted, say so. if (indexDeleted) Trace.info("Empty Index Deleted."); } // try( to open the specified index ) catch (Exception e) { // Close the term enumeration if (termEnum != null) { try { termEnum.close(); } catch (Exception e2) { } } // Close up the index reader. if (indexReader != null) { try { indexReader.close(); } catch (Exception e2) { } } // Log the problem. Trace.info( "*** Exception encountered removing missing documents: " + e.getClass() + "\n" + " With message: " + e.getMessage()); Trace.error("Skipped Due to Errors."); // Pass the exception on. throw e; } } // cullIndex() //////////////////////////////////////////////////////////////////////////// private void deleteIndex(File idxDirToCull) throws IOException { int deleteFailCount = 0; // First, we need to delete all the files in the index // directory, before we can delete the directory itself. // File[] fileList = idxDirToCull.listFiles(); // Delete the files. for (int j = 0; j < fileList.length; j++) { // Try to delete the current file. try { fileList[j].delete(); } // If we could not, display a warning and track the delete // failure count. // catch (Throwable t) { Trace.tab(); Trace.warning( "*** Warning: Unable to Delete [ " + Path.normalizeFileName(fileList[j].toString()) + " ]."); Trace.untab(); deleteFailCount++; } } // for( int j = 0; j < fileList.length; j++ ) // If some files couldn't be deleted, there's no point in // continuing, so stop gracefully now. // if (deleteFailCount > 0) { if (deleteFailCount > 1) Trace.info( "Empty Index not deleted because " + deleteFailCount + " files could not " + "be removed from index directory."); else Trace.info( "Empty Index not deleted because " + "a file could not be removed from " + "index directory."); return; } // Now start with the index directory... File dir = idxDirToCull; // And delete it and all the empty parent directories // above it. // for (;;) { // If the current directory is not empty, we're done. File[] contents = dir.listFiles(); if (contents.length != 0) break; // Otherwise, hang on to the parent directory for // the current directory. // File parentDir = dir.getParentFile(); // Try to delete the current directory. try { dir.delete(); } // If we could not, display a warning and end gracefully, // since we can't continue to delete parent directories if // the current one can't be deleted. // catch (Throwable t) { Trace.tab(); Trace.info( "*** Warning: Unable to delete empty " + "index directory [" + dir.toString() + "]."); Trace.untab(); return; } // catch( Throwable t ) // Then back up to the parent and repeat. dir = parentDir; } // for(;;) } // deleteIndex() } // class IdxTreeCuller