package org.cdlib.xtf.textIndexer; /** * Copyright (c) 2006, Regents of the University of California * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the University of California nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * Acknowledgements: * * A significant amount of new and/or modified code in this module * was made possible by a grant from the Andrew W. Mellon Foundation, * as part of the Melvyl Recommender Project. */ import java.io.File; import org.apache.lucene.index.IndexReader; import org.apache.lucene.spelt.SpellWriter; import org.apache.lucene.util.ProgressTracker; import org.cdlib.xtf.util.Path; import org.cdlib.xtf.util.Trace; //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// /** * This class provides a simple mechanism for generating a spelling correction * dictionary after new documents have been added or updated. <br><br> * * To use this class, simply instantiate a copy, and call the * {@link IdxTreeDictMaker#processDir(File) processDir()} * method on a directory containing an index. Note that the directory passed * may also be a root directory with many index sub-directories if desired. */ public class IdxTreeDictMaker { //////////////////////////////////////////////////////////////////////////// /** * Create an <code>IdxTreeDictMaker</code> instance and call this method to * create spelling dictionaries for one or more Lucene indices. <br><br> * * @param dir The index database directory to scan. May be a * directory containing a single index, or the root * directory of a tree containing multiple indices. * <br><br> * * @.notes This method also calls itself recursively to process * potential index sub-directories below the passed * directory. */ public void processDir(File dir) throws Exception { // If the file we were passed was in fact a directory... if (dir.getAbsoluteFile().isDirectory()) { // And it contains an index, optimize it. if (IndexReader.indexExists(dir.getAbsoluteFile())) makeDict(dir); else { // Get the list of files it contains. String[] files = dir.getAbsoluteFile().list(); // And process each of them. for (int i = 0; i < files.length; i++) processDir(new File(dir, files[i])); } return; } // if( dir.isDirectory() ) // The current file is not a directory, so skip it. } // processDir() //////////////////////////////////////////////////////////////////////////// /** * Performs the actual work of creating a spelling dictionary. * <br><br> * * @param mainIdxDir The index database directory to scan. This * directory must contain a single Lucene index. * <br><br> * * @throws Exception Passes back any exceptions generated by Lucene * during the dictionary generation process. * <br><br> */ public void makeDict(File mainIdxDir) throws Exception { // Detect if spelling data is present. String indexPath = Path.normalizePath(mainIdxDir.toString()); String spellIdxPath = indexPath + "spellDict/"; String wordQueuePath = spellIdxPath + "newWords.txt"; String pairQueuePath = spellIdxPath + "newPairs.txt"; if (new File(wordQueuePath).length() < 1 && new File(pairQueuePath).length() < 1) { return; } // Tell what index we're working on... String mainIdxPath = Path.normalizePath(mainIdxDir.toString()); Trace.info("Index: [" + mainIdxPath + "] ... "); Trace.tab(); Trace.tab(); // for phase SpellWriter spellWriter = null; try { // Open the SpellWriter. We don't have to specify a stopword set for // this phase (it's only used during queuing.) // spellWriter = SpellWriter.open(new File(spellIdxPath)); spellWriter.setMinWordFreq(3); // Perform the update. spellWriter.flushQueuedWords(new ProgressTracker() { public void report(int pctDone, String descrip) { String pctTxt = Integer.toString(pctDone); while (pctTxt.length() < 3) pctTxt = " " + pctTxt; Trace.info("[" + pctTxt + "%] " + descrip); } }); } // try( to open the specified index ) catch (Exception e) { Trace.error("*** Dictionary Creation Halted Due to Error:" + e); throw e; } finally { spellWriter.close(); } Trace.untab(); // for phase Trace.untab(); Trace.info("Done."); } // makeDict() } // class IdxTreeDictMaker