package org.apache.lucene.spelt;

/*
 * Copyright 2007 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.StopAnalyzer;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.ProgressTracker;

/**
 * Utility class to convert the stored fields of a Lucene index into a spelling
 * dictionary. This is generally less desirable than integrating dictionary
 * creation into the original index creation process (e.g. using
 * {@link SpellWritingAnalyzer} or {@link SpellWritingFilter}), since that will
 * grab non-stored as well as stored fields. Still, if that isn't an option, or
 * if you simply want to test out spelling correction, after-the-fact
 * dictionary creation may be useful.
 *
 * @author Martin Haye
 */
public class LuceneIndexToDict
{
  /**
   * Read a Lucene index and make a spelling dictionary from it. A minimal
   * token analyzer will be used, which is usually just what is needed for the
   * dictionary. The default set of English stop words will be used (see
   * {@link StopAnalyzer#ENGLISH_STOP_WORDS}).
   *
   * @param indexDir directory containing the Lucene index
   * @param dictDir  directory to receive the spelling dictionary
   */
  public static void createDict(Directory indexDir, File dictDir)
    throws IOException
  {
    createDict(indexDir, dictDir, null);
  }

  /**
   * Read a Lucene index and make a spelling dictionary from it. A minimal
   * token analyzer will be used, which is usually just what is needed for the
   * dictionary. The default set of English stop words will be used (see
   * {@link StopAnalyzer#ENGLISH_STOP_WORDS}).
   *
   * @param indexDir directory containing the Lucene index
   * @param dictDir  directory to receive the spelling dictionary
   * @param prog     tracker called periodically to display progress
   */
  public static void createDict(Directory indexDir, File dictDir,
                                ProgressTracker prog)
    throws IOException
  {
    // Open and clear the dictionary (since we're going to totally rebuild it)
    SpellWriter spellWriter = SpellWriter.open(dictDir);
    spellWriter.clearDictionary();
    spellWriter.setStopwords(StopFilter.makeStopSet(StopAnalyzer.ENGLISH_STOP_WORDS));

    // Now re-tokenize all the fields and queue the words for the dictionary.
    IndexReader indexReader = IndexReader.open(indexDir);
    createDict(indexReader, new MinimalAnalyzer(), spellWriter, prog);

    // All done.
    spellWriter.close();
  }
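  /*
   * A minimal usage sketch for the two methods above (the directory names
   * below are hypothetical; an existing Lucene index on disk is assumed):
   *
   *   Directory indexDir = FSDirectory.getDirectory(new File("lucene-index"));
   *   LuceneIndexToDict.createDict(indexDir, new File("spell-dict"));
   *
   * When the call returns, "spell-dict" holds a complete spelling dictionary,
   * rebuilt from scratch from the index's stored fields.
   */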
  /**
   * Read a Lucene index and make a spelling dictionary from it. The supplied
   * analyzer is used to re-tokenize the stored fields, and the supplied
   * writer receives the resulting words. Any stop-word set should already
   * have been configured on the writer; the other overloads configure it
   * with {@link StopAnalyzer#ENGLISH_STOP_WORDS}.
   *
   * @param indexReader used to read fields from a Lucene index
   * @param analyzer    used to tokenize fields from the index; generally,
   *                    this should do minimal filtering, taking care to avoid
   *                    substantive token modification (such as stemming or
   *                    depluralization). A good choice is
   *                    {@link MinimalAnalyzer}.
   * @param spellWriter receives words to be added to the dictionary
   * @param prog        tracker called periodically to display progress
   */
  public static void createDict(IndexReader indexReader, Analyzer analyzer,
                                SpellWriter spellWriter, ProgressTracker prog)
    throws IOException
  {
    // Supply a do-nothing progress tracker if none was supplied.
    if (prog == null) {
      prog = new ProgressTracker() {
        @Override
        public void report(int pctDone, String descrip) {
        }
      };
    }

    // Split into phases. Seems like the re-analysis takes a lot longer than
    // dictionary creation.
    ProgressTracker[] phaseTrackers = prog.split(70, 30);

    // Now re-tokenize all the fields and queue the words for the dictionary.
    queueWords(indexReader, analyzer, spellWriter, phaseTrackers[0]);
    indexReader.close();

    // Perform the final dictionary creation.
    spellWriter.flushQueuedWords(phaseTrackers[1]);
  }

  /**
   * Re-tokenize all the words in stored fields within a Lucene index, and
   * queue them to a spelling dictionary. Does not flush the writer to form
   * the final dictionary, so this can be called repeatedly to queue words
   * from multiple Lucene indexes (see the sketch following this method).
   *
   * @param reader   used to read fields from a Lucene index
   * @param analyzer used to tokenize fields from the index; generally, this
   *                 should do minimal filtering, taking care to avoid
   *                 substantive token modification (such as stemming or
   *                 depluralization). A good choice is {@link MinimalAnalyzer}.
   * @param writer   receives words to be added to the dictionary
   * @param prog     tracker called periodically to display progress
   */
  @SuppressWarnings("unchecked")
  public static void queueWords(IndexReader reader, Analyzer analyzer,
                                SpellWriter writer, ProgressTracker prog)
    throws IOException
  {
    // Iterate every document in the source index.
    for (int docId = 0; docId < reader.maxDoc(); docId++)
    {
      // Give periodic feedback.
      if ((docId & 0xff) == 0)
        prog.progress(docId, reader.maxDoc(),
                      "Re-analyzed " + docId + " documents.");

      // Skip deleted documents.
      if (reader.isDeleted(docId))
        continue;

      // Get the document.
      Document doc = reader.document(docId);
      if (doc == null)
        continue;

      // Iterate every stored field in the document.
      for (Field field : (List<Field>)doc.getFields())
      {
        // Skip fields that aren't tokenized.
        if (!field.isTokenized())
          continue;

        // Iterate every value of that field.
        String[] values = doc.getValues(field.name());
        if (values == null)
          continue;
        for (String val : values)
        {
          // Add each word to the dictionary.
          TokenStream toks = analyzer.tokenStream(field.name(),
                                                  new StringReader(val));
          Token tok;
          while ((tok = toks.next()) != null)
            writer.queueWord(tok.termText());
          writer.queueBreak();
        }
      }
    }

    // Force the final progress message.
    prog.progress(100, 100,
                  "Re-analyzed " + reader.maxDoc() + " documents.",
                  true);
  }
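  /*
   * A sketch of queueing words from several indexes into one dictionary.
   * The directory names are hypothetical and "prog" stands for some
   * ProgressTracker supplied by the caller. Note that flushQueuedWords()
   * runs only once, after all indexes have been queued:
   *
   *   SpellWriter writer = SpellWriter.open(new File("spell-dict"));
   *   writer.clearDictionary();
   *   for (String path : new String[] { "index1", "index2" }) {
   *     IndexReader r = IndexReader.open(FSDirectory.getDirectory(path));
   *     queueWords(r, new MinimalAnalyzer(), writer, prog);
   *     r.close();
   *   }
   *   writer.flushQueuedWords(prog);
   *   writer.close();
   */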
  /**
   * Command-line interface for building a dictionary directly from a Lucene
   * index, without writing any code.
   */
  public static void main(String[] args)
  {
    if (args.length != 2) {
      System.err.println("Usage: ... LuceneIndexToDict <luceneIndexDir> <targetDictDir>");
      System.exit(1);
    }

    System.out.println("\n*** Lucene to dictionary conversion utility ***\n");

    IndexReader indexReader = null;
    SpellWriter spellWriter = null;
    int exitVal = 1;
    try
    {
      File indexDir = new File(args[0]);
      File dictDir = new File(args[1]);

      // We'll want to print out status messages periodically.
      final long startTime = System.currentTimeMillis();
      ProgressTracker prog = new ProgressTracker() {
        public void report(int pctDone, String descrip) {
          System.out.println(String.format("%6.1f sec [%3d%%] %s",
              (System.currentTimeMillis() - startTime) / 1000.0f,
              pctDone, descrip));
        }
      };
      prog.setMinInterval(3000);

      // Go for it.
      createDict(FSDirectory.getDirectory(indexDir), dictDir, prog);
      exitVal = 0;
    }
    catch (IOException e) {
      System.out.flush();
      System.err.println("Unexpected exception: " + e);
      e.printStackTrace(System.err);
    }
    finally
    {
      try {
        if (indexReader != null)
          indexReader.close();
        if (spellWriter != null)
          spellWriter.close();
      }
      catch (IOException e) {
      }
    }

    System.exit(exitVal);
  }
}
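/*
 * Example invocation of the main() entry point above (the jar names and
 * paths are hypothetical):
 *
 *   java -cp lucene-core.jar:lucene-spelt.jar \
 *       org.apache.lucene.spelt.LuceneIndexToDict ./lucene-index ./spell-dict
 *
 * Progress lines are printed at most every three seconds, in the format
 * produced by the tracker in main(), e.g.:
 *
 *   12.3 sec [ 45%] Re-analyzed 11520 documents.
 */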