package org.cdlib.xtf.textIndexer;
/**
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.cdlib.xtf.textEngine.IndexValidator;
import org.cdlib.xtf.textEngine.NativeFSDirectory;
import org.cdlib.xtf.util.DirSync;
import org.cdlib.xtf.util.Path;
import org.cdlib.xtf.util.SubDirFilter;
import org.cdlib.xtf.util.Trace;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/**
* This class is the main class for the TextIndexer program. <br><br>
*
* Internally, this class retrieves command line arguments, and processes them
* in order to index source XML files into one or more Lucene databases. The
* command line arguments required by the TextIndexer program are as follows:
*
* <blockquote dir=ltr style="MARGIN-RIGHT: 0px"><code>
* <b>TextIndexer -config</b> <font color=#0000ff><i>CfgFilePath</i></font>
* { {<b>-clean</b>|<b>-incremental</b>}?
* {<b>-trace errors</b>|<b>warnings</b>|<b>info</b>|<b>debug</b>}?
* <b>-index</b> <font color=#0000ff><i>IndexName</i></font> }+
* </code></blockquote>
*
* The <code>-config</code> argument identifies an XML configuration file that
* defines one or more indices to be created, updated, or deleted. This argument
* must be the first argument passed, and it must be passed only once. For a
* complete description of the contents of the configuration file, see the
* {@link XMLConfigParser} class.<br><br>
*
* The <code>-clean</code> / <code>-incremental</code> argument is an optional
* argument that specifies whether Lucene indices should be rebuilt from scratch
* (<code>-clean</code>) or should be updated (<code>-incremental</code>). If
* this argument is not specified, the default behavior is incremental. <br><br>
*
* The <code>-buildlazy</code> / <code>-nobuildlazy</code> argument is an
* optional argument that specifies whether the indexer should build a
* persistent ("lazy") version of each document during the indexing process.
* The lazy files are stored in the index directory, and they speed dynaXML
* access later. If this argument is not specified, the default behavior is
* to build lazy versions of the documents. <br><br>
*
* The <code>-optimize</code> / <code>-nooptimize</code> argument is an optional
* argument that specifies whether the indexer should optimize the indexes after
* they are built. Optimization improves query speed, but can take a very long
* time to complete depending on the index size. If this argument is not
* specified, the default behavior is to optimize. <br><br>
*
* The <code>-trace</code> argument is an optional argument that sets the level
* of output displayed by the text indexer. The output levels are defined as
* follows:
*
* <blockquote dir=ltr style="MARGIN-RIGHT: 0px">
* <code>errors</code> - Only error messages are displayed. <br>
* <code>warnings</code> - Both error and warning messages are displayed. <br>
* <code>info</code> - Error, warning, and informational messages are displayed. <br>
* <code>debug</code> - Low level debug output is displayed in addition to
* error, warning and informational messages.<br><br>
* </blockquote>
*
* If this argument is not specified, the TextIndexer defaults to displaying
* informational (<code>info</code>) level messages.<br><br>
*
* The <code>-index</code> argument identifies the name of the index to be
* created/updated. The name must be one of the index names contained in the
* configuration file specified as the first parameter. As is mentioned above,
* the <code>-config</code> parameter must be specified first. After that,
* the remaining arguments may be used one or more times to update a single
* index or multiple indices. <br><br><br>
*
* A simple example of the command line parameters for the TextIndexer might
* look like this:
* <br><br>
*
* <code><blockquote dir=ltr style="MARGIN-RIGHT: 0px"><b>
* TextIndexer -config IdxConfig.xml -clean -index AllText
* </b></blockquote></code>
*
* This example assumes that the config file is called <code>IdxConfig.xml</code>,
* that the config file contains an entry for an index called <b>AllText</b>, and
* that the user wants the index to be rebuilt from scratch (because of the
* <code>-clean</code> argument). <br><br>
*
*/
public class TextIndexer
{
/** The version to be shown to the user (does not need to string compare as higher than prev.) */
public static final String SHOW_VERSION = "2.2";
/** The version of the text indexer (placed into any indexes created) */
public static final String CURRENT_VERSION = "2.2b";
/** The minimum index version that we can read */
public static final String REQUIRED_VERSION = "2.2b";
//////////////////////////////////////////////////////////////////////////////
/** Main entry-point for the Text Indexer. <br><br>
*
* This function takes the command line arguments passed and uses them to
* create or update the specified indices with the specified source text.
* <br><br>
*
* Processing proceeds as follows: <br>
* 1. The <code>xtf.home</code> system property is read and checked to be an
* existing directory. <br>
* 2. The command line is consumed one group of arguments at a time. For each
* group the index configuration is loaded, incomplete documents are purged
* (first group only, and only when a clean rebuild was not requested), and
* the index is updated unless indexing is being skipped. <br>
* 3. After all groups are processed, the optimization, spellcheck dictionary,
* validation and rotation passes run, each subject to its own option. <br>
* 4. The total elapsed time is reported. <br><br>
*
* The process exits with status 1 if <code>xtf.home</code> is missing or
* invalid, if the usage text has to be shown, if a requested index is not
* found in the configuration, or if any exception escapes processing.
* <br><br>
*
* @param args Command line arguments to process. The command line
* arguments required by the TextIndexer program are as follows:
*
* <blockquote dir=ltr style="MARGIN-RIGHT: 0px"><code>
* <b>TextIndexer -config</b> <font color=#0000ff><i>CfgFilePath</i></font>
* { {<b>-clean</b>|<b>-incremental</b>}?
* {<b>-trace errors</b>|<b>warnings</b>|<b>info</b>|<b>debug</b>}?
* <b>-index</b> <font color=#0000ff><i>IndexName</i></font> }+
* </code></blockquote>
*
* For a complete description of each command line argument, see the
* {@link TextIndexer} class description.
* <br><br>
*
*/
public static void main(String[] args)
{
try
{
IndexerConfig cfgInfo = new IndexerConfig();
XMLConfigParser cfgParser = new XMLConfigParser();
IdxTreeCleaner indexCleaner = new IdxTreeCleaner();
// Position in args at which the next group of arguments starts.
int startArg = 0;
boolean showUsage = false;
// True until the first index has been reached; gates the one-time purge.
boolean firstIndex = true;
// Wall-clock start, used for the "Total time" report at the end.
long startTime = System.currentTimeMillis();
// Regardless of whether we succeed or fail, say our name.
Trace.info("TextIndexer v" + SHOW_VERSION);
Trace.info("");
Trace.tab();
// Make sure the "xtf.home" system property is specified.
cfgInfo.xtfHomePath = System.getProperty("xtf.home");
if (cfgInfo.xtfHomePath == null || cfgInfo.xtfHomePath.length() == 0) {
Trace.error("Error: xtf.home property not found");
System.exit(1);
}
// Normalize the home path, and make sure it names an existing directory.
cfgInfo.xtfHomePath = Path.normalizePath(cfgInfo.xtfHomePath);
if (!new File(cfgInfo.xtfHomePath).isDirectory()) {
Trace.error(
"Error: xtf.home directory \"" + cfgInfo.xtfHomePath +
"\" does not exist or cannot be read.");
System.exit(1);
}
File xtfHomeFile = new File(cfgInfo.xtfHomePath);
// Perform indexing for each index specified.
for (;;)
{
// The minimum set of arguments consists of the name of an index
// to update. If we don't get at least two arguments, we will show
// the usage text and bail.
//
if (args.length < 2)
showUsage = true;
// We have enough arguments, so...
else
{
// Read the command line arguments until we find what we
// need to do some work, or until we run out.
//
int ret = cfgInfo.readCmdLine(args, startArg);
// If we didn't find enough command line arguments...
if (ret == -1)
{
// And this is the first time we've visited the command
// line arguments, avoid trying to do any work and just
// display the usage text. Otherwise, we're done.
//
if (startArg == 0)
showUsage = true;
else
break;
} // if( ret == -1 )
// We did find enough command line arguments, so...
//
else
{
// Make sure the configuration path is absolute
if (!(new File(cfgInfo.cfgFilePath).isAbsolute())) {
cfgInfo.cfgFilePath = Path.resolveRelOrAbs(cfgInfo.xtfHomePath,
cfgInfo.cfgFilePath);
}
// Get the configuration for the index specified by the
// current command line arguments.
//
if (cfgParser.configure(cfgInfo) < 0) {
Trace.error(
"Error: index '" + cfgInfo.indexInfo.indexName +
"' not found\n");
System.exit(1);
}
// Since we're starting a new index, reset the 'must
// clean' flag so that the index will get cleaned if
// requested.
//
cfgInfo.mustClean = cfgInfo.clean;
// Set the tracing level specified by the user.
Trace.setOutputLevel(cfgInfo.traceLevel);
} // else( ret != -1 )
// Save the new start argument for the next time.
startArg = ret;
} // else( args.length >= 2 )
// If the command line was not usable, print the usage text and bail out.
if (showUsage)
{
// Do so...
Trace.error("Usage: textIndexer {options} -index indexname");
Trace.error("Basic options:");
Trace.tab();
Trace.error("-config <configfile> Default: -config textIndexer.conf");
Trace.error("-incremental|-clean|-force Default: -incremental");
Trace.error("-dir <subdir1> -dir <subdir2>... Default: (all directories)");
Trace.error("-dirlist <fileOfSubdirs> Default: (all directories)");
Trace.error("\n");
Trace.untab();
Trace.error("Advanced options:");
Trace.error("-nosync|-syncfrom <otherIndexDir> Default: -nosync");
Trace.error("-optimize|-nooptimize Default: -optimize");
Trace.error("-trace errors|warnings|info|debug Default: -trace info");
Trace.error("-buildlazy|-nobuildlazy Default: -buildlazy");
Trace.error("-updatespell|-noupdatespell Default: -updatespell");
Trace.error("-rotate|-norotate Default: -rotate");
Trace.error("-validate|-novalidate Default: -validate");
Trace.tab();
Trace.error("\n");
Trace.untab();
// And then bail.
System.exit(1);
} // if( showUsage )
// If rotation is enabled for this index, target the "-new" directory.
// The substitution only applies when the path ends with a slash.
if (cfgInfo.indexInfo.rotate)
{
cfgInfo.indexInfo.indexPath =
cfgInfo.indexInfo.indexPath.replaceFirst("/$", "-new/");
}
// Resolve the root index directory for this index.
// If this is our first time through, purge any incomplete
// documents from the indices (unless a clean rebuild was
// requested), and tell the user what we're doing.
//
File idxRoot = new File(
Path.resolveRelOrAbs(xtfHomeFile, cfgInfo.indexInfo.indexPath));
if (firstIndex)
{
if (!cfgInfo.mustClean)
{
Trace.info("");
Trace.info("Purging Incomplete Documents From Indexes:");
Trace.tab();
indexCleaner.processDir(idxRoot);
Trace.untab();
Trace.info("Done.");
}
Trace.info("");
Trace.info("Indexing New/Updated Documents:");
Trace.tab();
// Indicate that the next pass through this loop is not
// the first one.
//
firstIndex = false;
}
// Say what index we're working on.
Trace.info("Index: \"" + cfgInfo.indexInfo.indexName + "\"");
// And if we're debugging, say some more about the index.
if (Trace.getOutputLevel() == Trace.debug)
Trace.more(
" [ Chunk Size = " + cfgInfo.indexInfo.getChunkSize() +
", Overlap = " + cfgInfo.indexInfo.getChunkOvlp() + " ]");
Trace.tab();
// Process the index directories.
if (!cfgInfo.skipIndexing)
doIndexing(cfgInfo, xtfHomeFile);
Trace.untab();
Trace.info("Done.");
} // for(;;)
Trace.untab();
Trace.info("Done.");
// NOTE(review): the passes below read cfgInfo.indexInfo as it was left by
// the loop above, so presumably they act on the last index processed --
// confirm this is intended when several -index arguments are given.
// Optimize the indices, now that we're all done processing them.
if (cfgInfo.optimize)
{
// Create a tree optimizer.
IdxTreeOptimizer optimizer = new IdxTreeOptimizer();
Trace.info("");
Trace.info("Optimizing Index:");
Trace.tab();
File idxRootDir = new File(Path.resolveRelOrAbs(
cfgInfo.xtfHomePath,
cfgInfo.indexInfo.indexPath));
optimizer.processDir(idxRootDir);
Trace.untab();
Trace.info("Done.");
}
else {
Trace.info("");
Trace.info("Skipping Optimization Pass.");
}
// Create spelling dictionaries, now that we're done indexing. This needs
// both the command-line option and the index's own setting to be enabled.
if (cfgInfo.updateSpellDict && cfgInfo.indexInfo.createSpellcheckDict)
{
IdxTreeDictMaker dictMaker = new IdxTreeDictMaker();
Trace.info("");
Trace.info("Updating Spellcheck Dictionary:");
Trace.tab();
File idxRootDir = new File(Path.resolveRelOrAbs(
cfgInfo.xtfHomePath,
cfgInfo.indexInfo.indexPath));
dictMaker.processDir(idxRootDir);
Trace.untab();
Trace.info("Done.");
}
else {
Trace.info("");
Trace.info("Skipping Spellcheck Dictionary Pass.");
}
// Validate the index if a validation path is configured for it and
// validation has not been turned off.
if (cfgInfo.indexInfo.validationPath != null &&
cfgInfo.indexInfo.validationPath.length() > 0)
{
Trace.info("");
if (cfgInfo.validate)
doValidation(cfgInfo);
else
Trace.info("Skipping Index Validation.");
}
// Finally, perform index rotation if specified (and the index supports it)
if (cfgInfo.indexInfo.rotate)
{
if (cfgInfo.rotate)
{
Trace.info("");
Trace.info("Performing Index Rotation:");
Trace.tab();
doRotation(cfgInfo);
Trace.untab();
Trace.info("Done.");
}
else {
Trace.info("");
Trace.info("Skipping Index Rotation Pass.");
}
}
Trace.untab();
Trace.info("");
// Report the elapsed time, rounded to the nearest second and broken down
// into hours, minutes and seconds. Hours and minutes are only printed
// once at least one full hour/minute has elapsed.
long timeMsec = System.currentTimeMillis() - startTime;
long timeSec = (timeMsec+500) / 1000;
long timeMin = timeSec / 60;
long timeHour = timeMin / 60;
Trace.info("Total time: ");
if (timeHour > 0) {
String ending = (timeHour == 1) ? "" : "s";
Trace.more(Trace.info, timeHour + " hour" + ending + ", ");
}
if (timeMin > 0) {
String ending = ((timeMin % 60) == 1) ? "" : "s";
Trace.more(Trace.info, (timeMin % 60) + " minute" + ending + ", ");
}
String ending = ((timeSec % 60) == 1) ? "" : "s";
Trace.more(Trace.info, (timeSec % 60) + " second" + ending + ".");
Trace.info("Indexing complete.");
Trace.info("");
} // try
// Log any unhandled exceptions, then exit with a failure status.
catch (Throwable t)
{
Trace.clearTabs();
Trace.error("*** Error: " + t.getClass());
Trace.error("");
t.printStackTrace(System.out);
Trace.error("Indexing Process Aborted.");
System.exit(1);
}
// Exit successfully.
return;
} // main()
//////////////////////////////////////////////////////////////////////////////
/**
 * Runs the indexing pass for the index currently described by
 * <code>cfgInfo</code>: records the directories about to be scanned,
 * optionally clones the source data into the index directory, scans the
 * source tree, and finally culls documents that are in the index but are
 * no longer present in the source tree.
 *
 * @param cfgInfo      Configuration of the index being processed.
 * @param xtfHomeFile  The XTF home directory, against which the source
 *                     path is resolved.
 *
 * @throws Exception   if any of the processing steps fails.
 */
private static void doIndexing(IndexerConfig cfgInfo, File xtfHomeFile)
  throws Exception
{
  SrcTreeProcessor processor = new SrcTreeProcessor();
  processor.open(cfgInfo);

  // The source data root comes from the config file.
  File dataRoot = new File(
    Path.resolveRelOrAbs(xtfHomeFile, cfgInfo.indexInfo.sourcePath));

  // The index directory is resolved against the XTF home path as well.
  File indexDir = new File(
    Path.resolveRelOrAbs(cfgInfo.xtfHomePath, cfgInfo.indexInfo.indexPath));

  // Remember which directories get scanned, for later incremental syncing.
  writeScanDirs(indexDir, cfgInfo.indexInfo);

  // Restrict scanning to the requested subdirectories (null means "all").
  SubDirFilter dirFilter = makeSubDirFilter(dataRoot, cfgInfo);

  // When cloning is requested, copy the source data under the index
  // directory and index from that copy instead of the original.
  if (cfgInfo.indexInfo.cloneData)
  {
    Trace.info("Cloning Data Directories.");

    File cloneDir =
      new File(indexDir, "dataClone/" + cfgInfo.indexInfo.indexName);
    boolean cloneDirReady = cloneDir.exists() || cloneDir.mkdirs();
    if (!cloneDirReady)
      throw new IOException("Error creating clone directory '" + cloneDir + "'");

    new DirSync(dirFilter).syncDirs(dataRoot, cloneDir);

    // From here on, the clone is the source.
    dataRoot = cloneDir;
    dirFilter = makeSubDirFilter(dataRoot, cfgInfo);
    Trace.more(Trace.info, " Done.");
  }

  // Index everything that is new or has changed.
  Trace.info("Scanning Data Directories...");
  processor.processDir(dataRoot, dirFilter, true);
  Trace.more(Trace.info, " Done.");
  processor.close();

  // Drop index entries whose source files have disappeared.
  Trace.info("Removing Missing Documents From Index:");
  Trace.tab();
  File homeDir = new File(cfgInfo.xtfHomePath);
  new IdxTreeCuller().cullIndex(homeDir, cfgInfo.indexInfo, dataRoot, dirFilter);
  Trace.untab();
  Trace.info("Done.");
}
//////////////////////////////////////////////////////////////////////////////
/**
 * Append the current subdirectories we're about to scan to the scanDirs.list
 * file. This file is used in incremental index rotation to figure out which
 * data and lazy subdirectories need to be scanned for changes.
 *
 * The list is rebuilt in a temporary file (<code>scanDirs.list.new</code>)
 * which then replaces the old one. Each entry has the form
 * <code>indexName:subdir</code>, or <code>indexName:/</code> when no
 * subdirectories were specified.
 *
 * @param indexFile  Directory of the index; holds the scanDirs.list file.
 * @param idxInfo    Supplies the index name and the subdirectories (if any)
 *                   being scanned.
 *
 * @throws IOException  if the list cannot be read, written, or moved into
 *                      place. The temporary file is removed in that case.
 */
private static void writeScanDirs(File indexFile, IndexInfo idxInfo)
  throws IOException
{
  File oldScanFile = new File(indexFile, "scanDirs.list");
  File newScanFile = new File(indexFile, "scanDirs.list.new");
  try
  {
    BufferedWriter scanWriter = new BufferedWriter(new FileWriter(newScanFile));
    try
    {
      if (oldScanFile.canRead())
      {
        // Copy the contents of the old file. We can't just append to the old
        // file because it may be hard linked to an older index.
        //
        BufferedReader scanReader = new BufferedReader(new FileReader(oldScanFile));
        try {
          String line;
          while ((line = scanReader.readLine()) != null)
            scanWriter.write(line + "\n");
        }
        finally {
          // Release the handle even if the copy fails part way through.
          scanReader.close();
        }
      }

      // And append directories being scanned this time.
      if (idxInfo.subDirs == null)
        scanWriter.write(idxInfo.indexName + ":/\n");
      else {
        for (String subdir : idxInfo.subDirs)
          scanWriter.write(idxInfo.indexName + ":" + subdir + "\n");
      }
    }
    finally {
      // Always close the writer, so the temporary file is not left open
      // (which would also prevent deleting it below on some platforms).
      scanWriter.close();
    }

    // Finally, replace the old file with the new one. A rename may be
    // refused when the destination already exists, so if the direct rename
    // fails, remove the old file and try once more.
    if (!newScanFile.renameTo(oldScanFile))
    {
      boolean replaced =
        oldScanFile.delete() && newScanFile.renameTo(oldScanFile);
      if (!replaced)
        throw new IOException(
          "Error renaming '" + newScanFile + "' to '" + oldScanFile + "'");
    }
  }
  catch (IOException e) {
    // Don't leave a partial or orphaned temporary file behind.
    newScanFile.delete();
    throw e;
  }
}
//////////////////////////////////////////////////////////////////////////////
/**
 * Builds the filter that limits scanning to the subdirectories listed in
 * the index configuration. Each configured subdirectory is appended to the
 * given source root, and the normalized result is added to the filter.
 *
 * @param srcRootFile  Root directory the subdirectories are relative to.
 * @param cfgInfo      Configuration holding the subdirectory list.
 *
 * @return  The populated filter, or <code>null</code> when the
 *          configuration has no subdirectory list.
 */
private static SubDirFilter makeSubDirFilter(File srcRootFile, IndexerConfig cfgInfo)
{
  if (cfgInfo.indexInfo.subDirs != null)
  {
    SubDirFilter result = new SubDirFilter();
    for (String dirName : cfgInfo.indexInfo.subDirs) {
      String fullPath = srcRootFile.toString() + "/" + dirName;
      result.add(new File(Path.normalizePath(fullPath)));
    }
    return result;
  }

  // No subdirectory list was configured.
  return null;
}
//////////////////////////////////////////////////////////////////////////////
/**
 * Rotates a rotation-enabled index.
 *
 * Four sibling directories are involved, all derived from the "-new/"
 * index path: the new index itself, the current index (no suffix), the
 * pending index ("-pending/") and the spare index ("-spare/"). When a
 * rotation is warranted, the spare is brought up to date from the new
 * index, the new index is renamed to pending, and the spare is renamed
 * to new.
 *
 * @param cfgInfo  Configuration of the index to rotate; its index path is
 *                 expected to already end in "-new/".
 *
 * @throws IOException if anything goes wrong
 */
private static void doRotation(IndexerConfig cfgInfo)
  throws IOException
{
  // Derive the paths of the various versions of the index.
  String newPath = cfgInfo.indexInfo.indexPath;
  assert newPath.endsWith("-new/"); // Should have been modified above
  String xtfHome = cfgInfo.xtfHomePath;

  String currentPath = newPath.replaceFirst("-new/$", "/");
  String pendingPath = newPath.replaceFirst("-new/$", "-pending/");
  String sparePath = newPath.replaceFirst("-new/$", "-spare/");

  File newIndex = new File(Path.resolveRelOrAbs(xtfHome, newPath));
  File currentIndex = new File(Path.resolveRelOrAbs(xtfHome, currentPath));
  File pendingIndex = new File(Path.resolveRelOrAbs(xtfHome, pendingPath));
  File spareIndex = new File(Path.resolveRelOrAbs(xtfHome, sparePath));

  // If the new index is no newer than the current index, or no newer than
  // an index that is already pending, there is nothing to rotate.
  //
  File[] alreadyPublished = { currentIndex, pendingIndex };
  for (File published : alreadyPublished)
  {
    if (published.exists() &&
        IndexSync.newestTime(published).equals(IndexSync.newestTime(newIndex)))
    {
      Trace.info("Nothing has changed, so not rotating.");
      return;
    }
  }

  // A pending index that still exists means the servlets haven't picked it
  // up yet, so rotation has to wait.
  //
  if (pendingIndex.exists()) {
    Trace.info("A previous index is still pending, so not rotating.");
    return;
  }

  // Rotation needs a spare that mirrors the new index.
  if (spareIndex.exists())
  {
    // There is a spare already: update it from the new index, using
    // IndexSync rather than a brute-force directory scan.
    //
    Trace.info("Bringing spare index clone up to date:");
    Trace.tab();
    new IndexSync().syncDirs(cfgInfo.indexInfo.indexName, newIndex, spareIndex);
    Trace.untab();
    Trace.info("Done.");
  }
  else
  {
    // No spare yet: create it from scratch by cloning the entire new index.
    Trace.info("Creating spare index clone.");
    if (!spareIndex.mkdir())
      throw new IOException("Error creating spare index '" + spareIndex + "'");
    new DirSync().syncDirs(newIndex, spareIndex);
    Trace.more(Trace.info, " Done.");
  }

  // Everything is in place, so shift the directories along.
  Trace.info("Rotating Indexes: [pending] <- [new] <- [spare]");
  renameOrElse(newIndex, pendingIndex);
  renameOrElse(spareIndex, newIndex);
}
/**
 * Opens the index described by <code>cfgInfo</code> and runs the index
 * validator against it. If validation fails, an error is reported and the
 * process exits with status 1; otherwise the method simply returns.
 *
 * @param cfgInfo  Configuration of the index to validate.
 *
 * @throws IOException  if the index cannot be opened or validation itself
 *                      encounters an I/O problem.
 */
private static void doValidation(IndexerConfig cfgInfo) throws IOException
{
  // Open the index to be validated.
  String xtfHome = cfgInfo.xtfHomePath;
  String indexPath = Path.resolveRelOrAbs(xtfHome, cfgInfo.indexInfo.indexPath);
  String normalizedPath = Path.normalizePath(indexPath);

  // NOTE(review): the reader is not closed in this method -- confirm
  // whether IndexValidator.validate() takes ownership of it.
  IndexReader reader =
    IndexReader.open(NativeFSDirectory.getDirectory(normalizedPath));

  // Run the validation against it.
  IndexValidator validator = new IndexValidator();
  boolean valid = validator.validate(xtfHome, indexPath, reader);

  // A failed validation aborts the whole indexing run.
  if (!valid)
  {
    Trace.untab();
    Trace.error("Indexing aborted.");
    Trace.error("");
    System.exit(1);
  }
}
/**
 * Utility function that renames a file or directory, and throws an
 * exception if the rename fails.
 *
 * @param from  The existing file or directory.
 * @param to    The name it should have afterwards.
 *
 * @throws IOException  if the rename could not be performed.
 */
private static void renameOrElse(File from, File to) throws IOException
{
  boolean renamed = from.renameTo(to);
  if (renamed)
    return;

  throw new IOException("Error renaming '" + from + "' to '" + to + "'");
}
} // class textIndexer