package org.cdlib.xtf.textIndexer;
import java.io.File;
import java.io.IOException;
import java.text.DecimalFormat;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.cdlib.xtf.textEngine.IndexUtil;
import org.cdlib.xtf.textEngine.NativeFSDirectory;
import org.cdlib.xtf.util.Path;
import org.cdlib.xtf.util.Trace;
/**
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Acknowledgements:
*
* A significant amount of new and/or modified code in this module
* was made possible by a grant from the Andrew W. Mellon Foundation,
* as part of the Melvyl Recommender Project.
*/
/**
* This class calculates and prints out some useful statistics about an
* existing index, such as number of documents, size, etc.
*
* @author Martin Haye
*/
public class IndexStats
{
//////////////////////////////////////////////////////////////////////////////
/** Main entry-point for the statistics gatherer. <br><br>
*
* This function takes the command line arguments passed and uses them to
* find an index and calculate statistics for it.
* <br><br>
*
* @param args Command line arguments to process. The command line
* arguments required by the IndexStats program are as follows:
*
* <blockquote dir=ltr style="MARGIN-RIGHT: 0px"><code>
* <b>IndexStats {-config</b> <font color=#0000ff><i>CfgFilePath</i></font>}
* <b>-index</b> <font color=#0000ff><i>IndexName</i></font> }+
* </b></code></blockquote>
*
* For a complete description of each command line argument, see the
* {@link TextIndexer} class description.
* <br><br>
*
*/
public static void main(String[] args)
{
try
{
IndexerConfig cfgInfo = new IndexerConfig();
XMLConfigParser cfgParser = new XMLConfigParser();
int startArg = 0;
boolean showUsage = false;
// Regardless of whether we succeed or fail, say our name.
Trace.info("IndexStats v2.2");
Trace.info("");
// Make sure the XTF_HOME environment variable is specified.
cfgInfo.xtfHomePath = System.getProperty("xtf.home");
if (cfgInfo.xtfHomePath == null || cfgInfo.xtfHomePath.length() == 0) {
Trace.error("Error: xtf.home property not found");
return;
}
cfgInfo.xtfHomePath = Path.normalizePath(cfgInfo.xtfHomePath);
if (!new File(cfgInfo.xtfHomePath).isDirectory()) {
Trace.error(
"Error: xtf.home directory \"" + cfgInfo.xtfHomePath +
"\" does not exist or cannot be read.");
return;
}
// Process each index
for (;;)
{
// The minimum set of arguments consists of the name of an index
// to scan. That requires two args; if we don't get that many,
// we will show the usage text and bail.
//
if (args.length < 2)
showUsage = true;
// We have enough arguments, so...
else
{
// Read the command line arguments until we find what we
// need to do some work, or until we run out.
//
int ret = cfgInfo.readCmdLine(args, startArg);
// If we didn't find enough command line arguments...
if (ret == -1)
{
// And this is the first time we've visited the command
// line arguments, avoid trying to doing work and just
// display the usage text. Otherwise, we're done.
//
if (startArg == 0)
showUsage = true;
else
break;
} // if( ret == -1 )
// We did find enough command line arguments, so...
//
else
{
// Make sure the configuration path is absolute
if (!(new File(cfgInfo.cfgFilePath).isAbsolute())) {
cfgInfo.cfgFilePath = Path.resolveRelOrAbs(cfgInfo.xtfHomePath,
cfgInfo.cfgFilePath);
}
// Get the configuration for the index specified by the
// current command line arguments.
//
if (cfgParser.configure(cfgInfo) < 0) {
Trace.error(
"Error: index '" + cfgInfo.indexInfo.indexName +
"' not found\n");
return;
}
} // else( ret != -1 )
// Save the new start argument for the next time.
startArg = ret;
} // else( args.length >= 4 )
// If the config file was read successfully, we can begin processing.
if (showUsage)
{
// Do so...
Trace.error(" usage: ");
Trace.tab();
Trace.error(
"indexStats {-config <configfile>}? " +
"-index <indexname>}+ \n\n");
Trace.untab();
// And then bail.
return;
} // if( showUsage )
try
{
// Say what index we're working on.
Trace.info("Index: \"" + cfgInfo.indexInfo.indexName + "\"");
Trace.tab();
// Output general information about the index
Trace.info("");
Trace.info("Configuration Info...");
Trace.tab();
Trace.info(
"Chunk Size = " + cfgInfo.indexInfo.getChunkSize() +
", Overlap = " + cfgInfo.indexInfo.getChunkOvlp());
Trace.info("Index Path = " +
Path.resolveRelOrAbs(cfgInfo.xtfHomePath,
cfgInfo.indexInfo.indexPath));
Trace.info("Data Path = " +
Path.resolveRelOrAbs(cfgInfo.xtfHomePath,
cfgInfo.indexInfo.sourcePath));
Trace.info("Stop Words = " + cfgInfo.indexInfo.stopWords);
Trace.untab();
// Calculate and print the remaining statistics
Trace.info("");
Trace.info("Statistics...");
Trace.tab();
calcStats(cfgInfo);
Trace.info("");
Trace.untab();
Trace.untab();
Trace.info("Done.");
Trace.info("");
} // try to index a document
catch (Exception e)
{
Trace.clearTabs();
Trace.error("*** Last Chance Exception: " + e.getClass());
Trace.error(" With message: " + e.getMessage());
Trace.error("");
e.printStackTrace(System.out);
return;
}
catch (Throwable t) {
Trace.clearTabs();
Trace.error("*** Last Chance Exception: " + t.getClass());
Trace.error("");
t.printStackTrace(System.out);
return;
}
} // for(;;)
} // try
// Log any unhandled exceptions.
catch (Exception e) {
Trace.error("*** Last Chance Exception: " + e.getClass());
Trace.error(" With message: " + e.getMessage());
Trace.error("");
e.printStackTrace(System.out);
}
catch (Throwable t) {
Trace.error("*** Last Chance Exception: " + t.getClass());
Trace.error(" With message: " + t);
Trace.error("");
t.printStackTrace(System.out);
}
// Exit successfully.
return;
} // main()
//////////////////////////////////////////////////////////////////////////////
private static void calcStats(IndexerConfig cfgInfo)
throws IOException
{
IndexInfo idxInfo = cfgInfo.indexInfo;
DecimalFormat decFmt = (DecimalFormat)DecimalFormat.getInstance();
decFmt.setMaximumFractionDigits(1);
// Try to open the index for reading. If we fail and throw, skip the
// index.
//
String idxPath = Path.resolveRelOrAbs(cfgInfo.xtfHomePath, idxInfo.indexPath);
File idxFile = new File(idxPath);
IndexReader indexReader = IndexReader.open(NativeFSDirectory.getDirectory(idxPath));
// Give an estimate of the total number of documents.
long totalDocs = indexReader.docFreq(new Term("docInfo", "1"));
long totalChunks = indexReader.numDocs();
Trace.info("Total Documents (Records) = " + totalDocs);
Trace.info("Total Chunks = " + totalChunks);
if (indexReader.hasDeletions())
Trace.more(Trace.info, " (total includes some deleted documents)");
Trace.info(
"Avg Chunks Per Doc/Rec = " +
decFmt.format((totalChunks + 1) / (double)(totalDocs + 1)));
// Calculate the total number of files.
int totalFiles = 0;
TermEnum termEnum = indexReader.terms(new Term("key", ""));
do {
Term term = termEnum.term();
if (term == null || !term.field().equals("key"))
break;
totalFiles++;
} while (termEnum.next());
Trace.info("Total Number of Src Files = " + totalFiles);
Trace.info(
"Avg Docs/Recs Per File = " +
decFmt.format((totalDocs + 1) / (double)(totalFiles + 1)));
// Now calculate the size of the Lucene index files.
long totalLuceneSize = 0;
if (idxFile.isDirectory())
{
String[] children = idxFile.list();
for (int i = 0; i < children.length; i++) {
File child = new File(idxFile, children[i]);
if (child.isFile())
totalLuceneSize += child.length();
}
}
Trace.info("Size of Lucene Index = " + printBig(totalLuceneSize));
// Now calculate the size of the lazy files
termEnum = indexReader.terms(new Term("key", ""));
File xtfHomeFile = new File(cfgInfo.xtfHomePath);
long totalSrcSize = 0;
long totalLazySize = 0;
long filesDone = 0;
long prevPercent = 0;
do
{
Term term = termEnum.term();
if (term == null || !term.field().equals("key"))
break;
long percentDone = filesDone * 100 / totalFiles;
if (filesDone > 500 && percentDone >= (prevPercent + 5)) {
if (prevPercent == 0)
Trace.info("");
Trace.more(Trace.info,
"\r (Calculating Source/Lazy File Sizes... " +
percentDone + "%)");
prevPercent = percentDone;
}
filesDone++;
// Get the key, which contains the index name and the path from its
// source directory.
//
String key = term.text();
assert key.indexOf(':') >= 0 : "Invalid index key - missing ':'";
String indexName = key.substring(0, key.indexOf(':'));
String relPath = key.substring(key.indexOf(':') + 1);
// Skip documents that aren't part of the index we want.
if (!indexName.equals(idxInfo.indexName))
continue;
// Create a reference to the source XML document.
String sourceDir = Path.resolveRelOrAbs(cfgInfo.xtfHomePath,
idxInfo.sourcePath);
String currPath = Path.resolveRelOrAbs(sourceDir, relPath);
File currFile = new File(currPath);
// Add up the size of the source files
totalSrcSize += currFile.length();
// Also add up the size of the lazy files.
File lazyFile = IndexUtil.calcLazyPath(xtfHomeFile, idxInfo, currFile,
false);
totalLazySize += lazyFile.length();
} while (termEnum.next());
if (prevPercent > 0 && prevPercent < 100) {
Trace.more(Trace.info,
"\r (Calculating Source/Lazy File Sizes... 100%)");
Trace.info("");
}
// Print out the total sizes.
Trace.info("Size of Source Files = " + printBig(totalSrcSize));
Trace.info("Size of Lazy Trees = " + printBig(totalLazySize));
Trace.info(
"Total Index Size = " +
printBig(totalLuceneSize + totalLazySize) + " (Lucene + Lazy)");
// Close the term enumeration and reader.
termEnum.close();
termEnum = null;
indexReader.close();
indexReader = null;
} // calcStats()
//////////////////////////////////////////////////////////////////////////////
private static String printBig(long num) {
float mBytes = num / 1024.0f / 1024.0f;
mBytes = ((int)(mBytes * 100)) / 100.0f;
return mBytes + " Mb";
} // printBig()
} // class IndexStats