package org.cdlib.xtf.textIndexer;
/**
* Copyright (c) 2005, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* Acknowledgements:
*
* A significant amount of new and/or modified code in this module
* was made possible by a grant from the Andrew W. Mellon Foundation,
* as part of the Melvyl Recommender Project.
*/
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Vector;
import org.apache.lucene.chunk.DocNumMap;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.cdlib.xtf.textEngine.Constants;
import org.cdlib.xtf.textEngine.XtfSearcher;
import org.cdlib.xtf.util.Path;
import org.cdlib.xtf.util.Trace;
/**
* This class dumps the contents of user-selected fields from an XTF text
* index.
*
* @author Martin Haye
*/
public class IndexDump
{
//////////////////////////////////////////////////////////////////////////////
/** Main entry-point for the index dumper. <br><br>
*
* This function takes the command line arguments passed and uses them to
* find an index and print out fields in it.
*/
public static void main(String[] args)
{
try
{
IndexerConfig cfgInfo = new IndexerConfig();
XMLConfigParser cfgParser = new XMLConfigParser();
int startArg = 0;
boolean showUsage = false;
boolean termFreqMode = false;
boolean allFieldsMode = false;
boolean xmlMode = false;
// Make sure the XTF_HOME environment variable is specified.
cfgInfo.xtfHomePath = System.getProperty("xtf.home");
if (cfgInfo.xtfHomePath == null || cfgInfo.xtfHomePath.length() == 0) {
Trace.error("Error: xtf.home property not found");
return;
}
cfgInfo.xtfHomePath = Path.normalizePath(cfgInfo.xtfHomePath);
if (!new File(cfgInfo.xtfHomePath).isDirectory()) {
Trace.error(
"Error: xtf.home directory \"" + cfgInfo.xtfHomePath +
"\" does not exist or cannot be read.");
return;
}
// Write output in UTF-8 format.
Writer out = new OutputStreamWriter(System.out, "UTF-8");
// Process each index
for (;;)
{
// The minimum set of arguments consists of the name of an index
// to scan and a field to dump. That requires three args; if we don't
// get that many, we will show the usage text and bail.
//
if (args.length < 3)
showUsage = true;
// We have enough arguments, so...
else
{
// Read the command line arguments until we find what we
// need to do some work, or until we run out.
//
int ret = cfgInfo.readCmdLine(args, startArg);
// If we didn't find enough command line arguments...
if (ret == -1)
{
// And this is the first time we've visited the command
// line arguments, avoid trying to doing work and just
// display the usage text. Otherwise, we're done.
//
if (startArg == 0)
showUsage = true;
else
break;
} // if( ret == -1 )
// We did find enough command line arguments, so...
//
else
{
// Make sure the configuration path is absolute
if (!(new File(cfgInfo.cfgFilePath).isAbsolute())) {
cfgInfo.cfgFilePath = Path.resolveRelOrAbs(cfgInfo.xtfHomePath,
cfgInfo.cfgFilePath);
}
// Get the configuration for the index specified by the
// current command line arguments.
//
if (cfgParser.configure(cfgInfo) < 0) {
Trace.error(
"Error: index '" + cfgInfo.indexInfo.indexName +
"' not found\n");
return;
}
} // else( ret != -1 )
// Save the new start argument for the next time.
startArg = ret;
} // else( args.length >= 4 )
// Parse additional mode parameters
Vector fieldNames = new Vector();
while (startArg < args.length)
{
// Is term frequency mode enabled?
if (args[startArg].equalsIgnoreCase("-termFreq"))
{
startArg++;
termFreqMode = true;
}
// Is all fields mode enabled?
else if (args[startArg].equalsIgnoreCase("-allFields"))
{
startArg++;
allFieldsMode = true;
}
// Is XML mode enabled?
else if (args[startArg].equalsIgnoreCase("-xml"))
{
startArg++;
xmlMode = true;
}
// Is a field name specified?
else if (args[startArg].equals("-field"))
{
startArg++;
if (startArg == args.length || args[startArg].startsWith("-"))
showUsage = true;
else
{
if (args[startArg].equals("text") && !termFreqMode) {
Trace.error("Error: contents of the 'text' field cannot be dumped");
System.exit(1);
}
fieldNames.add(args[startArg]);
startArg++;
}
}
// Barf on other parameters
else {
showUsage = true;
break;
}
}
// Do a little checking for sanity
if ((allFieldsMode && !fieldNames.isEmpty()) ||
(!allFieldsMode && fieldNames.isEmpty()))
showUsage = true;
String[] fieldNameArray = (String[])fieldNames.toArray(
new String[fieldNames.size()]);
// If the config file was read successfully, we can begin processing.
if (showUsage)
{
// Do so...
Trace.error(" usage: ");
Trace.tab();
Trace.error(
"indexDump {-config <configfile>} -index <indexname> " +
"{-xml} {-termFreq} {-allFields|-field fieldName1 {-field fieldName2}*}... \n\n");
Trace.untab();
// And then bail.
System.exit(1);
} // if( showUsage )
// Try to open the index for reading. If we fail and throw, skip the
// index.
//
IndexInfo idxInfo = cfgInfo.indexInfo;
String idxPath = Path.resolveRelOrAbs(cfgInfo.xtfHomePath,
idxInfo.indexPath);
XtfSearcher searcher = new XtfSearcher(idxPath, 30);
IndexReader indexReader = searcher.indexReader();
DocNumMap docNumMap = searcher.docNumMap();
// Go for it.
if (termFreqMode)
dumpTermFreqs(indexReader, docNumMap, fieldNameArray, out);
else
dumpFields(indexReader, fieldNameArray, xmlMode, allFieldsMode, out);
// Close the index reader, and make sure all output is displayed.
indexReader.close();
out.flush();
} // for(;;)
} // try
// Log any unhandled exceptions.
catch (Exception e) {
Trace.error("*** Last Chance Exception: " + e.getClass());
Trace.error(" With message: " + e.getMessage());
Trace.error("");
e.printStackTrace(System.out);
System.exit(1);
}
catch (Throwable t) {
Trace.error("*** Last Chance Exception: " + t.getClass());
Trace.error(" With message: " + t);
Trace.error("");
t.printStackTrace(System.out);
System.exit(1);
}
// Exit successfully.
System.exit(0);
} // main()
//////////////////////////////////////////////////////////////////////////////
private static void dumpDelimitedRecord(ArrayList<Field> fieldData, Writer out)
throws IOException
{
String prevName = null;
for (Field f : fieldData)
{
if (prevName != null) {
if (f.name().equals(prevName))
out.write(";");
else
out.write("|");
}
prevName = f.name();
out.write(stripValue(f.stringValue(), true));
}
out.write("|\n");
}
//////////////////////////////////////////////////////////////////////////////
private static void dumpXmlRecord(ArrayList<Field> fieldData, Writer out)
throws IOException
{
out.write(" <document>\n");
for (Field f : fieldData)
{
out.write(" <" + f.name() + ">");
out.write(stripValue(f.stringValue(), false));
out.write("</" + f.name() + ">\n");
}
out.write(" </document>\n");
}
//////////////////////////////////////////////////////////////////////////////
private static void dumpFields(IndexReader indexReader, String[] fieldNames,
boolean xmlMode, boolean allFieldsMode, Writer out)
throws IOException
{
if (xmlMode) {
out.write("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");
out.write("<xtfIndexDocuments>\n");
}
// Iterate every document.
int maxDoc = indexReader.maxDoc();
for (int i = 0; i < maxDoc; i++)
{
// Skip deleted docs
if (indexReader.isDeleted(i))
continue;
// Skip non-metadata docs (e.g. indexInfo, text blocks)
Document doc = indexReader.document(i);
if (doc.getField("docInfo") == null)
continue;
// See if any of the desired fields are present, and if so, record
// their values.
//
ArrayList<Field> toPrint = new ArrayList();
if (allFieldsMode) {
for (Field f : (List<Field>)doc.getFields())
{
// Only output user fields (i.e. skip XTF-internal fields)
if (!f.name().matches("^(docInfo|chunkCount|key|fileDate)$"))
toPrint.add(f);
}
}
else
{
for (int j = 0; j < fieldNames.length; j++)
{
Field[] got = doc.getFields(fieldNames[j]);
if (got != null)
toPrint.addAll(Arrays.asList(got));
}
}
if (!toPrint.isEmpty())
{
if (xmlMode)
dumpXmlRecord(toPrint, out);
else
dumpDelimitedRecord(toPrint, out);
}
}
if (xmlMode)
out.write("</xtfIndexDocuments>\n");
} // dumpFields()
//////////////////////////////////////////////////////////////////////////////
private static void dumpTermFreqs(IndexReader indexReader,
DocNumMap docNumMap, String[] fields,
Writer out)
throws IOException
{
TermDocs docs = indexReader.termDocs();
// Iterate every field.
for (int i = 0; i < fields.length; i++)
{
// Iterate all the terms for this field.
TermEnum terms = indexReader.terms(new Term(fields[i], ""));
while (terms.next())
{
Term t = terms.term();
if (!t.field().equals(fields[i]))
break;
// Skip bi-grams
String text = t.text();
if (text.indexOf("~") >= 0)
continue;
// Skip empty terms (there shouldn't be any though)
if (text.length() == 0)
continue;
// Skip special start/end of field marks (normal terms will also
// be present, without the marks.) Also skip element and attribute
// markers.
//
char c = text.charAt(0);
if (c == Constants.FIELD_START_MARKER ||
c == Constants.ELEMENT_MARKER ||
c == Constants.ATTRIBUTE_MARKER)
{
continue;
}
c = text.charAt(text.length() - 1);
if (c == Constants.FIELD_END_MARKER ||
c == Constants.ELEMENT_MARKER ||
c == Constants.ATTRIBUTE_MARKER)
{
continue;
}
// Okay, we have a live one. Accumulate the total occurrences of
// the term in all documents. For the benefit of the 'text' field,
// accumulate chunk counts into the main document.
//
int prevMainDoc = -1;
int docFreq = 0;
docs.seek(terms);
int termFreq = 0;
while (docs.next())
{
int mainDoc = docs.doc();
if (t.field().equals("text"))
mainDoc = docNumMap.getDocNum(docs.doc());
if (mainDoc != prevMainDoc) {
++docFreq;
prevMainDoc = mainDoc;
}
termFreq += docs.freq();
}
// Output the results.
out.write(
fields[i] + "|" + docFreq + "|" + termFreq + "|" + t.text() + "\n");
} // while
} // for i
} // dumpTermFreqs()
//////////////////////////////////////////////////////////////////////////////
/**
* Removes XTF's special characters (such as bump markers and field start/end
* markers) from the input string. Also changes characters we use for
* field and value markers ('|' and ';') to something else so they won't
* be taken for markers.
*/
private static String stripValue(String str, boolean changeDelimiters)
{
char[] in = str.toCharArray();
char[] out = new char[in.length * 2];
int outLen = 0;
for (int i = 0; i < in.length; i++)
{
switch (in[i]) {
case Constants.FIELD_START_MARKER:
case Constants.FIELD_END_MARKER:
case Constants.NODE_MARKER:
break;
case Constants.BUMP_MARKER:
i++;
while (i < in.length && in[i] != Constants.BUMP_MARKER)
i++;
if (i < in.length)
i++;
out[outLen++] = ';';
break;
case ';':
out[outLen++] = changeDelimiters ? ',' : in[i];
break;
case '|':
out[outLen++] = changeDelimiters ? '.' : in[i];
break;
case '\n':
if (changeDelimiters) {
out[outLen++] = '\\';
out[outLen++] = 'n';
}
else
out[outLen++] = in[i];
break;
default:
out[outLen++] = in[i];
break;
}
} // for i
return new String(out, 0, outLen);
} // stripValue()
} // class IndexDump