package org.cdlib.xtf.textIndexer;
/**
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.StringTokenizer;
import java.util.Vector;
import javax.xml.transform.Templates;
import javax.xml.transform.Transformer;
import javax.xml.transform.sax.SAXSource;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.tree.TreeBuilder;
import org.apache.lucene.util.StringUtil;
import org.cdlib.xtf.cache.Dependency;
import org.cdlib.xtf.cache.FileDependency;
import org.cdlib.xtf.servletBase.StylesheetCache;
import org.cdlib.xtf.textEngine.IndexUtil;
import org.cdlib.xtf.util.EasyNode;
import org.cdlib.xtf.util.Path;
import org.cdlib.xtf.util.StructuredStore;
import org.cdlib.xtf.util.SubDirFilter;
import org.cdlib.xtf.util.Trace;
import org.cdlib.xtf.util.XMLWriter;
import org.xml.sax.InputSource;
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
/**
* This class is the main processing shell for files in the source text
* tree. It optimizes Lucene database access by opening the index once at
* the beginning, processing all the source files in the source tree
* (including skipping non-source XML files in the tree), and closing the
* database at the end. <br><br>
*
* Internally, this class uses the {@link org.cdlib.xtf.textIndexer.XMLTextProcessor}
* class to actually split the source files up into chunks and add them to the
* Lucene index.
*
*/
public class SrcTreeProcessor
{
/** Indexer configuration in effect; stored by open() and set back to null by close(). */
private IndexerConfig cfgInfo;
/** Text processor created in the constructor; opens/closes the index and queues files. */
private XMLTextProcessor textProcessor;
/**
 * Cache used to look up the docSelector, pre-filter and display stylesheets,
 * and to enumerate the docSelector's dependencies.
 * NOTE(review): meaning of the (100, 0, true) constructor arguments is not
 * visible here -- confirm against StylesheetCache.
 */
private StylesheetCache stylesheetCache = new StylesheetCache(100, 0, true);
/** Compiled docSelector stylesheet, obtained from stylesheetCache in open(). */
private Templates docSelector;
/** Running count of non-directory files seen by processDir(); drives the progress dots. */
private int nScanned = 0;
/** Scratch buffer for the &lt;directory&gt; XML document handed to the docSelector. */
private StringBuffer docBuf = new StringBuffer(1024);
/** Scratch buffer for the "name:lastModified" lines used as a directory's cache signature. */
private StringBuffer dirBuf = new StringBuffer(1024);
/** Resolved path of the docSelector stylesheet; set in open(). */
private String docSelPath;
/** Location of the persisted docSelector cache ("docSelect.cache" appended to the index path). */
private File docSelCacheFile;
/** In-memory docSelector cache, keyed by directory key. */
private DocSelCache docSelCache = new DocSelCache();
////////////////////////////////////////////////////////////////////////////
/** Default constructor. <br><br>
 *
 * The only work done here is creating the
 * {@link org.cdlib.xtf.textIndexer.XMLTextProcessor} instance that this
 * class hands every selected source file to. <br><br>
 */
public SrcTreeProcessor()
{
  // A single text processor is shared by all files found in the tree.
  this.textProcessor = new XMLTextProcessor();
} // SrcTreeProcessor()
////////////////////////////////////////////////////////////////////////////
/** Indexing open function. <br><br>
 *
 * Stores the configuration, obtains the docSelector stylesheet from the
 * stylesheet cache, loads any previously saved docSelector cache, and then
 * calls the {@link org.cdlib.xtf.textIndexer.XMLTextProcessor}
 * {@link org.cdlib.xtf.textIndexer.XMLTextProcessor#open(String, IndexInfo, boolean, boolean) open()}
 * method to actually create/open the Lucene index. <br><br>
 *
 * Note that this method modifies <code>cfgInfo</code>: a missing
 * <code>xtfHomePath</code> is filled in with the directory of the config
 * file, and <code>clean</code> is reset to <code>false</code> once the
 * index has been opened. <br><br>
 *
 * @param  cfgInfo  The {@link org.cdlib.xtf.textIndexer.IndexerConfig IndexerConfig}
 *                  that identifies the Lucene index, source text tree, and
 *                  other parameters required to perform indexing. <br><br>
 *
 * @throws Exception  An <code>IOException</code> if no XTF home directory
 *                    can be derived from the config file path, plus any
 *                    exception raised while locating the docSelector
 *                    stylesheet or by the
 *                    {@link org.cdlib.xtf.textIndexer.XMLTextProcessor}
 *                    {@link org.cdlib.xtf.textIndexer.XMLTextProcessor#open(String, IndexInfo, boolean, boolean) open()}
 *                    method. <br><br>
 */
public void open(IndexerConfig cfgInfo)
  throws Exception
{
  // Hang on to a reference to the config info.
  this.cfgInfo = cfgInfo;

  // If no XTF home directory specified, assume it is the same
  // directory as the config file.
  //
  if (cfgInfo.xtfHomePath == null)
  {
    File cfgFile = new File(cfgInfo.cfgFilePath);
    File cfgDir = cfgFile.getParentFile();

    // A bare file name (e.g. "index.conf") has no parent component, so
    // getParentFile() yields null. Fall back to the absolute form, which
    // resolves against the current working directory.
    //
    if (cfgDir == null)
      cfgDir = cfgFile.getAbsoluteFile().getParentFile();
    if (cfgDir == null) {
      throw new IOException(
        "Cannot determine XTF home directory from config file path '" +
        cfgInfo.cfgFilePath + "'");
    }
    cfgInfo.xtfHomePath = cfgDir.toString();
  }

  // Make a transformer for the docSelector stylesheet.
  docSelPath = Path.resolveRelOrAbs(cfgInfo.xtfHomePath,
                                    cfgInfo.indexInfo.docSelectorPath);
  docSelector = stylesheetCache.find(docSelPath);

  // Load the previous docSelector cache (if any). This must happen while
  // cfgInfo.clean still holds the caller's value, since loadCache() uses it
  // to decide whether to delete the old cache file.
  //
  loadCache(cfgInfo);

  // Open the Lucene index specified by the config info.
  textProcessor.open(cfgInfo.xtfHomePath, cfgInfo.indexInfo, cfgInfo.clean,
                     cfgInfo.force);

  // NOTE(review): presumably reset so that a later open() with the same
  // config object does not clean the index a second time -- confirm.
  cfgInfo.clean = false;
} // open()
////////////////////////////////////////////////////////////////////////////
/** Indexing close function. <br><br>
 *
 * Calls the {@link org.cdlib.xtf.textIndexer.XMLTextProcessor}
 * {@link org.cdlib.xtf.textIndexer.XMLTextProcessor#processQueuedTexts() processQueuedTexts()}
 * method to flush all the pending Lucene writes to disk, and then saves the
 * docSelector cache. Whether or not those steps succeed, the reference to
 * the config info is dropped and the
 * {@link org.cdlib.xtf.textIndexer.XMLTextProcessor}
 * {@link org.cdlib.xtf.textIndexer.XMLTextProcessor#close() close()}
 * method is called so the index is not left open after a failure. <br><br>
 *
 * @throws IOException  Any I/O exceptions generated while flushing the
 *                      queued texts, or by the
 *                      {@link org.cdlib.xtf.textIndexer.XMLTextProcessor}
 *                      {@link org.cdlib.xtf.textIndexer.XMLTextProcessor#close() close()}
 *                      method after a successful flush. <br><br>
 *
 */
public void close()
  throws IOException
{
  boolean flushed = false;
  try
  {
    // Flush the remaining open documents.
    textProcessor.processQueuedTexts();

    // Save the doc selector cache. We do this *after* processing the texts,
    // in case something catastrophic happens in there.
    //
    saveCache();
    flushed = true;
  }
  finally
  {
    // Let go of the config info now that we're done with it.
    cfgInfo = null;

    // Close the index database.
    if (flushed)
      textProcessor.close();
    else
    {
      // A failure from the flush/save is already propagating. Still release
      // the index, but don't let a secondary problem while closing replace
      // the original exception.
      //
      try {
        textProcessor.close();
      }
      catch (Exception e) {
        Trace.warning(
          "Warning: Error closing index after failed flush: " + e);
      }
    }
  }
} // close()
////////////////////////////////////////////////////////////////////////////
/** Work out the path of the index directory. <br><br>
 *
 * The configured index path is combined with the XTF home directory by
 * <code>Path.resolveRelOrAbs()</code> and the result is passed through
 * <code>Path.normalizePath()</code>. Requires the config info to have been
 * stored by open(). <br><br>
 *
 * @return  The normalized index path. <br><br>
 */
String calcIndexPath()
{
  return Path.normalizePath(
    Path.resolveRelOrAbs(cfgInfo.xtfHomePath, cfgInfo.indexInfo.indexPath));
}
////////////////////////////////////////////////////////////////////////////
/** Load the previous docSelector cache. <br><br>
 *
 * The in-memory cache is emptied and stamped with the list of file
 * dependencies of the current docSelector stylesheet. The saved cache file
 * is then read, unless a clean index was requested (in which case the file
 * is deleted) or the file is not readable. A saved cache whose recorded
 * dependencies differ from the current ones is deleted and discarded. In
 * every outcome the cache ends up carrying the current dependency list, so
 * that a later save records the right value. <br><br>
 *
 * This method relies on the docSelector path and the stored config info
 * (used to locate the index directory) having been set up by open(). <br><br>
 *
 * @param  cfgInfo  The {@link org.cdlib.xtf.textIndexer.IndexerConfig IndexerConfig}
 *                  that identifies the Lucene index, source text tree, and
 *                  other parameters required to perform indexing. Only its
 *                  <code>clean</code> flag is consulted here. <br><br>
 */
public void loadCache(IndexerConfig cfgInfo)
{
  docSelCache.clear();

  // Figure out the path to the cache file
  docSelCacheFile = new File(calcIndexPath() + "docSelect.cache");

  // Calculate all the file dependencies of the docSelector stylesheet.
  StringBuffer depBuf = new StringBuffer();
  for (Iterator iter = stylesheetCache.getDependencies(docSelPath);
       iter.hasNext();)
  {
    Dependency d = (Dependency)iter.next();
    if (d instanceof FileDependency) {
      depBuf.append(d.toString());
      depBuf.append("\n");
    }
  }
  String thisDep = depBuf.toString();
  docSelCache.dependencies = thisDep;

  // If we're making a clean index, delete the old cache file.
  if (cfgInfo.clean) {
    docSelCacheFile.delete();
    return;
  }

  // If the cache file doesn't exist, don't load it.
  if (!docSelCacheFile.canRead())
    return;

  // Read the file.
  try {
    docSelCache.load(docSelCacheFile);
  }
  catch (IOException e)
  {
    Trace.warning(
      "Warning: Error loading docSelector cache \"" + docSelCacheFile +
      "\": " + e);
    docSelCache.clear();

    // A partial load may already have replaced the dependency list; put
    // the current one back so the cache we eventually save is stamped
    // correctly.
    //
    docSelCache.dependencies = thisDep;
    return;
  }

  // If the dependencies are different, toss it. The comparison is made from
  // the known non-null side in case the loaded value is missing.
  //
  if (!thisDep.equals(docSelCache.dependencies))
  {
    Trace.debug(
      "Note: docSelector stylesheet or sub-sheet " +
      " has changed... throwing away " + "old docSelector cache.");
    docSelCacheFile.delete();
    docSelCache.clear();
    docSelCache.dependencies = thisDep;
  }
} // loadCache()
////////////////////////////////////////////////////////////////////////////
/** Save the docSelector cache. <br><br>
 *
 * Writes the in-memory docSelector cache to the cache file whose location
 * was established by loadCache(). If no location has been established yet
 * (loadCache() has not run), there is nowhere to write and the call does
 * nothing. An I/O failure while writing is reported as a warning rather
 * than propagated. <br><br>
 */
public void saveCache()
{
  // Nothing to do if we never got far enough to know where the cache lives.
  if (docSelCacheFile == null)
    return;

  try {
    docSelCache.save(docSelCacheFile);
  }
  catch (IOException e) {
    Trace.warning(
      "Warning: Error writing docSelector cache \"" + docSelCacheFile + "\": " + e);
  }
} // saveCache()
////////////////////////////////////////////////////////////////////////////
/** Process a directory containing source files. <br><br>
 *
 * The directory's non-directory entries are described in a small
 * <code>&lt;directory&gt;</code> XML document which is run through the
 * docSelector stylesheet; every <code>indexFile</code> element the
 * stylesheet returns (optionally wrapped in <code>indexFiles</code>) is
 * handed to {@link #processFile(String, EasyNode) processFile()}. The
 * outcome is remembered in the docSelector cache together with a
 * name/modification-time signature of the directory, so the stylesheet is
 * skipped next time if nothing has changed and indexing isn't forced.
 * Afterwards the sub-directories are processed recursively, unless files
 * were selected here and <code>scanAllDirs</code> is off. <br><br>
 *
 * If the stylesheet returns an element other than <code>indexFiles</code>
 * or <code>indexFile</code>, an error is logged and the method returns
 * immediately, without updating the cache or visiting sub-directories.
 * <br><br>
 *
 * @param  curDir        The current directory to be processed. <br>
 * @param  subDirFilter  Sub-dirs to scan, or null for all. <br>
 * @param  topLevel      true for the top-level directory, false else. <br>
 *
 * @throws Exception  Any exceptions generated internally
 *                    by the <code>File</code> class, the docSelector
 *                    transformation, or the
 *                    {@link org.cdlib.xtf.textIndexer.XMLTextProcessor}
 *                    class. <br><br>
 *
 */
public void processDir(File curDir, SubDirFilter subDirFilter, boolean topLevel)
  throws Exception
{
  // A filter, when supplied, decides whether we look at this directory at all.
  if (subDirFilter != null && !subDirFilter.approve(curDir))
    return;

  // Ask the file system what the directory contains.
  String[] names = curDir.getAbsoluteFile().list();
  if (names == null) {
    Trace.warning(
      "Warning: error retrieving file list for directory: " + curDir);
    return;
  }

  // Visit entries in sorted order. The signature built below is compared
  // as a plain string, so the order has to be the same from run to run.
  //
  ArrayList sortedNames = new ArrayList(names.length);
  for (int n = 0; n < names.length; n++)
    sortedNames.add(names[n]);
  Collections.sort(sortedNames);

  // First pass: plain files only. Build the <directory> document for the
  // docSelector, and alongside it the name:time signature for the cache.
  //
  docBuf.setLength(0);
  dirBuf.setLength(0);
  String dirPath = Path.normalizePath(curDir.toString());
  docBuf.append("<directory dirPath=\"");
  docBuf.append(StringUtil.escapeHTMLChars(dirPath));
  docBuf.append("\">\n");

  int nFiles = 0;
  for (int n = 0; n < sortedNames.size(); n++)
  {
    File entry = new File(curDir, (String)sortedNames.get(n));
    if (entry.getAbsoluteFile().isDirectory())
      continue;

    docBuf.append(" <file fileName=\"");
    docBuf.append(StringUtil.escapeHTMLChars(entry.getName()));
    docBuf.append("\"/>\n");

    dirBuf.append(StringUtil.escapeHTMLChars(entry.getName()));
    dirBuf.append(':');
    dirBuf.append(entry.lastModified());
    dirBuf.append("\n");

    ++nFiles;

    // Emit a dot every 200 files so the user can see progress on big trees.
    int scannedBefore = nScanned++;
    if (scannedBefore % 200 == 0)
      Trace.more(Trace.info, ".");
  }
  docBuf.append("</directory>\n");

  // Snapshot the shared buffers now; the recursive calls further down
  // reuse them.
  //
  String inStr = docBuf.toString();
  String filesAndTimes = dirBuf.toString();

  // Key under which this directory is remembered in the cache.
  String dirKey = topLevel
    ? cfgInfo.indexInfo.indexName + ":/"
    : IndexUtil.calcDocKey(new File(cfgInfo.xtfHomePath),
                           cfgInfo.indexInfo, curDir);

  // Decide whether the stylesheet has to run. It doesn't for a directory
  // with no plain files, nor when an unforced cache entry still matches.
  //
  boolean anyProcessed = false;
  boolean runStylesheet = false;
  if (nFiles != 0)
  {
    DocSelCache.Entry cached = (DocSelCache.Entry)docSelCache.get(dirKey);
    if (cached == null)
      runStylesheet = true;
    else if (!cfgInfo.force && cached.filesAndTimes.equals(filesAndTimes))
      anyProcessed = cached.anyProcessed;
    else {
      docSelCache.remove(dirKey);
      runStylesheet = true;
    }
  }

  if (runStylesheet)
  {
    InputSource docSelectorInput = new InputSource(new StringReader(inStr));
    if (Trace.getOutputLevel() >= Trace.debug) {
      Trace.debug("*** docSelector input ***\n" + inStr);
      Trace.debug("");
    }

    // Run the docSelector and capture its output as a tree.
    TreeBuilder tree = new TreeBuilder();
    Transformer docSelectorTrans = docSelector.newTransformer();
    docSelectorTrans.transform(new SAXSource(docSelectorInput), tree);
    NodeInfo result = tree.getCurrentRoot();

    if (Trace.getOutputLevel() >= Trace.debug) {
      Trace.debug("*** docSelector output ***\n" +
                  XMLWriter.toString(result));
      Trace.debug("");
    }

    // Walk the result elements. An 'indexFiles' wrapper is entered by
    // making it the new parent and restarting from its first child.
    //
    EasyNode parent = new EasyNode(result);
    int idx = 0;
    while (idx < parent.nChildren())
    {
      EasyNode child = parent.child(idx++);
      if (!child.isElement())
        continue;

      String tagName = child.name();
      if (tagName.equalsIgnoreCase("indexFiles")) {
        parent = child;
        idx = 0;
        continue;
      }

      if (!tagName.equalsIgnoreCase("indexFile")) {
        Trace.error(
          "Error: docSelector returned unknown element '" + tagName + "'");
        return;
      }

      if (processFile(dirPath, child))
        anyProcessed = true;
    } // while

    // Remember the outcome so the stylesheet can be skipped next time
    // (that is, unless the directory contents or stylesheet are different).
    //
    docSelCache.put(dirKey, new DocSelCache.Entry(filesAndTimes, anyProcessed));
  } // if runStylesheet

  // In the old mode (scanAllDirs = false), if we found any files to process,
  // the convention is that subdirectories contain files related to the ones
  // we processed, and that they shouldn't be processed individually.
  //
  // In the new mode (scanAllDirs = true), we always process subdirs.
  //
  if (anyProcessed && !cfgInfo.indexInfo.scanAllDirs)
    return;

  // Second pass: descend into the sub-directories.
  for (int n = 0; n < sortedNames.size(); n++)
  {
    File entry = new File(curDir, (String)sortedNames.get(n));
    if (entry.getAbsoluteFile().isDirectory())
      processDir(entry, subDirFilter, false);
  }
} // processDir()
////////////////////////////////////////////////////////////////////////////
/** Process file. <br><br>
 *
 * This method processes one <code>indexFile</code> element returned by the
 * docSelector, which may describe a source text XML file, PDF file,
 * etc. The element's attributes are interpreted as follows: <br><br>
 *
 * <code>fileName</code> (required): name of the file, relative to
 * <code>dir</code>; <br>
 * <code>preFilter</code>: one or more stylesheet paths separated by
 * semicolons or commas; <br>
 * <code>displayStyle</code>: path of a display stylesheet; <br>
 * <code>type</code>: XML, PDF, HTML, DOC/MSWord, Text or MARC (any case);
 * when absent the type is guessed from the file name's extension; <br>
 * <code>removeDoctypeDecl</code>: 'yes', 'true', 'no' or 'false'. <br><br>
 *
 * Any other attribute, an unreadable file, an unknown type, or an
 * extension that gives no hint of the type is reported through
 * <code>Trace</code> and causes the file to be rejected. <br><br>
 *
 * @param  dir       Normalized path of the directory containing the file;
 *                   the file name is appended to it directly. <br>
 * @param  parentEl  DOM element representing the current file to be
 *                   processed. This may be a source XML file, PDF file,
 *                   etc. <br><br>
 *
 * @return           true if the file was handed to the text processor,
 *                   false if it was rejected for one of the reasons
 *                   above. <br><br>
 *
 * @throws Exception  Any exceptions generated internally by the <code>File</code>
 *                    class or the {@link org.cdlib.xtf.textIndexer.XMLTextProcessor}
 *                    class. <br><br>
 *
 */
public boolean processFile(String dir, EasyNode parentEl)
  throws Exception
{
  // Gather all the info from the element's attributes.
  File      srcPath           = null;
  ArrayList preFilterList     = new ArrayList();
  Templates displayStyle      = null;
  String    fileName          = null;
  String    format            = null;
  boolean   removeDoctypeDecl = false;

  for (int i = 0; i < parentEl.nAttrs(); i++)
  {
    String attrName = parentEl.attrName(i);
    String attrVal = parentEl.attrValue(i);

    // Get the file name and check it.
    if (attrName.equalsIgnoreCase("fileName"))
    {
      fileName = attrVal; // for extension checking only
      srcPath = new File(Path.normalizeFileName(dir + attrVal));
      if (!srcPath.canRead()) {
        Trace.error("Error: cannot read input document '" + srcPath + "'");
        return false;
      }
    }

    // Is there an input filter(s) specified?
    else if (attrName.equalsIgnoreCase("preFilter"))
    {
      // Break up a list separated by semicolons or commas.
      StringTokenizer st = new StringTokenizer(attrVal, ";,");
      while (st.hasMoreTokens()) {
        String preFilterPath = Path.resolveRelOrAbs(cfgInfo.xtfHomePath,
                                                    st.nextToken());
        preFilterList.add(stylesheetCache.find(preFilterPath));
      } // while
    }

    // Is there a display stylesheet specified?
    else if (attrName.equalsIgnoreCase("displayStyle")) {
      String displayPath = Path.resolveRelOrAbs(cfgInfo.xtfHomePath, attrVal);
      displayStyle = stylesheetCache.find(displayPath);
    }

    // Is there a format specified?
    else if (attrName.equalsIgnoreCase("type"))
    {
      format = canonicalFormat(attrVal);
      if (format == null) {
        Trace.error("Error: docSelector returned unknown type: '" + attrVal +
                    "'");
        return false;
      }
    }

    // Is DOCTYPE declaration removal specified?
    else if (attrName.equalsIgnoreCase("removeDoctypeDecl"))
    {
      if (attrVal.equals("yes") || attrVal.equals("true"))
        removeDoctypeDecl = true;
      else if (attrVal.equals("no") || attrVal.equals("false"))
        removeDoctypeDecl = false;
      else {
        Trace.error(
          "Error: docSelector returned invalid value for " + attrName +
          " attribute: " +
          "expected 'true', 'yes', 'false', or 'no', but found '" + attrVal +
          "'");
        return false;
      }
    }

    // Other attributes are in error.
    else {
      Trace.error(
        "Error: docSelector returned unknown attribute: '" + attrName + "'");
      return false;
    }
  } // for

  // Make sure the filename was specified.
  if (srcPath == null) {
    Trace.error("Error: docSelector must return 'fileName' attribute");
    return false;
  }

  // If no format was specified, make a guess. (fileName is always set
  // whenever srcPath is.)
  //
  if (format == null)
  {
    format = formatFromExtension(fileName);
    if (format == null) {
      Trace.warning(
        "Warning: cannot deduce file type from extension on file '" +
        srcPath + "'");
      return false;
    }
  }

  // We need to refer to the file in a way that isn't dependent on the
  // particular location the index is at right now. So calculate a key
  // that just contains the index name and the part of the path after that
  // index's data directory.
  //
  String key = IndexUtil.calcDocKey(new File(cfgInfo.xtfHomePath),
                                    cfgInfo.indexInfo, srcPath);

  // Calculate a proper system ID for this file.
  // NOTE(review): File.toURL() does not escape characters that are illegal
  // in URLs (e.g. '#', '%', spaces). toURI().toURL() would, but it changes
  // the system ID string for such paths -- confirm downstream consumers
  // before switching.
  //
  String systemId = srcPath.toURL().toString();

  // Figure out where to put the lazy file (if we've been asked to build one)
  StructuredStore lazyStore = null;
  if (cfgInfo.buildLazyFiles)
  {
    // Figure out where to put the lazy tree file. We don't create
    // the directory just yet, since for non-XML files the store will
    // never be used.
    //
    File lazyFile = IndexUtil.calcLazyPath(new File(cfgInfo.xtfHomePath),
                                           cfgInfo.indexInfo,
                                           srcPath,
                                           false); // false: don't create yet

    // Use a file proxy so that we don't actually open the file handle
    // until (and if) the queued file is actually indexed.
    //
    lazyStore = new StructuredFileProxy(lazyFile);
  }

  // Convert the prefilter(s) to an array (left null when there are none).
  Templates[] preFilters = null;
  if (!preFilterList.isEmpty())
    preFilters = (Templates[])preFilterList.toArray(
      new Templates[preFilterList.size()]);

  // Now we have enough info to construct the source object. The format is
  // one of the canonical names at this point, so exact comparison suffices.
  //
  IndexSource srcFile;
  if (format.equals("XML")) {
    XMLIndexSource xmlSrc = new XMLIndexSource(new InputSource(systemId),
                                               srcPath,
                                               key,
                                               preFilters,
                                               displayStyle,
                                               lazyStore);
    if (removeDoctypeDecl)
      xmlSrc.removeDoctypeDecl(true);
    srcFile = xmlSrc;
  }
  else if (format.equals("PDF"))
    srcFile = new PDFIndexSource(srcPath, key, preFilters, displayStyle, null);
  else if (format.equals("HTML"))
    srcFile = new HTMLIndexSource(srcPath, key, preFilters, displayStyle, null);
  else if (format.equals("MSWord"))
    srcFile = new MSWordIndexSource(srcPath, key, preFilters, displayStyle, null);
  else if (format.equals("Text"))
    srcFile = new TextIndexSource(srcPath, key, preFilters, displayStyle, null);
  else if (format.equals("MARC"))
    srcFile = new MARCIndexSource(srcPath, key, preFilters, displayStyle);
  else
    throw new RuntimeException("Internal error: code missing support for type");

  // Now queue up the file.
  textProcessor.checkAndQueueText(srcFile);

  // Let the caller know we didn't skip the file.
  return true;
} // processFile()

/** Map a docSelector 'type' value (any case) onto the canonical format
 *  name used internally, or return null if the value isn't recognized.
 */
private static String canonicalFormat(String type)
{
  if (type.equalsIgnoreCase("XML"))
    return "XML";
  if (type.equalsIgnoreCase("PDF"))
    return "PDF";
  if (type.equalsIgnoreCase("HTML"))
    return "HTML";
  if (type.equalsIgnoreCase("DOC") || type.equalsIgnoreCase("MSWord"))
    return "MSWord";
  if (type.equalsIgnoreCase("Text"))
    return "Text";
  if (type.equalsIgnoreCase("MARC"))
    return "MARC";
  return null;
} // canonicalFormat()

/** Guess the canonical format name from a file name's extension, or
 *  return null if the extension gives no hint.
 */
private static String formatFromExtension(String fileName)
{
  String lcFileName = fileName.toLowerCase();
  if (lcFileName.endsWith(".xml"))
    return "XML";
  if (lcFileName.endsWith(".pdf"))
    return "PDF";
  if (lcFileName.endsWith(".htm") || lcFileName.endsWith(".html"))
    return "HTML";
  if (lcFileName.endsWith(".doc"))
    return "MSWord";
  if (lcFileName.endsWith(".txt"))
    return "Text";
  if (lcFileName.endsWith(".marc") || lcFileName.endsWith(".mrc"))
    return "MARC";
  return null;
} // formatFromExtension()
} // class SrcTreeProcessor