package org.cdlib.xtf.textEngine;
/**
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.transform.Result;
import javax.xml.transform.Templates;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.sax.SAXSource;
import javax.xml.transform.sax.SAXTransformerFactory;
import net.sf.saxon.Filter;
import org.cdlib.xtf.saxonExt.sql.SQLConnect;
import org.cdlib.xtf.textIndexer.CrimsonBugWorkaround;
import org.cdlib.xtf.textIndexer.IndexInfo;
import org.cdlib.xtf.textIndexer.IndexerConfig;
import org.cdlib.xtf.util.DocTypeDeclRemover;
import org.cdlib.xtf.util.Path;
import org.cdlib.xtf.util.XTFSaxonErrorListener;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
/**
* This class provides methods related to, but not always part of, a text
* index. For instance, there are methods to calculate document keys (as
* used in an index), or lazy file paths. It also maintains a publicly
* accessible cache of index info entries read from the index config file(s).
*
* @author Martin Haye
*/
public class IndexUtil
{
/** Cache of index configurations, consulted by the lookup methods below. */
private static ConfigCache configCache = new ConfigCache();
/** SAX parser factory; stays null until createSAXParser() first needs one. */
private static SAXParserFactory saxParserFactory = null;
/** Transformer factory; stays null until getTransformerFactory() first needs one. */
private static TransformerFactory transformerFactory = null;
/**
 * Looks up the configuration info for a named index in an index
 * configuration file. Lookups go through the shared config cache, so a
 * given index name is only loaded once.
 *
 * @param idxConfigFile  Index configuration file to read
 * @param idxName        Name of the index within that file
 *
 * @return               The index info carried by the cached config entry.
 * @throws Exception     If there is a problem reading the config file.
 */
public static IndexInfo getIndexInfo(File idxConfigFile, String idxName)
  throws Exception
{
  IndexerConfig cachedConfig = configCache.find(idxConfigFile, idxName);
  return cachedConfig.indexInfo;
} // getIndexInfo()
/**
 * Infers where the lazy version of a source document belongs, given the
 * name of an index in a config file and the path of the document's source
 * XML text. The lazy file is located within the index's directory.
 *
 * @param xtfHome        File at the root of the XTF directory tree
 * @param idxConfigFile  File to load index configuration from
 * @param idxName        Index name within the config
 * @param srcTextFile    Source text file of interest
 * @param createDir      true to create the directory for the lazy file
 *                       if it doesn't exist; false to never create the
 *                       directory.
 *
 * @return               Expected location of the lazy version of the
 *                       source file
 * @throws IOException   If the config lookup fails with an I/O error, or
 *                       the overloaded calcLazyPath() it delegates to fails.
 */
public static File calcLazyPath(File xtfHome, File idxConfigFile,
                                String idxName, File srcTextFile,
                                boolean createDir)
  throws IOException
{
  // Ask the cache for this index's configuration; it is only read from the
  // config file if it hasn't been loaded before.
  //
  IndexerConfig cfg;
  try {
    cfg = configCache.find(idxConfigFile, idxName);
  }
  catch (Exception err) {
    // I/O failures pass through untouched; everything else is wrapped.
    if (!(err instanceof IOException))
      throw new RuntimeException(err);
    throw (IOException)err;
  }

  // No info record, or no source path, means the index name wasn't found.
  IndexInfo info = cfg.indexInfo;
  boolean found = (info != null) && (info.sourcePath != null);
  if (!found) {
    throw new RuntimeException(
      "Index name '" + idxName + "' not found in index config file");
  }

  // The IndexInfo-based overload does the actual path computation.
  return calcLazyPath(xtfHome, info, srcTextFile, createDir);
} // public calcLazyPath()
/**
 * Given the configuration info of an index and the path to the source XML
 * text of a document, this method infers the correct path to the lazy
 * version of that source document. The lazy version will be somewhere
 * within the index's directory.
 *
 * The source file must live under the index's data directory. That is
 * normally idxInfo.sourcePath, but when idxInfo.cloneData is set and the
 * source file's path contains "/dataClone/", the index's
 * dataClone/&lt;indexName&gt;/ directory is used instead.
 *
 * @param xtfHome       File at the root of the XTF directory tree
 * @param idxInfo       Configuration info for the index in question.
 * @param srcTextFile   Source text file of interest
 * @param createDir     true to create the directory for the lazy file
 *                      if it doesn't exist; false to never create the
 *                      directory.
 *
 * @return              Expected location of the lazy version of the
 *                      source file
 * @throws IOException  If the source file is not inside the data directory,
 *                      or the lazy file's directory could not be created.
 */
public static File calcLazyPath(File xtfHome, IndexInfo idxInfo,
                                File srcTextFile, boolean createDir)
  throws IOException
{
  // Figure out the part of the source file's path that matches the index
  // data directory.
  //
  String sourcePath;
  if (idxInfo.cloneData && srcTextFile.toString().contains("/dataClone/"))
    sourcePath = Path.normalizePath(idxInfo.indexPath) + "dataClone/" + idxInfo.indexName + "/";
  else
    sourcePath = idxInfo.sourcePath;
  String fullSourcePath = Path.resolveRelOrAbs(xtfHome.toString(), sourcePath);
  String prefix = Path.calcPrefix(srcTextFile.getParent(), fullSourcePath);
  if (prefix == null) {
    // Report the directory that was actually checked, which is the clone
    // directory (not idxInfo.sourcePath) when the clone branch was taken.
    throw new IOException(
      "XML source file " + srcTextFile + " is not contained within " +
      sourcePath);
  }

  // Form the result by adding the non-overlapping part to the 'lazy'
  // directory within the index directory.
  //
  String srcTextPath = Path.normalizeFileName(srcTextFile.toString());
  String after = srcTextPath.substring(prefix.length());
  String lazyPath = idxInfo.indexPath + "lazy/" + idxInfo.indexName + "/" +
                    after + ".lazy";
  lazyPath = Path.resolveRelOrAbs(xtfHome.toString(), lazyPath);
  File lazyFile = new File(lazyPath);

  // If we've been asked to create the directory, do it now.
  if (createDir) {
    if (!Path.createPath(lazyFile.getParentFile().toString()))
      throw new IOException("Error creating lazy file path");
  }

  // And we're done.
  return lazyFile;
} // public calcLazyPath()
/**
 * Given an index within a config file and the path to the source XML text
 * of a document, this method infers the correct document key that should be
 * stored in the index.
 *
 * @param xtfHome        The XTF_HOME directory
 * @param idxConfigFile  File to load index configuration from
 * @param idxName        Index name within the config
 * @param srcTextFile    Source text file of interest
 *
 * @return               Document key to store or look for in the index
 * @throws IOException   If the config lookup fails with an I/O error, or
 *                       the source file is not inside the index's data
 *                       directory.
 * @throws RuntimeException If the index name is not found in the config
 *                       file, or the config lookup fails for a reason other
 *                       than I/O.
 */
public static String calcDocKey(File xtfHome, File idxConfigFile,
                                String idxName, File srcTextFile)
  throws IOException
{
  // First, load the particular index info from the config file (though if
  // we've already loaded it, the cache will just return it.)
  //
  IndexerConfig config;
  try {
    config = configCache.find(idxConfigFile, idxName);
  }
  catch (Exception e) {
    if (e instanceof IOException)
      throw (IOException)e;
    throw new RuntimeException(e);
  }

  // If we couldn't find the index name, say so clearly instead of failing
  // obscurely further down. This is the same test calcLazyPath() applies.
  if (config.indexInfo == null || config.indexInfo.sourcePath == null)
    throw new RuntimeException(
      "Index name '" + idxName + "' not found in index config file");

  // Use the other form of calcDocKey() to do the rest of the work.
  return calcDocKey(xtfHome, config.indexInfo, srcTextFile);
} // calcDocKey()
/**
 * Builds the document key under which a source document is stored in an
 * index. The key is the index name, a colon, and the part of the source
 * file's path that lies below the index's data directory.
 *
 * The data directory is normally idxInfo.sourcePath; when idxInfo.cloneData
 * is set and the source path contains "/dataClone/", the index's
 * dataClone/&lt;indexName&gt;/ directory is used instead.
 *
 * @param xtfHomeFile   The XTF_HOME directory
 * @param idxInfo       Configuration info for the index in question.
 * @param srcTextFile   Source text file of interest
 *
 * @return              Document key to store or look for in the index
 * @throws IOException  If the source file is not inside the data directory.
 */
public static String calcDocKey(File xtfHomeFile, IndexInfo idxInfo,
                                File srcTextFile)
  throws IOException
{
  // Decide which directory the source file is expected to live under.
  boolean inClone = idxInfo.cloneData &&
                    srcTextFile.toString().contains("/dataClone/");
  String dataRoot = inClone
    ? Path.normalizePath(idxInfo.indexPath) + "dataClone/" + idxInfo.indexName + "/"
    : idxInfo.sourcePath;

  // Find the portion of the file's directory that overlaps that root.
  String resolvedRoot = Path.resolveRelOrAbs(xtfHomeFile, dataRoot);
  String overlap = Path.calcPrefix(srcTextFile.getParent(), resolvedRoot);
  if (overlap == null) {
    throw new IOException(
      "XML source file " + srcTextFile + " is not contained within " +
      dataRoot);
  }

  // The key is the index name plus whatever follows the overlapping part.
  String normalized = Path.normalizeFileName(srcTextFile.toString());
  return idxInfo.indexName + ":" + normalized.substring(overlap.length());
} // calcDocKey()
/**
 * Create a SAX parser using the best implementation we can find. We prefer
 * the new parser supplied by Java 1.5. Failing that, we try for the Crimson
 * parser, and if that's not found, we try the default.
 *
 * A preferred implementation that is present but cannot be instantiated
 * (for instance because its package is not accessible to this code) is
 * treated the same as one that is missing: the next choice is tried.
 *
 * The returned parser's reader is namespace-aware, does not report
 * namespace prefixes as attributes, and (where the parser supports the
 * feature) does not load external DTDs.
 *
 * @return  A newly created, configured SAX parser
 * @throws RuntimeException  If the factory cannot create or configure a
 *                           parser.
 */
public static SAXParser createSAXParser()
{
  // If we don't have a factory yet, make one. The class lock keeps two
  // threads from racing to create (and then lock on) different factories.
  //
  SAXParserFactory factory;
  synchronized (IndexUtil.class)
  {
    if (saxParserFactory == null)
    {
      // Our first choice is the new parser supplied by Java 1.5.
      SAXParserFactory chosen = instantiateParserFactory(
        "com.sun.org.apache.xerces.internal.jaxp.SAXParserFactoryImpl");

      // Second choice is the older (but reliable) Crimson parser.
      if (chosen == null)
        chosen = instantiateParserFactory(
          "org.apache.crimson.jaxp.SAXParserFactoryImpl");

      // Okay, accept whatever the default is.
      if (chosen == null)
        chosen = SAXParserFactory.newInstance();

      saxParserFactory = chosen;
    }
    factory = saxParserFactory;
  }

  // Use the parser factory to make a new parser.
  synchronized (factory)
  {
    try {
      SAXParser xmlParser = factory.newSAXParser();
      XMLReader xmlReader = xmlParser.getXMLReader();
      xmlReader.setFeature("http://xml.org/sax/features/namespaces", true);
      xmlReader.setFeature("http://xml.org/sax/features/namespace-prefixes",
                           false);

      // For speed, and to make indexing utterly reliable, don't load external
      // DTDs. If this fails, we ignore it (at least we tried.)
      //
      try {
        xmlReader.setFeature(
          "http://apache.org/xml/features/nonvalidating/load-external-dtd",
          false);
      }
      catch (SAXException ignored) {
        // Deliberately best-effort: not every parser knows this feature.
      }

      // All done
      return xmlParser;
    }
    catch (SAXException e) {
      throw new RuntimeException(e);
    }
    catch (ParserConfigurationException e) {
      throw new RuntimeException(e);
    }
  }
} // createSaxParser()

/**
 * Tries to instantiate a specific SAXParserFactory implementation by name.
 *
 * @param className  Fully qualified name of the factory class to try
 * @return           A new factory, or null if the class is missing or
 *                   cannot be instantiated from here.
 */
private static SAXParserFactory instantiateParserFactory(String className)
{
  try {
    return (SAXParserFactory)Class.forName(className).newInstance();
  }
  catch (ClassNotFoundException e) {
    return null;
  }
  catch (InstantiationException e) {
    return null;
  }
  catch (IllegalAccessException e) {
    // E.g. a JDK-internal class that is not open to this code.
    return null;
  }
} // instantiateParserFactory()
/**
 * Create an XML reader backed by a parser from createSAXParser(), which
 * picks the best parser implementation it can find.
 *
 * @return  The XML reader of a newly created SAX parser
 * @throws RuntimeException  If the parser cannot supply its reader.
 */
public static XMLReader createXMLReader()
{
  XMLReader reader;
  try {
    reader = createSAXParser().getXMLReader();
  }
  catch (SAXException err) {
    throw new RuntimeException(err);
  }
  return reader;
} // createXMLReader()
/**
 * Get the shared TransformerFactory, creating a Saxon factory the first
 * time one is requested.
 *
 * @return  The shared transformer factory
 */
private static TransformerFactory getTransformerFactory()
{
  // Reuse the factory if one has already been made.
  if (transformerFactory != null)
    return transformerFactory;

  transformerFactory = new net.sf.saxon.TransformerFactoryImpl();
  return transformerFactory;
} // getTransformerFactory()
/**
 * Create a new transformer from the shared Saxon transformer factory.
 *
 * @return  A newly created transformer
 * @throws RuntimeException  If the factory cannot configure a transformer.
 */
public static Transformer createTransformer()
{
  try {
    TransformerFactory factory = getTransformerFactory();
    Transformer created = factory.newTransformer();
    return created;
  }
  catch (TransformerConfigurationException err) {
    throw new RuntimeException(err);
  }
} // createTransformer()
/**
 * Wraps an XML document stream in the standard set of filters: optionally
 * one that removes document type declarations, and optionally one that
 * works around a bug in the Apache Crimson parser.
 *
 * @param inStream                Document stream to filter
 * @param applyCrimsonWorkaround  true to apply the workaround for the
 *                                8193-byte bug in the Crimson XML parser.
 * @param removeDoctypeDecl       true to remove DOCTYPE declaration; false to
 *                                leave them alone.
 *
 * @return                        Filtered input stream (the original stream
 *                                itself if neither filter was requested)
 */
public static InputStream filterXMLDocument(InputStream inStream,
                                            boolean applyCrimsonWorkaround,
                                            boolean removeDoctypeDecl)
{
  InputStream filtered = inStream;

  // DOCTYPE declarations go first: the XML reader will barf if it can't
  // resolve the entity reference, and we really don't care about them.
  //
  if (removeDoctypeDecl)
    filtered = new DocTypeDeclRemover(filtered);

  // Next, the Crimson workaround. That parser crashes if it finds a ']'
  // character at the end of its 8193-byte buffer preceded by a '>'
  // character; the filter inserts a space in such cases.
  //
  if (applyCrimsonWorkaround)
    filtered = new CrimsonBugWorkaround(filtered);

  return filtered;
}
/**
 * Wraps an XML document stream in the standard set of filters, deciding
 * from the parser's class whether the Crimson workaround is needed. The
 * workaround is applied only when the parser's class name is exactly
 * "org.apache.crimson.jaxp.SAXParserImpl".
 *
 * @param inStream           Document stream to filter
 * @param saxParser          Parser that will be used to parse the document;
 *                           used to determine whether or not to apply the
 *                           Crimson parser workaround.
 * @param removeDoctypeDecl  true to remove DOCTYPE declaration; false to
 *                           leave them alone.
 *
 * @return                   Filtered input stream
 */
public static InputStream filterXMLDocument(InputStream inStream,
                                            SAXParser saxParser,
                                            boolean removeDoctypeDecl)
{
  String parserClassName = saxParser.getClass().getName();
  boolean isCrimson =
    "org.apache.crimson.jaxp.SAXParserImpl".equals(parserClassName);
  return filterXMLDocument(inStream, isCrimson, removeDoctypeDecl);
}
/**
 * Apply one or more prefilter stylesheets to an XML input source. Pass the
 * filtered data to the specified Result.
 *
 * The stylesheets are chained in array order: the first one reads from the
 * supplied reader, and each later one reads the output of the one before.
 * Any SQL connections opened on this thread during the transformation are
 * closed before returning, whether or not the transformation succeeded.
 *
 * @param prefilterStylesheets    Stylesheets to process (at least one)
 * @param reader                  Reader to use for parsing the input XML
 * @param xmlSource               Source of XML data
 * @param ultimateResult          Where to send the output
 *
 * @throws SAXException           If setting up or running the chain fails
 *                                with a SAX error
 * @throws TransformerException   If a filter cannot be created or the
 *                                transformation fails
 */
public static void applyPreFilters(Templates[] prefilterStylesheets,
                                   XMLReader reader, InputSource xmlSource,
                                   Result ultimateResult)
  throws SAXException, TransformerException, TransformerConfigurationException
{
  assert prefilterStylesheets.length > 0 : "applyPrefilters must have at least one stylesheet";

  XMLReader lastInChain = reader;
  SAXTransformerFactory stf = (SAXTransformerFactory)getTransformerFactory();

  // Process each prefilter.
  for (int i = 0; i < prefilterStylesheets.length; i++)
  {
    // Create an XMLFilter from the stylesheet
    Filter filter = (Filter)stf.newXMLFilter(prefilterStylesheets[i]);
    Transformer trans = filter.getTransformer();

    // Make sure errors get directed to the right place.
    if (!(trans.getErrorListener() instanceof XTFSaxonErrorListener))
      trans.setErrorListener(new XTFSaxonErrorListener());

    // Hook up its input.
    filter.setParent(lastInChain);

    // Onward.
    lastInChain = filter;
  } // for i

  // Set up the transformer to process the SAX events generated
  // by the last filter in the chain.
  //
  Transformer transformer = stf.newTransformer();
  SAXSource transformSource = new SAXSource(lastInChain, xmlSource);
  try {
    transformer.transform(transformSource, ultimateResult);
  }
  finally {
    // If any SQL connections were opened during the transformation, close
    // them now. Doing this in a finally block keeps a failed transformation
    // from leaving them open.
    //
    SQLConnect.closeThreadConnections();
  }
} // applyPreFilter()
} // class IndexUtil