package org.cdlib.xtf.dynaXML; /* * Copyright (c) 2004, Regents of the University of California * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the University of California nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import javax.xml.parsers.SAXParser; import javax.xml.transform.Templates; import javax.xml.transform.sax.SAXResult; import net.sf.saxon.Configuration; import net.sf.saxon.event.Receiver; import net.sf.saxon.event.ReceivingContentHandler; import org.cdlib.xtf.lazyTree.LazyTreeBuilder; import org.cdlib.xtf.servletBase.TextConfig; import org.cdlib.xtf.servletBase.TextServlet; import org.cdlib.xtf.textEngine.IndexUtil; import org.cdlib.xtf.util.DocTypeDeclRemover; import org.cdlib.xtf.util.Path; import org.cdlib.xtf.util.StructuredFile; import org.cdlib.xtf.util.StructuredStore; import org.cdlib.xtf.util.SubStoreReader; import org.cdlib.xtf.util.SubStoreWriter; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; /* * This file created on Mar 11, 2005 by Martin Haye */ /** * Provides local filesystem-based access to lazy and non-lazy versions of * a source XML document. * * @author Martin Haye */ public class DefaultDocLocator implements DocLocator { /** Servlet we are part of */ private TextServlet servlet; /** Attach to a servlet */ public void setServlet(TextServlet servlet) { this.servlet = servlet; } /** * Search for a StructuredStore containing the "lazy" or persistent * representation of a given document. Index parameters are specified, * since often the lazy file is stored along with the index. This method * is called first, and if it returns null, then * {@link #getInputSource(String, boolean)} will be called as a fall-back. * * @param indexConfigPath Path to the index configuration file * @param indexName Name of the index being searched * @param sourcePath Path to the source document * @param preFilter Prefilter stylesheet to run (or null for none) * @param removeDoctypeDecl Set to true to remove DOCTYPE declaration from * the XML document. * * @return Store containing the tree, or null if none * could be found. */ public StructuredStore getLazyStore(String indexConfigPath, String indexName, String sourcePath, Templates preFilter, boolean removeDoctypeDecl) throws IOException { // If we're not allowed to use lazy files, then don't. TextConfig config = servlet.getConfig(); if (config instanceof DynaXMLConfig && !((DynaXMLConfig)config).useLazyFiles) return null; // If no 'index' specified in the docInfo, then there's no way we can // find the lazy file. // if (indexConfigPath == null || indexName == null) return null; // If the source isn't a local file, we also can't use a lazy file. if (sourcePath.startsWith("http:")) return null; if (sourcePath.startsWith("https:")) return null; // If it's a directory, something went wrong. No lazy file for sure. File sourceFile = new File(sourcePath); if (!sourceFile.isFile()) return null; // Figure out where the lazy file is (or should be.) File lazyFile = calcLazyPath(new File(servlet.getRealPath("")), new File(indexConfigPath), indexName, new File(sourcePath), false); // Get the config flag telling us whether we're allowed to build lazy // files outside of indexing. // boolean buildLazyFilesAlone = false; if (config instanceof DynaXMLConfig) buildLazyFilesAlone = ((DynaXMLConfig)config).buildLazyFilesAlone; // If the lazy file is out of date (and we created it), rebuild it. Note // that it's not safe to rebuild lazy files created by the indexer, since // it would cause hit highlighting to fail due to a mismatch between // node numbers stored in the index vs. stored in the lazy file. // if (buildLazyFilesAlone && lazyFile.canRead() && sourceFile.lastModified() > lazyFile.lastModified() && isPostIndexLazyFile(lazyFile)) { lazyFile.delete(); } // If we can't read it for any reason (including because we just deleted an // out-of-date file), try to build it instead... // if (!lazyFile.canRead()) { // ... unless we've been asked not to build lazy files alone. // This is the case by default, but people who want to use dynaXML // without textIndexer will allow dynaXML to build lazy files by // itself. // if (!buildLazyFilesAlone) return null; // Decide whether we need to strip whitespace boolean stripWhitespace = false; try { stripWhitespace = IndexUtil.getIndexInfo(new File(indexConfigPath), indexName).stripWhitespace; } catch (Exception e) { } // Build the lazy file. buildLazyStore(lazyFile, sourcePath, preFilter, removeDoctypeDecl, stripWhitespace); } // Cool. Open the lazy file. return StructuredFile.open(lazyFile); } // getLazyStore() /** * Wrapper for IndexUtil.calcLazyPath(); useful for derived classes to supply their * own implementation. */ public File calcLazyPath( File xtfHome, File idxConfigFile, String idxName, File srcTextFile, boolean createDir) throws IOException { return IndexUtil.calcLazyPath(xtfHome, idxConfigFile, idxName, srcTextFile, createDir); } /** * Retrieve the data stream for an XML source document. * * @param sourcePath Path to the source document * @param removeDoctypeDecl Set to true to remove DOCTYPE declaration from * the XML document. * * @return Data stream for the document. */ public InputSource getInputSource(String sourcePath, boolean removeDoctypeDecl) throws IOException { // If it's non-local, load the URL. if (sourcePath.startsWith("http:") || sourcePath.startsWith("https:")) { return new InputSource(sourcePath); } // Okay, assume it's a local file. InputStream inStream = new FileInputStream(sourcePath); // Remove DOCTYPE declarations, since the XML reader will barf // if it can't resolve the entity reference, and we really // don't care one way or the other. // if (removeDoctypeDecl) inStream = new DocTypeDeclRemover(inStream); // Make the input source, and give it a real system ID. InputSource inSrc = new InputSource(inStream); inSrc.setSystemId(new File(sourcePath).toURL().toString()); // All done! return inSrc; } // getInputSource() /** * Create a lazy document by loading the original, building the lazy * tree, and writing it out. * * @param lazyFile Lazy file to create * @param sourcePath Path to the source document * @param preFilter A prefilter stylesheet (or null for no pre-filtering.) * @param removeDoctypeDecl true to remove DOCTYPE declarations from the * XML document * @param stripWhitespace If set, whitespace will be removed between elements * in the lazy file. */ private void buildLazyStore(File lazyFile, String sourcePath, Templates preFilter, boolean removeDoctypeDecl, boolean stripWhitespace) throws IOException { // The directory the lazy file is to be stored in might not exist yet. // If not, we need to create it now before making the lazy file. // Path.createPath(lazyFile.getParent()); // Build a temp file, and when it's finished, rename it. File tmpFile = new File(lazyFile.getAbsolutePath() + ".tmp"); // While we parse the source document, we're going to also build up // a tree that will be written to the lazy file. // Configuration config = new Configuration(); LazyTreeBuilder lazyBuilder = new LazyTreeBuilder(config); StructuredStore lazyStore = StructuredFile.create(tmpFile); // Put a special marker subfile within the store so we know it was created // outside of the indexing process. That way, we can identify files that // are okay to update when the timestamp of the original changes. // SubStoreWriter sub = lazyStore.createSubStore("isPostIndexLazyFile"); sub.writeByte(1); sub.close(); // Start the build process. Receiver lazyReceiver = lazyBuilder.begin(lazyStore); try { ReceivingContentHandler lazyHandler = new ReceivingContentHandler(); lazyHandler.setReceiver(lazyReceiver); lazyHandler.setPipelineConfiguration(lazyReceiver.getPipelineConfiguration()); // Instantiate a new XML parser, being sure to get the right one. SAXParser xmlParser = IndexUtil.createSAXParser(); // Open the source file for reading InputStream inStream = new FileInputStream(sourcePath); // Apply the standard set of document filters. InputSource inSrc = new InputSource(IndexUtil.filterXMLDocument( inStream, xmlParser, removeDoctypeDecl)); // Put a proper system ID onto the InputSource. inSrc.setSystemId(new File(sourcePath).toURL().toString()); // Make a DefaultHandler that will pass events to the lazy receiver. LazyPassthru passthru = new LazyPassthru(lazyHandler, stripWhitespace); // Apply a prefilter if one was specified. if (preFilter == null) { try { xmlParser.parse(inSrc, passthru); } catch (Exception e) { throw new RuntimeException(e); } } else { // Apply the pre-filter. try { Templates[] array = new Templates[1]; array[0] = preFilter; IndexUtil.applyPreFilters(array, xmlParser.getXMLReader(), inSrc, new SAXResult(passthru)); } catch (Exception e) { lazyBuilder.abort(lazyReceiver); throw new RuntimeException(e); } } // Finish off the lazy file. lazyBuilder.finish(lazyReceiver, true); // And rename the temp file. tmpFile.renameTo(lazyFile); } catch (IOException e) { lazyBuilder.abort(lazyReceiver); throw e; } } // buildLazyStore() /** * Check if the given lazy file was created after the indexing process * (i.e. by this doc locator) */ private boolean isPostIndexLazyFile(File f) { StructuredStore store = null; SubStoreReader sub = null; boolean ret = false; try { store = StructuredFile.open(f); sub = store.openSubStore("isPostIndexLazyFile"); if (sub.readByte() == 1) ret = true; } catch (IOException e) { } finally { try { if (sub != null) sub.close(); if (store != null) store.close(); } catch (IOException e) { } } return ret; } /** * Passes SAX events to a ContentHandler. Also performs character * buffering that mimics what the textIndexer normally does. */ private static class LazyPassthru extends DefaultHandler { private StringBuffer charBuf = new StringBuffer(); private ContentHandler lazyHandler; private boolean stripWhitespace; public LazyPassthru(ContentHandler lazyHandler, boolean stripWhitespace) { this.lazyHandler = lazyHandler; this.stripWhitespace = stripWhitespace; } public void startDocument() throws SAXException { lazyHandler.startDocument(); } public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { flushCharacters(); lazyHandler.startElement(uri, localName, qName, atts); } public void endElement(String uri, String localName, String qName) throws SAXException { flushCharacters(); lazyHandler.endElement(uri, localName, qName); } public void processingInstruction(String target, String data) throws SAXException { lazyHandler.processingInstruction(target, data); } public void endDocument() throws SAXException { lazyHandler.endDocument(); } public void characters(char[] ch, int start, int length) { charBuf.append(ch, start, length); } private void flushCharacters() throws SAXException { // If the entire buffer is whitespace (or empty), we can safely // strip it. // int i = 0; if (stripWhitespace) { for (i = 0; i < charBuf.length(); i++) if (!Character.isWhitespace(charBuf.charAt(i))) break; } if (i < charBuf.length()) lazyHandler.characters(charBuf.toString().toCharArray(), 0, charBuf.length()); charBuf.setLength(0); } } ; } // class DefaultDocLocator