package org.cdlib.xtf.lazyTree;
/**
* Copyright (c) 2004, Regents of the University of California
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the University of California nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Arrays;
import net.sf.saxon.Configuration;
import net.sf.saxon.event.PipelineConfiguration;
import net.sf.saxon.event.Receiver;
import net.sf.saxon.om.NamePool;
import net.sf.saxon.om.NodeInfo;
import net.sf.saxon.tinytree.HackedTinyBuilder;
import net.sf.saxon.tinytree.TinyNodeImpl;
import net.sf.saxon.tinytree.TinyTree;
import net.sf.saxon.type.Type;
import org.cdlib.xtf.util.ConsecutiveMap;
import org.cdlib.xtf.util.PackedByteBuf;
import org.cdlib.xtf.util.StructuredStore;
import org.cdlib.xtf.util.SubStoreWriter;
import org.cdlib.xtf.util.XTFSaxonErrorListener;
/**
* <p>Creates and/or loads a disk-based representation of an XML tree. Once
* created, the persistent version can be quickly and incrementally loaded
* into memory.</p>
*
* <p>To build a tree, call the {@link #begin(StructuredStore)} method to start the
* process. Using the Receiver it returns, pass all the SAX events gathered
* from parsing the document. Finally, {@link #finish(Receiver, boolean)} will
* complete the process.</p>
*
* <p>To load a tree that was built previously, use either load method:
* {@link #load(StructuredStore)} or {@link #load(StructuredStore, LazyDocument)}.</p>
*
* @author Martin Haye
*/
public class LazyTreeBuilder
{
  /** The Saxon 'tiny' document, used to load the input tree */
  private TinyTree tree;

  /** Name pool used to map namecodes */
  private NamePool namePool;

  /**
   * Mapping of names found to our internal name numbers. Created afresh by
   * {@link #finish(Receiver, boolean)} for each build and released when that
   * build is done, so one builder instance can be used for several trees.
   */
  private ConsecutiveMap names;

  /** Saxon configuration used for tree loading */
  private final Configuration config;

  /** Pipeline configuration */
  private final PipelineConfiguration pipe;

  /** File version stored in the persistent file. */
  public static final String CURRENT_VERSION = "2.0";

  /** Minimum version we can read. */
  public static final String REQUIRED_VERSION = "2.0";

  /**
   * Constructs a builder around the given Saxon configuration. Note that
   * this installs an {@link XTFSaxonErrorListener} on the configuration that
   * was passed in, and shares that listener with the pipeline configuration.
   *
   * @param config Saxon configuration to use for building and loading
   */
  public LazyTreeBuilder(Configuration config) {
    this.config = config;
    config.setErrorListener(new XTFSaxonErrorListener());
    pipe = new PipelineConfiguration();
    pipe.setConfiguration(config);
    pipe.setErrorListener(config.getErrorListener());
  }

  /**
   * Establishes the name pool used to resolve namecodes.
   *
   * @param pool The name pool to use
   */
  public void setNamePool(NamePool pool) {
    namePool = pool;
  }

  /**
   * Load a persistent document using the default loader.
   *
   * @param treeStore The store to load from
   *
   * @return The root node of the document (which implements DocumentInfo)
   * @throws IOException If the store is too old or cannot be read
   */
  public NodeInfo load(StructuredStore treeStore)
    throws FileNotFoundException, IOException
  {
    LazyDocument targetDoc = new LazyDocument(config);
    load(treeStore, targetDoc);
    return targetDoc;
  } // load()

  /**
   * Load a pre-existing persistent tree and load it into an empty in-memory
   * document.
   *
   * @param treeStore The store to load from
   * @param emptyDoc  An empty document object to initialize
   * @throws IOException If the store's version is missing or older than
   *                     {@link #REQUIRED_VERSION}, or the store can't be read
   */
  public void load(StructuredStore treeStore, LazyDocument emptyDoc)
    throws FileNotFoundException, IOException
  {
    // Don't use it if the version number is old. A store that carries no
    // version at all is treated the same way rather than failing with a
    // NullPointerException.
    // NOTE(review): this is a lexicographic string comparison, which is fine
    // for single-digit major versions such as "2.0" but would mis-order
    // something like "10.0" -- revisit if the version scheme grows.
    String fileVer = treeStore.getUserVersion();
    if (fileVer == null || fileVer.compareTo(REQUIRED_VERSION) < 0) {
      throw new IOException("Cannot use old version of LazyTree file... " +
                            "consider re-indexing with '-clean'.");
    }

    // Now init the document (which loads the root node.)
    // NOTE(review): unlike begin(), no default name pool is substituted here
    // when setNamePool() was never called -- confirm that is intended.
    emptyDoc.init(namePool, treeStore);
    emptyDoc.setSystemId(treeStore.getSystemId());
  }

  /**
   * Alternate way of constructing a lazy tree. First, begin() is called,
   * returning a Receiver that should receive all the SAX events from the
   * input. When all events have been sent, then call
   * {@link #finish(Receiver, boolean)}.
   *
   * @param treeStore The store that will receive the persistent tree
   * @return Receiver to which the document's events should be sent
   * @throws IOException If the store cannot be written
   */
  public Receiver begin(StructuredStore treeStore)
    throws IOException
  {
    // A great way to read the tree in just the form we need it is to
    // use Saxon's "TinyTree" implementation. Unfortunately, all of its
    // members are private, so we actually use a hacked version whose only
    // difference is that they're made public, and that text is accumulated
    // straight to the disk file, rather than to a memory buffer.
    //
    HackedTinyBuilder builder = new HackedTinyBuilder();
    if (namePool == null) {
      namePool = config.getNamePool();
    }
    builder.setPipelineConfiguration(pipe);

    // We're going to make a structured file to contain the entire tree.
    // To save memory, we'll write the character data directly to it
    // rather than buffer it up.
    //
    builder.setTreeStore(treeStore);
    treeStore.setUserVersion(CURRENT_VERSION);
    SubStoreWriter textFile = treeStore.createSubStore("text");
    builder.setTextStore(textFile);

    // Done for now.
    return builder;
  } // begin()

  /**
   * Retrieves the current node number in the build. Indexer uses this to
   * record node numbers in text chunks.
   *
   * @param inBuilder The builder gotten from begin()
   * @return The current node number, or zero if the builder has not
   *         produced a tree yet.
   */
  public int getNodeNum(Receiver inBuilder) {
    HackedTinyBuilder builder = (HackedTinyBuilder)inBuilder;
    tree = builder.getTree();

    // finish() treats a null tree as a legitimate state, so guard here too.
    if (tree == null) {
      return 0;
    }
    return countNodesBeforeStopper();
  } // getNodeNum()

  /**
   * Completes writing out a disk-based file. Assumes that the receiver
   * (which must come from begin()) has been sent all the SAX events for
   * the input document. If the builder has no tree (the build failed), the
   * store is deleted instead.
   *
   * @param inBuilder  The builder gotten from begin()
   * @param closeStore true to close the tree store once everything has been
   *                   written
   * @throws IOException If writing to the store fails
   */
  public void finish(Receiver inBuilder, boolean closeStore)
    throws IOException
  {
    HackedTinyBuilder builder = (HackedTinyBuilder)inBuilder;
    StructuredStore treeStore = builder.getTreeStore();
    tree = builder.getTree();

    // Done with the text file now. (It may already be gone, e.g. if this
    // builder was finished or aborted before.)
    if (builder.getTextStore() != null) {
      builder.getTextStore().close();
    }
    builder.setTextStore(null);

    // If the build failed, delete the file.
    if (tree == null) {
      treeStore.delete();
      return;
    }

    try {
      // Make sure we support all the features used in the document.
      checkSupport();

      // Start each build with an empty name table so nothing leaks over from
      // a previous tree written by this same instance.
      names = new ConsecutiveMap();

      // Now make a structured file containing the entire tree's contents.
      writeNames(treeStore.createSubStore("names"));
      writeAttrs(treeStore.createSubStore("attributes")); // must be before nodes
      writeNodes(treeStore.createSubStore("nodes"));

      // Close the store if requested.
      if (closeStore) {
        treeStore.close();
      }
    }
    finally {
      // All done (or failed): either way, let go of the per-build state.
      tree = null;
      names = null;
    }
  }

  /**
   * Like finish() above, but aborts the tree building process and removes the
   * file. Should be called if normal processing cannot complete.
   *
   * @param inBuilder The builder gotten from begin()
   */
  public void abort(Receiver inBuilder) {
    HackedTinyBuilder builder = (HackedTinyBuilder)inBuilder;
    try {
      try {
        if (builder.getTextStore() != null) {
          builder.getTextStore().close();
        }
      }
      finally {
        // Attempt the delete even if closing the text store failed, so a
        // partially written file isn't left behind.
        StructuredStore treeStore = builder.getTreeStore();
        if (treeStore != null) {
          treeStore.delete();
        }
      }
    }
    catch (IOException ignored) {
      // We're aborting, which means some more important exception came first.
      // So ignore problems during the abort itself.
    }
  }

  /**
   * Counts the nodes in the current tree, leaving out any stopper node(s)
   * at the end of the tiny tree.
   *
   * @return Number of nodes preceding the trailing stopper(s)
   */
  private int countNodesBeforeStopper() {
    int nNodes = tree.getNumberOfNodes();
    while (nNodes > 0 && tree.getNodeKind(nNodes - 1) == Type.STOPPER) {
      nNodes--;
    }
    return nNodes;
  }

  /**
   * Build and write out the table of names referenced by the tree. Also
   * includes the namespaces. As a side effect, fills in the name map that
   * the node and attribute writers use to translate name codes.
   *
   * @param out SubStore to write to (closed before returning).
   */
  private void writeNames(SubStoreWriter out)
    throws IOException
  {
    PackedByteBuf buf = new PackedByteBuf(1000);

    // Make sure the right name pool was used.
    assert tree.getConfiguration().getNamePool() == namePool;

    // Write out all the namespaces.
    int nNamespaces = tree.getNumberOfNamespaces();
    buf.writeInt(nNamespaces);
    int[] namespaceCodes = tree.getNamespaceCodeArray();
    int[] namespaceParents = tree.getNamespaceParentArray();
    for (int i = 0; i < nNamespaces; i++) {
      int code = namespaceCodes[i];
      buf.writeString(namePool.getPrefixFromNamespaceCode(code));
      buf.writeString(namePool.getURIFromNamespaceCode(code));
      buf.writeInt(namespaceParents[i]);
    }

    // Add all the namecodes from elements and attributes. Negative codes
    // mark unnamed entries and are skipped.
    int[] nameCodes = tree.getNameCodeArray();
    int nNodes = tree.getNumberOfNodes();
    for (int i = 0; i < nNodes; i++) {
      if (nameCodes[i] >= 0) {
        names.put(Integer.valueOf(nameCodes[i]));
      }
    }
    int[] attCodes = tree.getAttributeNameCodeArray();
    int nAttrs = tree.getNumberOfAttributes();
    for (int i = 0; i < nAttrs; i++) {
      if (attCodes[i] >= 0) {
        names.put(Integer.valueOf(attCodes[i]));
      }
    }

    // Write out all the namecodes.
    Object[] nameArray = names.getArray();
    buf.writeInt(nameArray.length);
    for (int i = 0; i < nameArray.length; i++) {
      int code = ((Integer)nameArray[i]).intValue();
      buf.writeString(namePool.getPrefix(code));
      buf.writeString(namePool.getURI(code));
      buf.writeString(namePool.getLocalName(code));
    }

    // All done.
    buf.output(out);
    out.close();
  } // writeNames()

  /**
   * Build and write out all the nodes in the tree. The resulting table has
   * fixed-sized entries, sized to fit the largest node.
   *
   * @param out SubStore to write to (closed before returning).
   */
  private void writeNodes(SubStoreWriter out)
    throws IOException
  {
    // Write the root node's number
    out.writeInt(0);

    // Figure out how many nodes there are, excluding the stopper at the end
    // of the tree.
    //
    int totalNodes = tree.getNumberOfNodes();
    int nNodes = countNodesBeforeStopper();

    // Get pointers to handy arrays we'll need
    int[] nameCodes = tree.getNameCodeArray();
    int[] nexts = tree.getNextPointerArray();
    int[] alphas = tree.getAlphaArray();
    int[] betas = tree.getBetaArray();

    // TinyTree won't provide us direct access to the 'prior' array... so
    // let's build it ourselves. A 'next' pointer is only treated as a
    // sibling link when it points forward.
    int[] prior = new int[totalNodes];
    Arrays.fill(prior, -1);
    for (int i = 0; i < totalNodes; i++) {
      int nextNode = nexts[i];
      if (nextNode > i) {
        prior[nextNode] = i;
      }
    }

    // Pack up each node. That way we can calculate the maximum size of
    // any particular one, and they can be randomly accessed by multiplying
    // the node number by that size.
    //
    PackedByteBuf[] nodeBufs = new PackedByteBuf[nNodes];
    int maxSize = 0;
    for (int i = 0; i < nNodes; i++)
    {
      PackedByteBuf buf = nodeBufs[i] = new PackedByteBuf(20);

      // Check for un-handled node types.
      byte kind = tree.nodeKind[i];
      if (kind == Type.COMMENT || kind == Type.PROCESSING_INSTRUCTION)
      {
        // These should have been filtered out by HackedTinyBuilder
        throw new RuntimeException(
          "Internal error: processing instructions and comments " +
          "should have been filtered out");
      }

      // Kind
      buf.writeByte(kind);

      // Gather the optional fields for this node.
      NodeInfo node = tree.getNode(i);
      NodeInfo parentNode = node.getParent();
      int nameCode = nameCodes[i];
      int parent = (parentNode != null)
                   ? ((TinyNodeImpl)parentNode).getNodeNumber() : -1;
      int prevSib = prior[i];
      int nextSib = (nexts[i] > i) ? nexts[i] : -1;
      int child = node.hasChildNodes() ? (i + 1) : -1;
      int alpha = alphas[i];
      int beta = betas[i];

      // Decide once which fields are present, and use the same decision for
      // both the flag word and the payload so the two can never disagree.
      boolean hasName = nameCode >= 0;
      boolean hasParent = parent >= 0;
      boolean hasPrev = prevSib >= 0;
      boolean hasNext = nextSib >= 0;
      boolean hasChild = child >= 0;
      boolean hasAlpha = alpha != -1;
      boolean hasBeta = beta != -1;

      // Flag
      int flags = (hasName ? Flag.HAS_NAMECODE : 0) |
                  (hasParent ? Flag.HAS_PARENT : 0) |
                  (hasPrev ? Flag.HAS_PREV_SIBLING : 0) |
                  (hasNext ? Flag.HAS_NEXT_SIBLING : 0) |
                  (hasChild ? Flag.HAS_CHILD : 0) |
                  (hasAlpha ? Flag.HAS_ALPHA : 0) |
                  (hasBeta ? Flag.HAS_BETA : 0);
      buf.writeInt(flags);
      assert prevSib != nextSib || prevSib < 0;

      // Name code
      if (hasName) {
        int nameIdx = names.get(Integer.valueOf(nameCode));
        assert nameIdx >= 0 : "A name was missed when writing name codes";
        buf.writeInt(nameIdx);
      }

      // Parent
      if (hasParent) {
        buf.writeInt(parent);
      }

      // Prev sibling
      if (hasPrev) {
        buf.writeInt(prevSib);
      }

      // Next sibling.
      if (hasNext) {
        buf.writeInt(nextSib);
      }

      // First child (if any).
      if (hasChild) {
        assert child != 0;
        buf.writeInt(child);
      }

      // Alpha and beta
      if (hasAlpha) {
        buf.writeInt(alpha);
      }
      if (hasBeta) {
        buf.writeInt(beta);
      }

      // Now calculate the size of the buffer, and bump the max if needed
      buf.compact();
      maxSize = Math.max(maxSize, buf.length());
    } // for i

    // Okay, we're ready to write out the node table now. First comes the
    // number of nodes, followed by the size in bytes of each one.
    //
    // NOTE(review): the count written is the tree's full node count (which
    // includes any trailing stopper), while only nNodes entries follow.
    // Left as-is to keep the file format unchanged -- confirm against the
    // reader before altering.
    //
    out.writeInt(totalNodes);
    out.writeInt(maxSize);
    for (int i = 0; i < nNodes; i++) {
      nodeBufs[i].output(out, maxSize);
    }

    // All done.
    out.close();
  } // writeNodes()

  /**
   * Build and write out all the attributes in the tree. The resulting table
   * has variable-sized entries, one per run of consecutive attributes that
   * share a parent. The offset of each entry is recorded in the tree's
   * alpha array at the parent's index, which is why this must run before
   * {@link #writeNodes(SubStoreWriter)}.
   *
   * @param out SubStore to write to (closed before returning).
   */
  private void writeAttrs(SubStoreWriter out)
    throws IOException
  {
    int totalAttrs = tree.getNumberOfAttributes();
    int[] attParents = tree.getAttributeParentArray();
    int[] attCodes = tree.getAttributeNameCodeArray();
    CharSequence[] attValues = tree.getAttributeValueArray();
    PackedByteBuf buf = new PackedByteBuf(100);

    // Do a dry run to figure out the max size of any entry.
    int maxSize = 0;
    for (int i = 0; i < totalAttrs;) {
      int end = findAttrGroupEnd(attParents, i, totalAttrs);
      packAttrGroup(buf, attCodes, attValues, i, end);
      maxSize = Math.max(maxSize, buf.length());
      i = end;
    }

    // Write the max size of any block.
    out.writeInt(maxSize);

    // Write out each attrib group, and record its offset.
    int[] alphas = tree.getAlphaArray();
    for (int i = 0; i < totalAttrs;) {
      int end = findAttrGroupEnd(attParents, i, totalAttrs);
      packAttrGroup(buf, attCodes, attValues, i, end);

      // Record the offset in the attribute file.
      int parent = attParents[i];
      assert tree.nodeKind[parent] == Type.ELEMENT;
      alphas[parent] = (int)out.length();

      // Write out the data.
      buf.output(out);
      i = end;
    }

    // To avoid the reader having to worry about overrunning the subfile,
    // write an extra maxSize bytes.
    //
    byte[] tmp = new byte[maxSize];
    out.write(tmp);

    // All done.
    out.close();
  } // writeAttrs()

  /**
   * Finds the end of the run of consecutive attributes that have the same
   * parent as the attribute at 'start'.
   *
   * @param attParents Parent node number of each attribute
   * @param start      Index of the first attribute of the run
   * @param limit      Total number of attributes
   * @return Index just past the last attribute of the run
   */
  private static int findAttrGroupEnd(int[] attParents, int start, int limit) {
    int end = start + 1;
    while (end < limit && attParents[end] == attParents[start]) {
      end++;
    }
    return end;
  }

  /**
   * Resets 'buf' and packs one group of attributes into it: the count,
   * followed by a name index and string value for each attribute.
   *
   * @param buf       Buffer to fill (previous contents are discarded)
   * @param attCodes  Name code of each attribute
   * @param attValues Value of each attribute
   * @param start     Index of the first attribute in the group
   * @param end       Index just past the last attribute in the group
   */
  private void packAttrGroup(PackedByteBuf buf, int[] attCodes,
                             CharSequence[] attValues, int start, int end)
  {
    buf.reset();
    buf.writeInt(end - start);
    for (int j = start; j < end; j++) {
      // Name code
      int nameIdx = names.get(Integer.valueOf(attCodes[j]));
      assert nameIdx >= 0 : "A name was missed when writing name codes";
      buf.writeInt(nameIdx);

      // Value
      buf.writeString(attValues[j].toString());
    }
  }

  /**
   * Hook for checking that the tree doesn't use features we don't support.
   * Currently the only thing inspected is whether type-code arrays are
   * present, and those are deliberately ignored, so this never fails.
   */
  private void checkSupport()
    throws IOException
  {
    if (tree.getAttributeTypeCodeArray() != null ||
        tree.getTypeCodeArray() != null)
    {
      // We don't actually care. At the moment, it appears to be safe to
      // simply throw these away.
    }
  } // checkSupport()
} // class LazyTreeBuilder