/*
 *  eXist Open Source Native XML Database
 *  Copyright (C) 2001-2007 The eXist team
 *  http://exist-db.org
 *
 *  This program is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public License
 *  as published by the Free Software Foundation; either version 2
 *  of the License, or (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, write to the Free Software Foundation
 *  Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
 *
 *  $Id$
 */
package org.exist;

import org.apache.log4j.Logger;
import org.exist.collections.CollectionConfiguration;
import org.exist.dom.AttrImpl;
import org.exist.dom.CDATASectionImpl;
import org.exist.dom.CommentImpl;
import org.exist.dom.DocumentImpl;
import org.exist.dom.DocumentTypeImpl;
import org.exist.dom.ElementImpl;
import org.exist.dom.ProcessingInstructionImpl;
import org.exist.dom.QName;
import org.exist.dom.StoredNode;
import org.exist.dom.TextImpl;
import org.exist.indexing.StreamListener;
import org.exist.storage.DBBroker;
import org.exist.storage.GeneralRangeIndexSpec;
import org.exist.storage.IndexSpec;
import org.exist.storage.NodePath;
import org.exist.storage.txn.Txn;
import org.exist.util.Configuration;
import org.exist.util.ProgressIndicator;
import org.exist.util.XMLChar;
import org.exist.util.XMLString;
import org.exist.util.pool.NodePool;
import org.exist.xquery.Constants;
import org.exist.xquery.value.StringValue;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.SAXParseException;
import org.xml.sax.ext.LexicalHandler;

import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import java.util.HashMap;
import java.util.Map;
import java.util.Observable;
import java.util.Stack;

/**
 * Parses a given input document via SAX, stores it to
 * the database and handles index-creation.
 *
 * The same handler is used in two modes, selected with
 * {@link #setValidating(boolean)}: while validating, nodes are only built
 * in memory and the child count of every element is recorded; otherwise
 * every node is additionally passed to the broker for storage and to the
 * index stream listener.
 *
 * @author wolf
 */
public class Indexer extends Observable implements ContentHandler, LexicalHandler, ErrorHandler {

    /** Size limit checked before the child-count cache is grown; beyond it the cache is dropped. */
    private static final int CACHE_CHILD_COUNT_MAX = 0x10000;

    public static final String ATTR_ID_TYPE = "ID";
    public static final String ATTR_IDREF_TYPE = "IDREF";
    public static final String ATTR_IDREFS_TYPE = "IDREFS";

    private final static Logger LOG = Logger.getLogger(Indexer.class);

    public static final String CONFIGURATION_ELEMENT_NAME = "indexer";
    public static final String CONFIGURATION_INDEX_ELEMENT_NAME = "index";
    public static final String SUPPRESS_WHITESPACE_ATTRIBUTE = "suppress-whitespace";
    public static final String PRESERVE_WS_MIXED_CONTENT_ATTRIBUTE = "preserve-whitespace-mixed-content";

    public static final String PROPERTY_INDEXER_CONFIG = "indexer.config";
    public final static String PROPERTY_SUPPRESS_WHITESPACE = "indexer.suppress-whitespace";
    public static final String PROPERTY_PRESERVE_WS_MIXED_CONTENT = "indexer.preserve-whitespace-mixed-content";

    protected DBBroker broker = null;
    protected Txn transaction;

    protected StreamListener indexListener;

    /** Accumulates character data between two structural SAX events. */
    protected XMLString charBuf = new XMLString();
    protected boolean inCDATASection = false;
    protected int currentLine = 0;
    protected NodePath currentPath = new NodePath();

    protected DocumentImpl document = null;
    protected IndexSpec indexSpec = null;

    protected boolean insideDTD = false;
    protected boolean validate = false;
    protected int level = 0;
    protected Locator locator = null;
    protected int normalize = XMLString.SUPPRESS_BOTH;
    /** Prefix mappings reported since the last element start; handed to the next element. */
    protected Map nsMappings = new HashMap();
    protected Element rootNode;

    /** Currently open elements, innermost on top. */
    protected Stack stack = new Stack();
    /** One text buffer per open element for which storeElement found a qname or value index. */
    protected Stack nodeContentStack = new Stack();

    protected StoredNode prevNode = null;

    protected String ignorePrefix = null;
    protected ProgressIndicator progress;

    /**
     * Despite its name this flag is loaded from
     * {@link #PROPERTY_PRESERVE_WS_MIXED_CONTENT}: when <code>true</code>,
     * whitespace-only text in mixed content is kept.
     */
    protected boolean suppressWSmixed = false;

    protected int docSize = 0;

    /* used to record the number of children of an element during
     * validation phase. later, when storing the nodes, we already
     * know the child count and don't need to update the element
     * a second time.
     */
    private int childCnt[] = new int[0x1000];

    // the current position in childCnt
    private int elementCnt = 0;

    // the current nodeFactoryInstanceCnt
    private int nodeFactoryInstanceCnt = 0;

    // reusable fields
    private TextImpl text = new TextImpl();
    private Stack usedElements = new Stack();

    /**
     * Create a new parser using the given database broker and
     * transaction to store the document. Equivalent to calling
     * {@link #Indexer(DBBroker, Txn, boolean)} with <code>priv = false</code>.
     *
     * @param broker      The database broker to use.
     * @param transaction The transaction to use for indexing
     * @exception EXistException
     */
    public Indexer(DBBroker broker, Txn transaction) throws EXistException {
        this(broker, transaction, false);
    }

    /**
     * Create a new parser using the given database broker and
     * transaction to store the document. Whitespace handling is read from
     * the broker's configuration.
     *
     * @param broker      The database broker to use.
     * @param transaction The transaction to use for indexing
     * @param priv        used by the security manager to
     *                    indicate that it needs privileged
     *                    access to the db. (Not read by this constructor.)
     * @exception EXistException
     */
    public Indexer(DBBroker broker, Txn transaction, boolean priv) throws EXistException {
        this.broker = broker;
        this.transaction = transaction;
        //TODO : move the configuration in the constructor or in a dedicated method
        Configuration config = broker.getConfiguration();
        String suppressWS = (String) config.getProperty(PROPERTY_SUPPRESS_WHITESPACE);
        if (suppressWS != null) {
            // any other value leaves the default (XMLString.SUPPRESS_BOTH) in place
            if ("leading".equals(suppressWS))
                normalize = XMLString.SUPPRESS_LEADING_WS;
            else if ("trailing".equals(suppressWS))
                normalize = XMLString.SUPPRESS_TRAILING_WS;
            else if ("none".equals(suppressWS))
                normalize = 0;
        }

        Boolean temp;
        if ((temp = (Boolean) config.getProperty(PROPERTY_PRESERVE_WS_MIXED_CONTENT)) != null)
            suppressWSmixed = temp.booleanValue();
    }

    /**
     * Switch between the validation pass and the storage pass. When
     * validation is turned off, the broker's index controller is bound to
     * the current document in STORE mode and its stream listener is fetched.
     *
     * @param validate <code>true</code> to only build nodes in memory,
     *                 <code>false</code> to store and index them as well
     */
    public void setValidating(boolean validate) {
        this.validate = validate;
        if (!validate) {
            broker.getIndexController().setDocument(document, StreamListener.STORE);
            this.indexListener = broker.getIndexController().getStreamListener();
        }
    }

    /**
     * Prepare the indexer for parsing a new document. This will
     * reset the internal state of the Indexer object.
     *
     * @param doc              the document that will receive the parsed nodes
     * @param collectionConfig source of the index configuration; when
     *                         <code>null</code> the current index
     *                         specification is left untouched
     */
    public void setDocument(DocumentImpl doc, CollectionConfiguration collectionConfig) {
        document = doc;
        if (collectionConfig != null)
            indexSpec = collectionConfig.getIndexConfiguration();
        // reset internal fields
        level = 0;
        currentPath.reset();
        stack = new Stack();
        docSize = 0;
        nsMappings.clear();
        indexListener = null;
        rootNode = null;
        setPrevious(null);
    }

    /**
     * Set the document object to be used by this Indexer. This
     * method doesn't reset the internal state.
     *
     * @param doc the document to use
     */
    public void setDocumentObject(DocumentImpl doc) {
        document = doc;
    }

    public DocumentImpl getDocument() {
        return document;
    }

    /**
     * @return the number of element starts seen since the last
     *         {@link #startDocument()} or {@link #setDocument}
     */
    public int getDocSize() {
        return docSize;
    }

    /**
     * Buffers character data; the text node is only created once the next
     * structural event (element, comment, PI, CDATA boundary) arrives.
     */
    public void characters(char[] ch, int start, int length) {
        if (length <= 0)
            return;
        if (charBuf != null) {
            charBuf.append(ch, start, length);
        } else {
            charBuf = new XMLString(ch, start, length);
        }
    }

    /**
     * Creates a comment node. Comments inside the DTD are ignored. Pending
     * character data is flushed (without normalization) before the comment
     * is appended to the current element.
     */
    public void comment(char[] ch, int start, int length) {
        if (insideDTD)
            return;
        CommentImpl comment = new CommentImpl(ch, start, length);
        comment.setOwnerDocument(document);
        if (stack.empty()) {
            // comment outside the root element: becomes a child of the document
            comment.setNodeId(broker.getBrokerPool().getNodeFactory().createInstance(nodeFactoryInstanceCnt++));
            if (!validate) {
                broker.storeNode(transaction, comment, currentPath, indexSpec);
            }
            document.appendChild(comment);
        } else {
            ElementImpl last = (ElementImpl) stack.peek();
            if (charBuf != null && charBuf.length() > 0) {
                flushText(last, charBuf);
                charBuf.reset();
            }
            last.appendChildInternal(prevNode, comment);
            setPrevious(comment);
            if (!validate) {
                broker.storeNode(transaction, comment, currentPath, indexSpec);
            }
        }
    }

    /**
     * Turns the characters buffered since {@link #startCDATA()} into a
     * CDATA section node of the current element.
     */
    public void endCDATA() {
        if (!stack.isEmpty()) {
            ElementImpl last = (ElementImpl) stack.peek();
            if (charBuf != null && charBuf.length() > 0) {
                CDATASectionImpl cdata = new CDATASectionImpl(charBuf);
                cdata.setOwnerDocument(document);
                last.appendChildInternal(prevNode, cdata);
                if (!validate) {
                    broker.storeNode(transaction, cdata, currentPath, indexSpec);
                    if (indexListener != null) {
                        indexListener.characters(transaction, cdata, currentPath);
                    }
                }
                setPrevious(cdata);
                appendCharBufToContentBuffers();
                charBuf.reset();
            }
        }
        inCDATASection = false;
    }

    public void endDTD() {
        insideDTD = false;
    }

    /**
     * In storage mode, marks the progress indicator as finished and
     * notifies observers.
     */
    public void endDocument() {
        if (!validate) {
            progress.finish();
            setChanged();
            notifyObservers(progress);
        }
        // LOG.debug("elementCnt = " + childCnt.length);
    }

    /**
     * Closes the current element: flushes pending text, notifies broker and
     * index listener (storage mode) or records the child count (validation
     * mode). Nothing happens if <code>qname</code> does not match the
     * element on top of the stack.
     */
    public void endElement(String namespace, String name, String qname) {
        final ElementImpl last = (ElementImpl) stack.peek();
        if (!last.getNodeName().equals(qname))
            return;

        if (charBuf != null && charBuf.length() > 0) {
            // remove whitespace if the node has just a single text child,
            // keep whitespace for mixed content.
            final XMLString normalized;
            if ((charBuf.isWhitespaceOnly() && suppressWSmixed) || last.preserveSpace()) {
                normalized = charBuf;
            } else if (last.getChildCount() == 0) {
                normalized = charBuf.normalize(normalize);
            } else {
                normalized = charBuf.isWhitespaceOnly() ? null : charBuf;
            }
            if (normalized != null && normalized.length() > 0) {
                flushText(last, normalized);
            }
            charBuf.reset();
        }

        stack.pop();

        if (!validate) {
            // storeElement pushed a content buffer only for elements with a
            // qname or value index, so pop under the very same condition
            XMLString elemContent = null;
            if (GeneralRangeIndexSpec.hasQNameOrValueIndex(last.getIndexType())) {
                elemContent = (XMLString) nodeContentStack.pop();
            }
            final String content = elemContent == null ? null : elemContent.toString();
            broker.endElement(last, currentPath, content);
            if (indexListener != null)
                indexListener.endElement(transaction, last, currentPath);
        }

        currentPath.removeLastComponent();

        if (validate) {
            if (childCnt != null)
                setChildCount(last);
        } else {
            // NOTE(review): sets the document as its own owner; kept exactly as found - confirm intent
            document.setOwnerDocument(document);
            // the element was stored with the cached child count; rewrite it
            // only when no count was cached or the cached one was wrong
            if ((childCnt == null && last.getChildCount() > 0)
                    || (childCnt != null && childCnt[last.getPosition()] != last.getChildCount())) {
                broker.updateNode(transaction, last, false);
            }
        }
        setPrevious(last);
        level--;
    }

    /**
     * Records the child count of an element in the cache, growing the cache
     * as needed. The array is doubled until the element's position fits; a
     * single doubling is not enough when the position jumps by more than the
     * current length. Once the array would have to grow past the size limit
     * the cache is dropped (set to <code>null</code>).
     *
     * @param last the element that was just closed
     */
    private void setChildCount(final ElementImpl last) {
        final int position = last.getPosition();
        if (position >= childCnt.length) {
            int newLength = childCnt.length;
            do {
                if (newLength > CACHE_CHILD_COUNT_MAX) {
                    childCnt = null;
                    return;
                }
                newLength *= 2;
            } while (position >= newLength);
            int n[] = new int[newLength];
            System.arraycopy(childCnt, 0, n, 0, childCnt.length);
            childCnt = n;
        }
        childCnt[position] = last.getChildCount();
    }

    public void endEntity(String name) {
    }

    public void endPrefixMapping(String prefix) {
        if (ignorePrefix != null && prefix.equals(ignorePrefix)) {
            ignorePrefix = null;
        } else {
            nsMappings.remove(prefix);
        }
    }

    /**
     * Logs the parse error at debug level and rethrows it wrapped in a
     * SAXException carrying line and column.
     */
    public void error(SAXParseException e) throws SAXException {
        String msg = "error at (" + e.getLineNumber() + "," + e.getColumnNumber() + ") : " + e.getMessage();
        LOG.debug(msg);
        throw new SAXException(msg, e);
    }

    /**
     * Logs the fatal parse error at debug level and rethrows it wrapped in
     * a SAXException carrying line and column.
     */
    public void fatalError(SAXParseException e) throws SAXException {
        String msg = "fatal error at (" + e.getLineNumber() + "," + e.getColumnNumber() + ") : " + e.getMessage();
        LOG.debug(msg);
        throw new SAXException(msg, e);
    }

    public void ignorableWhitespace(char[] ch, int start, int length) {
    }

    /**
     * Creates a processing-instruction node. Pending character data is
     * normalized and flushed before the PI is appended to the current
     * element.
     */
    public void processingInstruction(String target, String data) {
        ProcessingInstructionImpl pi = new ProcessingInstructionImpl(target, data);
        pi.setOwnerDocument(document);
        if (stack.isEmpty()) {
            // PI outside the root element: becomes a child of the document
            pi.setNodeId(broker.getBrokerPool().getNodeFactory().createInstance(nodeFactoryInstanceCnt++));
            if (!validate) {
                broker.storeNode(transaction, pi, currentPath, indexSpec);
            }
            document.appendChild(pi);
        } else {
            ElementImpl last = (ElementImpl) stack.peek();
            if (charBuf != null && charBuf.length() > 0) {
                XMLString normalized = charBuf.normalize(normalize);
                if (normalized.length() > 0) {
                    flushText(last, normalized);
                }
                charBuf.reset();
            }
            last.appendChildInternal(prevNode, pi);
            setPrevious(pi);
            if (!validate) {
                broker.storeNode(transaction, pi, currentPath, indexSpec);
            }
        }
    }

    public void setDocumentLocator(Locator locator) {
        this.locator = locator;
    }

    /**
     * set SAX parser feature. This method will catch (and ignore) exceptions
     * if the used parser does not support a feature.
     *
     * @param factory the factory to configure
     * @param feature the feature name
     * @param value   the value to set the feature to
     */
    private void setFeature(SAXParserFactory factory, String feature, boolean value) {
        try {
            factory.setFeature(feature, value);
        } catch (SAXNotRecognizedException e) {
            LOG.warn(e);
        } catch (SAXNotSupportedException snse) {
            LOG.warn(snse);
        } catch (ParserConfigurationException pce) {
            LOG.warn(pce);
        }
    }

    public void skippedEntity(String name) {
    }

    /**
     * Flushes text buffered so far as an ordinary text node, so that the
     * buffer only holds the CDATA content when {@link #endCDATA()} runs.
     */
    public void startCDATA() {
        if (!stack.isEmpty()) {
            ElementImpl last = (ElementImpl) stack.peek();
            if (charBuf != null && charBuf.length() > 0) {
                flushText(last, charBuf);
                charBuf.reset();
            }
        }
        inCDATASection = true;
    }

    // Methods of interface LexicalHandler
    // used to determine Doctype

    public void startDTD(String name, String publicId, String systemId) {
        DocumentTypeImpl docType = new DocumentTypeImpl(name, publicId, systemId);
        document.setDocumentType(docType);
        insideDTD = true;
    }

    public void startDocument() {
        if (!validate) {
            progress = new ProgressIndicator(currentLine, 100);
            document.setChildCount(0);
            elementCnt = 0;
        }
        docSize = 0;
    }

    /**
     * Creates the element node and its attribute nodes. Namespace
     * declarations (<code>xmlns*</code>) and attributes in eXist's own
     * namespace are not stored and do not count as attributes.
     *
     * @throws SAXException if an <code>xml:id</code> attribute does not
     *                      hold a valid NCName
     */
    public void startElement(String namespace, String name, String qname, Attributes attributes)
            throws SAXException {
        // calculate number of real attributes:
        // don't store namespace declarations
        int attrLength = attributes.getLength();
        String attrQName;
        String attrNS;
        for (int i = 0; i < attributes.getLength(); i++) {
            attrNS = attributes.getURI(i);
            attrQName = attributes.getQName(i);
            if (attrQName.startsWith("xmlns") || attrNS.equals(Namespaces.EXIST_NS))
                --attrLength;
        }

        ElementImpl last;
        ElementImpl node;
        int p = qname.indexOf(':');
        String prefix = (p != Constants.STRING_NOT_FOUND) ? qname.substring(0, p) : "";
        QName qn = broker.getBrokerPool().getSymbols().getQName(Node.ELEMENT_NODE, namespace, name, prefix);

        if (!stack.empty()) {
            last = (ElementImpl) stack.peek();
            if (charBuf != null) {
                if (charBuf.isWhitespaceOnly()) {
                    // whitespace between elements is only kept when configured
                    // and when it is not the first child of its parent
                    if (suppressWSmixed) {
                        if (charBuf.length() > 0 && last.getChildCount() > 0) {
                            flushText(last, charBuf);
                        }
                    }
                } else if (charBuf.length() > 0) {
                    // mixed element content: don't normalize the text node, just check
                    // if there is any text at all
                    flushText(last, charBuf);
                }
                charBuf.reset();
            }
            // recycle an element instance if one is available
            if (!usedElements.isEmpty()) {
                node = (ElementImpl) usedElements.pop();
                node.setNodeName(qn);
            } else {
                node = new ElementImpl(qn);
            }
            // copy xml:space setting
            node.setPreserveSpace(last.preserveSpace());
            // append the node to its parent
            // (computes the node id and updates the parent's child count)
            last.appendChildInternal(prevNode, node);
            setPrevious(null);
            node.setOwnerDocument(document);
            registerElement(node, qn, attrLength);
        } else {
            // root element
            node = new ElementImpl(qn);
            rootNode = node;
            setPrevious(null);
            node.setOwnerDocument(document);
            node.setNodeId(broker.getBrokerPool().getNodeFactory().createInstance(nodeFactoryInstanceCnt++));
            registerElement(node, qn, attrLength);
            document.appendChild(node);
        }

        level++;

        String attrPrefix;
        String attrLocalName;
        String attrType;
        for (int i = 0; i < attributes.getLength(); i++) {
            attrNS = attributes.getURI(i);
            attrLocalName = attributes.getLocalName(i);
            attrQName = attributes.getQName(i);
            // skip xmlns-attributes and attributes in eXist's namespace.
            // attrLength already excludes them (first loop above), so it
            // must not be decremented a second time here.
            if (attrQName.startsWith("xmlns") || attrNS.equals(Namespaces.EXIST_NS))
                continue;

            p = attrQName.indexOf(':');
            attrPrefix = (p != Constants.STRING_NOT_FOUND) ? attrQName.substring(0, p) : null;
            final AttrImpl attr = (AttrImpl) NodePool.getInstance().borrowNode(Node.ATTRIBUTE_NODE);
            attr.setNodeName(broker.getBrokerPool().getSymbols().getQName(Node.ATTRIBUTE_NODE, attrNS,
                    attrLocalName, attrPrefix));
            attr.setValue(attributes.getValue(i));
            attr.setOwnerDocument(document);

            attrType = attributes.getType(i);
            if (attrType.equals(ATTR_ID_TYPE)) {
                attr.setType(AttrImpl.ID);
            } else if (attrType.equals(ATTR_IDREF_TYPE)) {
                attr.setType(AttrImpl.IDREF);
            } else if (attrType.equals(ATTR_IDREFS_TYPE)) {
                attr.setType(AttrImpl.IDREFS);
            } else if (attr.getQName().equalsSimple(Namespaces.XML_ID_QNAME)) {
                // an xml:id attribute. Normalize the attribute and set its type to ID
                attr.setValue(StringValue.trimWhitespace(StringValue.collapseWhitespace(attr.getValue())));
                if (!XMLChar.isValidNCName(attr.getValue()))
                    throw new SAXException("Value of xml:id attribute is not a valid NCName: " + attr.getValue());
                attr.setType(AttrImpl.ID);
            } else if (attr.getQName().equalsSimple(Namespaces.XML_SPACE_QNAME)) {
                node.setPreserveSpace("preserve".equals(attr.getValue()));
            }

            node.appendChildInternal(prevNode, attr);
            setPrevious(attr);
            if (!validate) {
                broker.storeNode(transaction, attr, currentPath, indexSpec);
                if (indexListener != null)
                    indexListener.attribute(transaction, attr, currentPath);
            }
        }
        // same call as before the fix; it now re-applies the correct count
        if (attrLength > 0)
            node.setAttributes((short) attrLength);

        // notify observers about progress every 100 lines
        if (locator != null) {
            currentLine = locator.getLineNumber();
            if (!validate) {
                progress.setValue(currentLine);
                if (progress.changed()) {
                    setChanged();
                    notifyObservers(progress);
                }
            }
        }
        ++docSize;
    }

    /**
     * Shared tail of element creation: sets the attribute count, hands over
     * pending namespace mappings, pushes the element on the stack and the
     * path, assigns its position and - in storage mode - stores it with the
     * child count cached during validation.
     *
     * @param node       the element being opened
     * @param qn         its qualified name
     * @param attrLength number of attributes that will be stored for it
     */
    private void registerElement(ElementImpl node, QName qn, int attrLength) {
        node.setAttributes((short) attrLength);
        if (nsMappings != null && nsMappings.size() > 0) {
            node.setNamespaceMappings(nsMappings);
            nsMappings.clear();
        }
        stack.push(node);
        currentPath.addComponent(qn);
        node.setPosition(elementCnt++);
        if (!validate) {
            if (childCnt != null) {
                node.setChildCount(childCnt[node.getPosition()]);
            }
            storeElement(node);
        }
    }

    /**
     * Appends the shared, reusable text node with the given content to
     * <code>parent</code>, stores it when not validating and makes it the
     * previous sibling. The caller is responsible for resetting
     * <code>charBuf</code>.
     *
     * @param parent the element receiving the text node
     * @param data   the content of the text node
     */
    private void flushText(ElementImpl parent, XMLString data) {
        text.setData(data);
        text.setOwnerDocument(document);
        parent.appendChildInternal(prevNode, text);
        if (!validate)
            storeText();
        setPrevious(text);
    }

    /**
     * Appends the current character buffer to every content buffer on
     * nodeContentStack.
     */
    private void appendCharBufToContentBuffers() {
        if (!nodeContentStack.isEmpty()) {
            for (int i = 0; i < nodeContentStack.size(); i++) {
                XMLString next = (XMLString) nodeContentStack.get(i);
                next.append(charBuf);
            }
        }
    }

    /**
     * Stores the shared text node and forwards it to the index listener.
     * Note that the content buffers receive the raw character buffer, not
     * the (possibly normalized) data of the text node.
     */
    private void storeText() {
        appendCharBufToContentBuffers();
        broker.storeNode(transaction, text, currentPath, indexSpec);
        if (indexListener != null) {
            indexListener.characters(transaction, text, currentPath);
        }
    }

    /**
     * Stores an element, notifies the index listener and resets the
     * element's child count to zero. Pushes a content buffer for elements
     * with a qname or value index.
     */
    private void storeElement(ElementImpl node) {
        broker.storeNode(transaction, node, currentPath, indexSpec);
        if (indexListener != null)
            indexListener.startElement(transaction, node, currentPath);
        node.setChildCount(0);
        if (GeneralRangeIndexSpec.hasQNameOrValueIndex(node.getIndexType())) {
            XMLString contentBuf = new XMLString();
            nodeContentStack.push(contentBuf);
        }
    }

    public void startEntity(String name) {
    }

    public void startPrefixMapping(String prefix, String uri) {
        // skip the eXist namespace
        //		if (uri.equals(Namespaces.EXIST_NS)) {
        //			ignorePrefix = prefix;
        //			return;
        //		}
        nsMappings.put(prefix, uri);
    }

    /**
     * Parser warnings are not tolerated: every warning is rethrown as a
     * SAXException carrying line and column.
     */
    public void warning(SAXParseException e) throws SAXException {
        String msg = "warning at (" + e.getLineNumber() + "," + e.getColumnNumber() + ") : " + e.getMessage();
        throw new SAXException(msg, e);
    }

    /**
     * Replaces the remembered previous sibling. The node being replaced is
     * recycled according to its type: attributes are released, elements
     * other than the root are cleared and kept for reuse, text nodes are
     * cleared.
     *
     * @param previous the new previous node, may be <code>null</code>
     */
    private void setPrevious(StoredNode previous) {
        if (prevNode != null) {
            switch (prevNode.getNodeType()) {
                case Node.ATTRIBUTE_NODE:
                    prevNode.release();
                    break;
                case Node.ELEMENT_NODE:
                    if (prevNode != rootNode) {
                        prevNode.clear();
                        usedElements.push(prevNode);
                    }
                    break;
                case Node.TEXT_NODE:
                    prevNode.clear();
                    break;
            }
        }
        prevNode = previous;
    }
}