package at.ac.tuwien.dsg.scaledom.dom; import java.io.Closeable; import java.io.IOException; import java.util.Iterator; import javax.xml.namespace.QName; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.events.Attribute; import javax.xml.stream.events.Characters; import javax.xml.stream.events.Comment; import javax.xml.stream.events.EndElement; import javax.xml.stream.events.Namespace; import javax.xml.stream.events.ProcessingInstruction; import javax.xml.stream.events.StartDocument; import javax.xml.stream.events.StartElement; import javax.xml.stream.events.XMLEvent; import org.apache.xerces.dom.ParentNode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Attr; import org.w3c.dom.CDATASection; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.Text; import at.ac.tuwien.dsg.scaledom.io.NodeLocation; import at.ac.tuwien.dsg.scaledom.io.NodeLocationFactory; import at.ac.tuwien.dsg.scaledom.lazy.LazyLoadingStrategy; import at.ac.tuwien.dsg.scaledom.parser.XmlEventLocation; import at.ac.tuwien.dsg.scaledom.parser.XmlParserEventListener; import at.ac.tuwien.dsg.scaledom.util.LowMemoryDetector; public class LoadProcess implements XmlParserEventListener, Closeable { /** Logger. */ private final static Logger log = LoggerFactory.getLogger(LoadProcess.class); private final ScaleDomDocument doc; private final LoadType loadType; private final LazyLoadingStrategy strategy; private final NodeLocationFactory nodeLocationFactory; private final long additionalOffset; /** Current state, outsourced to own class. */ private final LoadProcessState state; /** Low memory detector. */ private final LowMemoryDetector lowMemoryDetector; public LoadProcess(final ScaleDomDocument doc, final ParentNode parent, final LoadType loadType, final LazyLoadingStrategy strategy, final NodeLocationFactory nodeLocationFactory) { this(doc, parent, loadType, strategy, nodeLocationFactory, 0, 0); } public LoadProcess(final ScaleDomDocument doc, final ParentNode parent, final LoadType loadType, final LazyLoadingStrategy strategy, final NodeLocationFactory nodeLocationFactory, final int eventsToSkip, final long additionalOffset) { this.doc = doc; this.loadType = loadType; this.nodeLocationFactory = nodeLocationFactory; this.strategy = strategy; this.additionalOffset = additionalOffset; state = new LoadProcessState(parent, eventsToSkip); // Use a low memory detector during load process lowMemoryDetector = new LowMemoryDetector(0.99); } /** * Returns the number of created nodes. * * @return the number of created nodes. */ public long getNumberOfCreatedNodes() { return state.getNumberOfCreatedNodes(); } @Override public void close() throws IOException { // Remove low memory detector after loading is done lowMemoryDetector.close(); } @Override public boolean process(final XMLEvent event, final XmlEventLocation eventLocation) { // The parser input may include additional nodes in the beginning (e.g. the parent node of the children to be // loaded, a root node with namespace declarations, etc.), skip those if (state.shouldSkipEvent()) { state.skippedEvent(); return CONTINUE_PARSING; } // Convert the parser's outputted location into a node location final NodeLocation location = nodeLocationFactory.eventLocationToNodeLocation(eventLocation, additionalOffset); // The location of the event following an EndElement event, is the end location for the previous element node if (state.getLastEventType() == XMLStreamConstants.END_ELEMENT) { final ParentNode node = state.getPreviousParentNode(); node.getNodeLocation().setEndLocation(location); } // Filter uninteresting events if (filter(event)) { return CONTINUE_PARSING; } // Inform the parent node of existing children - regardless of whether the child is loaded in the end or not if (isNodeCreatingEvent(event)) { state.getCurrentParentNode().parsedChild(); } // Let the LazyLoadingStrategy decide whether event should be processed (and may result in a node) if (!strategy.shouldLoad(event, eventLocation)) { return CONTINUE_PARSING; } state.setLastEventType(event.getEventType()); // Process events which do not result in a node if (event.getEventType() == XMLStreamConstants.START_DOCUMENT) { processStartDocument((StartDocument) event); return CONTINUE_PARSING; } else if (event.getEventType() == XMLStreamConstants.END_ELEMENT) { processEndElement(event.asEndElement()); return CONTINUE_PARSING; } // Check the load type, whether a node should be created if (loadType == LoadType.RELOAD_ELEMENTS_ONLY && event.getEventType() != XMLStreamConstants.START_ELEMENT) { return CONTINUE_PARSING; } // Check if we have enough memory to create more nodes if (lowMemoryDetector.isLowMemory()) { final Node node = state.getCurrentParentNode(); if (node != null) { // log.error("Reached memory threshold, stopped loading children of '" // + state.getCurrentParentNode().getNodeName() + "' to prevent OutOfMemoryError!"); } else { log.error("Reached memory threshold, stopped loading."); } // TODO: Return ABORT_PARSING after LowMemoryDetector is properly working. // return ABORT_PARSING; } // Process events which result in a node switch (event.getEventType()) { case XMLStreamConstants.START_ELEMENT: processStartElement(event.asStartElement(), location); break; case XMLStreamConstants.CHARACTERS: final Characters text = event.asCharacters(); processText(text, location); break; case XMLStreamConstants.CDATA: final Characters cdata = event.asCharacters(); processCData(cdata, location); break; case XMLStreamConstants.COMMENT: final Comment comment = (Comment) event; processComment(comment, location); break; case XMLStreamConstants.PROCESSING_INSTRUCTION: final ProcessingInstruction processingInstruction = (ProcessingInstruction) event; processProcessingInstruction(processingInstruction, location); break; default: log.error("An XMLEvent of unknown type '" + event.getEventType() + "' did occur."); break; } return CONTINUE_PARSING; } private void processStartDocument(final StartDocument event) { doc.setDocumentURI(event.getSystemId()); doc.setXmlStandalone(event.isStandalone()); doc.setXmlVersion(event.getVersion()); } private void processStartElement(final StartElement startElement, final NodeLocation location) { final QName name = startElement.getName(); final Iterator<?> attributes = startElement.getAttributes(); // Create Element node // TODO: Use createElement() if no namespace is present? final Element elementNode = doc.createElementNS(name.getNamespaceURI(), name.toString(), name.getLocalPart()); final ParentNode asParentNode = (ParentNode) elementNode; asParentNode.setNodeLocation(location); // Add attributes while (attributes.hasNext()) { final Attribute attribute = (Attribute) attributes.next(); // Create Attr node // TODO: Use createAttributeNS() if namespace is present? final Attr attrNode = doc.createAttribute(attribute.getName().getLocalPart()); attrNode.setValue(attribute.getValue()); elementNode.setAttributeNode(attrNode); } // Cache namespaces final Iterator<?> namespaces = startElement.getNamespaces(); while (namespaces.hasNext()) { final Namespace namespace = (Namespace) namespaces.next(); asParentNode.parsedNamespace(namespace); } // Update current hierarchy state.getCurrentParentNode().appendChild(elementNode); state.addParentNodeAtBottom(asParentNode); state.createdNode(); } private void processEndElement(final EndElement endElement) { // Update current hierarchy state.removeCurrentParentNode(); } private void processText(final Characters text, final NodeLocation location) { // Create and append Text node final Text textNode = doc.createTextNode(text.getData()); state.getCurrentParentNode().appendChild(textNode); state.createdNode(); } private void processCData(final Characters cdata, final NodeLocation location) { // Create and append CDATASection node final CDATASection cdataSectionNode = doc.createCDATASection(cdata.getData()); state.getCurrentParentNode().appendChild(cdataSectionNode); state.createdNode(); } private void processComment(final Comment comment, final NodeLocation location) { // Create and append Comment node final org.w3c.dom.Comment commentNode = doc.createComment(comment.getText()); state.getCurrentParentNode().appendChild(commentNode); state.createdNode(); } private void processProcessingInstruction(final ProcessingInstruction processingInstruction, final NodeLocation location) { // Create and append ProcessingInstruction node final org.w3c.dom.ProcessingInstruction processingInstructionNode = doc.createProcessingInstruction( processingInstruction.getTarget(), processingInstruction.getData()); state.getCurrentParentNode().appendChild(processingInstructionNode); state.createdNode(); } private boolean filter(final XMLEvent event) { switch (event.getEventType()) { // The following event types are not interesting for ScaleDOM: case XMLStreamConstants.END_DOCUMENT: return true; // The following event types are not supported by ScaleDOM: case XMLStreamConstants.DTD: log.debug("An XMLEvent of type 'DTD' has been ignored."); return true; case XMLStreamConstants.ENTITY_REFERENCE: log.warn("An XMLEvent of type 'ENTITY_REFERENCE' has been ignored, please instruct the parser to expand entity references."); return true; // The following event types do not occur as first-order events when parsing a document source: case XMLStreamConstants.ATTRIBUTE: case XMLStreamConstants.ENTITY_DECLARATION: case XMLStreamConstants.NAMESPACE: case XMLStreamConstants.NOTATION_DECLARATION: log.error("An XMLEvent of type '" + event.getEventType() + "' did occur as first-order event."); return true; // The following event types do not occur at all // TODO: Check http://stackoverflow.com/questions/15010864/ case XMLStreamConstants.SPACE: log.error("An XMLEvent of type 'SPACE' did occur."); return true; } return false; } private boolean isNodeCreatingEvent(final XMLEvent event) { final int eventType = event.getEventType(); return eventType == XMLStreamConstants.START_ELEMENT || eventType == XMLStreamConstants.CHARACTERS || eventType == XMLStreamConstants.CDATA || eventType == XMLStreamConstants.COMMENT || eventType == XMLStreamConstants.PROCESSING_INSTRUCTION; } }