package org.compass.core.xml.jdom.converter.support; import java.util.HashMap; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import org.jdom.Attribute; import org.jdom.Content; import org.jdom.Document; import org.jdom.Element; import org.jdom.JDOMFactory; import org.jdom.Namespace; import org.jdom.UncheckedJDOMFactory; /** * Builds a JDOM {@link org.jdom.Document org.jdom.Document} using a * {@link javax.xml.stream.XMLStreamReader}. * * @author kimchy */ public class StAXBuilder { /** * Map that contains conversion from textual attribute types StAX uses, * to int values JDOM uses. */ final static HashMap<String, Integer> attrTypes = new HashMap<String, Integer>(32); static { attrTypes.put("CDATA", Attribute.CDATA_TYPE); attrTypes.put("cdata", Attribute.CDATA_TYPE); attrTypes.put("ID", Attribute.ID_TYPE); attrTypes.put("id", Attribute.ID_TYPE); attrTypes.put("IDREF", Attribute.IDREF_TYPE); attrTypes.put("idref", Attribute.IDREF_TYPE); attrTypes.put("IDREFS", Attribute.IDREFS_TYPE); attrTypes.put("idrefs", Attribute.IDREFS_TYPE); attrTypes.put("ENTITY", Attribute.ENTITY_TYPE); attrTypes.put("entity", Attribute.ENTITY_TYPE); attrTypes.put("ENTITIES", Attribute.ENTITIES_TYPE); attrTypes.put("entities", Attribute.ENTITIES_TYPE); attrTypes.put("NMTOKEN", Attribute.NMTOKEN_TYPE); attrTypes.put("nmtoken", Attribute.NMTOKEN_TYPE); attrTypes.put("NMTOKENS", Attribute.NMTOKENS_TYPE); attrTypes.put("nmtokens", Attribute.NMTOKENS_TYPE); attrTypes.put("NOTATION", Attribute.NOTATION_TYPE); attrTypes.put("notation", Attribute.NOTATION_TYPE); attrTypes.put("ENUMERATED", Attribute.ENUMERATED_TYPE); attrTypes.put("enumerated", Attribute.ENUMERATED_TYPE); } // // // Configuration settings: /** * The factory for creating new JDOM objects */ private JDOMFactory factory = null; /** * Whether ignorable white space should be ignored, ie not added * in the resulting JDOM tree. If true, it will be ignored; if false, * it will be added in the tree. Default value if false. */ protected boolean cfgIgnoreWS = false; /** * Object that will be used when trying to remove indentation white * space: if so, the object is consulted to figure out what consistutes * indentation white space, as well as about context in which such * white space is to be removed. * <p> * Note that only such text events (CHARACTERS) are considered that * are not known to be fully ignorable (ignorable white space would * be reported as SPACE) by this removal process. SPACE events can * be trimmed simply by setting {@link #cfgIgnoreWS} to true. */ protected StAXTextModifier textModifier = null; /** * Default constructor. */ public StAXBuilder() { } /* * This sets a custom JDOMFactory for the builder. Use this to build * the tree with your own subclasses of the JDOM classes. * * @param factory <code>JDOMFactory</code> to use */ public void setFactory(JDOMFactory f) { factory = f; } public void setTextModifier(StAXTextModifier mod) { textModifier = mod; } /** * Method used to set value of {@link #cfgIgnoreWS}; that is, to * make parser either remove ignorable white space (true), or * to include it (false). * <p> * Whether all-whitespace text segment is ignorable white space or * not is based on DTD read in, as per XML specifications (white space * is only significant in mixed content or pure text elements). */ public void setIgnoreWhitespace(boolean state) { cfgIgnoreWS = state; } /** * Method used to enable or disable automatic heuristic removal * of indentation white If set to true, the builder will * try to remove white space that seems to be used for * indentation purposes; otherwise it will not try to do any removal. * <p> * Note that this setting only applies to all-whitespace segments * that have NOT been determined to be ignorable white space (either * because DTD is not available, or because such white space is in * mixed or text-only element content). As such it is a heuristics * that should only be enabled when application knows that such * white space removal does not cause problems. * <p> * Also note that internally the method calls * {@link #setTextModifier} with either the default text modifier * (true), or with null (false). */ public void setRemoveIndentation(boolean state) { if (state) { setTextModifier(IndentRemover.getInstance()); } else { setTextModifier(null); } } /** * Returns the current {@link org.jdom.JDOMFactory} in use, if * one has been previously set with {@link #setFactory}, otherwise * null. * * @return the factory builder will use */ public JDOMFactory getFactory() { return factory; } /** * This will build a JDOM tree given a StAX stream reader. * * @param r Stream reader from which input is read. * @return <code>Document</code> - JDOM document object. * @throws XMLStreamException If the reader threw such exception (to * indicate a parsing or I/O problem) */ public Document build(XMLStreamReader r) throws XMLStreamException { /* Should we do sanity checking to see that r is positioned at * beginning? Not doing so will allow creating documents from * sub-trees, though? (not necessarily, depending on the * build loop: it may expect END_DOCUMENT?) */ JDOMFactory f = factory; if (f == null) { f = new UncheckedJDOMFactory(); } Document doc = f.document(null); buildTree(f, r, doc, textModifier); return doc; } /** * This takes a <code>XMLStreamReader</code> and builds up * a JDOM tree. Recursion has been eliminated by using nodes' * parent/child relationship; this improves performance somewhat * (classic recursion-by-iteration-and-explicit stack transformation) * * @param f Node factory to use for creating JDOM nodes * @param r Stream reader to use for reading the document from which * to build the tree * @param doc JDOM <code>Document</code> being built. * @param tmod Text modifier to use for modifying content of text * nodes (CHARACTERS, not CDATA), if any; null if no modifications * are needed (modifier is usually used for trimming unnecessary * but non-ignorable white space). */ protected void buildTree(JDOMFactory f, XMLStreamReader r, Document doc, StAXTextModifier tmod) throws XMLStreamException { Element current = null; // At top level /* Only relevant when trying to trim indentation. But if so, let's * just always allow modifications in prolog/epilog. */ boolean allowTextMods = (tmod != null); int evtType = XMLStreamConstants.START_DOCUMENT; main_loop: while (true) { int prevEvent = evtType; evtType = r.next(); /* 11-Dec-2004, TSa: We may want to trim (indentation) white * space... and it's easiest to do as a completely separate * piece of logic, before the main switch. */ if (allowTextMods) { // Ok; did we get CHARACTERS to potentially modify? if (evtType == XMLStreamConstants.CHARACTERS) { // Mayhaps we could be interested in modifying it? if (tmod.possiblyModifyText(r, prevEvent)) { /* Need to get text before iterating to see the * following event (as that'll lose it) */ String txt = r.getText(); evtType = r.next(); // So how should the text be modified if at all? txt = tmod.textToIncludeBetween(r, prevEvent, evtType, txt); // Need to output if it's non-empty text, then: if (txt != null && txt.length() > 0) { /* See discussion below for CHARACTERS case; basically * we apparently can't add anything in epilog/prolog, * not even white space. */ if (current != null) { f.addContent(current, f.text(txt)); } } prevEvent = XMLStreamConstants.CHARACTERS; // Ok, let's fall down to handle new current event } } // And then can just fall back to the regular handling } Content child; switch (evtType) { case XMLStreamConstants.CDATA: child = f.cdata(r.getText()); break; case XMLStreamConstants.SPACE: if (cfgIgnoreWS) { continue main_loop; } // fall through case XMLStreamConstants.CHARACTERS: /* Small complication: although (ignorable) white space * is allowed in prolog/epilog, and StAX may report such * event, JDOM barfs if trying to add it. Thus, let's just * ignore all textual stuff outside the tree: */ if (current == null) { continue main_loop; } child = f.text(r.getText()); break; case XMLStreamConstants.COMMENT: child = f.comment(r.getText()); break; case XMLStreamConstants.END_DOCUMENT: break main_loop; case XMLStreamConstants.END_ELEMENT: current = current.getParentElement(); if (tmod != null) { allowTextMods = tmod.allowModificationsAfter(r, evtType); } continue main_loop; case XMLStreamConstants.ENTITY_DECLARATION: case XMLStreamConstants.NOTATION_DECLARATION: /* Shouldn't really get these, but maybe some stream readers * do provide the info. If so, better ignore it -- DTD event * should have most/all we need. */ continue main_loop; case XMLStreamConstants.ENTITY_REFERENCE: child = f.entityRef(r.getLocalName()); break; case XMLStreamConstants.PROCESSING_INSTRUCTION: child = f.processingInstruction(r.getPITarget(), r.getPIData()); break; case XMLStreamConstants.START_ELEMENT: // Ok, need to add a new element... { Element newElem = null; String nsURI = r.getNamespaceURI(); String elemPrefix = r.getPrefix(); // needed for special handling of elem's namespace String ln = r.getLocalName(); if (nsURI == null || nsURI.length() == 0) { if (elemPrefix == null || elemPrefix.length() == 0) { newElem = f.element(ln); } else { /* Happens when a prefix is bound to the default * (empty) namespace... */ newElem = f.element(ln, elemPrefix, ""); } } else { newElem = f.element(ln, elemPrefix, nsURI); } /* Let's add element right away (probably have to do * it to bind attribute namespaces, too) */ if (current == null) { // at root doc.setRootElement(newElem); } else { f.addContent(current, newElem); } // Any declared namespaces? for (int i = 0, len = r.getNamespaceCount(); i < len; ++i) { String prefix = r.getNamespacePrefix(i); if (prefix == null) { prefix = ""; } Namespace ns = Namespace.getNamespace(prefix, r.getNamespaceURI(i)); // JDOM has special handling for element's "own" ns: if (prefix.equals(elemPrefix)) { ; // already set by when it was constructed... } else { f.addNamespaceDeclaration(newElem, ns); } } // And then the attributes: for (int i = 0, len = r.getAttributeCount(); i < len; ++i) { String prefix = r.getAttributePrefix(i); Namespace ns; if (prefix == null || prefix.length() == 0) { // Attribute not in any namespace ns = Namespace.NO_NAMESPACE; } else { ns = newElem.getNamespace(prefix); } Attribute attr = f.attribute(r.getAttributeLocalName(i), r.getAttributeValue(i), resolveAttrType(r.getAttributeType(i)), ns); f.setAttribute(newElem, attr); } // And then 'push' new element... current = newElem; } if (tmod != null) { allowTextMods = tmod.allowModificationsAfter(r, evtType); } // Already added the element, can continue continue main_loop; case XMLStreamConstants.START_DOCUMENT: /* This should only be received at the beginning of document... * so, should we indicate the problem or not? */ /* For now, let it pass: maybe some (broken) readers pass * that info as first event in beginning of doc? */ continue main_loop; case XMLStreamConstants.DTD: /* !!! Note: StAX does not expose enough information about * doctype declaration (specifically, public and system id!); * should (re-)parse information... not yet implemented */ // TBI continue main_loop; // Should never get these, from a stream reader: /* (commented out entries are just FYI; default catches * them all) */ //case XMLStreamConstants.ATTRIBUTE: //case XMLStreamConstants.NAMESPACE: default: throw new XMLStreamException("Unrecognized iterator event type: " + r.getEventType() + "; should not receive such types (broken stream reader?)"); } if (child != null) { if (current == null) { f.addContent(doc, child); } else { f.addContent(current, child); } } } } /** * Method called when option is turned on; * to determine if current CHARACTERS event looks like it might * be used for indentation purposes (it is all white space, and * is either immediately after a start element, or could be * immediately before a start element). * <p> * The default implementation just checks whether the text segment * (known to be all white space) starts with a * linefeed character. */ protected boolean isIndentationWhitespace(XMLStreamReader r) throws XMLStreamException { String text = r.getText(); // Should never be empty... but let's be sure if (text.length() > 0) { char c = text.charAt(0); return (c == '\n' || c == '\r'); } return false; } // // // Private methods: private static int resolveAttrType(String typeStr) { if (typeStr != null && typeStr.length() > 0) { Integer I = attrTypes.get(typeStr); if (I != null) { return I.intValue(); } } return Attribute.UNDECLARED_TYPE; } // // // Basic text modifier class(es) public static class IndentRemover extends StAXTextModifier { final static IndentRemover sInstance = new IndentRemover(); protected IndentRemover() { super(); } public static IndentRemover getInstance() { return sInstance; } /** * Always removes indentation after * all start and elements without any further checks; essentially * allowing (indentation) white space removal anywhere in the * document. */ public boolean allowModificationsAfter(XMLStreamReader r, int eventType) throws XMLStreamException { return true; } /** * Enables modifications for * so-called "indentation * white space", ie. all-whitespace (non-CDATA) text segment that * starts with * a linefeed character (\n or \r); provided it follows a non-text * event (anything other than CDATA, ENTITY_REFERENCE and CHARACTERS; * none of which usually should be adjacent to CHARACTERS event, * if text coalescing is enabled, and automatic entity expansion * is not disabled). */ public boolean possiblyModifyText(XMLStreamReader r, int prevEvent) throws XMLStreamException { if (r.getEventType() == XMLStreamConstants.CHARACTERS) { if (!(prevEvent == XMLStreamConstants.CHARACTERS || prevEvent == XMLStreamConstants.CDATA || prevEvent == XMLStreamConstants.ENTITY_REFERENCE)) { if (r.isWhiteSpace()) { String txt = r.getText(); if (txt.length() > 0) { // should always be true char c = txt.charAt(0); return (c == '\n' || c == '\r'); } } } } return false; } /** * If we ever get this far, we will still check that * the CHARACTERS event is not immediately followed by another * textual event. If so, we'll just remove the (all white space) * text event. */ public String textToIncludeBetween(XMLStreamReader r, int prevEvent, int nextEvent, String text) throws XMLStreamException { /* Only remove white space if neither preceding nor following * event is of non-ignorable textual type (CHARACTERS, CDATA, * ENTITY_REFERENCE; note that SPACE should never be adjacent * to CHARACTERS event). */ if (nextEvent == XMLStreamConstants.CHARACTERS || nextEvent == XMLStreamConstants.CDATA || nextEvent == XMLStreamConstants.ENTITY_REFERENCE) { return text; } /* If we got this far, we know it's indentation white space * and should just be removed completely: */ return null; } } // // // Testing /** * Trivial test driver for testing functionality. */ public static void main(String[] args) throws Exception { if (args.length != 1) { System.err.println("Usage: java ... [file]"); System.exit(1); } String filename = args[0]; java.io.Reader r = new java.io.FileReader(filename); javax.xml.stream.XMLInputFactory f = javax.xml.stream.XMLInputFactory.newInstance(); XMLStreamReader sr = f.createXMLStreamReader(r); StAXBuilder builder = new StAXBuilder(); Document domDoc = builder.build(sr); System.out.println("Done [with " + sr.getClass() + "]:"); System.out.println("----- JDom -----"); org.jdom.output.XMLOutputter outputter = new org.jdom.output.XMLOutputter(); java.io.PrintWriter pw = new java.io.PrintWriter(System.out); outputter.output(domDoc, pw); pw.flush(); System.out.println("----- /JDom -----"); } }