// XmlParser.java: the main parser class. // NO WARRANTY! See README, and copyright below. // $Id$ package com.microstar.xml; import java.io.BufferedInputStream; import java.io.EOFException; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.net.URL; import java.net.URLConnection; import java.util.Enumeration; import java.util.Hashtable; import java.util.Locale; import java.util.Stack; /** * Parse XML documents and return parse events through call-backs. * <p>You need to define a class implementing the <code>XmlHandler</code> * interface: an object belonging to this class will receive the * callbacks for the events. (As an alternative to implementing * the full XmlHandler interface, you can simply extend the * <code>HandlerBase</code> convenience class.) * <p>Usage (assuming that <code>MyHandler</code> is your implementation * of the <code>XmlHandler</code> interface): * <pre> * XmlHandler handler = new MyHandler(); * XmlParser parser = new XmlParser(); * parser.setHandler(handler); * try { * parser.parse("http://www.host.com/doc.xml", null); * } catch (Exception e) { * [do something interesting] * } * </pre> * <p>Alternatively, you can use the standard SAX interfaces * with the <code>SAXDriver</code> class as your entry point. * @author Copyright (c) 1997, 1998 by Microstar Software Ltd. * @author Written by David Megginson <dmeggins@microstar.com> * @version 1.1 * @since Ptolemy II 0.2 * @see XmlHandler * @see HandlerBase */ public class XmlParser { // // Use special cheats that speed up the code (currently about 50%), // but may cause problems with future maintenance and add to the // class file size (about 500 bytes). // private final static boolean USE_CHEATS = true; ////////////////////////////////////////////////////////////////////// // Constructors. //////////////////////////////////////////////////////////////////////// /** * Construct a new parser with no associated handler. 
* <p>A handler must be supplied with setHandler() before any parse
     * events can be delivered.
     * @see #setHandler
     * @see #parse
     */
    public XmlParser() {
    }

    /**
     * Set the handler that will receive parsing events.
     * <p>The handler is simply stored; it is consulted the next time
     * an event or error is reported.
     * @param handler The handler to receive callback events.
     * @see #parse
     * @see XmlHandler
     */
    public void setHandler(XmlHandler handler) {
        this.handler = handler;
    }

    /**
     * Parse an XML document from a URI.
     * <p>You may parse a document more than once, but only one thread
     * may call this method for an object at one time.
     * <p>Delegates to doParse() with neither a reader nor a byte stream.
     * @param systemId The URI of the document.
     * @param publicId The public identifier of the document, or null.
     * @param encoding The suggested encoding, or null if unknown.
     * @exception java.lang.Exception Any exception thrown by your
     *   own handlers, or any derivation of java.io.IOException
     *   thrown by the parser itself.
     */
    public void parse(String systemId, String publicId, String encoding) throws java.lang.Exception {
        doParse(systemId, publicId, null, null, encoding);
    }

    /**
     * Parse an XML document from a byte stream.
     * <p>The URI that you supply will become the base URI for
     * resolving relative links, but &AElig;lfred will actually read
     * the document from the supplied input stream.
     * <p>You may parse a document more than once, but only one thread
     * may call this method for an object at one time.
     * <p>Delegates to doParse() with the stream and no reader.
     * @param systemId The base URI of the document, or null if not
     *   known.
     * @param publicId The public identifier of the document, or null
     *   if not known.
     * @param stream A byte input stream.
     * @param encoding The suggested encoding, or null if unknown.
     * @exception java.lang.Exception Any exception thrown by your
     *   own handlers, or any derivation of java.io.IOException
     *   thrown by the parser itself.
     */
    public void parse(String systemId, String publicId, InputStream stream, String encoding) throws java.lang.Exception {
        doParse(systemId, publicId, null, stream, encoding);
    }

    /**
     * Parse an XML document from a character stream.
* <p>The URI that you supply will become the base URI for * resolving relative links, but Ælfred will actually read * the document from the supplied input stream. * <p>You may parse a document more than once, but only one thread * may call this method for an object at one time. * @param systemId The base URI of the document, or null if not * known. * @param publicId The public identifier of the document, or null * if not known. * @param reader A character stream. * @exception java.lang.Exception Any exception thrown by your * own handlers, or any derivation of java.io.IOException * thrown by the parser itself. */ public void parse(String systemId, String publicId, Reader reader) throws java.lang.Exception { doParse(systemId, publicId, reader, null, null); } private synchronized void doParse(String systemId, String publicId, Reader reader, InputStream stream, String encoding) throws java.lang.Exception { basePublicId = publicId; baseURI = systemId; baseReader = reader; baseInputStream = stream; initializeVariables(); // Set the default entities here. setInternalEntity(intern("amp"), "&"); setInternalEntity(intern("lt"), "<"); setInternalEntity(intern("gt"), ">"); setInternalEntity(intern("apos"), "'"); setInternalEntity(intern("quot"), """); if (handler != null) { handler.startDocument(); } pushURL("[document]", basePublicId, baseURI, baseReader, baseInputStream, encoding); parseDocument(); if (handler != null) { handler.endDocument(); } cleanupVariables(); } //////////////////////////////////////////////////////////////////////// // Constants. //////////////////////////////////////////////////////////////////////// // // Constants for element content type. // /** * Constant: an element has not been declared. * @see #getElementContentType */ public final static int CONTENT_UNDECLARED = 0; /** * Constant: the element has a content model of ANY. 
* @see #getElementContentType
     */
    public final static int CONTENT_ANY = 1;

    /**
     * Constant: the element has declared content of EMPTY.
     * @see #getElementContentType
     */
    public final static int CONTENT_EMPTY = 2;

    /**
     * Constant: the element has mixed content.
     * @see #getElementContentType
     */
    public final static int CONTENT_MIXED = 3;

    /**
     * Constant: the element has element content.
     * @see #getElementContentType
     */
    public final static int CONTENT_ELEMENTS = 4;

    //
    // Constants for the entity type.
    //

    /**
     * Constant: the entity has not been declared.
     * @see #getEntityType
     */
    public final static int ENTITY_UNDECLARED = 0;

    /**
     * Constant: the entity is internal.
     * @see #getEntityType
     */
    public final static int ENTITY_INTERNAL = 1;

    /**
     * Constant: the entity is external, non-XML data.
     * @see #getEntityType
     */
    public final static int ENTITY_NDATA = 2;

    /**
     * Constant: the entity is external XML data.
     * @see #getEntityType
     */
    public final static int ENTITY_TEXT = 3;

    //
    // Constants for attribute type.
    //

    /**
     * Constant: the attribute has not been declared for this element type.
     * @see #getAttributeType
     */
    public final static int ATTRIBUTE_UNDECLARED = 0;

    /**
     * Constant: the attribute value is a string value.
     * @see #getAttributeType
     */
    public final static int ATTRIBUTE_CDATA = 1;

    /**
     * Constant: the attribute value is a unique identifier.
     * @see #getAttributeType
     */
    public final static int ATTRIBUTE_ID = 2;

    /**
     * Constant: the attribute value is a reference to a unique identifier.
     * @see #getAttributeType
     */
    public final static int ATTRIBUTE_IDREF = 3;

    /**
     * Constant: the attribute value is a list of ID references.
     * @see #getAttributeType
     */
    public final static int ATTRIBUTE_IDREFS = 4;

    /**
     * Constant: the attribute value is the name of an entity.
     * @see #getAttributeType
     */
    public final static int ATTRIBUTE_ENTITY = 5;

    /**
     * Constant: the attribute value is a list of entity names.
* @see #getAttributeType
     */
    public final static int ATTRIBUTE_ENTITIES = 6;

    /**
     * Constant: the attribute value is a name token.
     * @see #getAttributeType
     */
    public final static int ATTRIBUTE_NMTOKEN = 7;

    /**
     * Constant: the attribute value is a list of name tokens.
     * @see #getAttributeType
     */
    public final static int ATTRIBUTE_NMTOKENS = 8;

    /**
     * Constant: the attribute value is a token from an enumeration.
     * @see #getAttributeType
     */
    public final static int ATTRIBUTE_ENUMERATED = 9;

    /**
     * Constant: the attribute is the name of a notation.
     * @see #getAttributeType
     */
    public final static int ATTRIBUTE_NOTATION = 10;

    //
    // When the class is loaded, populate the hash table of
    // attribute types.
    //

    /**
     * Hash table of attribute types: maps the type keyword as written in
     * an attribute declaration (String) to the matching ATTRIBUTE_*
     * constant (boxed as Integer).
     * <p>ATTRIBUTE_ENUMERATED has no entry because an enumeration has no
     * keyword; readAttType() returns that constant directly when it sees
     * an opening parenthesis.
     */
    private static Hashtable attributeTypeHash;

    static {
        attributeTypeHash = new Hashtable();
        attributeTypeHash.put("CDATA", Integer.valueOf(ATTRIBUTE_CDATA));
        attributeTypeHash.put("ID", Integer.valueOf(ATTRIBUTE_ID));
        attributeTypeHash.put("IDREF", Integer.valueOf(ATTRIBUTE_IDREF));
        attributeTypeHash.put("IDREFS", Integer.valueOf(ATTRIBUTE_IDREFS));
        attributeTypeHash.put("ENTITY", Integer.valueOf(ATTRIBUTE_ENTITY));
        attributeTypeHash.put("ENTITIES", Integer.valueOf(ATTRIBUTE_ENTITIES));
        attributeTypeHash.put("NMTOKEN", Integer.valueOf(ATTRIBUTE_NMTOKEN));
        attributeTypeHash.put("NMTOKENS", Integer.valueOf(ATTRIBUTE_NMTOKENS));
        attributeTypeHash.put("NOTATION", Integer.valueOf(ATTRIBUTE_NOTATION));
    }

    //
    // Constants for supported encodings.
    // NOTE(review): the numeric suffixes presumably name the byte order
    // of each 2- or 4-byte code unit (e.g. 12 vs. 21) - confirm against
    // the encoding detection code, which is not in this part of the file.
    //
    private final static int ENCODING_UTF_8 = 1;

    private final static int ENCODING_ISO_8859_1 = 2;

    private final static int ENCODING_UCS_2_12 = 3;

    private final static int ENCODING_UCS_2_21 = 4;

    private final static int ENCODING_UCS_4_1234 = 5;

    private final static int ENCODING_UCS_4_4321 = 6;

    private final static int ENCODING_UCS_4_2143 = 7;

    private final static int ENCODING_UCS_4_3412 = 8;

    //
    // Constants for attribute default value.
    //

    /**
     * Constant: the attribute is not declared.
* @see #getAttributeDefaultValueType
     */
    public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0;

    /**
     * Constant: the attribute has a literal default value specified.
     * @see #getAttributeDefaultValueType
     * @see #getAttributeDefaultValue
     */
    public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1;

    /**
     * Constant: the attribute was declared #IMPLIED.
     * @see #getAttributeDefaultValueType
     */
    public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2;

    /**
     * Constant: the attribute was declared #REQUIRED.
     * @see #getAttributeDefaultValueType
     */
    public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3;

    /**
     * Constant: the attribute was declared #FIXED.
     * @see #getAttributeDefaultValueType
     * @see #getAttributeDefaultValue
     */
    public final static int ATTRIBUTE_DEFAULT_FIXED = 4;

    //
    // Constants for input.
    // NOTE(review): these are not used in this part of the file; they
    // presumably tag the kind of the current input source - confirm.
    //
    private final static int INPUT_NONE = 0;

    private final static int INPUT_INTERNAL = 1;

    private final static int INPUT_EXTERNAL = 2;

    private final static int INPUT_STREAM = 3;

    private final static int INPUT_BUFFER = 4;

    private final static int INPUT_READER = 5;

    //
    // Flags for reading literals.
    // These are distinct bits; callers OR them together when calling
    // readLiteral() (see parseAttribute()).
    //
    private final static int LIT_CHAR_REF = 1;

    private final static int LIT_ENTITY_REF = 2;

    private final static int LIT_PE_REF = 4;

    private final static int LIT_NORMALIZE = 8;

    //
    // Flags for parsing context.
    // Values stored in the "context" field while parsing the DTD
    // (see parseDoctypedecl() and parseDefault()).
    //
    private final static int CONTEXT_NONE = 0;

    private final static int CONTEXT_DTD = 1;

    private final static int CONTEXT_ENTITYVALUE = 2;

    private final static int CONTEXT_ATTRIBUTEVALUE = 3;

    //////////////////////////////////////////////////////////////////////
    // Error reporting.
    //////////////////////////////////////////////////////////////////////

    /**
     * Report an error.
     * @param message The error message.
     * @param textFound The text that caused the error (or null).
* @see XmlHandler#error * @see #line */ void error(String message, String textFound, String textExpected) throws java.lang.Exception { errorCount++; if (textFound != null) { message = message + " (found \"" + textFound + "\")"; } if (textExpected != null) { message = message + " (expected \"" + textExpected + "\")"; } if (handler != null) { String uri = null; if (externalEntity != null) { uri = externalEntity.getURL().toString(); } handler.error(message, uri, line, column); } } /** * Report a serious error. * @param message The error message. * @param textFound The text that caused the error (or null). */ void error(String message, char textFound, String textExpected) throws java.lang.Exception { error(message, Character.toString(textFound), textExpected); } ////////////////////////////////////////////////////////////////////// // Major syntactic productions. ////////////////////////////////////////////////////////////////////// /** * Parse an XML document. * <pre> * [1] document ::= prolog element Misc* * </pre> * <p>This is the top-level parsing function for a single XML * document. As a minimum, a well-formed document must have * a document element, and a valid document must have a prolog * as well. */ void parseDocument() throws java.lang.Exception { char c; parseProlog(); require('<'); parseElement(); try { parseMisc(); //skip all white, PIs, and comments c = readCh(); //if this doesn't throw an exception... error("unexpected characters after document end", c, null); } catch (EOFException e) { return; } } /** * Skip a comment. * <pre> * [18] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->" * </pre> * <p>(The <code><!--</code> has already been read.) */ void parseComment() throws java.lang.Exception { skipUntil("-->"); } /** * Parse a processing instruction and do a call-back. * <pre> * [19] PI ::= '<?' Name (S (Char* - (Char* '?>' Char*)))? '?>' * </pre> * <p>(The <code><?</code> has already been read.) 
* <p>An XML processing instruction <em>must</em> begin with * a Name, which is the instruction's target. */ void parsePI() throws java.lang.Exception { String name; name = readNmtoken(true); if (!tryRead("?>")) { requireWhitespace(); parseUntil("?>"); } if (handler != null) { handler.processingInstruction(name, dataBufferToString()); } } /** * Parse a CDATA marked section. * <pre> * [20] CDSect ::= CDStart CData CDEnd * [21] CDStart ::= '<![CDATA[' * [22] CData ::= (Char* - (Char* ']]>' Char*)) * [23] CDEnd ::= ']]>' * </pre> * <p>(The '<![CDATA[' has already been read.) * <p>Note that this just appends characters to the dataBuffer, * without actually generating an event. */ void parseCDSect() throws java.lang.Exception { parseUntil("]]>"); } /** * Parse the prolog of an XML document. * <pre> * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)? * </pre> * <p>There are a couple of tricks here. First, it is necessary to * declare the XML default attributes after the DTD (if present) * has been read. Second, it is not possible to expand general * references in attribute value literals until after the entire * DTD (if present) has been parsed. * <p>We do not look for the XML declaration here, because it is * handled by pushURL(). * @see #pushURL */ void parseProlog() throws java.lang.Exception { parseMisc(); if (tryRead("<!DOCTYPE")) { parseDoctypedecl(); parseMisc(); } } /** * Parse the XML declaration. * <pre> * [25] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'") * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'" * | S 'standalone' Eq '"' ("yes" | "no") '"' * [78] EncodingDecl ::= S 'encoding' Eq QEncoding * </pre> * <p>([80] to [82] are also significant.) * <p>(The <code><?xml</code> and whitespace have already been read.) * <p>TODO: validate value of standalone. 
* @see #parseTextDecl * @see #checkEncoding */ void parseXMLDecl(boolean ignoreEncoding) throws java.lang.Exception { String version; String encodingName = null; // String standalone = null; // Read the version. require("version"); parseEq(); version = readLiteral(0); if (!version.equals("1.0")) { error("unsupported XML version", version, "1.0"); } // Try reading an encoding declaration. skipWhitespace(); if (tryRead("encoding")) { parseEq(); encodingName = readLiteral(0); checkEncoding(encodingName, ignoreEncoding); } // Try reading a standalone declaration skipWhitespace(); if (tryRead("standalone")) { parseEq(); // FIXME: Why is the literal read, but the value ignored? /* standalone = */readLiteral(0); } skipWhitespace(); require("?>"); } /** * Parse the Encoding PI. * <pre> * [78] EncodingDecl ::= S 'encoding' Eq QEncoding * [79] EncodingPI ::= '<?xml' S 'encoding' Eq QEncoding S? '?>' * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'" * [81] Encoding ::= LatinName * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* * </pre> * <p>(The <code><?xml</code>' and whitespace have already been read.) * @see #parseXMLDecl * @see #checkEncoding */ void parseTextDecl(boolean ignoreEncoding) throws java.lang.Exception { String encodingName = null; // Read an optional version. if (tryRead("version")) { String version; parseEq(); version = readLiteral(0); if (!version.equals("1.0")) { error("unsupported XML version", version, "1.0"); } requireWhitespace(); } // Read the encoding. require("encoding"); parseEq(); encodingName = readLiteral(0); checkEncoding(encodingName, ignoreEncoding); skipWhitespace(); require("?>"); } /** * Check that the encoding specified makes sense. * <p>Compare what the author has specified in the XML declaration * or encoding PI with what we have detected. * <p>This is also important for distinguishing among the various * 7- and 8-bit encodings, such as ISO-LATIN-1 (I cannot autodetect * those). 
* @param encodingName The name of the encoding specified by the user. * @see #parseXMLDecl * @see #parseTextDecl */ void checkEncoding(String encodingName, boolean ignoreEncoding) throws java.lang.Exception { // FindBugs suggests using toUpperCase(Locale) encodingName = encodingName.toUpperCase(Locale.getDefault()); if (ignoreEncoding) { return; } switch (encoding) { // 8-bit encodings case ENCODING_UTF_8: if (encodingName.equals("ISO-8859-1")) { encoding = ENCODING_ISO_8859_1; } else if (!encodingName.equals("UTF-8")) { error("unsupported 8-bit encoding", encodingName, "UTF-8 or ISO-8859-1"); } break; // 16-bit encodings case ENCODING_UCS_2_12: case ENCODING_UCS_2_21: if (!encodingName.equals("ISO-10646-UCS-2") && !encodingName.equals("UTF-16")) { error("unsupported 16-bit encoding", encodingName, "ISO-10646-UCS-2"); } break; // 32-bit encodings case ENCODING_UCS_4_1234: case ENCODING_UCS_4_4321: case ENCODING_UCS_4_2143: case ENCODING_UCS_4_3412: if (!encodingName.equals("ISO-10646-UCS-4")) { error("unsupported 32-bit encoding", encodingName, "ISO-10646-UCS-4"); } } } /** * Parse miscellaneous markup outside the document element and DOCTYPE * declaration. * <pre> * [27] Misc ::= Comment | PI | S * </pre> */ void parseMisc() throws java.lang.Exception { while (true) { skipWhitespace(); if (tryRead("<?")) { parsePI(); } else if (tryRead("<!--")) { parseComment(); } else { return; } } } /** * Parse a document type declaration. * <pre> * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? * ('[' %markupdecl* ']' S?)? '>' * </pre> * <p>(The <code><!DOCTYPE</code> has already been read.) */ void parseDoctypedecl() throws java.lang.Exception { String doctypeName; String[] ids; // Read the document type name. requireWhitespace(); doctypeName = readNmtoken(true); // Read the ExternalIDs. skipWhitespace(); ids = readExternalIds(false); // Look for a declaration subset. 
skipWhitespace(); if (tryRead('[')) { // loop until the subset ends while (true) { context = CONTEXT_DTD; skipWhitespace(); context = CONTEXT_NONE; if (tryRead(']')) { break; // end of subset } else { context = CONTEXT_DTD; parseMarkupdecl(); context = CONTEXT_NONE; } } } // Read the external subset, if any if (ids[1] != null) { pushURL("[external subset]", ids[0], ids[1], null, null, null); // Loop until we end up back at '>' while (true) { context = CONTEXT_DTD; skipWhitespace(); context = CONTEXT_NONE; if (tryRead('>')) { break; } else { context = CONTEXT_DTD; parseMarkupdecl(); context = CONTEXT_NONE; } } } else { // No external subset. skipWhitespace(); require('>'); } if (handler != null) { handler.doctypeDecl(doctypeName, ids[0], ids[1]); } // Expand general entities in // default values of attributes. // (Do this after the doctypeDecl // event!). // expandAttributeDefaultValues(); } /** * Parse a markup declaration in the internal or external DTD subset. * <pre> * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl | * %NotationDecl | %PI | %S | %Comment | * InternalPERef ) * [30] InternalPERef ::= PEReference * [31] extSubset ::= (%markupdecl | %conditionalSect)* * </pre> */ void parseMarkupdecl() throws java.lang.Exception { if (tryRead("<!ELEMENT")) { parseElementdecl(); } else if (tryRead("<!ATTLIST")) { parseAttlistDecl(); } else if (tryRead("<!ENTITY")) { parseEntityDecl(); } else if (tryRead("<!NOTATION")) { parseNotationDecl(); } else if (tryRead("<?")) { parsePI(); } else if (tryRead("<!--")) { parseComment(); } else if (tryRead("<![")) { parseConditionalSect(); } else { error("expected markup declaration", null, null); } } /** * Parse an element, with its tags. * <pre> * [33] STag ::= '<' Name (S Attribute)* S? '>' [WFC: unique Att spec] * [38] element ::= EmptyElement | STag content ETag * [39] EmptyElement ::= '<' Name (S Attribute)* S? '/>' * [WFC: unique Att spec] * </pre> * <p>(The '<' has already been read.) 
* <p>NOTE: this method actually chains onto parseContent(), if necessary, * and parseContent() will take care of calling parseETag(). */ void parseElement() throws java.lang.Exception { String gi; char c; int oldElementContent = currentElementContent; String oldElement = currentElement; // This is the (global) counter for the // array of specified attributes. tagAttributePos = 0; // Read the element type name. gi = readNmtoken(true); // Determine the current content type. currentElement = gi; currentElementContent = getElementContentType(gi); if (currentElementContent == CONTENT_UNDECLARED) { currentElementContent = CONTENT_ANY; } // Read the attributes, if any. // After this loop, we should be just // in front of the closing delimiter. skipWhitespace(); c = readCh(); while ((c != '/') && (c != '>')) { unread(c); parseAttribute(gi); skipWhitespace(); c = readCh(); } unread(c); // Supply any defaulted attributes. Enumeration atts = declaredAttributes(gi); if (atts != null) { String aname; loop: while (atts.hasMoreElements()) { aname = (String) atts.nextElement(); // See if it was specified. for (int i = 0; i < tagAttributePos; i++) { if (tagAttributes[i].equals(aname)) { continue loop; } } // I guess not... if (handler != null) { handler.attribute(aname, getAttributeExpandedValue(gi, aname), false); } } } // Figure out if this is a start tag // or an empty element, and dispatch an // event accordingly. c = readCh(); switch (c) { case '>': if (handler != null) { handler.startElement(gi); } parseContent(); break; case '/': require('>'); if (handler != null) { handler.startElement(gi); handler.endElement(gi); } break; } // Restore the previous state. currentElement = oldElement; currentElementContent = oldElementContent; } /** * Parse an attribute assignment. * <pre> * [34] Attribute ::= Name Eq AttValue * </pre> * @param name The name of the attribute's element. 
* @see XmlHandler#attribute */ void parseAttribute(String name) throws java.lang.Exception { String aname; int type; String value; // Read the attribute name. aname = readNmtoken(true).intern(); // Fix by Zoltan Kemenczy for: // "attribute value normalization according to Section 3.3.3 // Attribute-Value Normalization of XML 1.0 // http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize). It // says that escaped whitespace character references that are not // #x20 (like the newline,#xa) should be preserved in the // normalized value)" //type = getAttributeDefaultValueType(name, aname); type = getAttributeType(name, aname); // Parse '=' parseEq(); // Read the value, normalizing whitespace // if it is not CDATA. if ((type == ATTRIBUTE_CDATA) || (type == ATTRIBUTE_UNDECLARED)) { value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF); } else { value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF | LIT_NORMALIZE); } // Inform the handler about the // attribute. if (handler != null) { handler.attribute(aname, value, true); } dataBufferPos = 0; // Note that the attribute has been // specified. if (tagAttributePos == tagAttributes.length) { String[] newAttrib = new String[tagAttributes.length * 2]; System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos); tagAttributes = newAttrib; } tagAttributes[tagAttributePos++] = aname; } /** * Parse an equals sign surrounded by optional whitespace. * [35] Eq ::= S? '=' S? */ void parseEq() throws java.lang.Exception { skipWhitespace(); require('='); skipWhitespace(); } /** * Parse an end tag. * [36] ETag ::= '</' Name S? '>' * *NOTE: parseContent() chains to here. */ void parseETag() throws java.lang.Exception { String name; name = readNmtoken(true); if (!name.equals(currentElement)) { error("mismatched end tag", name, currentElement); } skipWhitespace(); require('>'); if (handler != null) { handler.endElement(name); } } /** * Parse the content of an element. 
* [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)* * [68] Reference ::= EntityRef | CharRef */ void parseContent() throws java.lang.Exception { char c; while (true) { switch (currentElementContent) { case CONTENT_ANY: case CONTENT_MIXED: parsePCData(); break; case CONTENT_ELEMENTS: parseWhitespace(); break; } // Handle delimiters c = readCh(); switch (c) { case '&': // Found "&" c = readCh(); if (c == '#') { parseCharRef(); } else { unread(c); parseEntityRef(true); } break; case '<': // Found "<" c = readCh(); switch (c) { case '!': // Found "<!" c = readCh(); switch (c) { case '-': // Found "<!-" require('-'); parseComment(); break; case '[': // Found "<![" require("CDATA["); parseCDSect(); break; default: error("expected comment or CDATA section", c, null); break; } break; case '?': // Found "<?" dataBufferFlush(); parsePI(); break; case '/': // Found "</" dataBufferFlush(); parseETag(); return; default: // Found "<" followed by something else dataBufferFlush(); unread(c); parseElement(); break; } } } } /** * Parse an element type declaration. * [40] elementdecl ::= '<!ELEMENT' S %Name S (%S S)? %contentspec S? '>' * [VC: Unique Element Declaration] * *NOTE: the '<!ELEMENT' has already been read. */ void parseElementdecl() throws java.lang.Exception { String name; requireWhitespace(); // Read the element type name. name = readNmtoken(true); requireWhitespace(); // Read the content model. parseContentspec(name); skipWhitespace(); require('>'); } /** * Content specification. 
* [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements */ void parseContentspec(String name) throws java.lang.Exception { if (tryRead("EMPTY")) { setElement(name, CONTENT_EMPTY, null, null); return; } else if (tryRead("ANY")) { setElement(name, CONTENT_ANY, null, null); return; } else { require('('); dataBufferAppend('('); skipWhitespace(); if (tryRead("#PCDATA")) { dataBufferAppend("#PCDATA"); parseMixed(); setElement(name, CONTENT_MIXED, dataBufferToString(), null); } else { parseElements(); setElement(name, CONTENT_ELEMENTS, dataBufferToString(), null); } } } /** * Parse an element-content model. * [42] elements ::= (choice | seq) ('?' | '*' | '+')? * [44] cps ::= S? %cp S? * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')' * [46] ctokplus ::= cps ('|' cps)+ * [47] ctoks ::= cps ('|' cps)* * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')' * [49] stoks ::= cps (',' cps)* * *NOTE: the opening '(' and S have already been read. * *TODO: go over parameter entity boundaries more carefully. */ void parseElements() throws java.lang.Exception { char c; char sep; // Parse the first content particle skipWhitespace(); parseCp(); // Check for end or for a separator. skipWhitespace(); c = readCh(); switch (c) { case ')': dataBufferAppend(')'); c = readCh(); switch (c) { case '*': case '+': case '?': dataBufferAppend(c); break; default: unread(c); } return; case ',': // Register the separator. case '|': sep = c; dataBufferAppend(c); break; default: error("bad separator in content model", c, null); return; } // Parse the rest of the content model. while (true) { skipWhitespace(); parseCp(); skipWhitespace(); c = readCh(); if (c == ')') { dataBufferAppend(')'); break; } else if (c != sep) { error("bad separator in content model", c, "'" + sep + "'"); return; } else { dataBufferAppend(c); } } // Check for the occurrence indicator. 
c = readCh(); switch (c) { case '?': case '*': case '+': dataBufferAppend(c); return; default: unread(c); return; } } /** * Parse a content particle. * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+') * *NOTE: I actually use a slightly different production here: * cp ::= (elements | (Name ('?' | '*' | '+')?)) */ void parseCp() throws java.lang.Exception { char c; if (tryRead('(')) { dataBufferAppend('('); parseElements(); } else { dataBufferAppend(readNmtoken(true)); c = readCh(); switch (c) { case '?': case '*': case '+': dataBufferAppend(c); break; default: unread(c); break; } } } /** * Parse mixed content. * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*' * | '(' S? %('#PCDATA') S? ')' * [51] Mtoks ::= %Name (S? '|' S? %Name)* * *NOTE: the S and '#PCDATA' have already been read. */ void parseMixed() throws java.lang.Exception { // Check for PCDATA alone. skipWhitespace(); if (tryRead(')')) { dataBufferAppend(")*"); tryRead('*'); return; } // Parse mixed content. skipWhitespace(); while (!tryRead(")*")) { require('|'); dataBufferAppend('|'); skipWhitespace(); dataBufferAppend(readNmtoken(true)); skipWhitespace(); } dataBufferAppend(")*"); } /** * Parse an attribute list declaration. * [52] AttlistDecl ::= '<!ATTLIST' S %Name S? %AttDef+ S? '>' * *NOTE: the '<!ATTLIST' has already been read. */ void parseAttlistDecl() throws java.lang.Exception { String elementName; requireWhitespace(); elementName = readNmtoken(true); requireWhitespace(); while (!tryRead('>')) { parseAttDef(elementName); skipWhitespace(); } } /** * Parse a single attribute definition. * [53] AttDef ::= S %Name S %AttType S %Default */ void parseAttDef(String elementName) throws java.lang.Exception { String name; int type; String enumeration = null; // Read the attribute name. name = readNmtoken(true); // Read the attribute type. requireWhitespace(); type = readAttType(); // Get the string of enumerated values // if necessary. 
if ((type == ATTRIBUTE_ENUMERATED) || (type == ATTRIBUTE_NOTATION)) { enumeration = dataBufferToString(); } // Read the default value. requireWhitespace(); parseDefault(elementName, name, type, enumeration); } /** * Parse the attribute type. * [54] AttType ::= StringType | TokenizedType | EnumeratedType * [55] StringType ::= 'CDATA' * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' | * 'NMTOKEN' | 'NMTOKENS' * [57] EnumeratedType ::= NotationType | Enumeration * *TODO: validate the type!! */ int readAttType() throws java.lang.Exception { String typeString; Integer type; if (tryRead('(')) { parseEnumeration(); return ATTRIBUTE_ENUMERATED; } else { typeString = readNmtoken(true); if (typeString.equals("NOTATION")) { parseNotationType(); } type = (Integer) attributeTypeHash.get(typeString); if (type == null) { error("illegal attribute type", typeString, null); return ATTRIBUTE_UNDECLARED; } else { return type.intValue(); } } } /** * Parse an enumeration. * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')' * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)* * *NOTE: the '(' has already been read. */ void parseEnumeration() throws java.lang.Exception { dataBufferAppend('('); // Read the first token. skipWhitespace(); dataBufferAppend(readNmtoken(true)); // Read the remaining tokens. skipWhitespace(); while (!tryRead(')')) { require('|'); dataBufferAppend('|'); skipWhitespace(); dataBufferAppend(readNmtoken(true)); skipWhitespace(); } dataBufferAppend(')'); } /** * Parse a notation type for an attribute. * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)* * S? ')' * [59] Ntoks ::= %Name (S? '|' S? %Name) * *NOTE: the 'NOTATION' has already been read */ void parseNotationType() throws java.lang.Exception { requireWhitespace(); require('('); parseEnumeration(); } /** * Parse the default value for an attribute. * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? 
%AttValue */ void parseDefault(String elementName, String name, int type, String enumeration) throws java.lang.Exception { int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; String value = null; if (tryRead('#')) { if (tryRead("FIXED")) { valueType = ATTRIBUTE_DEFAULT_FIXED; requireWhitespace(); context = CONTEXT_ATTRIBUTEVALUE; value = readLiteral(LIT_CHAR_REF); context = CONTEXT_DTD; } else if (tryRead("REQUIRED")) { valueType = ATTRIBUTE_DEFAULT_REQUIRED; } else if (tryRead("IMPLIED")) { valueType = ATTRIBUTE_DEFAULT_IMPLIED; } else { error("illegal keyword for attribute default value", null, null); } } else { context = CONTEXT_ATTRIBUTEVALUE; value = readLiteral(LIT_CHAR_REF); context = CONTEXT_DTD; } setAttribute(elementName, name, type, enumeration, value, valueType); } /** * Parse a conditional section. * [63] conditionalSect ::= includeSect || ignoreSect * [64] includeSect ::= '<![' %'INCLUDE' '[' (%markupdecl*)* ']]>' * [65] ignoreSect ::= '<![' %'IGNORE' '[' ignoreSectContents* ']]>' * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]>')) * | ('<![' ignoreSectContents* ']]>') * | (Char - (']' | [<'"])) * | ('<!' (Char - ('-' | '['))) * *NOTE: the '<![' has already been read. * *TODO: verify that I am handling ignoreSectContents right. */ void parseConditionalSect() throws java.lang.Exception { skipWhitespace(); if (tryRead("INCLUDE")) { skipWhitespace(); require('['); skipWhitespace(); while (!tryRead("]]>")) { parseMarkupdecl(); skipWhitespace(); } } else if (tryRead("IGNORE")) { skipWhitespace(); require('['); char c; for (int nest = 1; nest > 0;) { c = readCh(); switch (c) { case '<': if (tryRead("![")) { nest++; } break; case ']': if (tryRead("]>")) { nest--; } break; } } } else { error("conditional section must begin with INCLUDE or IGNORE", null, null); } } /** * Read a character reference. * [67] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' * *NOTE: the '&#' has already been read. 
*/ void parseCharRef() throws java.lang.Exception { int value = 0; char c; if (tryRead('x')) { loop1: while (true) { c = readCh(); switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case 'a': case 'A': case 'b': case 'B': case 'c': case 'C': case 'd': case 'D': case 'e': case 'E': case 'f': case 'F': value *= 16; value += Integer.parseInt(Character.toString(c), 16); break; case ';': break loop1; default: error("illegal character in character reference", c, null); break loop1; } } } else { loop2: while (true) { c = readCh(); switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': value *= 10; value += Integer.parseInt(Character.toString(c), 10); break; case ';': break loop2; default: error("illegal character in character reference", c, null); break loop2; } } } // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: if (value <= 0x0000ffff) { // no surrogates needed dataBufferAppend((char) value); } else if (value <= 0x000fffff) { // > 16 bits, surrogate needed dataBufferAppend((char) (0xd8 | ((value & 0x000ffc00) >> 10))); dataBufferAppend((char) (0xdc | (value & 0x0003ff))); } else { // too big for surrogate error("character reference " + value + " is too large for UTF-16", Integer.valueOf(value).toString(), null); } } /** * Parse a reference. * [69] EntityRef ::= '&' Name ';' * *NOTE: the '&' has already been read. * @param externalAllowed External entities are allowed here. 
*/ void parseEntityRef(boolean externalAllowed) throws java.lang.Exception { String name; name = readNmtoken(true); require(';'); switch (getEntityType(name)) { case ENTITY_UNDECLARED: error("reference to undeclared entity", name, null); break; case ENTITY_INTERNAL: pushString(name, getEntityValue(name)); break; case ENTITY_TEXT: if (externalAllowed) { pushURL(name, getEntityPublicId(name), getEntitySystemId(name), null, null, null); } else { error("reference to external entity in attribute value.", name, null); } break; case ENTITY_NDATA: if (externalAllowed) { error("data entity reference in content", name, null); } else { error("reference to external entity in attribute value.", name, null); } break; } } /** * Parse a parameter entity reference. * [70] PEReference ::= '%' Name ';' * *NOTE: the '%' has already been read. */ void parsePEReference(boolean isEntityValue) throws java.lang.Exception { String name; name = "%" + readNmtoken(true); require(';'); switch (getEntityType(name)) { case ENTITY_UNDECLARED: error("reference to undeclared parameter entity", name, null); break; case ENTITY_INTERNAL: if (isEntityValue) { pushString(name, getEntityValue(name)); } else { pushString(name, " " + getEntityValue(name) + ' '); } break; case ENTITY_TEXT: if (isEntityValue) { pushString(null, " "); } pushURL(name, getEntityPublicId(name), getEntitySystemId(name), null, null, null); if (isEntityValue) { pushString(null, " "); } break; } } /** * Parse an entity declaration. * [71] EntityDecl ::= '<!ENTITY' S %Name S %EntityDef S? '>' * | '<!ENTITY' S '%' S %Name S %EntityDef S? '>' * [72] EntityDef ::= EntityValue | ExternalDef * [73] ExternalDef ::= ExternalID %NDataDecl? * [74] ExternalID ::= 'SYSTEM' S SystemLiteral * | 'PUBLIC' S PubidLiteral S SystemLiteral * [75] NDataDecl ::= S %'NDATA' S %Name * *NOTE: the '<!ENTITY' has already been read. 
*/ void parseEntityDecl() throws java.lang.Exception { char c; boolean peFlag = false; String name; String value; String notationName; String[] ids; // Check for a parameter entity. requireWhitespace(); if (tryRead('%')) { peFlag = true; requireWhitespace(); } // Read the entity name, and prepend // '%' if necessary. name = readNmtoken(true); if (peFlag) { name = "%" + name; } // Read the entity value. requireWhitespace(); c = readCh(); unread(c); if ((c == '"') || (c == '\'')) { // Internal entity. context = CONTEXT_ENTITYVALUE; value = readLiteral(LIT_CHAR_REF | LIT_PE_REF); context = CONTEXT_DTD; setInternalEntity(name, value); } else { // Read the external IDs ids = readExternalIds(false); if (ids[1] == null) { error("system identifier missing", name, null); } // Check for NDATA declaration. skipWhitespace(); if (tryRead("NDATA")) { requireWhitespace(); notationName = readNmtoken(true); setExternalDataEntity(name, ids[0], ids[1], notationName); } else { setExternalTextEntity(name, ids[0], ids[1]); } } // Finish the declaration. skipWhitespace(); require('>'); } /** * Parse a notation declaration. * [81] NotationDecl ::= '<!NOTATION' S %Name S %ExternalID S? '>' * *NOTE: the '<!NOTATION' has already been read. */ void parseNotationDecl() throws java.lang.Exception { String nname; String[] ids; requireWhitespace(); nname = readNmtoken(true); requireWhitespace(); // Read the external identifiers. ids = readExternalIds(true); if ((ids[0] == null) && (ids[1] == null)) { error("external identifier missing", nname, null); } // Register the notation. setNotation(nname, ids[0], ids[1]); skipWhitespace(); require('>'); } /** * Parse PCDATA. * <pre> * [16] PCData ::= [^<&]* * </pre> * <p>The trick here is that the data stays in the dataBuffer without * necessarily being converted to a string right away. 
* @exception java.lang.Exception Propagated from the reading
     *  helpers this method calls.
     */
    void parsePCData() throws java.lang.Exception {
        char c;

        // Start with a little cheat -- in most
        // cases, the entire sequence of
        // character data will already be in
        // the readBuffer; if not, fall through to
        // the normal approach.
        if (USE_CHEATS) {
            int lineAugment = 0;
            int columnAugment = 0;

            for (int i = readBufferPos; i < readBufferLength; i++) {
                switch (readBuffer[i]) {
                case '\n':
                    lineAugment++;
                    columnAugment = 0;
                    break;

                case '&':
                case '<':

                    // Found the end of the character data: copy it in one
                    // go and leave the delimiter unread.
                    int start = readBufferPos;
                    columnAugment++;
                    readBufferPos = i;

                    if (lineAugment > 0) {
                        line += lineAugment;
                        column = columnAugment;
                    } else {
                        column += columnAugment;
                    }

                    dataBufferAppend(readBuffer, start, i - start);
                    return;

                default:
                    columnAugment++;
                }
            }
        }

        // OK, the cheat didn't work; start over
        // and do it by the book.
        while (true) {
            c = readCh();

            switch (c) {
            case '<':
            case '&':
                unread(c);
                return;

            default:
                dataBufferAppend(c);
                break;
            }
        }
    }

    //////////////////////////////////////////////////////////////////////
    // High-level reading and scanning methods.
    //////////////////////////////////////////////////////////////////////

    /**
     * Require whitespace characters.
     * [1] S ::= (#x20 | #x9 | #xd | #xa)+
     * <p>Reports "whitespace expected" through error() if the next
     * character is not whitespace; otherwise skips the whole run.
     * @exception java.lang.Exception Propagated from readCh(),
     *  skipWhitespace() or error().
     */
    void requireWhitespace() throws java.lang.Exception {
        char c = readCh();

        if (isWhitespace(c)) {
            skipWhitespace();
        } else {
            error("whitespace expected", c, null);
        }
    }

    /**
     * Parse whitespace characters, and leave them in the data buffer.
     * @exception java.lang.Exception Propagated from readCh() or unread().
     */
    void parseWhitespace() throws java.lang.Exception {
        char c = readCh();

        while (isWhitespace(c)) {
            dataBufferAppend(c);
            c = readCh();
        }

        // The first non-whitespace character belongs to the caller.
        unread(c);
    }

    /**
     * Skip whitespace characters.
     * [1] S ::= (#x20 | #x9 | #xd | #xa)+
     * @exception java.lang.Exception Propagated from readCh() or unread().
     */
    void skipWhitespace() throws java.lang.Exception {
        // Start with a little cheat.  Most of
        // the time, the white space will fall
        // within the current read buffer; if
        // not, then fall through.
        if (USE_CHEATS) {
            int lineAugment = 0;
            int columnAugment = 0;

            loop: for (int i = readBufferPos; i < readBufferLength; i++) {
                switch (readBuffer[i]) {
                case ' ':
                case '\t':
                case '\r':
                    columnAugment++;
                    break;

                case '\n':
                    lineAugment++;
                    columnAugment = 0;
                    break;

                case '%':

                    // In these contexts readCh() gives '%' special
                    // treatment, so leave it to the slow path below.
                    if ((context == CONTEXT_DTD)
                            || (context == CONTEXT_ENTITYVALUE)) {
                        break loop;
                    } // else fall through...

                default:
                    readBufferPos = i;

                    if (lineAugment > 0) {
                        line += lineAugment;
                        column = columnAugment;
                    } else {
                        column += columnAugment;
                    }

                    return;
                }
            }
        }

        // OK, do it by the book.
        char c = readCh();

        while (isWhitespace(c)) {
            c = readCh();
        }

        unread(c);
    }

    /**
     * Read a name or name token.
     * [5] Name ::= (Letter | '_' | ':') (NameChar)*
     * [7] Nmtoken ::= (NameChar)+
     * *NOTE: [6] is implemented implicitly where required.
     * <p>Characters are collected up to, but not including, the first
     * delimiter character.
     * @param isName Not used by this implementation.
     * @return The internalised name.
     * @exception java.lang.Exception Propagated from the reading
     *  helpers this method calls.
     */
    String readNmtoken(boolean isName) throws java.lang.Exception {
        char c;

        if (USE_CHEATS) {
            loop: for (int i = readBufferPos; i < readBufferLength; i++) {
                switch (readBuffer[i]) {
                case '%':

                    if ((context == CONTEXT_DTD)
                            || (context == CONTEXT_ENTITYVALUE)) {
                        break loop;
                    } // else fall through...

                case '<':
                case '>':
                case '&':
                case ',':
                case '|':
                case '*':
                case '+':
                case '?':
                case ')':
                case '=':
                case '\'':
                case '"':
                case '[':
                case ' ':
                case '\t':
                case '\r':
                case '\n':
                case ';':
                case '/':
                case '#':

                    int start = readBufferPos;

                    if (i == start) {
                        error("name expected", readBuffer[i], null);
                    }

                    readBufferPos = i;
                    return intern(readBuffer, start, i - start);
                }
            }
        }

        nameBufferPos = 0;

        // Read the first character.
        // NOTE(review): '#' ends a name in the fast path above but not
        // here -- confirm whether the two delimiter lists should agree.
        while (true) {
            c = readCh();

            switch (c) {
            case '%':
            case '<':
            case '>':
            case '&':
            case ',':
            case '|':
            case '*':
            case '+':
            case '?':
            case ')':
            case '=':
            case '\'':
            case '"':
            case '[':
            case ' ':
            case '\t':
            case '\n':
            case '\r':
            case ';':
            case '/':
                unread(c);

                if (nameBufferPos == 0) {
                    error("name expected", null, null);
                }

                String s = intern(nameBuffer, 0, nameBufferPos);
                nameBufferPos = 0;
                return s;

            default:
                nameBuffer = (char[]) extendArray(nameBuffer,
                        nameBuffer.length, nameBufferPos);
                nameBuffer[nameBufferPos++] = c;
            }
        }
    }

    /**
     * Read a literal.
     * [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
     * | "'" ([^<&'] | Reference)* "'"
     * [11] SystemLiteral ::= '"' URLchar* '"' | "'" (URLchar - "'")* "'"
     * [13] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
     * [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
     * | "'" ([^%&'] | PEReference | Reference)* "'"
     * <p>Besides the two quote characters, a NUL character is accepted
     * as delimiter.  The literal is accumulated in the data buffer, which
     * is emptied when the value is returned.
     * @param flags A combination of the LIT_* flags: LIT_CHAR_REF and
     *  LIT_ENTITY_REF enable expansion of character and entity
     *  references, LIT_NORMALIZE normalises whitespace in the result.
     * @return The literal's value, or null if no delimiter was found and
     *  error() returns.
     * @exception java.lang.Exception Propagated from the reading
     *  helpers this method calls.
     */
    String readLiteral(int flags) throws java.lang.Exception {
        char delim;
        char c;
        int startLine = line;

        // Find the delimiter.
        delim = readCh();

        if ((delim != '"') && (delim != '\'') && (delim != (char) 0)) {
            error("expected '\"' or \"'\"", delim, null);
            return null;
        }

        // Read the literal.
        try {
            c = readCh();

            loop: while (c != delim) {
                switch (c) {
                // Literals never have line ends
                case '\n':
                case '\r':
                    c = ' ';
                    break;

                // References may be allowed
                case '&':

                    if ((flags & LIT_CHAR_REF) > 0) {
                        c = readCh();

                        if (c == '#') {
                            parseCharRef();
                            c = readCh();
                            continue loop; // check the next character
                        } else if ((flags & LIT_ENTITY_REF) > 0) {
                            unread(c);
                            parseEntityRef(false);
                            c = readCh();
                            continue loop;
                        } else {
                            // Entity references are not expanded here:
                            // keep the '&' and the character after it.
                            dataBufferAppend('&');
                        }
                    }

                    break;

                default:
                    break;
                }

                dataBufferAppend(c);
                c = readCh();
            }
        } catch (EOFException e) {
            error("end of input while looking for delimiter (started on line "
                    + startLine + ')', null, Character.toString(delim));
        }

        // Normalise whitespace if necessary.
        if ((flags & LIT_NORMALIZE) > 0) {
            dataBufferNormalize();
        }

        // Return the value.
        return dataBufferToString();
    }

    /**
     * Try reading external identifiers.
     * <p>The system identifier is not required for notations.
     * @param inNotation Are we in a notation?
     * @return A two-member String array containing the identifiers:
     *  element 0 is the public identifier and element 1 the system
     *  identifier; either may be null.
     * @exception java.lang.Exception Propagated from the reading
     *  helpers this method calls.
     */
    String[] readExternalIds(boolean inNotation) throws java.lang.Exception {
        String[] ids = new String[2];

        if (tryRead("PUBLIC")) {
            requireWhitespace();
            ids[0] = readLiteral(LIT_NORMALIZE); // public id

            if (inNotation) {
                skipWhitespace();

                // Peek at the next character without consuming it:
                // readLiteral() must see the opening quote itself.
                // (tryRead() would swallow the quote on a match.)
                char next = readCh();
                unread(next);

                if ((next == '"') || (next == '\'')) {
                    ids[1] = readLiteral(0); // optional system id
                }
            } else {
                requireWhitespace();
                ids[1] = readLiteral(0); // system id
            }
        } else if (tryRead("SYSTEM")) {
            requireWhitespace();
            ids[1] = readLiteral(0); // system id
        }

        return ids;
    }

    /**
     * Test if a character is whitespace.
     * <pre>
     * [1] S ::= (#x20 | #x9 | #xd | #xa)+
     * </pre>
     * @param c The character to test.
     * @return true if the character is whitespace.
     */
    final boolean isWhitespace(char c) {
        switch (c) {
        case 0x20:
        case 0x09:
        case 0x0d:
        case 0x0a:
            return true;

        default:
            return false;
        }
    }

    //////////////////////////////////////////////////////////////////////
    // Utility routines.
    //////////////////////////////////////////////////////////////////////

    /**
     * Add a character to the data buffer.
     * @param c The character to append.
     */
    void dataBufferAppend(char c) {
        // Expand buffer if necessary.
        if (dataBufferPos >= dataBuffer.length) {
            // dataBufferAppend() gets called a lot, so the growth logic
            // of extendArray() is repeated here rather than called.
            final int currentSize = dataBuffer.length;
            int newSize = currentSize * 2;

            if (newSize <= dataBufferPos) {
                newSize = dataBufferPos + 1;
            }

            // Dwight Richards pointed out that newSize was ignored (11/03)
            char[] newArray = new char[newSize];
            System.arraycopy(dataBuffer, 0, newArray, 0, currentSize);
            dataBuffer = newArray;
        }

        dataBuffer[dataBufferPos++] = c;
    }

    /**
     * Add a string to the data buffer.
*/ void dataBufferAppend(String s) { dataBufferAppend(s.toCharArray(), 0, s.length()); } /** * Append (part of) a character array to the data buffer. */ void dataBufferAppend(char[] ch, int start, int length) { dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length, dataBufferPos + length); System.arraycopy(ch, start, dataBuffer, dataBufferPos, length); dataBufferPos += length; } /** * Normalise whitespace in the data buffer. */ void dataBufferNormalize() { int i = 0; int j = 0; int end = dataBufferPos; // Skip whitespace at the start. while ((j < end) && isWhitespace(dataBuffer[j])) { j++; } // Skip whitespace at the end. while ((end > j) && isWhitespace(dataBuffer[end - 1])) { end--; } // Start copying to the left. while (j < end) { char c = dataBuffer[j++]; // Normalise all other whitespace to // a single space. if (isWhitespace(c)) { while ((j < end) && isWhitespace(dataBuffer[j++])) { } dataBuffer[i++] = ' '; dataBuffer[i++] = dataBuffer[j - 1]; } else { dataBuffer[i++] = c; } } // The new length is <= the old one. dataBufferPos = i; } /** * Convert the data buffer to a string. * @see #intern(char[],int,int) */ String dataBufferToString() { String s = new String(dataBuffer, 0, dataBufferPos); dataBufferPos = 0; return s; } /** * Flush the contents of the data buffer to the handler, if * appropriate, and reset the buffer for new input. */ void dataBufferFlush() throws java.lang.Exception { if (dataBufferPos > 0) { switch (currentElementContent) { case CONTENT_UNDECLARED: case CONTENT_EMPTY: // do nothing break; case CONTENT_MIXED: case CONTENT_ANY: if (handler != null) { handler.charData(dataBuffer, 0, dataBufferPos); } break; case CONTENT_ELEMENTS: if (handler != null) { handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos); } break; } dataBufferPos = 0; } } /** * Require a string to appear, or throw an exception. 
*/ void require(String delim) throws java.lang.Exception { char[] ch = delim.toCharArray(); for (int i = 0; i < ch.length; i++) { require(ch[i]); } } /** * Require a character to appear, or throw an exception. */ void require(char delim) throws java.lang.Exception { char c = readCh(); if (c != delim) { error("expected character", c, Character.toString(delim)); } } /** * Return an internalised version of a string. * <p>Ælfred uses this method to create an internalised version * of all names and attribute values, so that it can test equality * with <code>==</code> instead of <code>String.equals()</code>. * <p>If you want to be able to test for equality in the same way, * you can use this method to internalise your own strings first: * <pre> * String PARA = handler.intern("PARA"); * </pre> * <p>Note that this will not return the same results as String.intern(). * @param s The string to internalise. * @return An internalised version of the string. * @see #intern(char[],int,int) * @see java.lang.String#intern */ public String intern(String s) { char[] ch = s.toCharArray(); return intern(ch, 0, ch.length); } /** * Create an internalised string from a character array. * <p>This is much more efficient than constructing a non-internalised * string first, and then internalising it. * <p>Note that this will not return the same results as String.intern(). * @param ch an array of characters for building the string. * @param start the starting position in the array. * @param length the number of characters to place in the string. * @return an internalised string. * @see #intern(String) * @see java.lang.String#intern */ public String intern(char[] ch, int start, int length) { int index; int hash = 0; // Generate a hash code. for (int i = start; i < (start + length); i++) { hash = ((hash << 1) & 0xffffff) + ch[i]; } hash = hash % SYMBOL_TABLE_LENGTH; // Get the bucket. 
Object[] bucket = (Object[]) symbolTable[hash]; if (bucket == null) { symbolTable[hash] = bucket = new Object[8]; } // Search for a matching tuple, and // return the string if we find one. for (index = 0; index < bucket.length; index += 2) { char[] chFound = (char[]) bucket[index]; // Stop when we hit a null index. if (chFound == null) { break; } // If they're the same length, // check for a match. // If the loop finishes, 'index' will // contain the current bucket // position. if (chFound.length == length) { for (int i = 0; i < chFound.length; i++) { // Stop if there are no more tuples. if (ch[start + i] != chFound[i]) { break; } else if (i == (length - 1)) { // That's it, we have a match! return (String) bucket[index + 1]; } } } } // Not found -- we'll have to add it. // Do we have to grow the bucket? bucket = (Object[]) extendArray(bucket, bucket.length, index); // OK, add it to the end of the // bucket. String s = new String(ch, start, length); bucket[index] = s.toCharArray(); bucket[index + 1] = s; symbolTable[hash] = bucket; return s; } /** * Ensure the capacity of an array, allocating a new one if * necessary. */ Object extendArray(Object array, int currentSize, int requiredSize) { if (requiredSize < currentSize) { return array; } else { Object newArray = null; int newSize = currentSize * 2; if (newSize <= requiredSize) { newSize = requiredSize + 1; } // Dwight Richards pointed out that newSize was ignored (11/03) if (array instanceof char[]) { newArray = new char[newSize]; } else if (array instanceof Object[]) { newArray = new Object[newSize]; } else { throw new RuntimeException("Array must be char[] or Object[]"); } System.arraycopy(array, 0, newArray, 0, currentSize); return newArray; } } ////////////////////////////////////////////////////////////////////// // XML query routines. ////////////////////////////////////////////////////////////////////// // // Elements // /** * Get the declared elements for an XML document. 
* <p>The results will be valid only after the DTD (if any) has been * parsed. * @return An enumeration of all element types declared for this * document (as Strings). * @see #getElementContentType * @see #getElementContentModel */ public Enumeration declaredElements() { return elementInfo.keys(); } /** * Look up the content type of an element. * @param name The element type name. * @return An integer constant representing the content type. * @see #getElementContentModel * @see #CONTENT_UNDECLARED * @see #CONTENT_ANY * @see #CONTENT_EMPTY * @see #CONTENT_MIXED * @see #CONTENT_ELEMENTS */ public int getElementContentType(String name) { Object[] element = (Object[]) elementInfo.get(name); if (element == null) { return CONTENT_UNDECLARED; } else { return ((Integer) element[0]).intValue(); } } /** * Look up the content model of an element. * <p>The result will always be null unless the content type is * CONTENT_ELEMENTS or CONTENT_MIXED. * @param name The element type name. * @return The normalised content model, as a string. * @see #getElementContentType */ public String getElementContentModel(String name) { Object[] element = (Object[]) elementInfo.get(name); if (element == null) { return null; } else { return (String) element[1]; } } /** * Register an element. * Array format: * element type * attribute hash table */ void setElement(String name, int contentType, String contentModel, Hashtable attributes) throws java.lang.Exception { Object[] element; // Try looking up the element element = (Object[]) elementInfo.get(name); // Make a new one if necessary. if (element == null) { element = new Object[3]; element[0] = Integer.valueOf(CONTENT_UNDECLARED); element[1] = null; element[2] = null; } else if ((contentType != CONTENT_UNDECLARED) && (((Integer) element[0]).intValue() != CONTENT_UNDECLARED)) { error("multiple declarations for element type", name, null); return; } // Insert the content type, if any. 
if (contentType != CONTENT_UNDECLARED) { element[0] = Integer.valueOf(contentType); } // Insert the content model, if any. if (contentModel != null) { element[1] = contentModel; } // Insert the attributes, if any. if (attributes != null) { element[2] = attributes; } // Save the element info. elementInfo.put(name, element); } /** * Look up the attribute hash table for an element. * The hash table is the second item in the element array. */ Hashtable getElementAttributes(String name) { Object[] element = (Object[]) elementInfo.get(name); if (element == null) { return null; } else { return (Hashtable) element[2]; } } // // Attributes // /** * Get the declared attributes for an element type. * @param elname The name of the element type. * @return An Enumeration of all the attributes declared for * a specific element type. The results will be valid only * after the DTD (if any) has been parsed. * @see #getAttributeType * @see #getAttributeEnumeration * @see #getAttributeDefaultValueType * @see #getAttributeDefaultValue * @see #getAttributeExpandedValue */ public Enumeration declaredAttributes(String elname) { Hashtable attlist = getElementAttributes(elname); if (attlist == null) { return null; } else { return attlist.keys(); } } /** * Retrieve the declared type of an attribute. * @param name The name of the associated element. * @param aname The name of the attribute. * @return An integer constant representing the attribute type. 
* @see #ATTRIBUTE_UNDECLARED * @see #ATTRIBUTE_CDATA * @see #ATTRIBUTE_ID * @see #ATTRIBUTE_IDREF * @see #ATTRIBUTE_IDREFS * @see #ATTRIBUTE_ENTITY * @see #ATTRIBUTE_ENTITIES * @see #ATTRIBUTE_NMTOKEN * @see #ATTRIBUTE_NMTOKENS * @see #ATTRIBUTE_ENUMERATED * @see #ATTRIBUTE_NOTATION */ public int getAttributeType(String name, String aname) { Object[] attribute = getAttribute(name, aname); if (attribute == null) { return ATTRIBUTE_UNDECLARED; } else { return ((Integer) attribute[0]).intValue(); } } /** * Retrieve the allowed values for an enumerated attribute type. * @param name The name of the associated element. * @param aname The name of the attribute. * @return A string containing the token list. * @see #ATTRIBUTE_ENUMERATED * @see #ATTRIBUTE_NOTATION */ public String getAttributeEnumeration(String name, String aname) { Object[] attribute = getAttribute(name, aname); if (attribute == null) { return null; } else { return (String) attribute[3]; } } /** * Retrieve the default value of a declared attribute. * @param name The name of the associated element. * @param aname The name of the attribute. * @return The default value, or null if the attribute was * #IMPLIED or simply undeclared and unspecified. * @see #getAttributeExpandedValue */ public String getAttributeDefaultValue(String name, String aname) { Object[] attribute = getAttribute(name, aname); if (attribute == null) { return null; } else { return (String) attribute[1]; } } /** * Retrieve the expanded value of a declared attribute. * <p>All general entities will be expanded. * @param name The name of the associated element. * @param aname The name of the attribute. 
* @return The expanded default value, or null if the attribute was * #IMPLIED or simply undeclared * @see #getAttributeDefaultValue */ public String getAttributeExpandedValue(String name, String aname) { Object[] attribute = getAttribute(name, aname); if (attribute == null) { return null; } else if ((attribute[4] == null) && (attribute[1] != null)) { try { pushString(null, (char) 0 + (String) attribute[1] + (char) 0); attribute[4] = readLiteral(LIT_NORMALIZE | LIT_CHAR_REF | LIT_ENTITY_REF); } catch (Exception ex) { // We could ignore this and return but instead return here. return (String) attribute[4]; } } return (String) attribute[4]; } /** * Retrieve the default value type of a declared attribute. * @param name The name of the element. * @param aname The name of the attribute. * @return ATTRIBUTE_DEFAULT_UNDECLARED if the attribute * cannot be found, otherwise return an integer. * @see #ATTRIBUTE_DEFAULT_SPECIFIED * @see #ATTRIBUTE_DEFAULT_IMPLIED * @see #ATTRIBUTE_DEFAULT_REQUIRED * @see #ATTRIBUTE_DEFAULT_FIXED */ public int getAttributeDefaultValueType(String name, String aname) { Object[] attribute = getAttribute(name, aname); if (attribute == null) { return ATTRIBUTE_DEFAULT_UNDECLARED; } else { return ((Integer) attribute[2]).intValue(); } } /** * Register an attribute declaration for later retrieval. * Format: * - String type * - String default value * - int value type * *TODO: do something with attribute types. */ void setAttribute(String elName, String name, int type, String enumeration, String value, int valueType) throws java.lang.Exception { Hashtable attlist; Object[] attribute; // Create a new hashtable if necessary. attlist = getElementAttributes(elName); if (attlist == null) { attlist = new Hashtable(); } // Check that the attribute doesn't // already exist! 
if (attlist.get(name) != null) { return; } else { attribute = new Object[5]; attribute[0] = Integer.valueOf(type); attribute[1] = value; attribute[2] = Integer.valueOf(valueType); attribute[3] = enumeration; attribute[4] = null; attlist.put(name.intern(), attribute); // Use CONTENT_UNDECLARED to avoid overwriting // existing element declaration. setElement(elName, CONTENT_UNDECLARED, null, attlist); } } /** * Retrieve the three-member array representing an * attribute declaration. * @param elName The name of the element. * @param name The name of the attribute. */ Object[] getAttribute(String elName, String name) { Hashtable attlist; Object[] attribute; attlist = getElementAttributes(elName); if (attlist == null) { return null; } attribute = (Object[]) attlist.get(name); return attribute; } // // Entities // /** * Get declared entities. * @return An Enumeration of all the entities declared for * this XML document. The results will be valid only * after the DTD (if any) has been parsed. * @see #getEntityType * @see #getEntityPublicId * @see #getEntitySystemId * @see #getEntityValue * @see #getEntityNotationName */ public Enumeration declaredEntities() { return entityInfo.keys(); } /** Return the current element. * @return The current Element. */ public String getCurrentElement() { // Ptolemy localization for MoMLParser so that we // can get the currentElement from within MoMLParser.attribute() return currentElement; } /** * Find the type of an entity. * @param ename The name of the entity. * @return An integer constant representing the entity type. * @see #ENTITY_UNDECLARED * @see #ENTITY_INTERNAL * @see #ENTITY_NDATA * @see #ENTITY_TEXT */ public int getEntityType(String ename) { Object[] entity = (Object[]) entityInfo.get(ename); if (entity == null) { return ENTITY_UNDECLARED; } else { return ((Integer) entity[0]).intValue(); } } /** * Return an external entity's public identifier, if any. * @param ename The name of the external entity. 
* @return The entity's system identifier, or null if the * entity was not declared, if it is not an * external entity, or if no public identifier was * provided. * @see #getEntityType */ public String getEntityPublicId(String ename) { Object[] entity = (Object[]) entityInfo.get(ename); if (entity == null) { return null; } else { return (String) entity[1]; } } /** * Return an external entity's system identifier. * @param ename The name of the external entity. * @return The entity's system identifier, or null if the * entity was not declared, or if it is not an * external entity. * @see #getEntityType */ public String getEntitySystemId(String ename) { Object[] entity = (Object[]) entityInfo.get(ename); if (entity == null) { return null; } else { return (String) entity[2]; } } /** * Return the value of an internal entity. * @param ename The name of the internal entity. * @return The entity's value, or null if the entity was * not declared, or if it is not an internal entity. * @see #getEntityType */ public String getEntityValue(String ename) { Object[] entity = (Object[]) entityInfo.get(ename); if (entity == null) { return null; } else { return (String) entity[3]; } } /** * Get the notation name associated with an NDATA entity. * @param eName The NDATA entity name. * @return The associated notation name, or null if the * entity was not declared, or if it is not an * NDATA entity. * @see #getEntityType */ public String getEntityNotationName(String eName) { Object[] entity = (Object[]) entityInfo.get(eName); if (entity == null) { return null; } else { return (String) entity[4]; } } /** * Register an entity declaration for later retrieval. */ void setInternalEntity(String eName, String value) { setEntity(eName, ENTITY_INTERNAL, null, null, value, null); } /** * Register an external data entity. 
*/ void setExternalDataEntity(String eName, String pubid, String sysid, String nName) { setEntity(eName, ENTITY_NDATA, pubid, sysid, null, nName); } /** * Register an external text entity. */ void setExternalTextEntity(String eName, String pubid, String sysid) { setEntity(eName, ENTITY_TEXT, pubid, sysid, null, null); } /** * Register an entity declaration for later retrieval. */ void setEntity(String eName, int eClass, String pubid, String sysid, String value, String nName) { Object[] entity; if (entityInfo.get(eName) == null) { entity = new Object[5]; entity[0] = Integer.valueOf(eClass); entity[1] = pubid; entity[2] = sysid; entity[3] = value; entity[4] = nName; entityInfo.put(eName, entity); } } // // Notations. // /** * Get declared notations. * @return An Enumeration of all the notations declared for * this XML document. The results will be valid only * after the DTD (if any) has been parsed. * @see #getNotationPublicId * @see #getNotationSystemId */ public Enumeration declaredNotations() { return notationInfo.keys(); } /** * Look up the public identifier for a notation. * You will normally use this method to look up a notation * that was provided as an attribute value or for an NDATA entity. * @param nname The name of the notation. * @return A string containing the public identifier, or null * if none was provided or if no such notation was * declared. * @see #getNotationSystemId */ public String getNotationPublicId(String nname) { Object[] notation = (Object[]) notationInfo.get(nname); if (notation == null) { return null; } else { return (String) notation[0]; } } /** * Look up the system identifier for a notation. * You will normally use this method to look up a notation * that was provided as an attribute value or for an NDATA entity. * @param nname The name of the notation. * @return A string containing the system identifier, or null * if no such notation was declared. 
* @see #getNotationPublicId */ public String getNotationSystemId(String nname) { Object[] notation = (Object[]) notationInfo.get(nname); if (notation == null) { return null; } else { return (String) notation[1]; } } /** * Register a notation declaration for later retrieval. * Format: * - public id * - system id */ void setNotation(String nname, String pubid, String sysid) throws java.lang.Exception { Object[] notation; if (notationInfo.get(nname) == null) { notation = new Object[2]; notation[0] = pubid; notation[1] = sysid; notationInfo.put(nname, notation); } else { error("multiple declarations of notation", nname, null); } } // // Location. // /** * Return the current line number. * @return The current line number. */ public int getLineNumber() { return line; } /** * Return the current column number. * @return The current column number. */ public int getColumnNumber() { return column; } ////////////////////////////////////////////////////////////////////// // High-level I/O. ////////////////////////////////////////////////////////////////////// /** * Read a single character from the readBuffer. * <p>The readDataChunk() method maintains the buffer. * <p>If we hit the end of an entity, try to pop the stack and * keep going. * <p>(This approach doesn't really enforce XML's rules about * entity boundaries, but this is not currently a validating * parser). * <p>This routine also attempts to keep track of the current * position in external entities, but it's not entirely accurate. * @return The next available input character. * @see #unread(char) * @see #readDataChunk * @see #readBuffer * @see #line * @return The next character from the current input source. */ char readCh() throws java.lang.Exception { char c; // As long as there's nothing in the // read buffer, try reading more data // (for an external entity) or popping // the entity stack (for either). 
while (readBufferPos >= readBufferLength) { switch (sourceType) { case INPUT_READER: case INPUT_EXTERNAL: case INPUT_STREAM: readDataChunk(); while (readBufferLength < 1) { popInput(); if (readBufferLength < 1) { readDataChunk(); } } break; default: popInput(); break; } } c = readBuffer[readBufferPos++]; // This is a particularly nasty bit // of code, that checks for a parameter // entity reference but peeks ahead to // catch the '%' in parameter entity // declarations. if ((c == '%') && ((context == CONTEXT_DTD) || (context == CONTEXT_ENTITYVALUE))) { char c2 = readCh(); unread(c2); if (!isWhitespace(c2)) { parsePEReference(context == CONTEXT_ENTITYVALUE); return readCh(); } } if (c == '\n') { line++; column = 0; } else { column++; } return c; } /** * Push a single character back onto the current input stream. * <p>This method usually pushes the character back onto * the readBuffer. * <p>I don't think that this would ever be called with * readBufferPos = 0, because the methods always reads a character * before unreading it, but just in case, I've added a boundary * condition. * @param c The character to push back. * @see #readCh * @see #unread(char[], int) * @see #readBuffer */ void unread(char c) throws java.lang.Exception { // Normal condition. if (c == '\n') { line--; column = -1; } if (readBufferPos > 0) { readBuffer[--readBufferPos] = c; } else { pushString(null, Character.toString(c)); } } /** * Push a char array back onto the current input stream. * <p>NOTE: you must <em>never</em> push back characters that you * haven't actually read: use pushString() instead. * @see #readCh * @see #unread(char) * @see #readBuffer * @see #pushString */ void unread(char[] ch, int length) throws java.lang.Exception { for (int i = 0; i < length; i++) { if (ch[i] == '\n') { line--; column = -1; } } if (length < readBufferPos) { readBufferPos -= length; } else { pushCharArray(null, ch, 0, length); sourceType = INPUT_BUFFER; } } /** * Push a new external input source. 
* <p>The source will be either an external text entity, or the DTD
     * external subset.
     * <p>The current input is saved with pushInput(), a fresh read
     * buffer is allocated, the system identifier is made absolute and
     * offered to the handler's resolveEntity(), and then the input is
     * taken from (in order of preference) the reader, the stream, or a
     * connection opened to the system identifier.
     * <p>TO DO: Right now, this method always attempts to autodetect
     * the encoding; in the future, it should allow the caller to
     * request an encoding explicitly, and it should also look at the
     * headers with an HTTP connection.
     * @param ename The entity name handed to pushInput() for the
     *   recursion check; may be null.
     * @param publicId The public identifier passed to the handler's
     *   resolveEntity(); may be null.
     * @param systemId The system identifier; resolved against the
     *   current external entity or the base URI, and opened as a URL
     *   when neither a reader nor a stream is available.
     * @param reader A character stream to read from, or null.
     * @param stream A byte stream to read from, or null.
     * @param encoding The encoding name to check with checkEncoding(),
     *   or null to autodetect.
     * @exception Exception Propagated from the handler callbacks, from
     *   URL handling, or from the stream operations.
     * @see XmlHandler#resolveEntity
     * @see #pushString
     * @see #sourceType
     * @see #pushInput
     * @see #detectEncoding
     * @see #readBuffer
     */
    void pushURL(String ename, String publicId, String systemId,
            Reader reader, InputStream stream, String encoding)
            throws java.lang.Exception {
        URL url;
        boolean ignoreEncoding = false;

        // Push the existing status.
        pushInput(ename);

        // Create a new read buffer.
        // (Note the four-character margin)
        readBuffer = new char[READ_BUFFER_MAX + 4];
        readBufferPos = 0;
        readBufferLength = 0;
        readBufferOverflow = -1;
        is = null;
        line = 1;

        currentByteCount = 0;

        // Flush any remaining data.
        dataBufferFlush();

        // Make the URL absolute: relative to the enclosing external
        // entity if there is one, otherwise relative to the base URI.
        if ((systemId != null) && (externalEntity != null)) {
            systemId = new URL(externalEntity.getURL(), systemId).toString();
        } else if (baseURI != null) {
            try {
                systemId = new URL(new URL(baseURI), systemId).toString();
            } catch (Throwable throwable) {
                // Ignore this and stick with the old systemId
            }
        }

        // See if the application wants to
        // redirect the system ID and/or
        // supply its own character stream.
        if ((systemId != null) && (handler != null)) {
            Object input = handler.resolveEntity(publicId, systemId);

            if (input != null) {
                // The resolver may answer with a replacement system id,
                // a byte stream, or a character stream.
                if (input instanceof String) {
                    systemId = (String) input;
                } else if (input instanceof InputStream) {
                    stream = (InputStream) input;
                } else if (input instanceof Reader) {
                    reader = (Reader) input;
                }
            }
        }

        // Start the entity.
        if (handler != null) {
            if (systemId != null) {
                handler.startExternalEntity(systemId);
            } else {
                handler.startExternalEntity("[external stream]");
            }
        }

        // Figure out what we're reading from.
        if (reader != null) {
            // There's an explicit character stream.
            // No byte-level encoding detection is needed, so any
            // encoding declaration is read but ignored.
            sourceType = INPUT_READER;
            this.reader = reader;
            tryEncodingDecl(true);
            return;
        } else if (stream != null) {
            sourceType = INPUT_STREAM;
            is = stream;
        } else {
            // We have to open our own stream
            // to the URL.
            // Set the new status
            sourceType = INPUT_EXTERNAL;
            url = new URL(systemId);

            externalEntity = url.openConnection();
            externalEntity.connect();
            is = externalEntity.getInputStream();
        }

        // If we get to here, there must be
        // an InputStream available.
        // detectEncoding() relies on mark()/reset(), so wrap streams
        // that do not support it.
        if (!is.markSupported()) {
            is = new BufferedInputStream(is);
        }

        // Attempt to detect the encoding.
        // NOTE(review): URLConnection.getContentEncoding() reports the
        // Content-Encoding header (e.g. "gzip"), not the charset of the
        // Content-Type header -- confirm this is the intended source.
        if ((encoding == null) && (externalEntity != null)) {
            encoding = externalEntity.getContentEncoding();
        }

        if (encoding != null) {
            checkEncoding(encoding, false);
            ignoreEncoding = true;
        } else {
            detectEncoding();
            ignoreEncoding = false;
        }

        // Read an XML or text declaration.
        tryEncodingDecl(ignoreEncoding);
    }

    /**
     * Check for an encoding declaration.
     * <p>If the input starts with "&lt;?xml" followed by whitespace,
     * a text declaration is parsed when there is saved input on the
     * input stack and an XML declaration otherwise.  If "&lt;?xml" is
     * not followed by whitespace, the "xml" is pushed back and the
     * construct is parsed as an ordinary processing instruction.
     * @param ignoreEncoding Passed through to parseTextDecl() or
     *   parseXMLDecl().
     */
    void tryEncodingDecl(boolean ignoreEncoding) throws java.lang.Exception {
        // Read the XML/Encoding declaration.
        if (tryRead("<?xml")) {
            if (tryWhitespace()) {
                if (inputStack.size() > 0) {
                    parseTextDecl(ignoreEncoding);
                } else {
                    parseXMLDecl(ignoreEncoding);
                }
            } else {
                // Something like "<?xmlfoo": keep the "<?" consumed and
                // let parsePI() see the target name from its start.
                unread("xml".toCharArray(), 3);
                parsePI();
            }
        }
    }

    /**
     * Attempt to detect the encoding of an entity.
     * <p>The trick here (as suggested in the XML standard) is that
     * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
     * <b>must</b> begin with an XML declaration or an encoding
     * declaration; we simply have to look for "<?XML" in various
     * encodings.
     * <p>This method has no way to distinguish among 8-bit encodings.
     * Instead, it assumes UTF-8, then (possibly) revises its assumption
     * later in checkEncoding().
Any ASCII-derived 8-bit encoding * should work, but most will be rejected later by checkEncoding(). * <p>I don't currently detect EBCDIC, since I'm concerned that it * could also be a valid UTF-8 sequence; I'll have to do more checking * later. * @see #tryEncoding(byte[], byte, byte, byte, byte) * @see #tryEncoding(byte[], byte, byte) * @see #checkEncoding * @see #read8bitEncodingDeclaration */ void detectEncoding() throws java.lang.Exception { byte[] signature = new byte[4]; // Read the first four bytes for // autodetection. is.mark(4); int bytesRead = is.read(signature); if (bytesRead != signature.length) { throw new IOException("Read only " + bytesRead + " bytes instead of " + signature.length); } is.reset(); // Look for a known signature. if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x3c)) { // UCS-4 must begin with "<!XML" // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234) encoding = ENCODING_UCS_4_1234; } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, (byte) 0x00, (byte) 0x00)) { // UCS-4 must begin with "<!XML" // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321) encoding = ENCODING_UCS_4_4321; } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x3c, (byte) 0x00)) { // UCS-4 must begin with "<!XML" // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143) encoding = ENCODING_UCS_4_2143; } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, (byte) 0x00, (byte) 0x00)) { // UCS-4 must begin with "<!XML" // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421) encoding = ENCODING_UCS_4_3412; } else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) { // UCS-2 with a byte-order marker. // 0xfe 0xff: UCS-2, big-endian (12) encoding = ENCODING_UCS_2_12; is.read(); is.read(); } else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) { // UCS-2 with a byte-order marker. 
// 0xff 0xfe: UCS-2, little-endian (21) encoding = ENCODING_UCS_2_21; is.read(); is.read(); } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, (byte) 0x00, (byte) 0x3f)) { // UCS-2 without a BOM must begin with "<?XML" // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark encoding = ENCODING_UCS_2_12; error("no byte-order mark for UCS-2 entity", null, null); } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, (byte) 0x3f, (byte) 0x00)) { // UCS-2 without a BOM must begin with "<?XML" // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark encoding = ENCODING_UCS_2_21; error("no byte-order mark for UCS-2 entity", null, null); } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f, (byte) 0x78, (byte) 0x6d)) { // Some kind of 8-bit encoding with "<?XML" // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING) encoding = ENCODING_UTF_8; read8bitEncodingDeclaration(); } else { // Some kind of 8-bit encoding without "<?XML" // (otherwise) UTF-8 without encoding/XML declaration encoding = ENCODING_UTF_8; } } /** * Check for a four-byte signature. * <p>Utility routine for detectEncoding(). * <p>Always looks for some part of "<?XML" in a specific encoding. * @param sig The first four bytes read. * @param b1 The first byte of the signature * @param b2 The second byte of the signature * @param b3 The third byte of the signature * @param b4 The fourth byte of the signature * @see #detectEncoding */ boolean tryEncoding(byte[] sig, byte b1, byte b2, byte b3, byte b4) { return ((sig[0] == b1) && (sig[1] == b2) && (sig[2] == b3) && (sig[3] == b4)); } /** * Check for a two-byte signature. * <p>Looks for a UCS-2 byte-order mark. * <p>Utility routine for detectEncoding(). * @param sig The first four bytes read. 
* @param b1 The first byte of the signature * @param b2 The second byte of the signature * @see #detectEncoding */ boolean tryEncoding(byte[] sig, byte b1, byte b2) { return ((sig[0] == b1) && (sig[1] == b2)); } /** * This method pushes a string back onto input. * <p>It is useful either as the expansion of an internal entity, * or for backtracking during the parse. * <p>Call pushCharArray() to do the actual work. * @param s The string to push back onto input. * @see #pushCharArray */ void pushString(String ename, String s) throws java.lang.Exception { char[] ch = s.toCharArray(); pushCharArray(ename, ch, 0, ch.length); } /** * Push a new internal input source. * <p>This method is useful for expanding an internal entity, * or for unreading a string of characters. It creates a new * readBuffer containing the characters in the array, instead * of characters converted from an input byte stream. * <p>I've added a couple of optimisations: don't push zero- * length strings, and just push back a single character * for 1-character strings; this should save some time and memory. * @param ch The char array to push. * @see #pushString * @see #pushURL * @see #readBuffer * @see #sourceType * @see #pushInput */ void pushCharArray(String ename, char[] ch, int start, int length) throws java.lang.Exception { // Push the existing status pushInput(ename); sourceType = INPUT_INTERNAL; readBuffer = ch; readBufferPos = start; readBufferLength = length; readBufferOverflow = -1; } /** * Save the current input source onto the stack. * <p>This method saves all of the global variables associated with * the current input source, so that they can be restored when a new * input source has finished. It also tests for entity recursion. 
* <p>The method saves the following global variables onto a stack * using a fixed-length array: * <ol> * <li>sourceType * <li>externalEntity * <li>readBuffer * <li>readBufferPos * <li>readBufferLength * <li>line * <li>encoding * </ol> * @param ename The name of the entity (if any) causing the new input. * @see #popInput * @see #sourceType * @see #externalEntity * @see #readBuffer * @see #readBufferPos * @see #readBufferLength * @see #line * @see #encoding */ void pushInput(String ename) throws java.lang.Exception { Object[] input = new Object[12]; // Check for entity recursion. if (ename != null) { Enumeration entities = entityStack.elements(); while (entities.hasMoreElements()) { String e = (String) entities.nextElement(); if (e.equals(ename)) { error("recursive reference to entity", ename, null); } } } entityStack.push(ename); // Don't bother if there is no input. if (sourceType == INPUT_NONE) { return; } // Set up a snapshot of the current // input source. input[0] = Integer.valueOf(sourceType); input[1] = externalEntity; input[2] = readBuffer; input[3] = Integer.valueOf(readBufferPos); input[4] = Integer.valueOf(readBufferLength); input[5] = Integer.valueOf(line); input[6] = Integer.valueOf(encoding); input[7] = Integer.valueOf(readBufferOverflow); input[8] = is; input[9] = Integer.valueOf(currentByteCount); input[10] = Integer.valueOf(column); input[11] = reader; // Push it onto the stack. inputStack.push(input); } /** * Restore a previous input source. * <p>This method restores all of the global variables associated with * the current input source. * @exception java.io.EOFException * If there are no more entries on the input stack. 
* @see #pushInput * @see #sourceType * @see #externalEntity * @see #readBuffer * @see #readBufferPos * @see #readBufferLength * @see #line * @see #encoding */ void popInput() throws java.lang.Exception { Object[] input; switch (sourceType) { case INPUT_EXTERNAL: dataBufferFlush(); if ((handler != null) && (externalEntity != null)) { handler.endExternalEntity(externalEntity.getURL().toString()); } break; case INPUT_STREAM: dataBufferFlush(); if (baseURI != null) { if (handler != null) { handler.endExternalEntity(baseURI); } } break; case INPUT_READER: dataBufferFlush(); if (baseURI != null) { if (handler != null) { handler.endExternalEntity(baseURI); } } break; } // Throw an EOFException if there // is nothing else to pop. if (inputStack.isEmpty()) { throw new EOFException("XML parser input stack was empty, " + "end of file or xml fragment reached. " + "Perhaps there is a missing '>' " + "or a comment is unterminated by '->'?"); } else { input = (Object[]) inputStack.pop(); entityStack.pop(); } sourceType = ((Integer) input[0]).intValue(); externalEntity = (URLConnection) input[1]; readBuffer = (char[]) input[2]; readBufferPos = ((Integer) input[3]).intValue(); readBufferLength = ((Integer) input[4]).intValue(); line = ((Integer) input[5]).intValue(); encoding = ((Integer) input[6]).intValue(); readBufferOverflow = ((Integer) input[7]).intValue(); is = (InputStream) input[8]; currentByteCount = ((Integer) input[9]).intValue(); column = ((Integer) input[10]).intValue(); reader = (Reader) input[11]; } /** * Return true if we can read the expected character. * <p>Note that the character will be removed from the input stream * on success, but will be put back on failure. Do not attempt to * read the character again if the method succeeds. * @param delim The character that should appear next. For a * insensitive match, you must supply this in upper-case. * @return true if the character was successfully read, or false if * it was not. 
* @see #tryRead(String) */ boolean tryRead(char delim) throws java.lang.Exception { char c; // Read the character c = readCh(); // Test for a match, and push the character // back if the match fails. if (c == delim) { return true; } else { unread(c); return false; } } /** * Return true if we can read the expected string. * <p>This is simply a convenience method. * <p>Note that the string will be removed from the input stream * on success, but will be put back on failure. Do not attempt to * read the string again if the method succeeds. * <p>This method will push back a character rather than an * array whenever possible (probably the majority of cases). * <p><b>NOTE:</b> This method currently has a hard-coded limit * of 100 characters for the delimiter. * @param delim The string that should appear next. * @return true if the string was successfully read, or false if * it was not. * @see #tryRead(char) */ boolean tryRead(String delim) throws java.lang.Exception { char[] ch = delim.toCharArray(); char c; // Compare the input, character- // by character. for (int i = 0; i < ch.length; i++) { c = readCh(); if (c != ch[i]) { unread(c); if (i != 0) { unread(ch, i); } return false; } } return true; } /** * Return true if we can read some whitespace. * <p>This is simply a convenience method. * <p>This method will push back a character rather than an * array whenever possible (probably the majority of cases). * @return true if whitespace was found. */ boolean tryWhitespace() throws java.lang.Exception { char c; c = readCh(); if (isWhitespace(c)) { skipWhitespace(); return true; } else { unread(c); return false; } } /** * Read all data until we find the specified string. * <p>This is especially useful for scanning marked sections. * <p>This is a a little inefficient right now, since it calls tryRead() * for every character. 
* @param delim The string delimiter * @see #tryRead(String) * @see #readCh */ void parseUntil(String delim) throws java.lang.Exception { char c; int startLine = line; try { while (!tryRead(delim)) { c = readCh(); dataBufferAppend(c); } } catch (EOFException e) { error("end of input while looking for delimiter (started on line " + startLine + ')', null, delim); } } // Modified November 14, 1998 by Steve Neuendorffer // There was a bug because this was not skipping things that looked // like parameter entities properly. // Copied the appropriate code from readCh, excluding the lines referring to // '%'. /** * Skip all data until we find the specified string. * <p>This is especially useful for scanning comments. * <p>This is a a little inefficient right now, since it calls tryRead() * for every character. * @param delim The string delimiter * @see #readCh */ void skipUntil(String delim) throws java.lang.Exception { while (!tryRead(delim)) { char c; // As long as there's nothing in the // read buffer, try reading more data // (for an external entity) or popping // the entity stack (for either). while (readBufferPos >= readBufferLength) { switch (sourceType) { case INPUT_READER: case INPUT_EXTERNAL: case INPUT_STREAM: readDataChunk(); while (readBufferLength < 1) { popInput(); if (readBufferLength < 1) { readDataChunk(); } } break; default: popInput(); break; } } c = readBuffer[readBufferPos++]; if (c == '\n') { line++; column = 0; } else { column++; } } } /** * Read just the encoding declaration (or XML declaration) at the * start of an external entity. * When this method is called, we know that the declaration is * present (or appears to be). We also know that the entity is * in some sort of ASCII-derived 8-bit encoding. * The idea of this is to let us read what the 8-bit encoding is * before we've committed to converting any more of the file; the * XML or encoding declaration must be in 7-bit ASCII, so we're * safe as long as we don't go past it. 
*/ void read8bitEncodingDeclaration() throws java.lang.Exception { int ch; readBufferPos = readBufferLength = 0; while (true) { ch = is.read(); readBuffer[readBufferLength++] = (char) ch; switch (ch) { case '>': return; case -1: error("end of file before end of XML or encoding declaration.", null, "?>"); return; } if (readBuffer.length == readBufferLength) { error("unfinished XML or encoding declaration", null, null); } } } ////////////////////////////////////////////////////////////////////// // Low-level I/O. ////////////////////////////////////////////////////////////////////// /** * Read a chunk of data from an external input source. * <p>This is simply a front-end that fills the rawReadBuffer * with bytes, then calls the appropriate encoding handler. * @see #encoding * @see #rawReadBuffer * @see #readBuffer * @see #filterCR * @see #copyUtf8ReadBuffer * @see #copyIso8859_1ReadBuffer */ void readDataChunk() throws java.lang.Exception { int count; // See if we have any overflow. if (readBufferOverflow > -1) { readBuffer[0] = (char) readBufferOverflow; readBufferOverflow = -1; readBufferPos = 1; sawCR = true; } else { readBufferPos = 0; sawCR = false; } // Special situation -- we're taking // input from a character stream. if (sourceType == INPUT_READER) { count = reader.read(readBuffer, readBufferPos, READ_BUFFER_MAX - 1); if (count < 0) { readBufferLength = -1; } else { readBufferLength = readBufferPos + count; filterCR(); sawCR = false; } return; } // Read as many bytes as possible // into the read buffer. count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX); // Dispatch to an encoding-specific // reader method to populate the // readBuffer. 
switch (encoding) { case ENCODING_UTF_8: copyUtf8ReadBuffer(count); break; case ENCODING_ISO_8859_1: copyIso8859_1ReadBuffer(count); break; case ENCODING_UCS_2_12: copyUcs2ReadBuffer(count, 8, 0); break; case ENCODING_UCS_2_21: copyUcs2ReadBuffer(count, 0, 8); break; case ENCODING_UCS_4_1234: copyUcs4ReadBuffer(count, 24, 16, 8, 0); break; case ENCODING_UCS_4_4321: copyUcs4ReadBuffer(count, 0, 8, 16, 24); break; case ENCODING_UCS_4_2143: copyUcs4ReadBuffer(count, 16, 24, 0, 8); break; case ENCODING_UCS_4_3412: copyUcs4ReadBuffer(count, 8, 0, 24, 16); break; } // Filter out all carriage returns // if we've seen any. if (sawCR) { filterCR(); sawCR = false; } // Reset the position. readBufferPos = 0; currentByteCount += count; } /** * Filter carriage returns in the read buffer. * <p>CRLF becomes LF; CR becomes LF. * @see #readDataChunk * @see #readBuffer * @see #readBufferOverflow */ void filterCR() { int i; int j; readBufferOverflow = -1; loop: for (i = 0, j = 0; j < readBufferLength; i++, j++) { switch (readBuffer[j]) { case '\r': if (j == (readBufferLength - 1)) { readBufferOverflow = '\r'; readBufferLength--; break loop; } else if (readBuffer[j + 1] == '\n') { j++; } readBuffer[i] = '\n'; break; case '\n': default: readBuffer[i] = readBuffer[j]; break; } } readBufferLength = i; } /** * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters. * <p>When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. * <p>The tricky part of this is dealing with UTF-8 multi-byte * sequences, but it doesn't seem to slow things down too much. * @param count The number of bytes to convert. 
* @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer * @see #getNextUtf8Byte */ void copyUtf8ReadBuffer(int count) throws java.lang.Exception { int i = 0; int j = readBufferPos; int b1; while (i < count) { b1 = rawReadBuffer[i++]; // Determine whether we are dealing // with a one-, two-, three-, or four- // byte sequence. if ((b1 & 0x80) == 0) { // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx readBuffer[j++] = (char) b1; } else if ((b1 & 0xe0) == 0xc0) { // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx readBuffer[j++] = (char) (((b1 & 0x1f) << 6) | getNextUtf8Byte( i++, count)); } else if ((b1 & 0xf0) == 0xe0) { // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx readBuffer[j++] = (char) (((b1 & 0x0f) << 12) | (getNextUtf8Byte(i++, count) << 6) | getNextUtf8Byte( i++, count)); } else if ((b1 & 0xf8) == 0xf0) { // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx // (uuuuu = wwww + 1) int b2 = getNextUtf8Byte(i++, count); int b3 = getNextUtf8Byte(i++, count); int b4 = getNextUtf8Byte(i++, count); readBuffer[j++] = (char) (0xd800 | ((((b1 & 0x07) << 2) | (((b2 & 0x30) >> 4) - 1)) << 6) | ((b2 & 0x0f) << 2) | ((b3 & 0x30) >> 4)); readBuffer[j++] = (char) (0xdc | ((b3 & 0x0f) << 6) | b4); // TODO: test that surrogate value is legal. } else { // Otherwise, the 8th bit may not be set in UTF-8 encodingError("bad start for UTF-8 multi-byte sequence", b1, i); } if (readBuffer[j - 1] == '\r') { sawCR = true; } } // How many characters have we read? readBufferLength = j; } /** * Return the next byte value in a UTF-8 sequence. * If it is not possible to get a byte from the current * entity, throw an exception. * @param pos The current position in the rawReadBuffer. * @param count The number of bytes in the rawReadBuffer * @return The significant six bits of a non-initial byte in * a UTF-8 sequence. * @exception EOFException If the sequence is incomplete. 
*/ int getNextUtf8Byte(int pos, int count) throws java.lang.Exception { int val; // Take a character from the buffer // or from the actual input stream. if (pos < count) { val = rawReadBuffer[pos]; } else { val = is.read(); if (val == -1) { encodingError("unfinished multi-byte UTF-8 sequence at EOF", -1, pos); } } // Check for the correct bits at the // start. if ((val & 0xc0) != 0x80) { encodingError("bad continuation of multi-byte UTF-8 sequence", val, pos + 1); } // Return the significant bits. return (val & 0x3f); } /** * Convert a buffer of ISO-8859-1-encoded bytes into UTF-16 characters. * <p>When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. * <p>This is a direct conversion, with no tricks. * @param count The number of bytes to convert. * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer */ void copyIso8859_1ReadBuffer(int count) { int i; int j; for (i = 0, j = readBufferPos; i < count; i++, j++) { readBuffer[j] = (char) (rawReadBuffer[i] & 0xff); if (readBuffer[j] == '\r') { sawCR = true; } } readBufferLength = j; } /** * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters. * <p>When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. * @param count The number of bytes to convert. * @param shift1 The number of bits to shift byte 1. 
* @param shift2 The number of bits to shift byte 2 * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer */ void copyUcs2ReadBuffer(int count, int shift1, int shift2) throws java.lang.Exception { int j = readBufferPos; if ((count > 0) && ((count % 2) != 0)) { encodingError("odd number of bytes in UCS-2 encoding", -1, count); } for (int i = 0; i < count; i += 2) { readBuffer[j++] = (char) (((rawReadBuffer[i] & 0xff) << shift1) | ((rawReadBuffer[i + 1] & 0xff) << shift2)); if (readBuffer[j - 1] == '\r') { sawCR = true; } } readBufferLength = j; } /** * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters. * <p>When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in * readBuffer. * <p>Java has 16-bit chars, but this routine will attempt to use * surrogates to encoding values between 0x00010000 and 0x000fffff. * @param count The number of bytes to convert. * @param shift1 The number of bits to shift byte 1. * @param shift2 The number of bits to shift byte 2 * @param shift3 The number of bits to shift byte 2 * @param shift4 The number of bits to shift byte 2 * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer */ void copyUcs4ReadBuffer(int count, int shift1, int shift2, int shift3, int shift4) throws java.lang.Exception { int j = readBufferPos; int value; if ((count > 0) && ((count % 4) != 0)) { encodingError( "number of bytes in UCS-4 encoding not divisible by 4", -1, count); } for (int i = 0; i < count; i += 4) { value = (((rawReadBuffer[i] & 0xff) << shift1) | ((rawReadBuffer[i + 1] & 0xff) << shift2) | ((rawReadBuffer[i + 2] & 0xff) << shift3) | ((rawReadBuffer[i + 3] & 0xff) << shift4)); if (value < 0x0000ffff) { readBuffer[j++] = (char) value; if (value == '\r') { sawCR = true; } } else if (value < 0x000fffff) { readBuffer[j++] = (char) (0xd8 | ((value & 0x000ffc00) >> 10)); readBuffer[j++] = (char) (0xdc | (value & 0x0003ff)); } else { encodingError("value cannot be 
represented in UTF-16", value, i); } } readBufferLength = j; } /** * Report a character encoding error. */ void encodingError(String message, int value, int offset) throws java.lang.Exception { String uri; if (value >= 0) { message = message + " (byte value: 0x" + Integer.toHexString(value) + ')'; } if (externalEntity != null) { uri = externalEntity.getURL().toString(); } else { uri = baseURI; } handler.error(message, uri, -1, offset + currentByteCount); } ////////////////////////////////////////////////////////////////////// // Local Variables. ////////////////////////////////////////////////////////////////////// /** * Re-initialize the variables for each parse. */ void initializeVariables() { // No errors; first lineb errorCount = 0; line = 1; column = 0; // Set up the buffers for data and names dataBufferPos = 0; dataBuffer = new char[DATA_BUFFER_INITIAL]; nameBufferPos = 0; nameBuffer = new char[NAME_BUFFER_INITIAL]; // Set up the DTD hash tables elementInfo = new Hashtable(); entityInfo = new Hashtable(); notationInfo = new Hashtable(); // Set up the variables for the current // element context. currentElement = null; currentElementContent = CONTENT_UNDECLARED; // Set up the input variables sourceType = INPUT_NONE; inputStack = new Stack(); entityStack = new Stack(); externalEntity = null; tagAttributePos = 0; tagAttributes = new String[100]; rawReadBuffer = new byte[READ_BUFFER_MAX]; readBufferOverflow = -1; context = CONTEXT_NONE; symbolTable = new Object[SYMBOL_TABLE_LENGTH]; } /** * Clean up after the parse to allow some garbage collection. * Leave around anything that might be useful for queries. */ void cleanupVariables() { errorCount = -1; line = -1; column = -1; dataBuffer = null; nameBuffer = null; currentElement = null; currentElementContent = CONTENT_UNDECLARED; sourceType = INPUT_NONE; inputStack = null; externalEntity = null; entityStack = null; } // // The current XML handler interface. // XmlHandler handler; // // I/O information. 
// private Reader reader; // current reader private InputStream is; // current input stream private int line; // current line number private int column; // current column number private int sourceType; // type of input source private Stack inputStack; // stack of input sources private URLConnection externalEntity; // current external entity private int encoding; // current character encoding. private int currentByteCount; // how many bytes read from current source. // // Maintain a count of errors. // private int errorCount; // // Buffers for decoded but unparsed character input. // private final static int READ_BUFFER_MAX = 16384; private char[] readBuffer; private int readBufferPos; private int readBufferLength; private int readBufferOverflow; // overflow character from last data chunk. // // Stack of entity names, to help detect recursion. // private Stack entityStack; // // Buffer for undecoded raw byte input. // private byte[] rawReadBuffer; // // Buffer for parsed character data. // private static int DATA_BUFFER_INITIAL = 4096; private char[] dataBuffer; private int dataBufferPos; // // Buffer for parsed names. // private static int NAME_BUFFER_INITIAL = 1024; private char[] nameBuffer; private int nameBufferPos; // // Hashtables for DTD information on elements, entities, and notations. // private Hashtable elementInfo; private Hashtable entityInfo; private Hashtable notationInfo; // // Element type currently in force. // private String currentElement; private int currentElementContent; // // Base external identifiers for resolution. // private String basePublicId; private String baseURI; private Reader baseReader; private InputStream baseInputStream; // // Are we in a context where PEs are allowed? // private int context; // // Symbol table, for internalising names. // private Object[] symbolTable; private final static int SYMBOL_TABLE_LENGTH = 1087; // // Hash table of attributes found in current start tag. 
// private String[] tagAttributes; private int tagAttributePos; // // Utility flag: have we noticed a CR while reading the last // data chunk? If so, we will have to go back and normalise // CR/LF. // private boolean sawCR; }