// XmlParser.java: the main parser class. // NO WARRANTY! See README, and copyright below. // $Id: XmlParser.java,v 1.3 2009-12-16 21:59:29 bruno Exp $ package com.microstar.xml; import java.io.BufferedInputStream; import java.io.EOFException; import java.io.InputStream; import java.io.Reader; import java.net.URL; import java.net.URLConnection; import java.util.Enumeration; import java.util.Hashtable; import java.util.Stack; /** * Parse XML documents and return parse events through call-backs. * <p> * You need to define a class implementing the <code>XmlHandler</code> * interface: an object belonging to this class will receive the callbacks for * the events. (As an alternative to implementing the full XmlHandler interface, * you can simply extend the <code>HandlerBase</code> convenience class.) * <p> * Usage (assuming that <code>MyHandler</code> is your implementation of the * <code>XmlHandler</code> interface): * * <pre> * * * * XmlHandler handler = new MyHandler(); * XmlParser parser = new XmlParser(); * parser.setHandler(handler); * try { * parser.parse("http://www.host.com/doc.xml", null); * } catch (Exception e) { * [do something interesting] * } * * * * </pre> * * <p> * Alternatively, you can use the standard SAX interfaces with the * <code>SAXDriver</code> class as your entry point. * * @author Copyright (c) 1997, 1998 by Microstar Software Ltd. * @author Written by David Megginson <dmeggins@microstar.com> * @version 1.1 * @see XmlHandler * @see HandlerBase * @see SAXDriver */ public class XmlParser { // // Use special cheats that speed up the code (currently about 50%), // but may cause problems with future maintenance and add to the // class file size (about 500 bytes). // private final static boolean USE_CHEATS = true; ////////////////////////////////////////////////////////////////////// // Constructors. //////////////////////////////////////////////////////////////////////// /** * Construct a new parser with no associated handler. 
* * @see #setHandler * @see #parse */ public XmlParser() { } /** * Set the handler that will receive parsing events. * * @param handler * The handler to receive callback events. * @see #parse * @see XmlHandler */ public void setHandler(XmlHandler handler) { this.handler = handler; } /** * Parse an XML document from a URI. * <p> * You may parse a document more than once, but only one thread may call * this method for an object at one time. * * @param systemId * The URI of the document. * @param publicId * The public identifier of the document, or null. * @param encoding * The suggested encoding, or null if unknown. * @exception java.lang.Exception * Any exception thrown by your own handlers, or any * derivation of java.io.IOException thrown by the parser * itself. */ public void parse(String systemId, String publicId, String encoding) throws java.lang.Exception { doParse(systemId, publicId, null, null, encoding); } /** * Parse an XML document from a byte stream. * <p> * The URI that you supply will become the base URI for resolving relative * links, but Ælfred will actually read the document from the supplied * input stream. * <p> * You may parse a document more than once, but only one thread may call * this method for an object at one time. * * @param systemId * The base URI of the document, or null if not known. * @param publicId * The public identifier of the document, or null if not known. * @param stream * A byte input stream. * @param encoding * The suggested encoding, or null if unknown. * @exception java.lang.Exception * Any exception thrown by your own handlers, or any * derivation of java.io.IOException thrown by the parser * itself. */ public void parse(String systemId, String publicId, InputStream stream, String encoding) throws java.lang.Exception { doParse(systemId, publicId, null, stream, encoding); } /** * Parse an XML document from a character stream. 
* <p> * The URI that you supply will become the base URI for resolving relative * links, but Ælfred will actually read the document from the supplied * input stream. * <p> * You may parse a document more than once, but only one thread may call * this method for an object at one time. * * @param systemId * The base URI of the document, or null if not known. * @param publicId * The public identifier of the document, or null if not known. * @param reader * A character stream. * @exception java.lang.Exception * Any exception thrown by your own handlers, or any * derivation of java.io.IOException thrown by the parser * itself. */ public void parse(String systemId, String publicId, Reader reader) throws java.lang.Exception { doParse(systemId, publicId, reader, null, null); } private synchronized void doParse(String systemId, String publicId, Reader reader, InputStream stream, String encoding) throws java.lang.Exception { basePublicId = publicId; baseURI = systemId; baseReader = reader; baseInputStream = stream; initializeVariables(); // Set the default entities here. setInternalEntity(intern("amp"), "&"); setInternalEntity(intern("lt"), "<"); setInternalEntity(intern("gt"), ">"); setInternalEntity(intern("apos"), "'"); setInternalEntity(intern("quot"), """); if (handler != null) { handler.startDocument(); } pushURL("[document]", basePublicId, baseURI, baseReader, baseInputStream, encoding); parseDocument(); if (handler != null) { handler.endDocument(); } cleanupVariables(); } //////////////////////////////////////////////////////////////////////// // Constants. //////////////////////////////////////////////////////////////////////// // // Constants for element content type. // /** * Constant: an element has not been declared. * * @see #getElementContentType */ public final static int CONTENT_UNDECLARED = 0; /** * Constant: the element has a content model of ANY. 
* * @see #getElementContentType */ public final static int CONTENT_ANY = 1; /** * Constant: the element has declared content of EMPTY. * * @see #getElementContentType */ public final static int CONTENT_EMPTY = 2; /** * Constant: the element has mixed content. * * @see #getElementContentType */ public final static int CONTENT_MIXED = 3; /** * Constant: the element has element content. * * @see #getElementContentType */ public final static int CONTENT_ELEMENTS = 4; // // Constants for the entity type. // /** * Constant: the entity has not been declared. * * @see #getEntityType */ public final static int ENTITY_UNDECLARED = 0; /** * Constant: the entity is internal. * * @see #getEntityType */ public final static int ENTITY_INTERNAL = 1; /** * Constant: the entity is external, non-XML data. * * @see #getEntityType */ public final static int ENTITY_NDATA = 2; /** * Constant: the entity is external XML data. * * @see #getEntityType */ public final static int ENTITY_TEXT = 3; // // Constants for attribute type. // /** * Constant: the attribute has not been declared for this element type. * * @see #getAttributeType */ public final static int ATTRIBUTE_UNDECLARED = 0; /** * Constant: the attribute value is a string value. * * @see #getAttributeType */ public final static int ATTRIBUTE_CDATA = 1; /** * Constant: the attribute value is a unique identifier. * * @see #getAttributeType */ public final static int ATTRIBUTE_ID = 2; /** * Constant: the attribute value is a reference to a unique identifier. * * @see #getAttributeType */ public final static int ATTRIBUTE_IDREF = 3; /** * Constant: the attribute value is a list of ID references. * * @see #getAttributeType */ public final static int ATTRIBUTE_IDREFS = 4; /** * Constant: the attribute value is the name of an entity. * * @see #getAttributeType */ public final static int ATTRIBUTE_ENTITY = 5; /** * Constant: the attribute value is a list of entity names. 
* * @see #getAttributeType */ public final static int ATTRIBUTE_ENTITIES = 6; /** * Constant: the attribute value is a name token. * * @see #getAttributeType */ public final static int ATTRIBUTE_NMTOKEN = 7; /** * Constant: the attribute value is a list of name tokens. * * @see #getAttributeType */ public final static int ATTRIBUTE_NMTOKENS = 8; /** * Constant: the attribute value is a token from an enumeration. * * @see #getAttributeType */ public final static int ATTRIBUTE_ENUMERATED = 9; /** * Constant: the attribute is the name of a notation. * * @see #getAttributeType */ public final static int ATTRIBUTE_NOTATION = 10; // // When the class is loaded, populate the hash table of // attribute types. // /** * Hash table of attribute types. */ private static Hashtable attributeTypeHash; static { attributeTypeHash = new Hashtable(); attributeTypeHash.put("CDATA", new Integer(ATTRIBUTE_CDATA)); attributeTypeHash.put("ID", new Integer(ATTRIBUTE_ID)); attributeTypeHash.put("IDREF", new Integer(ATTRIBUTE_IDREF)); attributeTypeHash.put("IDREFS", new Integer(ATTRIBUTE_IDREFS)); attributeTypeHash.put("ENTITY", new Integer(ATTRIBUTE_ENTITY)); attributeTypeHash.put("ENTITIES", new Integer(ATTRIBUTE_ENTITIES)); attributeTypeHash.put("NMTOKEN", new Integer(ATTRIBUTE_NMTOKEN)); attributeTypeHash.put("NMTOKENS", new Integer(ATTRIBUTE_NMTOKENS)); attributeTypeHash.put("NOTATION", new Integer(ATTRIBUTE_NOTATION)); } // // Constants for supported encodings. // private final static int ENCODING_UTF_8 = 1; private final static int ENCODING_ISO_8859_1 = 2; private final static int ENCODING_UCS_2_12 = 3; private final static int ENCODING_UCS_2_21 = 4; private final static int ENCODING_UCS_4_1234 = 5; private final static int ENCODING_UCS_4_4321 = 6; private final static int ENCODING_UCS_4_2143 = 7; private final static int ENCODING_UCS_4_3412 = 8; private final static int ENCODING_UTF_16 = 9; // // Constants for attribute default value. // /** * Constant: the attribute is not declared. 
* * @see #getAttributeDefaultValueType */ public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 0; /** * Constant: the attribute has a literal default value specified. * * @see #getAttributeDefaultValueType * @see #getAttributeDefaultValue */ public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 1; /** * Constant: the attribute was declared #IMPLIED. * * @see #getAttributeDefaultValueType */ public final static int ATTRIBUTE_DEFAULT_IMPLIED = 2; /** * Constant: the attribute was declared #REQUIRED. * * @see #getAttributeDefaultValueType */ public final static int ATTRIBUTE_DEFAULT_REQUIRED = 3; /** * Constant: the attribute was declared #FIXED. * * @see #getAttributeDefaultValueType * @see #getAttributeDefaultValue */ public final static int ATTRIBUTE_DEFAULT_FIXED = 4; // // Constants for input. // private final static int INPUT_NONE = 0; private final static int INPUT_INTERNAL = 1; private final static int INPUT_EXTERNAL = 2; private final static int INPUT_STREAM = 3; private final static int INPUT_BUFFER = 4; private final static int INPUT_READER = 5; // // Flags for reading literals. // private final static int LIT_CHAR_REF = 1; private final static int LIT_ENTITY_REF = 2; private final static int LIT_PE_REF = 4; private final static int LIT_NORMALIZE = 8; // // Flags for parsing context. // private final static int CONTEXT_NONE = 0; private final static int CONTEXT_DTD = 1; private final static int CONTEXT_ENTITYVALUE = 2; private final static int CONTEXT_ATTRIBUTEVALUE = 3; ////////////////////////////////////////////////////////////////////// // Error reporting. ////////////////////////////////////////////////////////////////////// /** * Report an error. * * @param message * The error message. * @param textFound * The text that caused the error (or null). 
* @see XmlHandler#error * @see #line */ void error(String message, String textFound, String textExpected) throws java.lang.Exception { errorCount++; if (textFound != null) { message = message + " (found \"" + textFound + "\")"; } if (textExpected != null) { message = message + " (expected \"" + textExpected + "\")"; } if (handler != null) { String uri = null; if (externalEntity != null) { uri = externalEntity.getURL().toString(); } handler.error(message, uri, line, column); } } /** * Report a serious error. * * @param message * The error message. * @param textFound * The text that caused the error (or null). */ void error(String message, char textFound, String textExpected) throws java.lang.Exception { error(message, new Character(textFound).toString(), textExpected); } ////////////////////////////////////////////////////////////////////// // Major syntactic productions. ////////////////////////////////////////////////////////////////////// /** * Parse an XML document. * * <pre> * * * * [1] document ::= prolog element Misc* * * * * </pre> * * <p> * This is the top-level parsing function for a single XML document. As a * minimum, a well-formed document must have a document element, and a valid * document must have a prolog as well. */ void parseDocument() throws java.lang.Exception { char c; parseProlog(); require('<'); parseElement(); try { parseMisc(); //skip all white, PIs, and comments c = readCh(); //if this doesn't throw an exception... error("unexpected characters after document end", c, null); } catch (EOFException e) { return; } } /** * Skip a comment. * * <pre> * * * * [18] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->" * * * * </pre> * * <p> * (The <code><!--</code> has already been read.) */ void parseComment() throws java.lang.Exception { skipUntil("-->"); } /** * Parse a processing instruction and do a call-back. * * <pre> * * * * [19] PI ::= '<?' Name (S (Char* - (Char* '?>' Char*)))? 
'?>' * * * * </pre> * * <p> * (The <code><?</code> has already been read.) * <p> * An XML processing instruction <em>must</em> begin with a Name, which is * the instruction's target. */ void parsePI() throws java.lang.Exception { String name; name = readNmtoken(true); if (!tryRead("?>")) { requireWhitespace(); parseUntil("?>"); } if (handler != null) { handler.processingInstruction(name, dataBufferToString()); } } /** * Parse a CDATA marked section. * * <pre> * * * * [20] CDSect ::= CDStart CData CDEnd * [21] CDStart ::= '<![CDATA[' * [22] CData ::= (Char* - (Char* ']]>' Char*)) * [23] CDEnd ::= ']]>' * * * * </pre> * * <p> * (The '<![CDATA[' has already been read.) * <p> * Note that this just appends characters to the dataBuffer, without * actually generating an event. */ void parseCDSect() throws java.lang.Exception { parseUntil("]]>"); } /** * Parse the prolog of an XML document. * * <pre> * * * * [24] prolog ::= XMPDecl? Misc* (Doctypedecl Misc*)? * * * * </pre> * * <p> * There are a couple of tricks here. First, it is necessary to declare the * XML default attributes after the DTD (if present) has been read. Second, * it is not possible to expand general references in attribute value * literals until after the entire DTD (if present) has been parsed. * <p> * We do not look for the XML declaration here, because it is handled by * pushURL(). * * @see pushURL */ void parseProlog() throws java.lang.Exception { parseMisc(); if (tryRead("<!DOCTYPE")) { parseDoctypedecl(); parseMisc(); } } /** * Parse the XML declaration. * * <pre> * * * * [25] XMPDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'") * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'" * | S 'standalone' Eq '"' ("yes" | "no") '"' * [78] EncodingDecl ::= S 'encoding' Eq QEncoding * * * * </pre> * * <p> * ([80] to [82] are also significant.) * <p> * (The <code><?xml</code> and whitespace have already been read.) 
* * @see #parseTextDecl * @see #checkEncoding */ void parseXMPDecl(boolean ignoreEncoding) throws java.lang.Exception { String version; String encodingName = null; String standalone = null; // Read the version. require("version"); parseEq(); version = readLiteral(0); if (!version.equals("1.0")) { error("unsupported XML version", version, "1.0"); } // Try reading an encoding declaration. skipWhitespace(); if (tryRead("encoding")) { parseEq(); encodingName = readLiteral(0); checkEncoding(encodingName, ignoreEncoding); } // Try reading a standalone declaration skipWhitespace(); if (tryRead("standalone")) { parseEq(); standalone = readLiteral(0); } skipWhitespace(); require("?>"); } /** * Parse the Encoding PI. * * <pre> * * * * [78] EncodingDecl ::= S 'encoding' Eq QEncoding * [79] EncodingPI ::= '<?xml' S 'encoding' Eq QEncoding S? '?>' * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'" * [81] Encoding ::= LatinName * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* * * * * </pre> * * <p> * (The <code><?xml</code>' and whitespace have already been read.) * * @see #parseXMPDecl * @see #checkEncoding */ void parseTextDecl(boolean ignoreEncoding) throws java.lang.Exception { String encodingName = null; // Read an optional version. if (tryRead("version")) { String version; parseEq(); version = readLiteral(0); if (!version.equals("1.0")) { error("unsupported XML version", version, "1.0"); } requireWhitespace(); } // Read the encoding. require("encoding"); parseEq(); encodingName = readLiteral(0); checkEncoding(encodingName, ignoreEncoding); skipWhitespace(); require("?>"); } /** * Check that the encoding specified makes sense. * <p> * Compare what the author has specified in the XML declaration or encoding * PI with what we have detected. * <p> * This is also important for distinguishing among the various 7- and 8-bit * encodings, such as ISO-LATIN-1 (I cannot autodetect those). * * @param encodingName * The name of the encoding specified by the user. 
* @see #parseXMPDecl * @see #parseTextDecl */ void checkEncoding(String encodingName, boolean ignoreEncoding) throws java.lang.Exception { encodingName = encodingName.toUpperCase(); if (ignoreEncoding) { return; } switch (encoding) { // 8-bit encodings case ENCODING_UTF_8: if (encodingName.equals("ISO-8859-1")) { encoding = ENCODING_ISO_8859_1; } else if (!encodingName.equals("UTF-8")) { error("unsupported 8-bit encoding", encodingName, "UTF-8 or ISO-8859-1"); } break; // 16-bit encodings case ENCODING_UCS_2_12: case ENCODING_UCS_2_21: if (!encodingName.equals("ISO-10646-UCS-2") && !encodingName.equals("UTF-16")) { error("unsupported 16-bit encoding", encodingName, "ISO-10646-UCS-2"); } break; // 32-bit encodings case ENCODING_UCS_4_1234: case ENCODING_UCS_4_4321: case ENCODING_UCS_4_2143: case ENCODING_UCS_4_3412: if (!encodingName.equals("ISO-10646-UCS-4")) { error("unsupported 32-bit encoding", encodingName, "ISO-10646-UCS-4"); } } } /** * Parse miscellaneous markup outside the document element and DOCTYPE * declaration. * * <pre> * * * * [27] Misc ::= Comment | PI | S * * * * </pre> */ void parseMisc() throws java.lang.Exception { while (true) { skipWhitespace(); if (tryRead("<?")) { parsePI(); } else if (tryRead("<!--")) { parseComment(); } else { break; } } } /** * Parse a document type declaration. * * <pre> * * * * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? * ('[' %markupdecl* ']' S?)? '>' * * * * </pre> * * <p> * (The <code><!DOCTYPE</code> has already been read.) */ void parseDoctypedecl() throws java.lang.Exception { String doctypeName, ids[]; // Read the document type name. requireWhitespace(); doctypeName = readNmtoken(true); // Read the ExternalIDs. skipWhitespace(); ids = readExternalIds(false); // Look for a declaration subset. 
skipWhitespace(); if (tryRead('[')) { // loop until the subset ends while (true) { context = CONTEXT_DTD; skipWhitespace(); context = CONTEXT_NONE; if (tryRead(']')) { break; // end of subset } else { context = CONTEXT_DTD; parseMarkupdecl(); context = CONTEXT_NONE; } } } // Read the external subset, if any if (ids[1] != null) { pushURL("[external subset]", ids[0], ids[1], null, null, null); // Loop until we end up back at '>' while (true) { context = CONTEXT_DTD; skipWhitespace(); context = CONTEXT_NONE; if (tryRead('>')) { break; } else { context = CONTEXT_DTD; parseMarkupdecl(); context = CONTEXT_NONE; } } } else { // No external subset. skipWhitespace(); require('>'); } if (handler != null) { handler.doctypeDecl(doctypeName, ids[0], ids[1]); } // Expand general entities in // default values of attributes. // (Do this after the doctypeDecl // event!). // expandAttributeDefaultValues(); } /** * Parse a markup declaration in the internal or external DTD subset. * * <pre> * * * * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl | * %NotationDecl | %PI | %S | %Comment | * InternalPERef ) * [30] InternalPERef ::= PEReference * [31] extSubset ::= (%markupdecl | %conditionalSect)* * * * * </pre> */ void parseMarkupdecl() throws java.lang.Exception { if (tryRead("<!ELEMENT")) { parseElementdecl(); } else if (tryRead("<!ATTLIST")) { parseAttlistDecl(); } else if (tryRead("<!ENTITY")) { parseEntityDecl(); } else if (tryRead("<!NOTATION")) { parseNotationDecl(); } else if (tryRead("<?")) { parsePI(); } else if (tryRead("<!--")) { parseComment(); } else if (tryRead("<![")) { parseConditionalSect(); } else { error("expected markup declaration", null, null); } } /** * Parse an element, with its tags. * * <pre> * * * * [33] STag ::= '<' Name (S Attribute)* S? '>' [WFC: unique Att spec] * [38] element ::= EmptyElement | STag content ETag * [39] EmptyElement ::= '<' Name (S Attribute)* S? 
'/>' * [WFC: unique Att spec] * * * * </pre> * * <p> * (The '<' has already been read.) * <p> * NOTE: this method actually chains onto parseContent(), if necessary, and * parseContent() will take care of calling parseETag(). */ void parseElement() throws java.lang.Exception { String gi; char c; int oldElementContent = currentElementContent; String oldElement = currentElement; // This is the (global) counter for the // array of specified attributes. tagAttributePos = 0; // Read the element type name. gi = readNmtoken(true); // Determine the current content type. currentElement = gi; currentElementContent = getElementContentType(gi); if (currentElementContent == CONTENT_UNDECLARED) { currentElementContent = CONTENT_ANY; } // Read the attributes, if any. // After this loop, we should be just // in front of the closing delimiter. skipWhitespace(); c = readCh(); while (c != '/' && c != '>') { unread(c); parseAttribute(gi); skipWhitespace(); c = readCh(); } unread(c); // Supply any defaulted attributes. Enumeration atts = declaredAttributes(gi); if (atts != null) { String aname; loop: while (atts.hasMoreElements()) { aname = (String) atts.nextElement(); // See if it was specified. for (int i = 0; i < tagAttributePos; i++) { if (tagAttributes[i] == aname) { continue loop; } } // I guess not... if (handler != null) { handler.attribute(aname, getAttributeExpandedValue(gi, aname), false); } } } // Figure out if this is a start tag // or an empty element, and dispatch an // event accordingly. c = readCh(); switch (c) { case '>': if (handler != null) { handler.startElement(gi); } parseContent(); break; case '/': require('>'); if (handler != null) { handler.startElement(gi); handler.endElement(gi); } break; } // Restore the previous state. currentElement = oldElement; currentElementContent = oldElementContent; } /** * Parse an attribute assignment. * * <pre> * * * * [34] Attribute ::= Name Eq AttValue * * * * </pre> * * @param name * The name of the attribute's element. 
* @see XmlHandler#attribute */ void parseAttribute(String name) throws java.lang.Exception { String aname; int type; String value; // Read the attribute name. aname = readNmtoken(true).intern(); type = getAttributeDefaultValueType(name, aname); // Parse '=' parseEq(); // Read the value, normalizing whitespace // if it is not CDATA. if (type == ATTRIBUTE_CDATA || type == ATTRIBUTE_UNDECLARED) { value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF); } else { value = readLiteral(LIT_CHAR_REF | LIT_ENTITY_REF | LIT_NORMALIZE); } // Inform the handler about the // attribute. if (handler != null) { handler.attribute(aname, value, true); } dataBufferPos = 0; // Note that the attribute has been // specified. if (tagAttributePos == tagAttributes.length) { String newAttrib[] = new String[tagAttributes.length * 2]; System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos); tagAttributes = newAttrib; } tagAttributes[tagAttributePos++] = aname; } /** * Parse an equals sign surrounded by optional whitespace. [35] Eq ::= S? * '=' S? */ void parseEq() throws java.lang.Exception { skipWhitespace(); require('='); skipWhitespace(); } /** * Parse an end tag. [36] ETag ::= ' </' Name S? '>' *NOTE: parseContent() * chains to here. */ void parseETag() throws java.lang.Exception { String name; name = readNmtoken(true); if (name != currentElement) { error("mismatched end tag", name, currentElement); } skipWhitespace(); require('>'); if (handler != null) { handler.endElement(name); } } /** * Parse the content of an element. 
[37] content ::= (element | PCData | * Reference | CDSect | PI | Comment)* [68] Reference ::= EntityRef | * CharRef */ void parseContent() throws java.lang.Exception { String data; char c; while (true) { switch (currentElementContent) { case CONTENT_ANY: case CONTENT_MIXED: parsePCData(); break; case CONTENT_ELEMENTS: parseWhitespace(); break; } // Handle delimiters c = readCh(); switch (c) { case '&': // Found "&" c = readCh(); if (c == '#') { parseCharRef(); } else { unread(c); parseEntityRef(true); } break; case '<': // Found "<" c = readCh(); switch (c) { case '!': // Found "<!" c = readCh(); switch (c) { case '-': // Found "<!-" require('-'); parseComment(); break; case '[': // Found "<![" require("CDATA["); parseCDSect(); break; default: error("expected comment or CDATA section", c, null); break; } break; case '?': // Found "<?" dataBufferFlush(); parsePI(); break; case '/': // Found "</" dataBufferFlush(); parseETag(); return; default: // Found "<" followed by something else dataBufferFlush(); unread(c); parseElement(); break; } } } } /** * Parse an element type declaration. [40] elementdecl ::= ' <!ELEMENT' S * %Name S (%S S)? %contentspec S? '>' [VC: Unique Element Declaration] * *NOTE: the ' <!ELEMENT' has already been read. */ void parseElementdecl() throws java.lang.Exception { String name; requireWhitespace(); // Read the element type name. name = readNmtoken(true); requireWhitespace(); // Read the content model. parseContentspec(name); skipWhitespace(); require('>'); } /** * Content specification. 
[41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | * elements */ void parseContentspec(String name) throws java.lang.Exception { if (tryRead("EMPTY")) { setElement(name, CONTENT_EMPTY, null, null); return; } else if (tryRead("ANY")) { setElement(name, CONTENT_ANY, null, null); return; } else { require('('); dataBufferAppend('('); skipWhitespace(); if (tryRead("#PCDATA")) { dataBufferAppend("#PCDATA"); parseMixed(); setElement(name, CONTENT_MIXED, dataBufferToString(), null); } else { parseElements(); setElement(name, CONTENT_ELEMENTS, dataBufferToString(), null); } } } /** * Parse an element-content model. [42] elements ::= (choice | seq) ('?' | * '*' | '+')? [44] cps ::= S? %cp S? [45] choice ::= '(' S? %ctokplus (S? * '|' S? %ctoks)* S? ')' [46] ctokplus ::= cps ('|' cps)+ [47] ctoks ::= * cps ('|' cps)* [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')' [49] * stoks ::= cps (',' cps)* *NOTE: the opening '(' and S have already been * read. */ void parseElements() throws java.lang.Exception { char c; char sep; // Parse the first content particle skipWhitespace(); parseCp(); // Check for end or for a separator. skipWhitespace(); c = readCh(); switch (c) { case ')': dataBufferAppend(')'); c = readCh(); switch (c) { case '*': case '+': case '?': dataBufferAppend(c); break; default: unread(c); } return; case ',': // Register the separator. case '|': sep = c; dataBufferAppend(c); break; default: error("bad separator in content model", c, null); return; } // Parse the rest of the content model. while (true) { skipWhitespace(); parseCp(); skipWhitespace(); c = readCh(); if (c == ')') { dataBufferAppend(')'); break; } else if (c != sep) { error("bad separator in content model", c, null); return; } else { dataBufferAppend(c); } } // Check for the occurrence indicator. c = readCh(); switch (c) { case '?': case '*': case '+': dataBufferAppend(c); return; default: unread(c); return; } } /** * Parse a content particle. [43] cp ::= (Name | choice | seq) ('?' 
| '*' | * '+') *NOTE: I actually use a slightly different production here: cp ::= * (elements | (Name ('?' | '*' | '+')?)) */ void parseCp() throws java.lang.Exception { char c; if (tryRead('(')) { dataBufferAppend('('); parseElements(); } else { dataBufferAppend(readNmtoken(true)); c = readCh(); switch (c) { case '?': case '*': case '+': dataBufferAppend(c); break; default: unread(c); break; } } } /** * Parse mixed content. [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? * %Mtoks)* ) S? ')*' | '(' S? %('#PCDATA') S? ')' [51] Mtoks ::= %Name (S? * '|' S? %Name)* *NOTE: the S and '#PCDATA' have already been read. */ void parseMixed() throws java.lang.Exception { // Check for PCDATA alone. skipWhitespace(); if (tryRead(')')) { dataBufferAppend(")*"); tryRead('*'); return; } // Parse mixed content. skipWhitespace(); while (!tryRead(")*")) { require('|'); dataBufferAppend('|'); skipWhitespace(); dataBufferAppend(readNmtoken(true)); skipWhitespace(); } dataBufferAppend(")*"); } /** * Parse an attribute list declaration. [52] AttlistDecl ::= ' <!ATTLIST' S * %Name S? %AttDef+ S? '>' *NOTE: the ' <!ATTLIST' has already been read. */ void parseAttlistDecl() throws java.lang.Exception { String elementName; requireWhitespace(); elementName = readNmtoken(true); requireWhitespace(); while (!tryRead('>')) { parseAttDef(elementName); skipWhitespace(); } } /** * Parse a single attribute definition. [53] AttDef ::= S %Name S %AttType S * %Default */ void parseAttDef(String elementName) throws java.lang.Exception { String name; int type; String enum_ = null; // Read the attribute name. name = readNmtoken(true); // Read the attribute type. requireWhitespace(); type = readAttType(); // Get the string of enumerated values // if necessary. if (type == ATTRIBUTE_ENUMERATED || type == ATTRIBUTE_NOTATION) { enum_ = dataBufferToString(); } // Read the default value. requireWhitespace(); parseDefault(elementName, name, type, enum_); } /** * Parse the attribute type. 
[54] AttType ::= StringType | TokenizedType | * EnumeratedType [55] StringType ::= 'CDATA' [56] TokenizedType ::= 'ID' | * 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' [57] * EnumeratedType ::= NotationType | Enumeration* */ int readAttType() throws java.lang.Exception { String typeString; Integer type; if (tryRead('(')) { parseEnumeration(); return ATTRIBUTE_ENUMERATED; } else { typeString = readNmtoken(true); if (typeString.equals("NOTATION")) { parseNotationType(); } type = (Integer) attributeTypeHash.get(typeString); if (type == null) { error("illegal attribute type", typeString, null); return ATTRIBUTE_UNDECLARED; } else { return type.intValue(); } } } /** * Parse an enumeration. [60] Enumeration ::= '(' S? %Etoks (S? '|' S? * %Etoks)* S? ')' [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)* *NOTE: the * '(' has already been read. */ void parseEnumeration() throws java.lang.Exception { char c; dataBufferAppend('('); // Read the first token. skipWhitespace(); dataBufferAppend(readNmtoken(true)); // Read the remaining tokens. skipWhitespace(); while (!tryRead(')')) { require('|'); dataBufferAppend('|'); skipWhitespace(); dataBufferAppend(readNmtoken(true)); skipWhitespace(); } dataBufferAppend(')'); } /** * Parse a notation type for an attribute. [58] NotationType ::= %'NOTATION' * S '(' S? %Ntoks (S? '|' S? %Ntoks)* S? ')' [59] Ntoks ::= %Name (S? '|' * S? %Name) *NOTE: the 'NOTATION' has already been read */ void parseNotationType() throws java.lang.Exception { requireWhitespace(); require('('); parseEnumeration(); } /** * Parse the default value for an attribute. [62] Default ::= '#REQUIRED' | * '#IMPLIED' | ((%'#FIXED' S)? 
%AttValue */ void parseDefault(String elementName, String name, int type, String enum_) throws java.lang.Exception { int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; String value = null; boolean normalizeWSFlag; if (tryRead('#')) { if (tryRead("FIXED")) { valueType = ATTRIBUTE_DEFAULT_FIXED; requireWhitespace(); context = CONTEXT_ATTRIBUTEVALUE; value = readLiteral(LIT_CHAR_REF); context = CONTEXT_DTD; } else if (tryRead("REQUIRED")) { valueType = ATTRIBUTE_DEFAULT_REQUIRED; } else if (tryRead("IMPLIED")) { valueType = ATTRIBUTE_DEFAULT_IMPLIED; } else { error("illegal keyword for attribute default value", null, null); } } else { context = CONTEXT_ATTRIBUTEVALUE; value = readLiteral(LIT_CHAR_REF); context = CONTEXT_DTD; } setAttribute(elementName, name, type, enum_, value, valueType); } /** * Parse a conditional section. [63] conditionalSect ::= includeSect || * ignoreSect [64] includeSect ::= ' <![' %'INCLUDE' '[' (%markupdecl*)* * ']]>' [65] ignoreSect ::= ' <![' %'IGNORE' '[' ignoreSectContents* ']]>' * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]>')) | (' * <![' ignoreSectContents* ']]>') | (Char - (']' | [ <'"])) | (' <!' (Char - * ('-' | '['))) *NOTE: the ' <![' has already been read. */ void parseConditionalSect() throws java.lang.Exception { skipWhitespace(); if (tryRead("INCLUDE")) { skipWhitespace(); require('['); skipWhitespace(); while (!tryRead("]]>")) { parseMarkupdecl(); skipWhitespace(); } } else if (tryRead("IGNORE")) { skipWhitespace(); require('['); int nesting = 1; char c; for (int nest = 1; nest > 0;) { c = readCh(); switch (c) { case '<': if (tryRead("![")) { nest++; } case ']': if (tryRead("]>")) { nest--; } } } } else { error("conditional section must begin with INCLUDE or IGNORE", null, null); } } /** * Read a character reference. [67] CharRef ::= '&#' [0-9]+ ';' | '&#x' * [0-9a-fA-F]+ ';' *NOTE: the '&#' has already been read. 
*/
    void parseCharRef() throws java.lang.Exception {
        // NOTE(review): character-reference parsing is currently disabled; the
        // method returns without reading anything, so the digits and the ';'
        // that follow "&#" are left in the input for the caller. The original
        // implementation is kept below, commented out. Confirm whether this
        // is intentional before re-enabling.
        // NOTE(review): in the disabled code the surrogate computation ORs
        // with 0xd8 / 0xdc rather than 0xd800 / 0xdc00 and does not subtract
        // 0x10000 -- verify against the UTF-16 definition if it is restored.
        return ;
        /*
        int value = 0;
        char c;

        if (tryRead('x')) {
            loop1: while (true) {
                c = readCh();
                switch (c) {
                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                case 'a': case 'A': case 'b': case 'B': case 'c': case 'C':
                case 'd': case 'D': case 'e': case 'E': case 'f': case 'F':
                    value *= 16;
                    value += Integer.parseInt(new Character(c).toString(), 16);
                    break;
                case ';':
                    break loop1;
                default:
                    error("illegal character in character reference", c, null);
                    break loop1;
                }
            }
        } else {
            loop2: while (true) {
                c = readCh();
                switch (c) {
                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                    value *= 10;
                    value += Integer.parseInt(new Character(c).toString(), 10);
                    break;
                case ';':
                    break loop2;
                default:
                    error("illegal character in character reference", c, null);
                    break loop2;
                }
            }
        }

        // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
        // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
        if (value <= 0x0000ffff) {
            // no surrogates needed
            dataBufferAppend((char) value);
        } else if (value <= 0x000fffff) {
            // > 16 bits, surrogate needed
            dataBufferAppend((char) (0xd8 | ((value & 0x000ffc00) >> 10)));
            dataBufferAppend((char) (0xdc | (value & 0x0003ff)));
        } else {
            // too big for surrogate
            error("character reference " + value + " is too large for UTF-16",
                    new Integer(value).toString(), null);
        }
        */
    }

    /**
     * Parse a reference. [69] EntityRef ::= '&amp;' Name ';'
     * NOTE: the '&amp;' has already been read.
     *
     * @param externalAllowed
     *            External entities are allowed here.
*/ void parseEntityRef(boolean externalAllowed) throws java.lang.Exception { String name; name = readNmtoken(true); require(';'); switch (getEntityType(name)) { case ENTITY_UNDECLARED: error("reference to undeclared entity", name, null); break; case ENTITY_INTERNAL: pushString(name, getEntityValue(name)); break; case ENTITY_TEXT: if (externalAllowed) { pushURL(name, getEntityPublicId(name), getEntitySystemId(name), null, null, null); } else { error("reference to external entity in attribute value.", name, null); } break; case ENTITY_NDATA: if (externalAllowed) { error("data entity reference in content", name, null); } else { error("reference to external entity in attribute value.", name, null); } break; } } /** * Parse a parameter entity reference. [70] PEReference ::= '%' Name ';' * *NOTE: the '%' has already been read. */ void parsePEReference(boolean isEntityValue) throws java.lang.Exception { String name; name = "%" + readNmtoken(true); require(';'); switch (getEntityType(name)) { case ENTITY_UNDECLARED: error("reference to undeclared parameter entity", name, null); break; case ENTITY_INTERNAL: if (isEntityValue) { pushString(name, getEntityValue(name)); } else { pushString(name, " " + getEntityValue(name) + ' '); } break; case ENTITY_TEXT: if (isEntityValue) { pushString(null, " "); } pushURL(name, getEntityPublicId(name), getEntitySystemId(name), null, null, null); if (isEntityValue) { pushString(null, " "); } break; } } /** * Parse an entity declaration. [71] EntityDecl ::= ' <!ENTITY' S %Name S * %EntityDef S? '>' | ' <!ENTITY' S '%' S %Name S %EntityDef S? '>' [72] * EntityDef ::= EntityValue | ExternalDef [73] ExternalDef ::= ExternalID * %NDataDecl? [74] ExternalID ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S * PubidLiteral S SystemLiteral [75] NDataDecl ::= S %'NDATA' S %Name *NOTE: * the ' <!ENTITY' has already been read. 
*/ void parseEntityDecl() throws java.lang.Exception { char c; boolean peFlag = false; String name, value, notationName, ids[]; // Check for a parameter entity. requireWhitespace(); if (tryRead('%')) { peFlag = true; requireWhitespace(); } // Read the entity name, and prepend // '%' if necessary. name = readNmtoken(true); if (peFlag) { name = "%" + name; } // Read the entity value. requireWhitespace(); c = readCh(); unread(c); if (c == '"' || c == '\'') { // Internal entity. context = CONTEXT_ENTITYVALUE; value = readLiteral(LIT_CHAR_REF | LIT_PE_REF); context = CONTEXT_DTD; setInternalEntity(name, value); } else { // Read the external IDs ids = readExternalIds(false); if (ids[1] == null) { error("system identifer missing", name, null); } // Check for NDATA declaration. skipWhitespace(); if (tryRead("NDATA")) { requireWhitespace(); notationName = readNmtoken(true); setExternalDataEntity(name, ids[0], ids[1], notationName); } else { setExternalTextEntity(name, ids[0], ids[1]); } } // Finish the declaration. skipWhitespace(); require('>'); } /** * Parse a notation declaration. [81] NotationDecl ::= ' <!NOTATION' S %Name * S %ExternalID S? '>' *NOTE: the ' <!NOTATION' has already been read. */ void parseNotationDecl() throws java.lang.Exception { String nname, ids[]; requireWhitespace(); nname = readNmtoken(true); requireWhitespace(); // Read the external identifiers. ids = readExternalIds(true); if (ids[0] == null && ids[1] == null) { error("external identifer missing", nname, null); } // Register the notation. setNotation(nname, ids[0], ids[1]); skipWhitespace(); require('>'); } /** * Parse PCDATA. * * <pre> * * * * [16] PCData ::= [ˆ<&]* * * * * </pre> * * <p> * The trick here is that the data stays in the dataBuffer without * necessarily being converted to a string right away. 
*/ void parsePCData() throws java.lang.Exception { char c; // Start with a little cheat -- in most // cases, the entire sequence of // character data will already be in // the readBuffer; if not, fall through to // the normal approach. if (USE_CHEATS) { int lineAugment = 0; int columnAugment = 0; loop: for (int i = readBufferPos; i < readBufferLength; i++) { switch (readBuffer[i]) { case '\n': lineAugment++; columnAugment = 0; break; case '&': case '<': int start = readBufferPos; columnAugment++; readBufferPos = i; if (lineAugment > 0) { line += lineAugment; column = columnAugment; } else { column += columnAugment; } dataBufferAppend(readBuffer, start, i - start); return; default: columnAugment++; } } } // OK, the cheat didn't work; start over // and do it by the book. while (true) { c = readCh(); switch (c) { case '<': case '&': unread(c); return; default: dataBufferAppend(c); break; } } } ////////////////////////////////////////////////////////////////////// // High-level reading and scanning methods. ////////////////////////////////////////////////////////////////////// /** * Require whitespace characters. [1] S ::= (#x20 | #x9 | #xd | #xa)+ */ void requireWhitespace() throws java.lang.Exception { char c = readCh(); if (isWhitespace(c)) { skipWhitespace(); } else { error("whitespace expected", c, null); } } /** * Parse whitespace characters, and leave them in the data buffer. */ void parseWhitespace() throws java.lang.Exception { char c = readCh(); while (isWhitespace(c)) { dataBufferAppend(c); c = readCh(); } unread(c); } /** * Skip whitespace characters. [1] S ::= (#x20 | #x9 | #xd | #xa)+ */ void skipWhitespace() throws java.lang.Exception { // Start with a little cheat. Most of // the time, the white space will fall // within the current read buffer; if // not, then fall through. 
if (USE_CHEATS) { int lineAugment = 0; int columnAugment = 0; loop: for (int i = readBufferPos; i < readBufferLength; i++) { switch (readBuffer[i]) { case ' ': case '\t': case '\r': columnAugment++; break; case '\n': lineAugment++; columnAugment = 0; break; case '%': if (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE) { break loop; } // else fall through... default: readBufferPos = i; if (lineAugment > 0) { line += lineAugment; column = columnAugment; } else { column += columnAugment; } return; } } } // OK, do it by the book. char c = readCh(); while (isWhitespace(c)) { c = readCh(); } unread(c); } /** * Read a name or name token. [5] Name ::= (Letter | '_' | ':') (NameChar)* * [7] Nmtoken ::= (NameChar)+ *NOTE: [6] is implemented implicitly where * required. */ String readNmtoken(boolean isName) throws java.lang.Exception { char c; if (USE_CHEATS) { loop: for (int i = readBufferPos; i < readBufferLength; i++) { switch (readBuffer[i]) { case '%': if (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE) { break loop; } // else fall through... case '<': case '>': case '&': case ',': case '|': case '*': case '+': case '?': case ')': case '=': case '\'': case '"': case '[': case ' ': case '\t': case '\r': case '\n': case ';': case '/': case '#': int start = readBufferPos; if (i == start) { error("name expected", readBuffer[i], null); } readBufferPos = i; return intern(readBuffer, start, i - start); } } } nameBufferPos = 0; // Read the first character. 
loop: while (true) { c = readCh(); switch (c) { case '%': case '<': case '>': case '&': case ',': case '|': case '*': case '+': case '?': case ')': case '=': case '\'': case '"': case '[': case ' ': case '\t': case '\n': case '\r': case ';': case '/': unread(c); if (nameBufferPos == 0) { error("name expected", null, null); } String s = intern(nameBuffer, 0, nameBufferPos); nameBufferPos = 0; return s; default: nameBuffer = (char[]) extendArray(nameBuffer, nameBuffer.length, nameBufferPos); nameBuffer[nameBufferPos++] = c; } } } /** * Read a literal. [10] AttValue ::= '"' ([^ <&"] | Reference)* '"' | "'" * ([^ <&'] | Reference)* "'" [11] SystemLiteral ::= '"' URLchar* '"' | "'" * (URLchar - "'")* "'" [13] PubidLiteral ::= '"' PubidChar* '"' | "'" * (PubidChar - "'")* "'" [9] EntityValue ::= '"' ([^%&"] | PEReference | * Reference)* '"' | "'" ([^%&'] | PEReference | Reference)* "'" */ String readLiteral(int flags) throws java.lang.Exception { char delim, c; int startLine = line; // Find the delimiter. delim = readCh(); if (delim != '"' && delim != '\'' && delim != (char) 0) { error("expected '\"' or \"'\"", delim, null); return null; } // Read the literal. try { c = readCh(); loop: while (c != delim) { switch (c) { // Literals never have line ends case '\n': case '\r': c = ' '; break; // References may be allowed case '&': if ((flags & LIT_CHAR_REF) > 0) { c = readCh(); if (c == '#') { parseCharRef(); c = readCh(); continue loop; // check the next character } else if ((flags & LIT_ENTITY_REF) > 0) { unread(c); parseEntityRef(false); c = readCh(); continue loop; } else { dataBufferAppend('&'); } } break; default: break; } dataBufferAppend(c); c = readCh(); } } catch (EOFException e) { error("end of input while looking for delimiter (started on line " + startLine + ')', null, new Character(delim).toString()); } // Normalise whitespace if necessary. if ((flags & LIT_NORMALIZE) > 0) { dataBufferNormalize(); } // Return the value. 
return dataBufferToString(); } /** * Try reading external identifiers. * <p> * The system identifier is not required for notations. * * @param inNotation * Are we in a notation? * @return A two-member String array containing the identifiers. */ String[] readExternalIds(boolean inNotation) throws java.lang.Exception { char c; String ids[] = new String[2]; if (tryRead("PUBLIC")) { requireWhitespace(); ids[0] = readLiteral(LIT_NORMALIZE); // public id if (inNotation) { skipWhitespace(); if (tryRead('"') || tryRead('\'')) { ids[1] = readLiteral(0); } } else { requireWhitespace(); ids[1] = readLiteral(0); // system id } } else if (tryRead("SYSTEM")) { requireWhitespace(); ids[1] = readLiteral(0); // system id } return ids; } /** * Test if a character is whitespace. * * <pre> * * * * [1] S ::= (#x20 | #x9 | #xd | #xa)+ * * * * </pre> * * @param c * The character to test. * @return true if the character is whitespace. */ final boolean isWhitespace(char c) { switch ((int) c) { case 0x20: case 0x09: case 0x0d: case 0x0a: return true; default: return false; } } ////////////////////////////////////////////////////////////////////// // Utility routines. ////////////////////////////////////////////////////////////////////// /** * Add a character to the data buffer. */ void dataBufferAppend(char c) { // Expand buffer if necessary. dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length, dataBufferPos); dataBuffer[dataBufferPos++] = c; } /** * Add a string to the data buffer. */ void dataBufferAppend(String s) { dataBufferAppend(s.toCharArray(), 0, s.length()); } /** * Append (part of) a character array to the data buffer. */ void dataBufferAppend(char ch[], int start, int length) { dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length, dataBufferPos + length); System.arraycopy(ch, start, dataBuffer, dataBufferPos, length); dataBufferPos += length; } /** * Normalise whitespace in the data buffer. 
*/ void dataBufferNormalize() { int i = 0; int j = 0; int end = dataBufferPos; // Skip whitespace at the start. while (j < end && isWhitespace(dataBuffer[j])) { j++; } // Skip whitespace at the end. while (end > j && isWhitespace(dataBuffer[end - 1])) { end--; } // Start copying to the left. while (j < end) { char c = dataBuffer[j++]; // Normalise all other whitespace to // a single space. if (isWhitespace(c)) { while (j < end && isWhitespace(dataBuffer[j++])) { } dataBuffer[i++] = ' '; dataBuffer[i++] = dataBuffer[j - 1]; } else { dataBuffer[i++] = c; } } // The new length is <= the old one. dataBufferPos = i; } /** * Convert the data buffer to a string. * * @param internFlag * true if the contents should be interned. * @see #intern(char[],int,int) */ String dataBufferToString() { String s = new String(dataBuffer, 0, dataBufferPos); dataBufferPos = 0; return s; } /** * Flush the contents of the data buffer to the handler, if appropriate, and * reset the buffer for new input. */ void dataBufferFlush() throws java.lang.Exception { if (dataBufferPos > 0) { switch (currentElementContent) { case CONTENT_UNDECLARED: case CONTENT_EMPTY: // do nothing break; case CONTENT_MIXED: case CONTENT_ANY: if (handler != null) { handler.charData(dataBuffer, 0, dataBufferPos); } break; case CONTENT_ELEMENTS: if (handler != null) { handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos); } break; } dataBufferPos = 0; } } /** * Require a string to appear, or throw an exception. */ void require(String delim) throws java.lang.Exception { char ch[] = delim.toCharArray(); for (int i = 0; i < ch.length; i++) { require(ch[i]); } } /** * Require a character to appear, or throw an exception. */ void require(char delim) throws java.lang.Exception { char c = readCh(); if (c != delim) { error("expected character", c, new Character(delim).toString()); } } /** * Return an internalised version of a string. 
* <p> * Ælfred uses this method to create an internalised version of all * names and attribute values, so that it can test equality with * <code>==</code> instead of <code>String.equals()</code>. * <p> * If you want to be able to test for equality in the same way, you can use * this method to internalise your own strings first: * * <pre> * String PARA = handler.intern("PARA"); * </pre> * * <p> * Note that this will not return the same results as String.intern(). * * @param s * The string to internalise. * @return An internalised version of the string. * @see #intern(char[],int,int) * @see java.lang.String#intern */ public String intern(String s) { char ch[] = s.toCharArray(); return intern(ch, 0, ch.length); } /** * Create an internalised string from a character array. * <p> * This is much more efficient than constructing a non-internalised string * first, and then internalising it. * <p> * Note that this will not return the same results as String.intern(). * * @param ch * an array of characters for building the string. * @param start * the starting position in the array. * @param length * the number of characters to place in the string. * @return an internalised string. * @see #intern(String) * @see java.lang.String#intern */ public String intern(char ch[], int start, int length) { int index; int hash = 0; // Generate a hash code. for (int i = start; i < start + length; i++) { hash = ((hash << 1) & 0xffffff) + (int) ch[i]; } hash = hash % SYMBOL_TABLE_LENGTH; // Get the bucket. Object bucket[] = (Object[]) symbolTable[hash]; if (bucket == null) { symbolTable[hash] = bucket = new Object[8]; } // Search for a matching tuple, and // return the string if we find one. for (index = 0; index < bucket.length; index += 2) { char chFound[] = (char[]) bucket[index]; // Stop when we hit a null index. if (chFound == null) { break; } // If they're the same length, // check for a match. // If the loop finishes, 'index' will // contain the current bucket // position. 
if (chFound.length == length) { for (int i = 0; i < chFound.length; i++) { // Stop if there are no more tuples. if (ch[start + i] != chFound[i]) { break; } else if (i == length - 1) { // That's it, we have a match! return (String) bucket[index + 1]; } } } } // Not found -- we'll have to add it. // Do we have to grow the bucket? bucket = (Object[]) extendArray(bucket, bucket.length, index); // OK, add it to the end of the // bucket. String s = new String(ch, start, length); bucket[index] = s.toCharArray(); bucket[index + 1] = s; symbolTable[hash] = bucket; return s; } /** * Ensure the capacity of an array, allocating a new one if necessary. */ Object extendArray(Object array, int currentSize, int requiredSize) { if (requiredSize < currentSize) { return array; } else { Object newArray = null; int newSize = currentSize * 2; if (newSize <= requiredSize) { newSize = requiredSize + 1; } if (array instanceof char[]) { newArray = new char[currentSize * 2]; } else if (array instanceof Object[]) { newArray = new Object[currentSize * 2]; } System.arraycopy(array, 0, newArray, 0, currentSize); return newArray; } } ////////////////////////////////////////////////////////////////////// // XML query routines. ////////////////////////////////////////////////////////////////////// // // Elements // /** * Get the declared elements for an XML document. * <p> * The results will be valid only after the DTD (if any) has been parsed. * * @return An enumeration of all element types declared for this document * (as Strings). * @see #getElementContentType * @see #getElementContentModel */ public Enumeration declaredElements() { return elementInfo.keys(); } /** * Look up the content type of an element. * * @param name * The element type name. * @return An integer constant representing the content type. 
* @see #getElementContentModel * @see #CONTENT_UNDECLARED * @see #CONTENT_ANY * @see #CONTENT_EMPTY * @see #CONTENT_MIXED * @see #CONTENT_ELEMENTS */ public int getElementContentType(String name) { Object element[] = (Object[]) elementInfo.get(name); if (element == null) { return CONTENT_UNDECLARED; } else { return ((Integer) element[0]).intValue(); } } /** * Look up the content model of an element. * <p> * The result will always be null unless the content type is * CONTENT_ELEMENTS or CONTENT_MIXED. * * @param name * The element type name. * @return The normalised content model, as a string. * @see #getElementContentType */ public String getElementContentModel(String name) { Object element[] = (Object[]) elementInfo.get(name); if (element == null) { return null; } else { return (String) element[1]; } } /** * Register an element. Array format: element type attribute hash table */ void setElement(String name, int contentType, String contentModel, Hashtable attributes) throws java.lang.Exception { Object element[]; // Try looking up the element element = (Object[]) elementInfo.get(name); // Make a new one if necessary. if (element == null) { element = new Object[3]; element[0] = new Integer(CONTENT_UNDECLARED); element[1] = null; element[2] = null; } else if (contentType != CONTENT_UNDECLARED && ((Integer) element[0]).intValue() != CONTENT_UNDECLARED) { error("multiple declarations for element type", name, null); return; } // Insert the content type, if any. if (contentType != CONTENT_UNDECLARED) { element[0] = new Integer(contentType); } // Insert the content model, if any. if (contentModel != null) { element[1] = contentModel; } // Insert the attributes, if any. if (attributes != null) { element[2] = attributes; } // Save the element info. elementInfo.put(name, element); } /** * Look up the attribute hash table for an element. The hash table is the * second item in the element array. 
*/ Hashtable getElementAttributes(String name) { Object element[] = (Object[]) elementInfo.get(name); if (element == null) { return null; } return (Hashtable) element[2]; } // // Attributes // /** * Get the declared attributes for an element type. * * @param elname * The name of the element type. * @return An Enumeration of all the attributes declared for a specific * element type. The results will be valid only after the DTD (if * any) has been parsed. * @see #getAttributeType * @see #getAttributeEnumeration * @see #getAttributeDefaultValueType * @see #getAttributeDefaultValue * @see #getAttributeExpandedValue */ public Enumeration declaredAttributes(String elname) { Hashtable attlist = getElementAttributes(elname); if (attlist == null) { return null; } else { return attlist.keys(); } } /** * Retrieve the declared type of an attribute. * * @param name * The name of the associated element. * @param aname * The name of the attribute. * @return An integer constant representing the attribute type. * @see #ATTRIBUTE_UNDECLARED * @see #ATTRIBUTE_CDATA * @see #ATTRIBUTE_ID * @see #ATTRIBUTE_IDREF * @see #ATTRIBUTE_IDREFS * @see #ATTRIBUTE_ENTITY * @see #ATTRIBUTE_ENTITIES * @see #ATTRIBUTE_NMTOKEN * @see #ATTRIBUTE_NMTOKENS * @see #ATTRIBUTE_ENUMERATED * @see #ATTRIBUTE_NOTATION */ public int getAttributeType(String name, String aname) { Object attribute[] = getAttribute(name, aname); if (attribute == null) { return ATTRIBUTE_UNDECLARED; } else { return ((Integer) attribute[0]).intValue(); } } /** * Retrieve the allowed values for an enumerated attribute type. * * @param name * The name of the associated element. * @param aname * The name of the attribute. * @return A string containing the token list. 
* @see #ATTRIBUTE_ENUMERATED * @see #ATTRIBUTE_NOTATION */ public String getAttributeEnumeration(String name, String aname) { Object attribute[] = getAttribute(name, aname); if (attribute == null) { return null; } else { return (String) attribute[3]; } } /** * Retrieve the default value of a declared attribute. * * @param name * The name of the associated element. * @param aname * The name of the attribute. * @return The default value, or null if the attribute was #IMPLIED or * simply undeclared and unspecified. * @see #getAttributeExpandedValue */ public String getAttributeDefaultValue(String name, String aname) { Object attribute[] = getAttribute(name, aname); if (attribute == null) { return null; } else { return (String) attribute[1]; } } /** * Retrieve the expanded value of a declared attribute. * <p> * All general entities will be expanded. * * @param name * The name of the associated element. * @param aname * The name of the attribute. * @return The expanded default value, or null if the attribute was #IMPLIED * or simply undeclared * @see #getAttributeDefaultValue */ public String getAttributeExpandedValue(String name, String aname) { Object attribute[] = getAttribute(name, aname); if (attribute == null) { return null; } else if (attribute[4] == null && attribute[1] != null) { try { pushString(null, (char) 0 + (String) attribute[1] + (char) 0); attribute[4] = readLiteral(LIT_NORMALIZE | LIT_CHAR_REF | LIT_ENTITY_REF); } catch (Exception e) { } } return (String) attribute[4]; } /** * Retrieve the default value type of a declared attribute. 
* * @see #ATTRIBUTE_DEFAULT_SPECIFIED * @see #ATTRIBUTE_DEFAULT_IMPLIED * @see #ATTRIBUTE_DEFAULT_REQUIRED * @see #ATTRIBUTE_DEFAULT_FIXED */ public int getAttributeDefaultValueType(String name, String aname) { Object attribute[] = getAttribute(name, aname); if (attribute == null) { return ATTRIBUTE_DEFAULT_UNDECLARED; } else { return ((Integer) attribute[2]).intValue(); } } /** * Register an attribute declaration for later retrieval. Format: - String * type - String default value - int value type */ void setAttribute(String elName, String name, int type, String enumeration, String value, int valueType) throws java.lang.Exception { Hashtable attlist; Object attribute[]; // Create a new hashtable if necessary. attlist = getElementAttributes(elName); if (attlist == null) { attlist = new Hashtable(); } // Check that the attribute doesn't // already exist! if (attlist.get(name) != null) { return; } else { attribute = new Object[5]; attribute[0] = new Integer(type); attribute[1] = value; attribute[2] = new Integer(valueType); attribute[3] = enumeration; attribute[4] = null; attlist.put(name.intern(), attribute); // Use CONTENT_UNDECLARED to avoid overwriting // existing element declaration. setElement(elName, CONTENT_UNDECLARED, null, attlist); } } /** * Retrieve the three-member array representing an attribute declaration. */ Object[] getAttribute(String elName, String name) { Hashtable attlist; Object attribute[]; attlist = getElementAttributes(elName); if (attlist == null) { return null; } attribute = (Object[]) attlist.get(name); return attribute; } // // Entities // /** * Get declared entities. * * @return An Enumeration of all the entities declared for this XML * document. The results will be valid only after the DTD (if any) * has been parsed. 
* @see #getEntityType * @see #getEntityPublicId * @see #getEntitySystemId * @see #getEntityValue * @see #getEntityNotationName */ public Enumeration declaredEntities() { return entityInfo.keys(); } /** * Find the type of an entity. * * @returns An integer constant representing the entity type. * @see #ENTITY_UNDECLARED * @see #ENTITY_INTERNAL * @see #ENTITY_NDATA * @see #ENTITY_TEXT */ public int getEntityType(String ename) { Object entity[] = (Object[]) entityInfo.get(ename); if (entity == null) { return ENTITY_UNDECLARED; } return ((Integer) entity[0]).intValue(); } /** * Return an external entity's public identifier, if any. * * @param ename * The name of the external entity. * @return The entity's system identifier, or null if the entity was not * declared, if it is not an external entity, or if no public * identifier was provided. * @see #getEntityType */ public String getEntityPublicId(String ename) { Object entity[] = (Object[]) entityInfo.get(ename); if (entity == null) { return null; } return (String) entity[1]; } /** * Return an external entity's system identifier. * * @param ename * The name of the external entity. * @return The entity's system identifier, or null if the entity was not * declared, or if it is not an external entity. * @see #getEntityType */ public String getEntitySystemId(String ename) { Object entity[] = (Object[]) entityInfo.get(ename); if (entity == null) { return null; } return (String) entity[2]; } /** * Return the value of an internal entity. * * @param ename * The name of the internal entity. * @return The entity's value, or null if the entity was not declared, or if * it is not an internal entity. * @see #getEntityType */ public String getEntityValue(String ename) { Object entity[] = (Object[]) entityInfo.get(ename); if (entity == null) { return null; } return (String) entity[3]; } /** * Get the notation name associated with an NDATA entity. * * @param ename * The NDATA entity name. 
* @return The associated notation name, or null if the entity was not * declared, or if it is not an NDATA entity. * @see #getEntityType */ public String getEntityNotationName(String eName) { Object entity[] = (Object[]) entityInfo.get(eName); if (entity == null) { return null; } return (String) entity[4]; } /** * Register an entity declaration for later retrieval. */ void setInternalEntity(String eName, String value) { setEntity(eName, ENTITY_INTERNAL, null, null, value, null); } /** * Register an external data entity. */ void setExternalDataEntity(String eName, String pubid, String sysid, String nName) { setEntity(eName, ENTITY_NDATA, pubid, sysid, null, nName); } /** * Register an external text entity. */ void setExternalTextEntity(String eName, String pubid, String sysid) { setEntity(eName, ENTITY_TEXT, pubid, sysid, null, null); } /** * Register an entity declaration for later retrieval. */ void setEntity(String eName, int eClass, String pubid, String sysid, String value, String nName) { Object entity[]; if (entityInfo.get(eName) == null) { entity = new Object[5]; entity[0] = new Integer(eClass); entity[1] = pubid; entity[2] = sysid; entity[3] = value; entity[4] = nName; entityInfo.put(eName, entity); } } // // Notations. // /** * Get declared notations. * * @return An Enumeration of all the notations declared for this XML * document. The results will be valid only after the DTD (if any) * has been parsed. * @see #getNotationPublicId * @see #getNotationSystemId */ public Enumeration declaredNotations() { return notationInfo.keys(); } /** * Look up the public identifier for a notation. You will normally use this * method to look up a notation that was provided as an attribute value or * for an NDATA entity. * * @param nname * The name of the notation. * @return A string containing the public identifier, or null if none was * provided or if no such notation was declared. 
* @see #getNotationSystemId */ public String getNotationPublicId(String nname) { Object notation[] = (Object[]) notationInfo.get(nname); if (notation == null) { return null; } return (String) notation[0]; } /** * Look up the system identifier for a notation. You will normally use this * method to look up a notation that was provided as an attribute value or * for an NDATA entity. * * @param nname * The name of the notation. * @return A string containing the system identifier, or null if no such * notation was declared. * @see #getNotationPublicId */ public String getNotationSystemId(String nname) { Object notation[] = (Object[]) notationInfo.get(nname); if (notation == null) { return null; } return (String) notation[1]; } /** * Register a notation declaration for later retrieval. Format: - public id - * system id */ void setNotation(String nname, String pubid, String sysid) throws java.lang.Exception { Object notation[]; if (notationInfo.get(nname) == null) { notation = new Object[2]; notation[0] = pubid; notation[1] = sysid; notationInfo.put(nname, notation); } else { error("multiple declarations of notation", nname, null); } } // // Location. // /** * Return the current line number. */ public int getLineNumber() { return line; } /** * Return the current column number. */ public int getColumnNumber() { return column; } ////////////////////////////////////////////////////////////////////// // High-level I/O. ////////////////////////////////////////////////////////////////////// /** * Read a single character from the readBuffer. * <p> * The readDataChunk() method maintains the buffer. * <p> * If we hit the end of an entity, try to pop the stack and keep going. * <p> * (This approach doesn't really enforce XML's rules about entity * boundaries, but this is not currently a validating parser). * <p> * This routine also attempts to keep track of the current position in * external entities, but it's not entirely accurate. * * @return The next available input character. 
* @see #unread(char)
     * @see #unread(String)
     * @see #readDataChunk
     * @see #readBuffer
     * @see #line
     */
    char readCh() throws java.lang.Exception {
        char c;

        // As long as there's nothing in the read buffer, try reading more
        // data (for an external entity) or popping the entity stack (for
        // either).
        while (readBufferPos >= readBufferLength) {
            switch (sourceType) {
            case INPUT_READER:
            case INPUT_EXTERNAL:
            case INPUT_STREAM:
                // Stream-backed source: refill, and if nothing arrives pop
                // back to the enclosing input and retry there.
                readDataChunk();
                while (readBufferLength < 1) {
                    popInput();
                    if (readBufferLength < 1) {
                        readDataChunk();
                    }
                }
                break;

            default:
                // Any other source type cannot be refilled; pop it.
                popInput();
                break;
            }
        }

        c = readBuffer[readBufferPos++];

        // This is a particularly nasty bit of code, that checks for a
        // parameter entity reference but peeks ahead to catch the '%' in
        // parameter entity declarations.
        if (c == '%' && (context == CONTEXT_DTD || context == CONTEXT_ENTITYVALUE)) {
            char c2 = readCh();
            unread(c2);
            if (!isWhitespace(c2)) {
                // '%' directly followed by a non-space character: expand the
                // reference and return the first character after expansion.
                parsePEReference(context == CONTEXT_ENTITYVALUE);
                return readCh();
            }
        }

        // Track the position: a newline starts a new line at column 0.
        if (c == '\n') {
            line++;
            column = 0;
        } else {
            column++;
        }

        return c;
    }

    /**
     * Push a single character back onto the current input stream.
     * <p>
     * This method usually pushes the character back onto the readBuffer, while
     * the unread(String) method treats the string as a new internal entity.
     * <p>
     * I don't think that this would ever be called with readBufferPos = 0,
     * because the methods always reads a character before unreading it, but
     * just in case, I've added a boundary condition.
     *
     * @param c
     *            The character to push back.
     * @see #readCh
     * @see #unread(String)
     * @see #unread(char[])
     * @see #readBuffer
     */
    void unread(char c) throws java.lang.Exception {
        // Undo the line count for a pushed-back newline; the column is set
        // to -1 because the previous line's length is not tracked.
        if (c == '\n') {
            line--;
            column = -1;
        }
        // Normal condition: step back inside the current buffer.
        if (readBufferPos > 0) {
            readBuffer[--readBufferPos] = c;
        } else {
            // Boundary condition: no room in the buffer, so push the
            // character as a new one-character input.
            pushString(null, new Character(c).toString());
        }
    }

    /**
     * Push a char array back onto the current input stream.
* <p>
     * NOTE: you must <em>never</em> push back characters that you haven't
     * actually read: use pushString() instead.
     * <p>
     * When fewer characters are pushed back than have been consumed from the
     * current buffer, the read position is simply moved back by
     * <code>length</code>; otherwise the array is pushed as a new input
     * source and the source type is set to INPUT_BUFFER.
     *
     * @param ch
     *            The characters to push back.
     * @param length
     *            The number of characters from the start of <code>ch</code>
     *            to push back.
     * @see #readCh
     * @see #unread(char)
     * @see #unread(String)
     * @see #readBuffer
     * @see #pushString
     */
    void unread(char ch[], int length) throws java.lang.Exception {
        // Undo the line accounting for every newline being returned.
        for (int i = 0; i < length; i++) {
            if (ch[i] == '\n') {
                line--;
                column = -1;
            }
        }
        if (length < readBufferPos) {
            // The characters are still in the buffer: just rewind.
            readBufferPos -= length;
        } else {
            pushCharArray(null, ch, 0, length);
            // pushCharArray() marks the source INPUT_INTERNAL; override it.
            sourceType = INPUT_BUFFER;
        }
    }

    /**
     * Push a new external input source.
     * <p>
     * The source will be either an external text entity, or the DTD external
     * subset.
     * <p>
     * The current input state is saved with pushInput(), a fresh read buffer
     * is allocated, and the system identifier is made absolute. The handler
     * (if any) may then redirect the system identifier or supply its own
     * byte or character stream through resolveEntity(). Input is taken, in
     * order of preference, from the character stream, the byte stream, or a
     * connection opened to the system identifier.
     * <p>
     * TO DO: Right now, this method always attempts to autodetect the encoding
     * when none is supplied; it should also look at the headers with an HTTP
     * connection.
     *
     * @param ename
     *            The entity name handed to pushInput() for the recursion
     *            check.
     * @param publicId
     *            The public identifier passed to the handler's
     *            resolveEntity().
     * @param systemId
     *            The system identifier (URI) of the entity; may be null when
     *            a stream or reader is supplied.
     * @param reader
     *            A character stream to read from, or null.
     * @param stream
     *            A byte stream to read from, or null.
     * @param encoding
     *            The suggested encoding name, or null to autodetect.
     * @see XmlHandler#resolveEntity
     * @see #pushString
     * @see #sourceType
     * @see #pushInput
     * @see #detectEncoding
     * @see #readBuffer
     */
    void pushURL(String ename, String publicId, String systemId,
            Reader reader, InputStream stream, String encoding)
            throws java.lang.Exception {
        URL url;
        boolean ignoreEncoding = false;

        // Push the existing status.
        pushInput(ename);

        // Create a new read buffer.
        // (Note the four-character margin)
        readBuffer = new char[READ_BUFFER_MAX + 4];
        readBufferPos = 0;
        readBufferLength = 0;
        readBufferOverflow = -1;
        is = null;
        line = 1;

        currentByteCount = 0;

        // Flush any remaining data.
        dataBufferFlush();

        // Make the URL absolute: relative to the entity currently being
        // read if there is one, otherwise relative to the base URI.
        if (systemId != null && externalEntity != null) {
            systemId = new URL(externalEntity.getURL(), systemId).toString();
        } else if (baseURI != null) {
            try {
                systemId = new URL(new URL(baseURI), systemId).toString();
            } catch (Exception e) {
                // Best effort: on failure systemId is left as it was.
            }
        }

        // See if the application wants to
        // redirect the system ID and/or
        // supply its own character stream.
        if (systemId != null && handler != null) {
            Object input = handler.resolveEntity(publicId, systemId);
            if (input != null) {
                if (input instanceof String) {
                    systemId = (String) input;
                } else if (input instanceof InputStream) {
                    stream = (InputStream) input;
                } else if (input instanceof Reader) {
                    reader = (Reader) input;
                }
            }
        }

        // Start the entity.
        if (handler != null) {
            if (systemId != null) {
                handler.startExternalEntity(systemId);
            } else {
                handler.startExternalEntity("[external stream]");
            }
        }

        // Figure out what we're reading from.
        if (reader != null) {
            // There's an explicit character stream: no byte decoding is
            // needed, so any declared encoding is ignored.
            sourceType = INPUT_READER;
            this.reader = reader;
            tryEncodingDecl(true);
            return;
        } else if (stream != null) {
            sourceType = INPUT_STREAM;
            is = stream;
        } else {
            // We have to open our own stream
            // to the URL.

            // Set the new status
            sourceType = INPUT_EXTERNAL;
            url = new URL(systemId);

            externalEntity = url.openConnection();
            externalEntity.connect();
            is = externalEntity.getInputStream();
        }

        // If we get to here, there must be
        // an InputStream available.
        // detectEncoding() relies on mark()/reset(), hence the wrapper.
        if (!is.markSupported()) {
            is = new BufferedInputStream(is);
        }

        // Attempt to detect the encoding.
        // NOTE(review): URLConnection.getContentEncoding() reports the
        // Content-Encoding header (e.g. "gzip"), not the charset of the
        // Content-Type header -- confirm that this is what was intended.
        if (encoding == null && externalEntity != null) {
            encoding = externalEntity.getContentEncoding();
        }

        if (encoding != null) {
            checkEncoding(encoding, false);
            ignoreEncoding = true;
        } else {
            detectEncoding();
            ignoreEncoding = false;
        }

        // Read an XML or text declaration.
        tryEncodingDecl(ignoreEncoding);
    }

    /**
     * Check for an encoding declaration.
     * <p>
     * If the input starts with "&lt;?xml" followed by whitespace, a text
     * declaration is parsed when other input sources are already stacked,
     * and an XML declaration otherwise. If "&lt;?xml" is not followed by
     * whitespace, the three characters "xml" are pushed back and the
     * construct is parsed as an ordinary processing instruction.
     *
     * @param ignoreEncoding
     *            Passed through to the declaration parser.
     */
    void tryEncodingDecl(boolean ignoreEncoding) throws java.lang.Exception {
        // Read the XML/Encoding declaration.
        if (tryRead("<?xml")) {
            if (tryWhitespace()) {
                if (inputStack.size() > 0) {
                    parseTextDecl(ignoreEncoding);
                } else {
                    parseXMPDecl(ignoreEncoding);
                }
            } else {
                // A PI whose target merely starts with "xml".
                unread("xml".toCharArray(), 3);
                parsePI();
            }
        }
    }

    /**
     * Attempt to detect the encoding of an entity.
* <p> * The trick here (as suggested in the XML standard) is that any entity not * in UTF-8, or in UCS-2 with a byte-order mark, <b>must </b> begin with an * XML declaration or an encoding declaration; we simply have to look for * "<?XML" in various encodings. * <p> * This method has no way to distinguish among 8-bit encodings. Instead, it * assumes UTF-8, then (possibly) revises its assumption later in * checkEncoding(). Any ASCII-derived 8-bit encoding should work, but most * will be rejected later by checkEncoding(). * <p> * I don't currently detect EBCDIC, since I'm concerned that it could also * be a valid UTF-8 sequence; I'll have to do more checking later. * * @see #tryEncoding(byte[], byte, byte, byte, byte) * @see #tryEncoding(byte[], byte, byte) * @see #checkEncoding * @see #read8bitEncodingDeclaration */ void detectEncoding() throws java.lang.Exception { byte signature[] = new byte[4]; // Read the first four bytes for // autodetection. is.mark(4); is.read(signature); is.reset(); // Look for a known signature. if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x3c)) { // UCS-4 must begin with "<!XML" // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234) encoding = ENCODING_UCS_4_1234; } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, (byte) 0x00, (byte) 0x00)) { // UCS-4 must begin with "<!XML" // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321) encoding = ENCODING_UCS_4_4321; } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x3c, (byte) 0x00)) { // UCS-4 must begin with "<!XML" // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143) encoding = ENCODING_UCS_4_2143; } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, (byte) 0x00, (byte) 0x00)) { // UCS-4 must begin with "<!XML" // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421) encoding = ENCODING_UCS_4_3412; } else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) { // UCS-2 with a byte-order marker. 
// 0xfe 0xff: UCS-2, big-endian (12) encoding = ENCODING_UCS_2_12; is.read(); is.read(); } else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) { // UCS-2 with a byte-order marker. // 0xff 0xfe: UCS-2, little-endian (21) encoding = ENCODING_UCS_2_21; is.read(); is.read(); } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, (byte) 0x00, (byte) 0x3f)) { // UCS-2 without a BOM must begin with "<?XML" // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark encoding = ENCODING_UCS_2_12; error("no byte-order mark for UCS-2 entity", null, null); } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, (byte) 0x3f, (byte) 0x00)) { // UCS-2 without a BOM must begin with "<?XML" // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark encoding = ENCODING_UCS_2_21; error("no byte-order mark for UCS-2 entity", null, null); } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f, (byte) 0x78, (byte) 0x6d)) { // Some kind of 8-bit encoding with "<?XML" // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING) encoding = ENCODING_UTF_8; read8bitEncodingDeclaration(); } else { // Some kind of 8-bit encoding without "<?XML" // (otherwise) UTF-8 without encoding/XML declaration encoding = ENCODING_UTF_8; } } /** * Check for a four-byte signature. * <p> * Utility routine for detectEncoding(). * <p> * Always looks for some part of " <?XML" in a specific encoding. * * @param sig * The first four bytes read. * @param b1 * The first byte of the signature * @param b2 * The second byte of the signature * @param b3 * The third byte of the signature * @param b4 * The fourth byte of the signature * @see #detectEncoding */ boolean tryEncoding(byte sig[], byte b1, byte b2, byte b3, byte b4) { return (sig[0] == b1 && sig[1] == b2 && sig[2] == b3 && sig[3] == b4); } /** * Check for a two-byte signature. * <p> * Looks for a UCS-2 byte-order mark. * <p> * Utility routine for detectEncoding(). * * @param sig * The first four bytes read. 
* @param b1 * The first byte of the signature * @param b2 * The second byte of the signature * @see #detectEncoding */ boolean tryEncoding(byte sig[], byte b1, byte b2) { return ((sig[0] == b1) && (sig[1] == b2)); } /** * This method pushes a string back onto input. * <p> * It is useful either as the expansion of an internal entity, or for * backtracking during the parse. * <p> * Call pushCharArray() to do the actual work. * * @param s * The string to push back onto input. * @see #pushCharArray */ void pushString(String ename, String s) throws java.lang.Exception { char ch[] = s.toCharArray(); pushCharArray(ename, ch, 0, ch.length); } /** * Push a new internal input source. * <p> * This method is useful for expanding an internal entity, or for unreading * a string of characters. It creates a new readBuffer containing the * characters in the array, instead of characters converted from an input * byte stream. * <p> * I've added a couple of optimisations: don't push zero- length strings, * and just push back a single character for 1-character strings; this * should save some time and memory. * * @param ch * The char array to push. * @see #pushString * @see #pushURL * @see #readBuffer * @see #sourceType * @see #pushInput */ void pushCharArray(String ename, char ch[], int start, int length) throws java.lang.Exception { // Push the existing status pushInput(ename); sourceType = INPUT_INTERNAL; readBuffer = ch; readBufferPos = start; readBufferLength = length; readBufferOverflow = -1; } /** * Save the current input source onto the stack. * <p> * This method saves all of the global variables associated with the current * input source, so that they can be restored when a new input source has * finished. It also tests for entity recursion. 
* <p> * The method saves the following global variables onto a stack using a * fixed-length array: * <ol> * <li>sourceType * <li>externalEntity * <li>readBuffer * <li>readBufferPos * <li>readBufferLength * <li>line * <li>encoding * </ol> * * @param ename * The name of the entity (if any) causing the new input. * @see #popInput * @see #sourceType * @see #externalEntity * @see #readBuffer * @see #readBufferPos * @see #readBufferLength * @see #line * @see #encoding */ void pushInput(String ename) throws java.lang.Exception { Object input[] = new Object[12]; // Check for entity recursion. if (ename != null) { Enumeration entities = entityStack.elements(); while (entities.hasMoreElements()) { String e = (String) entities.nextElement(); if (e == ename) { error("recursive reference to entity", ename, null); } } } entityStack.push(ename); // Don't bother if there is no input. if (sourceType == INPUT_NONE) { return; } // Set up a snapshot of the current // input source. input[0] = new Integer(sourceType); input[1] = externalEntity; input[2] = readBuffer; input[3] = new Integer(readBufferPos); input[4] = new Integer(readBufferLength); input[5] = new Integer(line); input[6] = new Integer(encoding); input[7] = new Integer(readBufferOverflow); input[8] = is; input[9] = new Integer(currentByteCount); input[10] = new Integer(column); input[11] = reader; // Push it onto the stack. inputStack.push(input); } /** * Restore a previous input source. * <p> * This method restores all of the global variables associated with the * current input source. * * @exception java.io.EOFException * If there are no more entries on the input stack. 
* @see #pushInput * @see #sourceType * @see #externalEntity * @see #readBuffer * @see #readBufferPos * @see #readBufferLength * @see #line * @see #encoding */ void popInput() throws java.lang.Exception { Object input[]; switch (sourceType) { case INPUT_EXTERNAL: dataBufferFlush(); if (handler != null && externalEntity != null) { handler.endExternalEntity(externalEntity.getURL() .toString()); } break; case INPUT_STREAM: dataBufferFlush(); if (baseURI != null) { if (handler != null) { handler.endExternalEntity(baseURI); } } break; case INPUT_READER: dataBufferFlush(); if (baseURI != null) { if (handler != null) { handler.endExternalEntity(baseURI); } } break; } // Throw an EOFException if there // is nothing else to pop. if (inputStack.isEmpty()) { throw new EOFException(); } else { String s; input = (Object[]) inputStack.pop(); s = (String) entityStack.pop(); } sourceType = ((Integer) input[0]).intValue(); externalEntity = (URLConnection) input[1]; readBuffer = (char[]) input[2]; readBufferPos = ((Integer) input[3]).intValue(); readBufferLength = ((Integer) input[4]).intValue(); line = ((Integer) input[5]).intValue(); encoding = ((Integer) input[6]).intValue(); readBufferOverflow = ((Integer) input[7]).intValue(); is = (InputStream) input[8]; currentByteCount = ((Integer) input[9]).intValue(); column = ((Integer) input[10]).intValue(); reader = (Reader) input[11]; } /** * Return true if we can read the expected character. * <p> * Note that the character will be removed from the input stream on success, * but will be put back on failure. Do not attempt to read the character * again if the method succeeds. * * @param delim * The character that should appear next. For a insensitive * match, you must supply this in upper-case. * @return true if the character was successfully read, or false if it was * not. 
* @see #tryRead(String) */ boolean tryRead(char delim) throws java.lang.Exception { char c; // Read the character c = readCh(); // Test for a match, and push the character // back if the match fails. if (c == delim) { return true; } else { unread(c); return false; } } /** * Return true if we can read the expected string. * <p> * This is simply a convenience method. * <p> * Note that the string will be removed from the input stream on success, * but will be put back on failure. Do not attempt to read the string again * if the method succeeds. * <p> * This method will push back a character rather than an array whenever * possible (probably the majority of cases). * <p> * <b>NOTE: </b> This method currently has a hard-coded limit of 100 * characters for the delimiter. * * @param delim * The string that should appear next. * @return true if the string was successfully read, or false if it was not. * @see #tryRead(char) */ boolean tryRead(String delim) throws java.lang.Exception { char ch[] = delim.toCharArray(); char c; // Compare the input, character- // by character. for (int i = 0; i < ch.length; i++) { c = readCh(); if (c != ch[i]) { unread(c); if (i != 0) { unread(ch, i); } return false; } } return true; } /** * Return true if we can read some whitespace. * <p> * This is simply a convenience method. * <p> * This method will push back a character rather than an array whenever * possible (probably the majority of cases). * * @return true if whitespace was found. */ boolean tryWhitespace() throws java.lang.Exception { char c; c = readCh(); if (isWhitespace(c)) { skipWhitespace(); return true; } else { unread(c); return false; } } /** * Read all data until we find the specified string. * <p> * This is especially useful for scanning marked sections. * <p> * This is a a little inefficient right now, since it calls tryRead() for * every character. 
* * @param delim * The string delimiter * @see #tryRead(String, boolean) * @see #readCh */ void parseUntil(String delim) throws java.lang.Exception { char c; int startLine = line; try { while (!tryRead(delim)) { c = readCh(); dataBufferAppend(c); } } catch (EOFException e) { error("end of input while looking for delimiter (started on line " + startLine + ')', null, delim); } } /** * Skip all data until we find the specified string. * <p> * This is especially useful for scanning comments. * <p> * This is a a little inefficient right now, since it calls tryRead() for * every character. * * @param delim * The string delimiter * @see #tryRead(String, boolean) * @see #readCh */ void skipUntil(String delim) throws java.lang.Exception { while (!tryRead(delim)) { readCh(); } } /** * Read just the encoding declaration (or XML declaration) at the start of * an external entity. When this method is called, we know that the * declaration is present (or appears to be). We also know that the entity * is in some sort of ASCII-derived 8-bit encoding. The idea of this is to * let us read what the 8-bit encoding is before we've committed to * converting any more of the file; the XML or encoding declaration must be * in 7-bit ASCII, so we're safe as long as we don't go past it. */ void read8bitEncodingDeclaration() throws java.lang.Exception { int ch; readBufferPos = readBufferLength = 0; while (true) { ch = is.read(); readBuffer[readBufferLength++] = (char) ch; switch (ch) { case (int) '>': return; case -1: error( "end of file before end of XML or encoding declaration.", null, "?>"); return; } if (readBuffer.length == readBufferLength) { error("unfinished XML or encoding declaration", null, null); } } } ////////////////////////////////////////////////////////////////////// // Low-level I/O. ////////////////////////////////////////////////////////////////////// /** * Read a chunk of data from an external input source. 
* <p> * This is simply a front-end that fills the rawReadBuffer with bytes, then * calls the appropriate encoding handler. * * @see #encoding * @see #rawReadBuffer * @see #readBuffer * @see #filterCR * @see #copyUtf8ReadBuffer * @see #copyIso8859_1ReadBuffer * @see #copyUcs_2ReadBuffer * @see #copyUcs_4ReadBuffer */ void readDataChunk() throws java.lang.Exception { int count, i, j; // See if we have any overflow. if (readBufferOverflow > -1) { readBuffer[0] = (char) readBufferOverflow; readBufferOverflow = -1; readBufferPos = 1; sawCR = true; } else { readBufferPos = 0; sawCR = false; } // Special situation -- we're taking // input from a character stream. if (sourceType == INPUT_READER) { count = reader.read(readBuffer, readBufferPos, READ_BUFFER_MAX - 1); if (count < 0) { readBufferLength = -1; } else { readBufferLength = readBufferPos + count; filterCR(); sawCR = false; } return; } // Read as many bytes as possible // into the read buffer. count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX); // Dispatch to an encoding-specific // reader method to populate the // readBuffer. switch (encoding) { case ENCODING_UTF_8: copyUtf8ReadBuffer(count); break; case ENCODING_ISO_8859_1: copyIso8859_1ReadBuffer(count); break; case ENCODING_UCS_2_12: copyUcs2ReadBuffer(count, 8, 0); break; case ENCODING_UCS_2_21: copyUcs2ReadBuffer(count, 0, 8); break; case ENCODING_UCS_4_1234: copyUcs4ReadBuffer(count, 24, 16, 8, 0); break; case ENCODING_UCS_4_4321: copyUcs4ReadBuffer(count, 0, 8, 16, 24); break; case ENCODING_UCS_4_2143: copyUcs4ReadBuffer(count, 16, 24, 0, 8); break; case ENCODING_UCS_4_3412: copyUcs4ReadBuffer(count, 8, 0, 24, 16); break; } // Filter out all carriage returns // if we've seen any. if (sawCR) { filterCR(); sawCR = false; } // Reset the position. readBufferPos = 0; currentByteCount += count; } /** * Filter carriage returns in the read buffer. * <p> * CRLF becomes LF; CR becomes LF. 
* * @see #readDataChunk * @see #readBuffer * @see #readBufferOverflow */ void filterCR() { int i, j; readBufferOverflow = -1; loop: for (i = 0, j = 0; j < readBufferLength; i++, j++) { switch (readBuffer[j]) { case '\r': if (j == readBufferLength - 1) { readBufferOverflow = '\r'; readBufferLength--; break loop; } else if (readBuffer[j + 1] == '\n') { j++; } readBuffer[i] = '\n'; break; case '\n': default: readBuffer[i] = readBuffer[j]; break; } } readBufferLength = i; } /** * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters. * <p> * When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in readBuffer. * <p> * The tricky part of this is dealing with UTF-8 multi-byte sequences, but * it doesn't seem to slow things down too much. * * @param count * The number of bytes to convert. * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer * @see #getNextUtf8Byte */ void copyUtf8ReadBuffer(int count) throws java.lang.Exception { int i = 0; int j = readBufferPos; int b1; boolean isSurrogate = false; while (i < count) { b1 = rawReadBuffer[i++]; isSurrogate = false; // Determine whether we are dealing // with a one-, two-, three-, or four- // byte sequence. 
if ((b1 & 0x80) == 0) { // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx readBuffer[j++] = (char) b1; } else if ((b1 & 0xe0) == 0xc0) { // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx readBuffer[j++] = (char) (((b1 & 0x1f) << 6) | getNextUtf8Byte( i++, count)); } else if ((b1 & 0xf0) == 0xe0) { // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy // 10xxxxxx readBuffer[j++] = (char) (((b1 & 0x0f) << 12) | (getNextUtf8Byte(i++, count) << 6) | getNextUtf8Byte( i++, count)); } else if ((b1 & 0xf8) == 0xf0) { // 4-byte sequence: 11101110wwwwzzzzyy + 110111yyyyxxxxxx // = 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx // (uuuuu = wwww + 1) isSurrogate = true; int b2 = getNextUtf8Byte(i++, count); int b3 = getNextUtf8Byte(i++, count); int b4 = getNextUtf8Byte(i++, count); readBuffer[j++] = (char) (0xd800 | ((((b1 & 0x07) << 2) | ((b2 & 0x30) >> 4) - 1) << 6) | ((b2 & 0x0f) << 2) | ((b3 & 0x30) >> 4)); readBuffer[j++] = (char) (0xdc | ((b3 & 0x0f) << 6) | b4); } else { // Otherwise, the 8th bit may not be set in UTF-8 encodingError("bad start for UTF-8 multi-byte sequence", b1, i); } if (readBuffer[j - 1] == '\r') { sawCR = true; } } // How many characters have we read? readBufferLength = j; } /** * Return the next byte value in a UTF-8 sequence. If it is not possible to * get a byte from the current entity, throw an exception. * * @param pos * The current position in the rawReadBuffer. * @param count * The number of bytes in the rawReadBuffer * @return The significant six bits of a non-initial byte in a UTF-8 * sequence. * @exception EOFException * If the sequence is incomplete. */ int getNextUtf8Byte(int pos, int count) throws java.lang.Exception { int val; // Take a character from the buffer // or from the actual input stream. if (pos < count) { val = rawReadBuffer[pos]; } else { val = is.read(); if (val == -1) { encodingError("unfinished multi-byte UTF-8 sequence at EOF", -1, pos); } } // Check for the correct bits at the // start. 
if ((val & 0xc0) != 0x80) { encodingError("bad continuation of multi-byte UTF-8 sequence", val, pos + 1); } // Return the significant bits. return (val & 0x3f); } /** * Convert a buffer of ISO-8859-1-encoded bytes into UTF-16 characters. * <p> * When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in readBuffer. * <p> * This is a direct conversion, with no tricks. * * @param count * The number of bytes to convert. * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer */ void copyIso8859_1ReadBuffer(int count) { int i, j; for (i = 0, j = readBufferPos; i < count; i++, j++) { readBuffer[j] = (char) (rawReadBuffer[i] & 0xff); if (readBuffer[j] == '\r') { sawCR = true; } } readBufferLength = j; } /** * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters. * <p> * When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in readBuffer. * * @param count * The number of bytes to convert. * @param shift1 * The number of bits to shift byte 1. * @param shift2 * The number of bits to shift byte 2 * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer */ void copyUcs2ReadBuffer(int count, int shift1, int shift2) throws java.lang.Exception { int j = readBufferPos; if (count > 0 && (count % 2) != 0) { encodingError("odd number of bytes in UCS-2 encoding", -1, count); } for (int i = 0; i < count; i += 2) { readBuffer[j++] = (char) (((rawReadBuffer[i] & 0xff) << shift1) | ((rawReadBuffer[i + 1] & 0xff) << shift2)); if (readBuffer[j - 1] == '\r') { sawCR = true; } } readBufferLength = j; } /** * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters. * <p> * When readDataChunk() calls this method, the raw bytes are in * rawReadBuffer, and the final characters will appear in readBuffer. * <p> * Java has 16-bit chars, but this routine will attempt to use surrogates to * encoding values between 0x00010000 and 0x000fffff. 
* * @param count * The number of bytes to convert. * @param shift1 * The number of bits to shift byte 1. * @param shift2 * The number of bits to shift byte 2 * @param shift3 * The number of bits to shift byte 2 * @param shift4 * The number of bits to shift byte 2 * @see #readDataChunk * @see #rawReadBuffer * @see #readBuffer */ void copyUcs4ReadBuffer(int count, int shift1, int shift2, int shift3, int shift4) throws java.lang.Exception { int j = readBufferPos; int value; if (count > 0 && (count % 4) != 0) { encodingError( "number of bytes in UCS-4 encoding not divisible by 4", -1, count); } for (int i = 0; i < count; i += 4) { value = (((rawReadBuffer[i] & 0xff) << shift1) | ((rawReadBuffer[i + 1] & 0xff) << shift2) | ((rawReadBuffer[i + 2] & 0xff) << shift3) | ((rawReadBuffer[i + 3] & 0xff) << shift4)); if (value < 0x0000ffff) { readBuffer[j++] = (char) value; if (value == (int) '\r') { sawCR = true; } } else if (value < 0x000fffff) { readBuffer[j++] = (char) (0xd8 | ((value & 0x000ffc00) >> 10)); readBuffer[j++] = (char) (0xdc | (value & 0x0003ff)); } else { encodingError("value cannot be represented in UTF-16", value, i); } } readBufferLength = j; } /** * Report a character encoding error. */ void encodingError(String message, int value, int offset) throws java.lang.Exception { String uri; if (value >= 0) { message = message + " (byte value: 0x" + Integer.toHexString(value) + ')'; } if (externalEntity != null) { uri = externalEntity.getURL().toString(); } else { uri = baseURI; } handler.error(message, uri, -1, offset + currentByteCount); } ////////////////////////////////////////////////////////////////////// // Local Variables. ////////////////////////////////////////////////////////////////////// /** * Re-initialize the variables for each parse. 
*/ void initializeVariables() { // No errors; first line errorCount = 0; line = 1; column = 0; // Set up the buffers for data and names dataBufferPos = 0; dataBuffer = new char[DATA_BUFFER_INITIAL]; nameBufferPos = 0; nameBuffer = new char[NAME_BUFFER_INITIAL]; // Set up the variables for the current // element context. currentElement = null; currentElementContent = CONTENT_UNDECLARED; // Set up the input variables sourceType = INPUT_NONE; externalEntity = null; tagAttributePos = 0; tagAttributes = new String[100]; rawReadBuffer = new byte[READ_BUFFER_MAX]; readBufferOverflow = -1; context = CONTEXT_NONE; symbolTable = new Object[SYMBOL_TABLE_LENGTH]; } /** * Clean up after the parse to allow some garbage collection. Leave around * anything that might be useful for queries. */ void cleanupVariables() { errorCount = -1; line = -1; column = -1; dataBuffer = null; nameBuffer = null; currentElement = null; currentElementContent = CONTENT_UNDECLARED; sourceType = INPUT_NONE; inputStack = null; externalEntity = null; entityStack = null; } // // The current XML handler interface. // XmlHandler handler; // // I/O information. // private Reader reader; // current reader private InputStream is; // current input stream private int line; // current line number private int column; // current column number private int sourceType; // type of input source private Stack inputStack = new Stack () ; // stack of input sources private URLConnection externalEntity; // current external entity private int encoding; // current character encoding. private int currentByteCount; // how many bytes read from current source. // // Maintain a count of errors. // private int errorCount; // // Buffers for decoded but unparsed character input. // private final static int READ_BUFFER_MAX = 16384; private char readBuffer[]; private int readBufferPos; private int readBufferLength; private int readBufferOverflow; // overflow character from last data chunk. // // Buffer for undecoded raw byte input. 
// private byte rawReadBuffer[]; // // Buffer for parsed character data. // private static int DATA_BUFFER_INITIAL = 4096; private char dataBuffer[]; private int dataBufferPos; // // Buffer for parsed names. // private static int NAME_BUFFER_INITIAL = 1024; private char nameBuffer[]; private int nameBufferPos; // // Hashtables for DTD information on elements, entities, and notations. // private Hashtable elementInfo = new Hashtable(); private Hashtable entityInfo = new Hashtable (); private Hashtable notationInfo; // // Element type currently in force. // private String currentElement; private int currentElementContent; // // Base external identifiers for resolution. // private String basePublicId; private String baseURI; private Reader baseReader; private InputStream baseInputStream; // // Stack of entity names, to help detect recursion. // private Stack entityStack = new Stack () ; // // Are we in a context where PEs are allowed? // private int context; // // Symbol table, for internalising names. // private Object symbolTable[]; private final static int SYMBOL_TABLE_LENGTH = 1087; // // Hash table of attributes found in current start tag. // private String tagAttributes[]; private int tagAttributePos; // // Utility flag: have we noticed a CR while reading the last // data chunk? If so, we will have to go back and normalise // CR/LF. // private boolean sawCR; }