// XmlParser.java: the main parser class.
// NO WARRANTY! See README, and copyright below.
// $Id$
package com.microstar.xml;
import java.io.BufferedInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.net.URL;
import java.net.URLConnection;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Locale;
import java.util.Stack;
/**
* Parse XML documents and return parse events through call-backs.
* <p>You need to define a class implementing the <code>XmlHandler</code>
* interface: an object belonging to this class will receive the
* callbacks for the events. (As an alternative to implementing
* the full XmlHandler interface, you can simply extend the
* <code>HandlerBase</code> convenience class.)
* <p>Usage (assuming that <code>MyHandler</code> is your implementation
* of the <code>XmlHandler</code> interface):
* <pre>
* XmlHandler handler = new MyHandler();
* XmlParser parser = new XmlParser();
* parser.setHandler(handler);
* try {
* parser.parse("http://www.host.com/doc.xml", null, null);
* } catch (Exception e) {
* [do something interesting]
* }
* </pre>
* <p>Alternatively, you can use the standard SAX interfaces
* with the <code>SAXDriver</code> class as your entry point.
* @author Copyright (c) 1997, 1998 by Microstar Software Ltd.
* @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
* @version 1.1
* @since Ptolemy II 0.2
* @see XmlHandler
* @see HandlerBase
*/
public class XmlParser {
//
// Compile-time switch for the parser's special-case shortcuts.
// With the shortcuts enabled the code was measured at roughly 50%
// faster; the price is harder maintenance and a slightly larger
// class file (about 500 bytes).
//
private static final boolean USE_CHEATS = true;
//////////////////////////////////////////////////////////////////////
// Constructors.
////////////////////////////////////////////////////////////////////////
/**
 * Create a parser that has no handler attached yet.
 * <p>Attach a handler with {@link #setHandler} before calling one of
 * the <code>parse</code> methods if you want to receive events.
 * @see #setHandler
 * @see #parse
 */
public XmlParser() {
    // Intentionally empty: the constructor performs no work.
}
/**
 * Register the object that will be told about parsing events.
 * <p>The parser checks for a <code>null</code> handler before every
 * call-back, so passing <code>null</code> simply turns event
 * reporting off.
 * @param handler The handler to receive callback events.
 * @see #parse
 * @see XmlHandler
 */
public void setHandler(XmlHandler handler) {
    this.handler = handler;
}
/**
 * Parse the XML document found at a URI.
 * <p>No reader or byte stream is supplied by this variant; only the
 * identifiers and the suggested encoding are handed on.
 * <p>The same parser object may be used for several documents, but
 * only one thread may be inside this method for a given object at a
 * time.
 * @param systemId The URI of the document.
 * @param publicId The public identifier of the document, or null.
 * @param encoding The suggested encoding, or null if unknown.
 * @exception java.lang.Exception Any exception thrown by your
 * own handlers, or any derivation of java.io.IOException
 * thrown by the parser itself.
 */
public void parse(String systemId, String publicId, String encoding)
        throws java.lang.Exception {
    final Reader noReader = null;
    final InputStream noStream = null;
    doParse(systemId, publicId, noReader, noStream, encoding);
}
/**
 * Parse an XML document that is delivered as a stream of bytes.
 * <p>The supplied URI serves as the base URI for resolving relative
 * links; the document itself is read from the supplied input stream.
 * <p>The same parser object may be used for several documents, but
 * only one thread may be inside this method for a given object at a
 * time.
 * @param systemId The base URI of the document, or null if not
 * known.
 * @param publicId The public identifier of the document, or null
 * if not known.
 * @param stream A byte input stream.
 * @param encoding The suggested encoding, or null if unknown.
 * @exception java.lang.Exception Any exception thrown by your
 * own handlers, or any derivation of java.io.IOException
 * thrown by the parser itself.
 */
public void parse(String systemId, String publicId, InputStream stream,
        String encoding) throws java.lang.Exception {
    final Reader noReader = null;
    doParse(systemId, publicId, noReader, stream, encoding);
}
/**
 * Parse an XML document that is delivered as a stream of characters.
 * <p>The supplied URI serves as the base URI for resolving relative
 * links; the document itself is read from the supplied reader.
 * No encoding is passed on, since the input is already characters.
 * <p>The same parser object may be used for several documents, but
 * only one thread may be inside this method for a given object at a
 * time.
 * @param systemId The base URI of the document, or null if not
 * known.
 * @param publicId The public identifier of the document, or null
 * if not known.
 * @param reader A character stream.
 * @exception java.lang.Exception Any exception thrown by your
 * own handlers, or any derivation of java.io.IOException
 * thrown by the parser itself.
 */
public void parse(String systemId, String publicId, Reader reader)
        throws java.lang.Exception {
    final InputStream noStream = null;
    final String noEncoding = null;
    doParse(systemId, publicId, reader, noStream, noEncoding);
}
private synchronized void doParse(String systemId, String publicId,
Reader reader, InputStream stream, String encoding)
throws java.lang.Exception {
basePublicId = publicId;
baseURI = systemId;
baseReader = reader;
baseInputStream = stream;
initializeVariables();
// Set the default entities here.
setInternalEntity(intern("amp"), "&");
setInternalEntity(intern("lt"), "<");
setInternalEntity(intern("gt"), ">");
setInternalEntity(intern("apos"), "'");
setInternalEntity(intern("quot"), """);
if (handler != null) {
handler.startDocument();
}
pushURL("[document]", basePublicId, baseURI, baseReader,
baseInputStream, encoding);
parseDocument();
if (handler != null) {
handler.endDocument();
}
cleanupVariables();
}
////////////////////////////////////////////////////////////////////////
// Constants.
////////////////////////////////////////////////////////////////////////
//
// Element content types.
//
/**
 * Content type: no declaration has been seen for the element.
 * @see #getElementContentType
 */
public static final int CONTENT_UNDECLARED = 0;
/**
 * Content type: the element was declared with a content model of ANY.
 * @see #getElementContentType
 */
public static final int CONTENT_ANY = 1;
/**
 * Content type: the element was declared EMPTY.
 * @see #getElementContentType
 */
public static final int CONTENT_EMPTY = 2;
/**
 * Content type: the element was declared with mixed content.
 * @see #getElementContentType
 */
public static final int CONTENT_MIXED = 3;
/**
 * Content type: the element was declared with element content.
 * @see #getElementContentType
 */
public static final int CONTENT_ELEMENTS = 4;
//
// Entity types.
//
/**
 * Entity type: no declaration has been seen for the entity.
 * @see #getEntityType
 */
public static final int ENTITY_UNDECLARED = 0;
/**
 * Entity type: an internal entity.
 * @see #getEntityType
 */
public static final int ENTITY_INTERNAL = 1;
/**
 * Entity type: an external entity holding non-XML data.
 * @see #getEntityType
 */
public static final int ENTITY_NDATA = 2;
/**
 * Entity type: an external entity holding XML data.
 * @see #getEntityType
 */
public static final int ENTITY_TEXT = 3;
//
// Attribute types.
//
/**
 * Attribute type: the attribute is not declared for this element type.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_UNDECLARED = 0;
/**
 * Attribute type: a plain string value (CDATA).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_CDATA = 1;
/**
 * Attribute type: a unique identifier (ID).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_ID = 2;
/**
 * Attribute type: a reference to a unique identifier (IDREF).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_IDREF = 3;
/**
 * Attribute type: a list of ID references (IDREFS).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_IDREFS = 4;
/**
 * Attribute type: the name of an entity (ENTITY).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_ENTITY = 5;
/**
 * Attribute type: a list of entity names (ENTITIES).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_ENTITIES = 6;
/**
 * Attribute type: a name token (NMTOKEN).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_NMTOKEN = 7;
/**
 * Attribute type: a list of name tokens (NMTOKENS).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_NMTOKENS = 8;
/**
 * Attribute type: one token out of an enumeration.
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_ENUMERATED = 9;
/**
 * Attribute type: the name of a notation (NOTATION).
 * @see #getAttributeType
 */
public static final int ATTRIBUTE_NOTATION = 10;
//
// When the class is loaded, populate the hash table of
// attribute types.
//
/**
 * Lookup table from an attribute-type keyword as written in a DTD
 * (for example <code>"CDATA"</code> or <code>"IDREFS"</code>) to the
 * matching <code>ATTRIBUTE_*</code> constant.
 * <p>The table is filled once by the static initializer below and the
 * reference is final.  There is no entry for
 * {@link #ATTRIBUTE_ENUMERATED}: an enumerated type has no keyword.
 */
private static final Hashtable<String, Integer> attributeTypeHash =
        new Hashtable<String, Integer>();
static {
    attributeTypeHash.put("CDATA", Integer.valueOf(ATTRIBUTE_CDATA));
    attributeTypeHash.put("ID", Integer.valueOf(ATTRIBUTE_ID));
    attributeTypeHash.put("IDREF", Integer.valueOf(ATTRIBUTE_IDREF));
    attributeTypeHash.put("IDREFS", Integer.valueOf(ATTRIBUTE_IDREFS));
    attributeTypeHash.put("ENTITY", Integer.valueOf(ATTRIBUTE_ENTITY));
    attributeTypeHash.put("ENTITIES", Integer.valueOf(ATTRIBUTE_ENTITIES));
    attributeTypeHash.put("NMTOKEN", Integer.valueOf(ATTRIBUTE_NMTOKEN));
    attributeTypeHash.put("NMTOKENS", Integer.valueOf(ATTRIBUTE_NMTOKENS));
    attributeTypeHash.put("NOTATION", Integer.valueOf(ATTRIBUTE_NOTATION));
}
//
// Codes for the supported character encodings.  The digit suffixes
// on the UCS names distinguish the variants of each multi-byte
// encoding (presumably the byte order -- confirm against the
// detection code).
//
private static final int ENCODING_UTF_8 = 1;
private static final int ENCODING_ISO_8859_1 = 2;
private static final int ENCODING_UCS_2_12 = 3;
private static final int ENCODING_UCS_2_21 = 4;
private static final int ENCODING_UCS_4_1234 = 5;
private static final int ENCODING_UCS_4_4321 = 6;
private static final int ENCODING_UCS_4_2143 = 7;
private static final int ENCODING_UCS_4_3412 = 8;
//
// Kinds of attribute default value.
//
/**
 * Default kind: the attribute has not been declared.
 * @see #getAttributeDefaultValueType
 */
public static final int ATTRIBUTE_DEFAULT_UNDECLARED = 0;
/**
 * Default kind: the declaration supplies a literal default value.
 * @see #getAttributeDefaultValueType
 * @see #getAttributeDefaultValue
 */
public static final int ATTRIBUTE_DEFAULT_SPECIFIED = 1;
/**
 * Default kind: the attribute was declared #IMPLIED.
 * @see #getAttributeDefaultValueType
 */
public static final int ATTRIBUTE_DEFAULT_IMPLIED = 2;
/**
 * Default kind: the attribute was declared #REQUIRED.
 * @see #getAttributeDefaultValueType
 */
public static final int ATTRIBUTE_DEFAULT_REQUIRED = 3;
/**
 * Default kind: the attribute was declared #FIXED.
 * @see #getAttributeDefaultValueType
 * @see #getAttributeDefaultValue
 */
public static final int ATTRIBUTE_DEFAULT_FIXED = 4;
//
// Codes identifying the kind of input source currently being read.
//
private static final int INPUT_NONE = 0;
private static final int INPUT_INTERNAL = 1;
private static final int INPUT_EXTERNAL = 2;
private static final int INPUT_STREAM = 3;
private static final int INPUT_BUFFER = 4;
private static final int INPUT_READER = 5;
//
// Bit flags for reading literals.  Each flag occupies its own bit so
// that several can be OR-ed together into the single int argument
// passed to readLiteral().
//
private static final int LIT_CHAR_REF = 1 << 0;
private static final int LIT_ENTITY_REF = 1 << 1;
private static final int LIT_PE_REF = 1 << 2;
private static final int LIT_NORMALIZE = 1 << 3;
//
// Values for the current parsing context.  Unlike the literal flags
// these are plain alternatives, not bits to be combined.
//
private static final int CONTEXT_NONE = 0;
private static final int CONTEXT_DTD = 1;
private static final int CONTEXT_ENTITYVALUE = 2;
private static final int CONTEXT_ATTRIBUTEVALUE = 3;
//////////////////////////////////////////////////////////////////////
// Error reporting.
//////////////////////////////////////////////////////////////////////
/**
 * Report an error to the handler.
 * <p>The error counter is incremented on every call, whether or not a
 * handler is installed.  The found and expected texts, when given,
 * are appended to the message in parentheses.  The handler also
 * receives the URL of the current external entity (or null if there
 * is none) and the current line and column.
 * @param message The error message.
 * @param textFound The text that caused the error (or null).
 * @param textExpected The text that was expected instead (or null).
 * @exception java.lang.Exception Any exception thrown by the handler.
 * @see XmlHandler#error
 */
void error(String message, String textFound, String textExpected)
        throws java.lang.Exception {
    errorCount++;

    String report = message;
    if (textFound != null) {
        report += " (found \"" + textFound + "\")";
    }
    if (textExpected != null) {
        report += " (expected \"" + textExpected + "\")";
    }

    if (handler == null) {
        return;
    }
    final String uri = (externalEntity == null) ? null
            : externalEntity.getURL().toString();
    handler.error(report, uri, line, column);
}
/**
 * Report an error whose offending text is a single character.
 * <p>Convenience overload: the character is turned into a one-character
 * string and passed to the String-based <code>error</code> method.
 * @param message The error message.
 * @param textFound The character that caused the error.
 * @param textExpected The text that was expected instead (or null).
 * @exception java.lang.Exception Any exception thrown by the handler.
 */
void error(String message, char textFound, String textExpected)
        throws java.lang.Exception {
    final String found = String.valueOf(textFound);
    error(message, found, textExpected);
}
//////////////////////////////////////////////////////////////////////
// Major syntactic productions.
//////////////////////////////////////////////////////////////////////
/**
 * Parse one complete XML document.
 * <pre>
 * [1] document ::= prolog element Misc*
 * </pre>
 * <p>Top-level parsing routine: reads the prolog, requires the
 * '&lt;' that opens the document element, parses that element and
 * then any trailing miscellaneous markup.  Reaching the end of input
 * after that is the normal outcome; any further character is
 * reported as an error.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseDocument() throws java.lang.Exception {
    parseProlog();
    require('<');
    parseElement();
    try {
        parseMisc(); // skips whitespace, PIs and comments
        // If this read succeeds, something follows the document.
        final char stray = readCh();
        error("unexpected characters after document end", stray, null);
    } catch (EOFException endOfInput) {
        // Running out of input here is how a document is expected to end.
    }
}
/**
 * Skip over a comment.
 * <pre>
 * [18] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "--&gt;"
 * </pre>
 * <p>(The opening <code>&lt;!--</code> has already been read.)
 * Input is skipped up to the closing delimiter and no event is
 * generated.
 * @exception java.lang.Exception Any exception thrown by the parser.
 */
void parseComment() throws java.lang.Exception {
    final String commentEnd = "-->";
    skipUntil(commentEnd);
}
/**
 * Parse a processing instruction and report it to the handler.
 * <pre>
 * [19] PI ::= '&lt;?' Name (S (Char* - (Char* '?&gt;' Char*)))? '?&gt;'
 * </pre>
 * <p>(The opening <code>&lt;?</code> has already been read.)
 * <p>A processing instruction <em>must</em> start with a Name, its
 * target.  If the target is not followed at once by
 * <code>?&gt;</code>, whitespace is required and everything up to the
 * closing delimiter is collected as the instruction's data.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parsePI() throws java.lang.Exception {
    final String target = readNmtoken(true);

    final boolean endsImmediately = tryRead("?>");
    if (!endsImmediately) {
        requireWhitespace();
        parseUntil("?>");
    }

    // NOTE(review): without a handler the data buffer is not converted
    // to a string here; confirm that the collected PI data is
    // discarded elsewhere in that case.
    if (handler == null) {
        return;
    }
    handler.processingInstruction(target, dataBufferToString());
}
/**
 * Parse a CDATA marked section.
 * <pre>
 * [20] CDSect ::= CDStart CData CDEnd
 * [21] CDStart ::= '&lt;![CDATA['
 * [22] CData ::= (Char* - (Char* ']]&gt;' Char*))
 * [23] CDEnd ::= ']]&gt;'
 * </pre>
 * <p>(The opening <code>&lt;![CDATA[</code> has already been read.)
 * <p>The characters of the section are only appended to the data
 * buffer; this method does not itself generate an event.
 * @exception java.lang.Exception Any exception thrown by the parser.
 */
void parseCDSect() throws java.lang.Exception {
    final String sectionEnd = "]]>";
    parseUntil(sectionEnd);
}
/**
 * Parse the prolog of an XML document.
 * <pre>
 * [24] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
 * </pre>
 * <p>Two subtleties apply.  The XML default attributes can only be
 * declared after the DTD (if any) has been read, and general
 * references in attribute value literals cannot be expanded until
 * the whole DTD (if any) has been parsed.
 * <p>The XML declaration is not looked for here; pushURL() deals
 * with it.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 * @see #pushURL
 */
void parseProlog() throws java.lang.Exception {
    parseMisc();
    if (!tryRead("<!DOCTYPE")) {
        // No document type declaration: the prolog ends here.
        return;
    }
    parseDoctypedecl();
    parseMisc();
}
/**
 * Parse the XML declaration.
 * <pre>
 * [25] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
 * [26] VersionInfo ::= S 'version' Eq ('"1.0"' | "'1.0'")
 * [33] SDDecl ::= S 'standalone' Eq "'" ('yes' | 'no') "'"
 * | S 'standalone' Eq '"' ("yes" | "no") '"'
 * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
 * </pre>
 * <p>([80] to [82] are also significant.)
 * <p>(The <code>&lt;?xml</code> and whitespace have already been read.)
 * <p>A version other than "1.0" is reported as an error.  The
 * standalone value is read but neither checked nor stored.
 * <p>TODO: validate value of standalone.
 * @param ignoreEncoding Passed on to {@link #checkEncoding}; when
 * true the declared encoding is not compared with the detected one.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 * @see #parseTextDecl
 * @see #checkEncoding
 */
void parseXMLDecl(boolean ignoreEncoding) throws java.lang.Exception {
    // The version pseudo-attribute is mandatory.
    require("version");
    parseEq();
    final String declaredVersion = readLiteral(0);
    if (!declaredVersion.equals("1.0")) {
        error("unsupported XML version", declaredVersion, "1.0");
    }

    // The encoding declaration is optional.
    skipWhitespace();
    if (tryRead("encoding")) {
        parseEq();
        final String declaredEncoding = readLiteral(0);
        checkEncoding(declaredEncoding, ignoreEncoding);
    }

    // The standalone declaration is optional too.
    skipWhitespace();
    if (tryRead("standalone")) {
        parseEq();
        // FIXME: the literal is consumed but its value is ignored.
        readLiteral(0);
    }

    skipWhitespace();
    require("?>");
}
/**
 * Parse the text declaration (encoding PI) of an external entity.
 * <pre>
 * [78] EncodingDecl ::= S 'encoding' Eq QEncoding
 * [79] EncodingPI ::= '&lt;?xml' S 'encoding' Eq QEncoding S? '?&gt;'
 * [80] QEncoding ::= '"' Encoding '"' | "'" Encoding "'"
 * [81] Encoding ::= LatinName
 * [82] LatinName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
 * </pre>
 * <p>(The <code>&lt;?xml</code> and whitespace have already been read.)
 * <p>A version is optional here, but if present it must be "1.0" and
 * must be followed by whitespace.  The encoding is mandatory.
 * @param ignoreEncoding Passed on to {@link #checkEncoding}; when
 * true the declared encoding is not compared with the detected one.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 * @see #parseXMLDecl
 * @see #checkEncoding
 */
void parseTextDecl(boolean ignoreEncoding) throws java.lang.Exception {
    // Optional version.
    if (tryRead("version")) {
        parseEq();
        final String declaredVersion = readLiteral(0);
        if (!declaredVersion.equals("1.0")) {
            error("unsupported XML version", declaredVersion, "1.0");
        }
        requireWhitespace();
    }

    // Mandatory encoding.
    require("encoding");
    parseEq();
    final String declaredEncoding = readLiteral(0);
    checkEncoding(declaredEncoding, ignoreEncoding);

    skipWhitespace();
    require("?>");
}
/**
 * Check that the encoding specified makes sense.
 * <p>Compares the encoding named in the XML declaration or encoding
 * PI with the encoding that was detected for the current input, and
 * reports an error when the two cannot agree.
 * <p>This is also where the 8-bit encodings are told apart: if UTF-8
 * was detected but the declaration names ISO-8859-1, the current
 * encoding is switched to ISO-8859-1 (the two cannot be
 * distinguished by autodetection).
 * <p>The name is compared case-insensitively.  It is upper-cased
 * with a fixed locale, because encoding names are ASCII tokens: with
 * the platform default locale, "iso-8859-1" becomes
 * "&#304;SO-8859-1" under a Turkish locale and is then rejected.
 * @param encodingName The name of the encoding specified by the user.
 * @param ignoreEncoding If true, no comparison is made and the
 * current encoding is left unchanged.
 * @exception java.lang.Exception Any exception thrown by the
 * handler while an error is being reported.
 * @see #parseXMLDecl
 * @see #parseTextDecl
 */
void checkEncoding(String encodingName, boolean ignoreEncoding)
        throws java.lang.Exception {
    // Locale-independent upper-casing (see the method comment).
    encodingName = encodingName.toUpperCase(Locale.ENGLISH);

    if (ignoreEncoding) {
        return;
    }

    switch (encoding) {
    // 8-bit encodings
    case ENCODING_UTF_8:
        if (encodingName.equals("ISO-8859-1")) {
            encoding = ENCODING_ISO_8859_1;
        } else if (!encodingName.equals("UTF-8")) {
            error("unsupported 8-bit encoding", encodingName,
                    "UTF-8 or ISO-8859-1");
        }
        break;

    // 16-bit encodings
    case ENCODING_UCS_2_12:
    case ENCODING_UCS_2_21:
        if (!encodingName.equals("ISO-10646-UCS-2")
                && !encodingName.equals("UTF-16")) {
            error("unsupported 16-bit encoding", encodingName,
                    "ISO-10646-UCS-2");
        }
        break;

    // 32-bit encodings
    case ENCODING_UCS_4_1234:
    case ENCODING_UCS_4_4321:
    case ENCODING_UCS_4_2143:
    case ENCODING_UCS_4_3412:
        if (!encodingName.equals("ISO-10646-UCS-4")) {
            error("unsupported 32-bit encoding", encodingName,
                    "ISO-10646-UCS-4");
        }
        break;

    default:
        // Any other current encoding is accepted without a check,
        // as before.
        break;
    }
}
/**
 * Parse miscellaneous markup outside the document element and the
 * DOCTYPE declaration.
 * <pre>
 * [27] Misc ::= Comment | PI | S
 * </pre>
 * <p>Repeatedly skips whitespace and consumes processing instructions
 * and comments, returning at the first thing that is neither.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseMisc() throws java.lang.Exception {
    for (;;) {
        skipWhitespace();
        if (tryRead("<?")) {
            parsePI();
            continue;
        }
        if (tryRead("<!--")) {
            parseComment();
            continue;
        }
        // Neither a PI nor a comment: leave it for the caller.
        return;
    }
}
/**
* Parse a document type declaration.
* <pre>
* [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
* ('[' %markupdecl* ']' S?)? '&gt;'
* </pre>
* <p>(The <code>&lt;!DOCTYPE</code> has already been read.)
* <p>Reads the document type name and the external identifiers, then
* the internal subset (if a '[' follows) and then the external subset
* (if a system identifier was given).  The handler's
* <code>doctypeDecl</code> call-back is made only after both subsets
* have been read.
* @exception java.lang.Exception Any exception thrown by the
* handler or by the parser itself.
*/
void parseDoctypedecl() throws java.lang.Exception {
String doctypeName;
// ids[0] is passed on as the public identifier and ids[1] as the
// system identifier (see the pushURL and doctypeDecl calls below).
String[] ids;
// Read the document type name.
requireWhitespace();
doctypeName = readNmtoken(true);
// Read the ExternalIDs.
skipWhitespace();
ids = readExternalIds(false);
// Look for a declaration subset.
skipWhitespace();
if (tryRead('[')) {
// loop until the subset ends
while (true) {
// The context is CONTEXT_DTD only while whitespace is skipped and
// while a markup declaration is parsed; it is switched back to
// CONTEXT_NONE before the closing delimiter is tested.
// NOTE(review): presumably this limits parameter-entity expansion
// to those two steps -- confirm against the input-reading code.
context = CONTEXT_DTD;
skipWhitespace();
context = CONTEXT_NONE;
if (tryRead(']')) {
break; // end of subset
} else {
context = CONTEXT_DTD;
parseMarkupdecl();
context = CONTEXT_NONE;
}
}
}
// Read the external subset, if any
if (ids[1] != null) {
// The external subset is pushed as a new input source; the
// reader, stream and encoding arguments are all null.
pushURL("[external subset]", ids[0], ids[1], null, null, null);
// Loop until we end up back at '>'
while (true) {
// Same context switching as for the internal subset above.
context = CONTEXT_DTD;
skipWhitespace();
context = CONTEXT_NONE;
if (tryRead('>')) {
break;
} else {
context = CONTEXT_DTD;
parseMarkupdecl();
context = CONTEXT_NONE;
}
}
} else {
// No external subset.
skipWhitespace();
require('>');
}
if (handler != null) {
handler.doctypeDecl(doctypeName, ids[0], ids[1]);
}
// Expand general entities in
// default values of attributes.
// (Do this after the doctypeDecl
// event!).
// expandAttributeDefaultValues();
}
/**
 * Parse one markup declaration in the internal or external DTD subset.
 * <pre>
 * [29] markupdecl ::= ( %elementdecl | %AttlistDecl | %EntityDecl |
 * %NotationDecl | %PI | %S | %Comment |
 * InternalPERef )
 * [30] InternalPERef ::= PEReference
 * [31] extSubset ::= (%markupdecl | %conditionalSect)*
 * </pre>
 * <p>Dispatches on the opening delimiter to the routine for that
 * kind of declaration; conditional sections are handled here too.
 * Anything else is reported as an error.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseMarkupdecl() throws java.lang.Exception {
    if (tryRead("<!ELEMENT")) {
        parseElementdecl();
        return;
    }
    if (tryRead("<!ATTLIST")) {
        parseAttlistDecl();
        return;
    }
    if (tryRead("<!ENTITY")) {
        parseEntityDecl();
        return;
    }
    if (tryRead("<!NOTATION")) {
        parseNotationDecl();
        return;
    }
    if (tryRead("<?")) {
        parsePI();
        return;
    }
    if (tryRead("<!--")) {
        parseComment();
        return;
    }
    if (tryRead("<![")) {
        parseConditionalSect();
        return;
    }
    error("expected markup declaration", null, null);
}
/**
* Parse an element, with its tags.
* <pre>
* [33] STag ::= '&lt;' Name (S Attribute)* S? '&gt;' [WFC: unique Att spec]
* [38] element ::= EmptyElement | STag content ETag
* [39] EmptyElement ::= '&lt;' Name (S Attribute)* S? '/&gt;'
* [WFC: unique Att spec]
* </pre>
* <p>(The '&lt;' has already been read.)
* <p>NOTE: this method actually chains onto parseContent(), if necessary,
* and parseContent() will take care of calling parseETag().
* <p>Attributes are reported to the handler before the
* <code>startElement</code> call-back: first those specified in the
* tag (flagged <code>true</code>, by parseAttribute()), then the
* declared ones that were not specified (flagged <code>false</code>).
* An empty-element tag produces <code>startElement</code> followed
* directly by <code>endElement</code>.
* @exception java.lang.Exception Any exception thrown by the
* handler or by the parser itself.
*/
void parseElement() throws java.lang.Exception {
String gi;
char c;
// Remember the enclosing element's state; it is restored at the
// end of this method.
int oldElementContent = currentElementContent;
String oldElement = currentElement;
// This is the (global) counter for the
// array of specified attributes.
tagAttributePos = 0;
// Read the element type name.
gi = readNmtoken(true);
// Determine the current content type.
currentElement = gi;
currentElementContent = getElementContentType(gi);
// An element without a declaration is treated as having ANY content.
if (currentElementContent == CONTENT_UNDECLARED) {
currentElementContent = CONTENT_ANY;
}
// Read the attributes, if any.
// After this loop, we should be just
// in front of the closing delimiter.
skipWhitespace();
c = readCh();
while ((c != '/') && (c != '>')) {
unread(c);
parseAttribute(gi);
skipWhitespace();
c = readCh();
}
// Push the delimiter back; it is read again after the defaulted
// attributes have been reported.
unread(c);
// Supply any defaulted attributes.
Enumeration atts = declaredAttributes(gi);
if (atts != null) {
String aname;
loop: while (atts.hasMoreElements()) {
aname = (String) atts.nextElement();
// See if it was specified.
for (int i = 0; i < tagAttributePos; i++) {
if (tagAttributes[i].equals(aname)) {
continue loop;
}
}
// I guess not...
if (handler != null) {
handler.attribute(aname, getAttributeExpandedValue(gi,
aname), false);
}
}
}
// Figure out if this is a start tag
// or an empty element, and dispatch an
// event accordingly.
// (The attribute loop above only ends on '/' or '>', so no other
// character can arrive here.)
c = readCh();
switch (c) {
case '>':
if (handler != null) {
handler.startElement(gi);
}
parseContent();
break;
case '/':
require('>');
if (handler != null) {
handler.startElement(gi);
handler.endElement(gi);
}
break;
}
// Restore the previous state.
currentElement = oldElement;
currentElementContent = oldElementContent;
}
/**
 * Parse a single attribute assignment inside a tag.
 * <pre>
 * [34] Attribute ::= Name Eq AttValue
 * </pre>
 * <p>The value is read with character and entity references enabled.
 * Whitespace normalization is requested as well unless the attribute
 * is of type CDATA or is undeclared.  The attribute is reported to
 * the handler as specified (<code>true</code>), the data buffer
 * position is reset, and the name is recorded in the list of
 * attributes specified for the current tag.
 * @param name The name of the attribute's element.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 * @see XmlHandler#attribute
 */
void parseAttribute(String name) throws java.lang.Exception {
    final String aname = readNmtoken(true).intern();

    // The declared attribute type (not the default-value type) decides
    // about normalization.  Fix by Zoltan Kemenczy, following section
    // 3.3.3 "Attribute-Value Normalization" of XML 1.0
    // (http://www.w3.org/TR/2000/REC-xml-20001006#AVNormalize):
    // escaped whitespace character references other than #x20 (such
    // as the newline, #xa) should be preserved in the normalized value.
    final int type = getAttributeType(name, aname);

    parseEq();

    int literalFlags = LIT_CHAR_REF | LIT_ENTITY_REF;
    final boolean keepWhitespace = (type == ATTRIBUTE_CDATA)
            || (type == ATTRIBUTE_UNDECLARED);
    if (!keepWhitespace) {
        literalFlags |= LIT_NORMALIZE;
    }
    final String value = readLiteral(literalFlags);

    if (handler != null) {
        handler.attribute(aname, value, true);
    }
    dataBufferPos = 0;

    // Record the attribute as specified, doubling the array when full.
    if (tagAttributePos == tagAttributes.length) {
        final String[] grown = new String[tagAttributes.length * 2];
        System.arraycopy(tagAttributes, 0, grown, 0, tagAttributePos);
        tagAttributes = grown;
    }
    tagAttributes[tagAttributePos] = aname;
    tagAttributePos++;
}
/**
 * Parse an equals sign with optional whitespace on either side.
 * <pre>
 * [35] Eq ::= S? '=' S?
 * </pre>
 * @exception java.lang.Exception Any exception thrown by the parser.
 */
void parseEq() throws java.lang.Exception {
    skipWhitespace(); // optional leading S
    require('=');
    skipWhitespace(); // optional trailing S
}
/**
 * Parse an end tag.
 * <pre>
 * [36] ETag ::= '&lt;/' Name S? '&gt;'
 * </pre>
 * <p>NOTE: parseContent() chains to here.
 * <p>A name that differs from the current element is reported as an
 * error; parsing then carries on, and <code>endElement</code> is
 * reported with the name that was actually read.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseETag() throws java.lang.Exception {
    final String closingName = readNmtoken(true);
    final boolean matchesOpenElement = closingName.equals(currentElement);
    if (!matchesOpenElement) {
        error("mismatched end tag", closingName, currentElement);
    }

    skipWhitespace();
    require('>');

    if (handler == null) {
        return;
    }
    handler.endElement(closingName);
}
/**
* Parse the content of an element.
* <pre>
* [37] content ::= (element | PCData | Reference | CDSect | PI | Comment)*
* [68] Reference ::= EntityRef | CharRef
* </pre>
* <p>Loops until the end tag of the current element has been parsed.
* Each pass first consumes character data or whitespace, depending
* on the current element's content type, and then dispatches on the
* delimiter that follows.
* @exception java.lang.Exception Any exception thrown by the
* handler or by the parser itself.
*/
void parseContent() throws java.lang.Exception {
char c;
while (true) {
// Consume text first: character data for ANY and mixed content,
// whitespace only for element content.  For any other content
// type nothing is consumed here.
switch (currentElementContent) {
case CONTENT_ANY:
case CONTENT_MIXED:
parsePCData();
break;
case CONTENT_ELEMENTS:
parseWhitespace();
break;
}
// Handle delimiters
// NOTE(review): this switch has no default branch, so a character
// other than '&' or '<' is read and dropped without any report --
// confirm that this is intended.
c = readCh();
switch (c) {
case '&': // Found "&"
c = readCh();
if (c == '#') {
parseCharRef();
} else {
unread(c);
parseEntityRef(true);
}
break;
case '<': // Found "<"
c = readCh();
switch (c) {
case '!': // Found "<!"
// Comments and CDATA sections are handled without flushing the
// data buffer first, unlike the three cases further down.
c = readCh();
switch (c) {
case '-': // Found "<!-"
require('-');
parseComment();
break;
case '[': // Found "<!["
require("CDATA[");
parseCDSect();
break;
default:
error("expected comment or CDATA section", c, null);
break;
}
break;
case '?': // Found "<?"
dataBufferFlush();
parsePI();
break;
case '/': // Found "</"
// The end tag ends this element's content: this is the only way
// out of the loop.
dataBufferFlush();
parseETag();
return;
default: // Found "<" followed by something else
// Start of a child element; push back the first character of its
// name so that parseElement() can read it.
dataBufferFlush();
unread(c);
parseElement();
break;
}
}
}
}
/**
 * Parse an element type declaration.
 * <pre>
 * [40] elementdecl ::= '&lt;!ELEMENT' S %Name S (%S S)? %contentspec S? '&gt;'
 * [VC: Unique Element Declaration]
 * </pre>
 * <p>NOTE: the '&lt;!ELEMENT' has already been read.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseElementdecl() throws java.lang.Exception {
    // Element type name, with whitespace required on both sides.
    requireWhitespace();
    final String elementName = readNmtoken(true);
    requireWhitespace();

    // Content model, then the closing delimiter.
    parseContentspec(elementName);
    skipWhitespace();
    require('>');
}
/**
 * Parse a content specification and record it for the element.
 * <pre>
 * [41] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
 * </pre>
 * <p>EMPTY and ANY are recorded without a content model string.  For
 * mixed and element content the model text is accumulated in the
 * data buffer, starting with the opening parenthesis, and stored
 * together with the content type.
 * @param name The name of the element being declared.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseContentspec(String name) throws java.lang.Exception {
    if (tryRead("EMPTY")) {
        setElement(name, CONTENT_EMPTY, null, null);
        return;
    }
    if (tryRead("ANY")) {
        setElement(name, CONTENT_ANY, null, null);
        return;
    }

    require('(');
    dataBufferAppend('(');
    skipWhitespace();

    final int contentType;
    if (tryRead("#PCDATA")) {
        dataBufferAppend("#PCDATA");
        parseMixed();
        contentType = CONTENT_MIXED;
    } else {
        parseElements();
        contentType = CONTENT_ELEMENTS;
    }
    setElement(name, contentType, dataBufferToString(), null);
}
/**
 * Parse an element-content model.
 * <pre>
 * [42] elements ::= (choice | seq) ('?' | '*' | '+')?
 * [44] cps ::= S? %cp S?
 * [45] choice ::= '(' S? %ctokplus (S? '|' S? %ctoks)* S? ')'
 * [46] ctokplus ::= cps ('|' cps)+
 * [47] ctoks ::= cps ('|' cps)*
 * [48] seq ::= '(' S? %stoks (S? ',' S? %stoks)* S? ')'
 * [49] stoks ::= cps (',' cps)*
 * </pre>
 * <p>NOTE: the opening '(' and S have already been read.
 * <p>The model text is appended to the data buffer as it is read.
 * The separator that follows the first content particle (',' or '|')
 * must be used for the rest of the group; a different separator is
 * reported as an error and parsing of the group is abandoned.
 * <p>TODO: go over parameter entity boundaries more carefully.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseElements() throws java.lang.Exception {
    // First content particle.
    skipWhitespace();
    parseCp();
    skipWhitespace();

    char next = readCh();
    if (next != ')') {
        // More particles follow: the first separator fixes the kind
        // of group (sequence or choice).
        if ((next != ',') && (next != '|')) {
            error("bad separator in content model", next, null);
            return;
        }
        final char separator = next;
        dataBufferAppend(next);

        for (;;) {
            skipWhitespace();
            parseCp();
            skipWhitespace();
            next = readCh();
            if (next == ')') {
                break;
            }
            if (next != separator) {
                error("bad separator in content model", next, "'"
                        + separator + "'");
                return;
            }
            dataBufferAppend(next);
        }
    }

    // The group is closed; an occurrence indicator may follow.
    dataBufferAppend(')');
    next = readCh();
    if ((next == '?') || (next == '*') || (next == '+')) {
        dataBufferAppend(next);
    } else {
        unread(next);
    }
}
/**
 * Parse a content particle.
 * <pre>
 * [43] cp ::= (Name | choice | seq) ('?' | '*' | '+')
 * </pre>
 * <p>NOTE: a slightly different production is actually used here:
 * <pre>
 * cp ::= (elements | (Name ('?' | '*' | '+')?))
 * </pre>
 * <p>A parenthesized group is handed to parseElements(), which deals
 * with its own occurrence indicator.  Otherwise a name is read and
 * an optional occurrence indicator after it.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseCp() throws java.lang.Exception {
    if (tryRead('(')) {
        dataBufferAppend('(');
        parseElements();
        return;
    }

    dataBufferAppend(readNmtoken(true));
    final char indicator = readCh();
    final boolean isOccurrence = (indicator == '?') || (indicator == '*')
            || (indicator == '+');
    if (isOccurrence) {
        dataBufferAppend(indicator);
    } else {
        unread(indicator);
    }
}
/**
 * Parse a mixed-content model.
 * <pre>
 * [50] Mixed ::= '(' S? %( %'#PCDATA' (S? '|' S? %Mtoks)* ) S? ')*'
 * | '(' S? %('#PCDATA') S? ')'
 * [51] Mtoks ::= %Name (S? '|' S? %Name)*
 * </pre>
 * <p>NOTE: the S and '#PCDATA' have already been read.
 * <p>In both forms the text appended to the data buffer ends with
 * <code>)*</code>; for #PCDATA alone the '*' in the input is
 * optional.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseMixed() throws java.lang.Exception {
    // #PCDATA on its own.
    skipWhitespace();
    if (tryRead(')')) {
        dataBufferAppend(")*");
        tryRead('*'); // consume the optional '*', if there is one
        return;
    }

    // #PCDATA followed by a list of element names.
    skipWhitespace();
    for (;;) {
        if (tryRead(")*")) {
            break;
        }
        require('|');
        dataBufferAppend('|');
        skipWhitespace();
        dataBufferAppend(readNmtoken(true));
        skipWhitespace();
    }
    dataBufferAppend(")*");
}
/**
 * Parse an attribute list declaration.
 * <pre>
 * [52] AttlistDecl ::= '&lt;!ATTLIST' S %Name S? %AttDef+ S? '&gt;'
 * </pre>
 * <p>NOTE: the '&lt;!ATTLIST' has already been read.
 * <p>Attribute definitions are parsed one after another until the
 * closing '&gt;' is found.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseAttlistDecl() throws java.lang.Exception {
    requireWhitespace();
    final String elementName = readNmtoken(true);
    requireWhitespace();

    for (;;) {
        if (tryRead('>')) {
            return;
        }
        parseAttDef(elementName);
        skipWhitespace();
    }
}
/**
 * Parse a single attribute definition.
 * <pre>
 * [53] AttDef ::= S %Name S %AttType S %Default
 * </pre>
 * <p>For enumerated and notation types, the token list that was
 * accumulated in the data buffer while the type was read is taken
 * out as a string and passed on with the default value.
 * @param elementName The name of the element the attribute belongs to.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseAttDef(String elementName) throws java.lang.Exception {
    final String attributeName = readNmtoken(true);

    requireWhitespace();
    final int type = readAttType();

    final boolean hasTokenList = (type == ATTRIBUTE_ENUMERATED)
            || (type == ATTRIBUTE_NOTATION);
    final String enumeration = hasTokenList ? dataBufferToString() : null;

    requireWhitespace();
    parseDefault(elementName, attributeName, type, enumeration);
}
/**
 * Parse the attribute type.
 * <pre>
 * [54] AttType ::= StringType | TokenizedType | EnumeratedType
 * [55] StringType ::= 'CDATA'
 * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' | 'ENTITIES' |
 * 'NMTOKEN' | 'NMTOKENS'
 * [57] EnumeratedType ::= NotationType | Enumeration
 * </pre>
 * <p>An opening parenthesis starts an enumeration.  Otherwise a
 * keyword is read and looked up in the table of attribute types;
 * for NOTATION the list of notation names is parsed first.
 * <p>TODO: validate the type!!
 * @return The matching <code>ATTRIBUTE_*</code> constant, or
 * {@link #ATTRIBUTE_UNDECLARED} after an unknown keyword has been
 * reported as an error.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
int readAttType() throws java.lang.Exception {
    if (tryRead('(')) {
        parseEnumeration();
        return ATTRIBUTE_ENUMERATED;
    }

    final String keyword = readNmtoken(true);
    if (keyword.equals("NOTATION")) {
        parseNotationType();
    }

    final Integer code = (Integer) attributeTypeHash.get(keyword);
    if (code != null) {
        return code.intValue();
    }
    error("illegal attribute type", keyword, null);
    return ATTRIBUTE_UNDECLARED;
}
/**
 * Parse an enumeration.
 * <pre>
 * [60] Enumeration ::= '(' S? %Etoks (S? '|' S? %Etoks)* S? ')'
 * [61] Etoks ::= %Nmtoken (S? '|' S? %Nmtoken)*
 * </pre>
 * <p>NOTE: the '(' has already been read.
 * <p>The enumeration is appended to the data buffer without any of
 * the whitespace that was skipped in the input.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseEnumeration() throws java.lang.Exception {
    dataBufferAppend('(');

    // At least one token is required.
    skipWhitespace();
    dataBufferAppend(readNmtoken(true));
    skipWhitespace();

    // Further tokens, each preceded by '|', up to the closing ')'.
    for (;;) {
        if (tryRead(')')) {
            break;
        }
        require('|');
        dataBufferAppend('|');
        skipWhitespace();
        dataBufferAppend(readNmtoken(true));
        skipWhitespace();
    }
    dataBufferAppend(')');
}
/**
 * Parse a notation type for an attribute.
 * <pre>
 * [58] NotationType ::= %'NOTATION' S '(' S? %Ntoks (S? '|' S? %Ntoks)*
 * S? ')'
 * [59] Ntoks ::= %Name (S? '|' S? %Name)
 * </pre>
 * <p>NOTE: the 'NOTATION' has already been read.
 * <p>After the required whitespace and opening parenthesis, the list
 * of names is parsed exactly like an enumeration.
 * @exception java.lang.Exception Any exception thrown by the
 * handler or by the parser itself.
 */
void parseNotationType() throws java.lang.Exception {
    requireWhitespace();
    require('(');
    // The list of notation names has the same syntax as an enumeration.
    parseEnumeration();
}
/**
 * Parse the default value for an attribute and register the attribute.
 * <pre>
 * [62] Default ::= '#REQUIRED' | '#IMPLIED' | ((%'#FIXED' S)? %AttValue
 * </pre>
 * @param elementName The element type the attribute belongs to.
 * @param name The attribute name.
 * @param type The attribute type (an ATTRIBUTE_* constant).
 * @param enumeration The token list for enumerated/NOTATION types,
 *        otherwise null.
 */
void parseDefault(String elementName, String name, int type,
        String enumeration) throws java.lang.Exception {
    int defaultKind = ATTRIBUTE_DEFAULT_SPECIFIED;
    boolean literalFollows = false;

    if (!tryRead('#')) {
        // No keyword: a plain default value literal.
        literalFollows = true;
    } else if (tryRead("FIXED")) {
        defaultKind = ATTRIBUTE_DEFAULT_FIXED;
        requireWhitespace();
        literalFollows = true;
    } else if (tryRead("REQUIRED")) {
        defaultKind = ATTRIBUTE_DEFAULT_REQUIRED;
    } else if (tryRead("IMPLIED")) {
        defaultKind = ATTRIBUTE_DEFAULT_IMPLIED;
    } else {
        error("illegal keyword for attribute default value", null, null);
    }

    String defaultValue = null;
    if (literalFollows) {
        // The literal is read in attribute-value context; the DTD
        // context is restored afterwards.
        context = CONTEXT_ATTRIBUTEVALUE;
        defaultValue = readLiteral(LIT_CHAR_REF);
        context = CONTEXT_DTD;
    }

    setAttribute(elementName, name, type, enumeration, defaultValue,
            defaultKind);
}
/**
 * Parse a conditional section.
 * <pre>
 * [63] conditionalSect ::= includeSect || ignoreSect
 * [64] includeSect ::= '&lt;![' %'INCLUDE' '[' (%markupdecl*)* ']]&gt;'
 * [65] ignoreSect ::= '&lt;![' %'IGNORE' '[' ignoreSectContents* ']]&gt;'
 * [66] ignoreSectContents ::= ((SkipLit | Comment | PI) -(Char* ']]&gt;'))
 *                           | ('&lt;![' ignoreSectContents* ']]&gt;')
 *                           | (Char - (']' | [&lt;'"]))
 *                           | ('&lt;!' (Char - ('-' | '[')))
 * </pre>
 * <p>An INCLUDE section is parsed as a sequence of markup declarations.
 * An IGNORE section is skipped by counting nested
 * <code>&lt;![</code> ... <code>]]&gt;</code> pairs.
 * *NOTE: the '&lt;![' has already been read.
 * *TODO: verify that I am handling ignoreSectContents right.
 */
void parseConditionalSect() throws java.lang.Exception {
    skipWhitespace();

    if (tryRead("INCLUDE")) {
        skipWhitespace();
        require('[');
        skipWhitespace();
        while (!tryRead("]]>")) {
            parseMarkupdecl();
            skipWhitespace();
        }
        return;
    }

    if (tryRead("IGNORE")) {
        skipWhitespace();
        require('[');
        // Track nesting depth until the matching "]]>" is consumed.
        int depth = 1;
        while (depth > 0) {
            final char ch = readCh();
            if (ch == '<') {
                if (tryRead("![")) {
                    depth++;
                }
            } else if (ch == ']') {
                if (tryRead("]>")) {
                    depth--;
                }
            }
        }
        return;
    }

    error("conditional section must begin with INCLUDE or IGNORE",
            null, null);
}
/**
 * Read a character reference and append the referenced character to
 * the data buffer.
 * <pre>
 * [67] CharRef ::= '&amp;#' [0-9]+ ';' | '&amp;#x' [0-9a-fA-F]+ ';'
 * </pre>
 * <p>Code points up to U+FFFF are appended as a single char.  Code
 * points from U+10000 to U+10FFFF are appended as a UTF-16 surrogate
 * pair.  Anything larger is reported through <code>error()</code>.
 * <p>If an illegal character is met, it is reported and the value
 * accumulated so far is used.
 * *NOTE: the '&amp;#' has already been read.
 */
void parseCharRef() throws java.lang.Exception {
    // A leading 'x' selects hexadecimal notation.
    final int radix = tryRead('x') ? 16 : 10;
    int value = 0;

    while (true) {
        final char c = readCh();
        if (c == ';') {
            break;
        }

        // Only ASCII digits (and ASCII hex letters) are accepted.
        int digit = -1;
        if ((c >= '0') && (c <= '9')) {
            digit = c - '0';
        } else if (radix == 16) {
            if ((c >= 'a') && (c <= 'f')) {
                digit = c - 'a' + 10;
            } else if ((c >= 'A') && (c <= 'F')) {
                digit = c - 'A' + 10;
            }
        }
        if (digit < 0) {
            error("illegal character in character reference", c, null);
            break;
        }

        // Stop accumulating once the value is already out of range, so
        // that a long run of digits cannot overflow the int and wrap
        // around to a seemingly valid code point.
        if (value <= 0x10ffff) {
            value = (value * radix) + digit;
        }
    }

    if (value <= 0x0000ffff) {
        // Fits in a single UTF-16 code unit.
        dataBufferAppend((char) value);
    } else if (value <= 0x0010ffff) {
        // Supplementary character: subtract 0x10000, then split the
        // remaining 20 bits into a high (0xD800..0xDBFF) and a low
        // (0xDC00..0xDFFF) surrogate.
        final int offset = value - 0x10000;
        dataBufferAppend((char) (0xd800 | (offset >> 10)));
        dataBufferAppend((char) (0xdc00 | (offset & 0x03ff)));
    } else {
        // Beyond U+10FFFF.  The reported number may be a lower bound,
        // because accumulation stops once the range is exceeded.
        error("character reference " + value + " is too large for UTF-16",
                Integer.valueOf(value).toString(), null);
    }
}
/**
 * Parse a general entity reference and act on the entity's class.
 * <pre>
 * [69] EntityRef ::= '&amp;' Name ';'
 * </pre>
 * <p>Internal entities are pushed onto the input as strings.  External
 * text entities are pushed as URLs when allowed.  Undeclared entities,
 * NDATA entities, and external entities where they are not allowed are
 * reported as errors.
 * *NOTE: the '&amp;' has already been read.
 * @param externalAllowed External entities are allowed here.
 */
void parseEntityRef(boolean externalAllowed) throws java.lang.Exception {
    final String entityName = readNmtoken(true);
    require(';');

    final int entityClass = getEntityType(entityName);
    if (entityClass == ENTITY_UNDECLARED) {
        error("reference to undeclared entity", entityName, null);
    } else if (entityClass == ENTITY_INTERNAL) {
        pushString(entityName, getEntityValue(entityName));
    } else if ((entityClass == ENTITY_TEXT)
            || (entityClass == ENTITY_NDATA)) {
        if (!externalAllowed) {
            // Both external classes give the same complaint here.
            error("reference to external entity in attribute value.",
                    entityName, null);
        } else if (entityClass == ENTITY_TEXT) {
            pushURL(entityName, getEntityPublicId(entityName),
                    getEntitySystemId(entityName), null, null, null);
        } else {
            error("data entity reference in content", entityName, null);
        }
    }
}
/**
 * Parse a parameter entity reference.
 * <pre>
 * [70] PEReference ::= '%' Name ';'
 * </pre>
 * <p>Parameter entities are stored under their name prefixed with
 * <code>%</code>.  When the reference occurs inside an entity value,
 * the replacement text is included as-is.  Elsewhere in the DTD it is
 * padded with one leading and one trailing space, for internal and
 * external parameter entities alike.
 * *NOTE: the '%' has already been read.
 * @param isEntityValue true if the reference occurs inside an entity
 *        value literal, in which case no padding is added.
 */
void parsePEReference(boolean isEntityValue) throws java.lang.Exception {
    final String name = "%" + readNmtoken(true);
    require(';');

    switch (getEntityType(name)) {
    case ENTITY_UNDECLARED:
        error("reference to undeclared parameter entity", name, null);
        break;
    case ENTITY_INTERNAL:
        if (isEntityValue) {
            pushString(name, getEntityValue(name));
        } else {
            pushString(name, " " + getEntityValue(name) + ' ');
        }
        break;
    case ENTITY_TEXT:
        // The padding condition used to be inverted here, which
        // disagreed with the internal-entity branch above.
        // The space pushed first is read after the entity's content;
        // the one pushed last is read before it.
        if (!isEntityValue) {
            pushString(null, " ");
        }
        pushURL(name, getEntityPublicId(name), getEntitySystemId(name),
                null, null, null);
        if (!isEntityValue) {
            pushString(null, " ");
        }
        break;
    default:
        // Other entity classes are not handled here.
        break;
    }
}
/**
 * Parse an entity declaration and register the entity.
 * <pre>
 * [71] EntityDecl ::= '&lt;!ENTITY' S %Name S %EntityDef S? '&gt;'
 *                   | '&lt;!ENTITY' S '%' S %Name S %EntityDef S? '&gt;'
 * [72] EntityDef ::= EntityValue | ExternalDef
 * [73] ExternalDef ::= ExternalID %NDataDecl?
 * [74] ExternalID ::= 'SYSTEM' S SystemLiteral
 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
 * [75] NDataDecl ::= S %'NDATA' S %Name
 * </pre>
 * <p>Parameter entities are registered under their name prefixed
 * with <code>%</code>.
 * *NOTE: the '&lt;!ENTITY' has already been read.
 */
void parseEntityDecl() throws java.lang.Exception {
    // A '%' after the keyword marks a parameter entity.
    requireWhitespace();
    final boolean isParameterEntity = tryRead('%');
    if (isParameterEntity) {
        requireWhitespace();
    }

    String entityName = readNmtoken(true);
    if (isParameterEntity) {
        entityName = "%" + entityName;
    }

    // Peek at the next character to tell a literal value from
    // external identifiers.
    requireWhitespace();
    final char next = readCh();
    unread(next);

    if ((next == '"') || (next == '\'')) {
        // Internal entity: the value is a quoted literal.
        context = CONTEXT_ENTITYVALUE;
        final String literal = readLiteral(LIT_CHAR_REF | LIT_PE_REF);
        context = CONTEXT_DTD;
        setInternalEntity(entityName, literal);
    } else {
        // External entity: a system identifier is mandatory.
        final String[] externalIds = readExternalIds(false);
        if (externalIds[1] == null) {
            error("system identifier missing", entityName, null);
        }

        // An NDATA clause makes it an unparsed (data) entity.
        skipWhitespace();
        if (tryRead("NDATA")) {
            requireWhitespace();
            final String notation = readNmtoken(true);
            setExternalDataEntity(entityName, externalIds[0],
                    externalIds[1], notation);
        } else {
            setExternalTextEntity(entityName, externalIds[0],
                    externalIds[1]);
        }
    }

    // Closing delimiter.
    skipWhitespace();
    require('>');
}
/**
 * Parse a notation declaration and register the notation.
 * <pre>
 * [81] NotationDecl ::= '&lt;!NOTATION' S %Name S %ExternalID S? '&gt;'
 * </pre>
 * <p>At least one of the public and system identifiers must be given.
 * *NOTE: the '&lt;!NOTATION' has already been read.
 */
void parseNotationDecl() throws java.lang.Exception {
    requireWhitespace();
    final String notationName = readNmtoken(true);

    requireWhitespace();
    final String[] externalIds = readExternalIds(true);
    final String publicId = externalIds[0];
    final String systemId = externalIds[1];
    if ((publicId == null) && (systemId == null)) {
        error("external identifier missing", notationName, null);
    }

    setNotation(notationName, publicId, systemId);

    skipWhitespace();
    require('>');
}
/**
* Parse PCDATA.
* <pre>
* [16] PCData ::= [^<&]*
* </pre>
* <p>The trick here is that the data stays in the dataBuffer without
* necessarily being converted to a string right away.
* <p>Characters are appended to the data buffer up to, but not
* including, the next <code>&lt;</code> or <code>&amp;</code>, which
* is left unread for the caller.
*/
void parsePCData() throws java.lang.Exception {
char c;
// Start with a little cheat -- in most
// cases, the entire sequence of
// character data will already be in
// the readBuffer; if not, fall through to
// the normal approach.
if (USE_CHEATS) {
// Line/column deltas are accumulated locally and only committed
// if a delimiter is found inside the current read buffer.
int lineAugment = 0;
int columnAugment = 0;
/*loop:*/for (int i = readBufferPos; i < readBufferLength; i++) {
switch (readBuffer[i]) {
case '\n':
lineAugment++;
columnAugment = 0;
break;
case '&':
case '<':
// Delimiter found: consume everything before it in one
// block copy and leave the delimiter as the next character.
int start = readBufferPos;
// NOTE(review): the column is advanced for the delimiter itself
// although the delimiter is not consumed here -- confirm intended.
columnAugment++;
readBufferPos = i;
if (lineAugment > 0) {
line += lineAugment;
column = columnAugment;
} else {
column += columnAugment;
}
dataBufferAppend(readBuffer, start, i - start);
return;
default:
columnAugment++;
}
}
// No delimiter in the buffer: nothing has been consumed or
// committed, so the slow path below starts from the same position.
}
// OK, the cheat didn't work; start over
// and do it by the book.
while (true) {
c = readCh();
switch (c) {
case '<':
case '&':
// Push the delimiter back for the caller.
unread(c);
return;
default:
dataBufferAppend(c);
break;
}
}
}
//////////////////////////////////////////////////////////////////////
// High-level reading and scanning methods.
//////////////////////////////////////////////////////////////////////
/**
 * Require at least one whitespace character, then skip any that follow.
 * <pre>
 * [1] S ::= (#x20 | #x9 | #xd | #xa)+
 * </pre>
 * <p>If the next character is not whitespace it is reported through
 * <code>error()</code>; it is not pushed back.
 */
void requireWhitespace() throws java.lang.Exception {
    final char first = readCh();
    if (!isWhitespace(first)) {
        error("whitespace expected", first, null);
        return;
    }
    skipWhitespace();
}
/**
 * Read a run of whitespace characters into the data buffer.
 * <p>The first non-whitespace character is pushed back.
 */
void parseWhitespace() throws java.lang.Exception {
    char next;
    for (next = readCh(); isWhitespace(next); next = readCh()) {
        dataBufferAppend(next);
    }
    unread(next);
}
/**
* Skip whitespace characters.
* <pre>
* [1] S ::= (#x20 | #x9 | #xd | #xa)+
* </pre>
* <p>Zero or more whitespace characters are consumed; the first
* non-whitespace character is left as the next character to read.
*/
void skipWhitespace() throws java.lang.Exception {
// Start with a little cheat. Most of
// the time, the white space will fall
// within the current read buffer; if
// not, then fall through.
if (USE_CHEATS) {
// Line/column deltas are committed only when a non-whitespace
// character is found inside the current read buffer.
int lineAugment = 0;
int columnAugment = 0;
loop: for (int i = readBufferPos; i < readBufferLength; i++) {
switch (readBuffer[i]) {
case ' ':
case '\t':
case '\r':
columnAugment++;
break;
case '\n':
lineAugment++;
columnAugment = 0;
break;
case '%':
// In DTD or entity-value context a '%' abandons the fast path
// without consuming anything; presumably readCh() has to see
// it to handle a parameter-entity reference -- TODO confirm.
if ((context == CONTEXT_DTD)
|| (context == CONTEXT_ENTITYVALUE)) {
break loop;
} // else fall through...
default:
// First non-whitespace character: commit position and stop.
readBufferPos = i;
if (lineAugment > 0) {
line += lineAugment;
column = columnAugment;
} else {
column += columnAugment;
}
return;
}
}
// Reaching here means the buffer ended (or a '%' was met) before
// a non-whitespace character; readBufferPos is unchanged.
}
// OK, do it by the book.
char c = readCh();
while (isWhitespace(c)) {
c = readCh();
}
unread(c);
}
/**
* Read a name or name token.
* <pre>
* [5] Name ::= (Letter | '_' | ':') (NameChar)*
* [7] Nmtoken ::= (NameChar)+
* </pre>
* <p>Characters are collected up to the next delimiter character,
* which is left unread, and the result is returned through
* <code>intern(char[],int,int)</code>.  No check of the individual
* name characters is made here.
* *NOTE: [6] is implemented implicitly where required.
* @param isName Not consulted by this method body.
* @return The internalised name or name token.
*/
String readNmtoken(boolean isName) throws java.lang.Exception {
char c;
if (USE_CHEATS) {
// Fast path: look for a delimiter inside the current read buffer
// and intern the characters in front of it without copying them.
// NOTE(review): '#' is a delimiter here but not in the slow path
// below, and '%' ends a name unconditionally in the slow path --
// confirm whether the two delimiter sets are meant to differ.
loop: for (int i = readBufferPos; i < readBufferLength; i++) {
switch (readBuffer[i]) {
case '%':
// In DTD or entity-value context, give up on the fast path
// and let the slow path read through readCh().
if ((context == CONTEXT_DTD)
|| (context == CONTEXT_ENTITYVALUE)) {
break loop;
} // else fall through...
case '<':
case '>':
case '&':
case ',':
case '|':
case '*':
case '+':
case '?':
case ')':
case '=':
case '\'':
case '"':
case '[':
case ' ':
case '\t':
case '\r':
case '\n':
case ';':
case '/':
case '#':
int start = readBufferPos;
// A delimiter at the very first position means there is no name.
if (i == start) {
error("name expected", readBuffer[i], null);
}
readBufferPos = i;
return intern(readBuffer, start, i - start);
}
}
}
// Slow path: accumulate characters one at a time in nameBuffer.
nameBufferPos = 0;
// Read characters until a delimiter is met.
/*loop: */while (true) {
c = readCh();
switch (c) {
case '%':
case '<':
case '>':
case '&':
case ',':
case '|':
case '*':
case '+':
case '?':
case ')':
case '=':
case '\'':
case '"':
case '[':
case ' ':
case '\t':
case '\n':
case '\r':
case ';':
case '/':
// Delimiter: push it back and return what has been collected.
unread(c);
if (nameBufferPos == 0) {
error("name expected", null, null);
}
String s = intern(nameBuffer, 0, nameBufferPos);
nameBufferPos = 0;
return s;
default:
// Grow the name buffer if needed, then store the character.
nameBuffer = (char[]) extendArray(nameBuffer,
nameBuffer.length, nameBufferPos);
nameBuffer[nameBufferPos++] = c;
}
}
}
/**
* Read a literal.
* <pre>
* [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
* | "'" ([^<&'] | Reference)* "'"
* [11] SystemLiteral ::= '"' URLchar* '"' | "'" (URLchar - "'")* "'"
* [13] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
* [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
* | "'" ([^%&'] | PEReference | Reference)* "'"
* </pre>
* <p>The literal's content is accumulated in the data buffer and
* returned as a string.  Line ends inside the literal are replaced by
* spaces.  Besides the two quote characters, a NUL character is also
* accepted as delimiter; <code>getAttributeExpandedValue()</code>
* relies on that.
* @param flags Bit mask; this method tests LIT_CHAR_REF,
* LIT_ENTITY_REF and LIT_NORMALIZE.
* @return The content of the data buffer as a string, or null if the
* next character was not an accepted delimiter.
*/
String readLiteral(int flags) throws java.lang.Exception {
char delim;
char c;
// Remembered only for the end-of-input error message.
int startLine = line;
// Find the delimiter.
delim = readCh();
if ((delim != '"') && (delim != '\'') && (delim != (char) 0)) {
error("expected '\"' or \"'\"", delim, null);
return null;
}
// Read the literal.
try {
c = readCh();
loop: while (c != delim) {
switch (c) {
// Literals never have line ends
case '\n':
case '\r':
c = ' ';
break;
// References may be allowed
case '&':
if ((flags & LIT_CHAR_REF) > 0) {
c = readCh();
if (c == '#') {
// Character reference: parseCharRef() appends the character.
parseCharRef();
c = readCh();
continue loop; // check the next character
} else if ((flags & LIT_ENTITY_REF) > 0) {
// General entity reference: put the first name character back
// and let parseEntityRef() read the whole name.
unread(c);
parseEntityRef(false);
c = readCh();
continue loop;
} else {
// Not a recognised reference: keep the '&' literally.
// NOTE(review): the character read after the '&' is appended
// below without being compared with the delimiter -- confirm
// that an '&' directly before the closing quote cannot occur.
dataBufferAppend('&');
}
}
break;
default:
break;
}
dataBufferAppend(c);
c = readCh();
}
} catch (EOFException e) {
error("end of input while looking for delimiter (started on line "
+ startLine + ')', null, Character.toString(delim));
}
// Normalise whitespace if necessary.
if ((flags & LIT_NORMALIZE) > 0) {
dataBufferNormalize();
}
// Return the value.
return dataBufferToString();
}
/**
 * Try reading external identifiers.
 * <p>After <code>PUBLIC</code>, a public identifier literal is read,
 * followed by a system identifier literal.  After <code>SYSTEM</code>,
 * only a system identifier literal is read.  If neither keyword is
 * present, both members of the result are null.
 * <p>The system identifier is not required for notations: in a
 * notation declaration it is read after a public identifier only if a
 * quote character follows.
 * @param inNotation Are we in a notation?
 * @return A two-member String array containing the identifiers:
 *         index 0 is the public identifier, index 1 the system
 *         identifier; either may be null.
 */
String[] readExternalIds(boolean inNotation) throws java.lang.Exception {
    final String[] ids = new String[2];

    if (tryRead("PUBLIC")) {
        requireWhitespace();
        ids[0] = readLiteral(LIT_NORMALIZE); // public id
        if (inNotation) {
            skipWhitespace();
            // Peek at the next character without consuming it.
            // readLiteral() must see the opening quote itself; reading
            // the quote with tryRead() first would make it take the
            // character after the quote for the delimiter.
            final char next = readCh();
            unread(next);
            if ((next == '"') || (next == '\'')) {
                ids[1] = readLiteral(0); // optional system id
            }
        } else {
            requireWhitespace();
            ids[1] = readLiteral(0); // system id
        }
    } else if (tryRead("SYSTEM")) {
        requireWhitespace();
        ids[1] = readLiteral(0); // system id
    }

    return ids;
}
/**
 * Test if a character is XML whitespace.
 * <pre>
 * [1] S ::= (#x20 | #x9 | #xd | #xa)+
 * </pre>
 * @param c The character to test.
 * @return true if the character is a space, tab, carriage return
 *         or line feed.
 */
final boolean isWhitespace(char c) {
    return (c == ' ') || (c == '\t') || (c == '\r') || (c == '\n');
}
//////////////////////////////////////////////////////////////////////
// Utility routines.
//////////////////////////////////////////////////////////////////////
/**
 * Add a character to the data buffer, growing the buffer if it is full.
 * @param c The character to append.
 */
void dataBufferAppend(char c) {
    if (dataBufferPos >= dataBuffer.length) {
        // This method is called a lot, so the buffer is grown inline
        // here rather than through extendArray(): at least double the
        // size, and always enough to hold position dataBufferPos.
        final int oldSize = dataBuffer.length;
        final int grownSize = Math.max(oldSize * 2, dataBufferPos + 1);
        final char[] grown = new char[grownSize];
        System.arraycopy(dataBuffer, 0, grown, 0, oldSize);
        dataBuffer = grown;
    }
    dataBuffer[dataBufferPos] = c;
    dataBufferPos++;
}
/**
 * Add a string to the data buffer.
 * @param s The string whose characters are appended.
 */
void dataBufferAppend(String s) {
    final char[] chars = s.toCharArray();
    dataBufferAppend(chars, 0, chars.length);
}
/**
 * Append (part of) a character array to the data buffer.
 * @param ch The source array.
 * @param start Index of the first character to copy.
 * @param length Number of characters to copy.
 */
void dataBufferAppend(char[] ch, int start, int length) {
    // Make sure the buffer can take the extra characters.
    final int needed = dataBufferPos + length;
    dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
            needed);
    System.arraycopy(ch, start, dataBuffer, dataBufferPos, length);
    dataBufferPos = needed;
}
/**
 * Normalise whitespace in the data buffer, in place.
 * <p>Leading and trailing whitespace is dropped, and every interior
 * run of whitespace is replaced by a single space.
 */
void dataBufferNormalize() {
    int limit = dataBufferPos;
    int read = 0;

    // Trim at the start...
    while ((read < limit) && isWhitespace(dataBuffer[read])) {
        read++;
    }
    // ...and at the end.
    while ((limit > read) && isWhitespace(dataBuffer[limit - 1])) {
        limit--;
    }

    // Compact towards the left.  A space is written only once the
    // next non-whitespace character shows up, so each run collapses
    // to a single space.
    int write = 0;
    boolean spacePending = false;
    for (; read < limit; read++) {
        final char ch = dataBuffer[read];
        if (isWhitespace(ch)) {
            spacePending = true;
            continue;
        }
        if (spacePending) {
            dataBuffer[write++] = ' ';
            spacePending = false;
        }
        dataBuffer[write++] = ch;
    }

    // The new length is <= the old one.
    dataBufferPos = write;
}
/**
 * Convert the data buffer to a string and empty the buffer.
 * @return A new string holding the buffered characters.
 * @see #intern(char[],int,int)
 */
String dataBufferToString() {
    final int count = dataBufferPos;
    dataBufferPos = 0;
    return new String(dataBuffer, 0, count);
}
/**
 * Flush the contents of the data buffer to the handler, if
 * appropriate, and reset the buffer for new input.
 * <p>In mixed or ANY content the data is reported as character data.
 * In element content it is reported as ignorable whitespace.  In
 * EMPTY or undeclared content it is discarded.  Nothing happens if
 * the buffer is empty.
 */
void dataBufferFlush() throws java.lang.Exception {
    if (dataBufferPos <= 0) {
        return;
    }

    if (handler != null) {
        switch (currentElementContent) {
        case CONTENT_MIXED:
        case CONTENT_ANY:
            handler.charData(dataBuffer, 0, dataBufferPos);
            break;
        case CONTENT_ELEMENTS:
            handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos);
            break;
        default:
            // CONTENT_UNDECLARED, CONTENT_EMPTY: nothing to report.
            break;
        }
    }

    dataBufferPos = 0;
}
/**
 * Require a string to appear, character by character.
 * <p>Each character is checked with <code>require(char)</code>.
 * @param delim The string that must come next in the input.
 */
void require(String delim) throws java.lang.Exception {
    final int count = delim.length();
    for (int pos = 0; pos < count; pos++) {
        require(delim.charAt(pos));
    }
}
/**
 * Require a character to appear.
 * <p>The next character is read; if it differs from the expected one,
 * the mismatch is reported through <code>error()</code>.
 * @param delim The character that must come next in the input.
 */
void require(char delim) throws java.lang.Exception {
    final char actual = readCh();
    if (actual == delim) {
        return;
    }
    error("expected character", actual, Character.toString(delim));
}
/**
 * Return an internalised version of a string.
 * <p>&AElig;lfred uses this method to create an internalised version
 * of all names and attribute values, so that it can test equality
 * with <code>==</code> instead of <code>String.equals()</code>.
 * <p>If you want to be able to test for equality in the same way,
 * you can use this method to internalise your own strings first:
 * <pre>
 * String PARA = parser.intern("PARA");
 * </pre>
 * <p>Note that this will not return the same results as String.intern().
 * @param s The string to internalise.
 * @return An internalised version of the string.
 * @see #intern(char[],int,int)
 * @see java.lang.String#intern
 */
public String intern(String s) {
    return intern(s.toCharArray(), 0, s.length());
}
/**
 * Create an internalised string from a character array.
 * <p>This is much more efficient than constructing a non-internalised
 * string first, and then internalising it.
 * <p>Strings are kept in <code>symbolTable</code>, a hash table whose
 * buckets are arrays of (char[], String) pairs stored at consecutive
 * even/odd indices.
 * <p>Note that this will not return the same results as String.intern().
 * @param ch an array of characters for building the string.
 * @param start the starting position in the array.
 * @param length the number of characters to place in the string.
 * @return an internalised string.
 * @see #intern(String)
 * @see java.lang.String#intern
 */
public String intern(char[] ch, int start, int length) {
    // Generate a hash code; the mask keeps it non-negative.
    int hash = 0;
    final int end = start + length;
    for (int i = start; i < end; i++) {
        hash = ((hash << 1) & 0xffffff) + ch[i];
    }
    hash = hash % SYMBOL_TABLE_LENGTH;

    // Get the bucket, creating it on first use.
    Object[] bucket = (Object[]) symbolTable[hash];
    if (bucket == null) {
        bucket = new Object[8];
        symbolTable[hash] = bucket;
    }

    // Search for a matching tuple, and return the string if we find
    // one.  When the loop finishes without a match, 'index' is the
    // first free position in the bucket (or its length, if full).
    int index;
    for (index = 0; index < bucket.length; index += 2) {
        final char[] chFound = (char[]) bucket[index];
        // Stop when we hit the first unused slot.
        if (chFound == null) {
            break;
        }
        if (chFound.length != length) {
            continue;
        }
        // Same length: compare character by character.  A zero-length
        // entry matches trivially, so the empty string is interned
        // once instead of being added again on every call.
        boolean same = true;
        for (int i = 0; i < length; i++) {
            if (ch[start + i] != chFound[i]) {
                same = false;
                break;
            }
        }
        if (same) {
            return (String) bucket[index + 1];
        }
    }

    // Not found -- add it, growing the bucket if it is full.
    bucket = (Object[]) extendArray(bucket, bucket.length, index);
    final String s = new String(ch, start, length);
    bucket[index] = s.toCharArray();
    bucket[index + 1] = s;
    symbolTable[hash] = bucket;
    return s;
}
/**
 * Ensure the capacity of an array, allocating a new one if
 * necessary.
 * <p>If <code>requiredSize</code> is smaller than
 * <code>currentSize</code> the array is returned unchanged.  Otherwise
 * a new array is allocated, at least twice the current size and
 * larger than <code>requiredSize</code>, and the first
 * <code>currentSize</code> elements are copied into it.
 * @param array The array to extend; must be a char[] or an Object[].
 * @param currentSize The current length of the array.
 * @param requiredSize The index that must be addressable afterwards.
 * @return The original array, or a larger copy of it.
 */
Object extendArray(Object array, int currentSize, int requiredSize) {
    // Already big enough.
    if (requiredSize < currentSize) {
        return array;
    }

    final int grownSize = Math.max(currentSize * 2, requiredSize + 1);

    final Object grown;
    if (array instanceof char[]) {
        grown = new char[grownSize];
    } else if (array instanceof Object[]) {
        grown = new Object[grownSize];
    } else {
        throw new RuntimeException("Array must be char[] or Object[]");
    }
    System.arraycopy(array, 0, grown, 0, currentSize);
    return grown;
}
//////////////////////////////////////////////////////////////////////
// XML query routines.
//////////////////////////////////////////////////////////////////////
//
// Elements
//
/**
* Get the declared elements for an XML document.
* <p>The results will be valid only after the DTD (if any) has been
* parsed.
* @return An enumeration of all element types declared for this
* document (as Strings); these are the keys of the element table.
* @see #getElementContentType
* @see #getElementContentModel
*/
public Enumeration declaredElements() {
// The element table is keyed by element type name.
return elementInfo.keys();
}
/**
 * Look up the content type of an element.
 * @param name The element type name.
 * @return An integer constant representing the content type;
 *         CONTENT_UNDECLARED if the element type is unknown.
 * @see #getElementContentModel
 * @see #CONTENT_UNDECLARED
 * @see #CONTENT_ANY
 * @see #CONTENT_EMPTY
 * @see #CONTENT_MIXED
 * @see #CONTENT_ELEMENTS
 */
public int getElementContentType(String name) {
    final Object[] record = (Object[]) elementInfo.get(name);
    return (record == null)
            ? CONTENT_UNDECLARED
            : ((Integer) record[0]).intValue();
}
/**
 * Look up the content model of an element.
 * <p>The result will always be null unless the content type is
 * CONTENT_ELEMENTS or CONTENT_MIXED.
 * @param name The element type name.
 * @return The normalised content model, as a string, or null if the
 *         element type is unknown.
 * @see #getElementContentType
 */
public String getElementContentModel(String name) {
    final Object[] record = (Object[]) elementInfo.get(name);
    return (record == null) ? null : (String) record[1];
}
/**
 * Register an element, or add information to an existing record.
 * <p>Array format of an element record:
 * <ul>
 * <li>[0] content type (Integer)</li>
 * <li>[1] content model (String)</li>
 * <li>[2] attribute hash table (Hashtable)</li>
 * </ul>
 * <p>Only the pieces of information actually supplied are stored, so
 * a call with CONTENT_UNDECLARED and a null model leaves an existing
 * content declaration untouched.  Declaring the content type a second
 * time is reported as an error.
 * @param name The element type name.
 * @param contentType A CONTENT_* constant, or CONTENT_UNDECLARED
 *        to leave the content type alone.
 * @param contentModel The content model, or null to leave it alone.
 * @param attributes The attribute table, or null to leave it alone.
 */
void setElement(String name, int contentType, String contentModel,
        Hashtable attributes) throws java.lang.Exception {
    final boolean declaresContent = (contentType != CONTENT_UNDECLARED);

    Object[] record = (Object[]) elementInfo.get(name);
    if (record == null) {
        // First mention of this element type: start an empty record.
        record = new Object[] {
            Integer.valueOf(CONTENT_UNDECLARED), null, null
        };
    } else if (declaresContent
            && (((Integer) record[0]).intValue() != CONTENT_UNDECLARED)) {
        error("multiple declarations for element type", name, null);
        return;
    }

    if (declaresContent) {
        record[0] = Integer.valueOf(contentType);
    }
    if (contentModel != null) {
        record[1] = contentModel;
    }
    if (attributes != null) {
        record[2] = attributes;
    }

    elementInfo.put(name, record);
}
/**
 * Look up the attribute hash table for an element.
 * The hash table is the third item (index 2) in the element array.
 * @param name The element type name.
 * @return The attribute table, or null if the element type is
 *         unknown or has no attribute table.
 */
Hashtable getElementAttributes(String name) {
    final Object[] record = (Object[]) elementInfo.get(name);
    return (record == null) ? null : (Hashtable) record[2];
}
//
// Attributes
//
/**
 * Get the declared attributes for an element type.
 * @param elname The name of the element type.
 * @return An Enumeration of all the attributes declared for
 *         a specific element type, or null if the element type has
 *         no attribute table.  The results will be valid only
 *         after the DTD (if any) has been parsed.
 * @see #getAttributeType
 * @see #getAttributeEnumeration
 * @see #getAttributeDefaultValueType
 * @see #getAttributeDefaultValue
 * @see #getAttributeExpandedValue
 */
public Enumeration declaredAttributes(String elname) {
    final Hashtable table = getElementAttributes(elname);
    return (table == null) ? null : table.keys();
}
/**
 * Retrieve the declared type of an attribute.
 * @param name The name of the associated element.
 * @param aname The name of the attribute.
 * @return An integer constant representing the attribute type;
 *         ATTRIBUTE_UNDECLARED if no declaration is found.
 * @see #ATTRIBUTE_UNDECLARED
 * @see #ATTRIBUTE_CDATA
 * @see #ATTRIBUTE_ID
 * @see #ATTRIBUTE_IDREF
 * @see #ATTRIBUTE_IDREFS
 * @see #ATTRIBUTE_ENTITY
 * @see #ATTRIBUTE_ENTITIES
 * @see #ATTRIBUTE_NMTOKEN
 * @see #ATTRIBUTE_NMTOKENS
 * @see #ATTRIBUTE_ENUMERATED
 * @see #ATTRIBUTE_NOTATION
 */
public int getAttributeType(String name, String aname) {
    final Object[] decl = getAttribute(name, aname);
    return (decl == null)
            ? ATTRIBUTE_UNDECLARED
            : ((Integer) decl[0]).intValue();
}
/**
 * Retrieve the allowed values for an enumerated attribute type.
 * @param name The name of the associated element.
 * @param aname The name of the attribute.
 * @return A string containing the token list, or null if no
 *         declaration is found.
 * @see #ATTRIBUTE_ENUMERATED
 * @see #ATTRIBUTE_NOTATION
 */
public String getAttributeEnumeration(String name, String aname) {
    final Object[] decl = getAttribute(name, aname);
    return (decl == null) ? null : (String) decl[3];
}
/**
 * Retrieve the default value of a declared attribute.
 * <p>This is the value as stored at declaration time.
 * @param name The name of the associated element.
 * @param aname The name of the attribute.
 * @return The default value, or null if the attribute was
 *         #IMPLIED or simply undeclared and unspecified.
 * @see #getAttributeExpandedValue
 */
public String getAttributeDefaultValue(String name, String aname) {
    final Object[] decl = getAttribute(name, aname);
    return (decl == null) ? null : (String) decl[1];
}
/**
 * Retrieve the expanded value of a declared attribute.
 * <p>All general entities will be expanded.  The expansion is done
 * the first time it is asked for and cached in the attribute record.
 * @param name The name of the associated element.
 * @param aname The name of the attribute.
 * @return The expanded default value, or null if the attribute was
 *         #IMPLIED or simply undeclared, or if the expansion failed.
 * @see #getAttributeDefaultValue
 */
public String getAttributeExpandedValue(String name, String aname) {
    final Object[] decl = getAttribute(name, aname);
    if (decl == null) {
        return null;
    }

    final boolean needsExpansion = (decl[4] == null) && (decl[1] != null);
    if (needsExpansion) {
        final String raw = (String) decl[1];
        try {
            // Wrap the stored value in NUL delimiters and run it
            // through the literal reader to expand references.
            pushString(null, (char) 0 + raw + (char) 0);
            decl[4] = readLiteral(LIT_NORMALIZE | LIT_CHAR_REF
                    | LIT_ENTITY_REF);
        } catch (Exception ex) {
            // Expansion failed: the cached slot is still unset, so
            // the return below yields null.
        }
    }
    return (String) decl[4];
}
/**
 * Retrieve the default value type of a declared attribute.
 * @param name The name of the element.
 * @param aname The name of the attribute.
 * @return ATTRIBUTE_DEFAULT_UNDECLARED if the attribute
 *         cannot be found, otherwise one of the
 *         ATTRIBUTE_DEFAULT_* constants.
 * @see #ATTRIBUTE_DEFAULT_SPECIFIED
 * @see #ATTRIBUTE_DEFAULT_IMPLIED
 * @see #ATTRIBUTE_DEFAULT_REQUIRED
 * @see #ATTRIBUTE_DEFAULT_FIXED
 */
public int getAttributeDefaultValueType(String name, String aname) {
    final Object[] decl = getAttribute(name, aname);
    return (decl == null)
            ? ATTRIBUTE_DEFAULT_UNDECLARED
            : ((Integer) decl[2]).intValue();
}
/**
 * Register an attribute declaration for later retrieval.
 * <p>Array format of an attribute record:
 * <ul>
 * <li>[0] attribute type (Integer)</li>
 * <li>[1] default value (String)</li>
 * <li>[2] default value type (Integer)</li>
 * <li>[3] enumeration token list (String)</li>
 * <li>[4] expanded default value (String), filled in on demand</li>
 * </ul>
 * <p>If the attribute is already declared for the element, the new
 * declaration is silently ignored.
 * *TODO: do something with attribute types.
 * @param elName The element type name.
 * @param name The attribute name.
 * @param type The attribute type (an ATTRIBUTE_* constant).
 * @param enumeration The token list, or null.
 * @param value The default value, or null.
 * @param valueType The default value type
 *        (an ATTRIBUTE_DEFAULT_* constant).
 */
void setAttribute(String elName, String name, int type, String enumeration,
        String value, int valueType) throws java.lang.Exception {
    // Fetch the element's attribute table, creating it on first use.
    Hashtable table = getElementAttributes(elName);
    if (table == null) {
        table = new Hashtable();
    }

    // An earlier declaration of the same attribute wins.
    if (table.get(name) != null) {
        return;
    }

    final Object[] decl = new Object[] {
        Integer.valueOf(type),
        value,
        Integer.valueOf(valueType),
        enumeration,
        null
    };
    table.put(name.intern(), decl);

    // Use CONTENT_UNDECLARED to avoid overwriting
    // existing element declaration.
    setElement(elName, CONTENT_UNDECLARED, null, table);
}
/**
 * Retrieve the five-member array representing an
 * attribute declaration.
 * @param elName The name of the element.
 * @param name The name of the attribute.
 * @return The attribute record, or null if the element has no
 *         attribute table or the attribute is not in it.
 */
Object[] getAttribute(String elName, String name) {
    final Hashtable table = getElementAttributes(elName);
    return (table == null) ? null : (Object[]) table.get(name);
}
//
// Entities
//
/**
* Get declared entities.
* @return An Enumeration of all the entities declared for
* this XML document (the keys of the entity table). The results
* will be valid only after the DTD (if any) has been parsed.
* @see #getEntityType
* @see #getEntityPublicId
* @see #getEntitySystemId
* @see #getEntityValue
* @see #getEntityNotationName
*/
public Enumeration declaredEntities() {
// The entity table is keyed by entity name.
return entityInfo.keys();
}
/** Return the current element.
* <p>This simply exposes the parser's <code>currentElement</code> field.
* @return The current Element.
*/
public String getCurrentElement() {
// Ptolemy localization for MoMLParser so that we
// can get the currentElement from within MoMLParser.attribute()
return currentElement;
}
/**
 * Find the type of an entity.
 * @param ename The name of the entity.
 * @return An integer constant representing the entity type;
 *         ENTITY_UNDECLARED if the entity is unknown.
 * @see #ENTITY_UNDECLARED
 * @see #ENTITY_INTERNAL
 * @see #ENTITY_NDATA
 * @see #ENTITY_TEXT
 */
public int getEntityType(String ename) {
    final Object[] record = (Object[]) entityInfo.get(ename);
    return (record == null)
            ? ENTITY_UNDECLARED
            : ((Integer) record[0]).intValue();
}
/**
 * Return an external entity's public identifier, if any.
 * @param ename The name of the external entity.
 * @return The entity's public identifier, or null if the
 *         entity was not declared, if it is not an
 *         external entity, or if no public identifier was
 *         provided.
 * @see #getEntityType
 */
public String getEntityPublicId(String ename) {
    final Object[] record = (Object[]) entityInfo.get(ename);
    return (record == null) ? null : (String) record[1];
}
/**
 * Return an external entity's system identifier.
 * @param ename The name of the external entity.
 * @return The entity's system identifier, or null if the
 *         entity was not declared, or if it is not an
 *         external entity.
 * @see #getEntityType
 */
public String getEntitySystemId(String ename) {
    final Object[] record = (Object[]) entityInfo.get(ename);
    return (record == null) ? null : (String) record[2];
}
/**
 * Return the value of an internal entity.
 * @param ename The name of the internal entity.
 * @return The entity's value, or null if the entity was
 *         not declared, or if it is not an internal entity.
 * @see #getEntityType
 */
public String getEntityValue(String ename) {
    final Object[] record = (Object[]) entityInfo.get(ename);
    return (record == null) ? null : (String) record[3];
}
/**
 * Get the notation name associated with an NDATA entity.
 * @param eName The NDATA entity name.
 * @return The associated notation name, or null if the
 *         entity was not declared, or if it is not an
 *         NDATA entity.
 * @see #getEntityType
 */
public String getEntityNotationName(String eName) {
    final Object[] record = (Object[]) entityInfo.get(eName);
    return (record == null) ? null : (String) record[4];
}
/**
* Register an internal entity declaration for later retrieval.
* <p>The entity is stored with class ENTITY_INTERNAL and no
* public identifier, system identifier or notation name.
* @param eName The entity name.
* @param value The entity's replacement text.
*/
void setInternalEntity(String eName, String value) {
setEntity(eName, ENTITY_INTERNAL, null, null, value, null);
}
/**
* Register an external data entity.
* <p>The entity is stored with class ENTITY_NDATA and no value.
* @param eName The entity name.
* @param pubid The public identifier, or null.
* @param sysid The system identifier.
* @param nName The name of the associated notation.
*/
void setExternalDataEntity(String eName, String pubid, String sysid,
String nName) {
setEntity(eName, ENTITY_NDATA, pubid, sysid, null, nName);
}
/**
* Register an external text entity.
* <p>The entity is stored with class ENTITY_TEXT and neither a value
* nor a notation name.
* @param eName The entity name.
* @param pubid The public identifier, or null.
* @param sysid The system identifier.
*/
void setExternalTextEntity(String eName, String pubid, String sysid) {
setEntity(eName, ENTITY_TEXT, pubid, sysid, null, null);
}
/**
 * Register an entity declaration for later retrieval.
 * <p>Array format of an entity record:
 * <ul>
 * <li>[0] entity class (Integer)</li>
 * <li>[1] public identifier (String)</li>
 * <li>[2] system identifier (String)</li>
 * <li>[3] value (String)</li>
 * <li>[4] notation name (String)</li>
 * </ul>
 * <p>If an entity of the same name is already registered, the new
 * declaration is silently ignored.
 * @param eName The entity name.
 * @param eClass The entity class (an ENTITY_* constant).
 * @param pubid The public identifier, or null.
 * @param sysid The system identifier, or null.
 * @param value The replacement text, or null.
 * @param nName The notation name, or null.
 */
void setEntity(String eName, int eClass, String pubid, String sysid,
        String value, String nName) {
    // The first declaration wins.
    if (entityInfo.get(eName) != null) {
        return;
    }
    final Object[] record = new Object[] {
        Integer.valueOf(eClass), pubid, sysid, value, nName
    };
    entityInfo.put(eName, record);
}
//
// Notations.
//
/**
* Get declared notations.
* @return An Enumeration of all the notations declared for
* this XML document (the keys of the notation table). The results
* will be valid only after the DTD (if any) has been parsed.
* @see #getNotationPublicId
* @see #getNotationSystemId
*/
public Enumeration declaredNotations() {
// The notation table is keyed by notation name.
return notationInfo.keys();
}
/**
 * Return the public identifier recorded for a notation.
 * <p>Typically used to resolve a notation name that arrived as an
 * attribute value or as the notation of an NDATA entity.
 * @param nname The name of the notation.
 * @return The public identifier, or null if the notation was not
 *         declared or was declared without a public identifier.
 * @see #getNotationSystemId
 */
public String getNotationPublicId(String nname) {
    // Notation records are {public id, system id}.
    Object[] record = (Object[]) notationInfo.get(nname);
    return (record == null) ? null : (String) record[0];
}
/**
 * Return the system identifier recorded for a notation.
 * <p>Typically used to resolve a notation name that arrived as an
 * attribute value or as the notation of an NDATA entity.
 * @param nname The name of the notation.
 * @return The system identifier, or null if no such notation was
 *         declared.
 * @see #getNotationPublicId
 */
public String getNotationSystemId(String nname) {
    // Notation records are {public id, system id}.
    Object[] record = (Object[]) notationInfo.get(nname);
    return (record == null) ? null : (String) record[1];
}
/**
 * Record a notation declaration so that it can be queried later.
 * <p>The stored record is a two-element array:
 * <ol>
 * <li>public identifier
 * <li>system identifier
 * </ol>
 * <p>Declaring the same notation twice is reported through error()
 * and the first declaration is kept.
 * @param nname The name of the notation.
 * @param pubid The public identifier, or null.
 * @param sysid The system identifier, or null.
 * @exception java.lang.Exception If error() raises one for a
 *            duplicate declaration.
 */
void setNotation(String nname, String pubid, String sysid)
        throws java.lang.Exception {
    if (notationInfo.get(nname) != null) {
        error("multiple declarations of notation", nname, null);
        return;
    }
    notationInfo.put(nname, new Object[] { pubid, sysid });
}
//
// Location.
//
/**
* Return the current line number.
* <p>The counter starts at 1 for each parse and is advanced by readCh()
* for every newline read; cleanupVariables() resets it to -1 once a
* parse has finished.
* @return The current line number.
*/
public int getLineNumber() {
return line;
}
/**
* Return the current column number.
* <p>The counter is reset to 0 by readCh() after each newline and
* incremented for every other character read; cleanupVariables()
* resets it to -1 once a parse has finished.
* @return The current column number.
*/
public int getColumnNumber() {
return column;
}
//////////////////////////////////////////////////////////////////////
// High-level I/O.
//////////////////////////////////////////////////////////////////////
/**
* Read a single character from the readBuffer.
* <p>The readDataChunk() method maintains the buffer.
* <p>If we hit the end of an entity, try to pop the stack and
* keep going.
* <p>(This approach doesn't really enforce XML's rules about
* entity boundaries, but this is not currently a validating
* parser).
* <p>This routine also attempts to keep track of the current
* position in external entities, but it's not entirely accurate.
* <p>In the DTD and entity-value contexts, a '%' that is not followed
* by whitespace is treated as a parameter-entity reference: the
* reference is expanded through parsePEReference() and the first
* character of the result is returned instead of the '%'.
* @return The next available input character.
* @exception java.lang.Exception Propagated from readDataChunk(),
* popInput() (which throws an EOFException when no input is left)
* or parsePEReference().
* @see #unread(char)
* @see #readDataChunk
* @see #readBuffer
* @see #line
*/
char readCh() throws java.lang.Exception {
char c;
// As long as there's nothing in the
// read buffer, try reading more data
// (for an external entity) or popping
// the entity stack (for either).
while (readBufferPos >= readBufferLength) {
switch (sourceType) {
case INPUT_READER:
case INPUT_EXTERNAL:
case INPUT_STREAM:
// Stream-backed source: refill the buffer first.
readDataChunk();
// An empty refill means this source is exhausted; fall back to
// the enclosing input, refilling again if that one is empty too.
while (readBufferLength < 1) {
popInput();
if (readBufferLength < 1) {
readDataChunk();
}
}
break;
default:
// Any other source type has nothing to refill from: resume the
// enclosing input.
popInput();
break;
}
}
c = readBuffer[readBufferPos++];
// This is a particularly nasty bit
// of code, that checks for a parameter
// entity reference but peeks ahead to
// catch the '%' in parameter entity
// declarations.
if ((c == '%')
&& ((context == CONTEXT_DTD) || (context == CONTEXT_ENTITYVALUE))) {
// Peek at the following character and put it straight back.
char c2 = readCh();
unread(c2);
if (!isWhitespace(c2)) {
// A real reference: expand it and return the first character of
// the expansion. Note that the '%' itself is not counted in the
// column position on this path.
parsePEReference(context == CONTEXT_ENTITYVALUE);
return readCh();
}
}
// Track the position of the character being returned.
if (c == '\n') {
line++;
column = 0;
} else {
column++;
}
return c;
}
/**
 * Return one character to the current input.
 * <p>In the usual case the character is written back into the
 * readBuffer just before the read position. When the read position
 * is already at the start of the buffer, the character is pushed as
 * a new one-character internal input instead.
 * <p>Unreading a newline also steps the line counter back and sets
 * the column to -1.
 * @param c The character to push back.
 * @see #readCh
 * @see #unread(char[], int)
 * @see #readBuffer
 */
void unread(char c) throws java.lang.Exception {
    // Undo the position bookkeeping done by readCh() for a newline.
    if (c == '\n') {
        line--;
        column = -1;
    }
    // Boundary case: no room in front of the read position.
    if (readBufferPos <= 0) {
        pushString(null, Character.toString(c));
        return;
    }
    // Normal case: step back and overwrite in place.
    readBufferPos--;
    readBuffer[readBufferPos] = c;
}
/**
 * Return a run of characters to the current input.
 * <p>NOTE: you must <em>never</em> push back characters that you
 * haven't actually read: use pushString() instead. When the run is
 * shorter than the current read position, this method only moves the
 * read position backwards and relies on the buffer still holding
 * those characters.
 * <p>Otherwise the array is pushed as a new input source and the
 * source type is set to INPUT_BUFFER.
 * @param ch The characters that were read.
 * @param length The number of characters of <code>ch</code> to push back.
 * @see #readCh
 * @see #unread(char)
 * @see #readBuffer
 * @see #pushString
 */
void unread(char[] ch, int length) throws java.lang.Exception {
    // Step the line counter back once for every newline being returned.
    int idx = 0;
    while (idx < length) {
        if (ch[idx] == '\n') {
            line--;
            column = -1;
        }
        idx++;
    }
    // Not enough consumed characters in the buffer: push a new source.
    if (length >= readBufferPos) {
        pushCharArray(null, ch, 0, length);
        sourceType = INPUT_BUFFER;
        return;
    }
    // Cheap case: simply rewind within the current buffer.
    readBufferPos -= length;
}
/**
* Push a new external input source.
* <p>The source will be either an external text entity, or the DTD
* external subset.
* <p>The current input is saved with pushInput(), a fresh read buffer
* is allocated, and the source is then chosen in this order of
* precedence: an explicit Reader, an explicit InputStream, or a
* connection opened to the system identifier. The handler's
* resolveEntity() callback may replace the system identifier or
* supply a stream or reader of its own.
* <p>TO DO: Right now, this method always attempts to autodetect
* the encoding unless one is supplied; in the future, it should
* also look at the headers with an HTTP connection.
* @param ename The entity name handed to pushInput() (used there for
* recursion detection).
* @param publicId The public identifier, passed to the handler's
* resolveEntity() callback.
* @param systemId The system identifier; it is made absolute against
* the current external entity's URL (or against baseURI), and is the
* URL opened when neither a reader nor a stream is available.
* @param reader A character stream to read from, or null. When
* non-null it takes precedence over <code>stream</code> and the URL.
* @param stream A byte stream to read from, or null.
* @param encoding An encoding name handed to checkEncoding(), or null
* to autodetect with detectEncoding().
* @exception Exception Propagated from the handler callbacks, from
* opening the URL, or from reading the declaration.
* @see XmlHandler#resolveEntity
* @see #pushString
* @see #sourceType
* @see #pushInput
* @see #detectEncoding
* @see #readBuffer
*/
void pushURL(String ename, String publicId, String systemId, Reader reader,
InputStream stream, String encoding) throws java.lang.Exception {
URL url;
boolean ignoreEncoding = false;
// Push the existing status.
pushInput(ename);
// Create a new read buffer.
// (Note the four-character margin)
readBuffer = new char[READ_BUFFER_MAX + 4];
readBufferPos = 0;
readBufferLength = 0;
readBufferOverflow = -1;
is = null;
// NOTE(review): the line counter restarts at 1 for the new source but
// the column counter is not reset here - confirm whether intended.
line = 1;
currentByteCount = 0;
// Flush any remaining data.
dataBufferFlush();
// Make the URL absolute.
if ((systemId != null) && (externalEntity != null)) {
// Resolve relative to the entity we are currently reading.
systemId = new URL(externalEntity.getURL(), systemId).toString();
} else if (baseURI != null) {
// Otherwise resolve relative to the document's base URI. Any failure
// (including a null systemId) is deliberately swallowed here.
try {
systemId = new URL(new URL(baseURI), systemId).toString();
} catch (Throwable throwable) {
// Ignore this and stick with the old systemId
}
}
// See if the application wants to
// redirect the system ID and/or
// supply its own character stream.
if ((systemId != null) && (handler != null)) {
Object input = handler.resolveEntity(publicId, systemId);
if (input != null) {
// The callback's result type decides what it replaces; any other
// type is ignored.
if (input instanceof String) {
systemId = (String) input;
} else if (input instanceof InputStream) {
stream = (InputStream) input;
} else if (input instanceof Reader) {
reader = (Reader) input;
}
}
}
// Start the entity.
if (handler != null) {
if (systemId != null) {
handler.startExternalEntity(systemId);
} else {
handler.startExternalEntity("[external stream]");
}
}
// Figure out what we're reading from.
if (reader != null) {
// There's an explicit character stream.
// Characters are already decoded, so any encoding named in the
// declaration is ignored and no byte-level detection is needed.
sourceType = INPUT_READER;
this.reader = reader;
tryEncodingDecl(true);
return;
} else if (stream != null) {
sourceType = INPUT_STREAM;
is = stream;
} else {
// We have to open our own stream
// to the URL.
// Set the new status
sourceType = INPUT_EXTERNAL;
url = new URL(systemId);
externalEntity = url.openConnection();
externalEntity.connect();
is = externalEntity.getInputStream();
}
// If we get to here, there must be
// an InputStream available.
// detectEncoding() relies on mark()/reset(), so wrap streams that
// do not support it.
if (!is.markSupported()) {
is = new BufferedInputStream(is);
}
// Attempt to detect the encoding.
// NOTE(review): URLConnection.getContentEncoding() reports the
// content-encoding header, which normally names a transfer coding
// such as gzip rather than a character set - confirm this is what
// checkEncoding() expects.
if ((encoding == null) && (externalEntity != null)) {
encoding = externalEntity.getContentEncoding();
}
if (encoding != null) {
// An externally supplied encoding overrides the declaration.
checkEncoding(encoding, false);
ignoreEncoding = true;
} else {
detectEncoding();
ignoreEncoding = false;
}
// Read an XML or text declaration.
tryEncodingDecl(ignoreEncoding);
}
/**
 * Consume an XML or text declaration at the start of an entity, if
 * one is present.
 * <p>When the input starts with "&lt;?xml" followed by whitespace, the
 * declaration is parsed as a text declaration if there is saved input
 * on the input stack, and as an XML declaration otherwise. When
 * "&lt;?xml" is not followed by whitespace, the "xml" is pushed back
 * and the construct is parsed as an ordinary processing instruction.
 * @param ignoreEncoding Passed through to the declaration parser.
 */
void tryEncodingDecl(boolean ignoreEncoding) throws java.lang.Exception {
    // Nothing to do unless the entity opens with "<?xml".
    if (!tryRead("<?xml")) {
        return;
    }
    // "<?xml" glued to more name characters is just a PI target.
    if (!tryWhitespace()) {
        unread("xml".toCharArray(), 3);
        parsePI();
        return;
    }
    if (inputStack.size() > 0) {
        parseTextDecl(ignoreEncoding);
    } else {
        parseXMLDecl(ignoreEncoding);
    }
}
/**
 * Attempt to detect the encoding of an entity.
 * <p>The trick here (as suggested in the XML standard) is that
 * any entity not in UTF-8, or in UCS-2 with a byte-order mark,
 * <b>must</b> begin with an XML declaration or an encoding
 * declaration; we simply have to look for "&lt;?xml" in various
 * encodings.
 * <p>This method has no way to distinguish among 8-bit encodings.
 * Instead, it assumes UTF-8, then (possibly) revises its assumption
 * later in checkEncoding(). Any ASCII-derived 8-bit encoding
 * should work, but most will be rejected later by checkEncoding().
 * <p>I don't currently detect EBCDIC, since I'm concerned that it
 * could also be a valid UTF-8 sequence; I'll have to do more checking
 * later.
 * <p>The four signature bytes are peeked with mark()/reset(), so the
 * current stream must support marking. Only a UCS-2 byte-order mark
 * is actually consumed.
 * @exception IOException If the stream ends before four bytes could
 *            be read.
 * @exception java.lang.Exception Propagated from error() or
 *            read8bitEncodingDeclaration().
 * @see #tryEncoding(byte[], byte, byte, byte, byte)
 * @see #tryEncoding(byte[], byte, byte)
 * @see #checkEncoding
 * @see #read8bitEncodingDeclaration
 */
void detectEncoding() throws java.lang.Exception {
    byte[] signature = new byte[4];
    // Read the first four bytes for
    // autodetection.
    is.mark(4);
    // A single read() is allowed to return fewer bytes than requested
    // even when more will follow, so keep reading until the signature
    // is complete or the stream really ends.
    int bytesRead = 0;
    while (bytesRead < signature.length) {
        int n = is.read(signature, bytesRead, signature.length - bytesRead);
        if (n < 0) {
            break;
        }
        bytesRead += n;
    }
    if (bytesRead != signature.length) {
        throw new IOException("Read only " + bytesRead
                + " bytes instead of " + signature.length);
    }
    is.reset();
    // Look for a known signature.
    if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, (byte) 0x00,
            (byte) 0x3c)) {
        // UCS-4 must begin with "<?xml"
        // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
        encoding = ENCODING_UCS_4_1234;
    } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
            (byte) 0x00, (byte) 0x00)) {
        // UCS-4 must begin with "<?xml"
        // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
        encoding = ENCODING_UCS_4_4321;
    } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
            (byte) 0x3c, (byte) 0x00)) {
        // UCS-4 must begin with "<?xml"
        // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
        encoding = ENCODING_UCS_4_2143;
    } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
            (byte) 0x00, (byte) 0x00)) {
        // UCS-4 must begin with "<?xml"
        // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3412)
        encoding = ENCODING_UCS_4_3412;
    } else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) {
        // UCS-2 with a byte-order marker.
        // 0xfe 0xff: UCS-2, big-endian (12)
        encoding = ENCODING_UCS_2_12;
        // Skip the two-byte mark so it is not decoded as content.
        is.read();
        is.read();
    } else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) {
        // UCS-2 with a byte-order marker.
        // 0xff 0xfe: UCS-2, little-endian (21)
        encoding = ENCODING_UCS_2_21;
        // Skip the two-byte mark so it is not decoded as content.
        is.read();
        is.read();
    } else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
            (byte) 0x00, (byte) 0x3f)) {
        // UCS-2 without a BOM must begin with "<?xml"
        // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
        encoding = ENCODING_UCS_2_12;
        error("no byte-order mark for UCS-2 entity", null, null);
    } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
            (byte) 0x3f, (byte) 0x00)) {
        // UCS-2 without a BOM must begin with "<?xml"
        // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
        encoding = ENCODING_UCS_2_21;
        error("no byte-order mark for UCS-2 entity", null, null);
    } else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f,
            (byte) 0x78, (byte) 0x6d)) {
        // Some kind of 8-bit encoding with "<?xm"
        // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
        encoding = ENCODING_UTF_8;
        read8bitEncodingDeclaration();
    } else {
        // Some kind of 8-bit encoding without "<?xml"
        // (otherwise) UTF-8 without encoding/XML declaration
        encoding = ENCODING_UTF_8;
    }
}
/**
 * Test whether the first four bytes read match a given signature.
 * <p>Helper for detectEncoding(), which calls it with the byte
 * patterns that the start of an XML declaration produces in the
 * various encodings.
 * @param sig The first four bytes read.
 * @param b1 The first byte of the signature
 * @param b2 The second byte of the signature
 * @param b3 The third byte of the signature
 * @param b4 The fourth byte of the signature
 * @return true if all four bytes match, in order.
 * @see #detectEncoding
 */
boolean tryEncoding(byte[] sig, byte b1, byte b2, byte b3, byte b4) {
    // Compare byte by byte, bailing out on the first mismatch.
    if (sig[0] != b1) {
        return false;
    }
    if (sig[1] != b2) {
        return false;
    }
    if (sig[2] != b3) {
        return false;
    }
    return sig[3] == b4;
}
/**
 * Test whether the first two bytes read match a given signature.
 * <p>Helper for detectEncoding(), which uses it to recognise a
 * UCS-2 byte-order mark. Only the first two entries of
 * <code>sig</code> are examined.
 * @param sig The first four bytes read.
 * @param b1 The first byte of the signature
 * @param b2 The second byte of the signature
 * @return true if both bytes match, in order.
 * @see #detectEncoding
 */
boolean tryEncoding(byte[] sig, byte b1, byte b2) {
    if (sig[0] != b1) {
        return false;
    }
    return sig[1] == b2;
}
/**
 * Make a string the current input.
 * <p>Used both to expand an internal entity and to backtrack during
 * the parse. The string is converted to a character array and handed
 * to pushCharArray(), which does the actual work.
 * @param ename The name of the entity being expanded, or null when
 *        the push is not associated with an entity.
 * @param s The string to push back onto input.
 * @see #pushCharArray
 */
void pushString(String ename, String s) throws java.lang.Exception {
    pushCharArray(ename, s.toCharArray(), 0, s.length());
}
/**
* Push a new internal input source.
* <p>This method is useful for expanding an internal entity,
* or for unreading a string of characters. It saves the current
* input with pushInput() and then makes the supplied array the
* readBuffer, instead of characters converted from an input byte
* stream.
* <p>The array is used directly (it is not copied), and zero- or
* one-character arrays get no special treatment here.
* <p>NOTE(review): readBufferLength is set to <code>length</code>,
* not <code>start + length</code>. The nearby callers (pushString()
* and unread()) always pass a start of 0, where the two agree;
* confirm the intended meaning before passing a non-zero start.
* @param ename The name of the entity being expanded, or null; it is
* handed to pushInput().
* @param ch The char array to push.
* @param start The index at which reading of the array begins.
* @param length The value stored as the new readBufferLength.
* @see #pushString
* @see #pushURL
* @see #readBuffer
* @see #sourceType
* @see #pushInput
*/
void pushCharArray(String ename, char[] ch, int start, int length)
throws java.lang.Exception {
// Push the existing status
pushInput(ename);
// Switch over to reading from the supplied array.
sourceType = INPUT_INTERNAL;
readBuffer = ch;
readBufferPos = start;
readBufferLength = length;
// An in-memory source never has a pending overflow character.
readBufferOverflow = -1;
}
/**
 * Save the current input source onto the stack.
 * <p>This method saves all of the global variables associated with
 * the current input source, so that they can be restored when a new
 * input source has finished. It also tests for entity recursion.
 * <p>The entity name is always pushed onto the entity stack (null for
 * unnamed pushes). Unless there is no current input (INPUT_NONE), a
 * snapshot of the following variables is then pushed onto the input
 * stack as a twelve-element array, in this order:
 * <ol start="0">
 * <li>sourceType
 * <li>externalEntity
 * <li>readBuffer
 * <li>readBufferPos
 * <li>readBufferLength
 * <li>line
 * <li>encoding
 * <li>readBufferOverflow
 * <li>is
 * <li>currentByteCount
 * <li>column
 * <li>reader
 * </ol>
 * @param ename The name of the entity (if any) causing the new input.
 * @exception java.lang.Exception If error() raises one for a
 *            recursive entity reference.
 * @see #popInput
 * @see #sourceType
 * @see #externalEntity
 * @see #readBuffer
 * @see #readBufferPos
 * @see #readBufferLength
 * @see #line
 * @see #encoding
 */
void pushInput(String ename) throws java.lang.Exception {
    // Check for entity recursion.
    if (ename != null) {
        Enumeration entities = entityStack.elements();
        while (entities.hasMoreElements()) {
            // The stack also holds null entries (pushed for unnamed
            // inputs), so compare from the side known to be non-null.
            if (ename.equals(entities.nextElement())) {
                error("recursive reference to entity", ename, null);
            }
        }
    }
    entityStack.push(ename);
    // Don't bother if there is no input.
    if (sourceType == INPUT_NONE) {
        return;
    }
    // Set up a snapshot of the current
    // input source.
    Object[] input = new Object[12];
    input[0] = Integer.valueOf(sourceType);
    input[1] = externalEntity;
    input[2] = readBuffer;
    input[3] = Integer.valueOf(readBufferPos);
    input[4] = Integer.valueOf(readBufferLength);
    input[5] = Integer.valueOf(line);
    input[6] = Integer.valueOf(encoding);
    input[7] = Integer.valueOf(readBufferOverflow);
    input[8] = is;
    input[9] = Integer.valueOf(currentByteCount);
    input[10] = Integer.valueOf(column);
    input[11] = reader;
    // Push it onto the stack.
    inputStack.push(input);
}
/**
 * Restore a previous input source.
 * <p>Before restoring, any buffered character data is flushed and the
 * handler (if any) is told that the external entity has ended: with
 * the connection's URL for INPUT_EXTERNAL, or with baseURI (when set)
 * for INPUT_STREAM and INPUT_READER.
 * <p>The snapshot taken by pushInput() is then popped and all of the
 * global variables associated with the input source are restored
 * from it; the matching entry of the entity stack is discarded.
 * @exception java.io.EOFException
 *            If there are no more entries on the input stack.
 * @see #pushInput
 * @see #sourceType
 * @see #externalEntity
 * @see #readBuffer
 * @see #readBufferPos
 * @see #readBufferLength
 * @see #line
 * @see #encoding
 */
void popInput() throws java.lang.Exception {
    // Close out the source that has just been exhausted.
    switch (sourceType) {
    case INPUT_EXTERNAL:
        dataBufferFlush();
        if ((handler != null) && (externalEntity != null)) {
            handler.endExternalEntity(externalEntity.getURL().toString());
        }
        break;
    case INPUT_STREAM:
    case INPUT_READER:
        dataBufferFlush();
        if ((baseURI != null) && (handler != null)) {
            handler.endExternalEntity(baseURI);
        }
        break;
    default:
        break;
    }
    // Nothing saved means the document itself has run out.
    if (inputStack.isEmpty()) {
        throw new EOFException("XML parser input stack was empty, "
                + "end of file or xml fragment reached. "
                + "Perhaps there is a missing '>' "
                + "or a comment is unterminated by '->'?");
    }
    Object[] saved = (Object[]) inputStack.pop();
    entityStack.pop();
    // Unpack the snapshot in the slot order used by pushInput().
    sourceType = ((Integer) saved[0]).intValue();
    externalEntity = (URLConnection) saved[1];
    readBuffer = (char[]) saved[2];
    readBufferPos = ((Integer) saved[3]).intValue();
    readBufferLength = ((Integer) saved[4]).intValue();
    line = ((Integer) saved[5]).intValue();
    encoding = ((Integer) saved[6]).intValue();
    readBufferOverflow = ((Integer) saved[7]).intValue();
    is = (InputStream) saved[8];
    currentByteCount = ((Integer) saved[9]).intValue();
    column = ((Integer) saved[10]).intValue();
    reader = (Reader) saved[11];
}
/**
 * Try to consume one specific character from the input.
 * <p>On a match the character stays consumed, so the caller must not
 * read it again. On a mismatch the character that was read is pushed
 * back and the input is left as it was.
 * @param delim The character that should appear next. The comparison
 *        is exact.
 * @return true if the character was successfully read, or false if
 *         it was not.
 * @see #tryRead(String)
 */
boolean tryRead(char delim) throws java.lang.Exception {
    char next = readCh();
    if (next != delim) {
        // Not what we wanted: put it back.
        unread(next);
        return false;
    }
    return true;
}
/**
 * Try to consume one specific string from the input.
 * <p>On a match the whole string stays consumed, so the caller must
 * not read it again. On a mismatch everything read so far is pushed
 * back: first the offending character, then (if any) the prefix of
 * the delimiter that had already matched.
 * <p>When the very first character fails to match, only that single
 * character is pushed back, which is the cheap and most common case.
 * @param delim The string that should appear next.
 * @return true if the string was successfully read, or false if
 *         it was not.
 * @see #tryRead(char)
 */
boolean tryRead(String delim) throws java.lang.Exception {
    char[] expected = delim.toCharArray();
    int matched = 0;
    while (matched < expected.length) {
        char actual = readCh();
        if (actual != expected[matched]) {
            // Undo in reverse order of reading.
            unread(actual);
            if (matched > 0) {
                unread(expected, matched);
            }
            return false;
        }
        matched++;
    }
    return true;
}
/**
 * Try to consume a run of whitespace from the input.
 * <p>If the next character is whitespace, it and any whitespace
 * that skipWhitespace() finds after it are consumed. Otherwise the
 * character is pushed back and the input is left as it was.
 * @return true if whitespace was found.
 */
boolean tryWhitespace() throws java.lang.Exception {
    char next = readCh();
    if (!isWhitespace(next)) {
        unread(next);
        return false;
    }
    skipWhitespace();
    return true;
}
/**
 * Collect all input up to a delimiter into the data buffer.
 * <p>This is especially useful for scanning marked sections. The
 * delimiter itself is consumed but not appended.
 * <p>This is a little inefficient right now, since it calls tryRead()
 * for every character.
 * <p>Running out of input before the delimiter is found is reported
 * through error(), quoting the line on which the scan began.
 * @param delim The string delimiter
 * @see #tryRead(String)
 * @see #readCh
 */
void parseUntil(String delim) throws java.lang.Exception {
    // Remember where we started, for the error message.
    final int startLine = line;
    try {
        while (!tryRead(delim)) {
            dataBufferAppend(readCh());
        }
    } catch (EOFException e) {
        error("end of input while looking for delimiter (started on line "
                + startLine + ')', null, delim);
    }
}
// Modified November 14, 1998 by Steve Neuendorffer
// There was a bug because this was not skipping things that looked
// like parameter entities properly.
// Copied the appropriate code from readCh, excluding the lines referring to
// '%'.
/**
* Skip all data until we find the specified string.
* <p>This is especially useful for scanning comments.
* <p>This is a little inefficient right now, since it calls tryRead()
* for every character.
* <p>Characters are discarded with an inlined copy of the buffer
* handling in readCh(), minus the '%' test, so text that looks like
* a parameter-entity reference is skipped rather than expanded.
* (tryRead() itself still goes through readCh().)
* @param delim The string delimiter
* @exception java.lang.Exception Propagated from tryRead(),
* readDataChunk() or popInput() (which throws an EOFException when
* no input is left).
* @see #readCh
*/
void skipUntil(String delim) throws java.lang.Exception {
while (!tryRead(delim)) {
char c;
// As long as there's nothing in the
// read buffer, try reading more data
// (for an external entity) or popping
// the entity stack (for either).
while (readBufferPos >= readBufferLength) {
switch (sourceType) {
case INPUT_READER:
case INPUT_EXTERNAL:
case INPUT_STREAM:
// Stream-backed source: refill, and fall back to the enclosing
// input for as long as the refill comes back empty.
readDataChunk();
while (readBufferLength < 1) {
popInput();
if (readBufferLength < 1) {
readDataChunk();
}
}
break;
default:
popInput();
break;
}
}
// Consume and discard one character, keeping the position counters
// up to date.
c = readBuffer[readBufferPos++];
if (c == '\n') {
line++;
column = 0;
} else {
column++;
}
}
}
/**
 * Read just the encoding declaration (or XML declaration) at the
 * start of an external entity.
 * <p>When this method is called, we know that the declaration is
 * present (or appears to be). We also know that the entity is
 * in some sort of ASCII-derived 8-bit encoding.
 * <p>The idea of this is to let us read what the 8-bit encoding is
 * before we've committed to converting any more of the file; the
 * XML or encoding declaration must be in 7-bit ASCII, so we're
 * safe as long as we don't go past it.
 * <p>Bytes are copied one-for-one into the start of the readBuffer,
 * up to and including the first '&gt;'. Hitting end of input, or
 * filling the buffer without finding '&gt;', is reported through
 * error() and ends the scan.
 * @exception java.lang.Exception Propagated from the stream or
 *            from error().
 */
void read8bitEncodingDeclaration() throws java.lang.Exception {
    int ch;
    readBufferPos = readBufferLength = 0;
    while (true) {
        ch = is.read();
        // Test for end of input before storing, so that the -1
        // sentinel never lands in the buffer as a bogus character.
        if (ch == -1) {
            error("end of file before end of XML or encoding declaration.",
                    null, "?>");
            return;
        }
        readBuffer[readBufferLength++] = (char) ch;
        if (ch == '>') {
            return;
        }
        if (readBuffer.length == readBufferLength) {
            error("unfinished XML or encoding declaration", null, null);
            // The buffer is full: stop rather than write past its end
            // should error() return.
            return;
        }
    }
}
//////////////////////////////////////////////////////////////////////
// Low-level I/O.
//////////////////////////////////////////////////////////////////////
/**
* Read a chunk of data from an external input source.
* <p>For byte streams this is simply a front-end that fills the
* rawReadBuffer with bytes, then calls the appropriate encoding
* handler to decode them into the readBuffer. For a Reader, the
* characters are read straight into the readBuffer.
* <p>A carriage return held over from the previous chunk (see
* filterCR()) is placed at the front of the buffer first, so that
* it is filtered together with the new data.
* @exception java.lang.Exception Propagated from the underlying
* reader or stream, or from the decoding routines.
* @see #encoding
* @see #rawReadBuffer
* @see #readBuffer
* @see #filterCR
* @see #copyUtf8ReadBuffer
* @see #copyIso8859_1ReadBuffer
*/
void readDataChunk() throws java.lang.Exception {
int count;
// See if we have any overflow.
if (readBufferOverflow > -1) {
// Re-insert the held-over character ahead of the new data; the
// decoders below start writing at readBufferPos.
readBuffer[0] = (char) readBufferOverflow;
readBufferOverflow = -1;
readBufferPos = 1;
sawCR = true;
} else {
readBufferPos = 0;
sawCR = false;
}
// Special situation -- we're taking
// input from a character stream.
if (sourceType == INPUT_READER) {
count = reader.read(readBuffer, readBufferPos, READ_BUFFER_MAX - 1);
if (count < 0) {
// End of the character stream: flag the buffer as empty.
readBufferLength = -1;
} else {
readBufferLength = readBufferPos + count;
// No decoder has scanned for carriage returns here, so filter
// unconditionally.
filterCR();
sawCR = false;
}
return;
}
// Read as many bytes as possible
// into the read buffer.
count = is.read(rawReadBuffer, 0, READ_BUFFER_MAX);
// Dispatch to an encoding-specific
// reader method to populate the
// readBuffer.
// (The shift arguments give the bit position of each successive
// byte within the decoded value.)
switch (encoding) {
case ENCODING_UTF_8:
copyUtf8ReadBuffer(count);
break;
case ENCODING_ISO_8859_1:
copyIso8859_1ReadBuffer(count);
break;
case ENCODING_UCS_2_12:
copyUcs2ReadBuffer(count, 8, 0);
break;
case ENCODING_UCS_2_21:
copyUcs2ReadBuffer(count, 0, 8);
break;
case ENCODING_UCS_4_1234:
copyUcs4ReadBuffer(count, 24, 16, 8, 0);
break;
case ENCODING_UCS_4_4321:
copyUcs4ReadBuffer(count, 0, 8, 16, 24);
break;
case ENCODING_UCS_4_2143:
copyUcs4ReadBuffer(count, 16, 24, 0, 8);
break;
case ENCODING_UCS_4_3412:
copyUcs4ReadBuffer(count, 8, 0, 24, 16);
break;
}
// Filter out all carriage returns
// if we've seen any.
if (sawCR) {
filterCR();
sawCR = false;
}
// Reset the position.
readBufferPos = 0;
// NOTE(review): if is.read() returned -1 at end of stream, this
// subtracts one from the byte count - confirm that is harmless.
currentByteCount += count;
}
/**
* Filter carriage returns in the read buffer.
* <p>CRLF becomes LF; CR becomes LF.
* <p>The buffer is compacted in place and readBufferLength is
* updated to the new length. A carriage return that is the very last
* character of the buffer is not converted: it is removed and stored
* in readBufferOverflow instead, presumably so that a CRLF pair split
* across two chunks can still be recognised when readDataChunk()
* re-inserts it ahead of the next chunk.
* @see #readDataChunk
* @see #readBuffer
* @see #readBufferOverflow
*/
void filterCR() {
// i is the write index, j the read index; j runs ahead of i by one
// for every CRLF pair collapsed so far.
int i;
int j;
readBufferOverflow = -1;
loop: for (i = 0, j = 0; j < readBufferLength; i++, j++) {
switch (readBuffer[j]) {
case '\r':
if (j == (readBufferLength - 1)) {
// Trailing CR: hold it over for the next chunk and stop.
readBufferOverflow = '\r';
readBufferLength--;
break loop;
} else if (readBuffer[j + 1] == '\n') {
// CRLF: skip the LF so the pair yields a single newline.
j++;
}
readBuffer[i] = '\n';
break;
case '\n':
default:
readBuffer[i] = readBuffer[j];
break;
}
}
// i now counts the characters kept.
readBufferLength = i;
}
/**
 * Convert a buffer of UTF-8-encoded bytes into UTF-16 characters.
 * <p>When readDataChunk() calls this method, the raw bytes are in
 * rawReadBuffer, and the final characters will appear in
 * readBuffer, starting at readBufferPos.
 * <p>The tricky part of this is dealing with UTF-8 multi-byte
 * sequences, but it doesn't seem to slow things down too much.
 * A sequence that straddles the end of the raw buffer is completed
 * by getNextUtf8Byte(), which reads the missing bytes directly from
 * the input stream.
 * <p>Four-byte sequences are converted to a UTF-16 surrogate pair.
 * Invalid lead bytes, and four-byte sequences whose value lies
 * outside U+10000..U+10FFFF, are reported through encodingError()
 * and produce no output character.
 * <p>Sets sawCR if a carriage return is decoded, and leaves the
 * number of characters now in the buffer in readBufferLength.
 * @param count The number of bytes to convert.
 * @exception java.lang.Exception Propagated from encodingError() or
 *            from the input stream.
 * @see #readDataChunk
 * @see #rawReadBuffer
 * @see #readBuffer
 * @see #getNextUtf8Byte
 */
void copyUtf8ReadBuffer(int count) throws java.lang.Exception {
    int i = 0;
    int j = readBufferPos;
    while (i < count) {
        // Mask to an unsigned value so that error reports can show
        // the offending byte.
        int b1 = rawReadBuffer[i++] & 0xff;
        // Determine whether we are dealing
        // with a one-, two-, three-, or four-
        // byte sequence.
        if ((b1 & 0x80) == 0) {
            // 1-byte sequence: 000000000xxxxxxx = 0xxxxxxx
            readBuffer[j++] = (char) b1;
        } else if ((b1 & 0xe0) == 0xc0) {
            // 2-byte sequence: 00000yyyyyxxxxxx = 110yyyyy 10xxxxxx
            int b2 = getNextUtf8Byte(i++, count);
            readBuffer[j++] = (char) (((b1 & 0x1f) << 6) | b2);
        } else if ((b1 & 0xf0) == 0xe0) {
            // 3-byte sequence: zzzzyyyyyyxxxxxx = 1110zzzz 10yyyyyy 10xxxxxx
            int b2 = getNextUtf8Byte(i++, count);
            int b3 = getNextUtf8Byte(i++, count);
            readBuffer[j++] = (char) (((b1 & 0x0f) << 12) | (b2 << 6) | b3);
        } else if ((b1 & 0xf8) == 0xf0) {
            // 4-byte sequence: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
            // Rebuild the 21-bit code point first, then split it into
            // a surrogate pair:
            //   high = 0xd800 + ((cp - 0x10000) >> 10)
            //   low  = 0xdc00 + ((cp - 0x10000) & 0x3ff)
            int b2 = getNextUtf8Byte(i++, count);
            int b3 = getNextUtf8Byte(i++, count);
            int b4 = getNextUtf8Byte(i++, count);
            int codePoint = ((b1 & 0x07) << 18) | (b2 << 12) | (b3 << 6) | b4;
            if ((codePoint < 0x10000) || (codePoint > 0x10ffff)) {
                encodingError("illegal value in 4-byte UTF-8 sequence",
                        codePoint, i);
                continue;
            }
            int offset = codePoint - 0x10000;
            readBuffer[j++] = (char) (0xd800 | (offset >> 10));
            readBuffer[j++] = (char) (0xdc00 | (offset & 0x3ff));
        } else {
            // Otherwise, the 8th bit may not be set in UTF-8
            encodingError("bad start for UTF-8 multi-byte sequence", b1, i);
            // Nothing was written for this byte.
            continue;
        }
        if (readBuffer[j - 1] == '\r') {
            sawCR = true;
        }
    }
    // How many characters have we read?
    readBufferLength = j;
}
/**
 * Fetch the next continuation byte of a UTF-8 sequence.
 * <p>The byte comes from the rawReadBuffer when the position is
 * still inside it; otherwise it is read directly from the input
 * stream, which lets a sequence straddle two chunks.
 * <p>Running out of input, or finding a byte that does not have the
 * form 10xxxxxx, is reported through encodingError().
 * @param pos The current position in the rawReadBuffer.
 * @param count The number of bytes in the rawReadBuffer
 * @return The significant six bits of a non-initial byte in
 *         a UTF-8 sequence.
 * @exception java.lang.Exception Propagated from encodingError() or
 *            from the input stream.
 */
int getNextUtf8Byte(int pos, int count) throws java.lang.Exception {
    int raw;
    if (pos >= count) {
        // Past the end of the chunk: pull the byte from the stream.
        raw = is.read();
        if (raw == -1) {
            encodingError("unfinished multi-byte UTF-8 sequence at EOF",
                    -1, pos);
        }
    } else {
        raw = rawReadBuffer[pos];
    }
    // A continuation byte must start with the bits 10.
    boolean isContinuation = (raw & 0xc0) == 0x80;
    if (!isContinuation) {
        encodingError("bad continuation of multi-byte UTF-8 sequence", raw,
                pos + 1);
    }
    // Keep only the six payload bits.
    return raw & 0x3f;
}
/**
 * Convert a buffer of ISO-8859-1-encoded bytes into UTF-16 characters.
 * <p>When readDataChunk() calls this method, the raw bytes are in
 * rawReadBuffer, and the final characters will appear in
 * readBuffer, starting at readBufferPos.
 * <p>Each byte is taken as an unsigned value and becomes the
 * character with that code; there is nothing more to it.
 * <p>Sets sawCR if a carriage return is seen, and leaves the number
 * of characters now in the buffer in readBufferLength.
 * @param count The number of bytes to convert.
 * @see #readDataChunk
 * @see #rawReadBuffer
 * @see #readBuffer
 */
void copyIso8859_1ReadBuffer(int count) {
    int out = readBufferPos;
    for (int in = 0; in < count; in++) {
        char decoded = (char) (rawReadBuffer[in] & 0xff);
        readBuffer[out++] = decoded;
        if (decoded == '\r') {
            sawCR = true;
        }
    }
    readBufferLength = out;
}
/**
 * Convert a buffer of UCS-2-encoded bytes into UTF-16 characters.
 * <p>When readDataChunk() calls this method, the raw bytes are in
 * rawReadBuffer, and the final characters will appear in
 * readBuffer, starting at readBufferPos.
 * <p>Every pair of bytes yields one character; the two shift counts
 * select the byte order. An odd, positive byte count is reported
 * through encodingError().
 * <p>Sets sawCR if a carriage return is decoded, and leaves the
 * number of characters now in the buffer in readBufferLength.
 * @param count The number of bytes to convert.
 * @param shift1 The number of bits to shift byte 1.
 * @param shift2 The number of bits to shift byte 2
 * @see #readDataChunk
 * @see #rawReadBuffer
 * @see #readBuffer
 */
void copyUcs2ReadBuffer(int count, int shift1, int shift2)
        throws java.lang.Exception {
    int out = readBufferPos;
    boolean oddLength = (count % 2) != 0;
    if ((count > 0) && oddLength) {
        encodingError("odd number of bytes in UCS-2 encoding", -1, count);
    }
    for (int in = 0; in < count; in += 2) {
        int first = (rawReadBuffer[in] & 0xff) << shift1;
        int second = (rawReadBuffer[in + 1] & 0xff) << shift2;
        char decoded = (char) (first | second);
        readBuffer[out++] = decoded;
        if (decoded == '\r') {
            sawCR = true;
        }
    }
    readBufferLength = out;
}
/**
 * Convert a buffer of UCS-4-encoded bytes into UTF-16 characters.
 * <p>When readDataChunk() calls this method, the raw bytes are in
 * rawReadBuffer, and the final characters will appear in
 * readBuffer, starting at readBufferPos.
 * <p>Java has 16-bit chars, so values from 0x00010000 to 0x0010ffff
 * are written as a UTF-16 surrogate pair. Values outside the range
 * 0..0x0010ffff cannot be represented and are reported through
 * encodingError().
 * <p>A positive byte count that is not a multiple of four is also
 * reported through encodingError(); the incomplete trailing group is
 * then left undecoded.
 * <p>Sets sawCR if a carriage return is decoded, and leaves the
 * number of characters now in the buffer in readBufferLength.
 * @param count The number of bytes to convert.
 * @param shift1 The number of bits to shift byte 1.
 * @param shift2 The number of bits to shift byte 2
 * @param shift3 The number of bits to shift byte 3
 * @param shift4 The number of bits to shift byte 4
 * @exception java.lang.Exception Propagated from encodingError().
 * @see #readDataChunk
 * @see #rawReadBuffer
 * @see #readBuffer
 */
void copyUcs4ReadBuffer(int count, int shift1, int shift2, int shift3,
        int shift4) throws java.lang.Exception {
    int j = readBufferPos;
    int value;
    if ((count > 0) && ((count % 4) != 0)) {
        encodingError(
                "number of bytes in UCS-4 encoding not divisible by 4", -1,
                count);
    }
    // Only complete four-byte groups are decoded.
    for (int i = 0; i + 3 < count; i += 4) {
        value = (((rawReadBuffer[i] & 0xff) << shift1)
                | ((rawReadBuffer[i + 1] & 0xff) << shift2)
                | ((rawReadBuffer[i + 2] & 0xff) << shift3)
                | ((rawReadBuffer[i + 3] & 0xff) << shift4));
        if ((value >= 0) && (value <= 0xffff)) {
            // Fits in a single 16-bit char.
            readBuffer[j++] = (char) value;
            if (value == '\r') {
                sawCR = true;
            }
        } else if ((value >= 0x10000) && (value <= 0x10ffff)) {
            // Split into a surrogate pair:
            //   high = 0xd800 + ((value - 0x10000) >> 10)
            //   low  = 0xdc00 + ((value - 0x10000) & 0x3ff)
            int offset = value - 0x10000;
            readBuffer[j++] = (char) (0xd800 | (offset >> 10));
            readBuffer[j++] = (char) (0xdc00 | (offset & 0x3ff));
        } else {
            // Negative (top bit set) or beyond U+10FFFF.
            encodingError("value cannot be represented in UTF-16", value, i);
        }
    }
    readBufferLength = j;
}
/**
 * Report a character encoding error to the handler.
 * <p>The report names the current external entity's URL when there
 * is one and baseURI otherwise, gives -1 as the line, and uses the
 * byte position (the offset plus the bytes already read from the
 * current source) in place of the column.
 * @param message The description of the problem.
 * @param value The offending value, appended to the message in
 *        hexadecimal; pass a negative number to omit it.
 * @param offset The position within the current chunk.
 * @exception java.lang.Exception Propagated from the handler.
 */
void encodingError(String message, int value, int offset)
        throws java.lang.Exception {
    String text = message;
    if (value >= 0) {
        text = text + " (byte value: 0x" + Integer.toHexString(value) + ')';
    }
    String uri = (externalEntity != null)
            ? externalEntity.getURL().toString()
            : baseURI;
    handler.error(text, uri, -1, offset + currentByteCount);
}
//////////////////////////////////////////////////////////////////////
// Local Variables.
//////////////////////////////////////////////////////////////////////
/**
* Re-initialize the variables for each parse.
* <p>Resets the error count and position counters, allocates fresh
* data, name and raw-read buffers, replaces the DTD hash tables and
* the input and entity stacks with empty ones, and clears the
* element and parsing context.
*/
void initializeVariables() {
// No errors; first line
errorCount = 0;
line = 1;
column = 0;
// Set up the buffers for data and names
dataBufferPos = 0;
dataBuffer = new char[DATA_BUFFER_INITIAL];
nameBufferPos = 0;
nameBuffer = new char[NAME_BUFFER_INITIAL];
// Set up the DTD hash tables
elementInfo = new Hashtable();
entityInfo = new Hashtable();
notationInfo = new Hashtable();
// Set up the variables for the current
// element context.
currentElement = null;
currentElementContent = CONTENT_UNDECLARED;
// Set up the input variables
sourceType = INPUT_NONE;
inputStack = new Stack();
entityStack = new Stack();
externalEntity = null;
// Start-tag attribute storage, the raw byte buffer, and the
// remaining parser state.
tagAttributePos = 0;
tagAttributes = new String[100];
rawReadBuffer = new byte[READ_BUFFER_MAX];
readBufferOverflow = -1;
context = CONTEXT_NONE;
symbolTable = new Object[SYMBOL_TABLE_LENGTH];
}
/**
* Clean up after the parse to allow some garbage collection.
* Leave around anything that might be useful for queries.
* <p>The counters are set to -1 and the buffers and stacks are
* dropped; the DTD hash tables (elementInfo, entityInfo and
* notationInfo) are not touched here, so the declaration look-up
* methods keep working.
*/
void cleanupVariables() {
// Mark the position and error counters as "no parse in progress".
errorCount = -1;
line = -1;
column = -1;
// Release the working buffers.
dataBuffer = null;
nameBuffer = null;
currentElement = null;
currentElementContent = CONTENT_UNDECLARED;
// Release the input state.
sourceType = INPUT_NONE;
inputStack = null;
externalEntity = null;
entityStack = null;
}
//
// The current XML handler interface.
// Declared without an access modifier, so it is visible throughout
// the package. encodingError() calls handler.error(...) without a
// null check, so a handler must be set before errors are reported.
//
XmlHandler handler;
//
// I/O information.
// initializeVariables() sets line to 1, column to 0, sourceType to
// INPUT_NONE, externalEntity to null and allocates a new inputStack;
// cleanupVariables() sets line and column to -1, sourceType back to
// INPUT_NONE and drops inputStack and externalEntity.
// The legal values of sourceType and encoding are int codes defined
// outside this section.
//
private Reader reader; // current reader
private InputStream is; // current input stream
private int line; // current line number
private int column; // current column number
private int sourceType; // type of input source
private Stack inputStack; // stack of input sources
// When externalEntity is null, encodingError() reports against baseURI.
private URLConnection externalEntity; // current external entity
private int encoding; // current character encoding.
// Added to the offset passed to encodingError() when reporting.
private int currentByteCount; // how many bytes read from current source.
//
// Maintain a count of errors.
// Zero at the start of each parse; set to -1 by cleanupVariables().
//
private int errorCount;
//
// Buffers for decoded but unparsed character input.
// READ_BUFFER_MAX is also the size of the raw byte buffer allocated
// in initializeVariables(). The decoding code stores chars into
// readBuffer and then records how many it stored in readBufferLength.
// readBufferOverflow is reset to -1 at the start of each parse
// (presumably meaning "no overflow character pending" -- confirm).
//
private final static int READ_BUFFER_MAX = 16384;
private char[] readBuffer;
private int readBufferPos;
private int readBufferLength;
private int readBufferOverflow; // overflow character from last data chunk.
//
// Stack of entity names, to help detect recursion.
// Allocated afresh by initializeVariables() and dropped by
// cleanupVariables().
//
private Stack entityStack;
//
// Buffer for undecoded raw byte input.
// Allocated with READ_BUFFER_MAX bytes by initializeVariables();
// cleanupVariables() does not release it.
//
private byte[] rawReadBuffer;
//
// Buffer for parsed character data.
// DATA_BUFFER_INITIAL is the size, in chars, that initializeVariables()
// allocates for dataBuffer at the start of each parse; it is a true
// constant and therefore final, like READ_BUFFER_MAX.
// dataBufferPos is reset to 0 at the start of each parse, and
// cleanupVariables() drops dataBuffer.
//
private final static int DATA_BUFFER_INITIAL = 4096;
private char[] dataBuffer;
private int dataBufferPos;
//
// Buffer for parsed names.
// NAME_BUFFER_INITIAL is the size, in chars, that initializeVariables()
// allocates for nameBuffer at the start of each parse; it is a true
// constant and therefore final, like READ_BUFFER_MAX.
// nameBufferPos is reset to 0 at the start of each parse, and
// cleanupVariables() drops nameBuffer.
//
private final static int NAME_BUFFER_INITIAL = 1024;
private char[] nameBuffer;
private int nameBufferPos;
//
// Hashtables for DTD information on elements, entities, and notations.
// All three are replaced with empty tables by initializeVariables();
// cleanupVariables() leaves them alone, so they remain available
// after a parse.
//
private Hashtable elementInfo;
private Hashtable entityInfo;
private Hashtable notationInfo;
//
// Element type currently in force.
// Both initializeVariables() and cleanupVariables() set currentElement
// to null and currentElementContent to CONTENT_UNDECLARED.
//
private String currentElement;
private int currentElementContent;
//
// Base external identifiers for resolution.
// encodingError() reports against baseURI when there is no current
// external entity. None of these are touched by initializeVariables()
// or cleanupVariables().
//
private String basePublicId;
private String baseURI;
private Reader baseReader;
private InputStream baseInputStream;
//
// Are we in a context where PEs are allowed?
// Held as an int code; reset to CONTEXT_NONE at the start of each parse.
//
private int context;
//
// Symbol table, for internalising names.
// Allocated with SYMBOL_TABLE_LENGTH slots at the start of each parse;
// cleanupVariables() does not release it.
//
private Object[] symbolTable;
private final static int SYMBOL_TABLE_LENGTH = 1087;
//
// Hash table of attributes found in current start tag.
// Note that the storage is a plain String array (100 slots allocated
// at the start of each parse) with tagAttributePos reset to 0; how
// the slots are used is defined by code outside this section.
//
private String[] tagAttributes;
private int tagAttributePos;
//
// Utility flag: have we noticed a CR while reading the last
// data chunk? If so, we will have to go back and normalise
// CR/LF.
// NOTE(review): this flag is not reset by initializeVariables();
// confirm that it is set before use on every parse.
//
private boolean sawCR;
}