/*
* Copyright 1990-2009 Sun Microsystems, Inc. All Rights Reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
* 2 only, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License version 2 for more details (a copy is
* included at /legal/license.txt).
*
* You should have received a copy of the GNU General Public License
* version 2 along with this work; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
* Clara, CA 95054 or visit www.sun.com if you need additional
* information or have any questions.
*/
package com.sun.ukit.xml;
import java.util.Hashtable;
import java.io.InputStream;
import java.io.Reader;
import java.io.InputStreamReader;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import com.sun.ukit.io.ReaderUTF8;
import com.sun.ukit.io.ReaderUTF16;
/**
* XML non-validating parser engine.
*/
public abstract class Parser
{
/** Generic fault message passed to <code>panic</code>; an empty string. */
public final static String FAULT = "";
/**
* Message preprocessing hook. This implementation ignores its argument
* and always yields an empty string, so every message constant below is
* empty at run time; the literals document the intended fault only.
*
* @param msg Human readable description of the fault.
* @return Always an empty string.
*/
private static String preprocessMsgText( String msg ) { return ""; }
// Fault messages used by step()
private static final String FAULT_NO_OPENED_ELEMENT = preprocessMsgText("No open element");
private static final String FAULT_END_ELEMENT_TAG_MISMATCH = preprocessMsgText("End element tag mismatch");
private static final String FAULT_SYNTAX = preprocessMsgText("Syntax error");
private static final String FAULT_UNEXPECTED_EOF = preprocessMsgText("Unexpected EOF");
private static final String FAULT_UNSUPPORTED_MARKUP_DECLARATION = preprocessMsgText("Unsupported markup declaration");
// Buffer sizes. BUFFSIZE_READER is handed to the Input created for the
// external DTD subset in step(); BUFFSIZE_PARSER is the initial length of
// the parser buffer (mBuff) allocated by the constructor.
protected final static int BUFFSIZE_READER = 512;
protected final static int BUFFSIZE_PARSER = 128;
/** The end of stream character. */
public final static char EOS = 0xffff;
private Pair mXml; // the xml namespace
private Hashtable mEnt; // the entities look up table
private Hashtable mPEnt; // the parameter entities look up table
protected boolean mIsSAlone; // xml decl standalone flag
protected boolean mIsSAloneSet; // standalone is explicitly set
protected boolean mIsNSAware; // if true - namespace aware mode
// Document processing phase; holds one of the PH_* values below.
protected int mPh; // current phase of document processing
protected final static int PH_BEFORE_DOC = -1; // before parsing
protected final static int PH_DOC_START = 0; // document start
protected final static int PH_MISC_DTD = 1; // misc before DTD
protected final static int PH_DTD = 2; // DTD
protected final static int PH_DTD_MISC = 3; // misc after DTD
protected final static int PH_DOCELM = 4; // document's element
protected final static int PH_DOCELM_MISC = 5; // misc after element
protected final static int PH_AFTER_DOC = 6; // after parsing
// Event produced by the last call to step(); holds one of the EV_* values
// below. step() resets it to EV_NULL on entry.
protected int mEvt; // current event type
protected final static int EV_NULL = 0; // unknown
protected final static int EV_ELM = 1; // empty element
protected final static int EV_ELMS = 2; // start element
protected final static int EV_ELME = 3; // end element
protected final static int EV_TEXT = 4; // textual content
protected final static int EV_WSPC = 5; // white space content
protected final static int EV_PI = 6; // processing instruction
protected final static int EV_CDAT = 7; // character data
protected final static int EV_COMM = 8; // comment
protected final static int EV_DTDS = 9; // start DTD
protected final static int EV_DTDE = 10; // end DTD
protected final static int EV_ENT = 11; // skipped entity
protected final static int EV_PENT = 12; // parsed entity declaration
protected final static int EV_UENT = 13; // unparsed entity declaration
protected final static int EV_NOT = 14; // notation declaration
// Element flags stored in the id field of an element Pair (see mElm).
// flag indicates that element had been processed with namespace aware mode
public final static int FLAG_NSAWARE = 0x001;
// flag indicating whether current element contains or inherits attribute
// "xml:space" with "preserve" value
public final static int FLAG_XMLSPC_PRESERVE = 0x002;
// value of "xml:space" attribute
protected static final String XMLSPC_PRESERVE = "preserve";
// flag indicating UCS-4 character, used as mUnent value
// (step() tests for this marker by reference, mUnent == UCS4_CHAR)
private final static String UCS4_CHAR = "#";
//
/**
* SAX <code>Attributes</code> view extended with the two operations the
* parser needs. step() calls <code>set</code> with the current element
* right after its attributes have been read.
*/
protected interface Attrs extends Attributes {
// Binds this view to the given element Pair.
void set(Pair elm);
// Index lookup by namespace URI and local name.
// NOTE(review): the "NullNS" suffix presumably means a null uri is
// accepted - confirm against the implementing class.
int getIndexNullNS(String uri, String localName);
}
private char mESt; // built-in entity recognizer state
// mESt values:
// 0x100 : the initial state
// > 0x100 : unrecognized name
// < 0x100 : replacement character
protected char[] mBuff; // parser buffer
protected int mBuffIdx; // index of the last char
// mPref is linked list of Pair objects which represents current namespace
// declaration stack.
// mPref.chars - qName characters with no suffix (see: bname)
// mPref.name - prefix as String
// mPref.ns - not in use
// mPref.value - namespace as String
// mPref.num - not in use
// mPref.id - not in use
// mPref.list - link to the element owner (see: mElm)
// mPref.next - link to the next element of the list or null
//
// The bottom entry of the stack is always mXml (the "xml" prefix created
// by the constructor); init() and cleanup() reset mPref to it.
protected Pair mPref; // stack of prefixes
// mElm is linked list of Pair objects which represents current nested
// element stack.
// mElm.chars - qName characters (see: bname)
// mElm.name - local (NS-aware) or qualified (not-NS-aware) name
// mElm.ns - after attrs method call: contains NS String or null
// mElm.value - not in use
// mElm.num - number of actual attributes
// mElm.id - element flags (see: FLAG_...)
// mElm.list - before attrs method call: list of attributes declared on
// this element; after attrs method call: list of actual
// attributes of this element
// mElm.next - link to the parent element of the list or null for root
//
protected Pair mElm; // stack of elements
// mAttL is linked list of Pair objects, one per element name, which holds
// the attribute declarations read from the DTD (see: dtdattl, dtdatt).
// mAttL.chars - element qname
// mAttL.next - next element
// mAttL.list - list of attributes declared on this element
// mAttL.list.chars - attribute qname
// mAttL.list.id - a char representing attribute's type see below
// mAttL.list.next - next attribute defined on the element
// mAttL.list.list - default value structure or null
// mAttL.list.list.chars - "name='value' " chars array for Input
//
// Attribute type character values:
// 'i' - "ID"
// 'r' - "IDREF"
// 'R' - "IDREFS"
// 'n' - "ENTITY"
// 'N' - "ENTITIES"
// 't' - "NMTOKEN"
// 'T' - "NMTOKENS"
// 'u' - enumeration type
// 'o' - "NOTATION"
// 'c' - "CDATA"
// see also: bkeyword()
//
protected Pair mAttL; // list of defined attrs by element name
protected Input mDoc; // document entity
protected Input mInp; // stack of entities
// Set by step() from the result of dtd(); consumed and cleared when the
// closing ']' of the DTD subset is processed.
private Pair mPSid; // DTD public and system ids
// Reading buffer; init() points it at the buffer of the document entity.
// step() and dtdint() read mChars[mChIdx++] directly while
// mChIdx < mChLen and fall back to getch() otherwise.
private char[] mChars; // reading buffer
private int mChLen; // current capacity
private int mChIdx; // index to the next char
final protected Attrs mAttrs = new com.sun.ukit.xml.Attrs(); // attributes of the curr. element
private String mUnent; // unresolved entity name
private int mIent; // UCS-4 character value
private Pair mDltd; // deleted objects for reuse
/**
* Default prefixes and special attribute names in the parser's internal
* qualified-name layout.
*
* Element 0 of each array is a numeric header rather than a name
* character: for <code>XMLSPC</code> ("xml:space") and <code>XMLID</code>
* ("xml:id") it equals the array index of the colon, for <code>XML</code>
* and <code>XMLNS</code> it equals the array length, and
* <code>NONS</code> consists of a zero header only. The name characters
* follow from index 1.
* NOTE(review): this presumably mirrors the layout produced by
* bname()/qname() - confirm there.
*/
protected final static char NONS[];
protected final static char XML[];
protected final static char XMLNS[];
protected final static char XMLSPC[];
protected final static char XMLID[];
static {
    NONS = new char[] { (char)0 };
    XML = new char[] { (char)4, 'x', 'm', 'l' };
    XMLNS = new char[] { (char)6, 'x', 'm', 'l', 'n', 's' };
    XMLSPC = new char[] {
        (char)4, 'x', 'm', 'l', ':', 's', 'p', 'a', 'c', 'e'
    };
    XMLID = new char[] { (char)4, 'x', 'm', 'l', ':', 'i', 'd' };
}
/**
* ASCII character type array.
*
* This array maps an ASCII (7 bit) character to the character type.<br />
* Possible character type values are:<br />
* - ' ' for any kind of white space character;<br />
* - 'a' for any lower case alphabetical character value;<br />
* - 'A' for any upper case alphabetical character value;<br />
* - 'd' for any decimal digit character value;<br />
* - 'z' for any character less then ' ' except '\t', '\n', '\r';<br />
* An ASCII (7 bit) character which does not fall in any category listed
* above is mapped to it self.
*/
private static final byte asctyp[];
/**
* NMTOKEN character type array.
*
* This array maps an ASCII (7 bit) character to the character type.<br />
* Possible character type values are:<br />
* - 0 for underscore ('_') or any lower and upper case alphabetical character value;<br />
* - 1 for colon (':') character;<br />
* - 2 for dash ('-') and dot ('.') or any decimal digit character value;<br />
* - 3 for any kind of white space character<br />
* An ASCII (7 bit) character which does not fall in any category listed
* above is mapped to 0xff.
*/
private static final byte nmttyp[];
/**
* Static constructor.
*
* Sets up the ASCII character type array which is used by
* {@link #asctyp asctyp} method and NMTOKEN character type array.
*/
static {
short i = 0;
asctyp = new byte[0x80];
while (i < ' ')
asctyp[i++] = (byte)'z';
asctyp['\t'] = (byte)' ';
asctyp['\r'] = (byte)' ';
asctyp['\n'] = (byte)' ';
while (i < '0')
asctyp[i] = (byte)i++;
while (i <= '9')
asctyp[i++] = (byte)'d';
while (i < 'A')
asctyp[i] = (byte)i++;
while (i <= 'Z')
asctyp[i++] = (byte)'A';
while (i < 'a')
asctyp[i] = (byte)i++;
while (i <= 'z')
asctyp[i++] = (byte)'a';
while (i < 0x80)
asctyp[i] = (byte)i++;
nmttyp = new byte[0x80];
for (i = 0; i < '0'; i++)
nmttyp[i] = (byte)0xff;
while (i <= '9')
nmttyp[i++] = (byte)2; // digits
while (i < 'A')
nmttyp[i++] = (byte)0xff;
// skipped upper case alphabetical character are already 0
for (i = '['; i < 'a'; i++)
nmttyp[i] = (byte)0xff;
// skipped lower case alphabetical character are already 0
for (i = '{'; i < 0x80; i++)
nmttyp[i] = (byte)0xff;
nmttyp['_'] = 0;
nmttyp[':'] = 1;
nmttyp['.'] = 2;
nmttyp['-'] = 2;
nmttyp[' '] = 3;
nmttyp['\t'] = 3;
nmttyp['\r'] = 3;
nmttyp['\n'] = 3;
}
/**
* Constructor.
*/
protected Parser()
{
mPh = PH_BEFORE_DOC; // before parsing
// Initialize the parser
mBuff = new char[BUFFSIZE_PARSER];
// XML namespace
mPref = pair(mPref);
mPref.name = "xml";
mPref.value = "http://www.w3.org/XML/1998/namespace";
mPref.chars = XML;
mXml = mPref; // XML namespace
}
/**
* Prepares the parser for a new document.
*
* The current input (<code>mInp</code>) has to be set before this method
* is called: it becomes the document entity and its character buffer
* becomes the parser's reading buffer.
*/
protected void init()
{
    // The current input is the document entity
    mDoc = mInp;
    // Fresh general and parameter entity tables
    mEnt = new Hashtable();
    mPEnt = new Hashtable();
    // Empty element stack and attribute declarations; the prefix stack
    // holds the "xml" prefix only
    mAttL = null;
    mPref = mXml;
    mElm = null;
    // No pending unresolved entity
    mUnent = null;
    // Read from the document entity buffer
    mChars = mInp.chars;
    // The beginning of the document
    mPh = PH_DOC_START;
}
/**
* Cleans up parser internal resources.
*
* Returns all pairs of the attribute declarations, the element stack and
* the prefix stack (down to the permanent "xml" prefix) through
* <code>del</code>, pops all inputs, closes the document reader and
* moves the parser to the "after document" phase.
*/
protected void cleanup()
{
    // Attribute declarations: for every element release each attribute
    // together with its default value holder, then the element itself.
    for (; mAttL != null; mAttL = del(mAttL)) {
        for (; mAttL.list != null; mAttL.list = del(mAttL.list)) {
            if (mAttL.list.list != null)
                del(mAttL.list.list);
        }
    }
    // Element stack
    for (; mElm != null; mElm = del(mElm)) {
        // nothing else to release per element
    }
    // Namespace prefixes; the "xml" prefix stays
    for (; mPref != mXml; mPref = del(mPref)) {
        // nothing else to release per prefix
    }
    // Inputs
    while (mInp != null) {
        pop();
    }
    // Document reader; closing is best effort, a failure is ignored
    if (mDoc != null) {
        if (mDoc.src != null) {
            try {
                mDoc.src.close();
            } catch (IOException ioe) {
                // nothing can be done about it at this point
            }
        }
    }
    mPEnt = null;
    mEnt = null;
    mDoc = null;
    mPh = PH_AFTER_DOC;  // after document processing
}
/**
* Processes a portion of document.
* This method returns one of EV_* constants as an identifier of
* the portion of document have been read.
*
* <p>The method is a character driven state machine which runs until one
* of its branches sets <code>mEvt</code> to something other than
* <code>EV_NULL</code>. The states are:<br />
* 0 - markup dispatcher (end tag, comment, CDATA, DOCTYPE, processing
* instruction or start/empty element tag);<br />
* 1 - accumulation of white space content;<br />
* 2 - accumulation of text content and entity references;<br />
* 3 - DTD subset processing (declarations, comments, processing
* instructions, parameter entity references, the closing ']');<br />
* 4 - skipping of white space.<br />
* Processing starts in state 0 when the parser is inside the document
* element (<code>PH_DOCELM</code>) and in state 4 otherwise; state 4
* continues with state 3 inside the DTD (<code>PH_DTD</code>) and with
* state 0 in any other phase.</p>
*
* @return Identifier of processed document portion.
* @exception Exception is parser specific exception form panic method.
* @exception IOException
*/
protected int step() throws Exception
{
mEvt = EV_NULL;
int st = (mPh == PH_DOCELM)? 0: 4; // skip white space
while (mEvt == EV_NULL) {
// Take the next char straight from the reading buffer while it has
// unread chars; otherwise go through getch().
char ch = (mChIdx < mChLen)? mChars[mChIdx++]: getch();
switch (st) {
case 0: // all sorts of markup (dispatcher)
if (ch != '<') {
// Not a markup: push the char back and treat it as content
bkch();
mBuffIdx = -1; // clean parser buffer
st = 1;
break;
}
switch (getch()) {
case '/': // the end of the element content
mEvt = EV_ELME;
if (mElm == null)
panic(FAULT_NO_OPENED_ELEMENT);
// Check element's open/close tags balance
mBuffIdx = -1; // clean parser buffer
bname(mIsNSAware);
char[] chars = mElm.chars;
if (chars.length != (mBuffIdx + 1)) // the same length
panic(FAULT_END_ELEMENT_TAG_MISMATCH);
// Compare the name chars; index 0 is not compared
for (char i = 1; i <= mBuffIdx; i += 1) {
if (chars[i] != mBuff[i])
panic(FAULT_END_ELEMENT_TAG_MISMATCH);
}
// Skip white spaces before '>'
if (wsskip() != '>')
panic(FAULT_SYNTAX);
getch(); // read '>'
break;
case '!': // a comment or a CDATA
// Peek at the char after "<!" to choose the construct
ch = getch();
bkch();
switch (ch) {
case '-': // must be a comment
mEvt = EV_COMM;
comm();
break;
case '[': // must be a CDATA section
mEvt = EV_CDAT;
cdat();
break;
default: // must be 'DOCTYPE'
mEvt = EV_DTDS;
// Keep the ids returned by dtd() until the closing ']' of
// the DTD subset is seen (state 3)
mPSid = dtd();
break;
}
break;
case '?': // processing instruction
mEvt = EV_PI;
pi();
break;
default: // must be the first char of an xml name
bkch();
mElm = pair(mElm); // add new element to the stack
if (mElm.next == null) {
// This is the root element
dtdpost(); // do dtdpost before the first element
mElm.id = (mIsNSAware)?
FLAG_NSAWARE | FLAG_XMLSPC_PRESERVE:
FLAG_XMLSPC_PRESERVE;
} else { // release previous element attributes
while (mElm.list != null)
mElm.list = del(mElm.list);
mElm.id = mElm.next.id; // inherit flags
}
// Read an element name and put it on top of the
// element stack
mElm.chars = qname(mIsNSAware);
mElm.name = (mIsNSAware)? mElm.local(): mElm.qname();
mElm.num = 0; // attribute counter
// Find the list of defined attributes of the current
// element
Pair elm = find(mAttL, mElm.chars);
mElm.list = (elm != null)? elm.list: null;
// Read attributes till the end of the element tag
attrs();
mAttrs.set(mElm);
// Skip white spaces before '>'
switch (wsskip()) {
case '>':
getch(); // read '>'
mEvt = EV_ELMS;
break;
case '/':
getch(); // read '/'
if (getch() != '>') // read '>'
panic(FAULT_SYNTAX);
mEvt = EV_ELM;
break;
default:
panic(FAULT_SYNTAX);
}
break;
}
break;
case 1: // read white space
switch (ch) {
case ' ':
case '\t':
case '\n':
bappend(ch);
break;
case '\r': // EOL processing [#2.11]
// "\r\n" and a lone "\r" are both stored as a single '\n'
if (getch() != '\n')
bkch();
bappend('\n');
break;
case '<':
// Only white space has been read before the next markup
mEvt = EV_WSPC;
bkch();
bflash_ws();
break;
default:
// A non white space char: continue as text content
bkch();
st = 2;
break;
}
break;
case 2: // read the text content of the element
switch (ch) {
case '&':
if (mUnent == null) {
// There was no unresolved entity on previous step.
if ((mUnent = ent('x')) != null) {
// Unresolved entity has been read
if (mBuffIdx >= 0) {
// There are some characters in the buffer
bkch(); // move back to ';' after entity name
setch('&'); // parser must be back on next step
mEvt = EV_TEXT;
bflash();
} else {
// There is nothing in the buffer
if (mUnent == UCS4_CHAR) {
mEvt = EV_TEXT;
reportUCS4(mIent);
} else {
mEvt = EV_ENT;
skippedEnt(mUnent);
}
mUnent = null;
}
}
} else {
// There was unresolved entity on previous step.
if (mUnent == UCS4_CHAR) {
mEvt = EV_TEXT;
reportUCS4(mIent);
} else {
mEvt = EV_ENT;
skippedEnt(mUnent);
}
mUnent = null;
}
break;
case '<':
mEvt = EV_TEXT;
bkch();
bflash();
break;
case '\r': // EOL processing [#2.11]
if (getch() != '\n')
bkch();
bappend('\n');
break;
case EOS:
// No break follows: the code relies on panic() not returning
panic(FAULT_UNEXPECTED_EOF);
case ' ': // characters not supported by bappend()
case '\"':
case '\'':
case '\n':
case '\t':
case '%':
bappend(ch);
break;
default:
bappend();
break;
}
break;
case 3: // DTD internal/external subset processing
switch (ch) {
case '<':
switch (ch = getch()) {
case '!': // a comment or a DTD declaration
ch = getch();
bkch();
if (ch == '-') { // a comment
mEvt = EV_COMM;
comm();
} else { // a DTD declaration
// Read the declaration keyword and dispatch on it
bntok();
switch (bkeyword()) {
case 'n':
mEvt = dtdent(); // parse entity declaration
break;
case 'a':
dtdattl(); // parse attributes declaration
break;
case 'e':
dtdelm(); // parse element declaration
break;
case 'o':
mEvt = EV_NOT;
dtdnot(); // parse notation declaration
break;
default:
panic(FAULT_UNSUPPORTED_MARKUP_DECLARATION); // unsupported markup declaration
}
// Every declaration parser stops before the closing '>'
wsskip();
if (getch() != '>')
panic(FAULT_SYNTAX);
}
break;
case '?': // processing instruction
mEvt = EV_PI;
pi();
break;
default: //
panic(FAULT_SYNTAX);
}
break;
case ']':
// The end of the DTD subset
if (mPSid != null) {
// Report the DTD external subset
InputSource is = resolveEnt(
"[dtd]", mPSid.name, mPSid.value);
if (is != null) {
if (mIsSAlone == false) {
// Set the end of DTD external subset char
bkch();
// Set the DTD external subset InputSource
push(new Input(BUFFSIZE_READER));
setinp(is);
mInp.pubid = mPSid.name;
mInp.sysid = mPSid.value;
} else {
// Unresolved DTD external subset
skippedEnt("[dtd]");
// Release reader and stream
if (is.getCharacterStream() != null) {
try {
is.getCharacterStream().close();
} catch (IOException ioe) {
}
}
if (is.getByteStream() != null) {
try {
is.getByteStream().close();
} catch (IOException ioe) {
}
}
mEvt = EV_DTDE;
}
} else {
// Unresolved DTD external subset
skippedEnt("[dtd]");
mEvt = EV_DTDE;
}
// The ids have been used; the next ']' ends the DTD
del(mPSid);
mPSid = null;
} else {
mEvt = EV_DTDE;
}
break;
case '%':
// A parameter entity reference
pent(' ');
break;
case ' ':
case '\t':
case '\r':
case '\n':
// Skip white spaces
break;
default:
panic(FAULT_SYNTAX);
}
break;
case 4: // Skip white spaces
switch (ch) {
case ' ':
case '\t':
case '\r':
case '\n':
// Skip white spaces
break;
default:
// Inside the DTD continue with declarations, otherwise with
// the markup dispatcher
bkch();
st = (mPh != PH_DTD)? 0: 3;
}
break;
default:
panic(FAULT_SYNTAX);
}
}
return mEvt;
}
/**
* Parses the document type declaration.
*
* <p>The method reads the 'DOCTYPE' keyword, the document type name, an
* optional external identifier and an optional internal subset up to the
* closing angle bracket. It then reports the declaration through
* <code>docType</code> and pushes the accumulated internal subset (or an
* empty one) as a new input, terminated by a ']' placed back into the
* current input.</p>
*
* @return A pair which carries the public (<code>name</code>) and system
* (<code>value</code>) identifiers of the external subset, or
* <code>null</code> if both of them are <code>null</code>.
* @exception Exception is parser specific exception form panic method.
* @exception IOException
*/
private Pair dtd()
throws Exception
{
char ch;
String name = null;
Pair psid = null;
// read 'DOCTYPE'
// A declaration which appears when the phase is already PH_DTD or later
// is rejected as well.
if ("DOCTYPE".equals(name(false)) != true || mPh >= PH_DTD)
panic(FAULT);
for (short st = 0; st >= 0;) {
ch = getch();
switch (st) {
case 0: // read the document type name
if (chtyp(ch) != ' ') {
bkch();
name = name(mIsNSAware);
wsskip();
st = 1; // read 'PUBLIC' or 'SYSTEM'
}
break;
case 1: // read 'PUBLIC' or 'SYSTEM'
switch (chtyp(ch)) {
case 'A':
bkch();
psid = pubsys(' ');
st = 2; // skip spaces before internal subset
break;
case '[':
// No external identifier; use an empty pair
bkch();
psid = pair(null);
st = 2; // skip spaces before internal subset
break;
case '>':
// Neither external identifier nor internal subset
bkch();
psid = pair(null);
st = 3; // skip spaces after internal subset
break;
default:
panic(FAULT);
}
break;
case 2: // skip spaces before internal subset
switch (chtyp(ch)) {
case '[':
// Read and accumulate the DTD internal subset
psid.chars = dtdint();
st = 3; // skip spaces after internal subset
break;
case '>':
// There is no internal subset
bkch();
st = 3; // skip spaces after internal subset
break;
case ' ':
// skip white spaces
break;
default:
panic(FAULT);
}
break;
case 3: // skip spaces after internal subset
switch (chtyp(ch)) {
case '>':
docType(name, psid.name, psid.value, psid.chars);
// Set the end of DTD internal subset char
bkch();
setch(']');
// Set the DTD internal subset
char[] chars = (psid.chars != null)? psid.chars: new char[0];
Input inp = new Input(chars);
// The subset input inherits ids, encoding and version of
// the current input
inp.pubid = mInp.pubid;
inp.sysid = mInp.sysid;
inp.xmlenc = mInp.xmlenc;
inp.xmlver = mInp.xmlver;
push(inp);
psid.chars = null;
if (psid.name == null && psid.value == null) {
// No external subset
del(psid);
psid = null;
}
st = -1; // end of DTD
break;
case ' ':
// skip white spaces
break;
default:
panic(FAULT);
}
break;
default:
panic(FAULT);
}
}
return psid;
}
/**
* Retrieves the document type declaration internal subset.
*
* Characters are accumulated in the parser buffer until the first ']'
* is met; the ']' itself is consumed and is not a part of the result.
* If the parser buffer has grown while reading, it is replaced with a
* new buffer of the size it had on entry.
*
* @return The DTD internal subset as an array of characters.
* @exception Exception is parser specific exception form panic method.
* @exception IOException
*/
private char[] dtdint()
throws Exception
{
    final int initialSize = mBuff.length;
    mBuffIdx = -1;
    for (;;) {
        // Read from the reading buffer directly while it has unread
        // characters; otherwise go through getch()
        char ch;
        if (mChIdx < mChLen)
            ch = mChars[mChIdx++];
        else
            ch = getch();
        if (ch == ']') {
            // The end of the internal subset: copy what has been read
            final int count = mBuffIdx + 1;
            char subset[] = new char[count];
            System.arraycopy(mBuff, 0, subset, 0, count);
            // Do not keep large buffer
            if (mBuff.length > initialSize)
                mBuff = new char[initialSize];
            mBuffIdx = -1;
            return subset;
        }
        if (ch == EOS)
            panic(FAULT);
        bappend(ch);
    }
}
/**
* Parses an entity declaration.
* This method fills the general (<code>mEnt</code>) and parameter
* (<code>mPEnt</code>) entity look up table.
*
* <p>An entity which is already present in its table is not replaced by
* a later declaration. Internal and external parsed general entities are
* reported through <code>intparsedEntDecl</code> and
* <code>extparsedEntDecl</code>, unparsed entities through
* <code>unparsedEntDecl</code>; parameter entity declarations do not
* produce an event. The method stops before the closing angle
* bracket.</p>
*
* @return <code>EV_PENT</code> if a new parsed general entity has been
* declared, <code>EV_UENT</code> for an unparsed entity declaration,
* <code>EV_NULL</code> otherwise.
* @exception Exception is parser specific exception form panic method.
* @exception IOException
*/
private int dtdent()
throws Exception
{
String str = null;
char[] val = null;
Input inp = null;
Pair ids = null;
int evt = EV_NULL;
char ch;
for (short st = 0; st >= 0;) {
ch = getch();
switch (st) {
case 0: // skip white spaces before entity name
switch (chtyp(ch)) {
case ' ':
// Skip white spaces
break;
case '%':
// Parameter entity or parameter entity declaration.
// Peek at the char after '%': white space means a declaration
ch = getch();
bkch();
if (chtyp(ch) == ' ') {
// Parameter entity declaration.
wsskip();
str = name(false);
switch (chtyp(wsskip())) {
case 'A':
// Read the external identifier
ids = pubsys(' ');
if (wsskip() == '>') {
// External parsed entity
if (mPEnt.containsKey(str) == false) { // [#4.2]
inp = new Input();
inp.pubid = ids.name;
inp.sysid = ids.value;
mPEnt.put(str, inp);
}
} else {
panic(FAULT);
}
del(ids);
st = -1; // the end of declaration
break;
case '\"':
case '\'':
// Read the parameter entity value
bqstr('d');
// Create the parameter entity value
// The value chars are copied from mBuff[1..mBuffIdx] to
// val[1..]; val[0] receives the leading space.
val = new char[mBuffIdx + 1];
System.arraycopy(mBuff, 1, val, 1, val.length - 1);
// Add surrounding spaces [#4.4.8]
val[0] = ' ';
// Add the entity to the entity look up table
if (mPEnt.containsKey(str) == false) { // [#4.2]
inp = new Input(val);
inp.pubid = mInp.pubid;
inp.sysid = mInp.sysid;
inp.xmlenc = mInp.xmlenc;
inp.xmlver = mInp.xmlver;
mPEnt.put(str, inp);
}
st = -1; // the end of declaration
break;
default:
panic(FAULT);
break;
}
} else {
// Parameter entity reference.
pent(' ');
}
break;
default:
// The first char of a general entity name
bkch();
str = name(false);
st = 1; // read entity declaration value
break;
}
break;
case 1: // read entity declaration value
switch (chtyp(ch)) {
case '\"': // internal entity
case '\'':
bkch();
bqstr('d'); // read a string into the buffer
if (mEnt.get(str) == null) {
// Create general entity value
// The value chars are mBuff[1..mBuffIdx]
val = new char[mBuffIdx];
System.arraycopy(mBuff, 1, val, 0, val.length);
// Add the entity to the entity look up table
if (mEnt.containsKey(str) == false) { // [#4.2]
inp = new Input(val);
inp.pubid = mInp.pubid;
inp.sysid = mInp.sysid;
inp.xmlenc = mInp.xmlenc;
inp.xmlver = mInp.xmlver;
mEnt.put(str, inp);
intparsedEntDecl(str, val);
evt = EV_PENT;
}
}
st = -1; // the end of declaration
break;
case 'A': // external entity
bkch();
ids = pubsys(' ');
switch (wsskip()) {
case '>': // external parsed entity
if (mEnt.containsKey(str) == false) { // [#4.2]
inp = new Input();
inp.pubid = ids.name;
inp.sysid = ids.value;
mEnt.put(str, inp);
extparsedEntDecl(str, inp.pubid, inp.sysid);
evt = EV_PENT;
}
break;
case 'N': // external general unparsed entity
if ("NDATA".equals(name(false)) == true) {
wsskip();
unparsedEntDecl(str, ids.name, ids.value, name(false));
evt = EV_UENT;
break;
}
// Intentional fall through: a keyword other than 'NDATA' is an
// error
default:
panic(FAULT);
}
del(ids);
st = -1; // the end of declaration
break;
case ' ':
// Skip white spaces
break;
default:
panic(FAULT);
}
break;
default:
panic(FAULT);
}
}
return evt;
}
/**
* Parses an element declaration.
*
* This method parses the declaration up to the closing angle
* bracket. The declaration is checked for syntax only: the element name
* and the content model are read and dropped. A parameter entity
* reference is accepted before each part of the declaration.
*
* @exception Exception is parser specific exception form panic method.
* @exception IOException
*/
private void dtdelm()
throws Exception
{
    // 0 - element name, 1 - content definition, 2 - closing '>'
    int phase = 0;
    while (phase >= 0) {
        // Each phase begins with skipping white space and looking at the
        // character which follows it (the character stays in the input).
        final char ahead = wsskip();
        if (ahead == '%') {
            // Parameter entity reference; handled the same in all phases
            getch();
            pent(' ');
            continue;
        }
        switch (phase) {
        case 0:		// read element name
            mBuffIdx = -1;
            bname(mIsNSAware);
            phase = 1;
            break;

        case 1:		// read element content definition
            if ((ahead == 'E') || (ahead == 'A')) {
                mBuffIdx = -1;
                bntok();
                switch (bkeyword()) {
                case 'E':	// EMPTY
                case 'Y':	// ANY
                    phase = 2;
                    break;

                default:
                    panic(FAULT);
                }
            } else if (ahead == '(') {
                dtdelm_cont(true);
                phase = 2;
            } else {
                panic(FAULT);
            }
            break;

        case 2:		// expect '>', which is left for the caller
            if (ahead == '>')
                phase = -1;
            else
                panic(FAULT);
            break;

        default:
            panic(FAULT);
        }
    }
}
/**
* Parses an element content particle.
*
* <p>A particle is either an element name or a parenthesized choice
* ('|') or sequence (',') of particles, optionally followed by one of
* the '?', '+' or '*' occurrence indicators. Nested particles are parsed
* by recursive calls. A parameter entity reference ('%') is accepted at
* any position between tokens. White space which follows the particle is
* skipped before the method returns.</p>
*
* @param mix If <code>true</code>, the particle must start with '(' and
* may be a mixed content declaration ('#PCDATA'); if <code>false</code>,
* the particle may be a plain element name and '#' is an error.
* @exception Exception is parser specific exception form panic method.
* @exception IOException
*/
private void dtdelm_cont(boolean mix)
throws Exception
{
// The separator ('|' or ',') used by this group; 0 until the first
// separator is met
char type = 0;
char ch;
for (short st = 0; st >= 0;) {
if ((ch = getch()) == '%') {
pent(' ');
wsskip();
continue;
}
switch (st) {
case 0: // an element name or a choice/sequence/mixed
switch (ch) {
case '(':
wsskip();
st = 1; // the first element of content particle
break;
default:
if (mix == true)
panic(FAULT);
bkch();
mBuffIdx = -1;
bname(mIsNSAware); // element name
st = -1; // exit
break;
}
break;
case 1: // the first element of content particle
switch (ch) {
case '#':
if (mix == false)
panic(FAULT);
bkch();
dtdelm_mix(); // mixed content declaration
// dtdelm_mix reads the rest of the declaration including the
// closing ')' and the optional '*'
return;
default:
bkch();
dtdelm_cont(false);
st = 2; // type of content particle
break;
}
break;
case 2: // type of content particle
switch (ch) {
case '|': // it is a choice
type = '|';
// The separator is pushed back and consumed again in state 3
bkch();
st = 3; // read a choice or a sequence
break;
case ',': // it is a sequence
type = ',';
bkch();
st = 3; // read a choice or a sequence
break;
case ')': // it is one element sequence
st = -1; // exit
break;
default:
panic(FAULT);
}
break;
case 3: // read a choice or a sequence
switch (ch) {
case '|': // it is a choice
// Separators must not be mixed within one group
if (type != '|')
panic(FAULT);
wsskip();
dtdelm_cont(false);
break;
case ',': // it is a sequence
if (type != ',')
panic(FAULT);
wsskip();
dtdelm_cont(false);
break;
case ')': // it is the end of a sequence or a choice
st = -1; // exit
break;
default:
panic(FAULT);
}
break;
default:
panic(FAULT);
}
}
// Optional occurrence indicator
switch (getch()) {
case '?':
case '+':
case '*':
break;
default:
bkch();
}
wsskip();
}
/**
* Parses an element mixed content declaration.
*
* <p>On entry the opening parenthesis and the white space after it have
* already been read by <code>dtdelm_cont</code>; the next character is
* the '#' of '#PCDATA'. The method reads
* <code>'#PCDATA' (S? '|' S? Name)* S? ')*'</code> or
* <code>'#PCDATA' S? ')'</code> (with an optional '*'), accepts
* parameter entity references between tokens and skips the white space
* which follows the declaration. Element names are checked for syntax
* only.</p>
*
* @exception Exception is parser specific exception form panic method.
* @exception IOException
*/
private void dtdelm_mix()
throws Exception
{
    // String '(' S? had been read by dtdelm_cont
    if ((getch() != '#') || ("PCDATA".equals(name(false)) != true))
        panic(FAULT);
    for (short st = 0; st >= 0;) {
        switch (st) {
        case 0:		// read pcdata element content
            switch (wsskip()) {
            case '|':
                // The '|' is left in the input for state 1
                st = 1;		// read mixed element content
                break;

            case ')':
                getch();	// read ')'
                // The '*' is optional when there are no element names
                if (getch() != '*')
                    bkch();
                st = -1;	// exit
                break;

            case '%':
                getch();
                pent(' ');
                break;

            default:
                panic(FAULT);
            }
            break;

        case 1:		// read mixed element content
            // String (S? '|' S? name)* S? ')*'
            switch (getch()) {
            case '|':
                wsskip();
                st = 2;		// read name in the mixed element content
                break;

            case ')':
                // The '*' is mandatory when element names are present
                if (getch() != '*')
                    panic(FAULT);
                st = -1;	// exit
                break;

            case '%':
                pent(' ');
                break;

            default:
                panic(FAULT);
            }
            break;

        case 2:		// read name in the mixed element content
            switch (getch()) {
            case '%':
                pent(' ');
                break;

            default:
                // The character just read is the first character of the
                // element name: put it back so that bname() reads the
                // whole name, as every other caller of bname() does.
                bkch();
                mBuffIdx = -1;
                bname(mIsNSAware);  // element name
                wsskip();
                st = 1;		// read mixed element content
                break;
            }
            break;

        default:
            panic(FAULT);
        }
    }
    wsskip();
}
/**
* Parses an attribute list declaration.
*
* This method parses the declaration up to the closing angle
* bracket. The element the declaration belongs to is looked up in
* <code>mAttL</code> (a new entry is added on top of the list if there
* is none) and every attribute definition which follows is parsed by
* <code>dtdatt</code>. A declaration without any attribute definition,
* which is permitted by the XML grammar, leaves the entry of the element
* unchanged.
*
* @exception Exception is parser specific exception form panic method.
* @exception IOException
*/
private void dtdattl()
throws Exception
{
    char elmqn[] = null;
    Pair elm = null;
    char ch;
    for (short st = 0; st >= 0;) {
        ch = getch();
        switch (st) {
        case 0:		// read the element name
            switch (chtyp(ch)) {
            case 'a':
            case 'A':
            case '_':
            case 'X':
            case ':':
                bkch();
                // Get the element from the list or add a new one.
                elmqn = qname(mIsNSAware);
                elm = find(mAttL, elmqn);
                if (elm == null) {
                    elm = pair(mAttL);
                    elm.chars = elmqn;
                    mAttL = elm;
                }
                st = 1;		// read an attribute declaration
                break;

            case ' ':
                break;

            case '%':
                pent(' ');
                break;

            default:
                panic(FAULT);
                break;
            }
            break;

        case 1:		// read an attribute declaration
            switch (chtyp(ch)) {
            case 'a':
            case 'A':
            case '_':
            case 'X':
            case ':':
                bkch();
                dtdatt(elm);
                if (wsskip() == '>')
                    return;
                break;

            case '>':
                // The end of the declaration; this also covers a
                // declaration with no attribute definitions at all.
                // The '>' is left in the input for the caller.
                bkch();
                return;

            case ' ':
                break;

            case '%':
                pent(' ');
                break;

            default:
                panic(FAULT);
                break;
            }
            break;

        default:
            panic(FAULT);
            break;
        }
    }
}
/**
* Parses an attribute declaration.
*
* The attribute uses the following fields of Pair object:
* chars - characters of qualified name
* id - the type identifier of the attribute
* num - carries attribute flags where:
* 0x1 - attribute is declared in DTD (attribute declaration had
* been read);
* 0x2 - attribute's default value is used.
* list - a pair which holds the default value (chars field)
*
* <p>A new attribute is added on top of <code>elm.list</code>. If the
* attribute has already been declared on the element, the declaration
* is still parsed but into a pair which is not linked to the list, so
* the first declaration stays in effect. The method returns after the
* default declaration ('#REQUIRED', '#IMPLIED' or a default value,
* optionally preceded by '#FIXED') has been read.</p>
*
* @param elm An object which represents all defined attributes on an element.
* @exception Exception is parser specific exception form panic method.
* @exception IOException
*/
private void dtdatt(Pair elm)
throws Exception
{
    char attqn[] = null;
    Pair att = null;
    char ch;
    for (short st = 0; st >= 0;) {
        ch = getch();
        switch (st) {
        case 0:		// the attribute name
            switch (chtyp(ch)) {
            case 'a':
            case 'A':
            case '_':
            case 'X':
            case ':':
                bkch();
                // Get the attribute from the list or add a new one.
                attqn = qname(mIsNSAware);
                att = find(elm.list, attqn);
                if (att == null) {
                    // New attribute declaration
                    att = pair(elm.list);
                    att.chars = attqn;
                    att.num = 0x1;  // attribute is declared in DTD
                    att.id = 'c';   // default type is CDATA
                    elm.list = att;
                } else {
                    // Do not override the attribute declaration [#3.3]
                    att = pair(null);
                    att.chars = attqn;
                    att.num = 0x1;  // attribute is declared in DTD
                    att.id = 'c';   // default type is CDATA
                }
                wsskip();
                st = 1;
                break;

            case '%':
                pent(' ');
                break;

            case ' ':
                break;

            default:
                panic(FAULT);
            }
            break;

        case 1:		// the attribute type
            switch (chtyp(ch)) {
            case '(':
                att.id = 'u';	// enumeration type
                st = 2;		// read the first element of the list
                break;

            case '%':
                pent(' ');
                break;

            case ' ':
                break;

            default:
                bkch();
                bntok();	// read type id
                att.id = bkeyword();
                switch (att.id) {
                case 'o':	// NOTATION
                    // 'NOTATION' must be followed by at least one white
                    // space character and then, after any further white
                    // space, by '('. Both conditions are errors on their
                    // own, hence the '||'.
                    if (chtyp(getch()) != ' ' || wsskip() != '(')
                        panic(FAULT);
                    ch = getch();	// read '('
                    st = 2;		// read the first element of the list
                    break;

                case 'i':	// ID
                case 'r':	// IDREF
                case 'R':	// IDREFS
                case 'n':	// ENTITY
                case 'N':	// ENTITIES
                case 't':	// NMTOKEN
                case 'T':	// NMTOKENS
                case 'c':	// CDATA
                    wsskip();
                    st = 4;		// read default declaration
                    break;

                default:
                    panic(FAULT);
                }
                break;
            }
            break;

        case 2:		// read the first element of the list
            switch (chtyp(ch)) {
            case 'a':
            case 'A':
            case 'd':
            case '.':
            case ':':
            case '-':
            case '_':
            case 'X':
                bkch();
                switch (att.id) {
                case 'u':	// enumeration type
                    bntok();
                    break;

                case 'o':	// NOTATION
                    mBuffIdx = -1;
                    bname(false);
                    break;

                default:
                    panic(FAULT);
                    break;
                }
                wsskip();
                st = 3;		// read next element of the list
                break;

            case '%':
                pent(' ');
                break;

            case ' ':
                break;

            default:
                panic(FAULT);
                break;
            }
            break;

        case 3:		// read next element of the list
            switch (ch) {
            case ')':
                wsskip();
                st = 4;		// read default declaration
                break;

            case '|':
                wsskip();
                switch (att.id) {
                case 'u':	// enumeration type
                    bntok();
                    break;

                case 'o':	// NOTATION
                    mBuffIdx = -1;
                    bname(false);
                    break;

                default:
                    panic(FAULT);
                    break;
                }
                wsskip();
                break;

            case '%':
                pent(' ');
                break;

            default:
                panic(FAULT);
                break;
            }
            break;

        case 4:		// read default declaration
            switch (ch) {
            case '#':
                bntok();
                switch (bkeyword()) {
                case 'F':	// FIXED
                    wsskip();
                    st = 5;		// read the default value
                    break;

                case 'Q':	// REQUIRED
                case 'I':	// IMPLIED
                    st = -1;
                    break;

                default:
                    panic(FAULT);
                    break;
                }
                break;

            case '\"':
            case '\'':
                bkch();
                st = 5;		// read the default value
                break;

            case ' ':
            case '\n':
            case '\r':
            case '\t':
                break;

            case '%':
                pent(' ');
                break;

            default:
                panic(FAULT);
            }
            break;

        case 5:		// read the default value
            switch (ch) {
            case '\"':
            case '\'':
                bkch();
                bqstr('d');	// the value in the mBuff now
                att.num |= 0x2;	// attribute has default value
                att.list = pair(null);
                // Create a string like "attqname='value' "
                // att.chars[0] is not a name character, so the name
                // takes att.chars.length - 1 chars; the value is
                // mBuff[1..mBuffIdx]; three more chars are needed for
                // '=', the closing quote and the trailing space (the
                // opening quote takes the place of att.chars[0]).
                att.list.chars = new char[att.chars.length + mBuffIdx + 3];
                System.arraycopy(
                    att.chars, 1, att.list.chars, 0, att.chars.length - 1);
                att.list.chars[att.chars.length - 1] = '=';
                att.list.chars[att.chars.length] = ch;
                System.arraycopy(
                    mBuff, 1, att.list.chars, att.chars.length + 1, mBuffIdx);
                att.list.chars[att.chars.length + mBuffIdx + 1] = ch;
                att.list.chars[att.chars.length + mBuffIdx + 2] = ' ';
                st = -1;
                break;

            default:
                panic(FAULT);
                break;
            }
            break;

        default:
            panic(FAULT);
            break;
        }
    }
}
/**
 * Parses a notation declaration.
 *
 * Reads the notation name and its public and/or system identifiers (a
 * public identifier without a system one is accepted here) and reports
 * them through <code>notDecl</code>. The closing angle bracket is not
 * consumed by this method.
 *
 * @exception Exception is parser specific exception from panic method.
 * @exception IOException
 */
private void dtdnot()
    throws Exception
{
    wsskip();
    final String notation = name(false);
    wsskip();
    // 'N' lets pubsys accept a PUBLIC id that has no SYSTEM id
    final Pair extId = pubsys('N');
    final String pubid = extId.name;
    final String sysid = extId.value;
    notDecl(notation, pubid, sysid);
    del(extId);
}
/**
 * DTD post-processing.
 *
 * Completes default attribute processing: for every attribute in the
 * <code>mAttL</code> lists which carries a default value declaration
 * (<code>attr.list != null</code>), the stored "name='value' " text is
 * pushed as an input, re-read with the normalization required by the
 * attribute type and handed over to <code>battrval</code>. The method is
 * meant to run before the first element of the document is read.
 *
 * @exception Exception is parser specific exception from panic method.
 * @exception IOException
 */
protected void dtdpost()
    throws Exception
{
    Pair element = mAttL;
    while (element != null) {
        Pair attribute = element.list;
        while (attribute != null) {
            if (attribute.list != null) {
                // Re-parse the stored default value declaration
                push(new Input(attribute.list.chars));
                bname(false); // skip attribute's name
                wsskip();
                if (getch() != '=') {
                    panic(FAULT);
                }
                // CDATA attributes get CDATA normalization, others non-CDATA
                final char mode;
                if ((char)attribute.id == 'c') {
                    mode = 'c';
                } else {
                    mode = 'i';
                }
                bqstr(mode); // read the value
                battrval(attribute); // mElm == null at this stage
                pop();
            }
            attribute = attribute.next;
        }
        element = element.next;
    }
}
/**
* Parses all attributes of the current element (<code>mElm</code>).
*
* On entry <code>mElm.list</code> is the list of attributes declared for
* the element. The method builds the actual list of attributes, stores
* it back to <code>mElm.list</code>, stores the attribute count in
* <code>mElm.num</code> and moves namespace declarations to the top of
* <code>mPref</code>, calling <code>newPrefix</code> for each of them.
*
* <p><code>att.num</code> carries attribute flags where: 0x1 -
* attribute is declared in DTD (attribute declaration had been read);
* 0x2 - attribute's default value is used.</p>
*
* Note, in order to provide correct values of default attributes and to
* support DOM implementation this method MUST process default attributes
* before defined attributes.
*
* <p>In namespace aware mode (<code>mIsNSAware</code>) the prefixes of the
* element and of its attributes are resolved against <code>mPref</code>
* at the end; an unresolved prefix is reported through
* <code>panic</code>.</p>
*
* @exception Exception is parser specific exception from panic method.
* @exception IOException
*/
protected void attrs()
throws Exception
{
final boolean withns = mIsNSAware;
Pair head = null; // head of NS declarations list
Pair tail = null; // tail of NS declarations list
Pair decl = null;
Pair next = mElm.list; // list of declared attributes of this element
Pair attr = null;
Pair list = null; // empty list of element's attributes
int anum = 0; // actual number of attributes
// Main attribute processing loop
attrloop: while (true) {
if (next == null) { // next is null or a reference to default
// No defaults left. Read defined attributes.
switch (wsskip()) {
case '/':
case '>':
break attrloop; // all attributes had been processed
case EOS:
// no break here: presumably relies on panic() throwing - TODO confirm
panic(FAULT);
default:
// Read the attribute name and value
attr = pair(null);
attr.chars = qname(withns);
attr.num = 0; // no attribute flags
wsskip();
if (getch() != '=')
panic(FAULT);
// Note: mElm.list is still the list of declared attributes here;
// it is replaced only after the loop.
if ((decl = find(mElm.list, attr.chars)) != null) {
attr.num |= 0x1; // attribute is declared
attr.id = decl.id;
bqstr(((char)attr.id == 'c')? 'c': 'i');
} else {
attr.id = 'c'; // CDATA-type by default [#3.3.3]
bqstr('c'); // read the value
}
battrval(attr); // mElm != null in contrast with dtdpost
break;
}
} else {
// Find next declared attribute with default value
// (both flags must be set: 0x1 declared and 0x2 has default)
while (next != null && (next.num & 0x3) != 0x3)
next = next.next;
if (next == null)
continue attrloop; // there is no more default attributes
// Copy of default attribute
attr = pair(null);
attr.copyof(next);
next = next.next;
}
// Check for duplicate or default attribute
Pair defa = find(list, attr.chars); // lookup for duplicate
if (defa != null) {
if ((defa.num & 0x2) == 0) // attribute is not default
panic(FAULT); // duplicate attribute
defa.value = attr.value; // override default attribute value
defa.num &= ~0x2; // clear default value flag
if (defa.list != null) // link to NS declaration
defa.list.value = defa.value; // update NS declaration
del(attr); // no need to add attribute to the list
continue attrloop;
}
// Classify current attribute
switch (attr.chars[0]) { // length of prefix plus 1
case 0: // there is no prefix
if (attr.chars.length != 6) // cannot be 'xmlns'
break;
// compare name chars to 'xmlns'
// (intentional fall through into the 'xmlns' comparison below)
case 6: // 5 char prefix
if (withns == false) // no need to continue if non-NS-aware
break;
if (attr.chars[1] == 'x' &&
attr.chars[2] == 'm' &&
attr.chars[3] == 'l' &&
attr.chars[4] == 'n' &&
attr.chars[5] == 's') {
// Namespace declaration
decl = pair(null);
decl.list = mElm; // prefix owner element
decl.value = attr.value; // namespace string
if (attr.chars[0] == 0) {
// Default namespace
decl.name = ""; // prefix string
decl.chars = NONS;
} else {
// Prefix to namespace mapping
decl.name = attr.local(); // prefix string
int len = decl.name.length();
// chars[0] holds the prefix length plus 1, the prefix follows
decl.chars = new char[len + 1];
decl.chars[0] = (char)(len + 1);
decl.name.getChars(0, len, decl.chars, 1);
}
attr.list = decl; // link from attribute to its NS decl
// Add NS decl to the temporary list of namespace decl
// Note, call to newPrefix is deferred because NS decl
// in default attr may be overridden by defined attr.
if (tail != null) {
tail.next = decl;
tail = decl;
} else {
head = decl;
tail = decl;
}
}
break;
case 4: // 3 char prefix
if (attr.chars[1] == 'x' &&
attr.chars[2] == 'm' &&
attr.chars[3] == 'l') {
if (attr.eqname(XMLSPC)) { // 'xml:space' attribute
// Manage the white space preserve flag
if (XMLSPC_PRESERVE.equals(attr.value)) {
mElm.id |= FLAG_XMLSPC_PRESERVE;
} else {
mElm.id &= ~FLAG_XMLSPC_PRESERVE;
}
} else if (attr.eqname(XMLID)) { // 'xml:id' attribute
attr.id = 'i'; // enforce ID type on the attribute
}
}
break;
default:
break;
}
// Add current attribute to the list
// (the attribute is prepended, so the list is in reverse reading order)
attr.next = list;
list = attr;
// take into account namespace declaration also
anum++; // number of attributes
}
mElm.list = list; // mElm.list is complete list of element's attributes
mElm.num = anum; // actual number of attributes
// Declare namespaces
while (head != null) {
// Move declaration from the temp list to the top of mPref stack
decl = head;
head = decl.next;
decl.next = mPref;
mPref = decl;
// A namespace declaration. mPref.name contains prefix
// and mPref.value contains namespace URI.
newPrefix();
}
// Resolve element and all attribute prefixes
if (withns == false)
return; // no need to resolve prefixes
// The element itself is visited first, then each attribute of the list
for (attr = mElm; attr != null; attr = next) {
char len = attr.chars[0]; // length of prefix
if (attr != mElm) {
next = attr.next;
if (len == 0)
continue; // no default namespaces for attributes
if (attr.list != null) { // it is NS declaration attribute
attr.ns = "http://www.w3.org/2000/xmlns/";
continue;
}
} else {
next = list; // next is the first attribute
if (len == 0) { // the element has no prefix
// Special case: default namespace lookup
for (Pair pref = mPref; pref != null; pref = pref.next) {
if (pref.chars[0] != 0)
continue; // next prefix
attr.ns = pref.value;
break; // the default namespace has been assigned
}
continue; // element is done; go get the first attribute
}
}
// Resolve
resolve: for (Pair pref = mPref; pref != null; pref = pref.next) {
if (len != pref.chars[0])
continue resolve; // the prefix length is not equal
// Compare characters of prefixes equal by length
for (char i = 1; i < len; i++) {
if (pref.chars[i] != attr.chars[i])
continue resolve;
}
// All corresponding characters are equal
attr.ns = pref.value;
break;
}
// Attributes without prefix are filtered out before the
// resolve loop. Element without prefix is handled as a
// special case above. There may not be unresolved prefixes
// at this stage.
if (attr.ns == null)
panic(FAULT);
}
}
/**
 * Parses a comment.
 *
 * The '&lt;!' part is read in dispatcher so the method starts with the
 * first '-' after '&lt;!'. The comment text is accumulated in the parser's
 * buffer and reported through <code>comm(char[], int)</code> once the
 * closing '--&gt;' has been read. A '--' inside the comment body which is
 * not followed by '&gt;' is a fault [#2.5].
 *
 * @exception Exception is parser specific exception from panic method.
 */
private void comm()
    throws Exception
{
    if (mPh == PH_DOC_START) {
        mPh = PH_MISC_DTD; // misc before DTD
    }
    // '<!' has been already read by dispatcher.
    mBuffIdx = -1;
    short state = 0;
    while (state >= 0) {
        final char c = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
        if (c == EOS) {
            panic(FAULT);
        }
        if (state == 0 || state == 1) {
            // first and second '-' of the comment open
            if (c == '-') {
                state++;
            } else {
                panic(FAULT);
            }
        } else if (state == 2) {
            // the comment body
            if (c == '-') {
                state = 3;
            } else {
                bappend(c);
            }
        } else if (state == 3) {
            // a '-' has been seen; is it the start of the comment close?
            if (c == '-') {
                state = 4;
            } else {
                bappend('-');
                bappend(c);
                state = 2;
            }
        } else if (state == 4 && c == '>') {
            // '>' of the comment close
            comm(mBuff, mBuffIdx + 1);
            state = -1;
        } else {
            // '--' not followed by '>' [#2.5 compatibility note],
            // or an unknown state
            panic(FAULT);
        }
    }
}
/**
 * Parses a processing instruction.
 *
 * The '&lt;?' is read in dispatcher so the method starts with the first
 * character of the PI target name after '&lt;?'. The target name and the
 * body text are reported through <code>pi(String, String)</code> when the
 * closing '?&gt;' has been read.
 *
 * @exception Exception is parser specific exception from panic method.
 * @exception IOException
 */
private void pi()
    throws Exception
{
    // '<?' has been already read by dispatcher.
    String target = null;
    mBuffIdx = -1;
    short state = 0;
    while (state >= 0) {
        final char c = getch();
        if (c == EOS) {
            panic(FAULT);
        }
        switch (state) {
        case 0: { // read the PI target name
            final int type = chtyp(c);
            if (type == 'a' || type == 'A' || type == '_' ||
                type == ':' || type == 'X') {
                bkch();
                target = name(false);
                // PI target name may not be empty string [#2.6]
                // PI target name 'XML' is reserved [#2.6]
                if (target.length() == 0 ||
                    mXml.name.equals(target.toLowerCase())) {
                    panic(FAULT);
                }
                // This is processing instruction
                if (mPh == PH_DOC_START) { // the beginning of the document
                    mPh = PH_MISC_DTD; // misc before DTD
                }
                wsskip(); // skip spaces after the PI target name
                state = 1; // accumulate the PI body
                mBuffIdx = -1;
            } else {
                panic(FAULT);
            }
            break;
        }
        case 1: // accumulate the PI body
            if (c == '?') {
                state = 2; // possible end of the PI body
            } else {
                bappend(c);
            }
            break;
        case 2: // a '?' has been seen
            if (c == '>') {
                // PI has been read.
                pi(target, new String(mBuff, 0, mBuffIdx + 1));
                state = -1;
            } else if (c == '?') {
                // the previous '?' belongs to the body, this one may close
                bappend('?');
            } else {
                bappend('?');
                bappend(c);
                state = 1; // accumulate the PI body
            }
            break;
        default:
            panic(FAULT);
        }
    }
}
/**
 * Parses a CDATA section.
 *
 * The '&lt;!' part is read in dispatcher so the method starts with the
 * first '[' after '&lt;!'. The section content is accumulated in the
 * parser's buffer and flushed with <code>bflash</code> when the closing
 * ']]&gt;' has been read.
 *
 * @exception Exception is parser specific exception from panic method.
 * @exception IOException
 */
private void cdat()
    throws Exception
{
    // '<!' has been already read by dispatcher.
    mBuffIdx = -1;
    short state = 0;
    while (state >= 0) {
        final char c = getch();
        switch (state) {
        case 0: // the first '[' of the CDATA open
            if (c == '[') {
                state = 1;
            } else {
                panic(FAULT);
            }
            break;
        case 1: // read "CDATA"
            if (chtyp(c) == 'A') {
                bappend(c);
                break;
            }
            // The keyword is over; it must be exactly "CDATA"
            final String keyword = new String(mBuff, 0, mBuffIdx + 1);
            if (!"CDATA".equals(keyword)) {
                panic(FAULT);
            }
            bkch();
            state = 2;
            break;
        case 2: // the second '[' of the CDATA open
            if (c != '[') {
                panic(FAULT);
            }
            mBuffIdx = -1;
            state = 3;
            break;
        case 3: // read data before the first ']'
            if (c == ']') {
                state = 4;
            } else {
                if (c == EOS) {
                    panic(FAULT);
                }
                bappend(c);
            }
            break;
        case 4: // read the second ']' or continue to read the data
            if (c == ']') {
                state = 5;
            } else {
                bappend(']');
                bappend(c);
                state = 3;
            }
            break;
        case 5: // read '>' or continue to read the data
            if (c == ']') {
                // one more ']': the oldest of the pending ones is data
                bappend(']');
            } else if (c == '>') {
                bflash();
                state = -1;
            } else {
                bappend(']');
                bappend(']');
                bappend(c);
                state = 3;
            }
            break;
        default:
            panic(FAULT);
        }
    }
}
/**
 * Reads a xml name.
 *
 * The parser's buffer is reset and the name is read into it by
 * <code>bname</code>; the characters after the leading colon offset
 * character are returned as a string. Pass <code>false</code> for names
 * such as PI targets and entity names which are not qualified names.
 *
 * @param ns The true value turns namespace conformance on.
 * @return The name has been read.
 * @exception Exception When incorrect character appear in the name.
 * @exception IOException
 */
protected String name(boolean ns)
    throws Exception
{
    mBuffIdx = -1;
    bname(ns);
    // mBuff[0] is the colon offset; the name itself starts at index 1
    final String result = new String(mBuff, 1, mBuffIdx);
    return result;
}
/**
 * Reads a qualified xml name.
 *
 * The result is an array of characters. The first (chars[0]) character
 * is the index of the colon character which separates the prefix from
 * the local name. If the index is zero, the name does not contain
 * separator or the parser works in the namespace unaware mode. The
 * length of qualified name is the length of the array minus one.
 *
 * @param ns The true value turns namespace conformance on.
 * @return The characters of a qualified name.
 * @exception Exception When incorrect character appear in the name.
 * @exception IOException
 */
protected char[] qname(boolean ns)
    throws Exception
{
    mBuffIdx = -1;
    bname(ns);
    // Copy the colon offset character together with the name characters
    final int total = mBuffIdx + 1;
    final char[] result = new char[total];
    System.arraycopy(mBuff, 0, result, 0, total);
    return result;
}
/**
 * Reads the public or/and system identifiers.
 *
 * The method reads the "PUBLIC" or "SYSTEM" keyword followed by the
 * quoted identifier literals. In the returned pair <code>name</code> is
 * the public identifier (or null) and <code>value</code> is the system
 * identifier (or null).
 *
 * @param flag The 'N' allows public id be without system id.
 * @return The public or/and system identifiers pair.
 * @exception Exception is parser specific exception from panic method.
 * @exception IOException
 */
private Pair pubsys(char flag)
    throws Exception
{
    final Pair result = pair(null);
    final String keyword = name(false);
    if ("SYSTEM".equals(keyword)) {
        result.name = null;
        bqstr(' ');
        result.value = new String(mBuff, 1, mBuffIdx);
        return result;
    }
    if (!"PUBLIC".equals(keyword)) {
        panic(FAULT);
        return null;
    }
    bqstr('i'); // non-CDATA normalization [#4.2.2]
    result.name = new String(mBuff, 1, mBuffIdx);
    final char ahead = wsskip();
    if (ahead == '\"' || ahead == '\'') {
        // The system identifier literal follows the public one
        bqstr(' ');
        result.value = new String(mBuff, 1, mBuffIdx);
    } else {
        if (ahead == EOS) {
            panic(FAULT);
        }
        if (flag != 'N') { // [#4.7]
            panic(FAULT);
        }
        result.value = null;
    }
    return result;
}
/**
 * Reads an attribute value.
 *
 * The grammar which this method can read is:<br />
 * <code>eqstr := S "=" qstr</code><br />
 * <code>qstr := S ("'" string "'") |
 * ('"' string '"')</code><br />
 * The quoted string is read by <code>bqstr</code>.
 *
 * @param flag The '=' character forces the method to accept the '='
 * character before the quoted string and to read the string in the
 * '-' (not an attribute value) mode. Any other value is passed to
 * <code>bqstr</code> as is: 'c' - CDATA, 'i' - non CDATA,
 * ' ' - no normalization, '-' - not an attribute value,
 * 'd' - in DTD context.
 * @return The content of the quoted string as a string.
 * @exception Exception is parser specific exception from panic method.
 * @exception IOException
 */
protected String eqstr(char flag)
    throws Exception
{
    char mode = flag;
    if (flag == '=') {
        wsskip();
        if (getch() != '=') {
            panic(FAULT);
        }
        mode = '-';
    }
    bqstr(mode);
    return new String(mBuff, 1, mBuffIdx);
}
/**
* Resolves an entity.
*
* This method resolves built-in and character entity references. It
* also reports external entities to the application through
* <code>resolveEnt</code>. Reading starts with the first character of the
* entity name, i.e. after the '&amp;' character.
*
* <p>A built-in or character entity is replaced by its character in the
* parser's buffer. A known internal entity, or an external entity for
* which <code>resolveEnt</code> returned an input source, becomes the
* current input. In the DTD phase (<code>PH_DTD</code>) a general entity
* reference is left in the buffer "as is".</p>
*
* @param flag The 'x' character forces the method to report a skipped entity;
* 'i' character - indicates non-CDATA normalization.
* @return Name of unresolved entity or <code>null</code> if entity had been
* resolved successfully. For a character reference with a code of 0xffff
* or above (allowed only with the 'x' flag) the result is
* <code>UCS4_CHAR</code> and the code is stored in <code>mIent</code>.
* @exception Exception is parser specific exception from panic method.
* @exception IOException
*/
private String ent(char flag)
throws Exception
{
char ch;
int idx = mBuffIdx + 1; // buffer index of the '&' appended below
Input inp = null;
String str = null;
mESt = 0x100; // reset the built-in entity recognizer
bappend('&');
for (short st = 0; st >= 0;) {
ch = (mChIdx < mChLen)? mChars[mChIdx++]: getch();
switch (st) {
case 0: // the first character of the entity name
case 1: // read built-in entity name
switch (chtyp(ch)) {
case 'd':
case '.':
case '-':
// these characters may not start a name
if (st != 1)
panic(FAULT);
// intentional fall through: append as a name character
case 'a':
case 'A':
case '_':
case 'X':
bappend(ch);
eappend(ch);
st = 1;
break;
case ':':
if (mIsNSAware != false)
panic(FAULT);
bappend(ch);
eappend(ch);
st = 1;
break;
case ';':
if (mESt < 0x100) {
// The entity is a built-in entity
mBuffIdx = idx - 1;
bappend(mESt);
st = -1;
break;
} else if (mPh == PH_DTD) {
// In DTD entity declaration has to resolve character
// entities and include "as is" others. [#4.4.7]
bappend(';');
st = -1;
break;
}
// Convert an entity name to a string
str = new String(mBuff, idx + 1, mBuffIdx - idx);
inp = (Input)mEnt.get(str);
// Restore the buffer offset
mBuffIdx = idx - 1;
if (inp != null) {
if (inp.chars == null) {
// External entity
InputSource is = resolveEnt(str, inp.pubid, inp.sysid);
if (is != null) {
push(new Input(BUFFSIZE_READER));
setinp(is);
mInp.pubid = inp.pubid;
mInp.sysid = inp.sysid;
str = null; // the entity is resolved
} else {
// Unresolved external entity
if (flag != 'x')
panic(FAULT); // unknown entity within markup
// str is name of unresolved entity
}
} else {
// Internal entity
push(inp);
str = null; // the entity is resolved
}
} else {
// Unknown or general unparsed entity
// NOTE: resolveEnt could be used to let an app. to
// resolve entity by the entity name even if it was
// not defined.
if (flag != 'x')
panic(FAULT); // unknown entity within markup
// str is name of unresolved entity
}
st = -1;
break;
case '#':
// '#' is accepted only right after '&'
if (st != 0)
panic(FAULT);
st = 2;
break;
default:
panic(FAULT);
}
break;
case 2: // read character entity
switch (chtyp(ch)) {
case 'd':
bappend(ch);
break;
case ';':
// Convert the character entity to a character
try {
int i = Integer.parseInt(
new String(mBuff, idx + 1, mBuffIdx - idx), 10);
if (i >= 0xffff) {
// The code does not fit a single char; it is returned
// through mIent and only with the 'x' flag
if (flag != 'x')
panic(FAULT);
mIent = i;
str = UCS4_CHAR;
mBuffIdx = idx - 1;
st = -1;
break;
}
ch = (char)i;
} catch (NumberFormatException nfe) {
panic(FAULT);
}
// Restore the buffer offset
mBuffIdx = idx - 1;
if (ch == ' ' || mInp.next != null)
bappend(ch, flag);
else
bappend(ch);
st = -1;
break;
case 'a':
// If the entity buffer is empty and ch == 'x'
if ((mBuffIdx == idx) && (ch == 'x')) {
st = 3;
break;
}
// intentional fall through: any other letter is a fault
default:
panic(FAULT);
}
break;
case 3: // read hex character entity
switch (chtyp(ch)) {
case 'A':
case 'a':
case 'd':
// letters which are not hex digits are rejected later by parseInt
bappend(ch);
break;
case ';':
// Convert the character entity to a character
try {
int i = Integer.parseInt(
new String(mBuff, idx + 1, mBuffIdx - idx), 16);
if (i >= 0xffff) {
// The code does not fit a single char; it is returned
// through mIent and only with the 'x' flag
if (flag != 'x')
panic(FAULT);
mIent = i;
str = UCS4_CHAR;
mBuffIdx = idx - 1;
st = -1;
break;
}
ch = (char)i;
} catch (NumberFormatException nfe) {
panic(FAULT);
}
// Restore the buffer offset
mBuffIdx = idx - 1;
if (ch == ' ' || mInp.next != null)
bappend(ch, flag);
else
bappend(ch);
st = -1;
break;
default:
panic(FAULT);
}
break;
default:
panic(FAULT);
}
}
return str;
}
/**
 * Resolves a parameter entity.
 *
 * This method resolves a parameter entity reference. It also reports
 * external entities to the application through <code>resolveEnt</code>
 * and unknown or unresolved ones through <code>skippedEnt</code>.
 * Outside of the DTD phase the '%' character is appended to the buffer
 * and nothing else is done [#4.4.1].
 *
 * @param flag The '-' instruct the method to do not set up surrounding
 * spaces [#4.4.8].
 * @exception Exception is parser specific exception from panic method.
 * @exception IOException
 */
private void pent(char flag)
    throws Exception
{
    final int mark = mBuffIdx + 1; // buffer index of the '%' appended below
    bappend('%');
    if (mPh != PH_DTD) { // the DTD internal subset
        return; // Not Recognized [#4.4.1]
    }
    // Read entity name
    bname(false);
    final String ename = new String(mBuff, mark + 2, mBuffIdx - mark - 1);
    if (getch() != ';') {
        panic(FAULT);
    }
    final Input decl = (Input)mPEnt.get(ename);
    // Restore the buffer offset
    mBuffIdx = mark - 1;
    if (decl == null) {
        // Unknown parameter entity
        skippedEnt("%" + ename);
        return;
    }
    if (decl.chars != null) {
        // Internal parameter entity
        if (flag == '-') {
            // No surrounding spaces
            decl.chIdx = 1;
        } else {
            // Insert surrounding spaces
            bappend(' '); // tail space
            decl.chIdx = 0;
        }
        push(decl);
        return;
    }
    // External parameter entity
    final InputSource source = resolveEnt(ename, decl.pubid, decl.sysid);
    if (source == null) {
        // Unresolved external parameter entity
        skippedEnt("%" + ename);
        return;
    }
    if (flag != '-') {
        bappend(' '); // tail space
    }
    push(new Input(BUFFSIZE_READER));
    // BUG: there is no leading space! [#4.4.8]
    setinp(source);
    mInp.pubid = decl.pubid;
    mInp.sysid = decl.sysid;
}
/**
 * Skips xml white space characters.
 *
 * This method skips white space characters (' ', '\t', '\n', '\r') and
 * looks ahead the first character which is not a white space. That
 * character is pushed back, so the next read returns it again.
 *
 * @return The first not white space look ahead character.
 * @exception IOException
 */
protected char wsskip()
    throws IOException
{
    char c;
    do {
        // Read next character
        c = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
        // type 3 in nmttyp marks [ \t\n\r]; non-ASCII is never white space
    } while (c < 0x80 && nmttyp[c] == 3);
    mChIdx--; // bkch();
    return c;
}
/**
* Reports document type.
*
* Implemented by the concrete parser.
*
* @param name The name given in the document type declaration (the
* original comment calls it "the name of the entity"; presumably this is
* the document element name - confirm against the caller).
* @param pubid The public identifier of the DTD or <code>null</code>.
* @param sysid The system identifier of the DTD or <code>null</code>.
* @param dtdint The DTD internal subset or <code>null</code>.
*/
protected abstract void docType(
String name, String pubid, String sysid, char[] dtdint);
/**
* Reports a comment.
*
* Implemented by the concrete parser. The comment parser passes its own
* buffer as <code>text</code>, so only the first <code>length</code>
* characters are meaningful.
*
* @param text The comment text starting from first character.
* @param length The number of characters in comment.
*/
protected abstract void comm(char[] text, int length);
/**
* Reports a processing instruction.
*
* Implemented by the concrete parser.
*
* @param target The processing instruction target name.
* @param body The processing instruction body text.
* @exception Exception is implementation specific exception.
*/
protected abstract void pi(String target, String body)
throws Exception;
/**
* Reports new namespace prefix.
*
* The declaration being reported is on the top of the prefix stack:
* <code>mPref.name</code> is the Namespace prefix being declared and
* <code>mPref.value</code> is the Namespace URI the prefix is mapped
* to. An empty string is used for the default element namespace,
* which has no prefix.
*
* @exception Exception is implementation specific exception.
*/
protected abstract void newPrefix()
throws Exception;
/**
* Reports skipped entity name.
*
* Implemented by the concrete parser.
*
* @param name The entity name. For a parameter entity the name starts
* with the '%' character.
* @exception Exception is implementation specific exception.
*/
protected abstract void skippedEnt(String name)
throws Exception;
/**
* Returns an <code>InputSource</code> for specified entity or
* <code>null</code>.
*
* Implemented by the concrete parser. When <code>null</code> is returned
* the entity is treated by the caller as unresolved.
*
* @param name The name of the entity.
* @param pubid The public identifier of the entity.
* @param sysid The system identifier of the entity.
* @return The input source to read the entity from or <code>null</code>.
* @exception Exception is implementation specific exception.
*/
protected abstract InputSource resolveEnt(
String name, String pubid, String sysid)
throws Exception;
/**
* Reports internal parsed entity.
*
* Implemented by the concrete parser.
*
* @param name The entity name.
* @param value The entity replacement text.
* @exception Exception is implementation specific exception.
*/
protected abstract void intparsedEntDecl(String name, char[] value)
throws Exception;
/**
* Reports external parsed entity.
*
* Implemented by the concrete parser.
*
* @param name The entity name.
* @param pubid The entity public identifier, may be null.
* @param sysid The entity system identifier, may be null.
* @exception Exception is implementation specific exception.
*/
protected abstract void extparsedEntDecl(
String name, String pubid, String sysid)
throws Exception;
/**
* Reports notation declaration.
*
* Implemented by the concrete parser.
*
* @param name The notation's name.
* @param pubid The notation's public identifier, or null if none was given.
* @param sysid The notation's system identifier, or null if none was given.
* @exception Exception is implementation specific exception.
*/
protected abstract void notDecl(String name, String pubid, String sysid)
throws Exception;
/**
* Reports unparsed entity name.
*
* Implemented by the concrete parser.
*
* @param name The unparsed entity's name.
* @param pubid The entity's public identifier, or null if none was given.
* @param sysid The entity's system identifier.
* @param notation The name of the associated notation.
* @exception Exception is implementation specific exception.
*/
protected abstract void unparsedEntDecl(
String name, String pubid, String sysid, String notation)
throws Exception;
/**
* Notifies the handler about fatal parsing error.
*
* NOTE(review): callers in this class continue with the next statement
* after a call to this method (including switch fall through), so the
* implementation is presumably expected to always throw - confirm.
*
* @param msg The problem description message.
* @exception Exception is implementation specific exception.
*/
protected abstract void panic(String msg)
throws Exception;
/**
 * Processes an attribute value in the buffer.
 *
 * The value is the buffer content after the leading (index 0) character;
 * it is stored as a string to <code>attr.value</code>.
 *
 * @param attr The attribute owner of the value in the buffer.
 */
protected void battrval(Pair attr)
{
    final String text = new String(mBuff, 1, mBuffIdx);
    attr.value = text;
}
/**
* Reads a qualified xml name.
*
* This is low level routine which leaves a qName in the buffer.
* The characters of a qualified name is an array of characters. The
* first (chars[0]) character is the index of the colon character which
* separates the prefix from the local name. If the index is zero, the
* name does not contain separator or the parser works in the namespace
* unaware mode. The length of qualified name is the length of the array
* minus one.
*
* <p>The name is appended after the current buffer content: the colon
* offset character is stored at the index <code>mBuffIdx + 1</code> (as
* of the moment of the call) and the name characters follow it. The
* characters are not appended one by one; they are copied from the
* character buffer by <code>bcopy</code> when the character buffer is
* exhausted and when the name is over. The character which terminates
* the name is pushed back.</p>
*
* @param ns The true value turns namespace conformance on.
* @exception Exception is parser specific exception from panic method.
* @exception IOException
*/
protected void bname(boolean ns)
throws Exception
{
char ch;
char type;
mBuffIdx++; // allocate a char for colon offset
int bqname = mBuffIdx; // buffer index of the colon offset character
int bcolon = bqname; // buffer index of the colon; bqname means no colon
int bchidx = bqname + 1; // buffer index for the next name character
int bstart = bchidx; // buffer index the next bcopy writes to
int cstart = mChIdx; // index in mChars of the first char not copied yet
// States 0 and 1 read the prefix, states 2 and 3 read the suffix
short st = (short)((ns == true)? 0: 2);
while (true) {
// Read next character
if (mChIdx >= mChLen) {
// The character buffer is exhausted: save what has been read so
// far and let getch refill the character buffer.
bcopy(cstart, bstart);
getch();
mChIdx--; // bkch();
cstart = mChIdx;
bstart = bchidx;
}
ch = mChars[mChIdx++];
type = (char)0; // [X]
if (ch < 0x80) {
type = (char)nmttyp[ch];
} else if (ch == EOS) {
panic(FAULT);
}
// Parse QName
switch (st) {
case 0: // read the first char of the prefix
case 2: // read the first char of the suffix
switch (type) {
case 0: // [aA_X]
bchidx++; // append char to the buffer
st++; // (st == 0)? 1: 3;
break;
case 1: // [:]
if (st == 2) { // read the first char of the suffix
// the colon is read once more and handled in state 3
mChIdx--; // bkch();
st = 3; // read the suffix
break;
}
// intentional fall through: a prefix may not start with ':'
default:
panic(FAULT);
}
break;
case 1: // read the prefix
case 3: // read the suffix
switch (type) {
case 0: // [aA_X]
case 2: // [.-d]
bchidx++; // append char to the buffer
break;
case 1: // [:]
bchidx++; // append char to the buffer
if (ns == true) {
if (bcolon != bqname)
panic(FAULT); // it must be only one colon
bcolon = bchidx - 1;
if (st == 1)
st = 2;
}
break;
default:
// End of the name: push the terminator back, copy the rest of
// the name and store the colon offset relative to bqname.
mChIdx--; // bkch();
bcopy(cstart, bstart);
mBuff[bqname] = (char)(bcolon - bqname);
return;
}
break;
default:
panic(FAULT);
}
}
}
/**
 * Reads a nmtoken.
 *
 * This is low level routine which leaves a nmtoken in the buffer. The
 * buffer is reset first; the character at index 0 is a zero colon offset
 * and the token characters follow it. The character which terminates
 * the token is pushed back. A zero length token is a fault.
 *
 * @exception Exception is parser specific exception from panic method.
 * @exception IOException
 */
protected void bntok()
    throws Exception
{
    mBuffIdx = -1;
    bappend((char)0); // default offset to the colon char
    boolean done = false;
    while (!done) {
        final char c = getch();
        switch (chtyp(c)) {
        case 'a':
        case 'A':
        case 'd':
        case '.':
        case ':':
        case '-':
        case '_':
        case 'X':
            bappend(c);
            break;
        case 'Z':
            panic(FAULT);
            // falls through
        default:
            bkch();
            if (mBuffIdx == 0) { // zero length nmtoken
                panic(FAULT);
            }
            done = true;
            break;
        }
    }
}
/**
 * Recognizes a keyword.
 *
 * This is low level routine which recognizes one of keywords in the
 * buffer. The keyword is the buffer content after the leading (index 0)
 * character. Keywords and their ids:
 * <pre>
 * ID       - i      IDREF    - r      IDREFS   - R
 * ENTITY   - n      ENTITIES - N
 * NMTOKEN  - t      NMTOKENS - T
 * ELEMENT  - e      ATTLIST  - a      NOTATION - o
 * CDATA    - c
 * REQUIRED - Q      IMPLIED  - I      FIXED    - F
 * EMPTY    - E      ANY      - Y
 * </pre>
 *
 * @return an id of a keyword or '?'.
 * @exception Exception is parser specific exception from panic method.
 * @exception IOException
 */
protected char bkeyword()
    throws Exception
{
    final String word = new String(mBuff, 1, mBuffIdx);
    // Dispatch by length first to keep the number of comparisons small
    switch (word.length()) {
    case 2:
        if ("ID".equals(word)) return 'i';
        break;
    case 3:
        if ("ANY".equals(word)) return 'Y';
        break;
    case 5:
        if ("IDREF".equals(word)) return 'r';
        if ("CDATA".equals(word)) return 'c';
        if ("FIXED".equals(word)) return 'F';
        if ("EMPTY".equals(word)) return 'E';
        break;
    case 6:
        if ("IDREFS".equals(word)) return 'R';
        if ("ENTITY".equals(word)) return 'n';
        break;
    case 7:
        if ("IMPLIED".equals(word)) return 'I';
        if ("NMTOKEN".equals(word)) return 't';
        if ("ATTLIST".equals(word)) return 'a';
        if ("ELEMENT".equals(word)) return 'e';
        break;
    case 8:
        if ("ENTITIES".equals(word)) return 'N';
        if ("NMTOKENS".equals(word)) return 'T';
        if ("NOTATION".equals(word)) return 'o';
        if ("REQUIRED".equals(word)) return 'Q';
        break;
    default:
        break;
    }
    return '?';
}
/**
* Reads a single or double quoted string in to the buffer.
*
* The buffer is reset first; the character at index 0 is a zero colon
* offset and the string content follows it. Leading white space before
* the opening quote is skipped. General entity references are resolved
* by <code>ent</code> unless the flag is 'd'; parameter entity references
* are processed by <code>pent</code> only when the flag is 'd'.
*
* @param flag 'c' - CDATA, 'i' - non CDATA, ' ' - no normalization;
* '-' - not an attribute value; 'd' - in DTD context.
* @exception Exception is parser specific exception from panic method.
* @exception IOException
*/
protected void bqstr(char flag)
throws Exception
{
Input inp = mInp; // remember the original input
mBuffIdx = -1;
bappend((char)0); // default offset to the colon char
char ch;
for (short st = 0; st >= 0;) {
ch = (mChIdx < mChLen)? mChars[mChIdx++]: getch();
switch (st) {
case 0: // read a single or double quote
switch (ch) {
case ' ':
case '\n':
case '\r':
case '\t':
break;
case '\'':
st = 2; // read a single quoted string
break;
case '\"':
st = 3; // read a double quoted string
break;
default:
panic(FAULT);
break;
}
break;
case 2: // read a single quoted string
case 3: // read a double quoted string
switch (ch) {
case '\'':
// The quote closes the string only if it matches the opening one
// and is read from the original input, i.e. not from an entity
// which has been pushed while reading the string.
if ((st == 2) && (mInp == inp))
st = -1;
else
bappend(ch);
break;
case '\"':
if ((st == 3) && (mInp == inp))
st = -1;
else
bappend(ch);
break;
case '&':
if (flag != 'd')
ent(flag);
else
bappend(ch);
break;
case '%':
if (flag == 'd')
pent('-');
else
bappend(ch);
break;
case '<':
// '<' is accepted only in the '-' and 'd' modes
if ((flag == '-') || (flag == 'd'))
bappend(ch);
else
panic(FAULT);
break;
case EOS: // EOS before single/double quote
// no break here: presumably relies on panic() throwing - TODO confirm
panic(FAULT);
case '\r': // EOL processing [#2.11 & #3.3.3]
// "\r\n" and a single "\r" read from the top level input
// (mInp.next == null) become "\n"
if (flag != ' ' && mInp.next == null) {
if (getch() != '\n')
bkch();
ch = '\n';
}
// intentional fall through: append with normalization
case ' ':
case '\n':
case '\t':
bappend(ch, flag);
break;
default:
// append the run of ordinary characters starting with this one
bappend();
break;
}
break;
default:
panic(FAULT);
}
}
// There is maximum one space at the end of the string in
// i-mode (non CDATA normalization) and it has to be removed.
if ((flag == 'i') && (mBuff[mBuffIdx] == ' '))
mBuffIdx -= 1;
}
/**
* Reports characters and empties the parser's buffer.
*
* This method is called only if parser is going to return control to
* the main loop. This means that the implementation may use the parser
* buffer to report characters without copying them to a temporary
* buffer.
*
* @exception Exception is implementation specific exception.
*/
protected abstract void bflash()
throws Exception;
/**
* Reports white space characters and empties the parser's buffer.
*
* This method is called only if parser is going to return control to
* the main loop. This means that the implementation may use the parser
* buffer to report white space without copying characters to a temporary
* buffer.
*
* @exception Exception is implementation specific exception.
*/
protected abstract void bflash_ws()
throws Exception;
/**
 * Appends a run of characters to parser's buffer.
 *
 * The run starts with the last read character (it is pushed back and
 * read again) and ends before the first of the special characters:
 * white space, a quote, '&lt;', '%', '&amp;' or the end of stream. The
 * special character is pushed back.
 *
 * @exception Exception is parser specific exception.
 */
protected void bappend()
    throws Exception
{
    bkch();
    for (;;) {
        final char c = (mChIdx < mChLen) ? mChars[mChIdx++] : getch();
        final boolean special =
            c == ' ' || c == '\"' || c == '\'' ||
            c == '\n' || c == '\r' || c == '\t' ||
            c == '<' || c == '%' || c == '&' || c == EOS;
        if (special) {
            bkch();
            return;
        }
        if (mBuffIdx + 1 < mBuff.length) {
            // Fast path: there is a room in the buffer
            mBuff[++mBuffIdx] = c;
        } else {
            // Slow path: the single character version expands the buffer
            bappend(c);
        }
    }
}
/**
 * Appends a character to parser's buffer with normalization.
 *
 * This implements attribute value normalization as described in the XML
 * specification [#3.3.3]. In the 'i' (non CDATA) mode a white space
 * character is appended as a single space and only if the buffer holds
 * some value characters and does not end with a space already. In the
 * 'c' (CDATA) mode a white space character is appended as a space. In
 * any other mode the character is appended as is.
 *
 * @param ch The character to append to the buffer.
 * @param mode The normalization mode.
 */
protected void bappend(char ch, char mode)
{
    final boolean blank =
        (ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t');
    char out = ch;
    if (mode == 'i') { // non CDATA normalization
        if (blank) {
            if ((mBuffIdx > 0) && (mBuff[mBuffIdx] != ' ')) {
                bappend(' ');
            }
            return;
        }
    } else if (mode == 'c') { // CDATA normalization
        if (blank) {
            out = ' ';
        }
    }
    if (mBuffIdx + 1 < mBuff.length) {
        mBuff[++mBuffIdx] = out;
    } else {
        // The single character version expands the buffer
        bappend(out);
    }
}
/**
 * Appends a character to parser's buffer.
 *
 * The buffer is expanded when it is full: its size is doubled, and it is
 * always made large enough to hold the new character. The capacity is
 * checked explicitly; no exception is used to detect a full buffer, so
 * unrelated runtime errors are not masked.
 *
 * @param ch The character to append to the buffer.
 */
protected void bappend(char ch)
{
    final int idx = mBuffIdx + 1;
    if (idx >= mBuff.length) {
        // Double the buffer size, but never allocate less than required
        int size = mBuff.length << 1;
        if (size <= idx) {
            size = idx + 1;
        }
        final char[] grown = new char[size];
        System.arraycopy(mBuff, 0, grown, 0, mBuff.length);
        mBuff = grown;
    }
    mBuff[idx] = ch;
    mBuffIdx = idx;
}
/**
 * Copies (mChIdx - cidx) characters from the character buffer (mChars)
 * into the parser's buffer (mBuff) and advances the parser's buffer index
 * by the same amount.
 *
 * The parser's buffer is enlarged by the number of copied characters when
 * the copy would not leave at least one spare position after it.
 *
 * @param cidx The character buffer (mChars) start index.
 * @param bidx The parser buffer (mBuff) start index.
 */
protected void bcopy(int cidx, int bidx)
{
	final int count = mChIdx - cidx;
	if (mBuff.length <= (bidx + count + 1)) {
		// Not enough room: continue in a bigger copy of the buffer
		char[] grown = new char[mBuff.length + count];
		System.arraycopy(mBuff, 0, grown, 0, mBuff.length);
		mBuff = grown;
	}
	System.arraycopy(mChars, cidx, mBuff, bidx, count);
	mBuffIdx += count;
}
/**
 * Recognizes the built-in entities <i>lt</i>, <i>gt</i>, <i>amp</i>,
 * <i>apos</i>, <i>quot</i>.
 *
 * The recognizer is a small state machine kept in mESt and fed with one
 * character of the entity name per call. The initial state is 0x100 and
 * the states 0x101 - 0x109 are partially matched names. Any state below
 * 0x100 is a built-in entity replacement character. The state 0x200
 * means the name is not a built-in entity; it is also entered when one
 * more character follows an already recognized name, and it is never
 * left by this method.
 *
 * @param ch the next character of an entity name.
 */
private void eappend(char ch)
{
	switch (mESt) {
	case 0x100:  // nothing matched yet: "l", "g", "a" or "q"
		if (ch == 'l') {
			mESt = 0x101;
		} else if (ch == 'g') {
			mESt = 0x102;
		} else if (ch == 'a') {
			mESt = 0x103;
		} else if (ch == 'q') {
			mESt = 0x107;
		} else {
			mESt = 0x200;
		}
		break;

	case 0x101:  // "l" matched, expect "lt"
		if (ch == 't') {
			mESt = '<';
		} else {
			mESt = 0x200;
		}
		break;

	case 0x102:  // "g" matched, expect "gt"
		if (ch == 't') {
			mESt = '>';
		} else {
			mESt = 0x200;
		}
		break;

	case 0x103:  // "a" matched, expect "am" or "ap"
		if (ch == 'm') {
			mESt = 0x104;
		} else if (ch == 'p') {
			mESt = 0x105;
		} else {
			mESt = 0x200;
		}
		break;

	case 0x104:  // "am" matched, expect "amp"
		if (ch == 'p') {
			mESt = '&';
		} else {
			mESt = 0x200;
		}
		break;

	case 0x105:  // "ap" matched, expect "apo"
		if (ch == 'o') {
			mESt = 0x106;
		} else {
			mESt = 0x200;
		}
		break;

	case 0x106:  // "apo" matched, expect "apos"
		if (ch == 's') {
			mESt = '\'';
		} else {
			mESt = 0x200;
		}
		break;

	case 0x107:  // "q" matched, expect "qu"
		if (ch == 'u') {
			mESt = 0x108;
		} else {
			mESt = 0x200;
		}
		break;

	case 0x108:  // "qu" matched, expect "quo"
		if (ch == 'o') {
			mESt = 0x109;
		} else {
			mESt = 0x200;
		}
		break;

	case 0x109:  // "quo" matched, expect "quot"
		if (ch == 't') {
			mESt = '\"';
		} else {
			mESt = 0x200;
		}
		break;

	case '<':   // "lt" was complete
	case '>':   // "gt" was complete
	case '&':   // "amp" was complete
	case '\'':  // "apos" was complete
	case '\"':  // "quot" was complete
		// A longer name is not a built-in entity
		mESt = 0x200;
		break;

	default:
		// Already rejected (or unknown state): nothing to do
		break;
	}
}
/**
 * Sets up a new input source on the top of the input stack.
 *
 * The character buffer of the current input (mInp) is reset and the xml
 * text declaration, if any, is consumed through the reader chosen for the
 * source:<br />
 * - a character stream is used as is;<br />
 * - a byte stream with an encoding provided by the input source is
 *   decoded with that encoding ("UTF-16" requires a byte order mark);<br />
 * - a byte stream without an encoding is decoded according to its byte
 *   order mark or, when there is none, according to the encoding of the
 *   xml text declaration (UTF-8 by default).<br />
 * Note, the first byte returned by the entity's byte stream has to be the
 * first byte in the entity. An input source with neither a character
 * stream nor a byte stream is rejected, because there is no support for
 * public/system identifiers.
 *
 * @param is A new input source to set up.
 * @exception IOException If any IO errors occur.
 * @exception Exception is parser specific exception from panic method.
 */
protected void setinp(InputSource is)
	throws Exception
{
	Reader src = null;
	// Start with an empty character buffer and no reader
	mChIdx = 0;
	mChLen = 0;
	mChars = mInp.chars;
	mInp.src = null;
	if (mPh < PH_DOC_START) {
		mIsSAlone = false;  // default [#2.9]
	}
	mIsSAloneSet = false;

	if (is.getCharacterStream() != null) {
		// Ignore encoding in the xml text decl.
		src = is.getCharacterStream();
		xml(src);
	} else if (is.getByteStream() != null) {
		if (is.getEncoding() == null) {
			// Get encoding from BOM or the xml text decl.
			BufferedInputStream buffered =
				new BufferedInputStream(is.getByteStream());
			src = bom(buffered);
			if (src != null) {
				// Encoding is defined by the BOM.
				xml(src);
			} else {
				// Encoding is defined by the xml text decl.
				ReaderUTF8 utf8 = new ReaderUTF8(buffered.getInputStream());
				String declared = xml(utf8);
				if (declared.startsWith("UTF-16")) {
					panic(FAULT);  // UTF-16 must have BOM [#4.3.3]
				}
				if (declared.equals("UTF-8")) {
					src = utf8;
				} else {
					src = enc(declared, utf8.getByteStream());
				}
			}
		} else {
			// Ignore encoding in the xml text decl.
			String given = is.getEncoding().toUpperCase();
			if (given.equals("UTF-16")) {
				src = bom16(is.getByteStream());  // UTF-16 [#4.3.3]
			} else {
				src = enc(given, is.getByteStream());
			}
			xml(src);
		}
	} else {
		// There is no support for public/system identifiers.
		panic(FAULT);
	}

	mInp.src = src;
	mInp.pubid = is.getPublicId();
	mInp.sysid = is.getSystemId();
}
/**
 * Determines the byte order of a UTF-16 entity.
 *
 * This method requires the Byte Order Mark [#4.3.3] in the first two
 * bytes of the stream: 0xFE 0xFF selects big-endian and 0xFF 0xFE selects
 * little-endian. Anything else is reported through the panic method.
 * Note, the first byte returned by the entity's byte stream has to be the
 * first byte in the entity. Also, there is no support for UCS-4.
 *
 * @param is A byte stream of the entity.
 * @return a reader constructed from the BOM.
 * @exception Exception is parser specific exception from panic method.
 * @exception IOException
 */
private Reader bom16(InputStream is) throws Exception {
	final int lead = is.read();
	if (lead == 0xfe) {
		// UTF-16, big-endian
		if (is.read() != 0xff) {
			panic(FAULT);
		}
		return new ReaderUTF16(is, 'b');
	}
	if (lead == 0xff) {
		// UTF-16, little-endian
		if (is.read() != 0xfe) {
			panic(FAULT);
		}
		return new ReaderUTF16(is, 'l');
	}
	// No byte order mark
	panic(FAULT);
	return null;
}
/**
 * Determines the entity encoding.
 *
 * This method gets encoding from Byte Order Mark [#4.3.3] if any:
 * 0xEF 0xBB 0xBF selects UTF-8, 0xFE 0xFF selects UTF-16 big-endian and
 * 0xFF 0xFE selects UTF-16 little-endian. An incomplete mark is reported
 * through the panic method. An empty stream gets a UTF-8 reader. Any
 * other first byte is put back into the stream and no reader is created.
 * Note, the first byte returned by the entity's byte stream has
 * to be the first byte in the entity. Also, there is no support
 * for UCS-4.
 *
 * @param is A byte stream of the entity.
 * @return a reader constructed from the BOM, a UTF-8 reader for an empty
 *	stream, or <code>null</code> if the stream has no BOM.
 * @exception Exception is parser specific exception from panic method.
 * @exception IOException
 */
private Reader bom(BufferedInputStream is) throws Exception
{
	final int first = is.read();
	if (first == 0xef) {
		// UTF-8
		if (is.read() != 0xbb) {
			panic(FAULT);
		}
		if (is.read() != 0xbf) {
			panic(FAULT);
		}
		return new ReaderUTF8(is);
	}
	if (first == 0xfe) {
		// UTF-16, big-endian
		if (is.read() != 0xff) {
			panic(FAULT);
		}
		return new ReaderUTF16(is, 'b');
	}
	if (first == 0xff) {
		// UTF-16, little-endian
		if (is.read() != 0xfe) {
			panic(FAULT);
		}
		return new ReaderUTF16(is, 'l');
	}
	if (first == -1) {
		// Empty stream
		return new ReaderUTF8(is);
	}
	// Not a BOM: put back the byte for the next reader of the stream
	is.putBack((byte) first);
	return null;
}
/**
* Parses the xml text declaration.
*
* This method gets encoding from the xml text declaration [#4.3.1] if any.
* The method assumes the buffer (mChars) is big enough to accommodate whole
* xml text declaration.
*
* The work is done in two passes. The first pass reads characters from the
* reader into the buffer (mChars) until the end of the declaration or the
* end of the stream is seen, the input turns out not to start with
* "&lt;?xml" followed by a white space, or the buffer is full. Everything
* read by the first pass stays in the buffer for the regular parsing. The
* second pass runs only if the first one has not rejected the input and
* parses the <i>version</i>, <i>encoding</i> and <i>standalone</i> pseudo
* attributes from the buffer.
*
* As a side effect the method sets the version and the encoding of the
* current input (mInp.xmlver, mInp.xmlenc), the standalone flags
* (mIsSAlone, mIsSAloneSet) and may advance the parser phase (mPh) to
* PH_MISC_DTD.
*
* @param reader is entity reader.
* @return The xml text declaration encoding (in upper case) or default
*	UTF-8 encoding.
* @exception Exception is parser specific exception from panic method.
* @exception IOException
*/
private String xml(Reader reader)
throws Exception
{
String str = null;
String enc = "UTF-8";
char ch;
int val;
// State of the recognizers below. A negative value stops the first pass:
// -1 means there is no xml text declaration, -2 means the end of the
// declaration (or the end of the stream inside of it) has been read.
short st;
// Read the xml text declaration into the buffer
if (mChIdx != 0) {
// The buffer already holds a character; according to the original
// comment the bom method has read ONE char into the buffer.
// NOTE(review): the bom method visible in this file puts its byte back
// into the stream instead - confirm this branch is still reachable.
st = (short)((mChars[0] == '<')? 1: -1);
} else {
st = 0;
}
while (st >= 0 && mChIdx < mChars.length) {
// The end of the stream is stored in the buffer as EOS
ch = ((val = reader.read()) >= 0)? (char)val: EOS;
mChars[mChIdx++] = ch;
switch (st) {
case 0: // read '<' of xml declaration
switch (ch) {
case '<':
st = 1;
break;
case 0xfeff: // the byte order mark
// Drop the mark: the next character takes its place in the buffer
ch = ((val = reader.read()) >= 0)? (char)val: EOS;
mChars[mChIdx - 1] = ch;
st = (short)((ch == '<')? 1: -1);
break;
default:
st = -1;
break;
}
break;
case 1: // read '?' of xml declaration [#4.3.1]
st = (short)((ch == '?')? 2: -1);
break;
case 2: // read 'x' of xml declaration [#4.3.1]
st = (short)((ch == 'x')? 3: -1);
break;
case 3: // read 'm' of xml declaration [#4.3.1]
st = (short)((ch == 'm')? 4: -1);
break;
case 4: // read 'l' of xml declaration [#4.3.1]
st = (short)((ch == 'l')? 5: -1);
break;
case 5: // read white space after 'xml'
switch (ch) {
case ' ':
case '\t':
case '\r':
case '\n':
st = 6;
break;
default:
// A longer name, not the xml text declaration
st = -1;
break;
}
break;
case 6: // read content of xml declaration
switch (ch) {
case '?':
st = 7;
break;
case EOS:
st = -2;
break;
default:
break;
}
break;
case 7: // read '>' after '?' of xml declaration
switch (ch) {
case '>':
case EOS:
st = -2;
break;
default:
// A lone '?': continue with the content of the declaration
st = 6;
break;
}
break;
default:
panic(FAULT);
break;
}
}
// Make everything read so far available to the getch method
mChLen = mChIdx;
mChIdx = 0;
// If there is no xml text declaration, the encoding is default.
if (st == -1) {
return enc;
}
mChIdx = 5; // the first white space after "<?xml"
// Parse the xml text declaration
// The states 1, 2 and 3 share the code and only enforce the order of
// the pseudo attributes: version, then encoding, then standalone.
for (st = 0; st >= 0;) {
ch = getch();
switch (st) {
case 0: // skip spaces after the xml declaration name
if (chtyp(ch) != ' ') {
bkch();
st = 1;
}
break;
case 1: // read xml declaration version
case 2: // read xml declaration encoding or standalone
case 3: // read xml declaration standalone
switch (chtyp(ch)) {
case 'a':
case 'A':
case '_':
// Start of a pseudo attribute name
bkch();
str = name(false).toLowerCase();
if ("version".equals(str) == true) {
// The version must come first and must be "1.0"
if (st != 1)
panic(FAULT);
if ("1.0".equals(eqstr('=')) != true)
panic(FAULT);
mInp.xmlver = 0x0100;
st = 2;
} else if ("encoding".equals(str) == true) {
// The encoding must directly follow the version
if (st != 2)
panic(FAULT);
mInp.xmlenc = eqstr('=').toUpperCase();
enc = mInp.xmlenc;
st = 3;
} else if ("standalone".equals(str) == true) {
// Not allowed before the version or once the parser phase has
// reached PH_DOC_START
if ((st == 1) || (mPh >= PH_DOC_START)) // [#4.3.1]
panic(FAULT);
str = eqstr('=').toLowerCase();
// Check the 'standalone' value and use it [#5.1]
if (str.equals("yes") == true) {
mIsSAlone = true;
} else if (str.equals("no") == true) {
mIsSAlone = false;
} else {
panic(FAULT);
}
mIsSAloneSet = true;
st = 4;
} else {
panic(FAULT);
}
break;
case ' ':
break;
case '?':
// The declaration may not end before the version is read
if (st == 1)
panic(FAULT);
bkch();
st = 4;
break;
default:
panic(FAULT);
}
break;
case 4: // end of xml declaration
switch (chtyp(ch)) {
case '?':
if (getch() != '>')
panic(FAULT);
if (mPh <= PH_DOC_START)
mPh = PH_MISC_DTD; // misc before DTD
st = -1;
break;
case ' ':
break;
default:
panic(FAULT);
}
break;
default:
panic(FAULT);
}
}
return enc;
}
/**
 * Sets up the document reader.
 *
 * The names "UTF-8", "UTF-16LE" and "UTF-16BE" are served by the
 * parser's own readers; any other name is passed to
 * {@link java.io.InputStreamReader}. The name is compared as is, so the
 * caller is expected to provide it in upper case.
 *
 * @param name an encoding name.
 * @param is the document byte input stream.
 * @return a reader constructed from encoding name and input stream.
 * @exception UnsupportedEncodingException
 */
private Reader enc(String name, InputStream is)
	throws UnsupportedEncodingException
{
	// DO NOT CLOSE current reader if any!
	final Reader result;
	if (name.equals("UTF-8")) {
		result = new ReaderUTF8(is);
	} else if (name.equals("UTF-16LE")) {
		result = new ReaderUTF16(is, 'l');
	} else if (name.equals("UTF-16BE")) {
		result = new ReaderUTF16(is, 'b');
	} else {
		result = new InputStreamReader(is, name);
	}
	return result;
}
/**
 * Sets up current input on the top of the input stack.
 *
 * The read position of the input which is current so far is saved in
 * that input, the new input is linked in front of it and the character
 * buffer, its length and its read position are taken from the new input.
 *
 * @param inp A new input to set up.
 */
protected void push(Input inp)
{
	// Remember where the reading of the suspended input has stopped
	final Input suspended = mInp;
	suspended.chLen = mChLen;
	suspended.chIdx = mChIdx;
	// Link the new input in front of the suspended one
	inp.next = suspended;
	mInp = inp;
	// Continue reading from the new input
	mChars = mInp.chars;
	mChLen = mInp.chLen;
	mChIdx = mInp.chIdx;
}
/**
 * Restores previous input on the top of the input stack.
 *
 * The reader of the current input, if any, is closed (a failure to close
 * it is ignored) and the character buffer, its length and its read
 * position are restored from the previous input. If there is no previous
 * input the buffer is dropped and both values are reset to zero.
 */
protected void pop()
{
	if (mInp.src != null) {
		try {
			mInp.src.close();
		} catch (IOException ignored) {
			// Closing is best effort; the input is discarded anyway
		}
		mInp.src = null;
	}

	mInp = mInp.next;
	if (mInp == null) {
		// The input stack is empty
		mChars = null;
		mChLen = 0;
		mChIdx = 0;
		return;
	}
	mChars = mInp.chars;
	mChLen = mInp.chLen;
	mChIdx = mInp.chIdx;
}
/**
 * Maps a character to its type.
 *
 * Characters below 0x80 are mapped through the <code>asctyp</code>
 * table. Possible character type values are:<br />
 * - ' ' for any kind of white space character;<br />
 * - 'a' for any lower case alphabetical character value;<br />
 * - 'A' for any upper case alphabetical character value;<br />
 * - 'd' for any decimal digit character value;<br />
 * - 'z' for any character less than ' ' except
 *   '\t', '\n', '\r';<br />
 * - 'X' for any not ASCII character;<br />
 * - 'Z' for EOS character.<br />
 * An ASCII (7 bit) character which does not fall in any category listed
 * above is mapped to itself.
 *
 * @param ch The character to map.
 * @return The type of character.
 */
protected char chtyp(char ch)
{
	if (ch >= 0x80) {
		// Not ASCII: either the end of stream marker or any other character
		if (ch == EOS) {
			return 'Z';
		}
		return 'X';
	}
	return (char) asctyp[ch];
}
/**
 * Retrieves the next character in the document.
 *
 * Characters are served from the character buffer (mChars). When the
 * buffer is exhausted:<br />
 * - an input without a reader (an internal entity) is removed from the
 *   input stack and reading continues with the previous input;<br />
 * - otherwise the buffer is refilled from the reader of the current
 *   input. At the end of the stream an input other than the document
 *   input is removed from the stack and reading continues with the
 *   previous input, while the document input yields EOS.<br />
 * A reader which returns zero characters is asked again, so that a stale
 * character of the buffer is never returned as input.
 *
 * @return The next character in the document.
 * @exception IOException If any IO errors occur.
 */
protected char getch()
	throws IOException
{
	if (mChIdx >= mChLen) {
		if (mInp.src == null) {
			pop();  // remove internal entity
			return getch();
		}
		// Read new portion of the document characters. An empty read
		// carries no information and must not be served as data.
		int num;
		do {
			num = mInp.src.read(mChars, 0, mChars.length);
		} while (num == 0);
		if (num < 0) {
			if (mInp != mDoc) {
				pop();  // restore the previous input
				return getch();
			}
			// End of the document
			mChars[0] = EOS;
			mChLen = 1;
		} else {
			mChLen = num;
		}
		mChIdx = 0;
	}
	return mChars[mChIdx++];
}
/**
 * Puts back the last read character.
 *
 * The read position of the character buffer is moved one character
 * back. If the position is already at the beginning of the buffer the
 * problem is reported through the panic method.
 *
 * This method <strong>MUST NOT</strong> be called more than once after
 * each call of {@link #getch getch} method.
 *
 * @exception Exception is parser specific exception from panic method.
 */
protected void bkch()
	throws Exception
{
	if (mChIdx < 1) {
		panic(FAULT);
	}
	mChIdx -= 1;
}
/**
 * Sets the current character.
 *
 * The character at the current read position of the character buffer is
 * overwritten; the read position itself is not changed.
 *
 * @param ch The character to set.
 */
protected void setch(char ch)
{
	final char[] chars = mChars;
	chars[mChIdx] = ch;
}
/**
 * Finds a pair in the pair chain by a qualified name.
 *
 * The chain is walked from the first element and the characters of each
 * pair are compared with the qualified name; the first pair with the
 * same length and the same characters wins.
 *
 * @param chain The first element of the chain of pairs.
 * @param qname The qualified name.
 * @return A pair with the specified qualified name or null.
 */
protected Pair find(Pair chain, char[] qname)
{
	Pair candidate = chain;
	while (candidate != null) {
		if (candidate.chars.length == qname.length) {
			// Count the leading characters which are the same
			int same = 0;
			while ((same < qname.length) &&
				(candidate.chars[same] == qname[same])) {
				same++;
			}
			if (same == qname.length) {
				return candidate;
			}
		}
		candidate = candidate.next;
	}
	return null;
}
/**
 * Provides an instance of a pair.
 *
 * A pair is taken from the list of deleted pairs (mDltd) if there is
 * one; otherwise a new pair is created.
 *
 * @param next The reference to a next pair.
 * @return An instance of a pair.
 */
protected Pair pair(Pair next)
{
	Pair result = mDltd;
	if (result == null) {
		result = new Pair();
	} else {
		// Reuse the first deleted pair
		mDltd = result.next;
	}
	result.next = next;
	return result;
}
/**
 * Deletes an instance of a pair.
 *
 * All references held by the pair are cleared and the pair is put in
 * front of the list of deleted pairs (mDltd), where the
 * {@link #pair pair} method picks it up again.
 *
 * @param pair The pair to delete.
 * @return A reference to the next pair in a chain.
 */
protected Pair del(Pair pair)
{
	final Pair following = pair.next;
	// Drop everything the pair refers to
	pair.chars = null;
	pair.list = null;
	pair.value = null;
	pair.ns = null;
	pair.name = null;
	// Keep the pair for reuse
	pair.next = mDltd;
	mDltd = pair;
	return following;
}
/**
 * Reports UCS-4 (UTF-32) character encoded with UCS-2 (UTF-16).
 *
 * The character is appended to the parser's buffer as two UTF-16 code
 * units (0xD800 based and 0xDC00 based) and the buffer is then flushed
 * with the {@link #bflash bflash} method. The range of the argument is
 * not checked here.
 *
 * @param ucs4 Unicode character code (#x10000-#x10FFFF)
 * @exception Exception declared by the bflash method.
 */
private void reportUCS4(int ucs4) throws Exception
{
	final int offset = ucs4 - 0x10000;
	final char high = (char) (0xD800 | (offset >> 10));
	final char low = (char) (0xDC00 | (offset & 0x3FF));
	bappend(high);
	bappend(low);
	bflash();
}
/**
 * A byte stream wrapper which allows to put back ONE byte.
 *
 * The byte which has been put back is returned by the next read, before
 * any byte of the wrapped stream. The byte is kept as an unsigned value
 * (0 - 255), so it can not be confused with the empty marker or with the
 * end of stream value.
 */
private static class BufferedInputStream extends InputStream {
	/** The value of the put back slot when it holds no byte. */
	private static final int NONE = -1;

	final private InputStream is;
	/** The byte put back as an unsigned value, or NONE. */
	private int lastChar = NONE;

	/**
	 * Wraps a byte stream.
	 *
	 * @param is The byte stream to wrap.
	 */
	public BufferedInputStream( InputStream is ){
		this.is = is;
	}

	/**
	 * Puts back one byte, replacing a byte put back earlier if any.
	 *
	 * @param val The byte to return by the next read.
	 */
	public void putBack(byte val) {
		// Keep the byte unsigned: 0xFF must not turn into the NONE marker
		lastChar = val & 0xff;
	}

	/**
	 * Provides the stream to continue reading with.
	 *
	 * @return the wrapped stream if no byte is pending, otherwise this
	 *	stream.
	 */
	public InputStream getInputStream() {
		if( lastChar == NONE )
			return is;
		return this;
	}

	/**
	 * Reads the pending byte if any, otherwise a byte of the wrapped
	 * stream.
	 *
	 * @return the byte as a value in the range 0 - 255, or -1 at the end
	 *	of the stream.
	 * @exception IOException If any IO errors occur.
	 */
	public int read() throws IOException {
		if( lastChar == NONE ) return is.read();
		int v = lastChar;  // already unsigned
		lastChar = NONE;
		return v;
	}

	/**
	 * Reads bytes into an array, starting with the pending byte if any.
	 *
	 * @param b The destination array.
	 * @param off The start offset in the array.
	 * @param len The maximum number of bytes to read.
	 * @return the number of bytes read, or -1 if there is no pending byte
	 *	and the wrapped stream is at its end.
	 * @exception IOException If any IO errors occur.
	 */
	public int read(byte[] b, int off, int len) throws IOException {
		if( len == 0 ) return 0;
		if( lastChar == NONE )
			return is.read( b, off, len );
		b[off] = (byte)lastChar;
		lastChar = NONE;
		if( len == 1 ) return 1;
		// The end of the wrapped stream must not cancel the pending byte
		int more = is.read( b, off + 1, len - 1 );
		return (more > 0)? 1 + more: 1;
	}
}
}