/********************************************************************************* * TotalCross Software Development Kit * * Copyright (C) 2003-2004 Pierre G. Richard * * Copyright (C) 2003-2012 SuperWaba Ltda. * * All Rights Reserved * * * * This library and virtual machine is distributed in the hope that it will * * be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * * * * This file is covered by the GNU LESSER GENERAL PUBLIC LICENSE VERSION 3.0 * * A copy of this license is located in file license.txt at the root of this * * SDK or can be downloaded here: * * http://www.gnu.org/licenses/lgpl-3.0.txt * * * *********************************************************************************/ package totalcross.xml; import totalcross.io.Stream; import totalcross.sys.*; /** * * A Tokenizer for XML input. In non-strict mode (default), it recognizes * HTML constructs as well, <i>e.g.:</i> unquoted attributes value, * unterminated references, etc. * <P> * Four "tokenize" methods are provided: one takes a byte[] array; another * takes a byte[] array with offset and count; another one for an HTML document which is embedded within an HTTP stream; * and the last takes a (byte) Stream. * <P> * Tokenization events are reported via overridable methods: * <UL> * <LI>foundStartOfInput * <LI>foundStartTagName * <LI>foundEndTagName * <LI>foundEndEmptyTag * <LI>foundCharacterData * <LI>foundCharacter * <LI>foundAttributeName * <LI>foundAttributeValue * <LI>foundComment * <LI>foundProcessingInstruction * <LI>foundDeclaration * <LI>foundReference * <LI>foundEndOfInput * </UL> * </P> * <P> * Some of these methods pass the parameters pertinent to the kind of * tokenized events: tag name, attribute name and value...  These values * are only valid for the time the event is reported.  Never assume * that, after returning from a "foundXxx" method, the information that was * reported is still available! Persistent values are however provided * through the "getAbsoluteOffset()" method, which returns the absolute * offset of the current parameters of the foundXxxx method. * </P> * * <P> * <U>Typical invocation</U> * </P> * * <PRE> * class XmlTokenizerTest * { * static class MyXmlTokenizer extends XmlTokenizer * { * public void foundStartOfInput(byte buffer[], int offset, int count) * { * Vm.debug("Start: " + new String(buffer, offset, count)); * } * * public void foundStartTagName(byte buffer[], int offset, int count) * { * Vm.debug("StartTagName: " + new String(buffer, offset, count)); * } * * public void foundEndTagName(byte buffer[], int offset, int count) * { * Vm.debug("EndTagName: " + new String(buffer, offset, count)); * } * * public void foundEndEmptyTag() * { * Vm.debug("EndEmptyTag"); * } * * public void foundCharacterData(byte buffer[], int offset, int count) * { * Vm.debug("Content: " + new String(buffer, offset, count)); * } * * public void foundCharacter(char charFound) * { * Vm.debug("Content Ref |" + charFound + '|'); * } * * public void foundAttributeName(byte buffer[], int offset, int count) * { * Vm.debug("AttributeName: " + new String(buffer, offset, count)); * } * * public void foundAttributeValue(byte buffer[], int offset, int count, byte dlm) * { * Vm.debug("AttributeValue: " + new String(buffer, offset, count)); * } * * public void foundEndOfInput(int count) * { * Vm.debug("Ended: " + count + " bytes parsed."); * } * } * * public static void testMe() * { * String input = "<p>Hello<i>World!</i></p>"; * MyXmlTokenizer xtk = new MyXmlTokenizer(); * try * { * xtk.tokenize(input.getBytes()); * } * catch (SyntaxException ex) * { * Vm.debug(ex.getMessage()); * } * } * } * </PRE> * * <P> * <U>Note:</U> A Tokenizer is not a Parser.  The correctness of the * tag structure (stack) is not examined. <BR/> Ex: the dangling markup * "<foo><bar>opop</foo>" is syntactically valid. <BR/> As * a result, a Tokenizer can work on document fragments. */ public class XmlTokenizer { private int ofsStart; private int ofsCur; private int ofsEnd; private int readPos; private int state; private int substate; private byte[] endTagToSkipTo; private int ixEndTagToSkipTo; private byte quote; private boolean strictlyXml; private boolean resolveCharRef; // XML Predefined Named References private static final byte chrRef[][] = { {(byte) '<', (byte) 'l', (byte) 't'}, {(byte) '>', (byte) 'g', (byte) 't'}, {(byte) '&', (byte) 'a', (byte) 'm', (byte) 'p'}, {(byte) '\'', (byte) 'a', (byte) 'p', (byte) 'o', (byte) 's'}, {(byte) '"', (byte) 'q', (byte) 'u', (byte) 'o', (byte) 't'} }; // Was class XmlByteType. Moved here for optim and footprint. // (no one but the Tokenizer is supposed to use this class!) private static final byte is[] = new byte[256]; private static final byte ISNAMESTART = 1 << 0; private static final byte ISNAMEFOLLOWER = 1 << 1; private static final byte ISSPACE = 1 << 2; private static final byte ISQUOTE = 1 << 3; private static final byte ISCONTENTDLM = 1 << 4; private static final byte ISENDTAGDLM = 1 << 5; private static final byte ISENDREFERENCE = 1 << 6; static { byte isNameStartOrFollower = (byte) (ISNAMESTART | ISNAMEFOLLOWER); Convert.fill(is, 'a', 'z'+1, isNameStartOrFollower); Convert.fill(is, 'A', 'Z'+1, isNameStartOrFollower); Convert.fill(is, '0', '9'+1, ISNAMEFOLLOWER); is['_'] = isNameStartOrFollower; is[':'] = isNameStartOrFollower; is['-'] = ISNAMEFOLLOWER; is['.'] = ISNAMEFOLLOWER; is[' '] = ISSPACE; is['\r'] = ISSPACE; is['\n'] = ISSPACE; is['\t'] = ISSPACE; is['\f'] = ISSPACE; is['\''] = ISQUOTE; is['\"'] = ISQUOTE; is['>'] = ISENDTAGDLM; is['<'] = ISCONTENTDLM; is['&'] = ISCONTENTDLM; is[';'] = ISENDREFERENCE; } protected XmlTokenizer() { resolveCharRef = true; } /** * Tokenize an array of bytes. * * @param input * byte array to tokenize * @param offset * position of the first byte in the array * @param count * number of bytes to tokenize * @exception SyntaxException */ public final void tokenize(byte input[], int offset, int count) throws SyntaxException { ofsStart = 0; ofsCur = offset; ofsEnd = count; readPos = offset; state = 0; foundStartOfInput(input, offset, count); tokenizeBytes(input); endTokenize(input); } /** * Tokenize an array of bytes. * * @param input * byte array to tokenize * @exception SyntaxException */ public final void tokenize(byte input[]) throws SyntaxException { tokenize(input, 0, input.length); } /** * Tokenize a stream * * @param input * stream to tokenize * @exception SyntaxException * @throws totalcross.io.IOException */ public final void tokenize(Stream input) throws SyntaxException, totalcross.io.IOException { byte buffer[] = new byte[1024]; tokenize(input, buffer, 0, input.readBytes(buffer, 0, buffer.length), 0); } /** * Tokenize an already buffered Stream. * <P> * Versus the general method above, this tokenize method requires more * arguments. It should be used when the HTML document is embedded within * an HTTP stream. * * @param input * stream to tokenize * @param buffer * buffer already filled with bytes read from the input stream * @param start * starting position in the buffer * @param end * ending position in the buffer * @param pos * read position of the byte at offset 0 in the buffer * @exception SyntaxException * @throws totalcross.io.IOException */ public final void tokenize(Stream input, byte[] buffer, int start, int end, int pos) throws SyntaxException, totalcross.io.IOException { ofsStart = start; ofsCur = start; ofsEnd = end; readPos = pos; state = 0; foundStartOfInput(buffer, 0, ofsEnd); while (ofsCur < ofsEnd) { tokenizeBytes(buffer); // returns when ofsCur == ofsEnd if (ofsEnd == buffer.length) { // no more room if (ofsStart > 0) { // tidy is still possible Vm.arrayCopy(buffer, ofsStart, buffer, 0, ofsEnd - ofsStart); readPos += ofsStart; ofsCur -= ofsStart; ofsStart = 0; } else if (((state == 10) || (state == 22)) && (ofsCur > 0)) { // "Data" mode: flush foundCharacterData(buffer, 0, ofsCur); Vm.arrayCopy(buffer, 0, buffer, 0, ofsEnd - ofsCur); readPos += ofsCur; ofsCur = ofsStart = 0; } else { // nothing else to do than to extend byte oldBuffer[] = buffer; int newSize = oldBuffer.length * 15 / 10; // guich@510_17: instead of double, grow 50%... buffer = new byte[newSize]; Vm.arrayCopy(oldBuffer, 0, buffer, 0, ofsEnd); } } if (ofsCur >= ofsEnd) // we already reached the end, there's no need to keep trying any further break; ofsEnd = ofsCur + input.readBytes(buffer, ofsCur, buffer.length - ofsCur); } endTokenize(buffer); } /** * Resolve a numeric or named character reference. See <a * href=http://www.w3.org/TR/REC-xml#sec-predefined-ent>XML Predefined * Entities</a> * * @param input * byte array which describes the reference * @param offset * position of the first byte in the array * @param count * number of bytes of the reference * @return the resulting character, or '\uffff' (not a unicode * character) if the conversion could not be done */ public static final char resolveCharacterReference(byte input[], int offset, int count) { if ((count > 1) && (input[offset] == '#')) { if ((input[++offset] == 'x') || (input[offset] == 'X')) { return hex2char(input, offset + 1, count - 2); } else { return dec2char(input, offset, count - 1); } } else { return ref2char(input, offset, count); } } /** * Get the absolute offset of the data parameters of the currently * reported event. * * @return the absolute offset of the data parameters of the currently * reported event. */ public final int getAbsoluteOffset() { return ofsStart + readPos; } /** * Declare the input to be CDATA, until the end tag of the element * <code>tagName</code> is found. * <P> * This settings permits to handle character data.  For example, when * the <Script> tag is reported the derived class call this method: * <code>skipToEndOf("SCRIPT");</code> before to return.  From this * point, all input is reported as data until <code></SCRIPT></code>is * found. * <P> * <U>Note:</U> The Tokenizer is a low level class and does not register * the tag name. Therefore, this method must be called at each time the * caller wants to suprress markup recognition until the end tag is * found.  * * @param input * byte array containing the name of the element the end tag of * which ends the character data * @param offset * position of the first character in the array * @param count * number of relevant bytes */ protected final void setCdataContents(byte input[], int offset, int count) { endTagToSkipTo = new byte[count]; for (int i = 0; i < count; ++i) { byte b = input[offset + i]; if ('a' <= b) b -= ('a' - 'A'); // fast toUpper endTagToSkipTo[i] = b; } } /** * Tell if the data which is currently reported by foundCharacterData is * <code>CDATA</code> versus <code>PCDATA</code>. * <P> * In ISO 8879 (SGML) terminology, <code>CDATA</code> describes * "non displayable" data, as, for instance, data that is the * contents of a <code>SCRIPT</code> element.  It differs from * "regular data" as, for instance, data that is the contents of * a <code>P</code> element is named <code>PCDATA</code> (Parsed * Character Data) */ public final boolean isDataCDATA() { return (endTagToSkipTo != null); } /** * Set or unset the strict XML mode of the parser. * <P> * By default, the parser will allow most commonly used HTML constructs. * * @param toSet * if true, set the strict XML mode; if false, allows HTML * constructs. */ public final void setStrictlyXml(boolean toSet) { strictlyXml = toSet; } /** * Turn off or on the automatic resolution of references. * <P> * References are normally solved, and reported via * {@link XmlTokenizer#foundCharacter(char)}.  When automatic * resolution is turned off, * {@link XmlTokenizer#foundReference(byte[],int,int)} is called * instead.  By default, automatic resolution of references is <u>on</u>, * and {@link XmlTokenizer#foundReference(byte[],int,int)} is not called. * <P> * This option should be set before starting the tokenization.  See * {@link XmlTokenizer#foundReference(byte[],int,int)} for more details. * * @param disable * boolean: if <code>true</code> automatic resolution of * references is turned off, otherwise, it is turned on. */ public final void disableReferenceResolution(boolean disable) { resolveCharRef = !disable; } /** * Method called before to start tokenizing. * <P> * Derived class may override this method, for doing whatever appropriate * housekeeping (sniffing at the encoding, etc.) * * @param input * byte array containing the first bytes of the input about to * be tokenized * @param offset * position of the first byte to be tokenized * @param count * number of bytes to be tokenized */ protected void foundStartOfInput(byte input[], int offset, int count) { } /** * Method called when a start-tag has been found. * <P> * Derived class may override this method. * * @param input * byte array containing the name of the tag that started * @param offset * position of the first character of the tag name in the array * @param count * number of bytes the tag name is made of */ protected void foundStartTagName(byte input[], int offset, int count) { } /** * Method called when an end-tag has been found. * <P> * Derived class may override this method. * * @param input * byte array containing the name of the tag that ended * @param offset * position of the first character of the tag name in the array * @param count * number of bytes the tag name is made of */ protected void foundEndTagName(byte input[], int offset, int count) { } /** * Method called when an empty-tag has been found. * <P> * This method is called just after all events related to the starting tag * have been reported. The implied tag name is the one of the starting tag (<i>e.g.:</i> * the most recently reported start tag.) * <P> * Derived class may override this method. * <P> Example: * <PRE> * * <FOO A=B> generates: * - foundStartTagName("FOO"); * - foundAttributeName("A"); * - foundAttributeValue("B"); * - foundEndEmptyTag(); * </PRE> * */ protected void foundEndEmptyTag() { } /** * Method called when a character data content has been found. * <P> * Derived class may override this method. * * @param input * byte array containing the character data that was found * @param offset * position of the first character data in the array * @param count * number of bytes the character data content is made of */ protected void foundCharacterData(byte input[], int offset, int count) { } /** * Method called when a character has been found in the contents, which is resulting from a character reference resolution. * <P> * Derived class may override this method. * * @param charFound * resolved character - if the character is invalid, this value * is set to '\uffff', which is not a unicode character. * @see XmlTokenizer#foundReference(byte[],int,int) */ protected void foundCharacter(char charFound) { } /** * Method called when an attribute name has been found. * <P> * Derived class may override this method. * * @param input * byte array containing the attribute name * @param offset * position of the first character of the attribute name in the * array * @param count * number of bytes the attribute name is made of */ protected void foundAttributeName(byte input[], int offset, int count) { } /** * Method called when an attribute value has been found. * <P> * Derived class may override this method. * * @param input * byte array containing the attribute value * @param offset * position of the first character of the attribute value in the * array * @param count * number of bytes the attribute value is made of * @param dlm * delimiter that started the attribute value (' or "). '\0' if * none */ protected void foundAttributeValue(byte input[], int offset, int count, byte dlm) { } /** * Method called when a comment has been found. * <P> * Derived class may override this method. * * @param input * byte array containing the comment (without the * <CODE><B><!--</CODE></B> and <CODE><B>--></CODE></B> * delimiters) * @param offset * position of the first character of the comment in the array * @param count * number of bytes the comment is made of */ protected void foundComment(byte input[], int offset, int count) { } /** * Method called when a processing instruction has been found. * <P> * Derived class may override this method. * * @param input * byte array containing the processing instruction (without the * <CODE><B><?</CODE></B> and <CODE><B>?></CODE></B> * delimiters) * @param offset * position of the first character of the processing instruction * in the array * @param count * number of bytes the processing instruction is made of */ protected void foundProcessingInstruction(byte input[], int offset, int count) { } /** * Method called when a declaration has been found. * <P> * Derived class may override this method. * * @param input * byte array containing the declaration (without the * <CODE><B><!</CODE></B> and <CODE><B>></CODE></B> * delimiters) * @param offset * position of the first character of the declaration in the * array * @param count * number of bytes the declaration is made of */ protected void foundDeclaration(byte input[], int offset, int count) { } /** * Method called when a reference been found in content. * <P> * It can be either a named or numeric character reference, or an entity * reference.  Given the several syntaxes of reference, no * verification is made <i>a priori</i> on the validity of the "name" of * the reference. * <P> * For conveniency, a static method: * {@link XmlTokenizer#resolveCharacterReference(byte[],int,int)} allows * to convert the character reference into its UCS-2 encoded value. * </P> * <TABLE cellpadding=0 cellspacing=0> * <TR VALIGN=top> * <TD><B>Note:</B></TD> * <TD> </TD> * <TD> * <TD><code>foundReference</code> is called only if * {@link XmlTokenizer#disableReferenceResolution(boolean disable)} has * been called first, with <code>disable</code> set to <code>true</code>.  * If not, then <code>foundReference</code> is <U>never called</U>, and * {@link XmlTokenizer#foundCharacter(char)} is called instead.  This * design permits to easily handle simple XML documents — only * predefined named character entities, and numeric character entities * — and documents which have user-defined internal/external * entities.  This is explained below. </TD> * </TR> * </TABLE> * <P> * When working with a set of externally defined entities, issue * <code>disableReferenceResolution(true)</code> to turn off automatic * reference resolution. Then, your code in <code>foundReference</code> * could make a quick check to see if the found reference is * numeric.  If it is numeric — it starts with a <code>#</code> * character — call <i>resolveCharacterReference</i>; if it is not * a numeric reference, checks if the reference belongs to the known list * of defined entities for the parsed document.  If it does, do the * substitution; if not, call <i>resolveCharacterReference</i>, because * it could be one of the <a * href=http://www.w3.org/TR/REC-xml#sec-predefined-ent> XML Predefined * Entities</a> * <P> * By default, each character reference is naturally reported via * {@link XmlTokenizer#foundCharacter(char)}, which, again, <u>supersedes</u> * the <code>foundReference</code> notification. * * <P> * Derived class may override this method. * * @param input * byte array containing the reference name * @param offset * position of the first character of the reference name in the * array * @param count * number of bytes the reference name is made of * @see XmlTokenizer#setStrictlyXml(boolean toSet) */ protected void foundReference(byte input[], int offset, int count) { } /** * Method called when invalid data was found. This is often due to a bad * tag syntax. * <P> * Derived class may override this method. * * @param input * byte array containing the invalid data * @param offset * position of the first character of the invalid data in the * array * @param count * number of bytes the invalidData is made of */ protected void foundInvalidData(byte input[], int offset, int count) { } /** * Method called when the end of the input was found, and the tokenization is * about to end. * <P> * Derived class may override this method. * * @param count * number of bytes parsed */ protected void foundEndOfInput(int count) { } /** * Private method to tokenize a bunch of bytes. It returns when no bytes * are available, but can be resumed again with more bytes to parse * * @param input * byte array to parse * @exception SyntaxException */ private void tokenizeBytes(byte input[]) throws SyntaxException { while (ofsCur < ofsEnd) { int ch = (int) input[ofsCur] & 0xFF; switch (state) { case 0: ofsStart = ofsCur; if (endTagToSkipTo != null) { state = 22; continue; // same ofsCur!!! it can start </script> } else if (ch == '<') { state = 1; } else if (ch == '&') { state = 11; } else { state = 10; } break; case 1: if ((is[ch] & ISNAMESTART) != 0) { state = 2; } else if (ch == '/') { state = 12; } else if (ch == '!') { state = 16; } else if (ch == '?') { state = 20; substate = 0; // so we wait for "?>" } else if (!strictlyXml) { state = 10; // recovery: process "<$xxx" as data } else { endTokenize(input); // strictly XML: give up } break; case 2: while ((is[ch] & ISNAMEFOLLOWER) != 0) { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } if (ch == '>') { state = 0; } else if (ch == '/') { state = 9; } else if ((is[ch] & ISSPACE) != 0) { state = 3; } else if (!strictlyXml) { // <ABC$xxx state = 10; // recovery: process "<ABC$xxx" as data break; } else { endTokenize(input); // strictly XML: give up } foundStartTagName(input, ofsStart + 1, ofsCur - ofsStart - 1); break; case 3: while ((is[ch] & ISSPACE) != 0) { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } if (ch == '>') { state = 0; } else if (ch == '/') { state = 9; } else if ((is[ch] & ISNAMESTART) != 0) { ofsStart = ofsCur; state = 4; } else { state = 21; // possible recovery: skip to TAGC } break; case 4: while ((is[ch] & ISNAMEFOLLOWER) != 0) { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } if ((is[ch] & ISSPACE) != 0) { state = 5; } else if (ch == '=') { state = 6; } else if (!strictlyXml && (ch == '>')) { state = 0; // <list compact> allowed in HTML } else { state = 21; // possible recovery: skip to TAGC break; } foundAttributeName(input, ofsStart, ofsCur - ofsStart); break; case 5: while ((is[ch] & ISSPACE) != 0) { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } if (ch == '=') { state = 6; } else if (!strictlyXml) { if (ch == '>') { state = 0; // <list compact > allowed in HTML } else if ((is[ch] & ISNAMESTART) != 0) { ofsStart = ofsCur; state = 4; // <list compact simple> allowed in HTML } else { state = 21; // possible recovery: skip to TAGC } } else { state = 21; // possible recovery: skip to TAGC } break; case 6: while ((is[ch] & ISSPACE) != 0) { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } if ((is[ch] & ISQUOTE) != 0) { quote = (byte) ch; ofsStart = ofsCur; state = 7; } else if (!strictlyXml) { if (ch == '>') { state = 0; } else { ofsStart = ofsCur; state = 15; } } else { endTokenize(input); // strictly XML: give up } break; case 7: while (ch != quote) { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } ++ofsStart; foundAttributeValue(input, ofsStart, ofsCur - ofsStart, quote); state = 8; break; case 8: if (ch == '>') { state = 0; } else if (ch == '/') { state = 9; } else if ((is[ch] & ISSPACE) != 0) { state = 3; } else if ((is[ch] & ISNAMESTART) != 0) { ofsStart = ofsCur; state = 4; } else { state = 21; // possible recovery: skip to TAGC } break; case 9: if (ch != '>') { state = 21; // possible recovery: skip to TAGC } else { foundEndEmptyTag(); state = 0; } break; case 10: while ((is[ch] & ISCONTENTDLM) == 0) { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } if (ofsCur > ofsStart) { foundCharacterData(input, ofsStart, ofsCur - ofsStart); } ofsStart = ofsCur; if (ch == '<') { state = 1; } else { state = 11; } break; case 11: while ((is[ch] & (ISCONTENTDLM | ISSPACE | ISENDREFERENCE)) == 0) { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } tellReference(input, ofsStart + 1, ofsCur - ofsStart - 1); if (ch == ';') { ofsStart = ofsCur + 1; // data starts at next byte state = 10; } else if (!strictlyXml) { ofsStart = ofsCur; if (ch == '<') { state = 1; } else if (ch != '&') { // spaces (else '&' again, stay here) state = 10; } } else { endTokenize(input); // strictly XML: give up } break; case 12: if ((is[ch] & ISNAMESTART) != 0) { state = 13; } else if (!strictlyXml) { state = 10; // recovery: process "</$xxx" as data } else { endTokenize(input); // strictly XML: give up } break; case 13: while ((is[ch] & ISNAMEFOLLOWER) != 0) { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } if (ch == '>') { state = 0; } else if ((is[ch] & ISSPACE) != 0) { state = 14; } else if (!strictlyXml) { state = 10; // recovery: process "</xxx$" as data break; } else { endTokenize(input); // strictly XML: give up } foundEndTagName(input, ofsStart + 2, ofsCur - ofsStart - 2); break; case 14: while ((is[ch] & ISSPACE) != 0) { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } if (ch == '>') { state = 0; } else { state = 21; // possible recovery: skip to TAGC } break; case 15: // !strictlyXml while ((is[ch] & (ISSPACE | ISENDTAGDLM)) == 0) { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } foundAttributeValue(input, ofsStart, ofsCur - ofsStart, (byte) 0); if (ch == '>') { state = 0; } else { state = 3; } break; case 16: ofsStart = ofsCur; if (ch == '-') { state = 18; } else { state = 17; } break; case 17: while (ch != '>') { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } foundDeclaration(input, ofsStart, ofsCur - ofsStart); state = 0; break; case 18: if (ch == '-') { ofsStart = ofsCur; state = 19; substate = 0; // so we wait for "-->" } else { // keep ofsStart unchanged! state = 17; } break; case 19: switch (substate) { case 0: while (ch != '-') { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } substate = 1; break; case 1: // '-' found if (ch != '-') { substate = 0; } else { substate = 2; } break; case 2: // '-'('-')+ found while (ch == '-') { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } if (ch == '>') { foundComment(input, ofsStart + 1, ofsCur - ofsStart - 3); ofsStart = ofsCur; state = 0; } else if (ch == '!') { substate = 3; } else { substate = 0; } break; case 3: if (ch == '>') { foundComment(input, ofsStart + 1, ofsCur - ofsStart - 4); ofsStart = ofsCur; state = 0; } else { substate = 0; } break; } break; case 20: switch (substate) { case 0: while (ch != '?') { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } substate = 1; break; case 1: if (ch == '>') { foundProcessingInstruction(input, ofsStart + 2, ofsCur - ofsStart - 3); ofsStart = ofsCur; state = 0; } else { substate = 0; } break; } break; case 21: // Skip to TAGC if (strictlyXml) { endTokenize(input); // strictly XML: give up } else { ofsStart = ofsCur; while (ch != '>') { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } foundInvalidData(input, ofsStart, ofsCur - ofsStart); state = 0; } break; case 22: // skip to end tag (SCRIPT contents) while (ch != '<') { if (++ofsCur >= ofsEnd) return; ch = (int) input[ofsCur] & 0xFF; } state = 23; break; case 23: if (ch != '/') { state = 22; } else { state = 24; ixEndTagToSkipTo = 0; } break; case 24: if (ixEndTagToSkipTo == endTagToSkipTo.length) { int ofsTemp = ofsCur - ixEndTagToSkipTo - 2; ; if (ch == '>') { state = 0; } else if ((is[ch] & ISSPACE) != 0) { state = 14; } else { state = 22; break; // abandon here } foundCharacterData(input, ofsStart, ofsTemp - ofsStart); ofsStart = ofsTemp; foundEndTagName(input, ofsTemp + 2, ixEndTagToSkipTo); endTagToSkipTo = null; } else { if ('a' <= ch) ch -= ('a' - 'A'); // fast toUpper if (endTagToSkipTo[ixEndTagToSkipTo++] != (byte) ch) { state = 22; } } break; } ++ofsCur; } } /** * Private method to check the state when input ends. Reason is that we * don't do "non-SGML characters", * Impl. note: This method is also called when an invalid character is found. * Reason is that the parse is OK when it ends either on ">" or data. For the * former, any character is valid (we don't do non-SGML characters, remember * that we work on byte, not on encoded characters.) For the latter, '>' * can't be an invalid character. * * @param input * current buffer */ private void endTokenize(byte[] input) throws SyntaxException { switch (state) { case 0: break; case 10: if (ofsCur > ofsStart) { foundCharacterData(input, ofsStart, ofsCur - ofsStart); } break; case 11: if (!strictlyXml) { tellReference(input, ofsStart, ofsCur - ofsStart); break; } /* fall thru */ default: throw new SyntaxException(state, ofsCur + readPos); } foundEndOfInput(ofsCur + readPos); } /** * Method called when a reference been found in content. * * @param input * byte array which describes the reference * @param offset * position of the first byte in the array * @param count * number of bytes of the reference */ private void tellReference(byte[] input, int offset, int count) throws SyntaxException { if (resolveCharRef) { char res = resolveCharacterReference(input, offset, count); if (strictlyXml && (res == '\uffff')) { throw new SyntaxException(state, ofsCur + readPos); } foundCharacter(res); } else { foundReference(input, offset, count); } } /** * Convert a byte array of hexadecimal digits into a UCS-2 encoded Unicode * character. * * @param input * byte array to convert * @param offset * position of the first byte in the array * @param count * number of bytes to convert * @return the resulting character, or '\uffff' (not a unicode * character) if the conversion could not be done */ private static char hex2char(byte[] input, int offset, int count) { char res = 0; if ((count > 0) && (count <= 4)) { while (true) { char c = (char) (input[offset++]); if (c <= '9') { if (c < '0') break; res += (c & 0xF); } else { if ((c = (char) ((c & ~('a' - 'A')) - 'A')) >= (char) (16 - 10)) break; res += (c + 10); } if (--count == 0) return res; res <<= 4; } } return '\uffff'; } /** * Convert a byte array of decimal digits into a UCS-2 encoded Unicode * character. * * @param input * byte array to convert * @param offset * position of the first byte in the array * @param count * number of bytes to convert * @return the resulting character, or '\uffff' (not a unicode * character) */ private static char dec2char(byte[] input, int offset, int count) { char res = 0; if (count > 0) { while (true) { char c = (char) (input[offset++]); if (c <= '9') { if (c < '0') break; res += (c & 0xF); } if (--count == 0) return res; if ((res >= 6553) && ((res > 6553) || (input[offset + 1] > '5'))) break; res = (char) ((res << 1) + (res << 3)); } } return '\uffff'; } /** * Convert a named character reference into its UCS-2 encoded Unicode * character value. See <a * href=http://www.w3.org/TR/REC-xml#sec-predefined-ent>XML Predefined * Entities</a> * * @param input * byte array which contains the name of the reference * @param offset * position of the first byte in the array * @param count * number of bytes making the name of the reference * @return the resulting character, or '\uffff' (not a unicode * character) */ private static char ref2char(byte[] input, int offset, int count) { ++count; for (int i = 0; i < chrRef.length; ++i) { if (chrRef[i].length == count) { byte b[] = chrRef[i]; int k = offset; int j = 0; while (true) { if (++j == count) return (char) (b[0]); if (b[j] != input[k++]) break; } } } return '\uffff'; } /** Returns the hashcode of the given bytes. * @since TotalCross 1.25 */ public int hashCode(byte[] input, int offset, int count) { int hash = 0; while (--count >= 0) hash = (hash << 5) - hash + (int)input[offset++]; return hash; } }