/* Aalto XML processor * * Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi * * Licensed under the License specified in the file LICENSE which is * included with the source code. * You may not use this file except in compliance with the License. * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.fasterxml.aalto.in; import java.io.*; import javax.xml.stream.XMLStreamException; import com.fasterxml.aalto.impl.ErrorConsts; import com.fasterxml.aalto.util.DataUtil; import com.fasterxml.aalto.util.XmlCharTypes; import com.fasterxml.aalto.util.XmlChars; /** * Scanner for tokenizing XML content from a byte stream encoding using * UTF-8 encoding, or something suitably close it for decoding purposes * (including ISO-Latin1 and US-ASCII). 
*/ public final class Utf8Scanner extends StreamScanner { /* /********************************************************************** /* Life-cycle /********************************************************************** */ public Utf8Scanner(ReaderConfig cfg, InputStream in, byte[] buffer, int ptr, int last) { super(cfg, in, buffer, ptr, last); } /* /********************************************************************** /* Internal methods, secondary parsing /********************************************************************** */ @Override protected final void finishToken() throws XMLStreamException { _tokenIncomplete = false; switch (_currToken) { case PROCESSING_INSTRUCTION: finishPI(); break; case CHARACTERS: finishCharacters(); break; case COMMENT: finishComment(); break; case SPACE: finishSpace(); break; case DTD: finishDTD(true); // true -> get text break; case CDATA: finishCData(); break; default: ErrorConsts.throwInternalError(); } } @Override protected int handleStartElement(byte b) throws XMLStreamException { _currToken = START_ELEMENT; _currNsCount = 0; PName elemName = parsePName(b); /* Ok. Need to create a qualified name. Simplest for element * in default ns (no extra work -- expressed as null binding); * otherwise need to find binding */ String prefix = elemName.getPrefix(); boolean allBound; // flag to check 'late' bindings if (prefix == null) { // element in default ns allBound = true; // which need not be bound } else { elemName = bindName(elemName, prefix); allBound = elemName.isBound(); } _tokenName = elemName; _currElem = new ElementScope(elemName, _currElem); // And then attribute parsing loop: int attrPtr = 0; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; int c = (int) b & 0xFF; // Intervening space to skip? 
if (c <= INT_SPACE) { do { if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (c != INT_SPACE && c != INT_TAB) { throwInvalidSpace(c); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; c = (int) b & 0xFF; } while (c <= INT_SPACE); } else if (c != INT_SLASH && c != INT_GT) { c = decodeCharForError(b); throwUnexpectedChar(c, " expected space, or '>' or \"/>\""); } // Ok; either need to get an attribute name, or end marker: if (c == INT_SLASH) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; if (b != BYTE_GT) { c = decodeCharForError(b); throwUnexpectedChar(c, " expected '>'"); } _isEmptyTag = true; break; } else if (c == INT_GT) { _isEmptyTag = false; break; } else if (c == INT_LT) { reportInputProblem("Unexpected '<' character in element (missing closing '>'?)"); } // Ok, an attr name: PName attrName = parsePName(b); prefix = attrName.getPrefix(); boolean isNsDecl; if (prefix == null) { // can be default ns decl: isNsDecl = (attrName.getLocalName() == "xmlns"); } else { // May be a namespace decl though? 
if (prefix == "xmlns") { isNsDecl = true; } else { attrName = bindName(attrName, prefix); if (allBound) { allBound = attrName.isBound(); } isNsDecl = false; } } // Optional space to skip again while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; c = (int) b & 0xFF; if (c > INT_SPACE) { break; } if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (c != INT_SPACE && c != INT_TAB) { throwInvalidSpace(c); } } if (c != INT_EQ) { c = decodeCharForError(b); throwUnexpectedChar(c, " expected '='"); } // Optional space to skip again while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; c = (int) b & 0xFF; if (c > INT_SPACE) { break; } if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else if (c != INT_SPACE && c != INT_TAB) { throwInvalidSpace(c); } } if (c != INT_QUOTE && c != INT_APOS) { c = decodeCharForError(b); throwUnexpectedChar(c, " Expected a quote"); } /* Ok, finally: value parsing. However, ns URIs are to be handled * different from attribute values... let's offline URIs, since * they should be less common than attribute values. */ if (isNsDecl) { // default ns, or explicit? handleNsDeclaration(attrName, b); ++_currNsCount; } else { // nope, a 'real' attribute: attrPtr = collectValue(attrPtr, b, attrName); } } { // Note: this call also checks attribute uniqueness int act = _attrCollector.finishLastValue(attrPtr); if (act < 0) { // error, dup attr indicated by -1 act = _attrCollector.getCount(); // let's get correct count reportInputProblem(_attrCollector.getErrorMsg()); } _attrCount = act; } ++_depth; /* Was there any prefix that wasn't bound prior to use? * That's legal, assuming declaration was found later on... 
* let's check */ if (!allBound) { if (!elemName.isBound()) { // element itself unbound reportUnboundPrefix(_tokenName, false); } for (int i = 0, len = _attrCount; i < len; ++i) { PName attrName = _attrCollector.getName(i); if (!attrName.isBound()) { reportUnboundPrefix(attrName, true); } } } return START_ELEMENT; } /** * This method implements the tight loop for parsing attribute * values. It's off-lined from the main start element method to * simplify main method, which makes code more maintainable * and possibly easier for JIT/HotSpot to optimize. */ private final int collectValue(int attrPtr, byte quoteByte, PName attrName) throws XMLStreamException { char[] attrBuffer = _attrCollector.startNewValue(attrName, attrPtr); final int[] TYPES = _charTypes.ATTR_CHARS; final int quoteChar = (int) quoteByte; value_loop: while (true) { int c; ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (attrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } int max = _inputEnd; { int max2 = ptr + (attrBuffer.length - attrPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = (int) _inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } attrBuffer[attrPtr++] = (char) c; } _inputPtr = ptr; } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } // fall through case XmlCharTypes.CT_WS_LF: markLF(); // fall through case XmlCharTypes.CT_WS_TAB: // Plus, need to convert these all to simple space c = INT_SPACE; break; case XmlCharTypes.CT_MULTIBYTE_2: c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: c = decodeUtf8_4(c); // Let's add first part right away: attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 
0x3FF); if (attrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: throwUnexpectedChar(c, "'<' not allowed in attribute value"); case XmlCharTypes.CT_AMP: c = handleEntityInText(false); if (c == 0) { // unexpanded general entity... not good reportUnexpandedEntityInAttr(attrName, false); } // Ok; does it need a surrogate though? (over 16 bits) if ((c >> 16) != 0) { c -= 0x10000; attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); if (attrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } } break; case XmlCharTypes.CT_ATTR_QUOTE: if (c == quoteChar) { break value_loop; } // default: // Other chars are not important here... } // We know there's room for at least one char without checking attrBuffer[attrPtr++] = (char) c; } return attrPtr; } /** * Method called from the main START_ELEMENT handling loop, to * parse namespace URI values. */ private void handleNsDeclaration(PName name, byte quoteByte) throws XMLStreamException { int attrPtr = 0; char[] attrBuffer = _nameBuffer; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; if (b == quoteByte) { break; } int c; if (b == BYTE_AMP) { // entity c = handleEntityInText(false); if (c == 0) { // general entity; should never happen reportUnexpandedEntityInAttr(name, true); } // Ok; does it need a surrogate though? 
(over 16 bits) if ((c >> 16) != 0) { if (attrPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } c -= 0x10000; attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); } } else if (b == BYTE_LT) { // error c = (int) b; throwUnexpectedChar(c, "'<' not allowed in attribute value"); } else { c = (int) b & 0xFF; if (c < INT_SPACE) { if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); } else { if (c < 0) { c = decodeMultiByteChar(c, _inputPtr); if (c < 0) { // surrogate pair c = -c; // Let's add first part right away: if (attrPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } c -= 0x10000; attrBuffer[attrPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); } } else if (c != INT_TAB) { throwInvalidSpace(c); } } } } if (attrPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } attrBuffer[attrPtr++] = (char) c; } /* Simple optimization: for default ns removal (or, with * ns 1.1, any other as well), will use empty value... no * need to try to intern: */ if (attrPtr == 0) { bindNs(name, ""); } else { String uri = _config.canonicalizeURI(attrBuffer, attrPtr); bindNs(name, uri); } } /** * Method called when an ampersand is encounter in text segment. 
* Method needs to determine whether it is a pre-defined or character * entity (in which case it will be expanded into a single char or * surrogate pair), or a general * entity (in which case it will most likely be returned as * ENTITY_REFERENCE event) * * @param inAttr True, if reference is from attribute value; false * if from normal text content * * @return 0 if a general parsed entity encountered; integer * value of a (valid) XML content character otherwise */ @Override protected final int handleEntityInText(boolean inAttr) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; if (b == BYTE_HASH) { return handleCharEntity(); } String start; if (b == BYTE_a) { // amp or apos? b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_m) { // amp? b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_p) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_SEMICOLON) { return INT_AMP; } start = "amp"; } else { start = "am"; } } else if (b == BYTE_p) { // apos? b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_o) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_s) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_SEMICOLON) { return INT_APOS; } start = "apos"; } else { start = "apo"; } } else { start = "ap"; } } else { start = "a"; } } else if (b == BYTE_l) { // lt? b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_t) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_SEMICOLON) { return INT_LT; } start = "lt"; } else { start = "l"; } } else if (b == BYTE_g) { // gt? b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_t) { b = (_inputPtr < _inputEnd) ? 
_inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_SEMICOLON) { return INT_GT; } start = "gt"; } else { start = "g"; } } else if (b == BYTE_q) { // quot? b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_u) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_o) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_t) { b = (_inputPtr < _inputEnd) ? _inputBuffer[_inputPtr++] : loadOne(); if (b == BYTE_SEMICOLON) { return INT_QUOTE; } start = "quot"; } else { start = "quo"; } } else { start = "qu"; } } else { start = "q"; } } else { start = ""; } final int[] TYPES = _charTypes.NAME_CHARS; /* All righty: we have the beginning of the name, plus the first * byte too. So let's see what we can do with it. */ char[] cbuf = _nameBuffer; int cix = 0; for (int len = start.length(); cix < len; ++cix) { cbuf[cix] = start.charAt(cix); } //int colon = -1; while (b != BYTE_SEMICOLON) { boolean ok; int c = (int) b & 0xFF; // Has to be a valid name start char though: switch (TYPES[c]) { case XmlCharTypes.CT_NAME_NONE: case XmlCharTypes.CT_NAME_COLON: // not ok for entities? 
case XmlCharTypes.CT_NAME_NONFIRST: ok = (cix > 0); break; case XmlCharTypes.CT_NAME_ANY: ok = true; break; case InputCharTypes.CT_INPUT_NAME_MB_2: c = decodeUtf8_2(c); ok = XmlChars.is10NameStartChar(c); break; case InputCharTypes.CT_INPUT_NAME_MB_3: c = decodeUtf8_3(c); ok = XmlChars.is10NameStartChar(c); break; case InputCharTypes.CT_INPUT_NAME_MB_4: c = decodeUtf8_4(c); ok = XmlChars.is10NameStartChar(c); if (ok) { if (cix >= cbuf.length) { _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length); } // Let's add first part right away: c -= 0x10000; cbuf[cix++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); } break; case InputCharTypes.CT_INPUT_NAME_MB_N: default: ok = false; break; } if (!ok) { reportInvalidNameChar(c, cix); } if (cix >= cbuf.length) { _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length); } cbuf[cix++] = (char) c; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr++]; } // Ok, let's construct a (temporary) entity name, then: String pname = new String(cbuf, 0, cix); // (note: hash is dummy... not to be compared to anything etc) _tokenName = new PNameC(pname, null, pname, 0); /* One more thing: do we actually allow entities in this mode * and with this event? */ if (_config.willExpandEntities()) { reportInputProblem("General entity reference (&"+pname+";) encountered in entity expanding mode: operation not (yet) implemented"); } if (inAttr) { reportInputProblem("General entity reference (&"+pname+";) encountered in attribute value, in non-entity-expanding mode: no way to handle it"); } return 0; } /* /********************************************************************** /* Internal methods, name parsing: /********************************************************************** */ /** * Parsing of public ids is bit more complicated than that of system * ids, since white space is to be coalesced. 
*/ @Override protected String parsePublicId(byte quoteChar) throws XMLStreamException { char[] outputBuffer = _nameBuffer; int outPtr = 0; final int[] TYPES = XmlCharTypes.PUBID_CHARS; boolean addSpace = false; main_loop: while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } // Easier to check without char type table, first: byte b = _inputBuffer[_inputPtr++]; if (b == quoteChar) { break main_loop; } int c = (int) b & 0xFF; if (TYPES[c] != XmlCharTypes.PUBID_OK) { throwUnexpectedChar(c, " in public identifier"); } // White space? Needs to be coalecsed if (c <= INT_SPACE) { addSpace = true; continue; } if (addSpace) { if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } outputBuffer[outPtr++] = ' '; addSpace = false; } if (outPtr >= outputBuffer.length) { _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length); outPtr = 0; } outputBuffer[outPtr++] = (char) c; } return new String(outputBuffer, 0, outPtr); } @Override protected String parseSystemId(byte quoteChar) throws XMLStreamException { // caller has init'ed the buffer... 
char[] outputBuffer = _nameBuffer; int outPtr = 0; // attribute types are closest matches, so let's use them final int[] TYPES = _charTypes.ATTR_CHARS; //boolean spaceToAdd = false; main_loop: while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } int c = (int) _inputBuffer[_inputPtr++] & 0xFF; if (TYPES[c] != 0) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: c = decodeUtf8_4(c); if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } // Let's add first part right away: outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); // And let the other char output down below break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_ATTR_QUOTE: if (c == (int) quoteChar) { break main_loop; } } } if (outPtr >= outputBuffer.length) { _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length); outPtr = 0; } outputBuffer[outPtr++] = (char) c; } return new String(outputBuffer, 0, outPtr); } /* /********************************************************************** /* Content skipping /********************************************************************** */ @Override protected final boolean skipCharacters() throws XMLStreamException { final int[] TYPES = _charTypes.TEXT_CHARS; final byte[] inputBuffer = _inputBuffer; while (true) { int c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { 
c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: skipUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: skipUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: skipUtf8_4(c); break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: --_inputPtr; return false; case XmlCharTypes.CT_AMP: c = handleEntityInText(false); if (c == 0) { // unexpandable general parsed entity return true; } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? { // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; byte b; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = inputBuffer[_inputPtr]; if (b != BYTE_RBRACKET) { break; } ++_inputPtr; // to skip past bracket ++count; } if (b == BYTE_GT && count > 1) { reportIllegalCDataEnd(); } } break; // default: // Other types are not important here... 
} } } @Override protected final void skipComment() throws XMLStreamException { final int[] TYPES = _charTypes.OTHER_CHARS; final byte[] inputBuffer = _inputBuffer; while (true) { int c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: skipUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: skipUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: skipUtf8_4(c); break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_HYPHEN: // '-->'? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_HYPHEN) { // ok, must be end then ++_inputPtr; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr++] != BYTE_GT) { reportDoubleHyphenInComments(); } return; } break; // default: // Other types are not important here... 
} } } @Override protected final void skipCData() throws XMLStreamException { final int[] TYPES = _charTypes.OTHER_CHARS; final byte[] inputBuffer = _inputBuffer; while (true) { int c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: skipUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: skipUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: skipUtf8_4(c); break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_RBRACKET: // ']]>'? { // end is nigh? int count = 0; byte b; do { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } ++count; b = _inputBuffer[_inputPtr++]; } while (b == BYTE_RBRACKET); if (b == BYTE_GT) { if (count > 1) { // gotcha return; } // can still skip plain ']>'... } else { --_inputPtr; // need to push back last char } } break; // default: // Other types are not important here... 
} } } @Override protected final void skipPI() throws XMLStreamException { final int[] TYPES = _charTypes.OTHER_CHARS; final byte[] inputBuffer = _inputBuffer; while (true) { int c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: skipUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: skipUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: skipUtf8_4(c); break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_QMARK: // '?>'? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_GT) { ++_inputPtr; return; } break; // default: // Other types are not important here... } } } @Override protected final void skipSpace() throws XMLStreamException { // mTmpChar has a space, but it's been checked, can ignore int ptr = _inputPtr; while (true) { if (ptr >= _inputEnd) { if (!loadMore()) { break; } ptr = _inputPtr; } int c = (int) _inputBuffer[ptr] & 0xFF; if (c > INT_SPACE) { // !!! 
TODO: xml 1.1 ws break; } ++ptr; if (c == INT_LF) { markLF(ptr); } else if (c == INT_CR) { if (ptr >= _inputEnd) { if (!loadMore()) { break; } ptr = _inputPtr; } if (_inputBuffer[ptr] == BYTE_LF) { ++ptr; } markLF(ptr); } else if (c != INT_SPACE && c != INT_TAB) { _inputPtr = ptr; throwInvalidSpace(c); } } _inputPtr = ptr; } /* private final int skipMultiByteChar(int c, int ptr) throws XMLStreamException { int needed; // Ok; if we end here, we got multi-byte combination if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) needed = 1; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) needed = 2; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... needed = 3; } else { reportInvalidInitial(c & 0xFF); needed = 1; // never gets here } if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } c = (int) _inputBuffer[ptr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, ptr); } if (needed > 1) { // needed == 1 means 2 bytes total if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } c = (int) _inputBuffer[ptr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, ptr); } if (needed > 2) { // 4 bytes? (need surrogates) if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } c = (int) _inputBuffer[ptr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, ptr); } } } return ptr; } private final int skipMultiByteChar(int c, int type, int ptr) throws XMLStreamException { type -= XmlCharTypes.CT_MULTIBYTE_N; // number of more bytes needed if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } c = (int) _inputBuffer[ptr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, ptr); } if (type > 1) { // needed == 1 means 2 bytes total if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } c = (int) _inputBuffer[ptr++]; if ((c & 0xC0) != 0x080) { reportInvalidOther(c & 0xFF, ptr); } if (type > 2) { // 4 bytes? 
(need surrogates)
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                c = (int) _inputBuffer[ptr++];
                if ((c & 0xC0) != 0x080) {
                    reportInvalidOther(c & 0xFF, ptr);
                }
            }
        }
        return ptr;
    }
    */

    /**
     * Skips the second byte of a 2-byte UTF-8 sequence, reporting an
     * error unless it has the continuation bit pattern 10xxxxxx.
     *
     * @param c First byte of the sequence (not used for validation)
     */
    private final void skipUtf8_2(int c) throws XMLStreamException
    {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c = (int) _inputBuffer[_inputPtr++];
        if ((c & 0xC0) != 0x080) {
            reportInvalidOther(c & 0xFF, _inputPtr);
        }
    }

    /* Alas, can't heavily optimize skipping, since we still have to
     * do validity checks...
     */
    /**
     * Skips the two continuation bytes of a 3-byte UTF-8 sequence.
     * When the low 4 bits of the first byte are 0xD or above, the code
     * point is also decoded so that surrogates (0xD800-0xDFFF) and
     * 0xFFFE/0xFFFF can be rejected; otherwise only the continuation
     * bit patterns are checked.
     *
     * @param c First byte of the sequence
     */
    private final void skipUtf8_3(int c) throws XMLStreamException
    {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c &= 0x0F;
        if (c >= 0xD) { // have to check
            c <<= 6;
            int d = (int) _inputBuffer[_inputPtr++];
            if ((d & 0xC0) != 0x080) {
                reportInvalidOther(d & 0xFF, _inputPtr);
            }
            c |= (d & 0x3F);
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            d = (int) _inputBuffer[_inputPtr++];
            if ((d & 0xC0) != 0x080) {
                reportInvalidOther(d & 0xFF, _inputPtr);
            }
            c = (c << 6) | (d & 0x3F);
            // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal
            if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF
                if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) {
                    c = handleInvalidXmlChar(c);
                }
            }
        } else { // no checks, can discard
            c = (int) _inputBuffer[_inputPtr++];
            if ((c & 0xC0) != 0x080) {
                reportInvalidOther(c & 0xFF, _inputPtr);
            }
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            c = (int) _inputBuffer[_inputPtr++];
            if ((c & 0xC0) != 0x080) {
                reportInvalidOther(c & 0xFF, _inputPtr);
            }
        }
    }

    /**
     * Skips the three continuation bytes of a 4-byte UTF-8 sequence,
     * checking only their continuation bit patterns. Delegates to
     * {@link #skipUtf8_4Slow} when fewer than 4 bytes remain in the
     * current buffer, so that the fast path needs no bounds checks.
     *
     * @param c First byte of the sequence (not used for validation)
     */
    private final void skipUtf8_4(int c) throws XMLStreamException
    {
        if ((_inputPtr + 4) > _inputEnd) {
            skipUtf8_4Slow(c);
            return;
        }
        int d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
    }

    /**
     * Variant of {@link #skipUtf8_4} that checks for end of buffer (and
     * loads more input as needed) before each of the three bytes.
     *
     * @param c First byte of the sequence (not used for validation)
     */
    private final void skipUtf8_4Slow(int c) throws XMLStreamException
    {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        int d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        d = (int) _inputBuffer[_inputPtr++];
        if ((d & 0xC0) != 0x080) {
            reportInvalidOther(d & 0xFF, _inputPtr);
        }
    }

    /*
    /**********************************************************************
    /* Content parsing
    /**********************************************************************
     */

    /**
     * Reads the contents of a CDATA section into {@code _textBuilder},
     * through the closing "]]&gt;" marker (which is consumed but not
     * stored). CR and CR+LF are stored as a single LF. If coalescing is
     * enabled and no entity is pending, {@code finishCoalescedText()}
     * is called afterwards.
     */
    @Override
    protected final void finishCData() throws XMLStreamException
    {
        final int[] TYPES = _charTypes.OTHER_CHARS;
        final byte[] inputBuffer = _inputBuffer;
        char[] outputBuffer = _textBuilder.resetWithEmpty();
        int outPtr = 0;

        /* At this point, space (if any) has been skipped, and we are
         * to parse and store the contents
         */
        main_loop:
        while (true) {
            int c;
            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                // Bound the run by both available input and output room;
                // this is what leaves room for one char after the loop
                int max = _inputEnd;
                {
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (ptr < max) {
                    c = (int) inputBuffer[ptr++] & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
                _inputPtr = ptr;
            }
            // And then exceptions:
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (inputBuffer[_inputPtr] == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_RBRACKET: // close ']]>' marker?
                /* Ok: let's just parse all consecutive right brackets,
                 * and see if followed by greater-than char. This because
                 * we can only push back at most one char at a time, and
                 * thus can't easily just check a subset
                 */
                int count = 0; // ignoring first one
                byte b;

                do {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    b = _inputBuffer[_inputPtr];
                    if (b != BYTE_RBRACKET) {
                        break;
                    }
                    ++_inputPtr;
                    ++count;
                } while (true);

                // Was the marker found?
                boolean ok = (b == BYTE_GT && count >= 1);
                if (ok) {
                    // one of the extra brackets belongs to the marker
                    --count;
                }
                // Brackets to copy to output?
                for (; count > 0; --count) {
                    outputBuffer[outPtr++] = ']';
                    if (outPtr >= outputBuffer.length) {
                        outputBuffer = _textBuilder.finishCurrentSegment();
                        outPtr = 0;
                    }
                }
                if (ok) {
                    ++_inputPtr; // to consume '>'
                    break main_loop;
                }
                // not the marker: first ']' (still in c) is output below
                break;
            }
            // Ok, can output the char; there's room for one char at least
            outputBuffer[outPtr++] = (char) c;
        }

        _textBuilder.setCurrentLength(outPtr);
        /* 03-Feb-2009, tatu: To support coalescing mode, may need to
         *   do some extra work
         */
        if (_cfgCoalescing && !_entityPending) {
            finishCoalescedText();
        }
    }

    @Override
    protected final void finishCharacters() throws XMLStreamException
    {
        int outPtr;
        int c;
        char[] outputBuffer;

        // Ok, so what was the first char / entity?
        c = _tmpChar;
        if (c < 0) { // from entity; can just copy as is
            c = -c;
            outputBuffer = _textBuilder.resetWithEmpty();
            outPtr = 0;
            if ((c >> 16) != 0) { // surrogate pair?
c -= 0x10000; /* Note: after resetting the buffer, it's known to have * space for more than 2 chars we need to add */ outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); } outputBuffer[outPtr++] = (char) c; } else { // white space that we are interested in? if (c == INT_CR || c == INT_LF) { ++_inputPtr; // wasn't advanced yet, in this case outPtr = checkInTreeIndentation(c); if (outPtr < 0) { return; } // Above call also initializes the text builder appropriately outputBuffer = _textBuilder.getBufferWithoutReset(); } else { outputBuffer = _textBuilder.resetWithEmpty(); outPtr = 0; } } final int[] TYPES = _charTypes.TEXT_CHARS; final byte[] inputBuffer = _inputBuffer; main_loop: while (true) { // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = (char) c; } _inputPtr = ptr; } // And then fallback for funny chars / UTF-8 multibytes: switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: if ((_inputEnd - _inputPtr) >= 2) { c = decodeUtf8_3fast(c); } else { c = decodeUtf8_3(c); } break; case XmlCharTypes.CT_MULTIBYTE_4: c = decodeUtf8_4(c); // Let's add first part right away: outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); if (outPtr >= outputBuffer.length) { 
outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = 0xDC00 | (c & 0x3FF); // And let the other char output down below break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: --_inputPtr; break main_loop; case XmlCharTypes.CT_AMP: c = handleEntityInText(false); if (c == 0) { // unexpandable general parsed entity // _inputPtr set by entity expansion method _entityPending = true; break main_loop; } // Ok; does it need a surrogate though? (over 16 bits) if ((c >> 16) != 0) { c -= 0x10000; outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); // Need to ensure room for one more char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = 0xDC00 | (c & 0x3FF); } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? { // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; byte b; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = inputBuffer[_inputPtr]; if (b != BYTE_RBRACKET) { break; } ++_inputPtr; // to skip past bracket ++count; } if (b == BYTE_GT && count > 1) { reportIllegalCDataEnd(); } // Nope. Need to output all brackets, then; except // for one that can be left for normal output while (count > 1) { outputBuffer[outPtr++] = ']'; // Need to ensure room for one more char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } --count; } } // Can just output the first ']' along normal output break; // default: // Other types are not important here... 
} // We know there's room for one more: outputBuffer[outPtr++] = (char) c; } _textBuilder.setCurrentLength(outPtr); /* 03-Feb-2009, tatu: To support coalescing mode, may need to * do some extra work */ if (_cfgCoalescing && !_entityPending) { finishCoalescedText(); } } @Override protected final void finishComment() throws XMLStreamException { final int[] TYPES = _charTypes.OTHER_CHARS; final byte[] inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.resetWithEmpty(); int outPtr = 0; main_loop: while (true) { int c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = (char) c; } _inputPtr = ptr; } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: c = decodeUtf8_4(c); // Let's add first part right away: outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = 0xDC00 | (c & 0x3FF); // And let the other char output down below break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_HYPHEN: // '-->'? 
if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_HYPHEN) { // ok, must be end then ++_inputPtr; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr++] != BYTE_GT) { reportDoubleHyphenInComments(); } break main_loop; } break; // default: // Other types are not important here... } // Ok, can output the char (we know there's room for one more) outputBuffer[outPtr++] = (char) c; } _textBuilder.setCurrentLength(outPtr); } /** * When this method gets called we know that we have an internal subset, * and that the opening '[' has already been read. */ @Override protected final void finishDTD(boolean copyContents) throws XMLStreamException { char[] outputBuffer = copyContents ? _textBuilder.resetWithEmpty() : null; int outPtr = 0; final int[] TYPES = _charTypes.DTD_CHARS; boolean inDecl = false; // in declaration/directive? int quoteChar = 0; // inside quoted string? main_loop: while (true) { int c; /* First we'll have a quickie loop for speeding through * uneventful chars... 
*/ ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } int max = _inputEnd; if (outputBuffer != null) { if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } } while (ptr < max) { c = (int) _inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } if (outputBuffer != null) { outputBuffer[outPtr++] = (char) c; } } _inputPtr = ptr; } switch (TYPES[c]) { // First, common types case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: c = decodeUtf8_4(c); if (outputBuffer != null) { // Let's add first part right away: outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } // And let the other char output down below } break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); // Then DTD-specific types: case XmlCharTypes.CT_DTD_QUOTE: // apos or quot if (quoteChar == 0) { quoteChar = c; } else { if (quoteChar == c) { quoteChar = 0; } } break; case XmlCharTypes.CT_DTD_LT: if (!inDecl) { inDecl = true; } break; case XmlCharTypes.CT_DTD_GT: if (quoteChar == 0) { inDecl = false; } break; case XmlCharTypes.CT_DTD_RBRACKET: if (!inDecl && quoteChar == 0) { break main_loop; } break; // default: // Other types are not important here... 
} if (outputBuffer != null) { // will have room for one more outputBuffer[outPtr++] = (char) c; } } if (outputBuffer != null) { _textBuilder.setCurrentLength(outPtr); } // but still need to match the '>'... byte b = skipInternalWs(false, null); if (b != BYTE_GT) { throwUnexpectedChar(decodeCharForError(b), " expected '>' after the internal subset"); } } @Override protected final void finishPI() throws XMLStreamException { final int[] TYPES = _charTypes.OTHER_CHARS; final byte[] inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.resetWithEmpty(); int outPtr = 0; /* At this point, space (if any) has been skipped, and we are * to parse and store the contents */ main_loop: while (true) { int c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = (char) c; } _inputPtr = ptr; } // And then exceptions: switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: c = decodeUtf8_4(c); // Let's add first part right away: outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = 0xDC00 | (c & 0x3FF); // And let the other char output 
down below break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_QMARK: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == BYTE_GT) { // ok, the end! ++_inputPtr; break main_loop; } // Not end mark, just need to reprocess the second char // default: // Other types are not important here... } // Ok, can output the char (we know there's room for one more) outputBuffer[outPtr++] = (char) c; } _textBuilder.setCurrentLength(outPtr); } /** * Note: this method is only called in cases where it is known * that only space chars are legal. Thus, encountering a non-space * is an error (WFC or VC). However, an end-of-input is ok. */ @Override protected final void finishSpace() throws XMLStreamException { /* Ok: so, mTmpChar contains first space char. If it looks * like indentation, we can probably optimize a bit... */ int tmp = _tmpChar; char[] outputBuffer; int outPtr; if (tmp == BYTE_CR || tmp == BYTE_LF) { outPtr = checkPrologIndentation(tmp); if (outPtr < 0) { return; } // Above call also initializes the text builder appropriately outputBuffer = _textBuilder.getBufferWithoutReset(); } else { outputBuffer = _textBuilder.resetWithEmpty(); outputBuffer[0] = (char) tmp; outPtr = 1; } int ptr = _inputPtr; while (true) { if (ptr >= _inputEnd) { if (!loadMore()) { break; } ptr = _inputPtr; } int c = (int) _inputBuffer[ptr] & 0xFF; // !!! TODO: check for xml 1.1 whitespace? 
if (c > INT_SPACE) { break; } ++ptr; if (c == INT_LF) { markLF(ptr); } else if (c == INT_CR) { if (ptr >= _inputEnd) { if (!loadMore()) { // still need to output the lf if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } outputBuffer[outPtr++] = '\n'; break; } ptr = _inputPtr; } if (_inputBuffer[ptr] == BYTE_LF) { ++ptr; } markLF(ptr); c = INT_LF; // need to convert to canonical lf } else if (c != INT_SPACE && c != INT_TAB) { _inputPtr = ptr; throwInvalidSpace(c); } // Ok, can output the char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } outputBuffer[outPtr++] = (char) c; } _inputPtr = ptr; _textBuilder.setCurrentLength(outPtr); } /* /********************************************************************** /* 2nd level parsing/skipping for coalesced text /********************************************************************** */ /** * Method that gets called after a primary text segment (of type * CHARACTERS or CDATA, not applicable to SPACE) has been read in * text buffer. Method has to see if the following event would * be textual as well, and if so, read it (and any other following * textual segments). 
*/ protected final void finishCoalescedText() throws XMLStreamException { while (true) { // no matter what, will need (and can get) one char if (_inputPtr >= _inputEnd) { if (!loadMore()) { // most likely an error, will be handled later on return; } } if (_inputBuffer[_inputPtr] == BYTE_LT) { // markup of some kind /* In worst case, need 3 chars ("<![") all in all to know * if we are getting a CDATA section */ if ((_inputPtr + 3) >= _inputEnd) { if (!loadAndRetain(3)) { // probably an error, but will be handled later return; } } if (_inputBuffer[_inputPtr+1] != BYTE_EXCL || _inputBuffer[_inputPtr+2] != BYTE_LBRACKET) { // can't be CDATA, we are done here return; } // but let's verify it still: _inputPtr += 3; for (int i = 0; i < 6; ++i) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; if (b != (byte) CDATA_STR.charAt(i)) { int ch = decodeCharForError(b); reportTreeUnexpChar(ch, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)"); } } finishCoalescedCData(); } else { // textual (or entity, error etc) finishCoalescedCharacters(); if (_entityPending) { break; } } } } // note: code mostly copied from 'finishCharacters', just simplified // in some places protected final void finishCoalescedCharacters() throws XMLStreamException { // first char can't be from (char) entity (wrt finishCharacters) final int[] TYPES = _charTypes.TEXT_CHARS; final byte[] inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.getBufferWithoutReset(); int outPtr = _textBuilder.getCurrentLength(); int c; main_loop: while (true) { // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = (int) inputBuffer[ptr++] 
& 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = (char) c; } _inputPtr = ptr; } // And then fallback for funny chars / UTF-8 multibytes: switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: if ((_inputEnd - _inputPtr) >= 2) { c = decodeUtf8_3fast(c); } else { c = decodeUtf8_3(c); } break; case XmlCharTypes.CT_MULTIBYTE_4: c = decodeUtf8_4(c); // Let's add first part right away: outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = 0xDC00 | (c & 0x3FF); // And let the other char output down below break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: --_inputPtr; break main_loop; case XmlCharTypes.CT_AMP: c = handleEntityInText(false); if (c == 0) { // unexpandable general parsed entity // _inputPtr set by entity expansion method _entityPending = true; break main_loop; } // Ok; does it need a surrogate though? (over 16 bits) if ((c >> 16) != 0) { c -= 0x10000; outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); // Need to ensure room for one more char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = 0xDC00 | (c & 0x3FF); } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? 
{ // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; byte b; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = inputBuffer[_inputPtr]; if (b != BYTE_RBRACKET) { break; } ++_inputPtr; // to skip past bracket ++count; } if (b == BYTE_GT && count > 1) { reportIllegalCDataEnd(); } // Nope. Need to output all brackets, then; except // for one that can be left for normal output while (count > 1) { outputBuffer[outPtr++] = ']'; // Need to ensure room for one more char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } --count; } } // Can just output the first ']' along normal output break; // default: // Other types are not important here... } // We know there's room for one more: outputBuffer[outPtr++] = (char) c; } _textBuilder.setCurrentLength(outPtr); } // note: code mostly copied from 'finishCharacters', just simplified // in some places protected final void finishCoalescedCData() throws XMLStreamException { final int[] TYPES = _charTypes.OTHER_CHARS; final byte[] inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.getBufferWithoutReset(); int outPtr = _textBuilder.getCurrentLength(); /* At this point, space (if any) has been skipped, and we are * to parse and store the contents */ main_loop: while (true) { int c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = (int) inputBuffer[ptr++] & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = (char) c; } _inputPtr = ptr; } // And then exceptions: switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = 
handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == BYTE_LF) { ++_inputPtr; } markLF(); c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: c = decodeUtf8_4(c); // Let's add first part right away: outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = 0xDC00 | (c & 0x3FF); // And let the other char output down below break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_RBRACKET: // close ']]>' marker? /* Ok: let's just parse all consequtive right brackets, * and see if followed by greater-than char. This because * we can only push back at most one char at a time, and * thus can't easily just check a subset */ int count = 0; // ignoring first one byte b; do { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } b = _inputBuffer[_inputPtr]; if (b != BYTE_RBRACKET) { break; } ++_inputPtr; ++count; } while (true); // Was the marker found? boolean ok = (b == BYTE_GT && count >= 1); if (ok) { --count; } // Brackets to copy to output? for (; count > 0; --count) { outputBuffer[outPtr++] = ']'; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } } if (ok) { ++_inputPtr; // to consume '>' break main_loop; } break; } // Ok, can output the char; there's room for one char at least outputBuffer[outPtr++] = (char) c; } _textBuilder.setCurrentLength(outPtr); } /** * Method that gets called after a primary text segment (of type * CHARACTERS or CDATA, not applicable to SPACE) has been skipped. * Method has to see if the following event would * be textual as well, and if so, skip it (and any other following * textual segments). 
* * @return True if we encountered an unexpandable entity */ @Override protected final boolean skipCoalescedText() throws XMLStreamException { while (true) { // no matter what, will need (and can get) one char if (_inputPtr >= _inputEnd) { if (!loadMore()) { // most likely an error, will be handled later on return false; } } if (_inputBuffer[_inputPtr] == BYTE_LT) { // markup of some kind /* In worst case, need 3 chars ("<![") all in all to know * if we are getting a CDATA section */ if ((_inputPtr + 3) >= _inputEnd) { if (!loadAndRetain(3)) { // probably an error, but will be handled later return false; } } if (_inputBuffer[_inputPtr+1] != BYTE_EXCL || _inputBuffer[_inputPtr+2] != BYTE_LBRACKET) { // can't be CDATA, we are done here return false; } // but let's verify it still: _inputPtr += 3; for (int i = 0; i < 6; ++i) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } byte b = _inputBuffer[_inputPtr++]; if (b != (byte) CDATA_STR.charAt(i)) { int ch = decodeCharForError(b); reportTreeUnexpChar(ch, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)"); } } skipCData(); } else { // textual (or entity, error etc) if (skipCharacters()) { return true; } } } } /* /********************************************************************** /* Other methods, utf-decoding /********************************************************************** */ /** * @return Either decoded character (if positive int); or negated * value of a high-order char (one that needs surrogate pair) */ private final int decodeMultiByteChar(int c, int ptr) throws XMLStreamException { int needed; if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) c &= 0x1F; needed = 1; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) c &= 0x0F; needed = 2; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... 
c &= 0x07; needed = 3; } else { reportInvalidInitial(c & 0xFF); needed = 1; // never gets here } if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } int d = (int) _inputBuffer[ptr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, ptr); } c = (c << 6) | (d & 0x3F); if (needed > 1) { // needed == 1 means 2 bytes total if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } d = (int) _inputBuffer[ptr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, ptr); } c = (c << 6) | (d & 0x3F); if (needed > 2) { // 4 bytes? (need surrogates) if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } d = (int) _inputBuffer[ptr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, ptr); } c = (c << 6) | (d & 0x3F); /* Need to signal such pair differently (to make comparison * easier) */ c = -c; } } _inputPtr = ptr; return c; } private final int decodeUtf8_2(int c) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } return ((c & 0x1F) << 6) | (d & 0x3F); } private final int decodeUtf8_3(int c1) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c1 &= 0x0F; int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } int c = (c1 << 6) | (d & 0x3F); if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); if (c1 >= 0xD) { // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) { c = handleInvalidXmlChar(c); } } } return c; } private final int decodeUtf8_3fast(int c1) throws XMLStreamException { c1 &= 0x0F; int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { 
reportInvalidOther(d & 0xFF, _inputPtr); } int c = (c1 << 6) | (d & 0x3F); d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); if (c1 >= 0xD) { // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) { c = handleInvalidXmlChar(c); } } } return c; } /** * @return Character value <b>minus 0x10000</c>; this so that caller * can readily expand it to actual surrogates */ private final int decodeUtf8_4(int c) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } int d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = ((c & 0x07) << 6) | (d & 0x3F); if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } d = (int) _inputBuffer[_inputPtr++]; if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } /* note: won't change it to negative here, since caller * already knows it'll need a surrogate */ return ((c << 6) | (d & 0x3F)) - 0x10000; } /* /********************************************************************** /* Internal methods, error reporting /********************************************************************** */ /** * Method called called to decode a full UTF-8 characters, given * its first byte. Note: does not do any validity checks, since this * is only to be used for informational purposes (often when an error * has already been encountered) */ @Override public int decodeCharForError(byte b) throws XMLStreamException { int c = (int) b; if (c >= 0) { // ascii? fine as is... 
return c; } int needed; // Ok; if we end here, we got multi-byte combination if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) c &= 0x1F; needed = 1; } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) c &= 0x0F; needed = 2; } else if ((c & 0xF8) == 0xF0) { // 4 bytes; double-char with surrogates and all... c &= 0x07; needed = 3; } else { reportInvalidInitial(c & 0xFF); needed = 1; // never gets here } int d = nextByte(); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF); } c = (c << 6) | (d & 0x3F); if (needed > 1) { // needed == 1 means 2 bytes total d = nextByte(); // 3rd byte if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF); } c = (c << 6) | (d & 0x3F); if (needed > 2) { // 4 bytes? (need surrogates) d = nextByte(); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF); } c = (c << 6) | (d & 0x3F); } } return c; } protected void reportInvalidOther(int mask, int ptr) throws XMLStreamException { _inputPtr = ptr; reportInvalidOther(mask); } }