/* Aalto XML processor
 *
 * Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi
 *
 * Licensed under the License specified in the file LICENSE which is
 * included with the source code.
 * You may not use this file except in compliance with the License.
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.fasterxml.aalto.async;

import java.nio.ByteBuffer;

import javax.xml.stream.XMLStreamException;

import com.fasterxml.aalto.*;
import com.fasterxml.aalto.in.*;
import com.fasterxml.aalto.util.DataUtil;
//import com.fasterxml.aalto.util.XmlConsts;
import com.fasterxml.aalto.util.XmlCharTypes;

/**
 * This is the base class for asynchronous (non-blocking) XML
 * scanners. Due to basic complexity of async approach, character-based
 * doesn't make much sense, so only byte-based input is supported.
 *<p>
 * This variant reads its input from a caller-supplied {@link ByteBuffer}
 * that is handed over via {@link #feedInput(ByteBuffer)}.
 */
public class AsyncByteBufferScanner
    extends AsyncByteScanner
    implements AsyncByteBufferFeeder
{
    /*
    /**********************************************************************
    /* Input buffer handling
    /**********************************************************************
     */

    /**
     * This buffer is actually provided by caller
     */
    protected ByteBuffer _inputBuffer;

    /**
     * In addition to current buffer pointer, and end pointer,
     * we will also need to know number of bytes originally
     * contained. This is needed to correctly update location
     * information when the block has been completed.
     */
    protected int _origBufferLen;

    /*
    /**********************************************************************
    /* Instance construction
    /**********************************************************************
     */

    /**
     * Creates a scanner that starts in the initial prolog state with no
     * completed event.
     *
     * @param cfg reader configuration, passed on to the superclass
     */
    public AsyncByteBufferScanner(ReaderConfig cfg)
    {
        super(cfg);
        // must start by checking if there's XML declaration...
        _state = STATE_PROLOG_INITIAL;
        _currToken = EVENT_INCOMPLETE;
    }

    /**
     * @return short diagnostic description listing current token, next event and state
     */
    @Override
    public String toString() {
        return "asyncScanner; curr="+_currToken+" next="+_nextEvent+", state = "+_state;
    }

    /*
    /**********************************************************************
    /* Parsing, comments
    /**********************************************************************
     */

    /**
     * Copies comment content from the current input buffer into the text
     * builder until either the closing "--&gt;" marker has been consumed or
     * the available input is exhausted. Bytes whose entry in
     * <code>OTHER_CHARS</code> is 0 are copied as-is; CR and CR+LF are
     * replaced by a single <code>INT_LF</code>; multi-byte UTF-8 sequences
     * are decoded. When a sequence or the end marker is cut off by the end
     * of the buffer, what has been seen so far is stored in
     * <code>_pendingInput</code> so that the next call can resume.
     *
     * @return COMMENT once the end marker has been consumed;
     *   EVENT_INCOMPLETE if more input is needed; or the non-zero result of
     *   {@link #handleCommentPending} when left-over input decided the outcome
     */
    protected int parseCommentContents() throws XMLStreamException
    {
        // Left-overs from last input block?
        if (_pendingInput != 0) { // CR, multi-byte, or '-'?
            int result = handleCommentPending();
            // If there's not enough input, or if we completed, can leave
            if (result != 0) {
                return result;
            }
            // otherwise we should be good to continue
        }

        char[] outputBuffer = _textBuilder.getBufferWithoutReset();
        int outPtr = _textBuilder.getCurrentLength();

        final int[] TYPES = _charTypes.OTHER_CHARS;
        ByteBuffer inputBuffer = _inputBuffer;

        main_loop:
        while (true) {
            int c;
            // Then the tight ASCII non-funny-char loop:
            ascii_loop:
            while (true) {
                if (_inputPtr >= _inputEnd) {
                    break main_loop;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                // Bound the inner loop by whichever runs out first: input or output room
                int max = _inputEnd;
                {
                    int max2 = _inputPtr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (_inputPtr < max) {
                    c = (int) inputBuffer.get(_inputPtr++) & 0xFF;
                    if (TYPES[c] != 0) {
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                // NOTE(review): no break here, so control falls into the CR case;
                // presumably handleInvalidXmlChar always throws -- confirm
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                {
                    if (_inputPtr >= _inputEnd) {
                        // Can not tell yet whether an LF follows; resume on next call
                        _pendingInput = PENDING_STATE_CR;
                        break main_loop;
                    }
                    if (inputBuffer.get(_inputPtr) == BYTE_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                }
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                if ((_inputEnd - _inputPtr) < 2) {
                    // Incomplete: pack the bytes seen so far (first byte in bits 0-7,
                    // second, if any, in bits 8-15) into the pending value
                    if (_inputEnd > _inputPtr) { // 2 bytes available
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                if ((_inputEnd - _inputPtr) < 3) {
                    // Incomplete: same packing as above, third byte goes to bits 16-23
                    if (_inputEnd > _inputPtr) { // at least 2 bytes?
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                        if (_inputEnd > _inputPtr) { // 3 bytes?
                            d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                            c |= (d << 16);
                        }
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                // NOTE(review): no break; presumably reportInvalidInitial always throws -- confirm
                reportInvalidInitial(c);
            case XmlCharTypes.CT_HYPHEN: // '-->'?
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_COMMENT_HYPHEN1;
                    break main_loop;
                }
                if (_inputBuffer.get(_inputPtr) == BYTE_HYPHEN) { // ok, must be end then
                    ++_inputPtr;
                    if (_inputPtr >= _inputEnd) {
                        _pendingInput = PENDING_STATE_COMMENT_HYPHEN2;
                        break main_loop;
                    }
                    if (_inputBuffer.get(_inputPtr++) != BYTE_GT) {
                        reportDoubleHyphenInComments();
                    }
                    _textBuilder.setCurrentLength(outPtr);
                    _state = STATE_DEFAULT;
                    _nextEvent = EVENT_INCOMPLETE;
                    return COMMENT;
                }
                break;
            // default:
                // Other types are not important here...
            }
            // Ok, can output the char (we know there's room for one more)
            outputBuffer[outPtr++] = (char) c;
        }
        // Ran out of input: remember how much text has been collected so far
        _textBuilder.setCurrentLength(outPtr);
        return EVENT_INCOMPLETE;
    }

    /**
     * @return EVENT_INCOMPLETE, if there's not enough input to
     *   handle pending char, COMMENT, if we handled complete
     *   "-->" end marker, or 0 to indicate something else
     *   was successfully handled.
*/ protected int handleCommentPending() throws XMLStreamException { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } if (_pendingInput == PENDING_STATE_COMMENT_HYPHEN1) { if (_inputBuffer.get(_inputPtr) != BYTE_HYPHEN) { // can't be the end marker, just append '-' and go _pendingInput = 0; _textBuilder.append("-"); return 0; } ++_inputPtr; _pendingInput = PENDING_STATE_COMMENT_HYPHEN2; if (_inputPtr >= _inputEnd) { // no more input? return EVENT_INCOMPLETE; } // continue } if (_pendingInput == PENDING_STATE_COMMENT_HYPHEN2) { _pendingInput = 0; byte b = _inputBuffer.get(_inputPtr++); if (b != BYTE_GT) { reportDoubleHyphenInComments(); } _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return COMMENT; } // Otherwise can use default code return handleAndAppendPending() ? 0 : EVENT_INCOMPLETE; } /* /********************************************************************** /* Parsing, PI /********************************************************************** */ protected int parsePIData() throws XMLStreamException { // Left-overs from last input block? if (_pendingInput != 0) { // CR, multi-byte, '?' 
int result = handlePIPending();
            // If there's not enough input, or if we completed, can leave
            if (result != 0) {
                return result;
            }
            // otherwise we should be good to continue
        }

        char[] outputBuffer = _textBuilder.getBufferWithoutReset();
        int outPtr = _textBuilder.getCurrentLength();

        final int[] TYPES = _charTypes.OTHER_CHARS;
        ByteBuffer inputBuffer = _inputBuffer;

        main_loop:
        while (true) {
            int c;
            // Then the tight ASCII non-funny-char loop:
            ascii_loop:
            while (true) {
                if (_inputPtr >= _inputEnd) {
                    break main_loop;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                // Bound the inner loop by whichever runs out first: input or output room
                int max = _inputEnd;
                {
                    int max2 = _inputPtr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (_inputPtr < max) {
                    c = (int) inputBuffer.get(_inputPtr++) & 0xFF;
                    if (TYPES[c] != 0) {
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                // NOTE(review): falls through into the CR case; presumably
                // handleInvalidXmlChar always throws -- confirm
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                {
                    if (_inputPtr >= _inputEnd) {
                        _pendingInput = PENDING_STATE_CR;
                        break main_loop;
                    }
                    if (inputBuffer.get(_inputPtr) == BYTE_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                }
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                if ((_inputEnd - _inputPtr) < 2) {
                    if (_inputEnd > _inputPtr) { // 2 bytes available
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                if ((_inputEnd - _inputPtr) < 3) {
                    if (_inputEnd > _inputPtr) { // at least 2 bytes?
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                        if (_inputEnd > _inputPtr) { // 3 bytes?
                            d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                            c |= (d << 16);
                        }
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                // NOTE(review): no break; presumably reportInvalidInitial always throws -- confirm
                reportInvalidInitial(c);
            case XmlCharTypes.CT_QMARK:
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_PI_QMARK;
                    break main_loop;
                }
                if (_inputBuffer.get(_inputPtr) == BYTE_GT) { // end
                    ++_inputPtr;
                    _textBuilder.setCurrentLength(outPtr);
                    _state = STATE_DEFAULT;
                    _nextEvent = EVENT_INCOMPLETE;
                    return PROCESSING_INSTRUCTION;
                }
                // Not end mark, just need to reprocess the second char
                break;
            // default:
                // Other types are not important here...
            }
            // Ok, can output the char (we know there's room for one more)
            outputBuffer[outPtr++] = (char) c;
        }
        // Ran out of input: remember how much text has been collected so far
        _textBuilder.setCurrentLength(outPtr);
        return EVENT_INCOMPLETE;
    }

    /**
     * Resolves input left pending by the previous call when parsing
     * processing instruction data.
     *
     * @return EVENT_INCOMPLETE, if there's not enough input to
     *   handle pending char, PROCESSING_INSTRUCTION, if we handled complete
     *   "?>" end marker, or 0 to indicate something else
     *   was successfully handled.
     */
    protected int handlePIPending() throws XMLStreamException
    {
        // First, the special case, end marker:
        if (_pendingInput == PENDING_STATE_PI_QMARK) {
            if (_inputPtr >= _inputEnd) {
                return EVENT_INCOMPLETE;
            }
            // Only peek: the byte is consumed below only if it is '>'
            byte b = _inputBuffer.get(_inputPtr);
            _pendingInput = 0;
            if (b != BYTE_GT) {
                // can't be the end marker, just append '?' and go
                _textBuilder.append('?');
                return 0;
            }
            ++_inputPtr;
            _state = STATE_DEFAULT;
            _nextEvent = EVENT_INCOMPLETE;
            return PROCESSING_INSTRUCTION;
        }
        // Otherwise can use default code
        return handleAndAppendPending() ? 0 : EVENT_INCOMPLETE;
    }

    /*
    /**********************************************************************
    /* Parsing, internal DTD subset
    /**********************************************************************
     */

    /**
     * Collects the text of the internal DTD subset into the text builder,
     * tracking whether the scan is currently inside a quoted value
     * (<code>_elemAttrQuote</code>) or inside a declaration
     * (<code>_inDtdDeclaration</code>) so that only a ']' outside of both
     * ends the subset.
     *
     * @param init true on the first call for a subset, in which case the
     *   text builder and the tracking state are reset; false when resuming
     *
     * @return true if the closing ']' was reached (it is consumed but not
     *   appended); false if more input is needed
     */
    protected final boolean handleDTDInternalSubset(boolean init) throws XMLStreamException
    {
        char[] outputBuffer;
        int outPtr;

        if (init) { // first time around
            outputBuffer = _textBuilder.resetWithEmpty();
            outPtr = 0;
            _elemAttrQuote = 0;
            _inDtdDeclaration = false;
        } else {
            if (_pendingInput != 0) {
                if (!handleAndAppendPending()) {
                    return false;
                }
            }
            outputBuffer = _textBuilder.getBufferWithoutReset();
            outPtr = _textBuilder.getCurrentLength();
        }

        final int[] TYPES = _charTypes.DTD_CHARS;
        ByteBuffer inputBuffer = _inputBuffer;

        main_loop:
        while (true) {
            int c;
            // Then the tight ASCII non-funny-char loop:
            ascii_loop:
            while (true) {
                if (_inputPtr >= _inputEnd) {
                    break main_loop;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                int max = _inputEnd;
                {
                    int max2 = _inputPtr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (_inputPtr < max) {
                    c = (int) inputBuffer.get(_inputPtr++) & 0xFF;
                    if (TYPES[c] != 0) {
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                // NOTE(review): falls through into the CR case; presumably
                // handleInvalidXmlChar always throws -- confirm
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_CR;
                    break main_loop;
                }
                if (inputBuffer.get(_inputPtr) == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                if ((_inputEnd - _inputPtr) < 2) {
                    if (_inputEnd > _inputPtr) { // 2 bytes available
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                if ((_inputEnd - _inputPtr) < 3) {
                    if (_inputEnd > _inputPtr) { // at least 2 bytes?
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                        if (_inputEnd > _inputPtr) { // 3 bytes?
                            d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                            c |= (d << 16);
                        }
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                // NOTE(review): no break; presumably reportInvalidInitial always throws -- confirm
                reportInvalidInitial(c);
            case XmlCharTypes.CT_DTD_QUOTE: // apos or quot
                // Opens a quoted value, or closes it if it matches the opening quote
                if (_elemAttrQuote == 0) {
                    _elemAttrQuote = (byte) c;
                } else {
                    if (_elemAttrQuote == c) {
                        _elemAttrQuote = 0;
                    }
                }
                break;
            case XmlCharTypes.CT_DTD_LT:
                if (!_inDtdDeclaration) {
                    _inDtdDeclaration = true;
                }
                break;
            case XmlCharTypes.CT_DTD_GT:
                // '>' inside a quoted value does not end the declaration
                if (_elemAttrQuote == 0) {
                    _inDtdDeclaration = false;
                }
                break;
            case XmlCharTypes.CT_DTD_RBRACKET:
                if (!_inDtdDeclaration && _elemAttrQuote == 0) {
                    _textBuilder.setCurrentLength(outPtr);
                    return true;
                }
                break;
            // default:
                // Other types are not important here...
            }
            // Ok, can output the char (we know there's room for one more)
            outputBuffer[outPtr++] = (char) c;
        }
        _textBuilder.setCurrentLength(outPtr);
        return false;
    }

    /*
    /**********************************************************************
    /* Parsing, CDATA
    /**********************************************************************
     */

    protected final int parseCDataContents() throws XMLStreamException
    {
        // Left-overs from last input block?
        if (_pendingInput != 0) { // CR, multi-byte, or ']'?
int result = handleCDataPending();
            // If there's not enough input, or if we completed, can leave
            if (result != 0) {
                return result;
            }
            // otherwise we should be good to continue
        }

        char[] outputBuffer = _textBuilder.getBufferWithoutReset();
        int outPtr = _textBuilder.getCurrentLength();

        final int[] TYPES = _charTypes.OTHER_CHARS;
        ByteBuffer inputBuffer = _inputBuffer;

        main_loop:
        while (true) {
            int c;
            // Then the tight ASCII non-funny-char loop:
            ascii_loop:
            while (true) {
                if (_inputPtr >= _inputEnd) {
                    break main_loop;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                // Bound the inner loop by whichever runs out first: input or output room
                int max = _inputEnd;
                {
                    int max2 = _inputPtr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (_inputPtr < max) {
                    c = (int) inputBuffer.get(_inputPtr++) & 0xFF;
                    if (TYPES[c] != 0) {
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = (char) c;
                }
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                // NOTE(review): falls through into the CR case; presumably
                // handleInvalidXmlChar always throws -- confirm
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                {
                    if (_inputPtr >= _inputEnd) {
                        _pendingInput = PENDING_STATE_CR;
                        break main_loop;
                    }
                    if (inputBuffer.get(_inputPtr) == BYTE_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                }
                c = INT_LF;
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                if ((_inputEnd - _inputPtr) < 2) {
                    if (_inputEnd > _inputPtr) { // 2 bytes available
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                if ((_inputEnd - _inputPtr) < 3) {
                    if (_inputEnd > _inputPtr) { // at least 2 bytes?
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                        if (_inputEnd > _inputPtr) { // 3 bytes?
                            d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                            c |= (d << 16);
                        }
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10));
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                c = 0xDC00 | (c & 0x3FF);
                // And let the other char output down below
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                // NOTE(review): no break; presumably reportInvalidInitial always throws -- confirm
                reportInvalidInitial(c);
            case XmlCharTypes.CT_RBRACKET: // ']]>'?
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_CDATA_BRACKET1;
                    break main_loop;
                }
                // Hmmh. This is more complex... so be it.
                if (_inputBuffer.get(_inputPtr) == BYTE_RBRACKET) { // end might be nigh...
                    ++_inputPtr;
                    // At this point two brackets have been consumed but not output
                    while (true) {
                        if (_inputPtr >= _inputEnd) {
                            _pendingInput = PENDING_STATE_CDATA_BRACKET2;
                            break main_loop;
                        }
                        if (_inputBuffer.get(_inputPtr) == BYTE_GT) {
                            ++_inputPtr;
                            _textBuilder.setCurrentLength(outPtr);
                            _state = STATE_DEFAULT;
                            _nextEvent = EVENT_INCOMPLETE;
                            return CDATA;
                        }
                        if (_inputBuffer.get(_inputPtr) != BYTE_RBRACKET) { // neither '>' nor ']'; push "]]" back
                            outputBuffer[outPtr++] = ']';
                            if (outPtr >= outputBuffer.length) {
                                outputBuffer = _textBuilder.finishCurrentSegment();
                                outPtr = 0;
                            }
                            outputBuffer[outPtr++] = ']';
                            // the byte just peeked at is re-read by the main loop
                            continue main_loop;
                        }
                        // Got third bracket; push one back, keep on checking
                        ++_inputPtr;
                        outputBuffer[outPtr++] = ']';
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                    }
                }
                break;
            // default:
                // Other types are not important here...
            }
            // Ok, can output the char (we know there's room for one more)
            outputBuffer[outPtr++] = (char) c;
        }
        // Ran out of input: remember how much text has been collected so far
        _textBuilder.setCurrentLength(outPtr);
        return EVENT_INCOMPLETE;
    }

    /**
     * @return EVENT_INCOMPLETE, if there's not enough input to
     *   handle pending char, CDATA, if we handled complete
     *   "]]>" end marker, or 0 to indicate something else
     *   was successfully handled.
*/
    protected final int handleCDataPending() throws XMLStreamException
    {
        // A single ']' was seen at the end of the previous block
        if (_pendingInput == PENDING_STATE_CDATA_BRACKET1) {
            if (_inputPtr >= _inputEnd) {
                return EVENT_INCOMPLETE;
            }
            if (_inputBuffer.get(_inputPtr) == BYTE_RBRACKET) {
                // second ']': move on to looking for '>'
                ++_inputPtr;
                _pendingInput = PENDING_STATE_CDATA_BRACKET2;
                if (_inputPtr >= _inputEnd) {
                    return EVENT_INCOMPLETE;
                }
            } else {
                // not an end marker: the lone ']' is plain content
                _textBuilder.append(']');
                _pendingInput = 0;
                return 0;
            }
        }

        // Anything other than "]]" pending is handled by the shared code
        if (_pendingInput != PENDING_STATE_CDATA_BRACKET2) {
            return handleAndAppendPending() ? 0 : EVENT_INCOMPLETE;
        }

        // "]]" seen: every further ']' pushes one bracket into the text
        for (;;) {
            if (_inputPtr >= _inputEnd) {
                return EVENT_INCOMPLETE;
            }
            final byte next = _inputBuffer.get(_inputPtr++);
            if (next == BYTE_GT) {
                _pendingInput = 0;
                _state = STATE_DEFAULT;
                _nextEvent = EVENT_INCOMPLETE;
                return CDATA;
            }
            if (next == BYTE_RBRACKET) {
                _textBuilder.append(']');
                continue;
            }
            // neither '>' nor ']': un-read the byte, both brackets are content
            --_inputPtr;
            _textBuilder.append("]]");
            _pendingInput = 0;
            return 0;
        }
    }

    /**
     * This method gets called, if the first character of a
     * CHARACTERS event could not be fully read (multi-byte,
     * split over buffer boundary). If so, there is some
     * pending data to be handled.
     */
    protected int startCharactersPending() throws XMLStreamException
    {
        // First, need to have at least one more byte:
        if (_inputPtr >= _inputEnd) {
            return EVENT_INCOMPLETE;
        }

        // K. So what was the type again?
        int c = _pendingInput;
        _pendingInput = 0;

        // Possible \r\n linefeed?
        if (c == PENDING_STATE_CR) {
            if (_inputBuffer.get(_inputPtr) == BYTE_LF) {
                ++_inputPtr;
            }
            markLF();
            _textBuilder.resetWithChar(CHAR_LF);
        } else {
            // Nah, a multi-byte UTF-8 char:
            // Let's just retest the first pending byte (in LSB):
            switch (_charTypes.TEXT_CHARS[c & 0xFF]) {
            case XmlCharTypes.CT_MULTIBYTE_2:
                // Easy: must have just one byte, did get another one:
                _textBuilder.resetWithChar((char) decodeUtf8_2(c));
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                {
                    // Ok... so do we have one or two pending bytes?
int next = _inputBuffer.get(_inputPtr++) & 0xFF;
                    // Pending value holds first byte in bits 0-7, second (if any) above it
                    int c2 = (c >> 8);
                    if (c2 == 0) { // just one; need two more
                        if (_inputPtr >= _inputEnd) { // but got only one
                            // Re-pack with the newly read byte and wait for more input
                            _pendingInput = c | (next << 8);
                            return EVENT_INCOMPLETE;
                        }
                        int c3 = _inputBuffer.get(_inputPtr++) & 0xFF;
                        c = decodeUtf8_3(c, next, c3);
                    } else { // had two, got one, bueno:
                        c = decodeUtf8_3((c & 0xFF), c2, next);
                    }
                    _textBuilder.resetWithChar((char) c);
                }
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                {
                    int next = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                    // Only had one?
                    if ((c >> 8) == 0) { // ok, so need 3 more
                        if (_inputPtr >= _inputEnd) { // just have 1
                            _pendingInput = c | (next << 8);
                            return EVENT_INCOMPLETE;
                        }
                        int c2 = _inputBuffer.get(_inputPtr++) & 0xFF;
                        if (_inputPtr >= _inputEnd) { // almost, got 2
                            _pendingInput = c | (next << 8) | (c2 << 16);
                            return EVENT_INCOMPLETE;
                        }
                        int c3 = _inputBuffer.get(_inputPtr++) & 0xFF;
                        c = decodeUtf8_4(c, next, c2, c3);
                    } else { // had two or three
                        // Unpack: second byte from bits 8-15, third (if any) from bits 16 up
                        int c2 = (c >> 8) & 0xFF;
                        int c3 = (c >> 16);
                        if (c3 == 0) { // just two
                            if (_inputPtr >= _inputEnd) { // one short
                                _pendingInput = c | (next << 16);
                                return EVENT_INCOMPLETE;
                            }
                            c3 = _inputBuffer.get(_inputPtr++) & 0xFF;
                            c = decodeUtf8_4((c & 0xFF), c2, next, c3);
                        } else { // had three, got last
                            c = decodeUtf8_4((c & 0xFF), c2, c3, next);
                        }
                    }
                }
                // Need a surrogate pair, have to call from here:
                _textBuilder.resetWithSurrogate(c);
                // NOTE(review): this path returns directly and so skips the
                // coalescing / lazy-parsing handling done below for the other
                // cases -- confirm that is intended
                return (_currToken = CHARACTERS);
            default: // should never occur:
                throwInternal();
            }
        }

        // Great, we got it. Is that enough?
        if (_cfgCoalescing && !_cfgLazyParsing) {
            // In eager coalescing mode, must read it all
            return finishCharactersCoalescing();
        }
        _currToken = CHARACTERS;
        if (_cfgLazyParsing) {
            _tokenIncomplete = true;
        } else {
            finishCharacters();
        }
        return _currToken;
    }

    /**
     * TODO: Method not yet implemented
     *<p>
     * Handles any pending partial input first; after that it currently
     * always fails with {@link UnsupportedOperationException}.
     *
     * @return EVENT_INCOMPLETE if pending input could not be completed
     */
    protected final int finishCharactersCoalescing() throws XMLStreamException
    {
        // First things first: any pending partial multi-bytes?
if (_pendingInput != 0) { if (!handleAndAppendPending()) { return EVENT_INCOMPLETE; } } throw new UnsupportedOperationException(); // !!! TBI // return 0; } /* /********************************************************************** /* Async input, methods to feed (push) content to parse /********************************************************************** */ @Override public final boolean needMoreInput() { return (_inputPtr >=_inputEnd) && !_endOfInput; } @Override public void feedInput(ByteBuffer buffer) throws XMLStreamException { // Must not have remaining input if (_inputPtr < _inputEnd) { throw new XMLStreamException("Still have "+(_inputEnd - _inputPtr)+" unread bytes"); } // and shouldn't have been marked as end-of-input if (_endOfInput) { throw new XMLStreamException("Already closed, can not feed more input"); } // Time to update pointers first _pastBytesOrChars += _origBufferLen; _rowStartOffset -= _origBufferLen; int start = buffer.position(); int end = buffer.limit(); // And then update buffer settings _inputBuffer = buffer; _inputPtr = start; _inputEnd = end; _origBufferLen = end-start; } /* /********************************************************************** /* Implementation of parsing API /********************************************************************** */ @Override public final int nextFromProlog(boolean isProlog) throws XMLStreamException { // Had fully complete event? Need to reset state etc: if (_currToken != EVENT_INCOMPLETE) { // First: keep track of where event started setStartLocation(); // yet one more special case: after START_DOCUMENT need to check things... if (_currToken == START_DOCUMENT) { _currToken = EVENT_INCOMPLETE; if (_tokenName != null) { _nextEvent = PROCESSING_INSTRUCTION; _state = STATE_PI_AFTER_TARGET; checkPITargetName(_tokenName); return handlePI(); } } else { _currToken = _nextEvent = EVENT_INCOMPLETE; _state = STATE_DEFAULT; } } // Ok, do we know which event it will be? 
if (_nextEvent == EVENT_INCOMPLETE) { // nope // The very first thing: XML declaration handling if (_state == STATE_PROLOG_INITIAL) { if (_inputPtr >= _inputEnd) { return _currToken; } // Ok: see if we have what looks like XML declaration; process: if (_pendingInput != 0) { // already parsing (potential) XML declaration Boolean b = startXmlDeclaration(); // is or may be XML declaration, so: if (b == null) { // not yet known; bail out return EVENT_INCOMPLETE; } // no real XML declaration; syntesize one: if (b == Boolean.FALSE) { _currToken = START_DOCUMENT; return START_DOCUMENT; } return handleXmlDeclaration(); } if (_inputBuffer.get(_inputPtr) == BYTE_LT) { // first byte, see if it could be XML declaration ++_inputPtr; _pendingInput = PENDING_STATE_XMLDECL_LT; Boolean b = startXmlDeclaration(); // is or may be XML declaration, so: if (b == null) { return EVENT_INCOMPLETE; } if (b == Boolean.FALSE) { _currToken = START_DOCUMENT; return START_DOCUMENT; } return handleXmlDeclaration(); } // can't be XML declaration _state = STATE_DEFAULT; _currToken = START_DOCUMENT; return START_DOCUMENT; } // First: did we have a lone CR at the end of the buffer? if (_pendingInput != 0) { // yup if (!handlePartialCR()) { return _currToken; } } while (_state == STATE_DEFAULT) { if (_inputPtr >= _inputEnd) { // no more input available if (_endOfInput) { // for good? That may be fine setStartLocation(); return TOKEN_EOI; } return _currToken; } byte b = _inputBuffer.get(_inputPtr++); /* Really should get white space or '<'... anything else is * pretty much an error. */ if (b == BYTE_LT) { // root element, comment, proc instr? _state = STATE_PROLOG_SEEN_LT; break; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { // Prolog/epilog ws is to be skipped, not part of Infoset if (!asyncSkipSpace()) { // ran out of input? if (_endOfInput) { // for good? 
That may be fine setStartLocation(); return TOKEN_EOI; } return _currToken; } } else { reportPrologUnexpChar(isProlog, decodeCharForError(b), null); } } if (_state == STATE_PROLOG_SEEN_LT) { if (_inputPtr >= _inputEnd) { return _currToken; } byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_EXCL) { // comment or DOCTYPE declaration? _state = STATE_PROLOG_DECL; return handlePrologDeclStart(isProlog); } if (b == BYTE_QMARK) { // PI _nextEvent = PROCESSING_INSTRUCTION; _state = STATE_DEFAULT; return handlePI(); } if (b == BYTE_SLASH || !isProlog) { reportPrologUnexpChar(isProlog, decodeCharForError(b), " (unbalanced start/end tags?)"); } return handleStartElementStart(b); } if (_state == STATE_PROLOG_DECL) { return handlePrologDeclStart(isProlog); } // should never have anything else... return throwInternal(); } // At this point, we do know the event type switch (_nextEvent) { case START_ELEMENT: return handleStartElement(); case START_DOCUMENT: return handleXmlDeclaration(); case PROCESSING_INSTRUCTION: return handlePI(); case COMMENT: return handleComment(); case DTD: return handleDTD(); } return throwInternal(); // should never get here } @Override public int nextFromTree() throws XMLStreamException { // Had a fully complete event? Need to reset state: if (_currToken != EVENT_INCOMPLETE) { /* First, need to handle some complications arising from * empty elements, and namespace binding/unbinding: */ if (_currToken == START_ELEMENT) { if (_isEmptyTag) { --_depth; // Important: do NOT overwrite start location, same as with START_ELEMENT return (_currToken = END_ELEMENT); } } else if (_currToken == END_ELEMENT) { _currElem = _currElem.getParent(); // Any namespace declarations that need to be unbound? 
while (_lastNsDecl != null && _lastNsDecl.getLevel() >= _depth) {
                    _lastNsDecl = _lastNsDecl.unbind();
                }
            }
            // keep track of where event started
            setStartLocation();

            /* Only CHARACTERS can remain incomplete: this happens if
             * first character is decoded, but coalescing mode is NOT
             * set. Skip can not therefore block, nor will add pending
             * input. Can also occur when we have run out of input
             */
            if (_tokenIncomplete) {
                if (!skipCharacters()) { // couldn't complete skipping
                    return EVENT_INCOMPLETE;
                }
                _tokenIncomplete = false;
            }
            _currToken = _nextEvent = EVENT_INCOMPLETE;
            _state = STATE_DEFAULT;
        }

        // Don't yet know the type?
        if (_nextEvent == EVENT_INCOMPLETE) {
            if (_state == STATE_DEFAULT) {
                /* We can only have pending input for (incomplete)
                 * CHARACTERS event.
                 */
                if (_pendingInput != 0) { // CR, or multi-byte?
                    _nextEvent = CHARACTERS;
                    return startCharactersPending();
                }
                if (_inputPtr >= _inputEnd) { // nothing we can do?
                    return _currToken; // i.e. EVENT_INCOMPLETE
                }
                byte b = _inputBuffer.get(_inputPtr++);
                if (b == BYTE_LT) { // root element, comment, proc instr?
                    _state = STATE_TREE_SEEN_LT;
                } else if (b == BYTE_AMP) {
                    _state = STATE_TREE_SEEN_AMP;
                } else {
                    _nextEvent = CHARACTERS;
                    return startCharacters(b);
                }
            }

            // Every state handled below needs at least one more byte
            if (_inputPtr >= _inputEnd) {
                return _currToken; // i.e. EVENT_INCOMPLETE
            }
            if (_state == STATE_TREE_SEEN_LT) {
                // Ok, so we've just seen the less-than char...
                byte b = _inputBuffer.get(_inputPtr++);
                if (b == BYTE_EXCL) { // comment or CDATA
                    // no return here: handled by the STATE_TREE_SEEN_EXCL check below
                    _state = STATE_TREE_SEEN_EXCL;
                } else if (b == BYTE_QMARK) {
                    _nextEvent = PROCESSING_INSTRUCTION;
                    _state = STATE_DEFAULT;
                    return handlePI();
                } else if (b == BYTE_SLASH) {
                    return handleEndElementStart();
                } else {
                    // Probably start element -- need to retain first char tho
                    return handleStartElementStart(b);
                }
            } else if (_state == STATE_TREE_SEEN_AMP) {
                return handleEntityStartingToken();
            } else if (_state == STATE_TREE_NAMED_ENTITY_START) {
                return handleNamedEntityStartingToken();
            } else if (_state == STATE_TREE_NUMERIC_ENTITY_START) {
                return handleNumericEntityStartingToken();
            }

            if (_state == STATE_TREE_SEEN_EXCL) {
                if (_inputPtr >= _inputEnd) {
                    return _currToken; // i.e. EVENT_INCOMPLETE
                }
                byte b = _inputBuffer.get(_inputPtr++);
                // Comment or CDATA?
                if (b == BYTE_HYPHEN) { // Comment
                    _nextEvent = COMMENT;
                    _state = STATE_DEFAULT;
                } else if (b == BYTE_LBRACKET) { // CDATA
                    _nextEvent = CDATA;
                    _state = STATE_DEFAULT;
                } else {
                    reportTreeUnexpChar(decodeCharForError(b), " (expected either '-' for COMMENT or '[CDATA[' for CDATA section)");
                }
            } else {
                throwInternal();
            }
        }

        /* We know the type; event is usually partially processed
         * and needs to be completely read.
         */
        switch (_nextEvent) {
        case START_ELEMENT:
            return handleStartElement();
        case END_ELEMENT:
            return handleEndElement();
        case PROCESSING_INSTRUCTION:
            return handlePI();
        case COMMENT:
            return handleComment();
        case CDATA:
            return handleCData();
        case CHARACTERS:
            if (!_cfgLazyParsing) {
                // !!! TBI: how would non-lazy mode work?
if (_cfgCoalescing) { return finishCharactersCoalescing(); } } if (_pendingInput != 0) { // multi-byte, or CR without LF return startCharactersPending(); } // Otherwise, should not get here throwInternal(); // case ENTITY_REFERENCE: } return throwInternal(); // never gets here } /* /********************************************************************** /* Second-level parsing, prolog (XML declaration, DOCTYPE) /********************************************************************** */ private final int handlePrologDeclStart(boolean isProlog) throws XMLStreamException { if (_inputPtr >= _inputEnd) { // nothing we can do? return EVENT_INCOMPLETE; } byte b = _inputBuffer.get(_inputPtr++); // So far, we have seen "<!", need to know if it's DTD or COMMENT if (b == BYTE_HYPHEN) { _nextEvent = COMMENT; _state = STATE_DEFAULT; return handleComment(); } if (b == BYTE_D) { _nextEvent = DTD; _state = STATE_DEFAULT; return handleDTD(); } reportPrologUnexpChar(isProlog, decodeCharForError(b), " (expected '-' for COMMENT)"); return EVENT_INCOMPLETE; // never gets here } /** * Method that deals with recognizing XML declaration, but not with parsing * its contents. * * @return null if parsing is inconclusive (may or may not be XML declaration); * Boolean.TRUE if complete XML declaration, and Boolean.FALSE if something * else */ private final Boolean startXmlDeclaration() throws XMLStreamException { if (_inputPtr >= _inputEnd) { return null; } if (_pendingInput == PENDING_STATE_XMLDECL_LT) { // "<" at start of doc if (_inputBuffer.get(_inputPtr) != BYTE_QMARK) { // some other _pendingInput = 0; _state = STATE_PROLOG_SEEN_LT; return Boolean.FALSE; } ++_inputPtr; _pendingInput = PENDING_STATE_XMLDECL_LTQ; if (_inputPtr >= _inputEnd) { return null; } } if (_pendingInput == PENDING_STATE_XMLDECL_LTQ) { // "<?" 
at start of doc byte b = _inputBuffer.get(_inputPtr++); _tokenName = parseNewName(b); if (_tokenName == null) { // incomplete _pendingInput = PENDING_STATE_XMLDECL_TARGET; return null; } // xml or not? if (!"xml".equals(_tokenName.getPrefixedName())) { // nope: some other PI _pendingInput = 0; _state = STATE_PI_AFTER_TARGET; _nextEvent = PROCESSING_INSTRUCTION; checkPITargetName(_tokenName); return Boolean.FALSE; } } else if (_pendingInput == PENDING_STATE_XMLDECL_TARGET) { // "<?" at start of doc, part of name if ((_tokenName = parsePName()) == null) { // incomplete return null; } if (!"xml".equals(_tokenName.getPrefixedName())) { _pendingInput = 0; _state = STATE_PI_AFTER_TARGET; _nextEvent = PROCESSING_INSTRUCTION; checkPITargetName(_tokenName); return Boolean.FALSE; } } else { throwInternal(); } _pendingInput = 0; _nextEvent = START_DOCUMENT; _state = STATE_XMLDECL_AFTER_XML; return Boolean.TRUE; } /** * Method called to complete parsing of XML declaration, once it has * been reliably detected. * * @return Completed token (START_DOCUMENT), if fully parsed; incomplete (EVENT_INCOMPLETE) * otherwise */ private int handleXmlDeclaration() throws XMLStreamException { // First: left-over CRs? 
if (_pendingInput == PENDING_STATE_CR) { if (!handlePartialCR()) { return EVENT_INCOMPLETE; } } main_loop: while (_inputPtr < _inputEnd) { switch (_state) { case STATE_XMLDECL_AFTER_XML: // "<?xml", need space { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_XMLDECL_BEFORE_VERSION; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after 'xml' in xml declaration)"); } } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_BEFORE_VERSION: if (!asyncSkipSpace()) { // not enough input break; } if ((_tokenName = parseNewName(_inputBuffer.get(_inputPtr++))) == null) { // incomplete _state = STATE_XMLDECL_VERSION; break; } if (!_tokenName.hasPrefixedName("version")) { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'version'"); } _state = STATE_XMLDECL_AFTER_VERSION; continue main_loop; case STATE_XMLDECL_VERSION: // "<?xml ", part of "version" if ((_tokenName = parsePName()) == null) { // incomplete break; } if (!_tokenName.hasPrefixedName("version")) { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'version'"); } _state = STATE_XMLDECL_AFTER_VERSION; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_VERSION: // "<?xml version", need space or '=' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _inputBuffer.get(_inputPtr++); if (b != BYTE_EQ) { reportPrologUnexpChar(true, decodeCharForError(b), " (expected '=' after 'version' in xml declaration)"); } } _state = STATE_XMLDECL_VERSION_EQ; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_VERSION_EQ: // "<?xml version=", need space or quote if (!asyncSkipSpace()) { // skip space, if any break; } _elemAttrQuote = _inputBuffer.get(_inputPtr++); if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) { 
reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' in xml declaration for version value)"); } { char[] buf = _textBuilder.resetWithEmpty(); if (_inputPtr >= _inputEnd || !parseXmlDeclAttr(buf, 0)) { _state = STATE_XMLDECL_VERSION_VALUE; break; } } verifyAndSetXmlVersion(); _state = STATE_XMLDECL_AFTER_VERSION_VALUE; continue main_loop; case STATE_XMLDECL_VERSION_VALUE: // parsing version value if (!parseXmlDeclAttr(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength())) { _state = STATE_XMLDECL_VERSION_VALUE; break; } verifyAndSetXmlVersion(); _state = STATE_XMLDECL_AFTER_VERSION_VALUE; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_VERSION_VALUE: // version got; need space or '?' { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_QMARK) { _state = STATE_XMLDECL_ENDQ; continue main_loop; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_XMLDECL_BEFORE_ENCODING; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after version value in xml declaration)"); } } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_BEFORE_ENCODING: // version, value, space got, need '?' or 'e' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_QMARK) { _state = STATE_XMLDECL_ENDQ; continue main_loop; } if ((_tokenName = parseNewName(b)) == null) { // incomplete _state = STATE_XMLDECL_ENCODING; break; } // Can actually also get "standalone" instead... 
if (_tokenName.hasPrefixedName("encoding")) { _state = STATE_XMLDECL_AFTER_ENCODING; } else if (_tokenName.hasPrefixedName("standalone")) { _state = STATE_XMLDECL_AFTER_STANDALONE; continue main_loop; } else { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'encoding'"); } } continue main_loop; case STATE_XMLDECL_ENCODING: // parsing "encoding" if ((_tokenName = parsePName()) == null) { // incomplete break; } // Can actually also get "standalone" instead... if (_tokenName.hasPrefixedName("encoding")) { _state = STATE_XMLDECL_AFTER_ENCODING; } else if (_tokenName.hasPrefixedName("standalone")) { _state = STATE_XMLDECL_AFTER_STANDALONE; continue main_loop; } else { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'encoding'"); } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_ENCODING: // got "encoding"; must get ' ' or '=' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _inputBuffer.get(_inputPtr++); if (b != BYTE_EQ) { reportPrologUnexpChar(true, decodeCharForError(b), " (expected '=' after 'encoding' in xml declaration)"); } } _state = STATE_XMLDECL_ENCODING_EQ; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_ENCODING_EQ: // "encoding=" if (!asyncSkipSpace()) { // skip space, if any break; } _elemAttrQuote = _inputBuffer.get(_inputPtr++); if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) { reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' in xml declaration for encoding value)"); } _state = STATE_XMLDECL_ENCODING_VALUE; { char[] buf = _textBuilder.resetWithEmpty(); if (_inputPtr >= _inputEnd || !parseXmlDeclAttr(buf, 0)) { _state = STATE_XMLDECL_ENCODING_VALUE; break; } } verifyAndSetXmlEncoding(); _state = STATE_XMLDECL_AFTER_ENCODING_VALUE; break; case STATE_XMLDECL_ENCODING_VALUE: // parsing encoding value if 
(!parseXmlDeclAttr(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength())) { _state = STATE_XMLDECL_ENCODING_VALUE; break; } verifyAndSetXmlEncoding(); _state = STATE_XMLDECL_AFTER_ENCODING_VALUE; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_ENCODING_VALUE: // encoding+value gotten; need space or '?' { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_QMARK) { _state = STATE_XMLDECL_ENDQ; continue main_loop; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_XMLDECL_BEFORE_STANDALONE; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after encoding value in xml declaration)"); } } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_BEFORE_STANDALONE: // after encoding+value+space; get '?' or 's' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_QMARK) { _state = STATE_XMLDECL_ENDQ; continue main_loop; } if ((_tokenName = parseNewName(b)) == null) { // incomplete _state = STATE_XMLDECL_STANDALONE; break; } if (!_tokenName.hasPrefixedName("standalone")) { reportInputProblem("Unexpected keyword '"+_tokenName.getPrefixedName()+"' in XML declaration: expected 'standalone'"); } } _state = STATE_XMLDECL_AFTER_STANDALONE; continue main_loop; case STATE_XMLDECL_STANDALONE: // parsing "standalone" if ((_tokenName = parsePName()) == null) { // incomplete break; } if (!_tokenName.hasPrefixedName("standalone")) { reportInputProblem("Unexpected keyword 'encoding' in XML declaration: expected 'standalone'"); } _state = STATE_XMLDECL_AFTER_STANDALONE; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_STANDALONE: // got "standalone"; must get ' ' or '=' if (!asyncSkipSpace()) { // not enough input break; } { byte b = _inputBuffer.get(_inputPtr++); if (b != BYTE_EQ) { reportPrologUnexpChar(true, decodeCharForError(b), " (expected '=' after 'standalone' 
in xml declaration)"); } } _state = STATE_XMLDECL_STANDALONE_EQ; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_STANDALONE_EQ: // "standalone=" if (!asyncSkipSpace()) { // skip space, if any break; } _elemAttrQuote = _inputBuffer.get(_inputPtr++); if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) { reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' in xml declaration for standalone value)"); } { char[] buf = _textBuilder.resetWithEmpty(); if (_inputPtr >= _inputEnd || !parseXmlDeclAttr(buf, 0)) { _state = STATE_XMLDECL_STANDALONE_VALUE; break; } } verifyAndSetXmlStandalone(); _state = STATE_XMLDECL_AFTER_STANDALONE_VALUE; continue main_loop; case STATE_XMLDECL_STANDALONE_VALUE: // encoding+value gotten; need space or '?' if (!parseXmlDeclAttr(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength())) { _state = STATE_XMLDECL_STANDALONE_VALUE; break; } verifyAndSetXmlStandalone(); _state = STATE_XMLDECL_AFTER_STANDALONE_VALUE; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_AFTER_STANDALONE_VALUE: // encoding+value gotten; need space or '?' if (!asyncSkipSpace()) { // skip space, if any break; } if (_inputBuffer.get(_inputPtr++) != BYTE_QMARK) { reportPrologUnexpChar(true, decodeCharForError(_inputBuffer.get(_inputPtr-1)), " (expected '?>' to end xml declaration)"); } _state = STATE_XMLDECL_ENDQ; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_XMLDECL_ENDQ: // Better clear up decoded name, to avoid later problems (would be taken as PI) _tokenName = null; _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; if (_inputBuffer.get(_inputPtr++) != BYTE_GT) { reportPrologUnexpChar(true, decodeCharForError(_inputBuffer.get(_inputPtr-1)), " (expected '>' to end xml declaration)"); } return START_DOCUMENT; default: throwInternal(); } } return EVENT_INCOMPLETE; } private int handleDTD() throws XMLStreamException { // First: left-over CRs? 
if (_pendingInput == PENDING_STATE_CR) { if (!handlePartialCR()) { return EVENT_INCOMPLETE; } } if (_state == STATE_DTD_INT_SUBSET) { if (handleDTDInternalSubset(false)) { // got it! _state = STATE_DTD_EXPECT_CLOSING_GT; } else { return EVENT_INCOMPLETE; } } main_loop: while (_inputPtr < _inputEnd) { switch (_state) { case STATE_DEFAULT: // seen 'D' _tokenName = parseNewName(BYTE_D); if (_tokenName == null) { _state = STATE_DTD_DOCTYPE; return EVENT_INCOMPLETE; } if (!"DOCTYPE".equals(_tokenName.getPrefixedName())) { reportPrologProblem(true, "expected 'DOCTYPE'"); } _state = STATE_DTD_AFTER_DOCTYPE; continue main_loop; case STATE_DTD_DOCTYPE: _tokenName = parsePName(); if (_tokenName == null) { _state = STATE_DTD_DOCTYPE; return EVENT_INCOMPLETE; } if (!"DOCTYPE".equals(_tokenName.getPrefixedName())) { reportPrologProblem(true, "expected 'DOCTYPE'"); } if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_DTD_AFTER_DOCTYPE: { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_DTD_BEFORE_ROOT_NAME; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after 'DOCTYPE')"); } } // fall through (ok to skip bounds checks, async-skip does it) case STATE_DTD_BEFORE_ROOT_NAME: if (!asyncSkipSpace()) { // not enough input break; } if ((_tokenName = parseNewName(_inputBuffer.get(_inputPtr++))) == null) { // incomplete _state = STATE_DTD_ROOT_NAME; break; } _state = STATE_DTD_ROOT_NAME; continue main_loop; case STATE_DTD_ROOT_NAME: if ((_tokenName = parsePName()) == null) { // incomplete break; } _state = STATE_DTD_AFTER_ROOT_NAME; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_DTD_AFTER_ROOT_NAME: { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_GT) { _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return DTD; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_DTD_BEFORE_IDS; } else { 
reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after root name in DOCTYPE declaration)"); } } // fall through (ok to skip bounds checks, async-skip does it) case STATE_DTD_BEFORE_IDS: if (!asyncSkipSpace()) { // not enough input break; } { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_GT) { _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return DTD; } PName name; if ((name = parseNewName(b)) == null) { _state = STATE_DTD_PUBLIC_OR_SYSTEM; break; } String str = name.getPrefixedName(); if ("PUBLIC".equals(str)) { _state = STATE_DTD_AFTER_PUBLIC; } else if ("SYSTEM".equals(str)) { _state = STATE_DTD_AFTER_SYSTEM; } else { reportPrologProblem(true, "unexpected token '"+str+"': expected either PUBLIC or SYSTEM"); } } continue main_loop; case STATE_DTD_PUBLIC_OR_SYSTEM: { PName name; if ((name = parsePName()) == null) { _state = STATE_DTD_PUBLIC_OR_SYSTEM; break; } String str = name.getPrefixedName(); if ("PUBLIC".equals(str)) { _state = STATE_DTD_AFTER_PUBLIC; } else if ("SYSTEM".equals(str)) { _state = STATE_DTD_AFTER_SYSTEM; } else { reportPrologProblem(true, "unexpected token '"+str+"': expected either PUBLIC or SYSTEM"); } } continue main_loop; case STATE_DTD_AFTER_PUBLIC: { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_DTD_BEFORE_PUBLIC_ID; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after PUBLIC keyword)"); } } continue main_loop; case STATE_DTD_AFTER_SYSTEM: { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_DTD_BEFORE_SYSTEM_ID; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after SYSTEM keyword)"); } } continue main_loop; case STATE_DTD_BEFORE_PUBLIC_ID: if (!asyncSkipSpace()) { break; } _elemAttrQuote = _inputBuffer.get(_inputPtr++); if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) { 
reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' for PUBLIC ID)"); } { char[] buf = _textBuilder.resetWithEmpty(); if (_inputPtr >= _inputEnd || !parseDtdId(buf, 0, false)) { _state = STATE_DTD_PUBLIC_ID; break; } } verifyAndSetPublicId(); _state = STATE_DTD_AFTER_PUBLIC_ID; continue main_loop; case STATE_DTD_PUBLIC_ID: if (!parseDtdId(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength(), false)) { break; } verifyAndSetPublicId(); _state = STATE_DTD_AFTER_PUBLIC_ID; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_DTD_AFTER_PUBLIC_ID: { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { _state = STATE_DTD_BEFORE_SYSTEM_ID; } else { reportPrologUnexpChar(true, decodeCharForError(b), " (expected space after PUBLIC ID)"); } } // fall through (ok to skip bounds checks, async-skip does it) case STATE_DTD_BEFORE_SYSTEM_ID: if (!asyncSkipSpace()) { break; } _elemAttrQuote = _inputBuffer.get(_inputPtr++); if (_elemAttrQuote != BYTE_QUOT && _elemAttrQuote != BYTE_APOS) { reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected '\"' or ''' for SYSTEM ID)"); } { char[] buf = _textBuilder.resetWithEmpty(); if (_inputPtr >= _inputEnd || !parseDtdId(buf, 0, true)) { _state = STATE_DTD_SYSTEM_ID; break; } } verifyAndSetSystemId(); _state = STATE_DTD_AFTER_SYSTEM_ID; continue main_loop; case STATE_DTD_SYSTEM_ID: if (!parseDtdId(_textBuilder.getBufferWithoutReset(), _textBuilder.getCurrentLength(), true)) { break; } verifyAndSetSystemId(); _state = STATE_DTD_AFTER_SYSTEM_ID; if (_inputPtr >= _inputEnd) { break; } // fall through case STATE_DTD_AFTER_SYSTEM_ID: if (!asyncSkipSpace()) { break; } { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_GT) { _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return DTD; } if (b != BYTE_LBRACKET) { reportPrologUnexpChar(true, decodeCharForError(_elemAttrQuote), " (expected either 
'[' for internal subset, or '>' to end DOCTYPE)"); } } _state = STATE_DTD_INT_SUBSET; if (handleDTDInternalSubset(true)) { _state = STATE_DTD_EXPECT_CLOSING_GT; } else { return EVENT_INCOMPLETE; } // fall through case STATE_DTD_EXPECT_CLOSING_GT: if (!asyncSkipSpace()) { break; } { byte b = _inputBuffer.get(_inputPtr++); if (b != BYTE_GT) { reportPrologUnexpChar(true, b, "expected '>' to end DTD"); } } _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return DTD; default: throwInternal(); } } return _currToken; } private final boolean parseDtdId(char[] outputBuffer, int outputPtr, boolean system) throws XMLStreamException { final int quote = (int) _elemAttrQuote; while (_inputPtr < _inputEnd) { int ch = _inputBuffer.get(_inputPtr++) & 0xFF; if (ch == quote) { _textBuilder.setCurrentLength(outputPtr); return true; } if (!system && !validPublicIdChar(ch)) { reportPrologUnexpChar(true, decodeCharForError((byte) ch), " (not valid in " + (system ? "SYSTEM" : "PUBLIC") + " ID)"); } if (outputPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outputPtr = 0; } outputBuffer[outputPtr++] = (char) ch; } _textBuilder.setCurrentLength(outputPtr); return false; } /** * Method called to try to parse an XML pseudo-attribute value. This is relatively * simple, since we can't have linefeeds or entities; and although there are exact * rules for what is allowed, we can do coarse parsing and only later on verify * validity (for encoding could do stricter parsing in future?) 
* * @return True if we managed to parse the whole pseudo-attribute */ private boolean parseXmlDeclAttr(char[] outputBuffer, int outputPtr) throws XMLStreamException { final int quote = (int) _elemAttrQuote; while (_inputPtr < _inputEnd) { int ch = _inputBuffer.get(_inputPtr++) & 0xFF; if (ch == quote) { _textBuilder.setCurrentLength(outputPtr); return true; } // this is not exact check; but does work for all legal (valid) characters: if (ch <= INT_SPACE || ch > INT_z) { reportPrologUnexpChar(true, decodeCharForError((byte) ch), " (not valid in XML pseudo-attribute values)"); } if (outputPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outputPtr = 0; } outputBuffer[outputPtr++] = (char) ch; } _textBuilder.setCurrentLength(outputPtr); return false; } /* /********************************************************************** /* Second-level parsing; character content (in tree) /********************************************************************** */ private int handleCData() throws XMLStreamException { if (_state == STATE_CDATA_CONTENT) { return parseCDataContents(); } if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } return handleCDataStartMarker(_inputBuffer.get(_inputPtr++)); } private int handleCDataStartMarker(byte b) throws XMLStreamException { switch (_state) { case STATE_DEFAULT: if (b != BYTE_C) { reportTreeUnexpChar(decodeCharForError(b), " (expected 'C' for CDATA)"); } _state = STATE_CDATA_C; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } b = _inputBuffer.get(_inputPtr++); // fall through case STATE_CDATA_C: if (b != BYTE_D) { reportTreeUnexpChar(decodeCharForError(b), " (expected 'D' for CDATA)"); } _state = STATE_CDATA_CD; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } b = _inputBuffer.get(_inputPtr++); // fall through case STATE_CDATA_CD: if (b != BYTE_A) { reportTreeUnexpChar(decodeCharForError(b), " (expected 'A' for CDATA)"); } _state = STATE_CDATA_CDA; if (_inputPtr >= _inputEnd) { return 
EVENT_INCOMPLETE; } b = _inputBuffer.get(_inputPtr++); // fall through case STATE_CDATA_CDA: if (b != BYTE_T) { reportTreeUnexpChar(decodeCharForError(b), " (expected 'T' for CDATA)"); } _state = STATE_CDATA_CDAT; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } b = _inputBuffer.get(_inputPtr++); // fall through case STATE_CDATA_CDAT: if (b != BYTE_A) { reportTreeUnexpChar(decodeCharForError(b), " (expected 'A' for CDATA)"); } _state = STATE_CDATA_CDATA; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } b = _inputBuffer.get(_inputPtr++); // fall through case STATE_CDATA_CDATA: if (b != BYTE_LBRACKET) { reportTreeUnexpChar(decodeCharForError(b), " (expected '[' for CDATA)"); } _textBuilder.resetWithEmpty(); _state = STATE_CDATA_CONTENT; if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } return parseCDataContents(); } return throwInternal(); } /* /********************************************************************** /* Second-level parsing; other (PI, Comment) /********************************************************************** */ private int handlePI() throws XMLStreamException { // Most common case first: if (_state == STATE_PI_IN_DATA) { return parsePIData(); } main_loop: while (true) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } switch (_state) { case STATE_DEFAULT: _tokenName = parseNewName(_inputBuffer.get(_inputPtr++)); if (_tokenName == null) { _state = STATE_PI_IN_TARGET; return EVENT_INCOMPLETE; } _state = STATE_PI_AFTER_TARGET; checkPITargetName(_tokenName); if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } // fall through case STATE_PI_AFTER_TARGET: // Need ws or "?>" { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_QMARK) { // Quick check, can we see '>' as well? 
All done, if so if (_inputPtr < _inputEnd && _inputBuffer.get(_inputPtr) == BYTE_GT) { ++_inputPtr; break main_loop; // means we are done } // If not (whatever reason), let's move to check state _state = STATE_PI_AFTER_TARGET_QMARK; break; } if (b == BYTE_SPACE || b == BYTE_CR || b == BYTE_LF || b == BYTE_TAB) { if (!asyncSkipSpace()) { // ran out of input? _state = STATE_PI_AFTER_TARGET_WS; return EVENT_INCOMPLETE; } _textBuilder.resetWithEmpty(); // Quick check, perhaps we'll see end marker? if ((_inputPtr+1) < _inputEnd && _inputBuffer.get(_inputPtr) == BYTE_QMARK && _inputBuffer.get(_inputPtr+1) == BYTE_GT) { _inputPtr += 2; break main_loop; // means we are done } // If not, we'll move to 'data' portion of PI _state = STATE_PI_IN_DATA; return parsePIData(); } // Otherwise, it's an error reportMissingPISpace(decodeCharForError(b)); } // fall through case STATE_PI_AFTER_TARGET_WS: if (!asyncSkipSpace()) { // ran out of input? return EVENT_INCOMPLETE; } // Can just move to "data" portion right away _state = STATE_PI_IN_DATA; _textBuilder.resetWithEmpty(); return parsePIData(); case STATE_PI_AFTER_TARGET_QMARK: { // Must get '>' following '?' 
we saw right after name byte b = _inputBuffer.get(_inputPtr++); // Otherwise, it's an error if (b != BYTE_GT) { reportMissingPISpace(decodeCharForError(b)); } } // but if it's ok, we are done break main_loop; case STATE_PI_IN_TARGET: _tokenName = parsePName(); if (_tokenName == null) { return EVENT_INCOMPLETE; } checkPITargetName(_tokenName); _state = STATE_PI_AFTER_TARGET; break; default: return throwInternal(); } } _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return PROCESSING_INSTRUCTION; } private final int handleComment() throws XMLStreamException { if (_state == STATE_COMMENT_CONTENT) { return parseCommentContents(); } if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } byte b = _inputBuffer.get(_inputPtr++); if (_state == STATE_DEFAULT) { if (b != BYTE_HYPHEN) { reportTreeUnexpChar(decodeCharForError(b), " (expected '-' for COMMENT)"); } _state = STATE_COMMENT_CONTENT; _textBuilder.resetWithEmpty(); return parseCommentContents(); } if (_state == STATE_COMMENT_HYPHEN2) { // We are almost done, just need to get '>' at the end if (b != BYTE_GT) { reportDoubleHyphenInComments(); } _state = STATE_DEFAULT; _nextEvent = EVENT_INCOMPLETE; return COMMENT; } return throwInternal(); } /* /********************************************************************** /* Second-level parsing; helper methods /********************************************************************** */ /** * Method to skip whatever space can be skipped. *<p> * NOTE: if available content ends with a CR, method will set * <code>_pendingInput</code> to <code>PENDING_STATE_CR</code>. * * @return True, if was able to skip through the space and find * a non-space byte; false if reached end-of-buffer */ private boolean asyncSkipSpace() throws XMLStreamException { while (_inputPtr < _inputEnd) { byte b = _inputBuffer.get(_inputPtr); if ((b & 0xFF) > INT_SPACE) { // hmmmh. Shouldn't this be handled someplace else? 
if (_pendingInput == PENDING_STATE_CR) { markLF(); _pendingInput = 0; } return true; } ++_inputPtr; if (b == BYTE_LF) { markLF(); } else if (b == BYTE_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; break; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); } else if (b != BYTE_SPACE && b != BYTE_TAB) { throwInvalidSpace(b); } } return false; } /** * Method called when a new token (within tree) starts with an * entity. * * @return Type of event to return */ protected int handleEntityStartingToken() throws XMLStreamException { _textBuilder.resetWithEmpty(); byte b = _inputBuffer.get(_inputPtr++); // we know one is available if (b == BYTE_HASH) { // numeric character entity _textBuilder.resetWithEmpty(); _state = STATE_TREE_NUMERIC_ENTITY_START; _pendingInput = PENDING_STATE_ENT_SEEN_HASH; if (_inputPtr >= _inputEnd) { // but no more content to parse yet return EVENT_INCOMPLETE; } return handleNumericEntityStartingToken(); } PName n = parseNewEntityName(b); // null if incomplete; non-null otherwise if (n == null) { // Not sure if it's a char entity or general one; so we don't yet know type _state = STATE_TREE_NAMED_ENTITY_START; return EVENT_INCOMPLETE; } int ch = decodeGeneralEntity(n); if (ch == 0) { // not a character entity _tokenName = n; return (_nextEvent = _currToken = ENTITY_REFERENCE); } // character entity; initialize buffer, _textBuilder.resetWithChar((char)ch); _nextEvent = 0; _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } return _currToken; } /** * Method called when we see an entity that is starting a new token, * and part of its name has been decoded (but not all) */ protected int handleNamedEntityStartingToken() throws XMLStreamException { PName n = parseEntityName(); // null if incomplete; non-null otherwise if (n == null) { return _nextEvent; // i.e. 
EVENT_INCOMPLETE } int ch = decodeGeneralEntity(n); if (ch == 0) { // not a character entity _tokenName = n; return (_currToken = ENTITY_REFERENCE); } // character entity; initialize buffer, _textBuilder.resetWithChar((char)ch); _nextEvent = 0; _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } return _currToken; } /** * Method called to handle cases where we find something other than * a character entity (or one of 4 pre-defined general entities that * act like character entities) */ protected int handleNumericEntityStartingToken() throws XMLStreamException { if (_pendingInput == PENDING_STATE_ENT_SEEN_HASH) { byte b = _inputBuffer.get(_inputPtr); // we know one is available _entityValue = 0; if (b == BYTE_x) { // 'x' marks hex _pendingInput = PENDING_STATE_ENT_IN_HEX_DIGIT; if (++_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } } else { // if not 'x', must be a digit _pendingInput = PENDING_STATE_ENT_IN_DEC_DIGIT; // let's just keep byte for calculation } } if (_pendingInput == PENDING_STATE_ENT_IN_HEX_DIGIT) { if (!decodeHexEntity()) { return EVENT_INCOMPLETE; } } else { if (!decodeDecEntity()) { return EVENT_INCOMPLETE; } } // and now we have the full value verifyAndAppendEntityCharacter(_entityValue); _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } _pendingInput = 0; return _currToken; } /** * @return True if entity was decoded (and value assigned to <code>_entityValue</code>; * false otherwise */ protected final boolean decodeHexEntity() throws XMLStreamException { int value = _entityValue; while (_inputPtr < _inputEnd) { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_SEMICOLON) { _entityValue = value; return true; } int ch = (int) b; if (ch <= INT_9 && ch >= INT_0) { ch -= INT_0; } else if (ch <= INT_F && ch >= INT_A) { ch = 10 + (ch - INT_A); } else if (ch <= INT_f && ch >= INT_a) { ch = 10 + (ch - INT_a); } else { 
throwUnexpectedChar(decodeCharForError(b), " expected a hex digit (0-9a-fA-F) for character entity"); } value = (value << 4) + ch; if (value > MAX_UNICODE_CHAR) { // Overflow? _entityValue = value; reportEntityOverflow(); } } _entityValue = value; return false; } /** * @return True if entity was decoded (and value assigned to <code>_entityValue</code>; * false otherwise */ protected final boolean decodeDecEntity() throws XMLStreamException { int value = _entityValue; while (_inputPtr < _inputEnd) { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_SEMICOLON) { _entityValue = value; return true; } int ch = ((int) b) - INT_0; if (ch < 0 || ch > 9) { // invalid entity throwUnexpectedChar(decodeCharForError(b), " expected a digit (0 - 9) for character entity"); } value = (value * 10) + ch; if (value > MAX_UNICODE_CHAR) { // Overflow? _entityValue = value; reportEntityOverflow(); } } _entityValue = value; return false; } /** * Method that verifies that given named entity is followed by * a semi-colon (meaning next byte must be available for reading); * and if so, whether it is one of pre-defined general entities. * * @return Character of the expanded pre-defined general entity * (if name matches one); zero if not. */ protected final int decodeGeneralEntity(PName entityName) throws XMLStreamException { // First things first: verify that we got semicolon afterwards byte b = _inputBuffer.get(_inputPtr++); if (b != BYTE_SEMICOLON) { throwUnexpectedChar(decodeCharForError(b), " expected ';' following entity name (\""+entityName.getPrefixedName()+"\")"); } String name = entityName.getPrefixedName(); if (name == "amp") { return INT_AMP; } if (name == "lt") { return INT_LT; } if (name == "apos") { return INT_APOS; } if (name == "quot") { return INT_QUOTE; } if (name == "gt") { return INT_GT; } return 0; } /** * Method called when '<' and (what appears to be) a name * start character have been seen. 
/**
 * State machine for parsing a start (or empty) element after '&lt;':
 * element name, then any number of attributes (name, '=', quoted value),
 * up to the closing "&gt;" or "/&gt;". Current position is kept in
 * <code>_state</code>, so the method can be called again to continue
 * when more input becomes available.
 *
 * @return <code>EVENT_INCOMPLETE</code> whenever input runs out before the
 *   element is complete; otherwise the value returned by
 *   <code>finishStartElement()</code>
 */
protected int handleStartElement() throws XMLStreamException
{
    main_loop:
    while (true) {
        // Every state below needs at least one byte to make progress
        if (_inputPtr >= _inputEnd) {
            return EVENT_INCOMPLETE;
        }

        byte b;
        int c;

        switch (_state) {
        case STATE_SE_ELEM_NAME: // continuing a partially read element name
            {
                PName elemName = parsePName();
                if (elemName == null) {
                    return EVENT_INCOMPLETE;
                }
                // note: this also moves state to STATE_SE_SPACE_OR_END
                initStartElement(elemName);
            }
            if (_inputPtr >= _inputEnd) {
                return EVENT_INCOMPLETE;
            }
            // Fall through to next state

        case STATE_SE_SPACE_OR_END: // obligatory space, or end
            if (_pendingInput != 0) {
                // Pending input left by an earlier call (a CR that ended the previous block, see below)
                if (!handlePartialCR()) {
                    return EVENT_INCOMPLETE;
                }
                // Ok, got a space, can move on
            } else {
                b = _inputBuffer.get(_inputPtr++);
                c = (int) b & 0xFF;

                if (c <= INT_SPACE) {
                    if (c == INT_LF) {
                        markLF();
                    } else if (c == INT_CR) {
                        // Need the following byte to see if this is CR+LF
                        if (_inputPtr >= _inputEnd) {
                            _pendingInput = PENDING_STATE_CR;
                            return EVENT_INCOMPLETE;
                        }
                        if (_inputBuffer.get(_inputPtr) == BYTE_LF) {
                            ++_inputPtr;
                        }
                        markLF();
                    } else if (c != INT_SPACE && c != INT_TAB) {
                        throwInvalidSpace(c);
                    }
                } else if (c == INT_GT) { // must be '/' or '>'
                    return finishStartElement(false);
                } else if (c == INT_SLASH) {
                    _state = STATE_SE_SEEN_SLASH;
                    continue main_loop;
                } else {
                    throwUnexpectedChar(decodeCharForError(b), " expected space, or '>' or \"/>\"");
                }
            }
            _state = STATE_SE_SPACE_OR_ATTRNAME;
            if (_inputPtr >= _inputEnd) {
                return EVENT_INCOMPLETE;
            }
            // can fall through, again:

        case STATE_SE_SPACE_OR_ATTRNAME:
        case STATE_SE_SPACE_OR_EQ:
        case STATE_SE_SPACE_OR_ATTRVALUE:
            /* Common to these states is that there may be leading space(s),
             * so let's see if any has to be skipped
             */
            if (_pendingInput != 0) {
                if (!handlePartialCR()) {
                    return EVENT_INCOMPLETE;
                }
                if (_inputPtr >= _inputEnd) {
                    return EVENT_INCOMPLETE;
                }
            }
            b = _inputBuffer.get(_inputPtr++);
            c = (int) b & 0xFF;

            // Skip white space; loop ends with first non-space byte in 'b'
            while (c <= INT_SPACE) {
                if (c == INT_LF) {
                    markLF();
                } else if (c == INT_CR) {
                    if (_inputPtr >= _inputEnd) {
                        _pendingInput = PENDING_STATE_CR;
                        return EVENT_INCOMPLETE;
                    }
                    if (_inputBuffer.get(_inputPtr) == BYTE_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                } else if (c != INT_SPACE && c != INT_TAB) {
                    throwInvalidSpace(c);
                }
                // State is unchanged here, so skipping resumes on next call
                if (_inputPtr >= _inputEnd) {
                    return EVENT_INCOMPLETE;
                }
                b = _inputBuffer.get(_inputPtr++);
                c = (int) b & 0xFF;
            }

            // Then interpret the non-space byte, depending on what is expected
            switch (_state) {
            case STATE_SE_SPACE_OR_ATTRNAME: // attribute name, or end of element
                if (b == BYTE_SLASH) {
                    _state = STATE_SE_SEEN_SLASH;
                    continue main_loop;
                }
                if (b == BYTE_GT) {
                    return finishStartElement(false);
                }
                {
                    PName n = parseNewName(b);
                    if (n == null) { // name continues in the next block
                        _state = STATE_SE_ATTR_NAME;
                        return EVENT_INCOMPLETE;
                    }
                    _state = STATE_SE_SPACE_OR_EQ;
                    _elemAttrName = n;
                }
                continue main_loop;
            case STATE_SE_SPACE_OR_EQ:
                if (b != BYTE_EQ) {
                    throwUnexpectedChar(decodeCharForError(b), " expected '='");
                }
                _state = STATE_SE_SPACE_OR_ATTRVALUE;
                continue main_loop;
            case STATE_SE_SPACE_OR_ATTRVALUE:
                if (b != BYTE_QUOT && b != BYTE_APOS) {
                    throwUnexpectedChar(decodeCharForError(b), " Expected a quote");
                }
                // sets state to one of the attribute value states
                initAttribute(b);
                continue main_loop;
            default:
                // NOTE(review): presumably always throws; otherwise control
                // would continue into the next case -- confirm
                throwInternal();
            }

        case STATE_SE_ATTR_NAME: // continuing a partially read attribute name
            {
                PName n = parsePName();
                if (n == null) {
                    return EVENT_INCOMPLETE;
                }
                _elemAttrName = n;
                _state = STATE_SE_SPACE_OR_EQ;
            }
            break;

        case STATE_SE_ATTR_VALUE_NORMAL: // within value of a regular attribute
            if (!handleAttrValue()) {
                return EVENT_INCOMPLETE;
            }
            _state = STATE_SE_SPACE_OR_END;
            break;

        case STATE_SE_ATTR_VALUE_NSDECL: // within value of a namespace declaration
            if (!handleNsDecl()) {
                return EVENT_INCOMPLETE;
            }
            _state = STATE_SE_SPACE_OR_END;
            break;

        case STATE_SE_SEEN_SLASH: // '/' seen; only '>' is acceptable
            {
                b = _inputBuffer.get(_inputPtr++);
                if (b != BYTE_GT) {
                    throwUnexpectedChar(decodeCharForError(b), " expected '>'");
                }
                return finishStartElement(true);
            }

        default:
            throwInternal();
        }
    }
}
_elemAllNsBound = elemName.isBound(); } _tokenName = elemName; _currElem = new ElementScope(elemName, _currElem); _attrCount = 0; _currNsCount = 0; _elemAttrPtr = 0; _state = STATE_SE_SPACE_OR_END; } private void initAttribute(byte quoteChar) { _elemAttrQuote = quoteChar; PName attrName = _elemAttrName; String prefix = attrName.getPrefix(); boolean nsDecl; if (prefix == null) { // can be default ns decl: nsDecl = (attrName.getLocalName() == "xmlns"); } else { // May be a namespace decl though? if (prefix == "xmlns") { nsDecl = true; } else { attrName = bindName(attrName, prefix); if (_elemAllNsBound) { _elemAllNsBound = attrName.isBound(); } nsDecl = false; } } if (nsDecl) { _state = STATE_SE_ATTR_VALUE_NSDECL; // Ns decls use name buffer transiently _elemNsPtr = 0; ++_currNsCount; } else { _state = STATE_SE_ATTR_VALUE_NORMAL; // Regular attributes are appended, shouldn't reset ptr _attrCollector.startNewValue(attrName, _elemAttrPtr); } } /** * Method called to wrap up settings when the whole start * (or empty) element has been parsed. */ private int finishStartElement(boolean emptyTag) throws XMLStreamException { _isEmptyTag = emptyTag; // Note: this call also checks attribute uniqueness int act = _attrCollector.finishLastValue(_elemAttrPtr); if (act < 0) { // error, dup attr indicated by -1 act = _attrCollector.getCount(); // let's get correct count reportInputProblem(_attrCollector.getErrorMsg()); } _attrCount = act; ++_depth; /* Was there any prefix that wasn't bound prior to use? * That's legal, assuming declaration was found later on... 
* let's check */ if (!_elemAllNsBound) { if (!_tokenName.isBound()) { // element itself unbound reportUnboundPrefix(_tokenName, false); } for (int i = 0, len = _attrCount; i < len; ++i) { PName attrName = _attrCollector.getName(i); if (!attrName.isBound()) { reportUnboundPrefix(attrName, true); } } } return (_currToken = START_ELEMENT); } private int handleEndElementStart() throws XMLStreamException { --_depth; _tokenName = _currElem.getName(); /* Ok, perhaps we can do this quickly? This works, if we * are expected to have the full name (plus one more byte * to indicate name end) in the current buffer: */ int size = _tokenName.sizeInQuads(); if ((_inputEnd - _inputPtr) < ((size << 2) + 1)) { // may need to load more _nextEvent = END_ELEMENT; _state = STATE_DEFAULT; _quadCount = _currQuad = _currQuadBytes = 0; /* No, need to take it slow. Can not yet give up, though, * without reading remainder of the buffer */ return handleEndElement(); } ByteBuffer buf = _inputBuffer; // First all full chunks of 4 bytes (if any) --size; for (int qix = 0; qix < size; ++qix) { int ptr = _inputPtr; int q = (buf.get(ptr) << 24) | ((buf.get(ptr+1) & 0xFF) << 16) | ((buf.get(ptr+2) & 0xFF) << 8) | ((buf.get(ptr+3) & 0xFF)) ; _inputPtr += 4; // match? if (q != _tokenName.getQuad(qix)) { reportUnexpectedEndTag(_tokenName.getPrefixedName()); } } /* After which we can deal with the last entry: it's bit * tricky as we don't actually fully know byte length... */ int lastQ = _tokenName.getQuad(size); int q = buf.get(_inputPtr++) & 0xFF; if (q != lastQ) { // need second byte? q = (q << 8) | (buf.get(_inputPtr++) & 0xFF); if (q != lastQ) { // need third byte? q = (q << 8) | (buf.get(_inputPtr++) & 0xFF); if (q != lastQ) { // need full 4 bytes? q = (q << 8) | (buf.get(_inputPtr++) & 0xFF); if (q != lastQ) { // still no match? failure! reportUnexpectedEndTag(_tokenName.getPrefixedName()); } } } } // Trailing space? 
int i2 = _inputBuffer.get(_inputPtr++) & 0xFF; while (i2 <= INT_SPACE) { if (i2 == INT_LF) { markLF(); } else if (i2 == INT_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; _nextEvent = END_ELEMENT; _state = STATE_EE_NEED_GT; return EVENT_INCOMPLETE; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); } else if (i2 != INT_SPACE && i2 != INT_TAB) { throwInvalidSpace(i2); } if (_inputPtr >= _inputEnd) { _nextEvent = END_ELEMENT; _state = STATE_EE_NEED_GT; return EVENT_INCOMPLETE; } i2 = _inputBuffer.get(_inputPtr++) & 0xFF; } if (i2 != INT_GT) { throwUnexpectedChar(decodeCharForError((byte)i2), " expected space or closing '>'"); } return (_currToken = END_ELEMENT); } /** * This method is "slow" version of above, used when name of * the end element can split input buffer boundary */ private int handleEndElement() throws XMLStreamException { if (_state == STATE_DEFAULT) { // parsing name final PName elemName = _tokenName; final int quadSize = elemName.sizeInQuads() - 1; // need to ignore last for now for (; _quadCount < quadSize; ++_quadCount) { // first, full quads for (; _currQuadBytes < 4; ++_currQuadBytes) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } _currQuad = (_currQuad << 8) | (_inputBuffer.get(_inputPtr++) & 0xFF); } // match? if (_currQuad != elemName.getQuad(_quadCount)) { reportUnexpectedEndTag(elemName.getPrefixedName()); } _currQuad = _currQuadBytes = 0; } // So far so good! Now need to check the last quad: int lastQ = elemName.getLastQuad(); while (true) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } int q = (_currQuad << 8); q |= (_inputBuffer.get(_inputPtr++) & 0xFF); _currQuad = q; if (q == lastQ) { // match break; } if (++_currQuadBytes > 3) { // no match, error reportUnexpectedEndTag(elemName.getPrefixedName()); break; // never gets here } } // Bueno. How about optional space, '>'? 
_state = STATE_EE_NEED_GT; } else if (_state != STATE_EE_NEED_GT) { throwInternal(); } if (_pendingInput != 0) { if (!handlePartialCR()) { return EVENT_INCOMPLETE; } // it's ignorable ws } // Trailing space? while (true) { if (_inputPtr >= _inputEnd) { return EVENT_INCOMPLETE; } int i2 = _inputBuffer.get(_inputPtr++) & 0xFF; if (i2 <= INT_SPACE) { if (i2 == INT_LF) { markLF(); } else if (i2 == INT_CR) { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; return EVENT_INCOMPLETE; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); } else if (i2 != INT_SPACE && i2 != INT_TAB) { throwInvalidSpace(i2); } continue; } if (i2 != INT_GT) { throwUnexpectedChar(decodeCharForError((byte)i2), " expected space or closing '>'"); } // Hah, done! return (_currToken = END_ELEMENT); } } /* /********************************************************************** /* Implementation of parsing API, character events /********************************************************************** */ @Override protected final int startCharacters(byte b) throws XMLStreamException { dummy_loop: do { // dummy loop, to allow break int c = (int) b & 0xFF; switch (_charTypes.TEXT_CHARS[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: /* Note: can not have pending input when this method * is called. 
No need to check that (could assert) */ if (_inputPtr >= _inputEnd) { // no more input available _pendingInput = PENDING_STATE_CR; return EVENT_INCOMPLETE; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: if (_inputPtr >= _inputEnd) { _pendingInput = c; return EVENT_INCOMPLETE; } c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: if ((_inputEnd - _inputPtr) < 2) { if (_inputEnd > _inputPtr) { // 2 bytes available int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); } _pendingInput = c; return EVENT_INCOMPLETE; } c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: if ((_inputEnd - _inputPtr) < 3) { if (_inputEnd > _inputPtr) { // at least 2 bytes? int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); if (_inputEnd > _inputPtr) { // 3 bytes? d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 16); } } _pendingInput = c; return EVENT_INCOMPLETE; } c = decodeUtf8_4(c); // Need a surrogate pair, have to call from here: _textBuilder.resetWithSurrogate(c); break dummy_loop; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); break; case XmlCharTypes.CT_LT: // should never get here case XmlCharTypes.CT_AMP: // - "" - throwInternal(); break; case XmlCharTypes.CT_RBRACKET: // ']]>'? // !!! TBI: check for "]]>" default: break; } _textBuilder.resetWithChar((char) c); } while (false); // dummy loop, for break if (_cfgCoalescing && !_cfgLazyParsing) { // In eager coalescing mode, must read it all return finishCharactersCoalescing(); } _currToken = CHARACTERS; if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } return _currToken; } /** * This method only gets called in non-coalescing mode; and if so, * needs to parse as many characters of the current text segment * from the current input block as possible. 
*/ @Override protected final void finishCharacters() throws XMLStreamException { /* Now: there should not usually be any pending input (as it's * handled when CHARACTERS segment started, and this method * only gets called exactly once)... but we may want to * revisit this subject when (if) coalescing mode is to be * tackled. */ if (_pendingInput != 0) { // !!! TBI: needs to be changed for coalescing mode throwInternal(); } final int[] TYPES = _charTypes.TEXT_CHARS; final ByteBuffer inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.getBufferWithoutReset(); // Should have just one code point (one or two chars). Assert? int outPtr = _textBuilder.getCurrentLength(); main_loop: while (true) { int c; // Then the tight ASCII non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { break main_loop; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = (int) inputBuffer.get(ptr++) & 0xFF; if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = (char) c; } _inputPtr = ptr; } // And then fallback for funny chars / UTF-8 multibytes: switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; break main_loop; } if (inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } markLF(); } c = INT_LF; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_MULTIBYTE_2: if (_inputPtr >= _inputEnd) { _pendingInput = c; break main_loop; } c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: if ((_inputEnd - _inputPtr) < 2) { if (_inputEnd > _inputPtr) { // 2 bytes available int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); } _pendingInput = c; break main_loop; } c = decodeUtf8_3(c); 
break; case XmlCharTypes.CT_MULTIBYTE_4: if ((_inputEnd - _inputPtr) < 3) { if (_inputEnd > _inputPtr) { // at least 2 bytes? int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); if (_inputEnd > _inputPtr) { // 3 bytes? d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 16); } } _pendingInput = c; break main_loop; } c = decodeUtf8_4(c); // Let's add first part right away: outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = 0xDC00 | (c & 0x3FF); // And let the other char output down below break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: --_inputPtr; break main_loop; case XmlCharTypes.CT_AMP: c = handleEntityInCharacters(); if (c == 0) { // not a succesfully expanded char entity // _inputPtr set by entity expansion method --_inputPtr; break main_loop; } // Ok; does it need a surrogate though? (over 16 bits) if ((c >> 16) != 0) { c -= 0x10000; outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); // Need to ensure room for one more char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = 0xDC00 | (c & 0x3FF); } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? /* 09-Mar-2007, tatus: This will not give 100% coverage, * for it may be split across input buffer boundary. * For now this will have to suffice though. */ { // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; byte b = BYTE_NULL; while (_inputPtr < _inputEnd) { b = inputBuffer.get(_inputPtr); if (b != BYTE_RBRACKET) { break; } ++_inputPtr; // to skip past bracket ++count; } if (b == BYTE_GT && count > 1) { reportIllegalCDataEnd(); } // Nope. 
Need to output all brackets, then; except // for one that can be left for normal output while (--count > 0) { outputBuffer[outPtr++] = ']'; // Need to ensure room for one more char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } } } // Can just output the first ']' along normal output break; // default: // Other types are not important here... } // We know there's room for one more: outputBuffer[outPtr++] = (char) c; } _textBuilder.setCurrentLength(outPtr); } /** * Method called to handle entity encountered inside * CHARACTERS segment, when trying to complete a non-coalescing text segment. *<p> * NOTE: unlike with generic parsing of named entities, where trailing semicolon * needs to be left in place, here we should just process it right away. * * @return Expanded (character) entity, if positive number; 0 if incomplete. */ protected int handleEntityInCharacters() throws XMLStreamException { /* Thing that simplifies processing here is that handling * is pretty much optional: if there isn't enough data, we * just return 0 and are done with it. * * Also: we need at least 3 more characters for any character entity */ int ptr = _inputPtr; if ((ptr + 3) <= _inputEnd) { byte b = _inputBuffer.get(ptr++); if (b == BYTE_HASH) { // numeric character entity if (_inputBuffer.get(ptr) == BYTE_x) { return handleHexEntityInCharacters(ptr+1); } return handleDecEntityInCharacters(ptr); } // general entity; maybe one of pre-defined ones if (b == BYTE_a) { // amp or apos? b = _inputBuffer.get(ptr++); if (b == BYTE_m) { if ((ptr + 1) < _inputPtr && _inputBuffer.get(ptr) == BYTE_p && _inputBuffer.get(ptr+1) == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_AMP; } } else if (b == BYTE_p) { if ((ptr + 2) < _inputPtr && _inputBuffer.get(ptr) == BYTE_o && _inputBuffer.get(ptr+1) == BYTE_s && _inputBuffer.get(ptr+2) == BYTE_SEMICOLON) { _inputPtr = ptr + 3; return INT_APOS; } } } else if (b == BYTE_g) { // gt? 
if (_inputBuffer.get(ptr) == BYTE_t && _inputBuffer.get(ptr+1) == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_GT; } } else if (b == BYTE_l) { // lt? if (_inputBuffer.get(ptr) == BYTE_t && _inputBuffer.get(ptr+1) == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_LT; } } else if (b == BYTE_q) { // quot? if ((ptr + 3) < _inputPtr && _inputBuffer.get(ptr)== BYTE_u && _inputBuffer.get(ptr+1) == BYTE_o && _inputBuffer.get(ptr+2) == BYTE_t && _inputBuffer.get(ptr+3) == BYTE_SEMICOLON) { _inputPtr = ptr + 4; return INT_APOS; } } } // couldn't handle: return 0; } protected int handleDecEntityInCharacters(int ptr) throws XMLStreamException { byte b = _inputBuffer.get(ptr++); final int end = _inputEnd; int value = 0; do { int ch = (int) b; if (ch > INT_9 || ch < INT_0) { throwUnexpectedChar(decodeCharForError(b), " expected a digit (0 - 9) for character entity"); } value = (value * 10) + (ch - INT_0); if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } if (ptr >= end) { return 0; } b = _inputBuffer.get(ptr++); } while (b != BYTE_SEMICOLON); _inputPtr = ptr; verifyXmlChar(value); return value; } protected int handleHexEntityInCharacters(int ptr) throws XMLStreamException { byte b = _inputBuffer.get(ptr++); final int end = _inputEnd; int value = 0; do { int ch = (int) b; if (ch <= INT_9 && ch >= INT_0) { ch -= INT_0; } else if (ch <= INT_F && ch >= INT_A) { ch = 10 + (ch - INT_A); } else if (ch <= INT_f && ch >= INT_a) { ch = 10 + (ch - INT_a); } else { throwUnexpectedChar(decodeCharForError(b), " expected a hex digit (0-9a-fA-F) for character entity"); } value = (value << 4) + ch; if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } if (ptr >= end) { return 0; } b = _inputBuffer.get(ptr++); } while (b != BYTE_SEMICOLON); _inputPtr = ptr; verifyXmlChar(value); return value; } /** * Method called to handle split multi-byte character, by decoding * it and appending to the text buffer, if possible. 
*
     * @return True, if split character was completely handled; false
     *    if not
     */
    private final boolean handleAndAppendPending() throws XMLStreamException
    {
        // First, need to have at least one more byte:
        if (_inputPtr >= _inputEnd) {
            return false;
        }
        // Pending state is cleared up front; it is set again below if
        // the character turns out to still be incomplete
        int c = _pendingInput;
        _pendingInput = 0;

        // Possible \r\n linefeed?
        if (c < 0) { // markers are all negative
            if (c == PENDING_STATE_CR) {
                if (_inputBuffer.get(_inputPtr) == BYTE_LF) {
                    ++_inputPtr;
                }
                markLF();
                _textBuilder.append(CHAR_LF);
                return true;
            }
            throwInternal();
        }

        // Nah, a multi-byte UTF-8 char:
        // Let's just re-test the first pending byte (in LSB):
        // (pending bytes are packed: first in bits 0-7, second in 8-15, third in 16-23)
        switch (_charTypes.TEXT_CHARS[c & 0xFF]) {
        case XmlCharTypes.CT_MULTIBYTE_2:
            // Easy: must have just one byte, did get another one:
            _textBuilder.append((char) decodeUtf8_2(c));
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            {
                // Ok... so do we have one or two pending bytes?
                int next = _inputBuffer.get(_inputPtr++) & 0xFF;
                int c2 = (c >> 8);
                if (c2 == 0) { // just one; need two more
                    if (_inputPtr >= _inputEnd) { // but got only one
                        _pendingInput = c | (next << 8);
                        return false;
                    }
                    int c3 = _inputBuffer.get(_inputPtr++) & 0xFF;
                    c = decodeUtf8_3(c, next, c3);
                } else { // had two, got one, bueno:
                    c = decodeUtf8_3((c & 0xFF), c2, next);
                }
                _textBuilder.append((char) c);
            }
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            {
                int next = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                // Only had one?
                if ((c >> 8) == 0) { // ok, so need 3 more
                    if (_inputPtr >= _inputEnd) { // just have 1
                        _pendingInput = c | (next << 8);
                        return false;
                    }
                    int c2 = _inputBuffer.get(_inputPtr++) & 0xFF;
                    if (_inputPtr >= _inputEnd) { // almost, got 2
                        _pendingInput = c | (next << 8) | (c2 << 16);
                        return false;
                    }
                    int c3 = _inputBuffer.get(_inputPtr++) & 0xFF;
                    c = decodeUtf8_4(c, next, c2, c3);
                } else { // had two or three
                    int c2 = (c >> 8) & 0xFF;
                    int c3 = (c >> 16);

                    if (c3 == 0) { // just two
                        if (_inputPtr >= _inputEnd) { // one short
                            _pendingInput = c | (next << 16);
                            return false;
                        }
                        c3 = _inputBuffer.get(_inputPtr++) & 0xFF;
                        c = decodeUtf8_4((c & 0xFF), c2, next, c3);
                    } else { // had three, got last
                        c = decodeUtf8_4((c & 0xFF), c2, c3, next);
                    }
                }
            }
            // Need a surrogate pair, have to call from here:
            _textBuilder.appendSurrogate(c);
            break;
        default: // should never occur:
            throwInternal();
        }
        return true;
    }

    /*
    /**********************************************************************
    /* Implementation of parsing API, skipping remainder CHARACTERS section
    /**********************************************************************
     */

    /**
     * Method that will be called to skip all possible characters
     * from the input buffer, but without blocking. Partial
     * characters are not to be handled (not pending input
     * is to be added).
*
     * @return True, if skipping ending with an unexpanded
     *   entity; false if not
     */
    @Override
    protected boolean skipCharacters() throws XMLStreamException
    {
        // Left-overs from the previous input block need to be dealt with first
        if (_pendingInput != 0) {
            if (!skipPending()) {
                return false;
            }
        }

        final int[] TYPES = _charTypes.TEXT_CHARS;
        final ByteBuffer inputBuffer = _inputBuffer;

        main_loop:
        while (true) {
            int c;

            // Tight loop over bytes that need no special handling
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                int max = _inputEnd;
                if (ptr >= max) {
                    break main_loop;
                }
                while (ptr < max) {
                    c = (int) inputBuffer.get(ptr++) & 0xFF;
                    if (TYPES[c] != 0) {
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                }
                _inputPtr = ptr;
            }
            // And then fallback for funny chars / UTF-8 multibytes:
            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                {
                    if (_inputPtr >= _inputEnd) {
                        _pendingInput = PENDING_STATE_CR;
                        break main_loop;
                    }
                    if (inputBuffer.get(_inputPtr) == BYTE_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                }
                break;
            case XmlCharTypes.CT_WS_LF:
                markLF();
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = c;
                    break main_loop;
                }
                skipUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                if ((_inputEnd - _inputPtr) < 2) {
                    if (_inputEnd > _inputPtr) { // 2 bytes available
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                // decoded value is not needed when skipping; result is dropped
                decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                if ((_inputEnd - _inputPtr) < 3) {
                    if (_inputEnd > _inputPtr) { // at least 2 bytes?
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                        if (_inputEnd > _inputPtr) { // 3 bytes?
                            d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                            c |= (d << 16);
                        }
                    }
                    _pendingInput = c;
                    break main_loop;
                }
                decodeUtf8_4(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_LT:
                // push '<' back; note that this path, too, returns true
                --_inputPtr;
                return true;
            case XmlCharTypes.CT_AMP:
                c = skipEntityInCharacters();
                if (c == 0) { // not a successfully expanded char entity
                    _pendingInput = PENDING_STATE_TEXT_AMP;
                    // but we may have input to skip nonetheless..
                    if (_inputPtr < _inputEnd) {
                        if (skipPending()) {
                            return true;
                        }
                    }
                    return false;
                }
                break;
            case XmlCharTypes.CT_RBRACKET: // ']]>'?
                /* !!! 09-Mar-2007, tatu: This will not give 100% coverage,
                 * for it may be split across input buffer boundary.
                 * For now this will have to suffice though.
                 */
                {
                    // Let's then just count number of brackets --
                    // in case they are not followed by '>'
                    int count = 1;
                    byte b = BYTE_NULL;
                    while (_inputPtr < _inputEnd) {
                        b = inputBuffer.get(_inputPtr);
                        if (b != BYTE_RBRACKET) {
                            break;
                        }
                        ++_inputPtr; // to skip past bracket
                        ++count;
                    }
                    if (b == BYTE_GT && count > 1) {
                        reportIllegalCDataEnd();
                    }
                }
                break;

            // default:
                // Other types are not important here...
            }
        }

        // Ran out of input, no entity encountered
        return false;
    }

    /**
     * Continues skipping from whatever state was recorded in
     * {@code _pendingInput}: either one of the negative marker states
     * (pending CR, partially seen entity, run of ']' characters) or
     * the packed bytes of a split multi-byte UTF-8 character.
     *
     * @return True if the pending state was fully handled, in which case
     *   {@code _pendingInput} has been cleared; false if more input is
     *   needed, in which case {@code _pendingInput} records where to resume
     */
    private final boolean skipPending() throws XMLStreamException
    {
        // First, need to have at least one more byte:
        if (_inputPtr >= _inputEnd) {
            return false;
        }

        // Possible \r\n linefeed?
        if (_pendingInput < 0) { // markers are all negative
            // Loop, since handling one marker state may lead to another one
            while (true) {
                switch (_pendingInput) {
                case PENDING_STATE_CR:
                    _pendingInput = 0;
                    if (_inputBuffer.get(_inputPtr) == BYTE_LF) {
                        ++_inputPtr;
                    }
                    markLF();
                    return true;
                case PENDING_STATE_TEXT_AMP:
                    {
                        byte b = _inputBuffer.get(_inputPtr++);
                        if (b == BYTE_HASH) {
                            _pendingInput = PENDING_STATE_TEXT_AMP_HASH;
                            break;
                        }
                        PName n = parseNewEntityName(b);
                        if (n == null) {
                            _pendingInput = PENDING_STATE_TEXT_IN_ENTITY;
                            return false;
                        }
                        int ch = decodeGeneralEntity(n);
                        if (ch == 0) {
                            // not expanded to a character: report as entity reference
                            _tokenName = n;
                            _nextEvent = ENTITY_REFERENCE;
                        }
                    }
                    _pendingInput = 0;
                    return true; // no matter what, we are done
                case PENDING_STATE_TEXT_AMP_HASH:
                    _entityValue = 0;
                    if (_inputBuffer.get(_inputPtr) == BYTE_x) {
                        ++_inputPtr;
                        if (decodeHexEntity()) {
                            _pendingInput = 0;
                            return true;
                        }
                        _pendingInput = PENDING_STATE_TEXT_HEX_ENTITY;
                        return false;
                    }
                    if (decodeDecEntity()) {
                        _pendingInput = 0;
                        return true;
                    }
                    _pendingInput = PENDING_STATE_TEXT_DEC_ENTITY;
                    return false;

                case PENDING_STATE_TEXT_DEC_ENTITY:
                    if (decodeDecEntity()) {
                        _pendingInput = 0;
                        return true;
                    }
                    return false;

                case PENDING_STATE_TEXT_HEX_ENTITY:
                    if (decodeHexEntity()) {
                        _pendingInput = 0;
                        return true;
                    }
                    return false;

                case PENDING_STATE_TEXT_IN_ENTITY:
                    {
                        PName n = parseEntityName();
                        if (n == null) {
                            return false;
                        }
                        int ch = decodeGeneralEntity(n);
                        if (ch == 0) {
                            _tokenName = n;
                            _nextEvent = ENTITY_REFERENCE;
                        }
                    }
                    _pendingInput = 0;
                    return true;

                case PENDING_STATE_TEXT_BRACKET1:
                    if (_inputBuffer.get(_inputPtr) != BYTE_RBRACKET) {
                        _pendingInput = 0;
                        return true;
                    }
                    ++_inputPtr;
                    _pendingInput = PENDING_STATE_TEXT_BRACKET2;
                    break;

                case PENDING_STATE_TEXT_BRACKET2:
                    // may get sequence...
                    {
                        byte b = _inputBuffer.get(_inputPtr);
                        if (b == BYTE_RBRACKET) {
                            ++_inputPtr;
                            break;
                        }
                        if (b == BYTE_GT) { // problem!
                            ++_inputPtr;
                            reportInputProblem("Encountered ']]>' in text segment");
                        }
                    }
                    // nope, something else, reprocess
                    _pendingInput = 0;
                    return true;
                default:
                    throwInternal();
                }
                // State changed but not resolved; can only continue if there is more input
                if (_inputPtr >= _inputEnd) {
                    return false;
                }
            }
        }

        // Nah, a multi-byte UTF-8 char:
        // Let's just re-test the first pending byte (in LSB):
        int c = _pendingInput;
        switch (_charTypes.TEXT_CHARS[c & 0xFF]) {
        case XmlCharTypes.CT_MULTIBYTE_2:
            // Easy: must have just one byte, did get another one:
            skipUtf8_2(c);
            break;
        case XmlCharTypes.CT_MULTIBYTE_3:
            {
                // Ok... so do we have one or two pending bytes?
                int next = _inputBuffer.get(_inputPtr++) & 0xFF;
                int c2 = (c >> 8);
                if (c2 == 0) { // just one; need two more
                    if (_inputPtr >= _inputEnd) { // but got only one
                        _pendingInput = c | (next << 8);
                        return false;
                    }
                    int c3 = _inputBuffer.get(_inputPtr++) & 0xFF;
                    decodeUtf8_3(c, next, c3);
                } else { // had two, got one, bueno:
                    decodeUtf8_3((c & 0xFF), c2, next);
                }
            }
            break;
        case XmlCharTypes.CT_MULTIBYTE_4:
            {
                int next = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                // Only had one?
                if ((c >> 8) == 0) { // ok, so need 3 more
                    if (_inputPtr >= _inputEnd) { // just have 1
                        _pendingInput = c | (next << 8);
                        return false;
                    }
                    int c2 = _inputBuffer.get(_inputPtr++) & 0xFF;
                    if (_inputPtr >= _inputEnd) { // almost, got 2
                        _pendingInput = c | (next << 8) | (c2 << 16);
                        return false;
                    }
                    int c3 = _inputBuffer.get(_inputPtr++) & 0xFF;
                    decodeUtf8_4(c, next, c2, c3);
                } else { // had two or three
                    int c2 = (c >> 8) & 0xFF;
                    int c3 = (c >> 16);

                    if (c3 == 0) { // just two
                        if (_inputPtr >= _inputEnd) { // one short
                            _pendingInput = c | (next << 16);
                            return false;
                        }
                        c3 = _inputBuffer.get(_inputPtr++) & 0xFF;
                        decodeUtf8_4((c & 0xFF), c2, next, c3);
                    } else { // had three, got last
                        decodeUtf8_4((c & 0xFF), c2, c3, next);
                    }
                }
            }
            break;
        default: // should never occur:
            throwInternal();
        }
        _pendingInput = 0;
        return true;
    }

    /**
     * Method called to handle entity encountered inside
     * CHARACTERS segment, when trying to complete a non-coalescing text segment.
* * @return Expanded (character) entity, if positive number; 0 if incomplete. */ private int skipEntityInCharacters() throws XMLStreamException { /* Thing that simplifies processing here is that handling * is pretty much optional: if there isn't enough data, we * just return 0 and are done with it. * * Also: we need at least 3 more characters for any character entity */ int ptr = _inputPtr; if ((ptr + 3) <= _inputEnd) { byte b = _inputBuffer.get(ptr++); if (b == BYTE_HASH) { // numeric character entity if (_inputBuffer.get(ptr) == BYTE_x) { return handleHexEntityInCharacters(ptr+1); } return handleDecEntityInCharacters(ptr); } // general entity; maybe one of pre-defined ones if (b == BYTE_a) { // amp or apos? b = _inputBuffer.get(ptr++); if (b == BYTE_m) { if ((ptr + 1) < _inputPtr && _inputBuffer.get(ptr) == BYTE_p && _inputBuffer.get(ptr+1) == BYTE_SEMICOLON) { _inputPtr = ptr + 2; // NOTE: do skip semicolon as well return INT_AMP; } } else if (b == BYTE_p) { if ((ptr + 2) < _inputPtr && _inputBuffer.get(ptr) == BYTE_o && _inputBuffer.get(ptr+1) == BYTE_s && _inputBuffer.get(ptr+2) == BYTE_SEMICOLON) { _inputPtr = ptr + 3; return INT_APOS; } } } else if (b == BYTE_g) { // gt? if (_inputBuffer.get(ptr) == BYTE_t && _inputBuffer.get(ptr+1) == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_GT; } } else if (b == BYTE_l) { // lt? if (_inputBuffer.get(ptr) == BYTE_t && _inputBuffer.get(ptr+1) == BYTE_SEMICOLON) { _inputPtr = ptr + 2; return INT_LT; } } else if (b == BYTE_q) { // quot? if ((ptr + 3) < _inputPtr && _inputBuffer.get(ptr) == BYTE_u && _inputBuffer.get(ptr+1) == BYTE_o && _inputBuffer.get(ptr+2) == BYTE_t && _inputBuffer.get(ptr+3) == BYTE_SEMICOLON) { _inputPtr = ptr + 4; return INT_APOS; } } } // couldn't handle: return 0; } /** * Coalescing mode is (and will) not be implemented for non-blocking * parsers, so this method should never get called. 
*/
    @Override
    protected boolean skipCoalescedText() throws XMLStreamException
    {
        throwInternal();
        return false;
    }

    /*
    /**********************************************************************
    /* Implementation of parsing API, element/attr events
    /**********************************************************************
     */

    /**
     * Reads as much of the current attribute value as the input buffer
     * holds, appending decoded characters to the buffer obtained from
     * {@code _attrCollector} at position {@code _elemAttrPtr}. White space
     * characters (CR, LF, tab) are each replaced by a single space.
     *
     * @return True, if the whole value was read; false if
     *   only part (due to buffer ending)
     */
    @Override
    protected boolean handleAttrValue() throws XMLStreamException
    {
        // First; any pending input?
        if (_pendingInput != 0) {
            if (!handleAttrValuePending()) {
                return false;
            }
            _pendingInput = 0;
        }

        char[] attrBuffer = _attrCollector.continueValue();
        final int[] TYPES = _charTypes.ATTR_CHARS;
        final int quoteChar = (int) _elemAttrQuote;

        value_loop:
        while (true) {
            int c;

            // Tight loop over bytes that need no special handling
            ascii_loop:
            while (true) {
                if (_inputPtr >= _inputEnd) {
                    return false;
                }
                if (_elemAttrPtr >= attrBuffer.length) {
                    attrBuffer = _attrCollector.valueBufferFull();
                }
                // Limit the run so that it can overflow neither input nor output
                int max = _inputEnd;
                {
                    int max2 = _inputPtr + (attrBuffer.length - _elemAttrPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (_inputPtr < max) {
                    c = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                    if (TYPES[c] != 0) {
                        break ascii_loop;
                    }
                    attrBuffer[_elemAttrPtr++] = (char) c;
                }
            }

            switch (TYPES[c]) {
            case XmlCharTypes.CT_INVALID:
                c = handleInvalidXmlChar(c);
            case XmlCharTypes.CT_WS_CR:
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = PENDING_STATE_CR;
                    return false;
                }
                if (_inputBuffer.get(_inputPtr) == BYTE_LF) {
                    ++_inputPtr;
                }
                // fall through
            case XmlCharTypes.CT_WS_LF:
                markLF();
                // fall through
            case XmlCharTypes.CT_WS_TAB:
                // Plus, need to convert these all to simple space
                c = INT_SPACE;
                break;
            case XmlCharTypes.CT_MULTIBYTE_2:
                if (_inputPtr >= _inputEnd) {
                    _pendingInput = c;
                    return false;
                }
                c = decodeUtf8_2(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_3:
                if ((_inputEnd - _inputPtr) < 2) {
                    // pending bytes are packed: first byte in bits 0-7, second in 8-15
                    if (_inputEnd > _inputPtr) { // 2 bytes available
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                    }
                    _pendingInput = c;
                    return false;
                }
                c = decodeUtf8_3(c);
                break;
            case XmlCharTypes.CT_MULTIBYTE_4:
                if ((_inputEnd - _inputPtr) < 3) {
                    if (_inputEnd > _inputPtr) { // at least 2 bytes?
                        int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                        c |= (d << 8);
                        if (_inputEnd > _inputPtr) { // 3 bytes?
                            d = (int) _inputBuffer.get(_inputPtr++) & 0xFF;
                            c |= (d << 16);
                        }
                    }
                    _pendingInput = c;
                    return false;
                }
                c = decodeUtf8_4(c);
                // Let's add first part right away:
                attrBuffer[_elemAttrPtr++] = (char) (0xD800 | (c >> 10));
                c = 0xDC00 | (c & 0x3FF);
                if (_elemAttrPtr >= attrBuffer.length) {
                    attrBuffer = _attrCollector.valueBufferFull();
                }
                break;
            case XmlCharTypes.CT_MULTIBYTE_N:
                reportInvalidInitial(c);
            case XmlCharTypes.CT_LT:
                throwUnexpectedChar(c, "'<' not allowed in attribute value");
            case XmlCharTypes.CT_AMP:
                c = handleEntityInAttributeValue();
                if (c <= 0) { // general entity; should never happen
                    if (c < 0) { // end-of-input
                        return false;
                    }
                    reportUnexpandedEntityInAttr(_elemAttrName, false);
                }
                // Ok; does it need a surrogate though? (over 16 bits)
                if ((c >> 16) != 0) {
                    c -= 0x10000;
                    attrBuffer[_elemAttrPtr++] = (char) (0xD800 | (c >> 10));
                    c = 0xDC00 | (c & 0x3FF);
                    if (_elemAttrPtr >= attrBuffer.length) {
                        attrBuffer = _attrCollector.valueBufferFull();
                    }
                }
                break;
            case XmlCharTypes.CT_ATTR_QUOTE:
                // only the quote that opened the value ends it; the other
                // kind of quote is appended like any other character
                if (c == quoteChar) {
                    break value_loop;
                }

            // default:
                // Other chars are not important here...
            }
            // We know there's room for at least one char without checking
            attrBuffer[_elemAttrPtr++] = (char) c;
        }

        return true; // yeah, we're done!
} /** * @return True if the partial information was succesfully handled; * false if not */ private final boolean handleAttrValuePending() throws XMLStreamException { if (_pendingInput == PENDING_STATE_CR) { if (!handlePartialCR()) { return false; } char[] attrBuffer = _attrCollector.continueValue(); if (_elemAttrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } // All LFs get converted to spaces, in attribute values attrBuffer[_elemAttrPtr++] = ' '; return true; } // otherwise must be related to entity handling within attribute value if (_inputPtr >= _inputEnd) { return false; } int ch; if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP) { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_HASH) { // numeric character entity _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH; if (_inputPtr >= _inputEnd) { return false; } if (_inputBuffer.get(_inputPtr) == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return false; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } } else { PName entityName = parseNewEntityName(b); if (entityName == null) { _pendingInput = PENDING_STATE_ATTR_VALUE_ENTITY_NAME; return false; } ch = decodeGeneralEntity(entityName); if (ch == 0) { // can't have general entities within attribute values _tokenName = entityName; reportUnexpandedEntityInAttr(_elemAttrName, false); } } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP_HASH) { if (_inputBuffer.get(_inputPtr) == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return false; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP_HASH_X) { ch = handleHexEntityInAttribute(true); } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_ENTITY_NAME) { PName entityName = parseEntityName(); if (entityName == null) { return 
false; } ch = decodeGeneralEntity(entityName); if (ch == 0) { // can't have general entities within attribute values _tokenName = entityName; reportUnexpandedEntityInAttr(_elemAttrName, false); } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_DEC_DIGIT) { ch = handleDecEntityInAttribute(false); } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_HEX_DIGIT) { ch = handleHexEntityInAttribute(false); } else { // nope, split UTF-8 char // Nah, a multi-byte UTF-8 char. Alas, can't use shared method, as results // don't go in shared text buffer... ch = handleAttrValuePendingUTF8(); } if (ch == 0) { // wasn't resolved return false; } char[] attrBuffer = _attrCollector.continueValue(); // Ok; does it need a surrogate though? (over 16 bits) if ((ch >> 16) != 0) { ch -= 0x10000; if (_elemAttrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } attrBuffer[_elemAttrPtr++] = (char) (0xD800 | (ch >> 10)); ch = 0xDC00 | (ch & 0x3FF); } if (_elemAttrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } attrBuffer[_elemAttrPtr++] = (char) ch; return true; // done it! } private final int handleAttrValuePendingUTF8() throws XMLStreamException { // note: we know there must be at least one byte available at this point int c = _pendingInput; _pendingInput = 0; // Let's just re-test the first pending byte (in LSB): switch (_charTypes.TEXT_CHARS[c & 0xFF]) { case XmlCharTypes.CT_MULTIBYTE_2: // Easy: must have just one byte, did get another one: return decodeUtf8_2(c); case XmlCharTypes.CT_MULTIBYTE_3: { // Ok... so do we have one or two pending bytes? 
int next = _inputBuffer.get(_inputPtr++) & 0xFF; int c2 = (c >> 8); if (c2 == 0) { // just one; need two more if (_inputPtr >= _inputEnd) { // but got only one _pendingInput = c | (next << 8); return 0; } int c3 = _inputBuffer.get(_inputPtr++) & 0xFF; c = decodeUtf8_3(c, next, c3); } else { // had two, got one, bueno: c = decodeUtf8_3((c & 0xFF), c2, next); } return c; } case XmlCharTypes.CT_MULTIBYTE_4: { int next = (int) _inputBuffer.get(_inputPtr++) & 0xFF; // Only had one? if ((c >> 8) == 0) { // ok, so need 3 more if (_inputPtr >= _inputEnd) { // just have 1 _pendingInput = c | (next << 8); return 0; } int c2 = _inputBuffer.get(_inputPtr++) & 0xFF; if (_inputPtr >= _inputEnd) { // almost, got 2 _pendingInput = c | (next << 8) | (c2 << 16); return 0; } int c3 = _inputBuffer.get(_inputPtr++) & 0xFF; c = decodeUtf8_4(c, next, c2, c3); } else { // had two or three int c2 = (c >> 8) & 0xFF; int c3 = (c >> 16); if (c3 == 0) { // just two if (_inputPtr >= _inputEnd) { // one short _pendingInput = c | (next << 16); return 0; } c3 = _inputBuffer.get(_inputPtr++) & 0xFF; c = decodeUtf8_4((c & 0xFF), c2, next, c3); } else { // had three, got last c = decodeUtf8_4((c & 0xFF), c2, c3, next); } } return c; } default: // should never occur: throwInternal(); return 0; // never gets here } } private final int handleDecEntityInAttribute(boolean starting) throws XMLStreamException { byte b = _inputBuffer.get(_inputPtr++); // we know one is available if (starting) { int ch = (int) b; if (ch < INT_0 || ch > INT_9) { // invalid entity throwUnexpectedChar(decodeCharForError(b), " expected a digit (0 - 9) for character entity"); } _pendingInput = PENDING_STATE_ATTR_VALUE_DEC_DIGIT; _entityValue = ch - INT_0; if (_inputPtr >= _inputEnd) { return 0; } b = _inputBuffer.get(_inputPtr++); } while (b != BYTE_SEMICOLON) { int ch = ((int) b) - INT_0; if (ch < 0 || ch > 9) { // invalid entity throwUnexpectedChar(decodeCharForError(b), " expected a digit (0 - 9) for character entity"); } int 
value = (_entityValue * 10) + ch; _entityValue = value; if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } if (_inputPtr >= _inputEnd) { return 0; } b = _inputBuffer.get(_inputPtr++); } verifyXmlChar(_entityValue); _pendingInput = 0; return _entityValue; } private final int handleHexEntityInAttribute(boolean starting) throws XMLStreamException { byte b = _inputBuffer.get(_inputPtr++); // we know one is available if (starting) { int ch = (int) b; if (ch < INT_0 || ch > INT_9) { // invalid entity throwUnexpectedChar(decodeCharForError(b), " expected a hex digit (0-9a-fA-F) for character entity"); } _pendingInput = PENDING_STATE_ATTR_VALUE_HEX_DIGIT; _entityValue = ch - INT_0; if (_inputPtr >= _inputEnd) { return 0; } b = _inputBuffer.get(_inputPtr++); } while (b != BYTE_SEMICOLON) { int ch = (int) b; if (ch <= INT_9 && ch >= INT_0) { ch -= INT_0; } else if (ch <= INT_F && ch >= INT_A) { ch = 10 + (ch - INT_A); } else if (ch <= INT_f && ch >= INT_a) { ch = 10 + (ch - INT_a); } else { throwUnexpectedChar(decodeCharForError(b), " expected a hex digit (0-9a-fA-F) for character entity"); } int value = (_entityValue << 4) + ch; _entityValue = value; if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } if (_inputPtr >= _inputEnd) { return 0; } b = _inputBuffer.get(_inputPtr++); } verifyXmlChar(_entityValue); _pendingInput = 0; return _entityValue; } /** * Method called to handle entity encountered inside attribute value. 
* * @return Value of expanded character entity, if processed (which must be * 1 or above); 0 for general entity, or -1 for "not enough input" */ protected int handleEntityInAttributeValue() throws XMLStreamException { if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP; return -1; } byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_HASH) { // numeric character entity _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH; if (_inputPtr >= _inputEnd) { return -1; } int ch; if (_inputBuffer.get(_inputPtr) == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return -1; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } if (ch == 0) { return -1; } return ch; } PName entityName = parseNewEntityName(b); if (entityName == null) { _pendingInput = PENDING_STATE_ATTR_VALUE_ENTITY_NAME; return -1; } int ch = decodeGeneralEntity(entityName); if (ch != 0) { return ch; } _tokenName = entityName; return 0; } @Override protected boolean handleNsDecl() throws XMLStreamException { final int[] TYPES = _charTypes.ATTR_CHARS; char[] attrBuffer = _nameBuffer; final int quoteChar = (int) _elemAttrQuote; // First; any pending input? 
if (_pendingInput != 0) { if (!handleNsValuePending()) { return false; } _pendingInput = 0; } value_loop: while (true) { int c; ascii_loop: while (true) { if (_inputPtr >= _inputEnd) { return false; } if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } int max = _inputEnd; { int max2 = _inputPtr + (attrBuffer.length - _elemNsPtr); if (max2 < max) { max = max2; } } while (_inputPtr < max) { c = (int) _inputBuffer.get(_inputPtr++) & 0xFF; if (TYPES[c] != 0) { break ascii_loop; } attrBuffer[_elemNsPtr++] = (char) c; } } switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { _pendingInput = PENDING_STATE_CR; return false; } if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } // fall through case XmlCharTypes.CT_WS_LF: markLF(); // fall through case XmlCharTypes.CT_WS_TAB: // Plus, need to convert these all to simple space c = INT_SPACE; break; case XmlCharTypes.CT_MULTIBYTE_2: if (_inputPtr >= _inputEnd) { _pendingInput = c; return false; } c = decodeUtf8_2(c); break; case XmlCharTypes.CT_MULTIBYTE_3: if ((_inputEnd - _inputPtr) < 2) { if (_inputEnd > _inputPtr) { // 2 bytes available int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); } _pendingInput = c; return false; } c = decodeUtf8_3(c); break; case XmlCharTypes.CT_MULTIBYTE_4: if ((_inputEnd - _inputPtr) < 3) { if (_inputEnd > _inputPtr) { // at least 2 bytes? int d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 8); if (_inputEnd > _inputPtr) { // 3 bytes? 
d = (int) _inputBuffer.get(_inputPtr++) & 0xFF; c |= (d << 16); } } _pendingInput = c; return false; } c = decodeUtf8_4(c); // Let's add first part right away: attrBuffer[_elemNsPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } break; case XmlCharTypes.CT_MULTIBYTE_N: reportInvalidInitial(c); case XmlCharTypes.CT_LT: throwUnexpectedChar(c, "'<' not allowed in attribute value"); case XmlCharTypes.CT_AMP: c = handleEntityInAttributeValue(); if (c <= 0) { // general entity; should never happen if (c < 0) { // end-of-input return false; } reportUnexpandedEntityInAttr(_elemAttrName, true); } // Ok; does it need a surrogate though? (over 16 bits) if ((c >> 16) != 0) { c -= 0x10000; attrBuffer[_elemNsPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } } break; case XmlCharTypes.CT_ATTR_QUOTE: if (c == quoteChar) { break value_loop; } // default: // Other chars are not important here... } // We know there's room for at least one char without checking attrBuffer[_elemNsPtr++] = (char) c; } /* Simple optimization: for default ns removal (or, with * ns 1.1, any other as well), will use empty value... 
no * need to try to intern: */ int attrPtr = _elemNsPtr; if (attrPtr == 0) { bindNs(_elemAttrName, ""); } else { String uri = _config.canonicalizeURI(attrBuffer, attrPtr); bindNs(_elemAttrName, uri); } return true; } /** * @return True if the partial information was succesfully handled; * false if not */ private final boolean handleNsValuePending() throws XMLStreamException { if (_pendingInput == PENDING_STATE_CR) { if (!handlePartialCR()) { return false; } char[] attrBuffer = _nameBuffer; if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } // All lfs get converted to spaces, in attribute values attrBuffer[_elemNsPtr++] = ' '; return true; } // otherwise must be related to entity handling within attribute value if (_inputPtr >= _inputEnd) { return false; } int ch; if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP) { byte b = _inputBuffer.get(_inputPtr++); if (b == BYTE_HASH) { // numeric character entity _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH; if (_inputPtr >= _inputEnd) { return false; } if (_inputBuffer.get(_inputPtr) == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return false; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } } else { PName entityName = parseNewEntityName(b); if (entityName == null) { _pendingInput = PENDING_STATE_ATTR_VALUE_ENTITY_NAME; return false; } ch = decodeGeneralEntity(entityName); if (ch == 0) { // can't have general entities within attribute values _tokenName = entityName; reportUnexpandedEntityInAttr(_elemAttrName, false); } } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_AMP_HASH) { if (_inputBuffer.get(_inputPtr) == BYTE_x) { _pendingInput = PENDING_STATE_ATTR_VALUE_AMP_HASH_X; ++_inputPtr; if (_inputPtr >= _inputEnd) { return false; } ch = handleHexEntityInAttribute(true); } else { ch = handleDecEntityInAttribute(true); } } else if 
(_pendingInput == PENDING_STATE_ATTR_VALUE_AMP_HASH_X) { ch = handleHexEntityInAttribute(true); } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_ENTITY_NAME) { PName entityName = parseEntityName(); if (entityName == null) { return false; } ch = decodeGeneralEntity(entityName); if (ch == 0) { // can't have general entities within attribute values _tokenName = entityName; reportUnexpandedEntityInAttr(_elemAttrName, false); } } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_DEC_DIGIT) { ch = handleDecEntityInAttribute(false); } else if (_pendingInput == PENDING_STATE_ATTR_VALUE_HEX_DIGIT) { ch = handleHexEntityInAttribute(false); } else { // 05-Aug-2012, tatu: Apparently we can end up here too... ch = handleAttrValuePendingUTF8(); } if (ch == 0) { // wasn't resolved return false; } char[] attrBuffer = _nameBuffer; // Ok; does it need a surrogate though? (over 16 bits) if ((ch >> 16) != 0) { ch -= 0x10000; if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } attrBuffer[_elemNsPtr++] = (char) (0xD800 | (ch >> 10)); ch = 0xDC00 | (ch & 0x3FF); } if (_elemNsPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } attrBuffer[_elemNsPtr++] = (char) ch; return true; // done it! } /* /********************************************************************** /* Common name/entity parsing /********************************************************************** */ protected PName parseNewName(byte b) throws XMLStreamException { int q = b & 0xFF; /* Let's do just quick sanity check first; a thorough check will be * done later on if necessary, now we'll just do the very cheap * check to catch extra spaces etc. 
*/ if (q < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode throwUnexpectedChar(q, "; expected a name start character"); } _quadCount = 0; _currQuad = q; _currQuadBytes = 1; return parsePName(); } /** * This method can (for now?) be shared between all Ascii-based * encodings, since it only does coarse validity checking -- real * checks are done in different method. *<p> * Some notes about assumption implementation makes: *<ul> * <li>Well-formed xml content can not end with a name: as such, * end-of-input is an error and we can throw an exception * </li> * </ul> */ protected PName parsePName() throws XMLStreamException { int q = _currQuad; while (true) { int i; switch (_currQuadBytes) { case 0: if (_inputPtr >= _inputEnd) { return null; // all pointers have been set } q = _inputBuffer.get(_inputPtr++) & 0xFF; /* Since name char validity is checked later on, we only * need to be able to reliably see the end of the name... * and those are simple enough so that we can just * compare; lookup table won't speed things up (according * to profiler) */ if (q < 65) { // 'A' // Ok; "_" (45), "." 
(46) and "0"-"9"/":" (48 - 57/58) still name chars if (q < 45 || q > 58 || q == 47) { // End of name return findPName(q, 0); } } // fall through case 1: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 1; return null; } i = _inputBuffer.get(_inputPtr++) & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return findPName(q, 1); } } q = (q << 8) | i; // fall through case 2: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 2; return null; } i = _inputBuffer.get(_inputPtr++) & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return findPName(q, 2); } } q = (q << 8) | i; // fall through case 3: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 3; return null; } i = _inputBuffer.get(_inputPtr++) & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return findPName(q, 3); } } q = (q << 8) | i; } /* If we get this far, need to add full quad into * result array and update state */ if (_quadCount == 0) { // first quad _quadBuffer[0] = q; _quadCount = 1; } else { if (_quadCount >= _quadBuffer.length) { // let's just double? _quadBuffer = DataUtil.growArrayBy(_quadBuffer, _quadBuffer.length); } _quadBuffer[_quadCount++] = q; } _currQuadBytes = 0; } } protected final PName parseNewEntityName(byte b) throws XMLStreamException { int q = b & 0xFF; if (q < INT_A) { throwUnexpectedChar(q, "; expected a name start character"); } _quadCount = 0; _currQuad = q; _currQuadBytes = 1; return parseEntityName(); } protected final PName parseEntityName() throws XMLStreamException { int q = _currQuad; while (true) { int i; switch (_currQuadBytes) { case 0: if (_inputPtr >= _inputEnd) { return null; // all pointers have been set } q = _inputBuffer.get(_inputPtr++) & 0xFF; /* Since name char validity is checked later on, we only * need to be able to reliably see the end of the name... 
* and those are simple enough so that we can just * compare; lookup table won't speed things up (according * to profiler) */ if (q < 65) { // 'A' // Ok; "_" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars if (q < 45 || q > 58 || q == 47) { // apos, quot? if (_quadCount == 1) { q = _quadBuffer[0]; if (q == EntityNames.ENTITY_APOS_QUAD) { --_inputPtr; return EntityNames.ENTITY_APOS; } if (q == EntityNames.ENTITY_QUOT_QUAD) { --_inputPtr; return EntityNames.ENTITY_QUOT; } } // Nope, generic: return findPName(q, 0); } } // fall through case 1: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 1; return null; } i = _inputBuffer.get(_inputPtr++) & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { return findPName(q, 1); } } q = (q << 8) | i; // fall through case 2: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 2; return null; } i = _inputBuffer.get(_inputPtr++) & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { // lt or gt? if (_quadCount == 0) { if (q == EntityNames.ENTITY_GT_QUAD) { --_inputPtr; return EntityNames.ENTITY_GT; } if (q == EntityNames.ENTITY_LT_QUAD) { --_inputPtr; return EntityNames.ENTITY_LT; } } return findPName(q, 2); } } q = (q << 8) | i; // fall through case 3: if (_inputPtr >= _inputEnd) { // need to store pointers _currQuad = q; _currQuadBytes = 3; return null; } i = _inputBuffer.get(_inputPtr++) & 0xFF; if (i < 65) { // 'A' if (i < 45 || i > 58 || i == 47) { // amp? if (_quadCount == 0) { if (q == EntityNames.ENTITY_AMP_QUAD) { --_inputPtr; return EntityNames.ENTITY_AMP; } } return findPName(q, 3); } } q = (q << 8) | i; } /* If we get this far, need to add full quad into * result array and update state */ if (_quadCount == 0) { // first quad _quadBuffer[0] = q; _quadCount = 1; } else { if (_quadCount >= _quadBuffer.length) { // let's just double? 
_quadBuffer = DataUtil.growArrayBy(_quadBuffer, _quadBuffer.length); } _quadBuffer[_quadCount++] = q; } _currQuadBytes = 0; } } /* /********************************************************************** /* Internal methods, LF handling /********************************************************************** */ /** * Method called when there is a pending \r (from past buffer), * and we need to see * * @return True if the linefeed was succesfully processed (had * enough input data to do that); or false if there is no * data available to check this */ protected final boolean handlePartialCR() { // sanity check if (_pendingInput != PENDING_STATE_CR) { throwInternal(); } if (_inputPtr >= _inputEnd) { return false; } _pendingInput = 0; if (_inputBuffer.get(_inputPtr) == BYTE_LF) { ++_inputPtr; } ++_currRow; _rowStartOffset = _inputPtr; return true; } /* /********************************************************************** /* Multi-byte char decoding /********************************************************************** */ /** *<p> * Note: caller must guarantee enough data is available before * calling the method */ protected final int decodeUtf8_2(int c) throws XMLStreamException { int d = (int) _inputBuffer.get(_inputPtr++); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } return ((c & 0x1F) << 6) | (d & 0x3F); } protected final void skipUtf8_2(int c) throws XMLStreamException { int d = (int) _inputBuffer.get(_inputPtr++); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } } /** *<p> * Note: caller must guarantee enough data is available before * calling the method */ protected final int decodeUtf8_3(int c1) throws XMLStreamException { c1 &= 0x0F; int d = (int) _inputBuffer.get(_inputPtr++); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } int c = (c1 << 6) | (d & 0x3F); d = (int) _inputBuffer.get(_inputPtr++); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); 
if (c1 >= 0xD) { // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) { c = handleInvalidXmlChar(c); } } } return c; } protected final int decodeUtf8_3(int c1, int c2, int c3) throws XMLStreamException { // Note: first char is assumed to have been checked if ((c2 & 0xC0) != 0x080) { reportInvalidOther(c2 & 0xFF, _inputPtr-1); } if ((c3 & 0xC0) != 0x080) { reportInvalidOther(c3 & 0xFF, _inputPtr); } int c = ((c1 & 0x0F) << 12) | ((c2 & 0x3F) << 6) | (c3 & 0x3F); if (c1 >= 0xD) { // 0xD800-0xDFFF, 0xFFFE-0xFFFF illegal if (c >= 0xD800) { // surrogates illegal, as well as 0xFFFE/0xFFFF if (c < 0xE000 || (c >= 0xFFFE && c <= 0xFFFF)) { c = handleInvalidXmlChar(c); } } } return c; } protected final int decodeUtf8_4(int c) throws XMLStreamException { int d = (int) _inputBuffer.get(_inputPtr++); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = ((c & 0x07) << 6) | (d & 0x3F); d = (int) _inputBuffer.get(_inputPtr++); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } c = (c << 6) | (d & 0x3F); d = (int) _inputBuffer.get(_inputPtr++); if ((d & 0xC0) != 0x080) { reportInvalidOther(d & 0xFF, _inputPtr); } /* note: won't change it to negative here, since caller * already knows it'll need a surrogate */ return ((c << 6) | (d & 0x3F)) - 0x10000; } /** * @return Character value <b>minus 0x10000</c>; this so that caller * can readily expand it to actual surrogates */ protected final int decodeUtf8_4(int c1, int c2, int c3, int c4) throws XMLStreamException { /* Note: first char is assumed to have been checked, * (but not yet masked) */ if ((c2 & 0xC0) != 0x080) { reportInvalidOther(c2 & 0xFF, _inputPtr-2); } int c = ((c1 & 0x07) << 6) | (c2 & 0x3F); if ((c3 & 0xC0) != 0x080) { reportInvalidOther(c3 & 0xFF, _inputPtr-1); } c = (c << 6) | (c3 & 0x3F); if ((c4 & 0xC0) != 0x080) { reportInvalidOther(c4 & 0xFF, _inputPtr); } return ((c << 6) | 
(c4 & 0x3F)) - 0x10000; } }