package com.fasterxml.aalto.in; import java.io.*; import javax.xml.stream.XMLStreamException; import org.codehaus.stax2.XMLStreamLocation2; import com.fasterxml.aalto.impl.ErrorConsts; import com.fasterxml.aalto.impl.IoStreamException; import com.fasterxml.aalto.impl.LocationImpl; import com.fasterxml.aalto.util.DataUtil; import com.fasterxml.aalto.util.TextBuilder; import com.fasterxml.aalto.util.XmlCharTypes; import com.fasterxml.aalto.util.XmlChars; import com.fasterxml.aalto.util.XmlConsts; /** * This is the concrete scanner implementation used when input comes * as a {@link java.io.Reader}. In general using this scanner is quite * a bit less optimal than that of {@link java.io.InputStream} based * scanner. Nonetheless, it is included for completeness, since Stax * interface allows passing Readers as input sources. */ public final class ReaderScanner extends XmlScanner { /** * Although java chars are basically UTF-16 in memory, the closest * match for char types is Latin1. */ private final static XmlCharTypes sCharTypes = InputCharTypes.getLatin1CharTypes(); /* /********************************************************************** /* Configuration /********************************************************************** */ /** * Underlying InputStream to use for reading content. */ protected Reader _in; /* /********************************************************************** /* Input buffering /********************************************************************** */ protected char[] _inputBuffer; protected int _inputPtr; protected int _inputEnd; /** * Storage location for a single character that can not be pushed * back (for example, multi-byte char) */ protected int mTmpChar = INT_NULL; /* /********************************************************************** /* Symbol handling /********************************************************************** */ /** * For now, symbol table contains prefixed names. 
In future it is * possible that they may be split into prefixes and local names? */ protected final CharBasedPNameTable _symbols; /* /********************************************************************** /* Life-cycle /********************************************************************** */ public ReaderScanner(ReaderConfig cfg, Reader r, char[] buffer, int ptr, int last) { super(cfg); _in = r; _inputBuffer = buffer; _inputPtr = ptr; _inputEnd = last; _pastBytesOrChars = 0; // should it be passed by caller? _rowStartOffset = 0; // should probably be passed by caller... _symbols = cfg.getCBSymbols(); } public ReaderScanner(ReaderConfig cfg, Reader r) { super(cfg); _in = r; _inputBuffer = cfg.allocFullCBuffer(ReaderConfig.DEFAULT_CHAR_BUFFER_LEN); _inputPtr = _inputEnd = 0; _pastBytesOrChars = 0; // should it be passed by caller? _rowStartOffset = 0; // should probably be passed by caller... _symbols = cfg.getCBSymbols(); } @Override protected void _releaseBuffers() { super._releaseBuffers(); if (_symbols.maybeDirty()) { _config.updateCBSymbols(_symbols); } /* Note: if we have block input (_in == null), the buffer we * use is not owned by scanner, can't recycle * Also note that this method will always get called before * _closeSource(); so that _in won't be cleared before we * have a chance to see it. 
*/ if (_in != null) { if (_inputBuffer != null) { _config.freeFullCBuffer(_inputBuffer); _inputBuffer = null; } } } @Override protected void _closeSource() throws IOException { if (_in != null) { _in.close(); _in = null; } } /* /********************************************************************** /* Public scanner interface (1st level parsing) /********************************************************************** */ @Override protected final void finishToken() throws XMLStreamException { _tokenIncomplete = false; switch (_currToken) { case PROCESSING_INSTRUCTION: finishPI(); break; case CHARACTERS: finishCharacters(); break; case COMMENT: finishComment(); break; case SPACE: finishSpace(); break; case DTD: finishDTD(true); // true -> get text break; case CDATA: finishCData(); break; default: ErrorConsts.throwInternalError(); } } // // // First, main iteration methods @Override public final int nextFromProlog(boolean isProlog) throws XMLStreamException { if (_tokenIncomplete) { // left-overs from last thingy? skipToken(); } // First: keep track of where event started setStartLocation(); // Ok: we should get a WS or '<'. So, let's skip through WS while (true) { // Any more data? Just need a single byte if (_inputPtr >= _inputEnd) { if (!loadMore()) { setStartLocation(); return TOKEN_EOI; } } int c = _inputBuffer[_inputPtr++] & 0xFF; // Really should get white space or '<'... if (c == '<') { break; } if (c != ' ') { if (c == '\n') { markLF(); } else if (c == '\r') { if (_inputPtr >= _inputEnd) { if (!loadMore()) { markLF(); setStartLocation(); return TOKEN_EOI; } } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } else if (c != '\t') { reportPrologUnexpChar(isProlog, c, null); } } } // Ok, got LT: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(COMMENT); } char c = _inputBuffer[_inputPtr++]; if (c == '!') { // comment/DOCTYPE? 
(CDATA not legal) return handlePrologDeclStart(isProlog); } if (c == '?') { return handlePIStart(); } /* End tag not allowed if no open tree; and only one root * element (one root-level start tag) */ if (c == '/' || !isProlog) { reportPrologUnexpChar(isProlog, c, " (unbalanced start/end tags?)"); } return handleStartElement(c); } @Override public final int nextFromTree() throws XMLStreamException { if (_tokenIncomplete) { // left-overs? if (skipToken()) { // Figured out next event (ENTITY_REFERENCE)? // !!! We don't yet parse DTD, don't know real contents return _nextEntity(); } } else { // note: START_ELEMENT/END_ELEMENT never incomplete if (_currToken == START_ELEMENT) { if (_isEmptyTag) { // Important: retain same start location as with START_ELEMENT, don't overwrite --_depth; return (_currToken = END_ELEMENT); } } else if (_currToken == END_ELEMENT) { _currElem = _currElem.getParent(); // Any namespace declarations that need to be unbound? while (_lastNsDecl != null && _lastNsDecl.getLevel() >= _depth) { _lastNsDecl = _lastNsDecl.unbind(); } } else { // It's possible CHARACTERS entity with an entity ref: if (_entityPending) { _entityPending = false; return _nextEntity(); } } } // and except for special cases, mark down actual start location of the event setStartLocation(); /* Any more data? Although it'd be an error not to get any, * let's leave error reporting up to caller */ if (_inputPtr >= _inputEnd) { if (!loadMore()) { setStartLocation(); return TOKEN_EOI; } } char c = _inputBuffer[_inputPtr]; /* Can get pretty much any type; start/end element, comment/PI, * CDATA, text, entity reference... */ if (c == '<') { // root element, comment, proc instr? ++_inputPtr; c = (_inputPtr < _inputEnd) ? 
_inputBuffer[_inputPtr++] : loadOne(COMMENT); if (c == '!') { // comment or CDATA return handleCommentOrCdataStart(); } if (c == '?') { return handlePIStart(); } if (c == '/') { return handleEndElement(); } return handleStartElement(c); } if (c == '&') { // entity reference ++_inputPtr; /* Need to expand; should indicate either text, or an unexpanded * entity reference */ int i = handleEntityInText(false); if (i == 0) { // general entity return (_currToken = ENTITY_REFERENCE); } /* Nope, a char entity; need to indicate it came from an entity. * Since we may want to store the char as is, too, let's negate * entity-based char */ mTmpChar = -i; } else { /* Let's store it for future reference. May or may not be used -- * so let's not advance input ptr quite yet. */ mTmpChar = c; } // text, possibly/probably ok if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCharacters(); } return (_currToken = CHARACTERS); } /** * Helper method used to isolate things that need to be (re)set in * cases where */ protected int _nextEntity() { // !!! Also, have to assume start location has been set or such _textBuilder.resetWithEmpty(); // !!! TODO: handle start location? return (_currToken = ENTITY_REFERENCE); } /* /********************************************************************** /* 2nd level parsing /********************************************************************** */ protected final int handlePrologDeclStart(boolean isProlog) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; if (c == '-') { // Comment? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == '-') { if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishComment(); } return (_currToken = COMMENT); } } else if (c == 'D') { // DOCTYPE? 
if (isProlog) { // no DOCTYPE in epilog handleDtdStart(); // incomplete flag is set by handleDtdStart if (!_cfgLazyParsing) { if (_tokenIncomplete) { finishDTD(true); // must copy contents, may be needed _tokenIncomplete = false; } } return DTD; } } /* error... for error recovery purposes, let's just pretend * like it was unfinished CHARACTERS, though. */ _tokenIncomplete = true; _currToken = CHARACTERS; reportPrologUnexpChar(isProlog, c, " (expected '-' for COMMENT)"); return _currToken; // never gets here } private final int handleDtdStart() throws XMLStreamException { matchAsciiKeyword("DOCTYPE"); // And then some white space and root name char c = skipInternalWs(true, "after DOCTYPE keyword, before root name"); _tokenName = parsePName(c); c = skipInternalWs(false, null); //boolean gotId; if (c == 'P') { // PUBLIC matchAsciiKeyword("PUBLIC"); c = skipInternalWs(true, null); _publicId = parsePublicId(c); c = skipInternalWs(true, null); _systemId = parseSystemId(c); c = skipInternalWs(false, null); } else if (c == 'S') { // SYSTEM matchAsciiKeyword("SYSTEM"); c = skipInternalWs(true, null); _publicId = null; _systemId = parseSystemId(c); c = skipInternalWs(false, null); } else { _publicId = _systemId = null; } /* Ok; so, need to get either an internal subset, or the * end: */ if (c == '>') { // fine, we are done _tokenIncomplete = false; return (_currToken = DTD); } if (c != '[') { // If not end, must have int. subset String msg = (_systemId != null) ? " (expected '[' for the internal subset, or '>' to end DOCTYPE declaration)" : " (expected a 'PUBLIC' or 'SYSTEM' keyword, '[' for the internal subset, or '>' to end DOCTYPE declaration)"; reportTreeUnexpChar(c, msg); } /* Need not parse the int. 
subset yet, can leave as is, and then * either skip or parse later on */ _tokenIncomplete = true; return (_currToken = DTD); } protected final int handleCommentOrCdataStart() throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; // Let's first see if it's a comment (simpler) if (c == '-') { // Comment if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c != '-') { reportTreeUnexpChar(c, " (expected '-' for COMMENT)"); } if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishComment(); } return (_currToken = COMMENT); } // If not, should be CDATA: if (c == '[') { // CDATA _currToken = CDATA; for (int i = 0; i < 6; ++i) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c != CDATA_STR.charAt(i)) { reportTreeUnexpChar(c, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)"); } } if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishCData(); } return CDATA; } reportTreeUnexpChar(c, " (expected either '-' for COMMENT or '[CDATA[' for CDATA section)"); return TOKEN_EOI; // never gets here } protected final int handlePIStart() throws XMLStreamException { _currToken = PROCESSING_INSTRUCTION; // Ok, first, need a name if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } // Ok, first, need a name char c = _inputBuffer[_inputPtr++]; _tokenName = parsePName(c); { // but is it "xml" (case insensitive)? String ln = _tokenName.getLocalName(); if (ln.length() == 3 && ln.equalsIgnoreCase("xml") && _tokenName.getPrefix() == null) { reportInputProblem(ErrorConsts.ERR_WF_PI_XML_TARGET); } } /* Let's then verify that we either get a space, or closing * '?>': this way we'll catch some problems right away, and also * simplify actual processing of contents. */ if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c <= INT_SPACE) { // Ok, let's skip the white space... 
while (true) { if (c == '\n') { markLF(); } else if (c == '\r') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } else if (c != ' ' && c != '\t') { throwInvalidSpace(c); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr]; if (c > 0x0020) { break; } ++_inputPtr; } // Ok, got non-space, need to push back: if (_cfgLazyParsing) { _tokenIncomplete = true; } else { finishPI(); } } else { if (c != INT_QMARK) { reportMissingPISpace(c); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c != '>') { reportMissingPISpace(c); } _textBuilder.resetWithEmpty(); _tokenIncomplete = false; } return PROCESSING_INSTRUCTION; } /** * @return Code point for the entity that expands to a valid XML * content character. */ protected final int handleCharEntity() throws XMLStreamException { // Hex or decimal? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; int value = 0; if (c == 'x') { // hex while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == ';') { break; } value = value << 4; if (c <= '9' && c >= '0') { value += (c - '0'); } else if (c >= 'a' && c <= 'f') { value += 10 + (c - 'a'); } else if (c >= 'A' && c <= 'F') { value += 10 + (c - 'A'); } else { throwUnexpectedChar(c, "; expected a hex digit (0-9a-fA-F)"); } if (value > MAX_UNICODE_CHAR) { // Overflow? reportEntityOverflow(); } } } else { // numeric (decimal) while (c != ';') { if (c <= '9' && c >= '0') { value = (value * 10) + (c - '0'); if (value > MAX_UNICODE_CHAR) { // Overflow? 
reportEntityOverflow(); } } else { throwUnexpectedChar(c, "; expected a decimal number"); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; } } // Ok, and then need to check result is a valid XML content char: if (value >= 0xD800) { // note: checked for overflow earlier if (value < 0xE000) { // no surrogates via entity expansion reportInvalidXmlChar(value); } if (value == 0xFFFE || value == 0xFFFF) { reportInvalidXmlChar(value); } } else if (value < 32) { // XML 1.1 allows most other chars; 1.0 does not: if (value != INT_LF && value != INT_CR && value != INT_TAB) { if (!_xml11 || value == 0) { reportInvalidXmlChar(value); } } } return value; } protected final int handleStartElement(char c) throws XMLStreamException { _currToken = START_ELEMENT; _currNsCount = 0; PName elemName = parsePName(c); /* Ok. Need to create a qualified name. Simplest for element * in default ns (no extra work -- expressed as null binding); * otherwise need to find binding */ String prefix = elemName.getPrefix(); boolean allBound; // flag to check 'late' bindings if (prefix == null) { // element in default ns allBound = true; // which need not be bound } else { elemName = bindName(elemName, prefix); allBound = elemName.isBound(); } _tokenName = elemName; _currElem = new ElementScope(elemName, _currElem); // And then attribute parsing loop: int attrPtr = 0; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; // Intervening space to skip? 
if (c <= INT_SPACE) { do { if (c == INT_LF) { markLF(); } else if (c == INT_CR) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } else if (c != ' ' && c != '\t') { throwInvalidSpace(c); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; } while (c <= INT_SPACE); } else if (c != INT_SLASH && c != INT_GT) { throwUnexpectedChar(c, " expected space, or '>' or \"/>\""); } // Ok; either need to get an attribute name, or end marker: if (c == INT_SLASH) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c != '>') { throwUnexpectedChar(c, " expected '>'"); } _isEmptyTag = true; break; } else if (c == '>') { _isEmptyTag = false; break; } else if (c == '<') { reportInputProblem("Unexpected '<' character in element (missing closing '>'?)"); } // Ok, an attr name: PName attrName = parsePName(c); prefix = attrName.getPrefix(); boolean isNsDecl; if (prefix == null) { // can be default ns decl: isNsDecl = (attrName.getLocalName() == "xmlns"); } else { // May be a namespace decl though? 
if (prefix == "xmlns") { isNsDecl = true; } else { attrName = bindName(attrName, prefix); if (allBound) { allBound = attrName.isBound(); } isNsDecl = false; } } // Optional space to skip again while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c > INT_SPACE) { break; } if (c == '\n') { markLF(); } else if (c == '\r') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } else if (c != ' ' && c != '\t') { throwInvalidSpace(c); } } if (c != '=') { throwUnexpectedChar(c, " expected '='"); } // Optional space to skip again while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c > INT_SPACE) { break; } if (c == '\n') { markLF(); } else if (c == '\r') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } else if (c != ' ' && c != '\t') { throwInvalidSpace(c); } } if (c != '"' && c != '\'') { throwUnexpectedChar(c, " Expected a quote"); } /* Ok, finally: value parsing. However, ns URIs are to be handled * different from attribute values... let's offline URIs, since * they should be less common than attribute values. */ if (isNsDecl) { // default ns, or explicit? handleNsDeclaration(attrName, c); ++_currNsCount; } else { // nope, a 'real' attribute: attrPtr = collectValue(attrPtr, c, attrName); } } { // Note: this call also checks attribute uniqueness int act = _attrCollector.finishLastValue(attrPtr); if (act < 0) { // error, dup attr indicated by -1 act = _attrCollector.getCount(); // let's get correct count reportInputProblem(_attrCollector.getErrorMsg()); } _attrCount = act; } ++_depth; /* Was there any prefix that wasn't bound prior to use? * That's legal, assuming declaration was found later on... 
* let's check */ if (!allBound) { if (!elemName.isBound()) { // element itself unbound reportUnboundPrefix(_tokenName, false); } for (int i = 0, len = _attrCount; i < len; ++i) { PName attrName = _attrCollector.getName(i); if (!attrName.isBound()) { reportUnboundPrefix(attrName, true); } } } return START_ELEMENT; } /** * This method implements the tight loop for parsing attribute * values. It's off-lined from the main start element method to * simplify main method, which makes code more maintainable * and possibly easier for JIT/HotSpot to optimize. */ private final int collectValue(int attrPtr, char quoteChar, PName attrName) throws XMLStreamException { char[] attrBuffer = _attrCollector.startNewValue(attrName, attrPtr); final int[] TYPES = sCharTypes.ATTR_CHARS; value_loop: while (true) { char c; ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (attrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } int max = _inputEnd; { int max2 = ptr + (attrBuffer.length - attrPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = _inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } attrBuffer[attrPtr++] = c; } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } // fall through case XmlCharTypes.CT_WS_LF: markLF(); // fall through case XmlCharTypes.CT_WS_TAB: // Plus, need to convert these all to simple space c = ' '; break; case XmlCharTypes.CT_LT: throwUnexpectedChar(c, "'<' not allowed in attribute value"); case XmlCharTypes.CT_AMP: { int d = handleEntityInText(false); if (d == 0) { // unexpanded general entity... 
not good reportUnexpandedEntityInAttr(attrName, false); } // Ok; does it need a surrogate though? (over 16 bits) if ((d >> 16) != 0) { d -= 0x10000; attrBuffer[attrPtr++] = (char) (0xD800 | (d >> 10)); d = 0xDC00 | (d & 0x3FF); if (attrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } } c = (char) d; } break; case XmlCharTypes.CT_ATTR_QUOTE: if (c == quoteChar) { break value_loop; } // default: // Other chars are not important here... } } else if (c >= 0xD800) { if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); attrBuffer[attrPtr++] = c; // Need to ensure room for one more if (attrPtr >= attrBuffer.length) { attrBuffer = _attrCollector.valueBufferFull(); } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } // We know there's room for at least one more char attrBuffer[attrPtr++] = c; } return attrPtr; } /** * Method called from the main START_ELEMENT handling loop, to * parse namespace URI values. */ private void handleNsDeclaration(PName name, char quoteChar) throws XMLStreamException { int attrPtr = 0; char[] attrBuffer = _nameBuffer; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; if (c == quoteChar) { break; } if (c == '&') { // entity int d = handleEntityInText(false); if (d == 0) { // general entity; should never happen reportUnexpandedEntityInAttr(name, true); } // Ok; does it need a surrogate though? 
(over 16 bits) if ((d >> 16) != 0) { if (attrPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } d -= 0x10000; attrBuffer[attrPtr++] = (char) (0xD800 | (d >> 10)); d = 0xDC00 | (d & 0x3FF); } c = (char) d; } else if (c == '<') { // error throwUnexpectedChar(c, "'<' not allowed in attribute value"); } else { if (c < INT_SPACE) { if (c == '\n') { markLF(); } else if (c == '\r') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); c = '\n'; } else if (c != '\t') { throwInvalidSpace(c); } } } if (attrPtr >= attrBuffer.length) { _nameBuffer = attrBuffer = DataUtil.growArrayBy(attrBuffer, attrBuffer.length); } attrBuffer[attrPtr++] = c; } /* Simple optimization: for default ns removal (or, with * ns 1.1, any other as well), will use empty value... no * need to try to intern: */ if (attrPtr == 0) { bindNs(name, ""); } else { String uri = _config.canonicalizeURI(attrBuffer, attrPtr); bindNs(name, uri); } } protected final int handleEndElement() throws XMLStreamException { --_depth; _currToken = END_ELEMENT; // Ok, at this point we have seen '/', need the name _tokenName = _currElem.getName(); String pname = _tokenName.getPrefixedName(); char c; int i = 0; int len = pname.length(); do { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c != pname.charAt(i)) { reportUnexpectedEndTag(pname); } } while (++i < len); // Can still have a problem, if name didn't end there... 
if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c <= ' ') { c = skipInternalWs(false, null); } else if (c != '>') { if (c == ':' || XmlChars.is10NameChar(c)) { reportUnexpectedEndTag(pname); } } if (c != '>') { throwUnexpectedChar(c, " expected space or closing '>'"); } return END_ELEMENT; } protected final int handleEntityInText(boolean inAttr) throws XMLStreamException { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; if (c == '#') { return handleCharEntity(); } String start; if (c == 'a') { // amp or apos? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 'm') { // amp? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 'p') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == ';') { return INT_AMP; } start = "amp"; } else { start = "am"; } } else if (c == 'p') { // apos? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 'o') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 's') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == ';') { return INT_APOS; } start = "apos"; } else { start = "apo"; } } else { start = "ap"; } } else { start = "a"; } } else if (c == 'l') { // lt? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 't') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == ';') { return INT_LT; } start = "lt"; } else { start = "l"; } } else if (c == 'g') { // gt? 
if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 't') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == ';') { return INT_GT; } start = "gt"; } else { start = "g"; } } else if (c == 'q') { // quot? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 'u') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 'o') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == 't') { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; if (c == ';') { return INT_QUOTE; } start = "quot"; } else { start = "quo"; } } else { start = "qu"; } } else { start = "q"; } } else { start = ""; } final int[] TYPES = sCharTypes.NAME_CHARS; /* All righty: we have the beginning of the name, plus the first * char too. So let's see what we can do with it. */ char[] cbuf = _nameBuffer; int cix = 0; for (int len = start.length(); cix < len; ++cix) { cbuf[cix] = start.charAt(cix); } //int colon = -1; while (c != ';') { boolean ok; // Has to be a valid name start char though: if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_NAME_NONE: case XmlCharTypes.CT_NAME_COLON: // not ok for entities? case XmlCharTypes.CT_NAME_NONFIRST: ok = (cix > 0); break; case XmlCharTypes.CT_NAME_ANY: ok = true; break; default: ok = false; break; } } else { if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception int value = decodeSurrogate(c); if (cix >= cbuf.length) { _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length); } cbuf[cix++] = c; c = _inputBuffer[_inputPtr-1]; // was read by decode func ok = (cix == 0) ? 
XmlChars.is10NameStartChar(value) : XmlChars.is10NameChar(value); } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); ok = false; // never gets here } else { ok = true; } } if (!ok) { reportInvalidNameChar(c, cix); } if (cix >= cbuf.length) { _nameBuffer = cbuf = DataUtil.growArrayBy(cbuf, cbuf.length); } cbuf[cix++] = c; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = _inputBuffer[_inputPtr++]; } // Ok, let's construct a (temporary) entity name, then: String pname = new String(cbuf, 0, cix); // (note: hash is dummy... not to be compared to anything etc) _tokenName = new PNameC(pname, null, pname, 0); /* One more thing: do we actually allow entities in this mode * and with this event? */ if (_config.willExpandEntities()) { reportInputProblem("General entity reference (&"+pname+";) encountered in entity expanding mode: operation not (yet) implemented"); } if (inAttr) { reportInputProblem("General entity reference (&"+pname+";) encountered in attribute value, in non-entity-expanding mode: no way to handle it"); } return 0; } @Override protected final void finishComment() throws XMLStreamException { final int[] TYPES = sCharTypes.OTHER_CHARS; final char[] inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.resetWithEmpty(); int outPtr = 0; main_loop: while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = c; } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case 
XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_HYPHEN: // '-->'? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '-') { // ok, must be end then ++_inputPtr; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr++] != '>') { reportDoubleHyphenInComments(); } break main_loop; } break; // default: // Other types are not important here.. } } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } // We know there's room for one more: outputBuffer[outPtr++] = c; } _textBuilder.setCurrentLength(outPtr); } @Override protected final void finishPI() throws XMLStreamException { final int[] TYPES = sCharTypes.OTHER_CHARS; final char[] inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.resetWithEmpty(); int outPtr = 0; main_loop: while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = c; } _inputPtr = ptr; } if (c 
<= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == CHAR_LF) { ++_inputPtr; } markLF(); c = '\n'; } break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_QMARK: // '?>'? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '>') { ++_inputPtr; break main_loop; } break; // default: // Other types are not important here... } } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } // We know there's room for one more: outputBuffer[outPtr++] = c; } _textBuilder.setCurrentLength(outPtr); } @Override protected final void finishDTD(boolean copyContents) throws XMLStreamException { char[] outputBuffer = copyContents ? _textBuilder.resetWithEmpty() : null; int outPtr = 0; final int[] TYPES = sCharTypes.DTD_CHARS; boolean inDecl = false; // in declaration/directive? int quoteChar = 0; // inside quoted string? main_loop: while (true) { char c; /* First we'll have a quickie loop for speeding through * uneventful chars... 
*/ ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } int max = _inputEnd; if (outputBuffer != null) { if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } } while (ptr < max) { c = _inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } if (outputBuffer != null) { outputBuffer[outPtr++] = c; } } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_DTD_QUOTE: // apos or quot if (quoteChar == 0) { quoteChar = c; } else { if (quoteChar == c) { quoteChar = 0; } } break; case XmlCharTypes.CT_DTD_LT: if (!inDecl) { inDecl = true; } break; case XmlCharTypes.CT_DTD_GT: if (quoteChar == 0) { inDecl = false; } break; case XmlCharTypes.CT_DTD_RBRACKET: if (!inDecl && quoteChar == 0) { break main_loop; } break; // default: // Other types are not important here... } } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); if (outputBuffer != null) { outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } if (outputBuffer != null) { // has room for one more outputBuffer[outPtr++] = c; } } if (outputBuffer != null) { _textBuilder.setCurrentLength(outPtr); } // but still need to match the '>'... 
char c = skipInternalWs(false, null);
        if (c != '>') {
            throwUnexpectedChar(c, " expected '>' after the internal subset");
        }
    }

    /**
     * Reads the contents of a CDATA section into the text builder, up to
     * and including the closing "]]&gt;" marker (which is consumed but not
     * stored). CR and CR+LF are converted to a single LF.
     *
     * @throws XMLStreamException for invalid characters or broken
     *   surrogate pairs
     */
    @Override
    protected final void finishCData()
        throws XMLStreamException
    {
        final int[] TYPES = sCharTypes.OTHER_CHARS;
        final char[] inputBuffer = _inputBuffer;
        char[] outputBuffer = _textBuilder.resetWithEmpty();
        int outPtr = 0;

        /* At this point, space (if any) has been skipped, and we are
         * to parse and store the contents
         */
        main_loop:
        while (true) {
            char c;
            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                int max = _inputEnd;
                {
                    // never copy more than the output segment can take
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (ptr < max) {
                    c = inputBuffer[ptr++];
                    if (c <= 0xFF) {
                        if (TYPES[c] != 0) {
                            _inputPtr = ptr;
                            break ascii_loop;
                        }
                    } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF
                        _inputPtr = ptr;
                        break ascii_loop;
                    }
                    outputBuffer[outPtr++] = c;
                }
                _inputPtr = ptr;
            }
            // And then exceptions:
            if (c <= 0xFF) {
                switch (TYPES[c]) {
                case XmlCharTypes.CT_INVALID:
                    c = handleInvalidXmlChar(c);
                    // must not fall through to CR handling: that would
                    // replace the handler's result with a linefeed
                    break;
                case XmlCharTypes.CT_WS_CR:
                    {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        if (inputBuffer[_inputPtr] == '\n') {
                            ++_inputPtr;
                        }
                        markLF();
                    }
                    c = '\n';
                    break;
                case XmlCharTypes.CT_WS_LF:
                    markLF();
                    break;
                case XmlCharTypes.CT_RBRACKET: // close ']]>' marker?
                    /* Ok: let's just parse all consecutive right brackets,
                     * and see if followed by greater-than char. This because
                     * we can only push back at most one char at a time, and
                     * thus can't easily just check a subset
                     */
                    int count = 0; // ignore first bracket
                    char d;

                    do {
                        if (_inputPtr >= _inputEnd) {
                            loadMoreGuaranteed();
                        }
                        d = _inputBuffer[_inputPtr];
                        if (d != ']') {
                            break;
                        }
                        ++_inputPtr;
                        ++count;
                    } while (true);

                    // Was the marker found?
boolean ok = (d == '>' && count >= 1); if (ok) { --count; } // Brackets to copy to output? for (; count > 0; --count) { outputBuffer[outPtr++] = ']'; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } } if (ok) { ++_inputPtr; // to consume '>' break main_loop; } break; // default: // Other types are not important here... } } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } // Ok, can output the char; there's room for one char at least outputBuffer[outPtr++] = c; } _textBuilder.setCurrentLength(outPtr); /* 03-Feb-2009, tatu: To support coalescing mode, may need to * do some extra work */ if (_cfgCoalescing && !_entityPending) { finishCoalescedText(); } } @Override protected final void finishCharacters() throws XMLStreamException { int outPtr; char[] outputBuffer; // Ok, so what was the first char / entity? { int c = mTmpChar; if (c < 0) { // from entity; can just copy as is c = -c; outputBuffer = _textBuilder.resetWithEmpty(); outPtr = 0; if ((c >> 16) != 0) { // surrogate pair? c -= 0x10000; /* Note: after resetting the buffer, it's known to have * space for more than 2 chars we need to add */ outputBuffer[outPtr++] = (char) (0xD800 | (c >> 10)); c = 0xDC00 | (c & 0x3FF); } outputBuffer[outPtr++] = (char) c; } else { // white space that we are interested in? 
if (c == INT_CR || c == INT_LF) { ++_inputPtr; // wasn't advanced yet, in this case outPtr = checkInTreeIndentation((char) c); if (outPtr < 0) { return; } // Above call also initializes the text builder appropriately outputBuffer = _textBuilder.getBufferWithoutReset(); } else { outputBuffer = _textBuilder.resetWithEmpty(); outPtr = 0; } } } final int[] TYPES = sCharTypes.TEXT_CHARS; final char[] inputBuffer = _inputBuffer; main_loop: while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = c; } _inputPtr = ptr; } // And then exceptions: if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (inputBuffer[ptr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_LT: --_inputPtr; break main_loop; case XmlCharTypes.CT_AMP: { int d = handleEntityInText(false); if (d == 0) { // unexpandable general parsed entity // _inputPtr set by entity expansion method _entityPending = true; break main_loop; } // Ok; does it need a surrogate though? 
(over 16 bits) if ((d >> 16) != 0) { d -= 0x10000; outputBuffer[outPtr++] = (char) (0xD800 | (d >> 10)); // Need to ensure room for one more char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } d = (0xDC00 | (d & 0x3FF)); } c = (char) d; } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? { // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = inputBuffer[_inputPtr]; if (c != ']') { break; } ++_inputPtr; // to skip past bracket ++count; } if (c == '>' && count > 1) { reportIllegalCDataEnd(); } // Nope. Need to output all brackets, then; except // for one that can be left for normal output while (count > 1) { outputBuffer[outPtr++] = ']'; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } // Need to ensure room for one more char --count; } } // Can just output the first ']' along normal output c = ']'; break; // default: // Other types are not important here... } } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } outputBuffer[outPtr++] = c; } _textBuilder.setCurrentLength(outPtr); // 03-Feb-2009, tatu: Need to support coalescing mode too: if (_cfgCoalescing && !_entityPending) { finishCoalescedText(); } } @Override protected final void finishSpace() throws XMLStreamException { /* Ok: so, mTmpChar contains first space char. If it looks * like indentation, we can probably optimize a bit... 
*/
        char tmp = (char)mTmpChar;
        char[] outputBuffer;
        int outPtr;

        if (tmp == '\r' || tmp == '\n') {
            // linefeed first: let the indentation check try its shortcut
            outPtr = checkPrologIndentation(tmp);
            if (outPtr < 0) {
                return;
            }
            // Above call also initializes the text builder appropriately
            outputBuffer = _textBuilder.getBufferWithoutReset();
        } else {
            outputBuffer = _textBuilder.resetWithEmpty();
            outputBuffer[0] = tmp;
            outPtr = 1;
        }

        // work on a local copy of the pointer; written back after the loop
        int ptr = _inputPtr;

        while (true) {
            if (ptr >= _inputEnd) {
                if (!loadMore()) {
                    break;
                }
                ptr = _inputPtr;
            }
            char c = _inputBuffer[ptr];
            if (c > INT_SPACE) {
                break;
            }
            ++ptr;

            if (c == INT_LF) {
                markLF(ptr);
            } else if (c == INT_CR) {
                if (ptr >= _inputEnd) {
                    if (!loadMore()) { // still need to output the lf
                        if (outPtr >= outputBuffer.length) {
                            outputBuffer = _textBuilder.finishCurrentSegment();
                            outPtr = 0;
                        }
                        outputBuffer[outPtr++] = '\n';
                        break;
                    }
                    ptr = _inputPtr;
                }
                // CR+LF counts as a single linefeed
                if (_inputBuffer[ptr] == '\n') {
                    ++ptr;
                }
                markLF(ptr);
                c = '\n'; // need to convert to canonical lf
            } else if (c != ' ' && c != '\t') {
                // sync the pointer before reporting the problem
                _inputPtr = ptr;
                throwInvalidSpace(c);
            }

            // Ok, can output the char
            if (outPtr >= outputBuffer.length) {
                outputBuffer = _textBuilder.finishCurrentSegment();
                outPtr = 0;
            }
            outputBuffer[outPtr++] = c;
        }

        _inputPtr = ptr;
        _textBuilder.setCurrentLength(outPtr);
    }

    /*
    /**********************************************************************
    /* 2nd level parsing for coalesced text
    /**********************************************************************
     */

    /**
     * Method that gets called after a primary text segment (of type
     * CHARACTERS or CDATA, not applicable to SPACE) has been read in
     * text buffer. Method has to see if the following event would
     * be textual as well, and if so, read it (and any other following
     * textual segments).
*/
    protected final void finishCoalescedText()
        throws XMLStreamException
    {
        // Keep appending segments until something non-textual follows
        while (true) {
            // no matter what, will need (and can get) one char
            if (_inputPtr >= _inputEnd) {
                if (!loadMore()) { // most likely an error, will be handled later on
                    return;
                }
            }

            if (_inputBuffer[_inputPtr] == '<') { // markup of some kind
                /* In worst case, need 3 chars ("<![") all in all to know
                 * if we are getting a CDATA section
                 */
                if ((_inputPtr + 3) >= _inputEnd) {
                    if (!loadAndRetain(3)) {
                        // probably an error, but will be handled later
                        return;
                    }
                }
                if (_inputBuffer[_inputPtr+1] != '!'
                    || _inputBuffer[_inputPtr+2] != '[') {
                    // can't be CDATA, we are done here
                    return;
                }
                // but let's verify it still:
                _inputPtr += 3;
                // the next 6 chars must match CDATA_STR one by one
                for (int i = 0; i < 6; ++i) {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    char c = _inputBuffer[_inputPtr++];
                    if (c != CDATA_STR.charAt(i)) {
                        reportTreeUnexpChar(c, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)");
                    }
                }
                finishCoalescedCData();
            } else { // textual (or entity, error etc)
                finishCoalescedCharacters();
                // an unexpandable entity ends the coalesced run
                if (_entityPending) {
                    break;
                }
            }
        }
    }

    // note: code mostly copied from 'finishCharacters', just simplified
    // in some places
    protected final void finishCoalescedCData()
        throws XMLStreamException
    {
        final int[] TYPES = sCharTypes.OTHER_CHARS;
        final char[] inputBuffer = _inputBuffer;

        // append to whatever the text builder already holds
        char[] outputBuffer = _textBuilder.getBufferWithoutReset();
        int outPtr = _textBuilder.getCurrentLength();

        /* At this point, space (if any) has been skipped, and we are
         * to parse and store the contents
         */
        main_loop:
        while (true) {
            char c;
            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                if (ptr >= _inputEnd) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                }
                if (outPtr >= outputBuffer.length) {
                    outputBuffer = _textBuilder.finishCurrentSegment();
                    outPtr = 0;
                }
                int max = _inputEnd;
                {
                    // never copy more than the output segment can take
                    int max2 = ptr + (outputBuffer.length - outPtr);
                    if (max2 < max) {
                        max = max2;
                    }
                }
                while (ptr < max) {
                    c = inputBuffer[ptr++];
                    if (c <= 0xFF) {
                        if (TYPES[c] !=
0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = c; } _inputPtr = ptr; } // And then exceptions: if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_RBRACKET: // close ']]>' marker? /* Ok: let's just parse all consequtive right brackets, * and see if followed by greater-than char. This because * we can only push back at most one char at a time, and * thus can't easily just check a subset */ int count = 0; // ignore first bracket char d; do { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } d = _inputBuffer[_inputPtr]; if (d != ']') { break; } ++_inputPtr; ++count; } while (true); // Was the marker found? boolean ok = (d == '>' && count >= 1); if (ok) { --count; } // Brackets to copy to output? for (; count > 0; --count) { outputBuffer[outPtr++] = ']'; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } } if (ok) { ++_inputPtr; // to consume '>' break main_loop; } break; // default: // Other types are not important here... 
} } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } // Ok, can output the char; there's room for one char at least outputBuffer[outPtr++] = c; } _textBuilder.setCurrentLength(outPtr); } // note: code mostly copied from 'finishCharacters', just simplified // in some places protected final void finishCoalescedCharacters() throws XMLStreamException { // first char can't be from (char) entity (wrt finishCharacters) final int[] TYPES = sCharTypes.TEXT_CHARS; final char[] inputBuffer = _inputBuffer; char[] outputBuffer = _textBuilder.getBufferWithoutReset(); int outPtr = _textBuilder.getCurrentLength(); main_loop: while (true) { char c; ascii_loop: while (true) { // tight loop for ascii chars int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } int max = _inputEnd; { int max2 = ptr + (outputBuffer.length - outPtr); if (max2 < max) { max = max2; } } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } outputBuffer[outPtr++] = c; } _inputPtr = ptr; } // And then exceptions: if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (inputBuffer[ptr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_LT: --_inputPtr; break main_loop; case XmlCharTypes.CT_AMP: { 
int d = handleEntityInText(false); if (d == 0) { // unexpandable general parsed entity // _inputPtr set by entity expansion method _entityPending = true; break main_loop; } // Ok; does it need a surrogate though? (over 16 bits) if ((d >> 16) != 0) { d -= 0x10000; outputBuffer[outPtr++] = (char) (0xD800 | (d >> 10)); // Need to ensure room for one more char if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } d = (0xDC00 | (d & 0x3FF)); } c = (char) d; } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? { // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = inputBuffer[_inputPtr]; if (c != ']') { break; } ++_inputPtr; // to skip past bracket ++count; } if (c == '>' && count > 1) { reportIllegalCDataEnd(); } // Nope. Need to output all brackets, then; except // for one that can be left for normal output while (count > 1) { outputBuffer[outPtr++] = ']'; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } // Need to ensure room for one more char --count; } } // Can just output the first ']' along normal output c = ']'; break; // default: // Other types are not important here... } } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception char d = checkSurrogate(c); outputBuffer[outPtr++] = c; if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } c = d; } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } outputBuffer[outPtr++] = c; } _textBuilder.setCurrentLength(outPtr); } /** * Method that gets called after a primary text segment (of type * CHARACTERS or CDATA, not applicable to SPACE) has been skipped. 
* Method has to see if the following event would
     * be textual as well, and if so, skip it (and any other following
     * textual segments).
     *
     * @return True if we encountered an unexpandable entity
     */
    @Override
    protected final boolean skipCoalescedText()
        throws XMLStreamException
    {
        // Keep skipping segments until something non-textual follows
        while (true) {
            // no matter what, will need (and can get) one char
            if (_inputPtr >= _inputEnd) {
                if (!loadMore()) { // most likely an error, will be handled later on
                    return false;
                }
            }

            if (_inputBuffer[_inputPtr] == '<') { // markup of some kind
                /* In worst case, need 3 chars ("<![") all in all to know
                 * if we are getting a CDATA section
                 */
                if ((_inputPtr + 3) >= _inputEnd) {
                    if (!loadAndRetain(3)) { // probably an error, but will be handled later
                        return false;
                    }
                }
                if (_inputBuffer[_inputPtr+1] != '!'
                    || _inputBuffer[_inputPtr+2] != '[') {
                    // can't be CDATA, we are done here
                    return false;
                }
                // but let's verify it still:
                _inputPtr += 3;
                // the next 6 chars must match CDATA_STR one by one
                for (int i = 0; i < 6; ++i) {
                    if (_inputPtr >= _inputEnd) {
                        loadMoreGuaranteed();
                    }
                    char c = _inputBuffer[_inputPtr++];
                    if (c != CDATA_STR.charAt(i)) {
                        reportTreeUnexpChar(c, " (expected '"+CDATA_STR.charAt(i)+"' for CDATA section)");
                    }
                }
                skipCData();
            } else { // textual (or entity, error etc)
                if (skipCharacters()) {
                    return true;
                }
            }
        }
    }

    /*
    /**********************************************************************
    /* 2nd level parsing for skipping content
    /**********************************************************************
     */

    @Override
    protected final void skipComment()
        throws XMLStreamException
    {
        final int[] TYPES = sCharTypes.OTHER_CHARS;
        final char[] inputBuffer = _inputBuffer;

        while (true) {
            char c;

            // Then the tight ascii non-funny-char loop:
            ascii_loop:
            while (true) {
                int ptr = _inputPtr;
                int max = _inputEnd;
                if (ptr >= max) {
                    loadMoreGuaranteed();
                    ptr = _inputPtr;
                    max = _inputEnd;
                }
                // nothing is copied when skipping, so only input bounds matter
                while (ptr < max) {
                    c = inputBuffer[ptr++];
                    if (c <= 0xFF) {
                        if (TYPES[c] != 0) {
                            _inputPtr = ptr;
                            break ascii_loop;
                        }
                    } else if (c >= 0xD800) { // surrogates and
0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_HYPHEN: // '-->'? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '-') { // ok, must be end then ++_inputPtr; if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr++] != '>') { reportDoubleHyphenInComments(); } return; } break; } // default: // Other types are not important here... } } } @Override protected final void skipPI() throws XMLStreamException { final int[] TYPES = sCharTypes.OTHER_CHARS; final char[] inputBuffer = _inputBuffer; while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == CHAR_LF) { ++_inputPtr; } markLF(); } break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_QMARK: // '?>'? if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '>') { ++_inputPtr; return; } break; // default: // Other types are not important here... 
} } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception /*char d =*/ checkSurrogate(c); } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } // skipping, no need to output } } @Override protected final boolean skipCharacters() throws XMLStreamException { final int[] TYPES = sCharTypes.TEXT_CHARS; final char[] inputBuffer = _inputBuffer; while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (inputBuffer[_inputPtr] == CHAR_LF) { ++_inputPtr; } markLF(); } break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_LT: --_inputPtr; return false; case XmlCharTypes.CT_AMP: { int d = handleEntityInText(false); if (d == 0) { // unexpandable general parsed entity return true; } } break; case XmlCharTypes.CT_RBRACKET: // ']]>'? { // Let's then just count number of brackets -- // in case they are not followed by '>' int count = 1; while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } c = inputBuffer[_inputPtr]; if (c != ']') { break; } ++_inputPtr; // to skip past bracket ++count; } if (c == '>' && count > 1) { reportIllegalCDataEnd(); } } // Can just output the first ']' along normal output break; // default: // Other types are not important here... 
} } else if (c >= 0xD800) { // high-range, surrogates etc if (c < 0xE000) { // if ok, returns second surrogate; otherwise exception /*char d =*/ checkSurrogate(c); } else if (c >= 0xFFFE) { c = handleInvalidXmlChar(c); } } } } @Override protected final void skipCData() throws XMLStreamException { final int[] TYPES = sCharTypes.OTHER_CHARS; final char[] inputBuffer = _inputBuffer; while (true) { char c; // Then the tight ascii non-funny-char loop: ascii_loop: while (true) { int ptr = _inputPtr; int max = _inputEnd; if (ptr >= max) { loadMoreGuaranteed(); ptr = _inputPtr; max = _inputEnd; } while (ptr < max) { c = inputBuffer[ptr++]; if (c <= 0xFF) { if (TYPES[c] != 0) { _inputPtr = ptr; break ascii_loop; } } else if (c >= 0xD800) { // surrogates and 0xFFFE/0xFFFF _inputPtr = ptr; break ascii_loop; } } _inputPtr = ptr; } if (c <= 0xFF) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { int ptr = _inputPtr; if (ptr >= _inputEnd) { loadMoreGuaranteed(); ptr = _inputPtr; } if (inputBuffer[ptr] == CHAR_LF) { ++ptr; ++_inputPtr; } markLF(ptr); } break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_RBRACKET: // ']]>'? { // end is nigh? int count = 0; do { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } ++count; c = _inputBuffer[_inputPtr++]; } while (c == ']'); if (c == '>') { if (count > 1) { // gotcha return; } // can still skip plain ']>'... } else { --_inputPtr; // need to push back last char } } break; // default: // Other types are not important here... 
}
            } else if (c >= 0xD800) { // high-range, surrogates etc
                if (c < 0xE000) {
                    // if ok, returns second surrogate; otherwise exception
                    /*char d =*/ checkSurrogate(c);
                } else if (c >= 0xFFFE) {
                    c = handleInvalidXmlChar(c);
                }
            }
        }
    }

    @Override
    protected final void skipSpace()
        throws XMLStreamException
    {
        // mTmpChar has a space, but it's been checked, can ignore
        // work on a local copy of the pointer; written back after the loop
        int ptr = _inputPtr;

        while (true) {
            if (ptr >= _inputEnd) {
                if (!loadMore()) {
                    break;
                }
                ptr = _inputPtr;
            }
            char c = _inputBuffer[ptr];
            if (c > ' ') { // !!! TODO: xml 1.1 ws
                break;
            }
            ++ptr;

            if (c == '\n') {
                markLF(ptr);
            } else if (c == '\r') {
                if (ptr >= _inputEnd) {
                    if (!loadMore()) {
                        break;
                    }
                    ptr = _inputPtr;
                }
                // CR+LF counts as a single linefeed
                if (_inputBuffer[ptr] == '\n') {
                    ++ptr;
                }
                markLF(ptr);
            } else if (c != ' ' && c != '\t') {
                // sync the pointer before reporting the problem
                _inputPtr = ptr;
                throwInvalidSpace(c);
            }
        }

        _inputPtr = ptr;
    }

    /*
    /**********************************************************************
    /* Entity/name handling
    /**********************************************************************
     */

    /**
     * Skips white space at the current position.
     *
     * @param reqd whether at least one white space character must be
     *   present; if it is required but missing, an error is reported
     * @param msg text appended to the error message when required
     *   white space is missing
     *
     * @return First char following skipped white space
     */
    protected char skipInternalWs(boolean reqd, String msg)
        throws XMLStreamException
    {
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        char c = _inputBuffer[_inputPtr++];
        if (c > INT_SPACE) {
            if (!reqd) {
                return c;
            }
            reportTreeUnexpChar(c, " (expected white space "+msg+")");
        }
        do {
            // But let's first handle the space we already got:
            if (c == '\n') {
                markLF();
            } else if (c == '\r') {
                if (_inputPtr >= _inputEnd) {
                    loadMoreGuaranteed();
                }
                if (_inputBuffer[_inputPtr] == '\n') {
                    ++_inputPtr;
                }
                markLF();
            } else if (c != ' ' && c != '\t') {
                throwInvalidSpace(c);
            }
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            c = _inputBuffer[_inputPtr++];
        } while (c <= INT_SPACE);
        return c;
    }

    // Matches the rest of a keyword; index 0 is skipped (the loop starts at 1)
    private final void matchAsciiKeyword(String keyw)
        throws XMLStreamException
    {
        for (int i = 1, len = keyw.length(); i < len; ++i) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            char c = _inputBuffer[_inputPtr++];
            if (c != keyw.charAt(i)) {
reportTreeUnexpChar(c, " (expected '"+keyw.charAt(i)+"' for "+keyw+" keyword)");
            }
        }
    }

    /**
     * Checks whether white space starting with the given linefeed
     * character is plain indentation (a linefeed followed by a run of
     * spaces or tabs) that can be handed to the text builder in compact
     * form.
     *<p>
     * Note: consecutive white space is only considered indentation,
     * if the following token seems like a tag (start/end). This so
     * that if a CDATA section follows, it can be coalesced in
     * coalescing mode. Although we could check if coalescing mode is
     * enabled, this should seldom have significant effect either way,
     * so it removes one possible source of problems in coalescing mode.
     *
     * @param c the linefeed character (CR or LF) that was just consumed
     *
     * @return -1, if indentation was handled; offset in the output
     *   buffer, if not
     */
    protected final int checkInTreeIndentation(char c)
        throws XMLStreamException
    {
        if (c == '\r') {
            // First a degenerate case, a lone \r:
            if (_inputPtr >= _inputEnd && !loadMore()) {
                _textBuilder.resetWithIndentation(0, CHAR_SPACE);
                return -1;
            }
            if (_inputBuffer[_inputPtr] == '\n') {
                ++_inputPtr;
            }
        }
        markLF();
        // Then need an indentation char (or start/end tag):
        if (_inputPtr >= _inputEnd) {
            loadMoreGuaranteed();
        }
        c = _inputBuffer[_inputPtr];
        if (c != ' ' && c != '\t') {
            // May still be indentation, if it's lt + non-exclamation mark
            if (c == '<') {
                // only peeks at data already in the buffer
                if ((_inputPtr+1) < _inputEnd && _inputBuffer[_inputPtr+1] != '!') {
                    _textBuilder.resetWithIndentation(0, ' ');
                    return -1;
                }
            }
            // not indentation: just the linefeed goes to regular output
            char[] outputBuffer = _textBuilder.resetWithEmpty();
            outputBuffer[0] = '\n';
            _textBuilder.setCurrentLength(1);
            return 1;
        }
        // So how many do we get?
        ++_inputPtr;
        int count = 1;
        int max = (c == ' ') ? TextBuilder.MAX_INDENT_SPACES : TextBuilder.MAX_INDENT_TABS;
        while (count <= max) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            char c2 = _inputBuffer[_inputPtr];
            if (c2 != c) {
                // Has to be followed by a start/end tag...
if (c2 == '<' && (_inputPtr+1) < _inputEnd
                    && _inputBuffer[_inputPtr+1] != '!') {
                    _textBuilder.resetWithIndentation(count, c);
                    return -1;
                }
                break;
            }
            ++_inputPtr;
            ++count;
        }
        // Nope, hit something else, or too long: need to just copy the stuff
        // we know buffer has enough room either way
        char[] outputBuffer = _textBuilder.resetWithEmpty();
        outputBuffer[0] = '\n';
        for (int i = 1; i <= count; ++i) {
            outputBuffer[i] = c;
        }
        count += 1; // to account for leading lf
        _textBuilder.setCurrentLength(count);
        return count;
    }

    /**
     * Prolog/epilog variant of the indentation check: a linefeed followed
     * by a run of identical spaces or tabs is handed to the text builder
     * in compact form. Unlike the in-tree variant, running out of input
     * is tolerated here.
     *
     * @param c the linefeed character (CR or LF) the white space starts with
     *
     * @return -1, if indentation was handled; offset in the output
     *   buffer, if not
     */
    protected final int checkPrologIndentation(char c)
        throws XMLStreamException
    {
        if (c == '\r') {
            // First a degenerate case, a lone \r:
            if (_inputPtr >= _inputEnd && !loadMore()) {
                _textBuilder.resetWithIndentation(0, CHAR_SPACE);
                return -1;
            }
            if (_inputBuffer[_inputPtr] == '\n') {
                ++_inputPtr;
            }
        }
        markLF();
        // Ok, indentation char?
        if (_inputPtr >= _inputEnd && !loadMore()) {
            _textBuilder.resetWithIndentation(0, CHAR_SPACE);
            return -1;
        }
        c = _inputBuffer[_inputPtr]; // won't advance past the char yet
        if (c != ' ' && c != '\t') {
            // If lt, it's still indentation ok:
            if (c == '<') { // need
                _textBuilder.resetWithIndentation(0, CHAR_SPACE);
                return -1;
            }
            // Nope... something else
            char[] outputBuffer = _textBuilder.resetWithEmpty();
            outputBuffer[0] = '\n';
            _textBuilder.setCurrentLength(1);
            return 1;
        }
        // So how many do we get?
        ++_inputPtr;
        int count = 1;
        int max = (c == ' ') ? TextBuilder.MAX_INDENT_SPACES : TextBuilder.MAX_INDENT_TABS;
        while (true) {
            if (_inputPtr >= _inputEnd && !loadMore()) {
                break;
            }
            if (_inputBuffer[_inputPtr] != c) {
                break;
            }
            ++_inputPtr;
            ++count;
            if (count >= max) { // ok, can't share...
// but can build it still
                // we know buffer has enough room
                char[] outputBuffer = _textBuilder.resetWithEmpty();
                outputBuffer[0] = '\n';
                for (int i = 1; i <= count; ++i) {
                    outputBuffer[i] = c;
                }
                count += 1; // to account for leading lf
                _textBuilder.setCurrentLength(count);
                return count;
            }
        }
        // Ok, gotcha?
        _textBuilder.resetWithIndentation(count, c);
        return -1;
    }

    /**
     * Parses a (possibly prefixed) name whose first character has already
     * been read, and returns the matching symbol; names not yet in the
     * symbol table are fully validated and added via {@code addPName}.
     *
     * @param c first character of the name
     */
    protected PName parsePName(char c)
        throws XMLStreamException
    {
        char[] nameBuffer = _nameBuffer;

        /* Let's do just quick sanity check first; a thorough check will be
         * done later on if necessary, now we'll just do the very cheap
         * check to catch extra spaces etc.
         */
        if (c < INT_A) { // lowest acceptable start char, except for ':' that would be allowed in non-ns mode
            throwUnexpectedChar(c, "; expected a name start character");
        }
        nameBuffer[0] = c;
        // hash is accumulated as hash * 31 + char, one char at a time
        int hash = (int) c;
        int ptr = 1;

        while (true) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // peek only: the char ending the name must stay unconsumed
            c = _inputBuffer[_inputPtr];
            int d = (int) c;
            if (d < 65) {
                // Ok; "-" (45), "." (46) and "0"-"9"/":" (48 - 57/58) still name chars
                if (d < 45 || d > 58 || d == 47) {
                    // End of name, a single ascii char?
PName n = _symbols.findSymbol(nameBuffer, 0, ptr, hash);
                    if (n == null) {
                        n = addPName(nameBuffer, ptr, hash);
                    }
                    return n;
                }
            }
            ++_inputPtr;
            if (ptr >= nameBuffer.length) {
                // grow the shared name buffer; write offset is kept as is
                _nameBuffer = nameBuffer = DataUtil.growArrayBy(nameBuffer, nameBuffer.length);
            }
            nameBuffer[ptr++] = c;
            hash = (hash * 31) + d;
        }
    }

    /**
     * Fully validates a name that was not found in the symbol table and,
     * if it is valid, adds it to the table.
     *
     * @param nameBuffer buffer holding the name, starting at offset 0
     * @param nameLen number of chars in the name
     * @param hash hash code of the name, as calculated by the caller
     *
     * @return the newly added symbol
     */
    protected final PName addPName(char[] nameBuffer, int nameLen, int hash)
        throws XMLStreamException
    {
        // Let's validate completely, now:
        char c = nameBuffer[0];
        int namePtr = 1;
        int last_colon = -1; // where the colon is

        if (c < 0xD800 || c >= 0xE000) {
            if (!XmlChars.is10NameStartChar(c)) {
                reportInvalidNameChar(c, 0);
            }
        } else {
            if (nameLen == 1) {
                reportInvalidFirstSurrogate(c);
            }
            // Only returns if ok; throws exception otherwise
            checkSurrogateNameChar(c, nameBuffer[1], 0);
            ++namePtr;
        }

        for (; namePtr < nameLen; ++namePtr) {
            c = nameBuffer[namePtr];
            if (c < 0xD800 || c >= 0xE000) {
                if (c == ':') {
                    // at most one colon (prefix separator) is allowed
                    if (last_colon >= 0) {
                        reportMultipleColonsInName();
                    }
                    last_colon = namePtr;
                } else {
                    if (!XmlChars.is10NameChar(c)) {
                        reportInvalidNameChar(c, namePtr);
                    }
                }
            } else {
                if ((namePtr+1) >= nameLen) { // unpaired surrogate
                    reportInvalidFirstSurrogate(c);
                }
                // NOTE(review): unlike the first-char case above, namePtr is
                // not advanced past the second surrogate here; harmless only
                // as long as checkSurrogateNameChar rejects every pair
                checkSurrogateNameChar(c, nameBuffer[namePtr+1], namePtr);
            }
        }
        return _symbols.addSymbol(nameBuffer, 0, nameLen, hash);
    }

    /**
     * Parses a public identifier literal up to the closing quote,
     * collapsing runs of white space into a single space.
     *
     * @param quoteChar quote character that ends the literal
     */
    protected String parsePublicId(char quoteChar)
        throws XMLStreamException
    {
        char[] outputBuffer = _nameBuffer;
        int outPtr = 0;
        final int[] TYPES = XmlCharTypes.PUBID_CHARS;
        boolean addSpace = false;

        main_loop:
        while (true) {
            if (_inputPtr >= _inputEnd) {
                loadMoreGuaranteed();
            }
            // Easier to check without char type table, first:
            char c = _inputBuffer[_inputPtr++];
            if (c == quoteChar) {
                break main_loop;
            }
            if ((c > 0xFF) || TYPES[c] != XmlCharTypes.PUBID_OK) {
                throwUnexpectedChar(c, " in public identifier");
            }

            // White space?
Needs to be coalecsed if (c <= INT_SPACE) { addSpace = true; continue; } if (addSpace) { if (outPtr >= outputBuffer.length) { outputBuffer = _textBuilder.finishCurrentSegment(); outPtr = 0; } outputBuffer[outPtr++] = ' '; addSpace = false; } if (outPtr >= outputBuffer.length) { _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length); outPtr = 0; } outputBuffer[outPtr++] = c; } return new String(outputBuffer, 0, outPtr); } protected String parseSystemId(char quoteChar) throws XMLStreamException { char[] outputBuffer = _nameBuffer; int outPtr = 0; // attribute types are closest matches, so let's use them final int[] TYPES = sCharTypes.ATTR_CHARS; //boolean spaceToAdd = false; main_loop: while (true) { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char c = _inputBuffer[_inputPtr++]; if (TYPES[c] != 0) { switch (TYPES[c]) { case XmlCharTypes.CT_INVALID: c = handleInvalidXmlChar(c); case XmlCharTypes.CT_WS_CR: { if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } if (_inputBuffer[_inputPtr] == '\n') { ++_inputPtr; } markLF(); } c = '\n'; break; case XmlCharTypes.CT_WS_LF: markLF(); break; case XmlCharTypes.CT_ATTR_QUOTE: if (c == quoteChar) { break main_loop; } } } if (outPtr >= outputBuffer.length) { _nameBuffer = outputBuffer = DataUtil.growArrayBy(outputBuffer, outputBuffer.length); outPtr = 0; } outputBuffer[outPtr++] = c; } return new String(outputBuffer, 0, outPtr); } /* /********************************************************************** /* Other parsing helper methods /********************************************************************** */ /** * This method is called to verify that a surrogate * pair found describes a legal surrogate pair (ie. 
expands * to a legal XML char) */ private char checkSurrogate(char firstChar) throws XMLStreamException { if (firstChar >= 0xDC00) { reportInvalidFirstSurrogate(firstChar); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char sec = _inputBuffer[_inputPtr++]; if (sec < 0xDC00 || sec >= 0xE000) { reportInvalidSecondSurrogate(sec); } // And the composite, is it ok? int val = ((firstChar - 0xD800) << 10) + 0x10000; if (val > XmlConsts.MAX_UNICODE_CHAR) { reportInvalidXmlChar(val); } return sec; } private int checkSurrogateNameChar(char firstChar, char sec, int index) throws XMLStreamException { if (firstChar >= 0xDC00) { reportInvalidFirstSurrogate(firstChar); } if (sec < 0xDC00 || sec >= 0xE000) { reportInvalidSecondSurrogate(sec); } // And the composite, is it ok? int val = ((firstChar - 0xD800) << 10) + 0x10000; if (val > XmlConsts.MAX_UNICODE_CHAR) { reportInvalidXmlChar(val); } // !!! TODO: xml 1.1 vs 1.0 rules: none valid for 1.0, many for 1.1 if (true) { reportInvalidNameChar(val, index); } return val; } /** * This method is similar to <code>checkSurrogate</code>, but * returns the actual character code encoded by the surrogate * pair. This is needed if further validation rules (such as name * charactert checks) are to be done. */ private int decodeSurrogate(char firstChar) throws XMLStreamException { if (firstChar >= 0xDC00) { reportInvalidFirstSurrogate(firstChar); } if (_inputPtr >= _inputEnd) { loadMoreGuaranteed(); } char sec = _inputBuffer[_inputPtr++]; if (sec < 0xDC00 || sec >= 0xE000) { reportInvalidSecondSurrogate(sec); } // And the composite, is it ok? 
int val = ((firstChar - 0xD800) << 10) + 0x10000; if (val > XmlConsts.MAX_UNICODE_CHAR) { reportInvalidXmlChar(val); } return val; } private void reportInvalidFirstSurrogate(char ch) throws XMLStreamException { reportInputProblem("Invalid surrogate character (code 0x"+Integer.toHexString((int) ch)+"): can not start a surrogate pair"); } private void reportInvalidSecondSurrogate(char ch) throws XMLStreamException { reportInputProblem("Invalid surrogate character (code "+Integer.toHexString((int) ch)+"): is not legal as the second part of a surrogate pair"); } /* /********************************************************************** /* Location handling /********************************************************************** */ @Override public XMLStreamLocation2 getCurrentLocation() { return LocationImpl.fromZeroBased (_config.getPublicId(), _config.getSystemId(), _pastBytesOrChars + _inputPtr, _currRow, _inputPtr - _rowStartOffset); } @Override public int getCurrentColumnNr() { return _inputPtr - _rowStartOffset; } @Override public long getStartingByteOffset() { // N/A for this type return -1L; } @Override public long getStartingCharOffset() { return _startRawOffset; } @Override public long getEndingByteOffset() throws XMLStreamException { // N/A for this type return -1L; } @Override public long getEndingCharOffset() throws XMLStreamException { // Have to complete the token to know the ending location... 
if (_tokenIncomplete) { finishToken(); } return _pastBytesOrChars + _inputPtr; } protected final void markLF(int offset) { _rowStartOffset = offset; ++_currRow; } protected final void markLF() { _rowStartOffset = _inputPtr; ++_currRow; } protected final void setStartLocation() { _startRawOffset = _pastBytesOrChars + _inputPtr; _startRow = _currRow; _startColumn = _inputPtr - _rowStartOffset; } /* /********************************************************************** /* Input loading /********************************************************************** */ @Override protected final boolean loadMore() throws XMLStreamException { // If it's a block source, there's no Reader, or any more data: if (_in == null) { _inputEnd = 0; return false; } // Otherwise let's update offsets: _pastBytesOrChars += _inputEnd; _rowStartOffset -= _inputEnd; _inputPtr = 0; try { int count = _in.read(_inputBuffer, 0, _inputBuffer.length); if (count < 1) { _inputEnd = 0; if (count == 0) { /* Sanity check; should never happen with correctly written * InputStreams... 
*/ reportInputProblem("Reader returned 0 bytes, even when asked to read up to "+_inputBuffer.length); } return false; } _inputEnd = count; return true; } catch (IOException ioe) { throw new IoStreamException(ioe); } } protected final char loadOne() throws XMLStreamException { if (!loadMore()) { reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(_currToken)); } return _inputBuffer[_inputPtr++]; } protected final char loadOne(int type) throws XMLStreamException { if (!loadMore()) { reportInputProblem("Unexpected end-of-input when trying to parse "+ErrorConsts.tokenTypeDesc(type)); } return _inputBuffer[_inputPtr++]; } protected final boolean loadAndRetain(int nrOfChars) throws XMLStreamException { /* first: can't move, if we were handed an immutable block * (alternative to handing Reader as _in) */ if (_in == null) { return false; } // otherwise, need to use cut'n pasted code from loadMore()... _pastBytesOrChars += _inputPtr; _rowStartOffset -= _inputPtr; int remaining = (_inputEnd - _inputPtr); // must be > 0 System.arraycopy(_inputBuffer, _inputPtr, _inputBuffer, 0, remaining); _inputPtr = 0; _inputEnd = remaining; // temporarily set to cover copied stuff try { do { int max = _inputBuffer.length - _inputEnd; int count = _in.read(_inputBuffer, _inputEnd, max); if (count < 1) { if (count == 0) { // Sanity check, should never happen with non-buggy readers/stream reportInputProblem("Reader returned 0 bytes, even when asked to read up to "+max); } return false; } _inputEnd += count; } while (_inputEnd < nrOfChars); return true; } catch (IOException ioe) { throw new IoStreamException(ioe); } } }