/* Aalto XML processor * * Copyright (c) 2006- Tatu Saloranta, tatu.saloranta@iki.fi * * Licensed under the License specified in the file LICENSE which is * included with the source code. * You may not use this file except in compliance with the License. * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.fasterxml.aalto.in; import java.io.*; import java.text.MessageFormat; import javax.xml.stream.Location; import javax.xml.stream.XMLReporter; import javax.xml.stream.XMLStreamException; import com.fasterxml.aalto.impl.ErrorConsts; import com.fasterxml.aalto.impl.IoStreamException; import com.fasterxml.aalto.impl.LocationImpl; import com.fasterxml.aalto.util.CharsetNames; /** * Class that takes care of bootstrapping main document input from * a Stream input source. */ public final class CharSourceBootstrapper extends InputBootstrapper { /** * Whether to use a bigger (4000, ie. 8k) or smaller (2000 -> 4k) * buffer size? */ final static int DEFAULT_BUFFER_SIZE = 4000; final static char CHAR_BOM_MARKER = (char) 0xFEFF; /* /********************************************************************** /* Configuration /********************************************************************** */ /** * Underlying Reader to use for reading content. */ final Reader _in; /* /********************************************************************** /* Input buffering /********************************************************************** */ final char[] _inputBuffer; private int _inputPtr; /** * Offset of the first character after the end of valid buffer * contents. */ private int _inputLast; /* /////////////////////////////////////////////////////////////// // Life-cycle /////////////////////////////////////////////////////////////// */ private CharSourceBootstrapper(ReaderConfig cfg, Reader r) { super(cfg); _in = r; _inputBuffer = cfg.allocFullCBuffer(ReaderConfig.DEFAULT_CHAR_BUFFER_LEN); _inputLast = _inputPtr = 0; } private CharSourceBootstrapper(ReaderConfig cfg, char[] buffer, int start, int len) { super(cfg); _in = null; _inputBuffer = buffer; _inputPtr = start; _inputLast = start+len; } public static CharSourceBootstrapper construct(ReaderConfig cfg, Reader r) throws XMLStreamException { return new CharSourceBootstrapper(cfg, r); } public static CharSourceBootstrapper construct(ReaderConfig cfg, char[] buffer, int start, int len) throws XMLStreamException { return new CharSourceBootstrapper(cfg, buffer, start, len); } @Override public final XmlScanner bootstrap() throws XMLStreamException { try { return doBootstrap(); } catch (IOException ioe) { throw new IoStreamException(ioe); } finally { _config.freeSmallCBuffer(mKeyword); } } public XmlScanner doBootstrap() throws IOException, XMLStreamException { if (_inputPtr >= _inputLast) { initialLoad(7); } String normEnc = null; /* Only need 6 for signature ("<?xml\s"), but there may be a leading * BOM in there... and a valid xml declaration has to be longer * than 7 chars anyway (although, granted, shortest valid xml docl * is just 4 chars... "<a/>") */ if ((_inputLast - _inputPtr) >= 7) { char c = _inputBuffer[_inputPtr]; // BOM to skip? if (c == CHAR_BOM_MARKER) { c = _inputBuffer[++_inputPtr]; } if (c == '<') { if (_inputBuffer[_inputPtr+1] == '?' && _inputBuffer[_inputPtr+2] == 'x' && _inputBuffer[_inputPtr+3] == 'm' && _inputBuffer[_inputPtr+4] == 'l' && _inputBuffer[_inputPtr+5] <= 0x0020) { // Yup, got the declaration ok! _inputPtr += 6; // skip declaration readXmlDeclaration(); if (mFoundEncoding != null) { normEnc = verifyXmlEncoding(mFoundEncoding); } } } else { /* We may also get something that would be invalid xml * ("garbage" char; neither '<' nor space). If so, and * it's one of "well-known" cases, we can not only throw * an exception but also indicate a clue as to what is likely * to be wrong. */ /* Specifically, UTF-8 read via, say, ISO-8859-1 reader, can * "leak" marker (0xEF, 0xBB, 0xBF). While we could just eat * it, there's bound to be other problems cropping up, so let's * inform about the problem right away. */ if (c == 0xEF) { throw new IoStreamException("Unexpected first character (char code 0xEF), not valid in xml document: could be mangled UTF-8 BOM marker. Make sure that the Reader uses correct encoding or pass an InputStream instead"); } } } _config.setActualEncoding(normEnc); _config.setXmlDeclInfo(mDeclaredXmlVersion, mFoundEncoding, mStandalone); return new ReaderScanner(_config, _in, _inputBuffer, _inputPtr, _inputLast); } /* //////////////////////////////////////////////////// // Internal methods, main xml decl processing //////////////////////////////////////////////////// */ /** * @return Normalized encoding name */ protected String verifyXmlEncoding(String enc) throws XMLStreamException { enc = CharsetNames.normalize(enc); // Probably no point in comparing at all... is there? // But we can report a possible problem? String extEnc = _config.getExternalEncoding(); if (extEnc != null && enc != null && !extEnc.equalsIgnoreCase(enc)) { XMLReporter rep = _config.getXMLReporter(); if (rep != null) { Location loc = getLocation(); rep.report(MessageFormat.format(ErrorConsts.W_MIXED_ENCODINGS, new Object[] { extEnc, enc }), ErrorConsts.WT_XML_DECL, this, loc); } } return enc; } /* ///////////////////////////////////////////////////// // Internal methods, loading input data ///////////////////////////////////////////////////// */ protected boolean initialLoad(int minimum) throws IOException { _inputPtr = 0; _inputLast = 0; if (_in == null) { // for block sources return false; } while (_inputLast < minimum) { int count = _in.read(_inputBuffer, _inputLast, _inputBuffer.length - _inputLast); if (count < 1) { return false; } _inputLast += count; } return true; } protected void loadMore() throws IOException, XMLStreamException { /* Need to make sure offsets are properly updated for error * reporting purposes, and do this now while previous amounts * are still known. */ _inputProcessed += _inputLast; _inputRowStart -= _inputLast; if (_in == null) { // for block sources reportEof(); } _inputPtr = 0; _inputLast = _in.read(_inputBuffer, 0, _inputBuffer.length); if (_inputLast < 1) { reportEof(); } } /* ///////////////////////////////////////////////////// // Implementations of abstract parsing methods ///////////////////////////////////////////////////// */ @Override protected void pushback() { --_inputPtr; } @Override protected int getNext() throws IOException, XMLStreamException { return (_inputPtr < _inputLast) ? _inputBuffer[_inputPtr++] : nextChar(); } @Override protected int getNextAfterWs(boolean reqWs) throws IOException, XMLStreamException { int count = 0; while (true) { char c = (_inputPtr < _inputLast) ? _inputBuffer[_inputPtr++] : nextChar(); if (c > CHAR_SPACE) { if (reqWs && count == 0) { reportUnexpectedChar(c, ERR_XMLDECL_EXP_SPACE); } return c; } if (c == CHAR_CR || c == CHAR_LF) { skipCRLF(c); } else if (c == CHAR_NULL) { reportNull(); } ++count; } } /** * @return First character that does not match expected, if any; * CHAR_NULL if match succeeded */ @Override protected int checkKeyword(String exp) throws IOException, XMLStreamException { int len = exp.length(); for (int ptr = 1; ptr < len; ++ptr) { char c = (_inputPtr < _inputLast) ? _inputBuffer[_inputPtr++] : nextChar(); if (c != exp.charAt(ptr)) { return c; } if (c == CHAR_NULL) { reportNull(); } } return CHAR_NULL; } @Override protected int readQuotedValue(char[] kw, int quoteChar) throws IOException, XMLStreamException { int i = 0; int len = kw.length; while (true) { char c = (_inputPtr < _inputLast) ? _inputBuffer[_inputPtr++] : nextChar(); if (c == CHAR_CR || c == CHAR_LF) { skipCRLF(c); } else if (c == CHAR_NULL) { reportNull(); } if (c == quoteChar) { return (i < len) ? i : -1; } // Let's just truncate longer values, but match quote if (i < len) { kw[i++] = c; } } } @Override protected Location getLocation() { return LocationImpl.fromZeroBased (_config.getPublicId(), _config.getSystemId(), _inputProcessed + _inputPtr, _inputRow, _inputPtr - _inputRowStart); } /* /********************************************************************** /* Internal methods, single-byte access methods /********************************************************************** */ protected char nextChar() throws IOException, XMLStreamException { if (_inputPtr >= _inputLast) { loadMore(); } return _inputBuffer[_inputPtr++]; } protected void skipCRLF(char lf) throws IOException, XMLStreamException { if (lf == '\r') { char c = (_inputPtr < _inputLast) ? _inputBuffer[_inputPtr++] : nextChar(); if (c != '\n') { --_inputPtr; // pushback if not 2-char/byte lf } } ++_inputRow; _inputRowStart = _inputPtr; } }