/* * Copyright (c) 2005, 2006, 2007 Henri Sivonen * Copyright (c) 2007-2013 Mozilla Foundation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ package nu.validator.htmlparser.io; import java.io.IOException; import java.io.InputStream; import java.io.Reader; import java.nio.charset.UnsupportedCharsetException; import nu.validator.htmlparser.common.CharacterHandler; import nu.validator.htmlparser.common.EncodingDeclarationHandler; import nu.validator.htmlparser.common.Heuristics; import nu.validator.htmlparser.common.TransitionHandler; import nu.validator.htmlparser.common.XmlViolationPolicy; import nu.validator.htmlparser.extra.NormalizationChecker; import nu.validator.htmlparser.impl.ErrorReportingTokenizer; import nu.validator.htmlparser.impl.Tokenizer; import nu.validator.htmlparser.impl.UTF16Buffer; import nu.validator.htmlparser.rewindable.RewindableInputStream; import org.xml.sax.ErrorHandler; import org.xml.sax.InputSource; import org.xml.sax.Locator; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; public class Driver implements EncodingDeclarationHandler { /** * The input UTF-16 code unit stream. If a byte stream was given, this * object is an instance of <code>HtmlInputStreamReader</code>. */ private Reader reader; /** * The reference to the rewindable byte stream. <code>null</code> if * prohibited or no longer needed. */ private RewindableInputStream rewindableInputStream; private boolean swallowBom; private Encoding characterEncoding; private boolean allowRewinding = true; private Heuristics heuristics = Heuristics.NONE; private final Tokenizer tokenizer; private Confidence confidence; /** * Used for NFC checking if non-<code>null</code>, source code capture, * etc. */ private CharacterHandler[] characterHandlers = new CharacterHandler[0]; public Driver(Tokenizer tokenizer) { this.tokenizer = tokenizer; tokenizer.setEncodingDeclarationHandler(this); } /** * Returns the allowRewinding. * * @return the allowRewinding */ public boolean isAllowRewinding() { return allowRewinding; } /** * Sets the allowRewinding. * * @param allowRewinding * the allowRewinding to set */ public void setAllowRewinding(boolean allowRewinding) { this.allowRewinding = allowRewinding; } /** * Turns NFC checking on or off. * * @param enable * <code>true</code> if checking on */ public void setCheckingNormalization(boolean enable) { if (enable) { if (isCheckingNormalization()) { return; } else { NormalizationChecker normalizationChecker = new NormalizationChecker(tokenizer); normalizationChecker.setErrorHandler(tokenizer.getErrorHandler()); } } else { if (isCheckingNormalization()) { CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length - 1]; boolean skipped = false; int j = 0; for (int i = 0; i < characterHandlers.length; i++) { CharacterHandler ch = characterHandlers[i]; if (!(!skipped && (ch instanceof NormalizationChecker))) { newHandlers[j] = ch; j++; } } characterHandlers = newHandlers; } else { return; } } } public void addCharacterHandler(CharacterHandler characterHandler) { if (characterHandler == null) { throw new IllegalArgumentException("Null argument."); } CharacterHandler[] newHandlers = new CharacterHandler[characterHandlers.length + 1]; System.arraycopy(characterHandlers, 0, newHandlers, 0, characterHandlers.length); newHandlers[characterHandlers.length] = characterHandler; characterHandlers = newHandlers; } /** * Query if checking normalization. * * @return <code>true</code> if checking on */ public boolean isCheckingNormalization() { for (int i = 0; i < characterHandlers.length; i++) { CharacterHandler ch = characterHandlers[i]; if (ch instanceof NormalizationChecker) { return true; } } return false; } /** * Runs the tokenization. This is the main entry point. * * @param is * the input source * @throws SAXException * on fatal error (if configured to treat XML violations as * fatal) or if the token handler threw * @throws IOException * if the stream threw */ public void tokenize(InputSource is) throws SAXException, IOException { if (is == null) { throw new IllegalArgumentException("InputSource was null."); } tokenizer.start(); confidence = Confidence.TENTATIVE; swallowBom = true; rewindableInputStream = null; tokenizer.initLocation(is.getPublicId(), is.getSystemId()); this.reader = is.getCharacterStream(); this.characterEncoding = encodingFromExternalDeclaration(is.getEncoding()); if (this.reader == null) { InputStream inputStream = is.getByteStream(); if (inputStream == null) { throw new SAXException("Both streams in InputSource were null."); } if (this.characterEncoding == null) { if (allowRewinding) { inputStream = rewindableInputStream = new RewindableInputStream( inputStream); } this.reader = new HtmlInputStreamReader(inputStream, tokenizer.getErrorHandler(), tokenizer, this, heuristics); } else { if (this.characterEncoding != Encoding.UTF8) { warnWithoutLocation("Legacy encoding \u201C" + this.characterEncoding.getCanonName() + "\u201D used. Documents should use UTF-8."); } becomeConfident(); this.reader = new HtmlInputStreamReader(inputStream, tokenizer.getErrorHandler(), tokenizer, this, this.characterEncoding); } } else { becomeConfident(); } Throwable t = null; try { for (;;) { try { for (int i = 0; i < characterHandlers.length; i++) { CharacterHandler ch = characterHandlers[i]; ch.start(); } runStates(); break; } catch (ReparseException e) { if (rewindableInputStream == null) { tokenizer.fatal("Changing encoding at this point would need non-streamable behavior."); } else { rewindableInputStream.rewind(); becomeConfident(); this.reader = new HtmlInputStreamReader( rewindableInputStream, tokenizer.getErrorHandler(), tokenizer, this, this.characterEncoding); } continue; } } } catch (Throwable tr) { t = tr; } finally { try { tokenizer.end(); characterEncoding = null; for (int i = 0; i < characterHandlers.length; i++) { CharacterHandler ch = characterHandlers[i]; ch.end(); } reader.close(); reader = null; rewindableInputStream = null; } catch (Throwable tr) { if (t == null) { t = tr; } // else drop the later throwable } if (t != null) { if (t instanceof IOException) { throw (IOException) t; } else if (t instanceof SAXException) { throw (SAXException) t; } else if (t instanceof RuntimeException) { throw (RuntimeException) t; } else if (t instanceof Error) { throw (Error) t; } else { // impossible throw new RuntimeException(t); } } } } void dontSwallowBom() { swallowBom = false; } private void runStates() throws SAXException, IOException { char[] buffer = new char[2048]; UTF16Buffer bufr = new UTF16Buffer(buffer, 0, 0); boolean lastWasCR = false; int len = -1; if ((len = reader.read(buffer)) != -1) { assert len > 0; int streamOffset = 0; int offset = 0; int length = len; if (swallowBom) { if (buffer[0] == '\uFEFF') { streamOffset = -1; offset = 1; length--; } } if (length > 0) { for (int i = 0; i < characterHandlers.length; i++) { CharacterHandler ch = characterHandlers[i]; ch.characters(buffer, offset, length); } tokenizer.setTransitionBaseOffset(streamOffset); bufr.setStart(offset); bufr.setEnd(offset + length); while (bufr.hasMore()) { bufr.adjust(lastWasCR); lastWasCR = false; if (bufr.hasMore()) { lastWasCR = tokenizer.tokenizeBuffer(bufr); } } } streamOffset = length; while ((len = reader.read(buffer)) != -1) { assert len > 0; for (int i = 0; i < characterHandlers.length; i++) { CharacterHandler ch = characterHandlers[i]; ch.characters(buffer, 0, len); } tokenizer.setTransitionBaseOffset(streamOffset); bufr.setStart(0); bufr.setEnd(len); while (bufr.hasMore()) { bufr.adjust(lastWasCR); lastWasCR = false; if (bufr.hasMore()) { lastWasCR = tokenizer.tokenizeBuffer(bufr); } } streamOffset += len; } } tokenizer.eof(); } public void setEncoding(Encoding encoding, Confidence confidence) { this.characterEncoding = encoding; if (confidence == Confidence.CERTAIN) { becomeConfident(); } } public boolean internalEncodingDeclaration(String internalCharset) throws SAXException { try { internalCharset = Encoding.toAsciiLowerCase(internalCharset); Encoding cs; if ("utf-16".equals(internalCharset) || "utf-16be".equals(internalCharset) || "utf-16le".equals(internalCharset)) { tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C" + internalCharset + "\u201D which is not an ASCII superset. Continuing as if the encoding had been \u201Cutf-8\u201D."); cs = Encoding.UTF8; internalCharset = "utf-8"; } else { cs = Encoding.forName(internalCharset); } Encoding actual = cs.getActualHtmlEncoding(); if (actual == null) { actual = cs; } if (!actual.isAsciiSuperset()) { tokenizer.errTreeBuilder("Internal encoding declaration specified \u201C" + internalCharset + "\u201D which is not an ASCII superset. Not changing the encoding."); return false; } if (characterEncoding == null) { // Reader case return true; } if (characterEncoding == actual) { becomeConfident(); return true; } if (confidence == Confidence.CERTAIN && actual != characterEncoding) { tokenizer.errTreeBuilder("Internal encoding declaration \u201C" + internalCharset + "\u201D disagrees with the actual encoding of the document (\u201C" + characterEncoding.getCanonName() + "\u201D)."); } else { Encoding newEnc = whineAboutEncodingAndReturnActual( internalCharset, cs); tokenizer.errTreeBuilder("Changing character encoding \u201C" + internalCharset + "\u201D and reparsing."); characterEncoding = newEnc; throw new ReparseException(); } return true; } catch (UnsupportedCharsetException e) { tokenizer.errTreeBuilder("Internal encoding declaration named an unsupported chararacter encoding \u201C" + internalCharset + "\u201D."); return false; } } /** * */ private void becomeConfident() { if (rewindableInputStream != null) { rewindableInputStream.willNotRewind(); } confidence = Confidence.CERTAIN; tokenizer.becomeConfident(); } /** * Sets the encoding sniffing heuristics. * * @param heuristics * the heuristics to set */ public void setHeuristics(Heuristics heuristics) { this.heuristics = heuristics; } /** * Reports a warning without line/col * * @param message * the message * @throws SAXException */ protected void warnWithoutLocation(String message) throws SAXException { ErrorHandler errorHandler = tokenizer.getErrorHandler(); if (errorHandler == null) { return; } SAXParseException spe = new SAXParseException(message, null, tokenizer.getSystemId(), -1, -1); errorHandler.warning(spe); } /** * Initializes a decoder from external decl. */ protected Encoding encodingFromExternalDeclaration(String encoding) throws SAXException { if (encoding == null) { return null; } encoding = Encoding.toAsciiLowerCase(encoding); try { Encoding cs = Encoding.forName(encoding); if ("utf-16".equals(cs.getCanonName()) || "utf-32".equals(cs.getCanonName())) { swallowBom = false; } return whineAboutEncodingAndReturnActual(encoding, cs); } catch (UnsupportedCharsetException e) { tokenizer.err("Unsupported character encoding name: \u201C" + encoding + "\u201D. Will sniff."); swallowBom = true; } return null; // keep the compiler happy } /** * @param encoding * @param cs * @return * @throws SAXException */ protected Encoding whineAboutEncodingAndReturnActual(String encoding, Encoding cs) throws SAXException { String canonName = cs.getCanonName(); if (!cs.isRegistered()) { if (encoding.startsWith("x-")) { tokenizer.err("The encoding \u201C" + encoding + "\u201D is not an IANA-registered encoding. (Charmod C022)"); } else { tokenizer.err("The encoding \u201C" + encoding + "\u201D is not an IANA-registered encoding and did not use the \u201Cx-\u201D prefix. (Charmod C023)"); } } else if (!canonName.equals(encoding)) { tokenizer.err("The encoding \u201C" + encoding + "\u201D is not the preferred name of the character encoding in use. The preferred name is \u201C" + canonName + "\u201D. (Charmod C024)"); } if (cs.isShouldNot()) { tokenizer.warn("Authors should not use the character encoding \u201C" + encoding + "\u201D. It is recommended to use \u201CUTF-8\u201D."); } else if (cs.isLikelyEbcdic()) { tokenizer.warn("Authors should not use EBCDIC-based encodings. It is recommended to use \u201CUTF-8\u201D."); } else if (cs.isObscure()) { tokenizer.warn("The character encoding \u201C" + encoding + "\u201D is not widely supported. Better interoperability may be achieved by using \u201CUTF-8\u201D."); } Encoding actual = cs.getActualHtmlEncoding(); if (actual == null) { return cs; } else { tokenizer.warn("Using \u201C" + actual.getCanonName() + "\u201D instead of the declared encoding \u201C" + encoding + "\u201D."); return actual; } } private class ReparseException extends SAXException { } void notifyAboutMetaBoundary() { tokenizer.notifyAboutMetaBoundary(); } /** * @param commentPolicy * @see nu.validator.htmlparser.impl.Tokenizer#setCommentPolicy(nu.validator.htmlparser.common.XmlViolationPolicy) */ public void setCommentPolicy(XmlViolationPolicy commentPolicy) { tokenizer.setCommentPolicy(commentPolicy); } /** * @param contentNonXmlCharPolicy * @see nu.validator.htmlparser.impl.Tokenizer#setContentNonXmlCharPolicy(nu.validator.htmlparser.common.XmlViolationPolicy) */ public void setContentNonXmlCharPolicy( XmlViolationPolicy contentNonXmlCharPolicy) { tokenizer.setContentNonXmlCharPolicy(contentNonXmlCharPolicy); } /** * @param contentSpacePolicy * @see nu.validator.htmlparser.impl.Tokenizer#setContentSpacePolicy(nu.validator.htmlparser.common.XmlViolationPolicy) */ public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) { tokenizer.setContentSpacePolicy(contentSpacePolicy); } /** * @param eh * @see nu.validator.htmlparser.impl.Tokenizer#setErrorHandler(org.xml.sax.ErrorHandler) */ public void setErrorHandler(ErrorHandler eh) { tokenizer.setErrorHandler(eh); for (int i = 0; i < characterHandlers.length; i++) { CharacterHandler ch = characterHandlers[i]; if (ch instanceof NormalizationChecker) { NormalizationChecker nc = (NormalizationChecker) ch; nc.setErrorHandler(eh); } } } public void setTransitionHandler(TransitionHandler transitionHandler) { if (tokenizer instanceof ErrorReportingTokenizer) { ErrorReportingTokenizer ert = (ErrorReportingTokenizer) tokenizer; ert.setTransitionHandler(transitionHandler); } else if (transitionHandler != null) { throw new IllegalStateException("Attempt to set a transition handler on a plain tokenizer."); } } /** * @param html4ModeCompatibleWithXhtml1Schemata * @see nu.validator.htmlparser.impl.Tokenizer#setHtml4ModeCompatibleWithXhtml1Schemata(boolean) */ public void setHtml4ModeCompatibleWithXhtml1Schemata( boolean html4ModeCompatibleWithXhtml1Schemata) { tokenizer.setHtml4ModeCompatibleWithXhtml1Schemata(html4ModeCompatibleWithXhtml1Schemata); } /** * @param mappingLangToXmlLang * @see nu.validator.htmlparser.impl.Tokenizer#setMappingLangToXmlLang(boolean) */ public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) { tokenizer.setMappingLangToXmlLang(mappingLangToXmlLang); } /** * @param namePolicy * @see nu.validator.htmlparser.impl.Tokenizer#setNamePolicy(nu.validator.htmlparser.common.XmlViolationPolicy) */ public void setNamePolicy(XmlViolationPolicy namePolicy) { tokenizer.setNamePolicy(namePolicy); } /** * @param xmlnsPolicy * @see nu.validator.htmlparser.impl.Tokenizer#setXmlnsPolicy(nu.validator.htmlparser.common.XmlViolationPolicy) */ public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) { tokenizer.setXmlnsPolicy(xmlnsPolicy); } public String getCharacterEncoding() throws SAXException { return characterEncoding.getCanonName(); } public Locator getDocumentLocator() { return tokenizer; } }