/* * Copyright (c) 2009-2013 Mozilla Foundation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ package nu.validator.htmlparser.impl; import nu.validator.htmlparser.annotation.Inline; import nu.validator.htmlparser.annotation.NoLength; import nu.validator.htmlparser.common.TokenHandler; import nu.validator.htmlparser.common.TransitionHandler; import nu.validator.htmlparser.common.XmlViolationPolicy; import java.util.HashMap; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; public class ErrorReportingTokenizer extends Tokenizer { /** * Magic value for UTF-16 operations. */ private static final int SURROGATE_OFFSET = (0x10000 - (0xD800 << 10) - 0xDC00); /** * The policy for non-space non-XML characters. */ private XmlViolationPolicy contentNonXmlCharPolicy = XmlViolationPolicy.ALTER_INFOSET; /** * Keeps track of PUA warnings. */ private boolean alreadyWarnedAboutPrivateUseCharacters; /** * The current line number in the current resource being parsed. (First line * is 1.) Passed on as locator data. */ private int line; private int linePrev; /** * The current column number in the current resource being tokenized. (First * column is 1, counted by UTF-16 code units.) Passed on as locator data. */ private int col; private int colPrev; private boolean nextCharOnNewLine; private char prev; private HashMap<String, String> errorProfileMap = null; private TransitionHandler transitionHandler = null; private int transitionBaseOffset = 0; /** * @param tokenHandler * @param newAttributesEachTime */ public ErrorReportingTokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) { super(tokenHandler, newAttributesEachTime); } /** * @param tokenHandler */ public ErrorReportingTokenizer(TokenHandler tokenHandler) { super(tokenHandler); } /** * @see org.xml.sax.Locator#getLineNumber() */ public int getLineNumber() { if (line > 0) { return line; } else { return -1; } } /** * @see org.xml.sax.Locator#getColumnNumber() */ public int getColumnNumber() { if (col > 0) { return col; } else { return -1; } } /** * Sets the contentNonXmlCharPolicy. * * @param contentNonXmlCharPolicy * the contentNonXmlCharPolicy to set */ public void setContentNonXmlCharPolicy( XmlViolationPolicy contentNonXmlCharPolicy) { this.contentNonXmlCharPolicy = contentNonXmlCharPolicy; } /** * Sets the errorProfile. * * @param errorProfile */ public void setErrorProfile(HashMap<String, String> errorProfileMap) { this.errorProfileMap = errorProfileMap; } /** * Reports on an event based on profile selected. * * @param profile * the profile this message belongs to * @param message * the message itself * @throws SAXException */ public void note(String profile, String message) throws SAXException { if (errorProfileMap == null) return; String level = errorProfileMap.get(profile); if ("warn".equals(level)) { warn(message); } else if ("err".equals(level)) { err(message); // } else if ("info".equals(level)) { // info(message); } } protected void startErrorReporting() throws SAXException { line = linePrev = 0; col = colPrev = 1; nextCharOnNewLine = true; prev = '\u0000'; alreadyWarnedAboutPrivateUseCharacters = false; transitionBaseOffset = 0; } @Inline protected void silentCarriageReturn() { nextCharOnNewLine = true; lastCR = true; } @Inline protected void silentLineFeed() { nextCharOnNewLine = true; } /** * Returns the line. * * @return the line */ public int getLine() { return line; } /** * Returns the col. * * @return the col */ public int getCol() { return col; } /** * Returns the nextCharOnNewLine. * * @return the nextCharOnNewLine */ public boolean isNextCharOnNewLine() { return nextCharOnNewLine; } /** * Flushes coalesced character tokens. * * @param buf * TODO * @param pos * TODO * * @throws SAXException */ @Override protected void flushChars(char[] buf, int pos) throws SAXException { if (pos > cstart) { int currLine = line; int currCol = col; line = linePrev; col = colPrev; tokenHandler.characters(buf, cstart, pos - cstart); line = currLine; col = currCol; } cstart = 0x7fffffff; } @Override protected char checkChar(@NoLength char[] buf, int pos) throws SAXException { linePrev = line; colPrev = col; if (nextCharOnNewLine) { line++; col = 1; nextCharOnNewLine = false; } else { col++; } char c = buf[pos]; switch (c) { case '\u0000': err("Saw U+0000 in stream."); case '\t': case '\r': case '\n': break; case '\u000C': if (contentNonXmlCharPolicy == XmlViolationPolicy.FATAL) { fatal("This document is not mappable to XML 1.0 without data loss due to " + toUPlusString(c) + " which is not a legal XML 1.0 character."); } else { if (contentNonXmlCharPolicy == XmlViolationPolicy.ALTER_INFOSET) { c = buf[pos] = ' '; } warn("This document is not mappable to XML 1.0 without data loss due to " + toUPlusString(c) + " which is not a legal XML 1.0 character."); } break; default: if ((c & 0xFC00) == 0xDC00) { // Got a low surrogate. See if prev was high // surrogate if ((prev & 0xFC00) == 0xD800) { int intVal = (prev << 10) + c + SURROGATE_OFFSET; if ((intVal & 0xFFFE) == 0xFFFE) { err("Astral non-character."); } if (isAstralPrivateUse(intVal)) { warnAboutPrivateUseChar(); } } } else if ((c < ' ' || ((c & 0xFFFE) == 0xFFFE))) { switch (contentNonXmlCharPolicy) { case FATAL: fatal("Forbidden code point " + toUPlusString(c) + "."); break; case ALTER_INFOSET: c = buf[pos] = '\uFFFD'; // fall through case ALLOW: err("Forbidden code point " + toUPlusString(c) + "."); } } else if ((c >= '\u007F') && (c <= '\u009F') || (c >= '\uFDD0') && (c <= '\uFDEF')) { err("Forbidden code point " + toUPlusString(c) + "."); } else if (isPrivateUse(c)) { warnAboutPrivateUseChar(); } } prev = c; return c; } /** * @throws SAXException * @see nu.validator.htmlparser.impl.Tokenizer#transition(int, int, boolean, * int) */ @Override protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException { if (transitionHandler != null) { transitionHandler.transition(from, to, reconsume, transitionBaseOffset + pos); } return to; } private String toUPlusString(int c) { String hexString = Integer.toHexString(c); switch (hexString.length()) { case 1: return "U+000" + hexString; case 2: return "U+00" + hexString; case 3: return "U+0" + hexString; default: return "U+" + hexString; } } /** * Emits a warning about private use characters if the warning has not been * emitted yet. * * @throws SAXException */ private void warnAboutPrivateUseChar() throws SAXException { if (!alreadyWarnedAboutPrivateUseCharacters) { warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. (Charmod C073)"); alreadyWarnedAboutPrivateUseCharacters = true; } } /** * Tells if the argument is a BMP PUA character. * * @param c * the UTF-16 code unit to check * @return <code>true</code> if PUA character */ private boolean isPrivateUse(char c) { return c >= '\uE000' && c <= '\uF8FF'; } /** * Tells if the argument is an astral PUA character. * * @param c * the code point to check * @return <code>true</code> if astral private use */ private boolean isAstralPrivateUse(int c) { return (c >= 0xF0000 && c <= 0xFFFFD) || (c >= 0x100000 && c <= 0x10FFFD); } @Override protected void errGarbageAfterLtSlash() throws SAXException { err("Garbage after \u201C</\u201D."); } @Override protected void errLtSlashGt() throws SAXException { err("Saw \u201C</>\u201D. Probable causes: Unescaped \u201C<\u201D (escape as \u201C<\u201D) or mistyped end tag."); } @Override protected void errWarnLtSlashInRcdata() throws SAXException { if (html4) { err((stateSave == Tokenizer.DATA ? "CDATA" : "RCDATA") + " element \u201C" + endTagExpectation + "\u201D contained the string \u201C</\u201D, but it was not the start of the end tag. (HTML4-only error)"); } else { warn((stateSave == Tokenizer.DATA ? "CDATA" : "RCDATA") + " element \u201C" + endTagExpectation + "\u201D contained the string \u201C</\u201D, but this did not close the element."); } } @Override protected void errHtml4LtSlashInRcdata(char folded) throws SAXException { if (html4 && (index > 0 || (folded >= 'a' && folded <= 'z')) && ElementName.IFRAME != endTagExpectation) { err((stateSave == Tokenizer.DATA ? "CDATA" : "RCDATA") + " element \u201C" + endTagExpectation.name + "\u201D contained the string \u201C</\u201D, but it was not the start of the end tag. (HTML4-only error)"); } } @Override protected void errCharRefLacksSemicolon() throws SAXException { err("Character reference was not terminated by a semicolon."); } @Override protected void errNoDigitsInNCR() throws SAXException { err("No digits after \u201C" + strBufToString() + "\u201D."); } @Override protected void errGtInSystemId() throws SAXException { err("\u201C>\u201D in system identifier."); } @Override protected void errGtInPublicId() throws SAXException { err("\u201C>\u201D in public identifier."); } @Override protected void errNamelessDoctype() throws SAXException { err("Nameless doctype."); } @Override protected void errConsecutiveHyphens() throws SAXException { err("Consecutive hyphens did not terminate a comment. \u201C--\u201D is not permitted inside a comment, but e.g. \u201C- -\u201D is."); } @Override protected void errPrematureEndOfComment() throws SAXException { err("Premature end of comment. Use \u201C-->\u201D to end a comment properly."); } @Override protected void errBogusComment() throws SAXException { err("Bogus comment."); } @Override protected void errUnquotedAttributeValOrNull(char c) throws SAXException { switch (c) { case '<': err("\u201C<\u201D in an unquoted attribute value. Probable cause: Missing \u201C>\u201D immediately before."); return; case '`': err("\u201C`\u201D in an unquoted attribute value. Probable cause: Using the wrong character as a quote."); return; case '\uFFFD': return; default: err("\u201C" + c + "\u201D in an unquoted attribute value. Probable causes: Attributes running together or a URL query string in an unquoted attribute value."); return; } } @Override protected void errSlashNotFollowedByGt() throws SAXException { err("A slash was not immediately followed by \u201C>\u201D."); } @Override protected void errHtml4XmlVoidSyntax() throws SAXException { if (html4) { err("The \u201C/>\u201D syntax on void elements is not allowed. (This is an HTML4-only error.)"); } } @Override protected void errNoSpaceBetweenAttributes() throws SAXException { err("No space between attributes."); } @Override protected void errHtml4NonNameInUnquotedAttribute(char c) throws SAXException { if (html4 && !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '.' || c == '-' || c == '_' || c == ':')) { err("Non-name character in an unquoted attribute value. (This is an HTML4-only error.)"); } } @Override protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull( char c) throws SAXException { switch (c) { case '=': err("\u201C=\u201D at the start of an unquoted attribute value. Probable cause: Stray duplicate equals sign."); return; case '<': err("\u201C<\u201D at the start of an unquoted attribute value. Probable cause: Missing \u201C>\u201D immediately before."); return; case '`': err("\u201C`\u201D at the start of an unquoted attribute value. Probable cause: Using the wrong character as a quote."); return; } } @Override protected void errAttributeValueMissing() throws SAXException { err("Attribute value missing."); } @Override protected void errBadCharBeforeAttributeNameOrNull(char c) throws SAXException { if (c == '<') { err("Saw \u201C<\u201D when expecting an attribute name. Probable cause: Missing \u201C>\u201D immediately before."); } else if (c == '=') { errEqualsSignBeforeAttributeName(); } else if (c != '\uFFFD') { errQuoteBeforeAttributeName(c); } } @Override protected void errEqualsSignBeforeAttributeName() throws SAXException { err("Saw \u201C=\u201D when expecting an attribute name. Probable cause: Attribute name missing."); } @Override protected void errBadCharAfterLt(char c) throws SAXException { err("Bad character \u201C" + c + "\u201D after \u201C<\u201D. Probable cause: Unescaped \u201C<\u201D. Try escaping it as \u201C<\u201D."); } @Override protected void errLtGt() throws SAXException { err("Saw \u201C<>\u201D. Probable causes: Unescaped \u201C<\u201D (escape as \u201C<\u201D) or mistyped start tag."); } @Override protected void errProcessingInstruction() throws SAXException { err("Saw \u201C<?\u201D. Probable cause: Attempt to use an XML processing instruction in HTML. (XML processing instructions are not supported in HTML.)"); } @Override protected void errUnescapedAmpersandInterpretedAsCharacterReference() throws SAXException { if (errorHandler == null) { return; } SAXParseException spe = new SAXParseException( "The string following \u201C&\u201D was interpreted as a character reference. (\u201C&\u201D probably should have been escaped as \u201C&\u201D.)", ampersandLocation); errorHandler.error(spe); } @Override protected void errNotSemicolonTerminated() throws SAXException { err("Named character reference was not terminated by a semicolon. (Or \u201C&\u201D should have been escaped as \u201C&\u201D.)"); } @Override protected void errNoNamedCharacterMatch() throws SAXException { if (errorHandler == null) { return; } SAXParseException spe = new SAXParseException( "\u201C&\u201D did not start a character reference. (\u201C&\u201D probably should have been escaped as \u201C&\u201D.)", ampersandLocation); errorHandler.error(spe); } @Override protected void errQuoteBeforeAttributeName(char c) throws SAXException { err("Saw \u201C" + c + "\u201D when expecting an attribute name. Probable cause: \u201C=\u201D missing immediately before."); } @Override protected void errQuoteOrLtInAttributeNameOrNull(char c) throws SAXException { if (c == '<') { err("\u201C<\u201D in attribute name. Probable cause: \u201C>\u201D missing immediately before."); } else if (c != '\uFFFD') { err("Quote \u201C" + c + "\u201D in attribute name. Probable cause: Matching quote missing somewhere earlier."); } } @Override protected void errExpectedPublicId() throws SAXException { err("Expected a public identifier but the doctype ended."); } @Override protected void errBogusDoctype() throws SAXException { err("Bogus doctype."); } @Override protected void maybeWarnPrivateUseAstral() throws SAXException { if (errorHandler != null && isAstralPrivateUse(value)) { warnAboutPrivateUseChar(); } } @Override protected void maybeWarnPrivateUse(char ch) throws SAXException { if (errorHandler != null && isPrivateUse(ch)) { warnAboutPrivateUseChar(); } } @Override protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs) throws SAXException { if (attrs.getLength() != 0) { /* * When an end tag token is emitted with attributes, that is a parse * error. */ err("End tag had attributes."); } } @Override protected void maybeErrSlashInEndTag(boolean selfClosing) throws SAXException { if (selfClosing && endTag) { err("Stray \u201C/\u201D at the end of an end tag."); } } @Override protected char errNcrNonCharacter(char ch) throws SAXException { switch (contentNonXmlCharPolicy) { case FATAL: fatal("Character reference expands to a non-character (" + toUPlusString((char) value) + ")."); break; case ALTER_INFOSET: ch = '\uFFFD'; // fall through case ALLOW: err("Character reference expands to a non-character (" + toUPlusString((char) value) + ")."); } return ch; } /** * @see nu.validator.htmlparser.impl.Tokenizer#errAstralNonCharacter(int) */ @Override protected void errAstralNonCharacter(int ch) throws SAXException { err("Character reference expands to an astral non-character (" + toUPlusString(value) + ")."); } @Override protected void errNcrSurrogate() throws SAXException { err("Character reference expands to a surrogate."); } @Override protected char errNcrControlChar(char ch) throws SAXException { switch (contentNonXmlCharPolicy) { case FATAL: fatal("Character reference expands to a control character (" + toUPlusString((char) value) + ")."); break; case ALTER_INFOSET: ch = '\uFFFD'; // fall through case ALLOW: err("Character reference expands to a control character (" + toUPlusString((char) value) + ")."); } return ch; } @Override protected void errNcrCr() throws SAXException { err("A numeric character reference expanded to carriage return."); } @Override protected void errNcrInC1Range() throws SAXException { err("A numeric character reference expanded to the C1 controls range."); } @Override protected void errEofInPublicId() throws SAXException { err("End of file inside public identifier."); } @Override protected void errEofInComment() throws SAXException { err("End of file inside comment."); } @Override protected void errEofInDoctype() throws SAXException { err("End of file inside doctype."); } @Override protected void errEofInAttributeValue() throws SAXException { err("End of file reached when inside an attribute value. Ignoring tag."); } @Override protected void errEofInAttributeName() throws SAXException { err("End of file occurred in an attribute name. Ignoring tag."); } @Override protected void errEofWithoutGt() throws SAXException { err("Saw end of file without the previous tag ending with \u201C>\u201D. Ignoring tag."); } @Override protected void errEofInTagName() throws SAXException { err("End of file seen when looking for tag name. Ignoring tag."); } @Override protected void errEofInEndTag() throws SAXException { err("End of file inside end tag. Ignoring tag."); } @Override protected void errEofAfterLt() throws SAXException { err("End of file after \u201C<\u201D."); } @Override protected void errNcrOutOfRange() throws SAXException { err("Character reference outside the permissible Unicode range."); } @Override protected void errNcrUnassigned() throws SAXException { err("Character reference expands to a permanently unassigned code point."); } @Override protected void errDuplicateAttribute() throws SAXException { err("Duplicate attribute \u201C" + attributeName.getLocal(AttributeName.HTML) + "\u201D."); } @Override protected void errEofInSystemId() throws SAXException { err("End of file inside system identifier."); } @Override protected void errExpectedSystemId() throws SAXException { err("Expected a system identifier but the doctype ended."); } @Override protected void errMissingSpaceBeforeDoctypeName() throws SAXException { err("Missing space before doctype name."); } @Override protected void errHyphenHyphenBang() throws SAXException { err("\u201C--!\u201D found in comment."); } @Override protected void errNcrControlChar() throws SAXException { err("Character reference expands to a control character (" + toUPlusString((char) value) + ")."); } @Override protected void errNcrZero() throws SAXException { err("Character reference expands to zero."); } @Override protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote() throws SAXException { err("No space between the doctype \u201CSYSTEM\u201D keyword and the quote."); } @Override protected void errNoSpaceBetweenPublicAndSystemIds() throws SAXException { err("No space between the doctype public and system identifiers."); } @Override protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote() throws SAXException { err("No space between the doctype \u201CPUBLIC\u201D keyword and the quote."); } @Override protected void noteAttributeWithoutValue() throws SAXException { note("xhtml2", "Attribute without value"); } @Override protected void noteUnquotedAttributeValue() throws SAXException { note("xhtml1", "Unquoted attribute value."); } /** * Sets the transitionHandler. * * @param transitionHandler * the transitionHandler to set */ public void setTransitionHandler(TransitionHandler transitionHandler) { this.transitionHandler = transitionHandler; } /** * Sets an offset to be added to the position reported to * <code>TransitionHandler</code>. * * @param offset * the offset */ public void setTransitionBaseOffset(int offset) { this.transitionBaseOffset = offset; } }