/* * Copyright (c) 2007 Henri Sivonen * Copyright (c) 2008-2015 Mozilla Foundation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER * DEALINGS IN THE SOFTWARE. */ package nu.validator.htmlparser.impl; import java.io.IOException; import nu.validator.htmlparser.annotation.Auto; import nu.validator.htmlparser.annotation.Inline; import nu.validator.htmlparser.common.ByteReadable; import org.xml.sax.SAXException; public abstract class MetaScanner { /** * Constant for "charset". */ private static final char[] CHARSET = { 'h', 'a', 'r', 's', 'e', 't' }; /** * Constant for "content". */ private static final char[] CONTENT = { 'o', 'n', 't', 'e', 'n', 't' }; /** * Constant for "http-equiv". */ private static final char[] HTTP_EQUIV = { 't', 't', 'p', '-', 'e', 'q', 'u', 'i', 'v' }; /** * Constant for "content-type". */ private static final char[] CONTENT_TYPE = { 'c', 'o', 'n', 't', 'e', 'n', 't', '-', 't', 'y', 'p', 'e' }; private static final int NO = 0; private static final int M = 1; private static final int E = 2; private static final int T = 3; private static final int A = 4; private static final int DATA = 0; private static final int TAG_OPEN = 1; private static final int SCAN_UNTIL_GT = 2; private static final int TAG_NAME = 3; private static final int BEFORE_ATTRIBUTE_NAME = 4; private static final int ATTRIBUTE_NAME = 5; private static final int AFTER_ATTRIBUTE_NAME = 6; private static final int BEFORE_ATTRIBUTE_VALUE = 7; private static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 8; private static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 9; private static final int ATTRIBUTE_VALUE_UNQUOTED = 10; private static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 11; private static final int MARKUP_DECLARATION_OPEN = 13; private static final int MARKUP_DECLARATION_HYPHEN = 14; private static final int COMMENT_START = 15; private static final int COMMENT_START_DASH = 16; private static final int COMMENT = 17; private static final int COMMENT_END_DASH = 18; private static final int COMMENT_END = 19; private static final int SELF_CLOSING_START_TAG = 20; private static final int HTTP_EQUIV_NOT_SEEN = 0; private static final int HTTP_EQUIV_CONTENT_TYPE = 1; private static final int HTTP_EQUIV_OTHER = 2; /** * The data source. */ protected ByteReadable readable; /** * The state of the state machine that recognizes the tag name "meta". */ private int metaState = NO; /** * The current position in recognizing the attribute name "content". */ private int contentIndex = Integer.MAX_VALUE; /** * The current position in recognizing the attribute name "charset". */ private int charsetIndex = Integer.MAX_VALUE; /** * The current position in recognizing the attribute name "http-equive". */ private int httpEquivIndex = Integer.MAX_VALUE; /** * The current position in recognizing the attribute value "content-type". */ private int contentTypeIndex = Integer.MAX_VALUE; /** * The tokenizer state. */ protected int stateSave = DATA; /** * The currently filled length of strBuf. */ private int strBufLen; /** * Accumulation buffer for attribute values. */ private @Auto char[] strBuf; private String content; private String charset; private int httpEquivState; // CPPONLY: private TreeBuilder treeBuilder; public MetaScanner( // CPPONLY: TreeBuilder tb ) { this.readable = null; this.metaState = NO; this.contentIndex = Integer.MAX_VALUE; this.charsetIndex = Integer.MAX_VALUE; this.httpEquivIndex = Integer.MAX_VALUE; this.contentTypeIndex = Integer.MAX_VALUE; this.stateSave = DATA; this.strBufLen = 0; this.strBuf = new char[36]; this.content = null; this.charset = null; this.httpEquivState = HTTP_EQUIV_NOT_SEEN; // CPPONLY: this.treeBuilder = tb; } @SuppressWarnings("unused") private void destructor() { Portability.releaseString(content); Portability.releaseString(charset); } // [NOCPP[ /** * Reads a byte from the data source. * * -1 means end. * @return * @throws IOException */ protected int read() throws IOException { return readable.readByte(); } // ]NOCPP] // WARNING When editing this, makes sure the bytecode length shown by javap // stays under 8000 bytes! /** * The runs the meta scanning algorithm. */ protected final void stateLoop(int state) throws SAXException, IOException { int c = -1; boolean reconsume = false; stateloop: for (;;) { switch (state) { case DATA: dataloop: for (;;) { if (reconsume) { reconsume = false; } else { c = read(); } switch (c) { case -1: break stateloop; case '<': state = MetaScanner.TAG_OPEN; break dataloop; // FALL THROUGH continue // stateloop; default: continue; } } // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER case TAG_OPEN: tagopenloop: for (;;) { c = read(); switch (c) { case -1: break stateloop; case 'm': case 'M': metaState = M; state = MetaScanner.TAG_NAME; break tagopenloop; // continue stateloop; case '!': state = MetaScanner.MARKUP_DECLARATION_OPEN; continue stateloop; case '?': case '/': state = MetaScanner.SCAN_UNTIL_GT; continue stateloop; case '>': state = MetaScanner.DATA; continue stateloop; default: if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { metaState = NO; state = MetaScanner.TAG_NAME; break tagopenloop; // continue stateloop; } state = MetaScanner.DATA; reconsume = true; continue stateloop; } } // FALL THROUGH DON'T REORDER case TAG_NAME: tagnameloop: for (;;) { c = read(); switch (c) { case -1: break stateloop; case ' ': case '\t': case '\n': case '\u000C': state = MetaScanner.BEFORE_ATTRIBUTE_NAME; break tagnameloop; // continue stateloop; case '/': state = MetaScanner.SELF_CLOSING_START_TAG; continue stateloop; case '>': state = MetaScanner.DATA; continue stateloop; case 'e': case 'E': if (metaState == M) { metaState = E; } else { metaState = NO; } continue; case 't': case 'T': if (metaState == E) { metaState = T; } else { metaState = NO; } continue; case 'a': case 'A': if (metaState == T) { metaState = A; } else { metaState = NO; } continue; default: metaState = NO; continue; } } // FALLTHRU DON'T REORDER case BEFORE_ATTRIBUTE_NAME: beforeattributenameloop: for (;;) { if (reconsume) { reconsume = false; } else { c = read(); } /* * Consume the next input character: */ switch (c) { case -1: break stateloop; case ' ': case '\t': case '\n': case '\u000C': continue; case '/': state = MetaScanner.SELF_CLOSING_START_TAG; continue stateloop; case '>': if (handleTag()) { break stateloop; } state = DATA; continue stateloop; case 'c': case 'C': contentIndex = 0; charsetIndex = 0; httpEquivIndex = Integer.MAX_VALUE; contentTypeIndex = Integer.MAX_VALUE; state = MetaScanner.ATTRIBUTE_NAME; break beforeattributenameloop; case 'h': case 'H': contentIndex = Integer.MAX_VALUE; charsetIndex = Integer.MAX_VALUE; httpEquivIndex = 0; contentTypeIndex = Integer.MAX_VALUE; state = MetaScanner.ATTRIBUTE_NAME; break beforeattributenameloop; default: contentIndex = Integer.MAX_VALUE; charsetIndex = Integer.MAX_VALUE; httpEquivIndex = Integer.MAX_VALUE; contentTypeIndex = Integer.MAX_VALUE; state = MetaScanner.ATTRIBUTE_NAME; break beforeattributenameloop; // continue stateloop; } } // FALLTHRU DON'T REORDER case ATTRIBUTE_NAME: attributenameloop: for (;;) { c = read(); switch (c) { case -1: break stateloop; case ' ': case '\t': case '\n': case '\u000C': state = MetaScanner.AFTER_ATTRIBUTE_NAME; continue stateloop; case '/': state = MetaScanner.SELF_CLOSING_START_TAG; continue stateloop; case '=': strBufLen = 0; contentTypeIndex = 0; state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; break attributenameloop; // continue stateloop; case '>': if (handleTag()) { break stateloop; } state = MetaScanner.DATA; continue stateloop; default: if (metaState == A) { if (c >= 'A' && c <= 'Z') { c += 0x20; } if (contentIndex < CONTENT.length && c == CONTENT[contentIndex]) { ++contentIndex; } else { contentIndex = Integer.MAX_VALUE; } if (charsetIndex < CHARSET.length && c == CHARSET[charsetIndex]) { ++charsetIndex; } else { charsetIndex = Integer.MAX_VALUE; } if (httpEquivIndex < HTTP_EQUIV.length && c == HTTP_EQUIV[httpEquivIndex]) { ++httpEquivIndex; } else { httpEquivIndex = Integer.MAX_VALUE; } } continue; } } // FALLTHRU DON'T REORDER case BEFORE_ATTRIBUTE_VALUE: beforeattributevalueloop: for (;;) { c = read(); switch (c) { case -1: break stateloop; case ' ': case '\t': case '\n': case '\u000C': continue; case '"': state = MetaScanner.ATTRIBUTE_VALUE_DOUBLE_QUOTED; break beforeattributevalueloop; // continue stateloop; case '\'': state = MetaScanner.ATTRIBUTE_VALUE_SINGLE_QUOTED; continue stateloop; case '>': if (handleTag()) { break stateloop; } state = MetaScanner.DATA; continue stateloop; default: handleCharInAttributeValue(c); state = MetaScanner.ATTRIBUTE_VALUE_UNQUOTED; continue stateloop; } } // FALLTHRU DON'T REORDER case ATTRIBUTE_VALUE_DOUBLE_QUOTED: attributevaluedoublequotedloop: for (;;) { if (reconsume) { reconsume = false; } else { c = read(); } switch (c) { case -1: break stateloop; case '"': handleAttributeValue(); state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; break attributevaluedoublequotedloop; // continue stateloop; default: handleCharInAttributeValue(c); continue; } } // FALLTHRU DON'T REORDER case AFTER_ATTRIBUTE_VALUE_QUOTED: afterattributevaluequotedloop: for (;;) { c = read(); switch (c) { case -1: break stateloop; case ' ': case '\t': case '\n': case '\u000C': state = MetaScanner.BEFORE_ATTRIBUTE_NAME; continue stateloop; case '/': state = MetaScanner.SELF_CLOSING_START_TAG; break afterattributevaluequotedloop; // continue stateloop; case '>': if (handleTag()) { break stateloop; } state = MetaScanner.DATA; continue stateloop; default: state = MetaScanner.BEFORE_ATTRIBUTE_NAME; reconsume = true; continue stateloop; } } // FALLTHRU DON'T REORDER case SELF_CLOSING_START_TAG: c = read(); switch (c) { case -1: break stateloop; case '>': if (handleTag()) { break stateloop; } state = MetaScanner.DATA; continue stateloop; default: state = MetaScanner.BEFORE_ATTRIBUTE_NAME; reconsume = true; continue stateloop; } // XXX reorder point case ATTRIBUTE_VALUE_UNQUOTED: for (;;) { if (reconsume) { reconsume = false; } else { c = read(); } switch (c) { case -1: break stateloop; case ' ': case '\t': case '\n': case '\u000C': handleAttributeValue(); state = MetaScanner.BEFORE_ATTRIBUTE_NAME; continue stateloop; case '>': handleAttributeValue(); if (handleTag()) { break stateloop; } state = MetaScanner.DATA; continue stateloop; default: handleCharInAttributeValue(c); continue; } } // XXX reorder point case AFTER_ATTRIBUTE_NAME: for (;;) { c = read(); switch (c) { case -1: break stateloop; case ' ': case '\t': case '\n': case '\u000C': continue; case '/': handleAttributeValue(); state = MetaScanner.SELF_CLOSING_START_TAG; continue stateloop; case '=': strBufLen = 0; contentTypeIndex = 0; state = MetaScanner.BEFORE_ATTRIBUTE_VALUE; continue stateloop; case '>': handleAttributeValue(); if (handleTag()) { break stateloop; } state = MetaScanner.DATA; continue stateloop; case 'c': case 'C': contentIndex = 0; charsetIndex = 0; state = MetaScanner.ATTRIBUTE_NAME; continue stateloop; default: contentIndex = Integer.MAX_VALUE; charsetIndex = Integer.MAX_VALUE; state = MetaScanner.ATTRIBUTE_NAME; continue stateloop; } } // XXX reorder point case MARKUP_DECLARATION_OPEN: markupdeclarationopenloop: for (;;) { c = read(); switch (c) { case -1: break stateloop; case '-': state = MetaScanner.MARKUP_DECLARATION_HYPHEN; break markupdeclarationopenloop; // continue stateloop; default: state = MetaScanner.SCAN_UNTIL_GT; reconsume = true; continue stateloop; } } // FALLTHRU DON'T REORDER case MARKUP_DECLARATION_HYPHEN: markupdeclarationhyphenloop: for (;;) { c = read(); switch (c) { case -1: break stateloop; case '-': state = MetaScanner.COMMENT_START; break markupdeclarationhyphenloop; // continue stateloop; default: state = MetaScanner.SCAN_UNTIL_GT; reconsume = true; continue stateloop; } } // FALLTHRU DON'T REORDER case COMMENT_START: commentstartloop: for (;;) { c = read(); switch (c) { case -1: break stateloop; case '-': state = MetaScanner.COMMENT_START_DASH; continue stateloop; case '>': state = MetaScanner.DATA; continue stateloop; default: state = MetaScanner.COMMENT; break commentstartloop; // continue stateloop; } } // FALLTHRU DON'T REORDER case COMMENT: commentloop: for (;;) { c = read(); switch (c) { case -1: break stateloop; case '-': state = MetaScanner.COMMENT_END_DASH; break commentloop; // continue stateloop; default: continue; } } // FALLTHRU DON'T REORDER case COMMENT_END_DASH: commentenddashloop: for (;;) { c = read(); switch (c) { case -1: break stateloop; case '-': state = MetaScanner.COMMENT_END; break commentenddashloop; // continue stateloop; default: state = MetaScanner.COMMENT; continue stateloop; } } // FALLTHRU DON'T REORDER case COMMENT_END: for (;;) { c = read(); switch (c) { case -1: break stateloop; case '>': state = MetaScanner.DATA; continue stateloop; case '-': continue; default: state = MetaScanner.COMMENT; continue stateloop; } } // XXX reorder point case COMMENT_START_DASH: c = read(); switch (c) { case -1: break stateloop; case '-': state = MetaScanner.COMMENT_END; continue stateloop; case '>': state = MetaScanner.DATA; continue stateloop; default: state = MetaScanner.COMMENT; continue stateloop; } // XXX reorder point case ATTRIBUTE_VALUE_SINGLE_QUOTED: for (;;) { if (reconsume) { reconsume = false; } else { c = read(); } switch (c) { case -1: break stateloop; case '\'': handleAttributeValue(); state = MetaScanner.AFTER_ATTRIBUTE_VALUE_QUOTED; continue stateloop; default: handleCharInAttributeValue(c); continue; } } // XXX reorder point case SCAN_UNTIL_GT: for (;;) { if (reconsume) { reconsume = false; } else { c = read(); } switch (c) { case -1: break stateloop; case '>': state = MetaScanner.DATA; continue stateloop; default: continue; } } } } stateSave = state; } private void handleCharInAttributeValue(int c) { if (metaState == A) { if (contentIndex == CONTENT.length || charsetIndex == CHARSET.length) { addToBuffer(c); } else if (httpEquivIndex == HTTP_EQUIV.length) { if (contentTypeIndex < CONTENT_TYPE.length && toAsciiLowerCase(c) == CONTENT_TYPE[contentTypeIndex]) { ++contentTypeIndex; } else { contentTypeIndex = Integer.MAX_VALUE; } } } } @Inline private int toAsciiLowerCase(int c) { if (c >= 'A' && c <= 'Z') { return c + 0x20; } return c; } /** * Adds a character to the accumulation buffer. * @param c the character to add */ private void addToBuffer(int c) { if (strBufLen == strBuf.length) { char[] newBuf = new char[strBuf.length + (strBuf.length << 1)]; System.arraycopy(strBuf, 0, newBuf, 0, strBuf.length); strBuf = newBuf; } strBuf[strBufLen++] = (char)c; } /** * Attempts to extract a charset name from the accumulation buffer. * @return <code>true</code> if successful * @throws SAXException */ private void handleAttributeValue() throws SAXException { if (metaState != A) { return; } if (contentIndex == CONTENT.length && content == null) { content = Portability.newStringFromBuffer(strBuf, 0, strBufLen // CPPONLY: , treeBuilder ); return; } if (charsetIndex == CHARSET.length && charset == null) { charset = Portability.newStringFromBuffer(strBuf, 0, strBufLen // CPPONLY: , treeBuilder ); return; } if (httpEquivIndex == HTTP_EQUIV.length && httpEquivState == HTTP_EQUIV_NOT_SEEN) { httpEquivState = (contentTypeIndex == CONTENT_TYPE.length) ? HTTP_EQUIV_CONTENT_TYPE : HTTP_EQUIV_OTHER; return; } } private boolean handleTag() throws SAXException { boolean stop = handleTagInner(); Portability.releaseString(content); content = null; Portability.releaseString(charset); charset = null; httpEquivState = HTTP_EQUIV_NOT_SEEN; return stop; } private boolean handleTagInner() throws SAXException { if (charset != null && tryCharset(charset)) { return true; } if (content != null && httpEquivState == HTTP_EQUIV_CONTENT_TYPE) { String extract = TreeBuilder.extractCharsetFromContent(content // CPPONLY: , treeBuilder ); if (extract == null) { return false; } boolean success = tryCharset(extract); Portability.releaseString(extract); return success; } return false; } /** * Tries to switch to an encoding. * * @param encoding * @return <code>true</code> if successful * @throws SAXException */ protected abstract boolean tryCharset(String encoding) throws SAXException; }