/* * Java HTML Tidy - JTidy * HTML parser and pretty printer * * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts * Institute of Technology, Institut National de Recherche en * Informatique et en Automatique, Keio University). All Rights * Reserved. * * Contributing Author(s): * * Dave Raggett <dsr@w3.org> * Andy Quick <ac.quick@sympatico.ca> (translation to Java) * Gary L Peskin <garyp@firstech.com> (Java development) * Sami Lempinen <sami@lempinen.net> (release management) * Fabrizio Giustina <fgiust at users.sourceforge.net> * * The contributing author(s) would like to thank all those who * helped with testing, bug fixes, and patience. This wouldn't * have been possible without all of you. * * COPYRIGHT NOTICE: * * This software and documentation is provided "as is," and * the copyright holders and contributing author(s) make no * representations or warranties, express or implied, including * but not limited to, warranties of merchantability or fitness * for any particular purpose or that the use of the software or * documentation will not infringe any third party patents, * copyrights, trademarks or other rights. * * The copyright holders and contributing author(s) will not be * liable for any direct, indirect, special or consequential damages * arising out of any use of the software or documentation, even if * advised of the possibility of such damage. * * Permission is hereby granted to use, copy, modify, and distribute * this source code, or portions hereof, documentation and executables, * for any purpose, without fee, subject to the following restrictions: * * 1. The origin of this source code must not be misrepresented. * 2. Altered versions must be plainly marked as such and must * not be misrepresented as being the original source. * 3. This Copyright notice may not be removed or altered from any * source or altered source distribution. 
* * The copyright holders and contributing author(s) specifically * permit, without fee, and encourage the use of this source code * as a component for supporting the Hypertext Markup Language in * commercial products. If you use this source code in a product, * acknowledgment is not required but would be appreciated. * */ package org.w3c.tidy; /** * HTML Parser implementation. * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a> * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java) * @author Fabrizio Giustina * @version $Revision: 1100 $ ($Author: aditsu $) */ public final class ParserImpl { /** * parser for html. */ public static final Parser HTML = new ParseHTML(); /** * parser for head. */ public static final Parser HEAD = new ParseHead(); /** * parser for title. */ public static final Parser TITLE = new ParseTitle(); /** * parser for script. */ public static final Parser SCRIPT = new ParseScript(); /** * parser for body. */ public static final Parser BODY = new ParseBody(); /** * parser for frameset. */ public static final Parser FRAMESET = new ParseFrameSet(); /** * parser for inline. */ public static final Parser INLINE = new ParseInline(); /** * parser for list. */ public static final Parser LIST = new ParseList(); /** * parser for definition lists. */ public static final Parser DEFLIST = new ParseDefList(); /** * parser for pre. */ public static final Parser PRE = new ParsePre(); /** * parser for block elements. */ public static final Parser BLOCK = new ParseBlock(); /** * parser for table. */ public static final Parser TABLETAG = new ParseTableTag(); /** * parser for colgroup. */ public static final Parser COLGROUP = new ParseColGroup(); /** * parser for rowgroup. */ public static final Parser ROWGROUP = new ParseRowGroup(); /** * parser for row. */ public static final Parser ROW = new ParseRow(); /** * parser for noframes. 
*/ public static final Parser NOFRAMES = new ParseNoFrames(); /** * parser for select. */ public static final Parser SELECT = new ParseSelect(); /** * parser for text. */ public static final Parser TEXT = new ParseText(); /** * parser for empty elements. */ public static final Parser EMPTY = new ParseEmpty(); /** * parser for optgroup. */ public static final Parser OPTGROUP = new ParseOptGroup(); /** * ParserImpl should not be instantiated. */ private ParserImpl() { // unused } /** * @param lexer * @param node * @param mode */ protected static void parseTag(Lexer lexer, Node node, short mode) { // Fix by GLP 2000-12-21. Need to reset insertspace if this // is both a non-inline and empty tag (base, link, meta, isindex, hr, area). if ((node.tag.model & Dict.CM_EMPTY) != 0) { lexer.waswhite = false; } else if (!((node.tag.model & Dict.CM_INLINE) != 0)) { lexer.insertspace = false; } if (node.tag.getParser() == null) { return; } if (node.type == Node.START_END_TAG) { Node.trimEmptyElement(lexer, node); return; } node.tag.getParser().parse(lexer, node, mode); } /** * Move node to the head, where element is used as starting point in hunt for head. Normally called during parsing. * @param lexer * @param element * @param node */ protected static void moveToHead(Lexer lexer, Node element, Node node) { Node head; node.removeNode(); // make sure that node is isolated TagTable tt = lexer.configuration.tt; if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) { lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN); while (element.tag != tt.tagHtml) { element = element.parent; } for (head = element.content; head != null; head = head.next) { if (head.tag == tt.tagHead) { head.insertNodeAtEnd(node); break; } } if (node.tag.getParser() != null) { parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); } } else { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); } } /** * moves given node to end of body element. 
* @param lexer Lexer
     * @param node Node to insert
     */
    static void moveNodeToBody(Lexer lexer, Node node) {
        // detach first, then look the body up from the document root
        node.removeNode();
        Node body = lexer.root.findBody(lexer.configuration.tt);
        body.insertNodeAtEnd(node);
    }

    /**
     * Parser for HTML.
     * <p>
     * Finds or infers the head element and parses it, then decides for every following token whether it belongs to a
     * body, a frameset or a noframes element and hands over to the matching parser.
     */
    public static class ParseHTML implements Parser {

        /**
         * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
         */
        public void parse(Lexer lexer, Node html, short mode) {
            Node node, head;
            // first frameset element seen, and the noframes element used to collect non-frame content
            Node frameset = null;
            Node noframes = null;

            lexer.configuration.xmlTags = false;
            lexer.seenEndBody = false;
            TagTable tt = lexer.configuration.tt;

            // phase 1: locate the head element, inferring one when input ends or another token comes first
            while (true) {
                node = lexer.getToken(Lexer.IGNORE_WHITESPACE);

                if (node == null) {
                    node = lexer.inferredTag("head");
                    break;
                }

                if (node.tag == tt.tagHead) {
                    break;
                }

                if (node.tag == html.tag && node.type == Node.END_TAG) {
                    lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }

                // deal with comments etc.
                if (Node.insertMisc(html, node)) {
                    continue;
                }

                // anything else: push it back so it is read again after the inferred head
                lexer.ungetToken();
                node = lexer.inferredTag("head");
                break;
            }

            head = node;
            html.insertNodeAtEnd(head);
            HEAD.parse(lexer, head, mode);

            // phase 2: everything after the head
            while (true) {
                node = lexer.getToken(Lexer.IGNORE_WHITESPACE);

                if (node == null) {
                    if (frameset == null) {
                        // implied body
                        node = lexer.inferredTag("body");
                        html.insertNodeAtEnd(node);
                        BODY.parse(lexer, node, mode);
                    }

                    return;
                }

                // robustly handle html tags
                if (node.tag == html.tag) {
                    if (node.type != Node.START_TAG && frameset == null) {
                        lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                    } else if (node.type == Node.END_TAG) {
                        lexer.seenEndHtml = true;
                    }

                    continue;
                }

                // deal with comments etc.
                if (Node.insertMisc(html, node)) {
                    continue;
                }

                // if frameset document coerce <body> to <noframes>
                if (node.tag == tt.tagBody) {
                    if (node.type != Node.START_TAG) {
                        lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                        continue;
                    }

                    if (frameset != null) {
                        // push the body tag back so the noframes parser reads it
                        lexer.ungetToken();

                        if (noframes == null) {
                            noframes = lexer.inferredTag("noframes");
                            frameset.insertNodeAtEnd(noframes);
                            lexer.report.warning(lexer, html, noframes, Report.INSERTING_TAG);
                        }

                        parseTag(lexer, noframes, mode);
                        continue;
                    }

                    lexer.constrainVersion(~Dict.VERS_FRAMESET);
                    break; // to parse body
                }

                // flag an error if we see more than one frameset
                if (node.tag == tt.tagFrameset) {
                    if (node.type != Node.START_TAG) {
                        lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                        continue;
                    }

                    if (frameset != null) {
                        lexer.report.error(lexer, html, node, Report.DUPLICATE_FRAMESET);
                    } else {
                        frameset = node;
                    }

                    // a duplicate frameset is still inserted and parsed; only the first is remembered
                    html.insertNodeAtEnd(node);
                    parseTag(lexer, node, mode);

                    // see if it includes a noframes element so that we can merge subsequent noframes elements
                    for (node = frameset.content; node != null; node = node.next) {
                        if (node.tag == tt.tagNoframes) {
                            noframes = node;
                        }
                    }

                    continue;
                }

                // if not a frameset document coerce <noframes> to <body>
                if (node.tag == tt.tagNoframes) {
                    if (node.type != Node.START_TAG) {
                        lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                        continue;
                    }

                    if (frameset == null) {
                        lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                        node = lexer.inferredTag("body");
                        break;
                    }

                    if (noframes == null) {
                        noframes = node;
                        frameset.insertNodeAtEnd(noframes);
                    }

                    // later noframes elements are parsed into the first one
                    parseTag(lexer, noframes, mode);
                    continue;
                }

                if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) {
                    if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0) {
                        moveToHead(lexer, html, node);
                        continue;
                    }

                    // #427675 - discard illegal frame element following a frameset - fix by Randy Waki 11 Oct 00
                    if (frameset != null && node.tag == tt.tagFrame) {
                        lexer.report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                        continue;
                    }
                }

                // the token is content: push it back for whichever parser handles it below
                lexer.ungetToken();

                // insert other content into noframes element
                if (frameset != null) {
                    if (noframes == null) {
                        noframes = lexer.inferredTag("noframes");
                        frameset.insertNodeAtEnd(noframes);
                    } else {
                        lexer.report.warning(lexer, html, node, Report.NOFRAMES_CONTENT);
                    }

                    lexer.constrainVersion(Dict.VERS_FRAMESET);
                    parseTag(lexer, noframes, mode);
                    continue;
                }

                node = lexer.inferredTag("body");
                lexer.constrainVersion(~Dict.VERS_FRAMESET);
                break;
            }

            // node must be body
            html.insertNodeAtEnd(node);
            parseTag(lexer, node, mode);
            lexer.seenEndHtml = true;
        }
    }

    /**
     * Parser for HEAD.
     * <p>
     * Accepts elements whose content model includes <code>CM_HEAD</code>; the first text node or non-head element is
     * pushed back and ends the head.
     */
    public static class ParseHead implements Parser {

        /**
         * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
         */
        public void parse(Lexer lexer, Node head, short mode) {
            Node node;
            // number of title / base start tags seen so far, used to warn about duplicates
            int hasTitle = 0;
            int hasBase = 0;
            TagTable tt = lexer.configuration.tt;

            while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
                if (node.tag == head.tag && node.type == Node.END_TAG) {
                    head.closed = true;
                    break;
                }

                // text ends the head: warn, push the text back and stop
                if (node.type == Node.TEXT_NODE) {
                    lexer.report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN);
                    lexer.ungetToken();
                    break;
                }

                // deal with comments etc.
                if (Node.insertMisc(head, node)) {
                    continue;
                }

                if (node.type == Node.DOCTYPE_TAG) {
                    Node.insertDocType(lexer, head, node);
                    continue;
                }

                // discard unknown tags
                if (node.tag == null) {
                    lexer.report.warning(lexer, head, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }

                if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_HEAD)) {
                    // #545067 Implicit closing of head broken - warn only for XHTML input
                    if (lexer.isvoyager) {
                        lexer.report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN);
                    }
                    lexer.ungetToken();
                    break;
                }

                if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) {
                    if (node.tag == tt.tagTitle) {
                        ++hasTitle;

                        if (hasTitle > 1) {
                            lexer.report.warning(lexer, head, node, Report.TOO_MANY_ELEMENTS);
                        }
                    } else if (node.tag == tt.tagBase) {
                        ++hasBase;

                        if (hasBase > 1) {
                            lexer.report.warning(lexer, head, node, Report.TOO_MANY_ELEMENTS);
                        }
                    } else if (node.tag == tt.tagNoscript) {
                        lexer.report.warning(lexer, head, node, Report.TAG_NOT_ALLOWED_IN);
                    }

                    // duplicates and noscript are only warned about; the element is still inserted and parsed
                    head.insertNodeAtEnd(node);
                    parseTag(lexer, node, Lexer.IGNORE_WHITESPACE);
                    continue;
                }

                // discard unexpected text nodes and end tags
                lexer.report.warning(lexer, head, node, Report.DISCARDING_UNEXPECTED);
            }
        }
    }

    /**
     * Parser for TITLE.
*/ public static class ParseTitle implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node title, short mode) { Node node; while ((node = lexer.getToken(Lexer.MIXED_CONTENT)) != null) { // [438658] : Missing / in title endtag makes 2 titles if (node.tag == title.tag && node.type == Node.START_TAG) { lexer.report.warning(lexer, title, node, Report.COERCE_TO_ENDTAG); node.type = Node.END_TAG; continue; } else if (node.tag == title.tag && node.type == Node.END_TAG) { title.closed = true; Node.trimSpaces(lexer, title); return; } if (node.type == Node.TEXT_NODE) { // only called for 1st child if (title.content == null) { Node.trimInitialSpace(lexer, title, node); } if (node.start >= node.end) { continue; } title.insertNodeAtEnd(node); continue; } // deal with comments etc. if (Node.insertMisc(title, node)) { continue; } // discard unknown tags if (node.tag == null) { lexer.report.warning(lexer, title, node, Report.DISCARDING_UNEXPECTED); continue; } // pushback unexpected tokens lexer.report.warning(lexer, title, node, Report.MISSING_ENDTAG_BEFORE); lexer.ungetToken(); Node.trimSpaces(lexer, title); return; } lexer.report.warning(lexer, title, node, Report.MISSING_ENDTAG_FOR); } } /** * Parser for SCRIPT. */ public static class ParseScript implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node script, short mode) { Node node = lexer.getCDATA(script); if (node != null) { script.insertNodeAtEnd(node); } else { /* handle e.g. 
a document like "<script>" */
                lexer.report.warning(lexer, script, null, Report.MISSING_ENDTAG_FOR);
                return;
            }

            // the CDATA content must be followed by the matching end tag; anything else is pushed back
            node = lexer.getToken(Lexer.IGNORE_WHITESPACE);

            if (!(node != null && node.type == Node.END_TAG && node.tag != null
                && node.tag.name.equalsIgnoreCase(script.tag.name))) {
                lexer.report.warning(lexer, script, node, Report.MISSING_ENDTAG_FOR);

                if (node != null) {
                    lexer.ungetToken();
                }
            }
        }
    }

    /**
     * Parser for BODY.
     * <p>
     * Inserts text, inline and block content into the body, infers wrapper elements (ul, dl, table, form) for
     * children that may not appear directly in body, and keeps reading after the body end tag so that trailing
     * content can be reported.
     */
    public static class ParseBody implements Parser {

        /**
         * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
         */
        public void parse(Lexer lexer, Node body, short mode) {
            Node node;
            // checkstack: whether the next text/inline node should trigger lexer.inlineDup
            // iswhitenode: current token is a text node of at most one character that starts with a space
            boolean checkstack, iswhitenode;

            // the mode passed in is ignored; it is tracked locally from here on
            mode = Lexer.IGNORE_WHITESPACE;
            checkstack = true;
            TagTable tt = lexer.configuration.tt;

            Clean.bumpObject(lexer, body.parent);

            while ((node = lexer.getToken(mode)) != null) {
                // #538536 Extra endtags not detected
                if (node.tag == tt.tagHtml) {
                    if (node.type == Node.START_TAG || node.type == Node.START_END_TAG || lexer.seenEndHtml) {
                        lexer.report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
                    } else {
                        lexer.seenEndHtml = true;
                    }

                    continue;
                }

                // tags after </body> are reported but still processed below
                if (lexer.seenEndBody
                    && (node.type == Node.START_TAG || node.type == Node.END_TAG || node.type == Node.START_END_TAG)) {
                    lexer.report.warning(lexer, body, node, Report.CONTENT_AFTER_BODY);
                }

                if (node.tag == body.tag && node.type == Node.END_TAG) {
                    body.closed = true;
                    Node.trimSpaces(lexer, body);
                    lexer.seenEndBody = true;
                    mode = Lexer.IGNORE_WHITESPACE;

                    // a body inside noframes really ends here; otherwise keep reading trailing content
                    if (body.parent.tag == tt.tagNoframes) {
                        break;
                    }

                    continue;
                }

                if (node.tag == tt.tagNoframes) {
                    if (node.type == Node.START_TAG) {
                        body.insertNodeAtEnd(node);
                        BLOCK.parse(lexer, node, mode);
                        continue;
                    }

                    if (node.type == Node.END_TAG && body.parent.tag == tt.tagNoframes) {
                        Node.trimSpaces(lexer, body);
                        lexer.ungetToken();
                        break;
                    }
                }

                // frame/frameset inside a noframes body: push back and stop
                if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset) && body.parent.tag == tt.tagNoframes) {
                    Node.trimSpaces(lexer, body);
                    lexer.ungetToken();
                    break;
                }

                iswhitenode = false;

                if (node.type == Node.TEXT_NODE
                    && node.end <= node.start + 1
                    && node.textarray[node.start] == (byte) ' ') {
                    iswhitenode = true;
                }

                // deal with comments etc.
                if (Node.insertMisc(body, node)) {
                    continue;
                }

                // #538536 Extra endtags not detected
                // if (lexer.seenEndBody && !iswhitenode)
                // {
                // lexer.seenEndBody = true;
                // lexer.report.warning(lexer, body, node, Report.CONTENT_AFTER_BODY);
                // }

                // mixed content model permits text
                if (node.type == Node.TEXT_NODE) {
                    if (iswhitenode && mode == Lexer.IGNORE_WHITESPACE) {
                        continue;
                    }

                    if (lexer.configuration.encloseBodyText && !iswhitenode) {
                        Node para;

                        // push the text back so it becomes the content of the inferred paragraph
                        lexer.ungetToken();
                        para = lexer.inferredTag("p");
                        body.insertNodeAtEnd(para);
                        parseTag(lexer, para, mode);
                        mode = Lexer.MIXED_CONTENT;
                        continue;
                    }

                    // HTML2 and HTML4 strict doesn't allow text here
                    lexer.constrainVersion(~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20));

                    if (checkstack) {
                        checkstack = false;

                        if (lexer.inlineDup(node) > 0) {
                            continue;
                        }
                    }

                    body.insertNodeAtEnd(node);
                    mode = Lexer.MIXED_CONTENT;
                    continue;
                }

                if (node.type == Node.DOCTYPE_TAG) {
                    Node.insertDocType(lexer, body, node);
                    continue;
                }

                // discard unknown and PARAM tags
                if (node.tag == null || node.tag == tt.tagParam) {
                    lexer.report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }

                // Netscape allows LI and DD directly in BODY We infer UL or DL respectively and use this boolean to
                // exclude block-level elements so as to match Netscape's observed behaviour.
                lexer.excludeBlocks = false;

                // element is neither block nor inline (or is <input>): it cannot be a direct child of body
                if ((!((node.tag.model & Dict.CM_BLOCK) != 0) && !((node.tag.model & Dict.CM_INLINE) != 0))
                    || node.tag == tt.tagInput) {
                    // avoid this error message being issued twice
                    if (!((node.tag.model & Dict.CM_HEAD) != 0)) {
                        lexer.report.warning(lexer, body, node, Report.TAG_NOT_ALLOWED_IN);
                    }

                    if ((node.tag.model & Dict.CM_HTML) != 0) {
                        // copy body attributes if current body was inferred
                        if (node.tag == tt.tagBody && body.implicit && body.attributes == null) {
                            body.attributes = node.attributes;
                            node.attributes = null;
                        }

                        continue;
                    }

                    if ((node.tag.model & Dict.CM_HEAD) != 0) {
                        moveToHead(lexer, body, node);
                        continue;
                    }

                    // in each branch below the token is pushed back and replaced by an inferred container
                    if ((node.tag.model & Dict.CM_LIST) != 0) {
                        lexer.ungetToken();
                        node = lexer.inferredTag("ul");
                        node.addClass("noindent");
                        lexer.excludeBlocks = true;
                    } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
                        lexer.ungetToken();
                        node = lexer.inferredTag("dl");
                        lexer.excludeBlocks = true;
                    } else if ((node.tag.model & (Dict.CM_TABLE | Dict.CM_ROWGRP | Dict.CM_ROW)) != 0) {
                        // Issue 2855511
                        // no table is inferred for an end tag; the end tag falls through to the handling below
                        if (node.type != Node.END_TAG) {
                            lexer.ungetToken();
                            node = lexer.inferredTag("table");
                        }
                        lexer.excludeBlocks = true;
                    } else if (node.tag == tt.tagInput) {
                        lexer.ungetToken();
                        node = lexer.inferredTag("form");
                        lexer.excludeBlocks = true;
                    } else {
                        if (!((node.tag.model & (Dict.CM_ROW | Dict.CM_FIELD)) != 0)) {
                            lexer.ungetToken();
                            return;
                        }

                        // ignore </td></th> <option> etc.
                        continue;
                    }
                }

                if (node.type == Node.END_TAG) {
                    if (node.tag == tt.tagBr) {
                        // </br> becomes <br>
                        node.type = Node.START_TAG;
                    } else if (node.tag == tt.tagP) {
                        // </p> becomes <br>, followed by an inferred <br> that is inserted by the start-tag
                        // handling below
                        Node.coerceNode(lexer, node, tt.tagBr);
                        body.insertNodeAtEnd(node);
                        node = lexer.inferredTag("br");
                    } else if ((node.tag.model & Dict.CM_INLINE) != 0) {
                        lexer.popInline(node);
                    }
                }

                if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) {
                    if (((node.tag.model & Dict.CM_INLINE) != 0) && !((node.tag.model & Dict.CM_MIXED) != 0)) {
                        // HTML4 strict doesn't allow inline content here
                        // but HTML2 does allow img elements as children of body
                        if (node.tag == tt.tagImg) {
                            lexer.constrainVersion(~Dict.VERS_HTML40_STRICT);
                        } else {
                            lexer.constrainVersion(~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20));
                        }

                        if (checkstack && !node.implicit) {
                            checkstack = false;

                            if (lexer.inlineDup(node) > 0) {
                                continue;
                            }
                        }

                        mode = Lexer.MIXED_CONTENT;
                    } else {
                        checkstack = true;
                        mode = Lexer.IGNORE_WHITESPACE;
                    }

                    if (node.implicit) {
                        lexer.report.warning(lexer, body, node, Report.INSERTING_TAG);
                    }

                    body.insertNodeAtEnd(node);
                    parseTag(lexer, node, mode);
                    continue;
                }

                // discard unexpected tags
                lexer.report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
            }
        }
    }

    /**
     * Parser for FRAMESET.
     */
    public static class ParseFrameSet implements Parser {

        /**
         * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short)
         */
        public void parse(Lexer lexer, Node frameset, short mode) {
            Node node;
            TagTable tt = lexer.configuration.tt;

            // record that the document uses frames
            lexer.badAccess |= Report.USING_FRAMES;

            while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) {
                if (node.tag == frameset.tag && node.type == Node.END_TAG) {
                    frameset.closed = true;
                    Node.trimSpaces(lexer, frameset);
                    return;
                }

                // deal with comments etc.
if (Node.insertMisc(frameset, node)) { continue; } if (node.tag == null) { lexer.report.warning(lexer, frameset, node, Report.DISCARDING_UNEXPECTED); continue; } if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) { if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0) { moveToHead(lexer, frameset, node); continue; } } if (node.tag == tt.tagBody) { lexer.ungetToken(); node = lexer.inferredTag("noframes"); lexer.report.warning(lexer, frameset, node, Report.INSERTING_TAG); } if (node.type == Node.START_TAG && (node.tag.model & Dict.CM_FRAMES) != 0) { frameset.insertNodeAtEnd(node); lexer.excludeBlocks = false; parseTag(lexer, node, Lexer.MIXED_CONTENT); continue; } else if (node.type == Node.START_END_TAG && (node.tag.model & Dict.CM_FRAMES) != 0) { frameset.insertNodeAtEnd(node); continue; } // discard unexpected tags lexer.report.warning(lexer, frameset, node, Report.DISCARDING_UNEXPECTED); } lexer.report.warning(lexer, frameset, node, Report.MISSING_ENDTAG_FOR); } } /** * Parser for INLINE. */ public static class ParseInline implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node element, short mode) { Node node, parent; TagTable tt = lexer.configuration.tt; if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY)) { return; } // ParseInline is used for some block level elements like H1 to H6 For such elements we need to insert // inline emphasis tags currently on the inline stack. For Inline elements, we normally push them onto the // inline stack provided they aren't implicit or OBJECT/APPLET. 
This test is carried out in PushInline and // PopInline, see istack.c We don't push SPAN to replicate current browser behavior if (TidyUtils.toBoolean(element.tag.model & Dict.CM_BLOCK) || (element.tag == tt.tagDt)) { lexer.inlineDup(null); } else if (TidyUtils.toBoolean(element.tag.model & Dict.CM_INLINE) // EUNYEE: Add back this condition // because this causes the infinite loop problem when the span does not have the ending tag. && element.tag != tt.tagA && element.tag != tt.tagSpan) { // && element.tag != tt.tagSpan #540571 Inconsistent behaviour with span inline element lexer.pushInline(element); } if (element.tag == tt.tagNobr) { lexer.badLayout |= Report.USING_NOBR; } else if (element.tag == tt.tagFont) { lexer.badLayout |= Report.USING_FONT; } // Inline elements may or may not be within a preformatted element if (mode != Lexer.PREFORMATTED) { mode = Lexer.MIXED_CONTENT; } while ((node = lexer.getToken(mode)) != null) { // end tag for current element if (node.tag == element.tag && node.type == Node.END_TAG) { if (TidyUtils.toBoolean(element.tag.model & Dict.CM_INLINE)) { lexer.popInline(node); } if (!TidyUtils.toBoolean(mode & Lexer.PREFORMATTED)) { Node.trimSpaces(lexer, element); } // if a font element wraps an anchor and nothing else then move the font element inside the anchor // since otherwise it won't alter the anchor text color if (element.tag == tt.tagFont && element.content != null && element.content == element.last) { Node child = element.content; if (child.tag == tt.tagA) { child.parent = element.parent; child.next = element.next; child.prev = element.prev; if (child.prev != null) { child.prev.next = child; } else { child.parent.content = child; } if (child.next != null) { child.next.prev = child; } else { child.parent.last = child; } element.next = null; element.prev = null; element.parent = child; element.content = child.content; element.last = child.last; child.content = element; child.last = element; for (child = element.content; child != 
null; child = child.next) { child.parent = element; } } } element.closed = true; Node.trimSpaces(lexer, element); Node.trimEmptyElement(lexer, element); return; } // <u> ... <u> map 2nd <u> to </u> if 1st is explicit // otherwise emphasis nesting is probably unintentional // big and small have cumulative effect to leave them alone if (node.type == Node.START_TAG && node.tag == element.tag && lexer.isPushed(node) && !node.implicit && !element.implicit && node.tag != null && ((node.tag.model & Dict.CM_INLINE) != 0) && node.tag != tt.tagA && node.tag != tt.tagFont && node.tag != tt.tagBig && node.tag != tt.tagSmall && node.tag != tt.tagQ) { if (element.content != null && node.attributes == null) { lexer.report.warning(lexer, element, node, Report.COERCE_TO_ENDTAG); node.type = Node.END_TAG; lexer.ungetToken(); continue; } lexer.report.warning(lexer, element, node, Report.NESTED_EMPHASIS); } else if (lexer.isPushed(node) && node.type == Node.START_TAG && node.tag == tt.tagQ) { lexer.report.warning(lexer, element, node, Report.NESTED_QUOTATION); } if (node.type == Node.TEXT_NODE) { // only called for 1st child if (element.content == null && !TidyUtils.toBoolean(mode & Lexer.PREFORMATTED)) { Node.trimSpaces(lexer, element); } if (node.start >= node.end) { continue; } element.insertNodeAtEnd(node); continue; } // mixed content model so allow text if (Node.insertMisc(element, node)) { continue; } // deal with HTML tags if (node.tag == tt.tagHtml) { if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } // otherwise infer end of inline element lexer.ungetToken(); if (!((mode & Lexer.PREFORMATTED) != 0)) { Node.trimSpaces(lexer, element); } Node.trimEmptyElement(lexer, element); return; } // within <dt> or <pre> map <p> to <br> if (node.tag == tt.tagP && node.type == Node.START_TAG && ((mode & Lexer.PREFORMATTED) != 0 || element.tag == tt.tagDt || 
element.isDescendantOf(tt.tagDt))) { node.tag = tt.tagBr; node.element = "br"; Node.trimSpaces(lexer, element); element.insertNodeAtEnd(node); continue; } // ignore unknown and PARAM tags if (node.tag == null || node.tag == tt.tagParam) { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } if (node.tag == tt.tagBr && node.type == Node.END_TAG) { node.type = Node.START_TAG; } if (node.type == Node.END_TAG) { // coerce </br> to <br> if (node.tag == tt.tagBr) { node.type = Node.START_TAG; } else if (node.tag == tt.tagP) { // coerce unmatched </p> to <br><br> if (!element.isDescendantOf(tt.tagP)) { Node.coerceNode(lexer, node, tt.tagBr); Node.trimSpaces(lexer, element); element.insertNodeAtEnd(node); node = lexer.inferredTag("br"); continue; } } else if ((node.tag.model & Dict.CM_INLINE) != 0 && node.tag != tt.tagA && !((node.tag.model & Dict.CM_OBJECT) != 0) && (element.tag.model & Dict.CM_INLINE) != 0) { // allow any inline end tag to end current element lexer.popInline(element); if (element.tag != tt.tagA) { if (node.tag == tt.tagA && node.tag != element.tag) { lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); lexer.ungetToken(); } else { lexer.report.warning(lexer, element, node, Report.NON_MATCHING_ENDTAG); } if (!((mode & Lexer.PREFORMATTED) != 0)) { Node.trimSpaces(lexer, element); } Node.trimEmptyElement(lexer, element); return; } // if parent is <a> then discard unexpected inline end tag lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } // special case </tr> etc. 
for stuff moved in front of table else if (lexer.exiled && node.tag.model != 0 && (node.tag.model & Dict.CM_TABLE) != 0) { lexer.ungetToken(); Node.trimSpaces(lexer, element); Node.trimEmptyElement(lexer, element); return; } } // allow any header tag to end current header if ((node.tag.model & Dict.CM_HEADING) != 0 && (element.tag.model & Dict.CM_HEADING) != 0) { if (node.tag == element.tag) { lexer.report.warning(lexer, element, node, Report.NON_MATCHING_ENDTAG); } else { lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); lexer.ungetToken(); } if (!((mode & Lexer.PREFORMATTED) != 0)) { Node.trimSpaces(lexer, element); } Node.trimEmptyElement(lexer, element); return; } // an <A> tag to ends any open <A> element but <A href=...> is mapped to </A><A href=...> // #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 // if (node.tag == tt.tagA && !node.implicit && lexer.isPushed(node)) if (node.tag == tt.tagA && !node.implicit && (element.tag == tt.tagA || element.isDescendantOf(tt.tagA))) { // coerce <a> to </a> unless it has some attributes // #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 // other fixes by Dave Raggett // if (node.attributes == null) if (node.type != Node.END_TAG && node.attributes == null) { node.type = Node.END_TAG; lexer.report.warning(lexer, element, node, Report.COERCE_TO_ENDTAG); // lexer.popInline(node); lexer.ungetToken(); continue; } lexer.ungetToken(); lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); // lexer.popInline(element); if (!((mode & Lexer.PREFORMATTED) != 0)) { Node.trimSpaces(lexer, element); } Node.trimEmptyElement(lexer, element); return; } if ((element.tag.model & Dict.CM_HEADING) != 0) { if (node.tag == tt.tagCenter || node.tag == tt.tagDiv) { if (node.type != Node.START_TAG && node.type != Node.START_END_TAG) { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } lexer.report.warning(lexer, element, node, 
Report.TAG_NOT_ALLOWED_IN); // insert center as parent if heading is empty if (element.content == null) { Node.insertNodeAsParent(element, node); continue; } // split heading and make center parent of 2nd part element.insertNodeAfterElement(node); if (!((mode & Lexer.PREFORMATTED) != 0)) { Node.trimSpaces(lexer, element); } element = lexer.cloneNode(element); element.start = lexer.lexsize; element.end = lexer.lexsize; node.insertNodeAtEnd(element); continue; } if (node.tag == tt.tagHr) { if (node.type != Node.START_TAG && node.type != Node.START_END_TAG) { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN); // insert hr before heading if heading is empty if (element.content == null) { Node.insertNodeBeforeElement(element, node); continue; } // split heading and insert hr before 2nd part element.insertNodeAfterElement(node); if (!((mode & Lexer.PREFORMATTED) != 0)) { Node.trimSpaces(lexer, element); } element = lexer.cloneNode(element); element.start = lexer.lexsize; element.end = lexer.lexsize; node.insertNodeAfterElement(element); continue; } } if (element.tag == tt.tagDt) { if (node.tag == tt.tagHr) { Node dd; if (node.type != Node.START_TAG && node.type != Node.START_END_TAG) { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } lexer.report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN); dd = lexer.inferredTag("dd"); // insert hr within dd before dt if dt is empty if (element.content == null) { Node.insertNodeBeforeElement(element, dd); dd.insertNodeAtEnd(node); continue; } // split dt and insert hr within dd before 2nd part element.insertNodeAfterElement(dd); dd.insertNodeAtEnd(node); if (!((mode & Lexer.PREFORMATTED) != 0)) { Node.trimSpaces(lexer, element); } element = lexer.cloneNode(element); element.start = lexer.lexsize; element.end = lexer.lexsize; dd.insertNodeAfterElement(element); continue; } } // if 
this is the end tag for an ancestor element then infer end tag for this element if (node.type == Node.END_TAG) { for (parent = element.parent; parent != null; parent = parent.parent) { if (node.tag == parent.tag) { if (!((element.tag.model & Dict.CM_OPT) != 0) && !element.implicit) { lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); } if (element.tag == tt.tagA) { lexer.popInline(element); } lexer.ungetToken(); if (!((mode & Lexer.PREFORMATTED) != 0)) { Node.trimSpaces(lexer, element); } Node.trimEmptyElement(lexer, element); return; } } } // block level tags end this element if (!((node.tag.model & Dict.CM_INLINE) != 0)) { if (node.type != Node.START_TAG) { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } if (!((element.tag.model & Dict.CM_OPT) != 0)) { lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); } if ((node.tag.model & Dict.CM_HEAD) != 0 && !((node.tag.model & Dict.CM_BLOCK) != 0)) { moveToHead(lexer, element, node); continue; } // prevent anchors from propagating into block tags except for headings h1 to h6 if (element.tag == tt.tagA) { if (node.tag != null && !((node.tag.model & Dict.CM_HEADING) != 0)) { lexer.popInline(element); } else if (!(element.content != null)) { Node.discardElement(element); lexer.ungetToken(); return; } } lexer.ungetToken(); if (!((mode & Lexer.PREFORMATTED) != 0)) { Node.trimSpaces(lexer, element); } Node.trimEmptyElement(lexer, element); return; } // parse inline element if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) { if (node.implicit) { lexer.report.warning(lexer, element, node, Report.INSERTING_TAG); } // trim white space before <br> if (node.tag == tt.tagBr) { Node.trimSpaces(lexer, element); } element.insertNodeAtEnd(node); parseTag(lexer, node, mode); continue; } // discard unexpected tags lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } if (!((element.tag.model & Dict.CM_OPT) != 
0)) { lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_FOR); } Node.trimEmptyElement(lexer, element); } } /** * Parser for LIST. */ public static class ParseList implements Parser { public void parse(Lexer lexer, Node list, short mode) { Node node; Node parent; TagTable tt = lexer.configuration.tt; if ((list.tag.model & Dict.CM_EMPTY) != 0) { return; } lexer.insert = -1; // defer implicit inline start tags while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) { if (node.tag == list.tag && node.type == Node.END_TAG) { if ((list.tag.model & Dict.CM_OBSOLETE) != 0) { Node.coerceNode(lexer, list, tt.tagUl); } list.closed = true; Node.trimEmptyElement(lexer, list); return; } // deal with comments etc. if (Node.insertMisc(list, node)) { continue; } if (node.type != Node.TEXT_NODE && node.tag == null) { lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); continue; } // if this is the end tag for an ancestor element then infer end tag for this element if (node.type == Node.END_TAG) { if (node.tag == tt.tagForm) { badForm(lexer); lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); continue; } if (node.tag != null && (node.tag.model & Dict.CM_INLINE) != 0) { lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); lexer.popInline(node); continue; } for (parent = list.parent; parent != null; parent = parent.parent) { if (node.tag == parent.tag) { lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE); lexer.ungetToken(); if ((list.tag.model & Dict.CM_OBSOLETE) != 0) { Node.coerceNode(lexer, list, tt.tagUl); } Node.trimEmptyElement(lexer, list); return; } } lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); continue; } if (node.tag != tt.tagLi) { lexer.ungetToken(); if (node.tag != null && (node.tag.model & Dict.CM_BLOCK) != 0 && lexer.excludeBlocks) { lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE); Node.trimEmptyElement(lexer, list); return; 
} node = lexer.inferredTag("li"); node.addAttribute("style", "list-style: none"); lexer.report.warning(lexer, list, node, Report.MISSING_STARTTAG); } // node should be <LI> list.insertNodeAtEnd(node); parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); } if ((list.tag.model & Dict.CM_OBSOLETE) != 0) { Node.coerceNode(lexer, list, tt.tagUl); } lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_FOR); Node.trimEmptyElement(lexer, list); } } /** * Parser for empty elements. */ public static class ParseEmpty implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node element, short mode) { if (lexer.isvoyager) { Node node = lexer.getToken(mode); if (node != null && !(node.type == Node.END_TAG && node.tag == element.tag)) { lexer.report.warning(lexer, element, node, Report.ELEMENT_NOT_EMPTY); lexer.ungetToken(); } } } } /** * Parser for DEFLIST. */ public static class ParseDefList implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node list, short mode) { Node node, parent; TagTable tt = lexer.configuration.tt; if ((list.tag.model & Dict.CM_EMPTY) != 0) { return; } lexer.insert = -1; // defer implicit inline start tags while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) { if (node.tag == list.tag && node.type == Node.END_TAG) { list.closed = true; Node.trimEmptyElement(lexer, list); return; } // deal with comments etc. 
if (Node.insertMisc(list, node)) { continue; } if (node.type == Node.TEXT_NODE) { lexer.ungetToken(); node = lexer.inferredTag("dt"); lexer.report.warning(lexer, list, node, Report.MISSING_STARTTAG); } if (node.tag == null) { lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); continue; } // if this is the end tag for an ancestor element then infer end tag for this element if (node.type == Node.END_TAG) { if (node.tag == tt.tagForm) { badForm(lexer); lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); continue; } for (parent = list.parent; parent != null; parent = parent.parent) { if (node.tag == parent.tag) { lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_BEFORE); lexer.ungetToken(); Node.trimEmptyElement(lexer, list); return; } } } // center in a dt or a dl breaks the dl list in two if (node.tag == tt.tagCenter) { if (list.content != null) { list.insertNodeAfterElement(node); } else { // trim empty dl list Node.insertNodeBeforeElement(list, node); // #540296 tidy dumps with empty definition list Node.discardElement(list); } // and parse contents of center parseTag(lexer, node, mode); // now create a new dl element list = lexer.inferredTag("dl"); node.insertNodeAfterElement(list); continue; } if (!(node.tag == tt.tagDt || node.tag == tt.tagDd)) { lexer.ungetToken(); if (!((node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)) { lexer.report.warning(lexer, list, node, Report.TAG_NOT_ALLOWED_IN); Node.trimEmptyElement(lexer, list); return; } // if DD appeared directly in BODY then exclude blocks if (!((node.tag.model & Dict.CM_INLINE) != 0) && lexer.excludeBlocks) { Node.trimEmptyElement(lexer, list); return; } node = lexer.inferredTag("dd"); lexer.report.warning(lexer, list, node, Report.MISSING_STARTTAG); } if (node.type == Node.END_TAG) { lexer.report.warning(lexer, list, node, Report.DISCARDING_UNEXPECTED); continue; } // node should be <DT> or <DD> list.insertNodeAtEnd(node); parseTag(lexer, node, 
Lexer.IGNORE_WHITESPACE); } lexer.report.warning(lexer, list, node, Report.MISSING_ENDTAG_FOR); Node.trimEmptyElement(lexer, list); } } /** * Parser for PRE. */ public static class ParsePre implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node pre, short mode) { Node node; TagTable tt = lexer.configuration.tt; if ((pre.tag.model & Dict.CM_EMPTY) != 0) { return; } if ((pre.tag.model & Dict.CM_OBSOLETE) != 0) { Node.coerceNode(lexer, pre, tt.tagPre); } lexer.inlineDup(null); // tell lexer to insert inlines if needed while ((node = lexer.getToken(Lexer.PREFORMATTED)) != null) { if (node.tag == pre.tag && node.type == Node.END_TAG) { Node.trimSpaces(lexer, pre); pre.closed = true; Node.trimEmptyElement(lexer, pre); return; } if (node.tag == tt.tagHtml) { if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) { lexer.report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED); } continue; } if (node.type == Node.TEXT_NODE) { // if first check for inital newline if (pre.content == null) { if (node.textarray[node.start] == (byte) '\n') { ++node.start; } if (node.start >= node.end) { continue; } } pre.insertNodeAtEnd(node); continue; } // deal with comments etc. 
if (Node.insertMisc(pre, node)) { continue; } // strip unexpected tags if (!lexer.preContent(node)) { Node newnode; lexer.report.warning(lexer, pre, node, Report.UNESCAPED_ELEMENT); newnode = Node.escapeTag(lexer, node); pre.insertNodeAtEnd(newnode); continue; } if (node.tag == tt.tagP) { if (node.type == Node.START_TAG) { lexer.report.warning(lexer, pre, node, Report.USING_BR_INPLACE_OF); // trim white space before <p> in <pre> Node.trimSpaces(lexer, pre); // coerce both <p> and </p> to <br> Node.coerceNode(lexer, node, tt.tagBr); pre.insertNodeAtEnd(node); } else { lexer.report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED); } continue; } if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) { // trim white space before <br> if (node.tag == tt.tagBr) { Node.trimSpaces(lexer, pre); } pre.insertNodeAtEnd(node); parseTag(lexer, node, Lexer.PREFORMATTED); continue; } // discard unexpected tags lexer.report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED); } lexer.report.warning(lexer, pre, node, Report.MISSING_ENDTAG_FOR); Node.trimEmptyElement(lexer, pre); } } /** * Parser for block elements. */ public static class ParseBlock implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node element, short mode) { // element is node created by the lexer upon seeing the start tag, or by the parser when the start tag is // inferred. Node node, parent; boolean checkstack; int istackbase = 0; TagTable tt = lexer.configuration.tt; checkstack = true; if ((element.tag.model & Dict.CM_EMPTY) != 0) { return; } if (element.tag == tt.tagForm && element.isDescendantOf(tt.tagForm)) { lexer.report.warning(lexer, element, null, Report.ILLEGAL_NESTING); } // InlineDup() asks the lexer to insert inline emphasis tags currently pushed on the istack, but take care // to avoid propagating inline emphasis inside OBJECT or APPLET. 
For these elements a fresh inline stack // context is created and disposed of upon reaching the end of the element. They thus behave like table // cells in this respect. if ((element.tag.model & Dict.CM_OBJECT) != 0) { istackbase = lexer.istackbase; lexer.istackbase = lexer.istack.size(); } if (!((element.tag.model & Dict.CM_MIXED) != 0)) { lexer.inlineDup(null); } mode = Lexer.IGNORE_WHITESPACE; while ((node = lexer.getToken(mode)) != null) { // end tag for this element if (node.type == Node.END_TAG && node.tag != null && (node.tag == element.tag || element.was == node.tag)) { if ((element.tag.model & Dict.CM_OBJECT) != 0) { // pop inline stack while (lexer.istack.size() > lexer.istackbase) { lexer.popInline(null); } lexer.istackbase = istackbase; } element.closed = true; Node.trimSpaces(lexer, element); Node.trimEmptyElement(lexer, element); return; } if (node.tag == tt.tagHtml || node.tag == tt.tagHead || node.tag == tt.tagBody) { if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); } continue; } if (node.type == Node.END_TAG) { if (node.tag == null) { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } else if (node.tag == tt.tagBr) { node.type = Node.START_TAG; } else if (node.tag == tt.tagP) { Node.coerceNode(lexer, node, tt.tagBr); element.insertNodeAtEnd(node); node = lexer.inferredTag("br"); } else { // if this is the end tag for an ancestor element then infer end tag for this element for (parent = element.parent; parent != null; parent = parent.parent) { if (node.tag == parent.tag) { if (!((element.tag.model & Dict.CM_OPT) != 0)) { lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); } lexer.ungetToken(); if ((element.tag.model & Dict.CM_OBJECT) != 0) { // pop inline stack while (lexer.istack.size() > lexer.istackbase) { lexer.popInline(null); } lexer.istackbase = istackbase; } Node.trimSpaces(lexer, element); 
Node.trimEmptyElement(lexer, element); return; } } // special case </tr> etc. for stuff moved in front of table if (lexer.exiled && node.tag.model != 0 && (node.tag.model & Dict.CM_TABLE) != 0) { lexer.ungetToken(); Node.trimSpaces(lexer, element); Node.trimEmptyElement(lexer, element); return; } } } // mixed content model permits text if (node.type == Node.TEXT_NODE) { boolean iswhitenode = false; if (node.type == Node.TEXT_NODE && node.end <= node.start + 1 && lexer.lexbuf[node.start] == (byte) ' ') { iswhitenode = true; } if (lexer.configuration.encloseBlockText && !iswhitenode) { lexer.ungetToken(); node = lexer.inferredTag("p"); element.insertNodeAtEnd(node); parseTag(lexer, node, Lexer.MIXED_CONTENT); continue; } if (checkstack) { checkstack = false; if (!((element.tag.model & Dict.CM_MIXED) != 0)) { if (lexer.inlineDup(node) > 0) { continue; } } } element.insertNodeAtEnd(node); mode = Lexer.MIXED_CONTENT; // HTML4 strict doesn't allow mixed content for elements with %block; as their content model // But only body, map, blockquote, form and noscript have content model %block; if (element.tag == tt.tagBody || element.tag == tt.tagMap || element.tag == tt.tagBlockquote || element.tag == tt.tagForm || element.tag == tt.tagNoscript) { lexer.constrainVersion(~Dict.VERS_HTML40_STRICT); } continue; } if (Node.insertMisc(element, node)) { continue; } // allow PARAM elements? if (node.tag == tt.tagParam) { if (((element.tag.model & Dict.CM_PARAM) != 0) && (node.type == Node.START_TAG || node.type == Node.START_END_TAG)) { element.insertNodeAtEnd(node); continue; } // otherwise discard it lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } // allow AREA elements? 
if (node.tag == tt.tagArea) { if ((element.tag == tt.tagMap) && (node.type == Node.START_TAG || node.type == Node.START_END_TAG)) { element.insertNodeAtEnd(node); continue; } // otherwise discard it lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } // ignore unknown start/end tags if (node.tag == null) { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } // Allow Dict.CM_INLINE elements here. Allow Dict.CM_BLOCK elements here unless lexer.excludeBlocks is // yes. LI and DD are special cased. Otherwise infer end tag for this element. if (!((node.tag.model & Dict.CM_INLINE) != 0)) { if (node.type != Node.START_TAG && node.type != Node.START_END_TAG) { if (node.tag == tt.tagForm) { badForm(lexer); } lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } // #427671 - Fix by Randy Waki - 10 Aug 00 // If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION start tag, discard the start // tag and let the subsequent content get parsed as content of the enclosing LI. This seems to // mimic IE and Netscape, and avoids an infinite loop: without this check, ParseBlock (which is // parsing the LI's content) and ParseList (which is parsing the LI's parent's content) repeatedly // defer to each other to parse the illegal start tag, each time inferring a missing </li> or <li> // respectively. NOTE: This check is a bit fragile. It specifically checks for the four tags that // happen to weave their way through the current series of tests performed by ParseBlock and // ParseList to trigger the infinite loop. 
if (element.tag == tt.tagLi) { if (node.tag == tt.tagFrame || node.tag == tt.tagFrameset || node.tag == tt.tagOptgroup || node.tag == tt.tagOption) { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } } if (element.tag == tt.tagTd || element.tag == tt.tagTh) { // if parent is a table cell, avoid inferring the end of the cell if ((node.tag.model & Dict.CM_HEAD) != 0) { moveToHead(lexer, element, node); continue; } if ((node.tag.model & Dict.CM_LIST) != 0) { lexer.ungetToken(); node = lexer.inferredTag("ul"); node.addClass("noindent"); lexer.excludeBlocks = true; } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) { lexer.ungetToken(); node = lexer.inferredTag("dl"); lexer.excludeBlocks = true; } // infer end of current table cell if (!((node.tag.model & Dict.CM_BLOCK) != 0)) { lexer.ungetToken(); Node.trimSpaces(lexer, element); Node.trimEmptyElement(lexer, element); return; } } else if ((node.tag.model & Dict.CM_BLOCK) != 0) { if (lexer.excludeBlocks) { if (!((element.tag.model & Dict.CM_OPT) != 0)) { lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); } lexer.ungetToken(); if ((element.tag.model & Dict.CM_OBJECT) != 0) { lexer.istackbase = istackbase; } Node.trimSpaces(lexer, element); Node.trimEmptyElement(lexer, element); return; } } else { // things like list items if ((node.tag.model & Dict.CM_HEAD) != 0) { moveToHead(lexer, element, node); continue; } // special case where a form start tag occurs in a tr and is followed by td or th if (element.tag == tt.tagForm && element.parent.tag == tt.tagTd && element.parent.implicit) { if (node.tag == tt.tagTd) { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } if (node.tag == tt.tagTh) { lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); node = element.parent; node.element = "th"; node.tag = tt.tagTh; continue; } } if (!((element.tag.model & Dict.CM_OPT) != 0) && !element.implicit) { 
lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE); } lexer.ungetToken(); if ((node.tag.model & Dict.CM_LIST) != 0) { if (element.parent != null && element.parent.tag != null && element.parent.tag.getParser() == LIST) { Node.trimSpaces(lexer, element); Node.trimEmptyElement(lexer, element); return; } node = lexer.inferredTag("ul"); node.addClass("noindent"); } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) { if (element.parent.tag == tt.tagDl) { Node.trimSpaces(lexer, element); Node.trimEmptyElement(lexer, element); return; } node = lexer.inferredTag("dl"); } else if ((node.tag.model & Dict.CM_TABLE) != 0 || (node.tag.model & Dict.CM_ROW) != 0) { node = lexer.inferredTag("table"); } else if ((element.tag.model & Dict.CM_OBJECT) != 0) { // pop inline stack while (lexer.istack.size() > lexer.istackbase) { lexer.popInline(null); } lexer.istackbase = istackbase; Node.trimSpaces(lexer, element); Node.trimEmptyElement(lexer, element); return; } else { Node.trimSpaces(lexer, element); Node.trimEmptyElement(lexer, element); return; } } } // parse known element if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) { if (TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE)) { // DSR - 27Apr02 ensure we wrap anchors and other inline content // fgiust: commented out due to [1403105]: java.lang.StackOverflowError in Tidy.parseDOM() // if (lexer.configuration.encloseBlockText) // { // lexer.ungetToken(); // node = lexer.inferredTag("p"); // element.insertNodeAtEnd(node); // parseTag(lexer, node, Lexer.MIXED_CONTENT); // continue; // } if (checkstack && !node.implicit) { checkstack = false; // #431731 - fix by Randy Waki 25 Dec 00 if (!TidyUtils.toBoolean(element.tag.model & Dict.CM_MIXED)) { if (lexer.inlineDup(node) > 0) { continue; } } } mode = Lexer.MIXED_CONTENT; } else { checkstack = true; mode = Lexer.IGNORE_WHITESPACE; } // trim white space before <br> if (node.tag == tt.tagBr) { Node.trimSpaces(lexer, element); } 
element.insertNodeAtEnd(node); if (node.implicit) { lexer.report.warning(lexer, element, node, Report.INSERTING_TAG); } parseTag(lexer, node, Lexer.IGNORE_WHITESPACE // Lexer.MixedContent ); continue; } // discard unexpected tags if (node.type == Node.END_TAG) { lexer.popInline(node); // if inline end tag } lexer.report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED); continue; } if (!((element.tag.model & Dict.CM_OPT) != 0)) { lexer.report.warning(lexer, element, node, Report.MISSING_ENDTAG_FOR); } if ((element.tag.model & Dict.CM_OBJECT) != 0) { // pop inline stack while (lexer.istack.size() > lexer.istackbase) { lexer.popInline(null); } lexer.istackbase = istackbase; } Node.trimSpaces(lexer, element); Node.trimEmptyElement(lexer, element); } } /** * Parser for TABLE. */ public static class ParseTableTag implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node table, short mode) { Node node, parent; int istackbase; TagTable tt = lexer.configuration.tt; lexer.deferDup(); istackbase = lexer.istackbase; lexer.istackbase = lexer.istack.size(); while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) { if (node.tag == table.tag && node.type == Node.END_TAG) { lexer.istackbase = istackbase; table.closed = true; Node.trimEmptyElement(lexer, table); return; } // deal with comments etc. 
if (Node.insertMisc(table, node)) { continue; } // discard unknown tags if (node.tag == null && node.type != Node.TEXT_NODE) { lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED); continue; } // if TD or TH or text or inline or block then infer <TR> if (node.type != Node.END_TAG) { if (node.tag == tt.tagTd || node.tag == tt.tagTh || node.tag == tt.tagTable) { lexer.ungetToken(); node = lexer.inferredTag("tr"); lexer.report.warning(lexer, table, node, Report.MISSING_STARTTAG); } else if (node.type == Node.TEXT_NODE || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) { Node.insertNodeBeforeElement(table, node); lexer.report.warning(lexer, table, node, Report.TAG_NOT_ALLOWED_IN); lexer.exiled = true; if (!(node.type == Node.TEXT_NODE)) // #427662 - was (!node.type == TextNode) - fix by Young { parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); } lexer.exiled = false; continue; } else if ((node.tag.model & Dict.CM_HEAD) != 0) { moveToHead(lexer, table, node); continue; } } // if this is the end tag for an ancestor element then infer end tag for this element if (node.type == Node.END_TAG) { if (node.tag == tt.tagForm) { badForm(lexer); lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED); continue; } if ((node.tag != null && (node.tag.model & (Dict.CM_TABLE | Dict.CM_ROW)) != 0) || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)) { lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED); continue; } for (parent = table.parent; parent != null; parent = parent.parent) { if (node.tag == parent.tag) { lexer.report.warning(lexer, table, node, Report.MISSING_ENDTAG_BEFORE); lexer.ungetToken(); lexer.istackbase = istackbase; Node.trimEmptyElement(lexer, table); return; } } } if (!((node.tag.model & Dict.CM_TABLE) != 0)) { lexer.ungetToken(); lexer.report.warning(lexer, table, node, Report.TAG_NOT_ALLOWED_IN); lexer.istackbase = istackbase; Node.trimEmptyElement(lexer, table); return; } if 
(node.type == Node.START_TAG || node.type == Node.START_END_TAG) { table.insertNodeAtEnd(node); parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); continue; } // discard unexpected text nodes and end tags lexer.report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED); } lexer.report.warning(lexer, table, node, Report.MISSING_ENDTAG_FOR); Node.trimEmptyElement(lexer, table); lexer.istackbase = istackbase; } } /** * Parser for COLGROUP. */ public static class ParseColGroup implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node colgroup, short mode) { Node node, parent; TagTable tt = lexer.configuration.tt; if ((colgroup.tag.model & Dict.CM_EMPTY) != 0) { return; } while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) { if (node.tag == colgroup.tag && node.type == Node.END_TAG) { colgroup.closed = true; return; } // if this is the end tag for an ancestor element then infer end tag for this element if (node.type == Node.END_TAG) { if (node.tag == tt.tagForm) { badForm(lexer); lexer.report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED); continue; } for (parent = colgroup.parent; parent != null; parent = parent.parent) { if (node.tag == parent.tag) { lexer.ungetToken(); return; } } } if (node.type == Node.TEXT_NODE) { lexer.ungetToken(); return; } // deal with comments etc. if (Node.insertMisc(colgroup, node)) { continue; } // discard unknown tags if (node.tag == null) { lexer.report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED); continue; } if (node.tag != tt.tagCol) { lexer.ungetToken(); return; } if (node.type == Node.END_TAG) { lexer.report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED); continue; } // node should be <COL> colgroup.insertNodeAtEnd(node); parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); } } } /** * Parser for ROWGROUP. 
*/ public static class ParseRowGroup implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node rowgroup, short mode) { Node node, parent; TagTable tt = lexer.configuration.tt; if ((rowgroup.tag.model & Dict.CM_EMPTY) != 0) { return; } while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) { if (node.tag == rowgroup.tag) { if (node.type == Node.END_TAG) { rowgroup.closed = true; Node.trimEmptyElement(lexer, rowgroup); return; } lexer.ungetToken(); return; } // if </table> infer end tag if (node.tag == tt.tagTable && node.type == Node.END_TAG) { lexer.ungetToken(); Node.trimEmptyElement(lexer, rowgroup); return; } // deal with comments etc. if (Node.insertMisc(rowgroup, node)) { continue; } // discard unknown tags if (node.tag == null && node.type != Node.TEXT_NODE) { lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED); continue; } // if TD or TH then infer <TR> if text or inline or block move before table if head content move to // head if (node.type != Node.END_TAG) { if (node.tag == tt.tagTd || node.tag == tt.tagTh) { lexer.ungetToken(); node = lexer.inferredTag("tr"); lexer.report.warning(lexer, rowgroup, node, Report.MISSING_STARTTAG); } else if (node.type == Node.TEXT_NODE || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) { Node.moveBeforeTable(rowgroup, node, tt); lexer.report.warning(lexer, rowgroup, node, Report.TAG_NOT_ALLOWED_IN); lexer.exiled = true; // #427662 was (!node.type == TextNode) fix by Young 04 Aug 00 if (node.type != Node.TEXT_NODE) { parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); } lexer.exiled = false; continue; } else if ((node.tag.model & Dict.CM_HEAD) != 0) { lexer.report.warning(lexer, rowgroup, node, Report.TAG_NOT_ALLOWED_IN); moveToHead(lexer, rowgroup, node); continue; } } // if this is the end tag for ancestor element then infer end tag for this element if (node.type == Node.END_TAG) { if (node.tag == 
tt.tagForm || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)) { if (node.tag == tt.tagForm) { badForm(lexer); } lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED); continue; } if (node.tag == tt.tagTr || node.tag == tt.tagTd || node.tag == tt.tagTh) { lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED); continue; } for (parent = rowgroup.parent; parent != null; parent = parent.parent) { if (node.tag == parent.tag) { lexer.ungetToken(); Node.trimEmptyElement(lexer, rowgroup); return; } } } // if THEAD, TFOOT or TBODY then implied end tag if ((node.tag.model & Dict.CM_ROWGRP) != 0) { if (node.type != Node.END_TAG) { lexer.ungetToken(); } Node.trimEmptyElement(lexer, rowgroup); return; } if (node.type == Node.END_TAG) { lexer.report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED); continue; } if (!(node.tag == tt.tagTr)) { node = lexer.inferredTag("tr"); lexer.report.warning(lexer, rowgroup, node, Report.MISSING_STARTTAG); lexer.ungetToken(); } // node should be <TR> rowgroup.insertNodeAtEnd(node); parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); } Node.trimEmptyElement(lexer, rowgroup); } } /** * Parser for ROW. 
*/ public static class ParseRow implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node row, short mode) { Node node, parent; boolean excludeState; TagTable tt = lexer.configuration.tt; if ((row.tag.model & Dict.CM_EMPTY) != 0) { return; } while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) { if (node.tag == row.tag) { if (node.type == Node.END_TAG) { row.closed = true; Node.fixEmptyRow(lexer, row); return; } lexer.ungetToken(); Node.fixEmptyRow(lexer, row); return; } // if this is the end tag for an ancestor element then infer end tag for this element if (node.type == Node.END_TAG) { if (((node.tag != null && (node.tag.model & (Dict.CM_HTML | Dict.CM_TABLE)) != 0) || node.tag == tt.tagTable) && row.isDescendantOf(node.tag)) { lexer.ungetToken(); return; } if (node.tag == tt.tagForm || (node.tag != null && (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0)) { if (node.tag == tt.tagForm) { badForm(lexer); } lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED); continue; } if (node.tag == tt.tagTd || node.tag == tt.tagTh) { lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED); continue; } for (parent = row.parent; parent != null; parent = parent.parent) { if (node.tag == parent.tag) { lexer.ungetToken(); Node.trimEmptyElement(lexer, row); return; } } } // deal with comments etc. 
if (Node.insertMisc(row, node)) { continue; } // discard unknown tags if (node.tag == null && node.type != Node.TEXT_NODE) { lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED); continue; } // discard unexpected <table> element if (node.tag == tt.tagTable) { lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED); continue; } // THEAD, TFOOT or TBODY if (node.tag != null && (node.tag.model & Dict.CM_ROWGRP) != 0) { lexer.ungetToken(); Node.trimEmptyElement(lexer, row); return; } if (node.type == Node.END_TAG) { lexer.report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED); continue; } // if text or inline or block move before table if head content move to head if (node.type != Node.END_TAG) { if (node.tag == tt.tagForm) { lexer.ungetToken(); node = lexer.inferredTag("td"); lexer.report.warning(lexer, row, node, Report.MISSING_STARTTAG); } else if (node.type == Node.TEXT_NODE || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) { Node.moveBeforeTable(row, node, tt); lexer.report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN); lexer.exiled = true; if (node.type != Node.TEXT_NODE) { parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); } lexer.exiled = false; continue; } else if ((node.tag.model & Dict.CM_HEAD) != 0) { lexer.report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN); moveToHead(lexer, row, node); continue; } } if (!(node.tag == tt.tagTd || node.tag == tt.tagTh)) { lexer.report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN); continue; } // node should be <TD> or <TH> row.insertNodeAtEnd(node); excludeState = lexer.excludeBlocks; lexer.excludeBlocks = false; parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); lexer.excludeBlocks = excludeState; // pop inline stack while (lexer.istack.size() > lexer.istackbase) { lexer.popInline(null); } } Node.trimEmptyElement(lexer, row); } } /** * Parser for NOFRAMES. 
*/ public static class ParseNoFrames implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node noframes, short mode) { Node node; TagTable tt = lexer.configuration.tt; lexer.badAccess |= Report.USING_NOFRAMES; mode = Lexer.IGNORE_WHITESPACE; while ((node = lexer.getToken(mode)) != null) { if (node.tag == noframes.tag && node.type == Node.END_TAG) { noframes.closed = true; Node.trimSpaces(lexer, noframes); return; } if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset)) { Node.trimSpaces(lexer, noframes); // fix for [539369] if (node.type == Node.END_TAG) { lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED); // Throw it away } else { lexer.report.warning(lexer, noframes, node, Report.MISSING_ENDTAG_BEFORE); lexer.ungetToken(); } return; } if (node.tag == tt.tagHtml) { if (node.type == Node.START_TAG || node.type == Node.START_END_TAG) { lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED); } continue; } // deal with comments etc. 
if (Node.insertMisc(noframes, node)) { continue; } if (node.tag == tt.tagBody && node.type == Node.START_TAG) { boolean seenbody = lexer.seenEndBody; noframes.insertNodeAtEnd(node); parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); // MixedContent if (seenbody) { Node.coerceNode(lexer, node, tt.tagDiv); moveNodeToBody(lexer, node); } continue; } // implicit body element inferred if (node.type == Node.TEXT_NODE || (node.tag != null && node.type != Node.END_TAG)) { if (lexer.seenEndBody) { Node body = lexer.root.findBody(tt); if (node.type == Node.TEXT_NODE) { lexer.ungetToken(); node = lexer.inferredTag("p"); lexer.report.warning(lexer, noframes, node, Report.CONTENT_AFTER_BODY); } body.insertNodeAtEnd(node); } else { lexer.ungetToken(); node = lexer.inferredTag("body"); if (lexer.configuration.xmlOut) { lexer.report.warning(lexer, noframes, node, Report.INSERTING_TAG); } noframes.insertNodeAtEnd(node); } parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); // MixedContent continue; } // discard unexpected end tags lexer.report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED); } lexer.report.warning(lexer, noframes, node, Report.MISSING_ENDTAG_FOR); } } /** * Parser for SELECT. */ public static class ParseSelect implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node field, short mode) { Node node; TagTable tt = lexer.configuration.tt; lexer.insert = -1; // defer implicit inline start tags while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) { if (node.tag == field.tag && node.type == Node.END_TAG) { field.closed = true; Node.trimSpaces(lexer, field); return; } // deal with comments etc. 
if (Node.insertMisc(field, node)) { continue; } if (node.type == Node.START_TAG && (node.tag == tt.tagOption || node.tag == tt.tagOptgroup || node.tag == tt.tagScript)) { field.insertNodeAtEnd(node); parseTag(lexer, node, Lexer.IGNORE_WHITESPACE); continue; } // discard unexpected tags lexer.report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED); } lexer.report.warning(lexer, field, node, Report.MISSING_ENDTAG_FOR); } } /** * Parser for text nodes. */ public static class ParseText implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node field, short mode) { Node node; TagTable tt = lexer.configuration.tt; lexer.insert = -1; // defer implicit inline start tags if (field.tag == tt.tagTextarea) { mode = Lexer.PREFORMATTED; } else { mode = Lexer.MIXED_CONTENT; // kludge for font tags } while ((node = lexer.getToken(mode)) != null) { if (node.tag == field.tag && node.type == Node.END_TAG) { field.closed = true; Node.trimSpaces(lexer, field); return; } // deal with comments etc. if (Node.insertMisc(field, node)) { continue; } if (node.type == Node.TEXT_NODE) { // only called for 1st child if (field.content == null && !((mode & Lexer.PREFORMATTED) != 0)) { Node.trimSpaces(lexer, field); } if (node.start >= node.end) { continue; } field.insertNodeAtEnd(node); continue; } // for textarea should all cases of < and & be escaped? // discard inline tags e.g. 
font if (node.tag != null && ((node.tag.model & Dict.CM_INLINE) != 0) && (node.tag.model & Dict.CM_FIELD) == 0) // #487283 - fix by Lee Passey 25 Jan 02 { lexer.report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED); continue; } // terminate element on other tags if (!((field.tag.model & Dict.CM_OPT) != 0)) { lexer.report.warning(lexer, field, node, Report.MISSING_ENDTAG_BEFORE); } lexer.ungetToken(); Node.trimSpaces(lexer, field); return; } if (!((field.tag.model & Dict.CM_OPT) != 0)) { lexer.report.warning(lexer, field, node, Report.MISSING_ENDTAG_FOR); } } } /** * Parser for OPTGROUP. */ public static class ParseOptGroup implements Parser { /** * @see org.w3c.tidy.Parser#parse(org.w3c.tidy.Lexer, org.w3c.tidy.Node, short) */ public void parse(Lexer lexer, Node field, short mode) { Node node; TagTable tt = lexer.configuration.tt; lexer.insert = -1; // defer implicit inline start tags while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) { if (node.tag == field.tag && node.type == Node.END_TAG) { field.closed = true; Node.trimSpaces(lexer, field); return; } // deal with comments etc. if (Node.insertMisc(field, node)) { continue; } if (node.type == Node.START_TAG && (node.tag == tt.tagOption || node.tag == tt.tagOptgroup)) { if (node.tag == tt.tagOptgroup) { lexer.report.warning(lexer, field, node, Report.CANT_BE_NESTED); } field.insertNodeAtEnd(node); parseTag(lexer, node, Lexer.MIXED_CONTENT); continue; } // discard unexpected tags lexer.report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED); } } } /** * HTML is the top level element. */ public static Node parseDocument(Lexer lexer) { Node node, document, html; Node doctype = null; TagTable tt = lexer.configuration.tt; document = lexer.newNode(); document.type = Node.ROOT_NODE; lexer.root = document; while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) { // deal with comments etc. 
if (Node.insertMisc(document, node)) { continue; } if (node.type == Node.DOCTYPE_TAG) { if (doctype == null) { document.insertNodeAtEnd(node); doctype = node; } else { lexer.report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED); } continue; } if (node.type == Node.END_TAG) { lexer.report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED); // TODO? continue; } if (node.type != Node.START_TAG || node.tag != tt.tagHtml) { lexer.ungetToken(); html = lexer.inferredTag("html"); } else { html = node; } if (document.findDocType() == null && !lexer.configuration.bodyOnly) { lexer.report.warning(lexer, null, null, Report.MISSING_DOCTYPE); } document.insertNodeAtEnd(html); HTML.parse(lexer, html, (short) 0); // TODO? break; } if (lexer.root.findHTML(lexer.configuration.tt) == null) { /* a later check should complain if <body> is empty */ html = lexer.inferredTag("html"); lexer.root.insertNodeAtEnd(html); HTML.parse(lexer, html, Lexer.IGNORE_WHITESPACE); } if (lexer.root.findTITLE(lexer.configuration.tt) == null) { Node head = lexer.root.findHEAD(lexer.configuration.tt); lexer.report.warning(lexer, head, null, Report.MISSING_TITLE_ELEMENT); head.insertNodeAtEnd(lexer.inferredTag("title")); } return document; } /** * Indicates whether or not whitespace should be preserved for this element. If an <code>xml:space</code> * attribute is found, then if the attribute value is <code>preserve</code>, returns <code>true</code>. For * any other value, returns <code>false</code>. If an <code>xml:space</code> attribute was <em>not</em> * found, then the following element names result in a return value of <code>true: * pre, script, style,</code> and * <code>xsl:text</code>. Finally, if a <code>TagTable</code> was passed in and the element appears as the * "pre" element in the <code>TagTable</code>, then <code>true</code> will be returned. Otherwise, * <code>false</code> is returned. 
* @param element The <code>Node</code> to test to see if whitespace should be preserved. * @param tt The <code>TagTable</code> to test for the <code>getNodePre()</code> function. This may be * <code>null</code>, in which case this test is bypassed. * @return <code>true</code> or <code>false</code>, as explained above. */ public static boolean XMLPreserveWhiteSpace(Node element, TagTable tt) { AttVal attribute; // search attributes for xml:space for (attribute = element.attributes; attribute != null; attribute = attribute.next) { if (attribute.attribute.equals("xml:space")) { if (attribute.value.equals("preserve")) { return true; } return false; } } if (element.element == null) // Debian Bug #137124. Fix based on suggestion by Cesar Eduardo Barros 06 Mar 02 { return false; } // kludge for html docs without explicit xml:space attribute if ("pre".equalsIgnoreCase(element.element) || "script".equalsIgnoreCase(element.element) || "style".equalsIgnoreCase(element.element)) { return true; } if ((tt != null) && (tt.findParser(element) == PRE)) { return true; } // kludge for XSL docs if ("xsl:text".equalsIgnoreCase(element.element)) { return true; } return false; } /** * XML documents. */ public static void parseXMLElement(Lexer lexer, Node element, short mode) { Node node; // if node is pre or has xml:space="preserve" then do so if (XMLPreserveWhiteSpace(element, lexer.configuration.tt)) { mode = Lexer.PREFORMATTED; } while ((node = lexer.getToken(mode)) != null) { if (node.type == Node.END_TAG && node.element.equals(element.element)) { element.closed = true; break; } // discard unexpected end tags if (node.type == Node.END_TAG) { lexer.report.error(lexer, element, node, Report.UNEXPECTED_ENDTAG); continue; } // parse content on seeing start tag if (node.type == Node.START_TAG) { parseXMLElement(lexer, node, mode); } element.insertNodeAtEnd(node); } // if first child is text then trim initial space and delete text node if it is empty. 
node = element.content; if (node != null && node.type == Node.TEXT_NODE && mode != Lexer.PREFORMATTED) { if (node.textarray[node.start] == (byte) ' ') { node.start++; if (node.start >= node.end) { Node.discardElement(node); } } } // if last child is text then trim final space and delete the text node if it is empty node = element.last; if (node != null && node.type == Node.TEXT_NODE && mode != Lexer.PREFORMATTED) { if (node.textarray[node.end - 1] == (byte) ' ') { node.end--; if (node.start >= node.end) { Node.discardElement(node); } } } } public static Node parseXMLDocument(Lexer lexer) { Node node, document, doctype; document = lexer.newNode(); document.type = Node.ROOT_NODE; doctype = null; lexer.configuration.xmlTags = true; while ((node = lexer.getToken(Lexer.IGNORE_WHITESPACE)) != null) { // discard unexpected end tags if (node.type == Node.END_TAG) { lexer.report.warning(lexer, null, node, Report.UNEXPECTED_ENDTAG); continue; } // deal with comments etc. if (Node.insertMisc(document, node)) { continue; } if (node.type == Node.DOCTYPE_TAG) { if (doctype == null) { document.insertNodeAtEnd(node); doctype = node; } else { lexer.report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED); // TODO } continue; } if (node.type == Node.START_END_TAG) { document.insertNodeAtEnd(node); continue; } // if start tag then parse element's content if (node.type == Node.START_TAG) { document.insertNodeAtEnd(node); parseXMLElement(lexer, node, Lexer.IGNORE_WHITESPACE); } } if (doctype != null && !lexer.checkDocTypeKeyWords(doctype)) { lexer.report.warning(lexer, doctype, null, Report.DTYPE_NOT_UPPER_CASE); } // ensure presence of initial <?XML version="1.0"?> if (lexer.configuration.xmlPi) { lexer.fixXmlDecl(document); } return document; } /** * errors in positioning of form start or end tags generally require human intervention to fix. */ static void badForm(Lexer lexer) { lexer.badForm = 1; lexer.errors++; } }