/* * Java HTML Tidy - JTidy * HTML parser and pretty printer * * Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts * Institute of Technology, Institut National de Recherche en * Informatique et en Automatique, Keio University). All Rights * Reserved. * * Contributing Author(s): * * Dave Raggett <dsr@w3.org> * Andy Quick <ac.quick@sympatico.ca> (translation to Java) * Gary L Peskin <garyp@firstech.com> (Java development) * Sami Lempinen <sami@lempinen.net> (release management) * Fabrizio Giustina <fgiust at users.sourceforge.net> * * The contributing author(s) would like to thank all those who * helped with testing, bug fixes, and patience. This wouldn't * have been possible without all of you. * * COPYRIGHT NOTICE: * * This software and documentation is provided "as is," and * the copyright holders and contributing author(s) make no * representations or warranties, express or implied, including * but not limited to, warranties of merchantability or fitness * for any particular purpose or that the use of the software or * documentation will not infringe any third party patents, * copyrights, trademarks or other rights. * * The copyright holders and contributing author(s) will not be * liable for any direct, indirect, special or consequential damages * arising out of any use of the software or documentation, even if * advised of the possibility of such damage. * * Permission is hereby granted to use, copy, modify, and distribute * this source code, or portions hereof, documentation and executables, * for any purpose, without fee, subject to the following restrictions: * * 1. The origin of this source code must not be misrepresented. * 2. Altered versions must be plainly marked as such and must * not be misrepresented as being the original source. * 3. This Copyright notice may not be removed or altered from any * source or altered source distribution. 
* * The copyright holders and contributing author(s) specifically * permit, without fee, and encourage the use of this source code * as a component for supporting the Hypertext Markup Language in * commercial products. If you use this source code in a product, * acknowledgment is not required but would be appreciated. * */ package org.w3c.tidy; import java.io.PrintWriter; import java.util.List; import java.util.Stack; import java.util.Vector; /** * Lexer for html parser. * <p> * Given a file stream fp it returns a sequence of tokens. GetToken(fp) gets the next token UngetToken(fp) provides one * level undo The tags include an attribute list: - linked list of attribute/value nodes - each node has 2 * null-terminated strings. - entities are replaced in attribute values white space is compacted if not in preformatted * mode If not in preformatted mode then leading white space is discarded and subsequent white space sequences compacted * to single space chars. If XmlTags is no then Tag names are folded to upper case and attribute names to lower case. * Not yet done: - Doctype subset and marked sections * </p> * @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a> * @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java) * @author Fabrizio Giustina * @version $Revision: 1168 $ ($Author: aditsu $) */ public class Lexer { /** * state: ignore whitespace. */ public static final short IGNORE_WHITESPACE = 0; /** * state: mixed content. */ public static final short MIXED_CONTENT = 1; /** * state: preformatted. */ public static final short PREFORMATTED = 2; /** * state: ignore markup. */ public static final short IGNORE_MARKUP = 3; /** * URI for XHTML 1.0 transitional DTD. */ private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"; /** * URI for XHTML 1.0 strict DTD. 
*/ private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; /** * URI for XHTML 1.0 frameset DTD. */ private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"; /** * URI for XHTML 1.1. */ private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"; /** * URI for XHTML Basic 1.0. */ // private static final String VOYAGER_BASIC = "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd"; /** * xhtml namespace. */ private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"; /** * lists all the known versions. */ private static final Lexer.W3CVersionInfo[] W3CVERSION = { new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT), new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE), new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET), new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT), new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE), new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET), new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32), new W3CVersionInfo("HTML 3.2 Final", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32), new W3CVersionInfo("HTML 3.2 Draft", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32), new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML20), new W3CVersionInfo("HTML 4.01", "XHTML 1.1", VOYAGER_STRICT, Dict.VERS_XHTML11)}; /** * getToken state: content. */ private static final short LEX_CONTENT = 0; /** * getToken state: gt. */ private static final short LEX_GT = 1; /** * getToken state: endtag. 
*/ private static final short LEX_ENDTAG = 2; /** * getToken state: start tag. */ private static final short LEX_STARTTAG = 3; /** * getToken state: comment. */ private static final short LEX_COMMENT = 4; /** * getToken state: doctype. */ private static final short LEX_DOCTYPE = 5; /** * getToken state: procinstr. */ private static final short LEX_PROCINSTR = 6; /** * getToken state: cdata. */ private static final short LEX_CDATA = 8; /** * getToken state: section. */ private static final short LEX_SECTION = 9; /** * getToken state: asp. */ private static final short LEX_ASP = 10; /** * getToken state: jste. */ private static final short LEX_JSTE = 11; /** * getToken state: php. */ private static final short LEX_PHP = 12; /** * getToken state: xml declaration. */ private static final short LEX_XMLDECL = 13; /** * file stream. */ protected StreamIn in; /** * error output stream. */ protected PrintWriter errout; /** * for accessibility errors. */ protected short badAccess; /** * for bad style errors. */ protected short badLayout; /** * for bad char encodings. */ protected short badChars; /** * for mismatched/mispositioned form tags. */ protected short badForm; /** * count of warnings in this document. */ protected short warnings; /** * count of errors. */ protected short errors; /** * lines seen. */ protected int lines; /** * at start of current token. */ protected int columns; /** * used to collapse contiguous white space. */ protected boolean waswhite; /** * true after token has been pushed back. */ protected boolean pushed; /** * when space is moved after end tag. */ protected boolean insertspace; /** * Netscape compatibility. */ protected boolean excludeBlocks; /** * true if moved out of table. */ protected boolean exiled; /** * true if xmlns attribute on html element. */ protected boolean isvoyager; /** * bit vector of HTML versions. */ protected short versions; /** * version as given by doctype (if any). 
*/ protected int doctype; /** * set if html or PUBLIC is missing. */ protected boolean badDoctype; /** * start of current node. */ protected int txtstart; /** * end of current node. */ protected int txtend; /** * state of lexer's finite state machine. */ protected short state; /** * current node. */ protected Node token; /** * Lexer character buffer parse tree nodes span onto this buffer which contains the concatenated text contents of * all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars. */ protected byte[] lexbuf; /** * allocated. */ protected int lexlength; /** * used. */ protected int lexsize; /** * Inline stack for compatibility with Mosaic. For deferring text node. */ protected Node inode; /** * for inferring inline tags. */ protected int insert; /** * stack. */ protected Stack istack; /** * start of frame. */ protected int istackbase; /** * used for cleaning up presentation markup. */ protected Style styles; /** * configuration. */ protected Configuration configuration; /** * already seen end body tag? */ protected boolean seenEndBody; /** * already seen end html tag? */ protected boolean seenEndHtml; /** * report. */ protected Report report; /** * Root node is saved here. */ protected Node root; /** * node list. */ private List nodeList; /** * Instantiates a new Lexer. * @param in StreamIn * @param configuration configuation instance * @param report report instance, for reporting errors */ public Lexer(StreamIn in, Configuration configuration, Report report) { this.report = report; this.in = in; this.lines = 1; this.columns = 1; this.state = LEX_CONTENT; this.versions = (Dict.VERS_ALL | Dict.VERS_PROPRIETARY); this.doctype = Dict.VERS_UNKNOWN; this.insert = -1; this.istack = new Stack(); this.configuration = configuration; this.nodeList = new Vector(); } /** * Creates a new node and add it to nodelist. 
* @return Node */ public Node newNode() { Node node = new Node(); this.nodeList.add(node); return node; } /** * Creates a new node and add it to nodelist. * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE | * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG | * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL * @param textarray array of bytes contained in the Node * @param start start position * @param end end position * @return Node */ public Node newNode(short type, byte[] textarray, int start, int end) { Node node = new Node(type, textarray, start, end); this.nodeList.add(node); return node; } /** * Creates a new node and add it to nodelist. * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE | * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node. ASP_TAG | * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL * @param textarray array of bytes contained in the Node * @param start start position * @param end end position * @param element tag name * @return Node */ public Node newNode(short type, byte[] textarray, int start, int end, String element) { Node node = new Node(type, textarray, start, end, element, this.configuration.tt); this.nodeList.add(node); return node; } /** * Clones a node and add it to node list. * @param node Node * @return cloned Node */ public Node cloneNode(Node node) { Node cnode = node.cloneNode(false); this.nodeList.add(cnode); for (AttVal att = cnode.attributes; att != null; att = att.next) { if (att.asp != null) { this.nodeList.add(att.asp); } if (att.php != null) { this.nodeList.add(att.php); } } return cnode; } /** * Clones an attribute value and add eventual asp or php node to node list. 
* @param attrs original AttVal * @return cloned AttVal */ public AttVal cloneAttributes(AttVal attrs) { AttVal cattrs = (AttVal) attrs.clone(); for (AttVal att = cattrs; att != null; att = att.next) { if (att.asp != null) { this.nodeList.add(att.asp); } if (att.php != null) { this.nodeList.add(att.php); } } return cattrs; } /** * Update <code>oldtextarray</code> in the current nodes. * @param oldtextarray previous text array * @param newtextarray new text array */ protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray) { Node node; for (int i = 0; i < this.nodeList.size(); i++) { node = (Node) (this.nodeList.get(i)); if (node.textarray == oldtextarray) { node.textarray = newtextarray; } } } /** * Adds a new line node. Used for creating preformatted text from Word2000. * @return new line node */ public Node newLineNode() { Node node = newNode(); node.textarray = this.lexbuf; node.start = this.lexsize; addCharToLexer('\n'); node.end = this.lexsize; return node; } /** * Has end of input stream been reached? * @return <code>true</code> if end of input stream been reached */ public boolean endOfInput() { return this.in.isEndOfStream(); } /** * Adds a byte to lexer buffer. * @param c byte to add */ public void addByte(int c) { if (this.lexsize + 1 >= this.lexlength) { while (this.lexsize + 1 >= this.lexlength) { if (this.lexlength == 0) { this.lexlength = 8192; } else { this.lexlength = this.lexlength * 2; } } byte[] temp = this.lexbuf; this.lexbuf = new byte[this.lexlength]; if (temp != null) { System.arraycopy(temp, 0, this.lexbuf, 0, temp.length); updateNodeTextArrays(temp, this.lexbuf); } } this.lexbuf[this.lexsize++] = (byte) c; this.lexbuf[this.lexsize] = (byte) '\0'; // debug } /** * Substitute the last char in buffer. * @param c new char */ public void changeChar(byte c) { if (this.lexsize > 0) { this.lexbuf[this.lexsize - 1] = c; } } /** * Store char c as UTF-8 encoded byte stream. 
* @param c char to store */ public void addCharToLexer(int c) { // Allow only valid XML characters. See: http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char // Fix by Pablo Mayrgundter 17-08-2004 if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output && !((c >= 0x20 && c <= 0xD7FF) // Check the common-case first. || c == 0x9 || c == 0xA || c == 0xD // Then white-space. || (c >= 0xE000 && c <= 0xFFFD) // Then high-range unicode. || (c >= 0x10000 && c <= 0x10FFFF))) { return; } int i = 0; int[] count = new int[]{0}; byte[] buf = new byte[10]; // unsigned char boolean err = EncodingUtils.encodeCharToUTF8Bytes(c, buf, null, count); if (err) { // replacement char 0xFFFD encoded as UTF-8 buf[0] = (byte) 0xEF; buf[1] = (byte) 0xBF; buf[2] = (byte) 0xBD; count[0] = 3; } for (i = 0; i < count[0]; i++) { addByte(buf[i]); // uint } } /** * Adds a string to lexer buffer. * @param str String to add */ public void addStringToLexer(String str) { for (int i = 0; i < str.length(); i++) { addCharToLexer(str.charAt(i)); } } /** * Parse an html entity. * @param mode mode */ public void parseEntity(short mode) { // No longer attempts to insert missing ';' for unknown // entities unless one was present already, since this // gives unexpected results. // // For example: <a href="something.htm?foo&bar&fred"> // was tidied to: <a href="something.htm?foo&bar;&fred;"> // rather than: <a href="something.htm?foo&bar&fred"> // // My thanks for Maurice Buxton for spotting this. // // Also Randy Waki pointed out the following case for the // 04 Aug 00 version (bug #433012): // // For example: <a href="something.htm?id=1&lang=en"> // was tidied to: <a href="something.htm?id=1⟨=en"> // rather than: <a href="something.htm?id=1&lang=en"> // // where "lang" is a known entity (#9001), but browsers would // misinterpret "⟨" because it had a value > 256. // // So the case of an apparently known entity with a value > 256 and // missing a semicolon is handled specially. 
// // "ParseEntity" is also a bit of a misnomer - it handles entities and // numeric character references. Invalid NCR's are now reported. int start; boolean first = true; boolean semicolon = false; int c, ch, startcol; String str; start = this.lexsize - 1; // to start at "&" startcol = this.in.getCurcol() - 1; while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) { if (c == ';') { semicolon = true; break; } if (first && c == '#') { // #431953 - start RJ if (!this.configuration.ncr || "BIG5".equals(this.configuration.getInCharEncodingName()) || "SHIFTJIS".equals(this.configuration.getInCharEncodingName())) { this.in.ungetChar(c); return; } // #431953 - end RJ addCharToLexer(c); first = false; continue; } first = false; if (TidyUtils.isNamechar((char) c)) { addCharToLexer(c); continue; } // otherwise put it back this.in.ungetChar(c); break; } str = TidyUtils.getString(this.lexbuf, start, this.lexsize - start); if ("&apos".equals(str) && !configuration.xmlOut && !this.isvoyager && !configuration.xHTML) { report.entityError(this, Report.APOS_UNDEFINED, str, 39); } ch = EntityTable.getDefaultEntityTable().entityCode(str); // drops invalid numeric entities from XML mode. Fix by Pablo Mayrgundter 17-08-2004 // if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output // && !((ch >= 0x20 && ch <= 0xD7FF) // Check the common-case first. // || ch == 0x9 || ch == 0xA || ch == 0xD // Then white-space. 
// || (ch >= 0xE000 && ch <= 0xFFFD))) // { // this.lexsize = start; // return; // } // deal with unrecognized or invalid entities // #433012 - fix by Randy Waki 17 Feb 01 // report invalid NCR's - Terry Teague 01 Sep 01 if (ch <= 0 || (ch >= 256 && c != ';')) { // set error position just before offending character this.lines = this.in.getCurline(); this.columns = startcol; if (this.lexsize > start + 1) { if (ch >= 128 && ch <= 159) { // invalid numeric character reference int c1 = 0; if ("WIN1252".equals(configuration.replacementCharEncoding)) { c1 = EncodingUtils.decodeWin1252(ch); } else if ("MACROMAN".equals(configuration.replacementCharEncoding)) { c1 = EncodingUtils.decodeMacRoman(ch); } // "or" DISCARDED_CHAR with the other errors if discarding char; otherwise default is replacing int replaceMode = c1 != 0 ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR; if (c != ';') /* issue warning if not terminated by ';' */ { report.entityError(this, Report.MISSING_SEMICOLON_NCR, str, c); } report.encodingError(this, (short) (Report.INVALID_NCR | replaceMode), ch); if (c1 != 0) { // make the replacement this.lexsize = start; addCharToLexer(c1); semicolon = false; } else { /* discard */ this.lexsize = start; semicolon = false; } } else { report.entityError(this, Report.UNKNOWN_ENTITY, str, ch); } if (semicolon) { addCharToLexer(';'); } } else { // naked & report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch); } } else { // issue warning if not terminated by ';' if (c != ';') { // set error position just before offending character this.lines = this.in.getCurline(); this.columns = startcol; report.entityError(this, Report.MISSING_SEMICOLON, str, c); } this.lexsize = start; if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED)) { ch = ' '; } addCharToLexer(ch); if (ch == '&' && !this.configuration.quoteAmpersand) { addCharToLexer('a'); addCharToLexer('m'); addCharToLexer('p'); addCharToLexer(';'); } } } /** * Parses a tag name. 
* @return first char after the tag name */ public char parseTagName() { int c; // fold case of first char in buffer c = this.lexbuf[this.txtstart]; if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c)) { c = TidyUtils.toLower((char) c); this.lexbuf[this.txtstart] = (byte) c; } while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) { if (!TidyUtils.isNamechar((char) c)) { break; } // fold case of subsequent chars if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c)) { c = TidyUtils.toLower((char) c); } addCharToLexer(c); } this.txtend = this.lexsize; return (char) c; } /** * calls addCharToLexer for any char in the string. * @param str input String */ public void addStringLiteral(String str) { int len = str.length(); for (int i = 0; i < len; i++) { addCharToLexer(str.charAt(i)); } } /** * calls addCharToLexer for any char in the string till len is reached. * @param str input String * @param len length of the substring to be added */ void addStringLiteralLen(String str, int len) { int strlen = str.length(); if (strlen < len) { len = strlen; } for (int i = 0; i < len; i++) { addCharToLexer(str.charAt(i)); } } /** * Choose what version to use for new doctype. * @return html version constant */ public short htmlVersion() { if (TidyUtils.toBoolean(versions & Dict.VERS_HTML20)) { return Dict.VERS_HTML20; } if (!(this.configuration.xmlOut | this.configuration.xmlTags | this.isvoyager) && TidyUtils.toBoolean(versions & Dict.VERS_HTML32)) { return Dict.VERS_HTML32; } if (TidyUtils.toBoolean(versions & Dict.VERS_XHTML11)) { return Dict.VERS_XHTML11; } if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_STRICT)) { return Dict.VERS_HTML40_STRICT; } if (TidyUtils.toBoolean(versions & Dict.VERS_HTML40_LOOSE)) { return Dict.VERS_HTML40_LOOSE; } if (TidyUtils.toBoolean(versions & Dict.VERS_FRAMESET)) { return Dict.VERS_FRAMESET; } return Dict.VERS_UNKNOWN; } /** * Choose what version to use for new doctype. 
* @return html version name */ public String htmlVersionName() { short guessed; int j; guessed = apparentVersion(); for (j = 0; j < W3CVERSION.length; ++j) { if (guessed == W3CVERSION[j].code) { if (this.isvoyager) { return W3CVERSION[j].voyagerName; } return W3CVERSION[j].name; } } return null; } /** * Add meta element for Tidy. If the meta tag is already present, update release date. * @param root root node * @return <code>true</code> if the tag has been added */ public boolean addGenerator(Node root) { AttVal attval; Node node; Node head = root.findHEAD(this.configuration.tt); if (head != null) { String meta = "HTML Tidy for Java (vers. " + Report.RELEASE_DATE_STRING + "), see jtidy.sourceforge.net"; for (node = head.content; node != null; node = node.next) { if (node.tag == this.configuration.tt.tagMeta) { attval = node.getAttrByName("name"); if (attval != null && attval.value != null && "generator".equalsIgnoreCase(attval.value)) { attval = node.getAttrByName("content"); if (attval != null && attval.value != null && attval.value.length() >= 9 && "HTML Tidy".equalsIgnoreCase(attval.value.substring(0, 9))) { attval.value = meta; return false; } } } } node = this.inferredTag("meta"); node.addAttribute("content", meta); node.addAttribute("name", "generator"); head.insertNodeAtStart(node); return true; } return false; } /** * Check system keywords (keywords should be uppercase). * @param doctype doctype node * @return true if doctype keywords are all uppercase */ public boolean checkDocTypeKeyWords(Node doctype) { int len = doctype.end - doctype.start; String s = TidyUtils.getString(this.lexbuf, doctype.start, len); return !(TidyUtils.findBadSubString("SYSTEM", s, s.length()) || TidyUtils.findBadSubString("PUBLIC", s, s.length()) || TidyUtils.findBadSubString("//DTD", s, s.length()) || TidyUtils.findBadSubString("//W3C", s, s.length()) || TidyUtils.findBadSubString("//EN", s, s.length())); } /** * Examine DOCTYPE to identify version. 
* @param doctype doctype node * @return version code */ public short findGivenVersion(Node doctype) { String p, s; int i, j; int len; String str1; String str2; // if root tag for doctype isn't html give up now str1 = TidyUtils.getString(this.lexbuf, doctype.start, 5); if (!"html ".equalsIgnoreCase(str1)) { return 0; } if (!checkDocTypeKeyWords(doctype)) { report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE); } // give up if all we are given is the system id for the doctype str1 = TidyUtils.getString(this.lexbuf, doctype.start + 5, 7); if ("SYSTEM ".equalsIgnoreCase(str1)) { // but at least ensure the case is correct if (!str1.substring(0, 6).equals("SYSTEM")) { System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6); } return 0; // unrecognized } if ("PUBLIC ".equalsIgnoreCase(str1)) { if (!str1.substring(0, 6).equals("PUBLIC")) { System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6); } } else { this.badDoctype = true; } for (i = doctype.start; i < doctype.end; ++i) { if (this.lexbuf[i] == (byte) '"') { str1 = TidyUtils.getString(this.lexbuf, i + 1, 12); str2 = TidyUtils.getString(this.lexbuf, i + 1, 13); if (str1.equals("-//W3C//DTD ")) { // compute length of identifier e.g. "HTML 4.0 Transitional" for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j) { // } len = j - i - 13; p = TidyUtils.getString(this.lexbuf, i + 13, len); for (j = 1; j < W3CVERSION.length; ++j) { s = W3CVERSION[j].name; if (len == s.length() && s.equals(p)) { return W3CVERSION[j].code; } } // else unrecognized version } else if (str2.equals("-//IETF//DTD ")) { // compute length of identifier e.g. 
"HTML 2.0" for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j) { // } len = j - i - 14; p = TidyUtils.getString(this.lexbuf, i + 14, len); s = W3CVERSION[0].name; if (len == s.length() && s.equals(p)) { return W3CVERSION[0].code; } // else unrecognized version } break; } } return 0; } /** * Fix xhtml namespace. * @param root root Node * @param profile current profile */ public void fixHTMLNameSpace(Node root, String profile) { Node node; AttVal attr; node = root.content; while (node != null && node.tag != this.configuration.tt.tagHtml) { node = node.next; } if (node != null) { for (attr = node.attributes; attr != null; attr = attr.next) { if (attr.attribute.equals("xmlns")) { break; } } if (attr != null) { if (!attr.value.equals(profile)) { report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE); attr.value = profile; } } else { attr = new AttVal(node.attributes, null, '"', "xmlns", profile); attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr); node.attributes = attr; } } } /** * Put DOCTYPE declaration between the <:?xml version "1.0" ... ?> declaration, if any, and the * <code>html</code> tag. Should also work for any comments, etc. that may precede the <code>html</code> tag. * @param root root node * @return new doctype node */ Node newXhtmlDocTypeNode(Node root) { Node html = root.findHTML(this.configuration.tt); if (html == null) { return null; } Node newdoctype = newNode(); newdoctype.setType(Node.DOCTYPE_TAG); newdoctype.next = html; newdoctype.parent = root; newdoctype.prev = null; if (html == root.content) { // No <?xml ... ?> declaration. root.content.prev = newdoctype; root.content = newdoctype; newdoctype.prev = null; } else { // we have an <?xml ... ?> declaration. newdoctype.prev = html.prev; newdoctype.prev.next = newdoctype; } html.prev = newdoctype; return newdoctype; } /** * Adds a new xhtml doctype to the document. 
* @param root root node * @return <code>true</code> if a doctype has been added */ public boolean setXHTMLDocType(Node root) { String fpi = " "; String sysid = ""; String namespace = XHTML_NAMESPACE; String dtdsub = null; Node doctype; int dtdlen = 0; doctype = root.findDocType(); fixHTMLNameSpace(root, namespace); // #427839 - fix by Evan Lenz 05 Sep 00 if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT) { if (doctype != null) { Node.discardElement(doctype); } return true; } if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO) { // see what flavor of XHTML this document matches if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT)) { // use XHTML strict fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; sysid = VOYAGER_STRICT; } else if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET)) { // use XHTML frames fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN"; sysid = VOYAGER_FRAMESET; } else if (TidyUtils.toBoolean(this.versions & Dict.VERS_LOOSE)) { fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; sysid = VOYAGER_LOOSE; } else if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11)) { // use XHTML 1.1 fpi = "-//W3C//DTD XHTML 1.1//EN"; sysid = VOYAGER_11; } else { // proprietary fpi = null; sysid = ""; if (doctype != null)// #473490 - fix by BjÅ¡rn HÅ¡hrmann 10 Oct 01 { Node.discardElement(doctype); } } } else if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT) { fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; sysid = VOYAGER_STRICT; } else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) { fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; sysid = VOYAGER_LOOSE; } if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null) { fpi = this.configuration.docTypeStr; sysid = ""; } if (fpi == null) { return false; } if (doctype != null) { // Look for internal DTD subset if (configuration.xHTML || configuration.xmlOut) { int len = doctype.end - doctype.start + 1; String 
start = TidyUtils.getString(this.lexbuf, doctype.start, len); int dtdbeg = start.indexOf('['); if (dtdbeg >= 0) { int dtdend = start.substring(dtdbeg).indexOf(']'); if (dtdend >= 0) { dtdlen = dtdend + 1; dtdsub = start.substring(dtdbeg); } } } } else { if ((doctype = newXhtmlDocTypeNode(root)) == null) { return false; } } this.txtstart = this.lexsize; this.txtend = this.lexsize; // add public identifier addStringLiteral("html PUBLIC "); // check if the fpi is quoted or not if (fpi.charAt(0) == '"') { addStringLiteral(fpi); } else { addStringLiteral("\""); addStringLiteral(fpi); addStringLiteral("\""); } if (this.configuration.wraplen != 0 && sysid.length() + 6 >= this.configuration.wraplen) { addStringLiteral("\n\""); } else { // FG: don't wrap addStringLiteral(" \""); } // add system identifier addStringLiteral(sysid); addStringLiteral("\""); if (dtdlen > 0 && dtdsub != null) { addCharToLexer(' '); addStringLiteralLen(dtdsub, dtdlen); } this.txtend = this.lexsize; int length = this.txtend - this.txtstart; doctype.textarray = new byte[length]; System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length); doctype.start = 0; doctype.end = length; return false; } /** * Return the html version used in document. 
* @return version code */ public short apparentVersion() { switch (this.doctype) { case Dict.VERS_UNKNOWN : return htmlVersion(); case Dict.VERS_HTML20 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20)) { return Dict.VERS_HTML20; } break; case Dict.VERS_HTML32 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32)) { return Dict.VERS_HTML32; } break; // to replace old version by new case Dict.VERS_HTML40_STRICT : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT)) { return Dict.VERS_HTML40_STRICT; } break; case Dict.VERS_HTML40_LOOSE : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE)) { return Dict.VERS_HTML40_LOOSE; } break; // to replace old version by new case Dict.VERS_FRAMESET : if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET)) { return Dict.VERS_FRAMESET; } break; case Dict.VERS_XHTML11 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11)) { return Dict.VERS_XHTML11; } break; default : // should never reach here break; } // kludge to avoid error appearing at end of file // it would be better to note the actual position // when first encountering the doctype declaration this.lines = 1; this.columns = 1; report.warning(this, null, null, Report.INCONSISTENT_VERSION); return this.htmlVersion(); } /** * Fixup doctype if missing. 
* @param root root node * @return <code>false</code> if current version has not been identified */ public boolean fixDocType(Node root) { Node doctype; int guessed = Dict.VERS_HTML40_STRICT, i; if (this.badDoctype) { report.warning(this, null, null, Report.MALFORMED_DOCTYPE); } doctype = root.findDocType(); if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT) { if (doctype != null) { Node.discardElement(doctype); } return true; } if (this.configuration.xmlOut) { return true; } if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT) { Node.discardElement(doctype); doctype = null; guessed = Dict.VERS_HTML40_STRICT; } else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) { Node.discardElement(doctype); doctype = null; guessed = Dict.VERS_HTML40_LOOSE; } else if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO) { if (doctype != null) { if (this.doctype == Dict.VERS_UNKNOWN) { return false; } switch (this.doctype) { case Dict.VERS_UNKNOWN : return false; case Dict.VERS_HTML20 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20)) { return true; } break; // to replace old version by new case Dict.VERS_HTML32 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32)) { return true; } break; // to replace old version by new case Dict.VERS_HTML40_STRICT : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT)) { return true; } break; // to replace old version by new case Dict.VERS_HTML40_LOOSE : if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE)) { return true; } break; // to replace old version by new case Dict.VERS_FRAMESET : if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET)) { return true; } break; // to replace old version by new case Dict.VERS_XHTML11 : if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11)) { return true; } break; // to replace old version by new default : // should never reach here break; } // INCONSISTENT_VERSION warning is now issued by 
ApparentVersion() } // choose new doctype guessed = htmlVersion(); } if (guessed == Dict.VERS_UNKNOWN) { return false; } // for XML use the Voyager system identifier if (this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager) { if (doctype != null) { Node.discardElement(doctype); } fixHTMLNameSpace(root, XHTML_NAMESPACE); // Namespace is the same for all XHTML variants // Also, don't return yet. Still need to add DOCTYPE declaration. // // for (i = 0; i < W3CVersion.length; ++i) // { // if (guessed == W3CVersion[i].code) // { // fixHTMLNameSpace(root, W3CVersion[i].profile); // break; // } // } // return true; } if (doctype == null) { if ((doctype = newXhtmlDocTypeNode(root)) == null) { return false; } } this.txtstart = this.lexsize; this.txtend = this.lexsize; // use the appropriate public identifier addStringLiteral("html PUBLIC "); if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER && this.configuration.docTypeStr != null && this.configuration.docTypeStr.length() > 0) { // check if the fpi is quoted or not if (this.configuration.docTypeStr.charAt(0) == '"') { addStringLiteral(this.configuration.docTypeStr); } else { addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001 addStringLiteral(this.configuration.docTypeStr); addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001 } } else if (guessed == Dict.VERS_HTML20) { addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\""); } else { addStringLiteral("\"-//W3C//DTD "); for (i = 0; i < W3CVERSION.length; ++i) { if (guessed == W3CVERSION[i].code) { addStringLiteral(W3CVERSION[i].name); break; } } addStringLiteral("//EN\""); } this.txtend = this.lexsize; int length = this.txtend - this.txtstart; doctype.textarray = new byte[length]; System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length); doctype.start = 0; doctype.end = length; return true; } /** * Ensure XML document starts with <code><?XML version="1.0"?></code>. 
Add encoding attribute if not using * ASCII or UTF-8 output. * @param root root node * @return always true */ public boolean fixXmlDecl(Node root) { Node xml; AttVal version; AttVal encoding; if (root.content != null && root.content.type == Node.XML_DECL) { xml = root.content; } else { xml = newNode(Node.XML_DECL, this.lexbuf, 0, 0); root.insertNodeAtStart(xml); } version = xml.getAttrByName("version"); encoding = xml.getAttrByName("encoding"); // We need to insert a check if declared encoding and output encoding mismatch // and fix the Xml declaration accordingly!!! if (encoding == null && !"UTF8".equals(this.configuration.getOutCharEncodingName())) { if ("ISO8859_1".equals(this.configuration.getOutCharEncodingName())) { xml.addAttribute("encoding", "iso-8859-1"); } if ("ISO2022".equals(this.configuration.getOutCharEncodingName())) { xml.addAttribute("encoding", "iso-2022"); } } if (version == null) { xml.addAttribute("version", "1.0"); } return true; } /** * Generates and inserts a new node. * @param name tag name * @return generated node */ public Node inferredTag(String name) { Node node; node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend, name); node.implicit = true; return node; } private static final int CDATA_INTERMEDIATE = 0; private static final int CDATA_STARTTAG = 1; private static final int CDATA_ENDTAG = 2; /** * Create a text node for the contents of a CDATA element like style or script which ends with </foo> for some * foo. 
* @param container container node
     * @return text node with the raw content read so far; <code>null</code> when the container is a script element
     * with a <code>src</code> attribute, nothing but white space has been read and a new start tag follows
     */
    public Node getCDATA(Node container)
    {
        // start: offset in lexbuf of the tag name currently being examined
        int start = 0;
        // nested: number of start tags seen whose name matches the container
        int nested = 0;
        int state = CDATA_INTERMEDIATE;
        int c;
        // isEmpty: stays true while only white space has been read outside tag candidates
        boolean isEmpty = true;
        boolean matches = false;
        boolean hasSrc = container.getAttrByName("src") != null;

        this.lines = this.in.getCurline();
        this.columns = this.in.getCurcol();
        this.waswhite = false;
        this.txtstart = this.lexsize;
        this.txtend = this.lexsize;

        /* seen start tag, look for matching end tag */
        while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
        {
            addCharToLexer(c);
            txtend = lexsize;

            if (state == CDATA_INTERMEDIATE)
            {
                if (c != '<')
                {
                    if (isEmpty && !TidyUtils.isWhite((char) c))
                    {
                        isEmpty = false;
                    }
                    continue;
                }

                // look at the character following '<' to decide what kind of candidate this is
                c = in.readChar();

                if (TidyUtils.isLetter((char) c))
                {
                    /* <head><script src=foo><meta name=foo content=bar>*/
                    if (hasSrc && isEmpty && container.tag == configuration.tt.tagScript)
                    {
                        /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
                        // forget everything buffered and hand '<' + letter back to the stream
                        lexsize = txtstart;
                        in.ungetChar(c);
                        in.ungetChar('<');
                        return null;
                    }
                    addCharToLexer(c);
                    start = lexsize - 1;
                    state = CDATA_STARTTAG;
                }
                else if (c == '/')
                {
                    addCharToLexer(c);

                    c = in.readChar();

                    if (!TidyUtils.isLetter((char) c))
                    {
                        in.ungetChar(c);
                        continue;
                    }
                    in.ungetChar(c);

                    start = lexsize;
                    state = CDATA_ENDTAG;
                }
                else if (c == '\\')
                {
                    /* recognize document.write("<script><\/script>") */
                    addCharToLexer(c);

                    c = in.readChar();

                    if (c != '/')
                    {
                        in.ungetChar(c);
                        continue;
                    }

                    addCharToLexer(c);
                    c = in.readChar();

                    if (!TidyUtils.isLetter((char) c))
                    {
                        in.ungetChar(c);
                        continue;
                    }
                    in.ungetChar(c);

                    start = lexsize;
                    state = CDATA_ENDTAG;
                }
                else
                {
                    in.ungetChar(c);
                }
            }
            else if (state == CDATA_STARTTAG)
            {
                /* '<' + Letter found */
                if (TidyUtils.isLetter((char) c))
                {
                    continue;
                }

                // first non-letter: compare the buffered name with the container's name
                matches = container.element.equalsIgnoreCase(TidyUtils.getString(lexbuf, start, container.element
                    .length()));
                if (matches)
                {
                    nested++;
                }

                state = CDATA_INTERMEDIATE;
            }
            else if (state == CDATA_ENDTAG)
            {
                /* '<' + '/' + Letter found */
                if (TidyUtils.isLetter((char) c))
                {
                    continue;
                }

                matches = container.element.equalsIgnoreCase(TidyUtils.getString(lexbuf, start, container.element
                    .length()));

                if (isEmpty && !matches)
                {
                    /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */

                    // hand the buffered end tag back to the stream, last byte first
                    for (int i = lexsize - 1; i >= start; --i)
                    {
                        in.ungetChar(lexbuf[i]);
                    }
                    in.ungetChar('/');
                    in.ungetChar('<');
                    break;
                }

                if (matches && nested-- <= 0)
                {
                    // matching end tag with no open nested start tag: hand it back and stop
                    for (int i = lexsize - 1; i >= start; --i)
                    {
                        in.ungetChar(lexbuf[i]);
                    }
                    in.ungetChar('/');
                    in.ungetChar('<');
                    // remove the tag name and the two bytes before it from the buffer
                    lexsize -= (lexsize - start) + 2;
                    break;
                }
                else if (lexbuf[start - 2] != '\\')
                {
                    /* if the end tag is not already escaped using backslash */
                    lines = in.getCurline();
                    columns = in.getCurcol();
                    columns -= 3;
                    report.warning(this, null, null, Report.BAD_CDATA_CONTENT);

                    /* if javascript insert backslash before / */
                    if (container.isJavaScript())
                    {
                        // shift the buffer one byte to the right from start-1 to make room
                        for (int i = lexsize; i > start - 1; --i)
                        {
                            lexbuf[i] = lexbuf[i - 1];
                        }
                        lexbuf[start - 1] = '\\';
                        lexsize++;
                    }
                }
                state = CDATA_INTERMEDIATE;
            }
        }

        if (isEmpty)
        {
            lexsize = txtstart = txtend;
        }
        else
        {
            txtend = lexsize;
        }

        if (c == StreamIn.END_OF_STREAM)
        {
            report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
        }

        /* this was disabled for some reason... */
        // #if 0
        // if (lexer->txtend > lexer->txtstart)
        // return TextToken(lexer);
        // else
        // return NULL;
        // #else
        return newNode(Node.TEXT_NODE, lexbuf, txtstart, txtend);
        // #endif
    }

    /**
     * Marks the current token as pushed back by setting the <code>pushed</code> flag, which
     * <code>getToken</code> checks before reading further input.
     */
    public void ungetToken()
    {
        this.pushed = true;
    }

    /**
     * Gets a token.
* @param mode one of the following:
     * <ul>
     * <li><code>MixedContent</code>-- for elements which don't accept PCDATA</li>
     * <li><code>Preformatted</code>-- white space preserved as is</li>
     * <li><code>IgnoreMarkup</code>-- for CDATA elements such as script, style</li>
     * </ul>
     * The body works with the constants {@link #IGNORE_WHITESPACE}, {@link #MIXED_CONTENT}, {@link #PREFORMATTED}
     * and {@link #IGNORE_MARKUP}.
     * @return next Node, or <code>null</code> when the input is exhausted and no text or comment is pending
     */
    public Node getToken(short mode)
    {
        int c = 0;
        int badcomment = 0;
        // pass by reference
        boolean[] isempty = new boolean[1];
        boolean inDTDSubset = false;
        AttVal attributes = null;

        // a token pushed back through ungetToken() is handed out again
        if (this.pushed)
        {
            // duplicate inlines in preference to pushed text nodes when appropriate
            if (this.token.type != Node.TEXT_NODE || (this.insert == -1 && this.inode == null))
            {
                this.pushed = false;
                return this.token;
            }
        }

        // at start of block elements, unclosed inline
        if (this.insert != -1 || this.inode != null)
        {
            return insertedToken();
        }

        this.lines = this.in.getCurline();
        this.columns = this.in.getCurcol();
        this.waswhite = false;

        this.txtstart = this.lexsize;
        this.txtend = this.lexsize;

        while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
        {
            // FG fix for [427846] different from tidy
            // if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
            if (this.insertspace && mode != IGNORE_WHITESPACE)
            {
                addCharToLexer(' ');
            }
            if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
            {
                this.waswhite = true;
                this.insertspace = false;
            }

            // treat \r\n as \n and \r as \n
            if (c == '\r')
            {
                c = this.in.readChar();

                if (c != '\n')
                {
                    this.in.ungetChar(c);
                }

                c = '\n';
            }

            addCharToLexer(c);

            switch (this.state)
            {
                case LEX_CONTENT : // element content

                    // Discard white space if appropriate.
                    // Its cheaper to do this here rather than in parser methods for elements that
                    // don't have mixed content.
                    if (TidyUtils.isWhite((char) c) && (mode == IGNORE_WHITESPACE) && this.lexsize == this.txtstart + 1)
                    {
                        --this.lexsize;
                        this.waswhite = false;
                        this.lines = this.in.getCurline();
                        this.columns = this.in.getCurcol();
                        continue;
                    }

                    if (c == '<')
                    {
                        this.state = LEX_GT;
                        continue;
                    }

                    if (TidyUtils.isWhite((char) c))
                    {
                        // was previous char white?
                        if (this.waswhite)
                        {
                            if (mode != PREFORMATTED && mode != IGNORE_MARKUP)
                            {
                                --this.lexsize;
                                this.lines = this.in.getCurline();
                                this.columns = this.in.getCurcol();
                            }
                        }
                        else
                        {
                            // prev char wasn't white
                            this.waswhite = true;

                            if (mode != PREFORMATTED && mode != IGNORE_MARKUP && c != ' ')
                            {
                                changeChar((byte) ' ');
                            }
                        }

                        continue;
                    }
                    else if (c == '&' && mode != IGNORE_MARKUP)
                    {
                        parseEntity(mode);
                    }

                    // this is needed to avoid trimming trailing whitespace
                    if (mode == IGNORE_WHITESPACE)
                    {
                        mode = MIXED_CONTENT;
                    }

                    this.waswhite = false;
                    continue;

                case LEX_GT : // <

                    // check for endtag
                    if (c == '/')
                    {
                        c = this.in.readChar();
                        if (c == StreamIn.END_OF_STREAM)
                        {
                            this.in.ungetChar(c);
                            continue;
                        }

                        addCharToLexer(c);

                        if (TidyUtils.isLetter((char) c))
                        {
                            // take the three bytes just buffered ('<', '/', letter) out of the text again
                            this.lexsize -= 3;
                            this.txtend = this.lexsize;
                            this.in.ungetChar(c);
                            this.state = LEX_ENDTAG;
                            this.lexbuf[this.lexsize] = (byte) '\0'; // debug

                            // changed from
                            // this.in.curcol -= 2;
                            this.columns -= 2;

                            // if some text before the </ return it now
                            if (this.txtend > this.txtstart)
                            {
                                // trim space char before end tag
                                if (mode == IGNORE_WHITESPACE && this.lexbuf[this.lexsize - 1] == (byte) ' ')
                                {
                                    this.lexsize -= 1;
                                    this.txtend = this.lexsize;
                                }

                                this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
                                return this.token;
                            }

                            continue; // no text so keep going
                        }

                        // otherwise treat as CDATA
                        this.waswhite = false;
                        this.state = LEX_CONTENT;
                        continue;
                    }

                    if (mode == IGNORE_MARKUP)
                    {
                        // otherwise treat as CDATA
                        this.waswhite = false;
                        this.state = LEX_CONTENT;
                        continue;
                    }

                    // look out for comments, doctype or marked sections this isn't quite right, but its getting there
                    if (c == '!')
                    {
                        c = this.in.readChar();

                        if (c == '-')
                        {
                            c = this.in.readChar();

                            if (c == '-')
                            {
                                this.state = LEX_COMMENT; // comment
                                this.lexsize -= 2;
                                this.txtend = this.lexsize;

                                // if some text before < return it now
                                if (this.txtend > this.txtstart)
                                {
                                    this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
                                    return this.token;
                                }

                                this.txtstart = this.lexsize;
                                continue;
                            }

                            report.warning(this, null, null, Report.MALFORMED_COMMENT);
                        }
                        else if (c == 'd' || c == 'D')
                        {
                            this.state = LEX_DOCTYPE; // doctype
                            this.lexsize -= 2;
                            this.txtend = this.lexsize;
                            mode = IGNORE_WHITESPACE;

                            // skip until white space or '>'
                            for (;;)
                            {
                                c = this.in.readChar();

                                if (c == StreamIn.END_OF_STREAM || c == '>')
                                {
                                    this.in.ungetChar(c);
                                    break;
                                }

                                if (!TidyUtils.isWhite((char) c))
                                {
                                    continue;
                                }

                                // and skip to end of whitespace
                                for (;;)
                                {
                                    c = this.in.readChar();

                                    if (c == StreamIn.END_OF_STREAM || c == '>')
                                    {
                                        this.in.ungetChar(c);
                                        break;
                                    }

                                    if (TidyUtils.isWhite((char) c))
                                    {
                                        continue;
                                    }

                                    this.in.ungetChar(c);
                                    break;
                                }

                                break;
                            }

                            // if some text before < return it now
                            if (this.txtend > this.txtstart)
                            {
                                this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
                                return this.token;
                            }

                            this.txtstart = this.lexsize;
                            continue;
                        }
                        else if (c == '[')
                        {
                            // Word 2000 embeds <![if ...]> ... <![endif]> sequences
                            this.lexsize -= 2;
                            this.state = LEX_SECTION;
                            this.txtend = this.lexsize;

                            // if some text before < return it now
                            if (this.txtend > this.txtstart)
                            {
                                this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
                                return this.token;
                            }

                            this.txtstart = this.lexsize;
                            continue;
                        }

                        // otherwise swallow chars up to and including next '>'
                        while (true)
                        {
                            c = this.in.readChar();
                            if (c == '>')
                            {
                                break;
                            }
                            if (c == -1)
                            {
                                this.in.ungetChar(c);
                                break;
                            }
                        }

                        this.lexsize -= 2;
                        this.lexbuf[this.lexsize] = (byte) '\0';
                        this.state = LEX_CONTENT;
                        continue;
                    }

                    // processing instructions

                    if (c == '?')
                    {
                        this.lexsize -= 2;
                        this.state = LEX_PROCINSTR;
                        this.txtend = this.lexsize;

                        // if some text before < return it now
                        if (this.txtend > this.txtstart)
                        {
                            this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
                            return this.token;
                        }

                        this.txtstart = this.lexsize;
                        continue;
                    }

                    // Microsoft ASP's e.g. <% ... server-code ... %>
                    if (c == '%')
                    {
                        this.lexsize -= 2;
                        this.state = LEX_ASP;
                        this.txtend = this.lexsize;

                        // if some text before < return it now
                        if (this.txtend > this.txtstart)
                        {
                            this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
                            return this.token;
                        }

                        this.txtstart = this.lexsize;
                        continue;
                    }

                    // Netscapes JSTE e.g. <# ... server-code ... #>
                    if (c == '#')
                    {
                        this.lexsize -= 2;
                        this.state = LEX_JSTE;
                        this.txtend = this.lexsize;

                        // if some text before < return it now
                        if (this.txtend > this.txtstart)
                        {
                            this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
                            return this.token;
                        }

                        this.txtstart = this.lexsize;
                        continue;
                    }

                    // check for start tag
                    if (TidyUtils.isLetter((char) c))
                    {
                        this.in.ungetChar(c); // push back letter
                        this.lexsize -= 2; // discard " <" + letter
                        this.txtend = this.lexsize;
                        this.state = LEX_STARTTAG; // ready to read tag name

                        // if some text before < return it now
                        if (this.txtend > this.txtstart)
                        {
                            this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
                            return this.token;
                        }

                        continue; // no text so keep going
                    }

                    // otherwise treat as CDATA
                    this.state = LEX_CONTENT;
                    this.waswhite = false;
                    continue;

                case LEX_ENDTAG : // </letter
                    this.txtstart = this.lexsize - 1;

                    // changed from
                    // this.in.curcol -= 2;
                    this.columns -= 2;

                    c = parseTagName();
                    this.token = newNode(Node.END_TAG, // create endtag token
                        this.lexbuf,
                        this.txtstart,
                        this.txtend,
                        TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart));
                    this.lexsize = this.txtstart;
                    this.txtend = this.txtstart;

                    // skip to '>'
                    while (TidyUtils.isWhite((char) c))
                    {
                        c = this.in.readChar();
                    }

                    if (c == StreamIn.END_OF_STREAM)
                    {
                        this.in.ungetChar(c);
                        report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
                        continue;
                    }

                    // should be at the '>' if we're not, assume one
                    if (c != '>')
                    {
                        this.in.ungetChar(c);
                        c = '>';
                        report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
                    }

                    this.state = LEX_CONTENT;
                    this.waswhite = false;
                    return this.token; // the endtag token

                case LEX_STARTTAG : // first letter of tagname
                    this.txtstart = this.lexsize - 1; // set txtstart to first letter
                    c = parseTagName();
                    isempty[0] = false;
                    attributes = null;
                    // isempty[0] was just cleared, so the node starts out as START_TAG;
                    // the type is corrected after the attributes have been parsed
                    this.token = newNode(
                        (isempty[0] ? Node.START_END_TAG : Node.START_TAG),
                        this.lexbuf,
                        this.txtstart,
                        this.txtend,
                        TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart));

                    // parse attributes, consuming closing ">"
                    if (c != '>')
                    {
                        if (c == '/')
                        {
                            this.in.ungetChar(c);
                        }

                        attributes = parseAttrs(isempty);
                    }

                    if (isempty[0])
                    {
                        this.token.type = Node.START_END_TAG;
                    }

                    this.token.attributes = attributes;
                    this.lexsize = this.txtstart;
                    this.txtend = this.txtstart;

                    // swallow newline following start tag
                    // special check needed for CRLF sequence
                    // this doesn't apply to empty elements
                    // nor to preformatted content that needs escaping
                    if ( (mode != PREFORMATTED || preContent(this.token))
                        && (this.token.expectsContent() || this.token.tag == this.configuration.tt.tagBr))
                    {
                        c = this.in.readChar();

                        if (c == '\r')
                        {
                            c = this.in.readChar();

                            if (c != '\n')
                            {
                                this.in.ungetChar(c);
                            }
                        }
                        else if (c != '\n' && c != '\f')
                        {
                            this.in.ungetChar(c);
                        }

                        this.waswhite = true; // to swallow leading whitespace
                    }
                    else
                    {
                        this.waswhite = false;
                    }

                    this.state = LEX_CONTENT;

                    if (this.token.tag == null)
                    {
                        report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
                    }
                    else if (!this.configuration.xmlTags)
                    {
                        constrainVersion(this.token.tag.versions);

                        if (TidyUtils.toBoolean(this.token.tag.versions & Dict.VERS_PROPRIETARY))
                        {
                            // #427810 - fix by Gary Deschaines 24 May 00
                            if (this.configuration.makeClean && (this.token.tag != this.configuration.tt.tagNobr && //
                                this.token.tag != this.configuration.tt.tagWbr))
                            {
                                report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
                            }
                            // #427810 - fix by Terry Teague 2 Jul 01
                            else if (!this.configuration.makeClean)
                            {
                                report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
                            }
                        }

                        if (this.token.tag.getChkattrs() != null)
                        {
                            this.token.tag.getChkattrs().check(this, this.token);
                        }
                        else
                        {
                            this.token.checkAttributes(this);
                        }

                        // should this be called before attribute checks?
                        this.token.repairDuplicateAttributes(this);
                    }

                    return this.token; // return start tag

                case LEX_COMMENT : // seen <!-- so look for -->

                    if (c != '-')
                    {
                        continue;
                    }

                    c = this.in.readChar();
                    addCharToLexer(c);

                    if (c != '-')
                    {
                        continue;
                    }

                    end_comment : while (true)
                    {
                        c = this.in.readChar();

                        if (c == '>')
                        {
                            if (badcomment != 0)
                            {
                                report.warning(this, null, null, Report.MALFORMED_COMMENT);
                            }

                            this.txtend = this.lexsize - 2; // AQ 8Jul2000
                            this.lexbuf[this.lexsize] = (byte) '\0';
                            this.state = LEX_CONTENT;
                            this.waswhite = false;
                            this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);

                            // now look for a line break
                            c = this.in.readChar();

                            if (c == '\r')
                            {
                                c = this.in.readChar();

                                if (c != '\n')
                                {
                                    this.token.linebreak = true;
                                }
                            }

                            if (c == '\n')
                            {
                                this.token.linebreak = true;
                            }
                            else
                            {
                                this.in.ungetChar(c);
                            }

                            return this.token;
                        }

                        // note position of first such error in the comment
                        if (badcomment == 0)
                        {
                            this.lines = this.in.getCurline();
                            this.columns = this.in.getCurcol() - 3;
                        }

                        badcomment++;

                        if (this.configuration.fixComments)
                        {
                            this.lexbuf[this.lexsize - 2] = (byte) '=';
                        }

                        addCharToLexer(c);

                        // if '-' then look for '>' to end the comment
                        if (c != '-')
                        {
                            break end_comment;
                        }
                    }

                    // otherwise continue to look for -->
                    this.lexbuf[this.lexsize - 2] = (byte) '=';
                    continue;

                case LEX_DOCTYPE : // seen <!d so look for '> ' munging whitespace

                    if (TidyUtils.isWhite((char) c))
                    {
                        if (this.waswhite)
                        {
                            this.lexsize -= 1;
                        }

                        this.waswhite = true;
                    }
                    else
                    {
                        this.waswhite = false;
                    }

                    // a '>' between '[' and ']' does not end the declaration
                    if (inDTDSubset)
                    {
                        if (c == ']')
                        {
                            inDTDSubset = false;
                        }
                    }
                    else if (c == '[')
                    {
                        inDTDSubset = true;
                    }
                    if (inDTDSubset || c != '>')
                    {
                        continue;
                    }

                    this.lexsize -= 1;
                    this.txtend = this.lexsize;
                    this.lexbuf[this.lexsize] = (byte) '\0';
                    this.state = LEX_CONTENT;
                    this.waswhite = false;
                    this.token = newNode(Node.DOCTYPE_TAG, this.lexbuf, this.txtstart, this.txtend);
                    // make a note of the version named by the doctype
                    this.doctype = findGivenVersion(this.token);
                    return this.token;

                case LEX_PROCINSTR : // seen <? so look for '> '
                    // check for PHP preprocessor instructions <?php ... ?>

                    if (this.lexsize - this.txtstart == 3)
                    {
                        if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("php"))
                        {
                            this.state = LEX_PHP;
                            continue;
                        }
                    }

                    if (this.lexsize - this.txtstart == 4)
                    {
                        if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("xml")
                            && TidyUtils.isWhite((char) this.lexbuf[this.txtstart + 3]))
                        {
                            this.state = LEX_XMLDECL;
                            attributes = null;
                            continue;
                        }
                    }

                    if (this.configuration.xmlPIs) // insist on ?> as terminator
                    {
                        if (c != '?')
                        {
                            continue;
                        }

                        // now look for '>'
                        c = this.in.readChar();

                        if (c == StreamIn.END_OF_STREAM)
                        {
                            report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
                            this.in.ungetChar(c);
                            continue;
                        }

                        addCharToLexer(c);
                    }

                    if (c != '>')
                    {
                        continue;
                    }

                    this.lexsize -= 1;
                    this.txtend = this.lexsize;
                    this.lexbuf[this.lexsize] = (byte) '\0';
                    this.state = LEX_CONTENT;
                    this.waswhite = false;
                    this.token = newNode(Node.PROC_INS_TAG, this.lexbuf, this.txtstart, this.txtend);
                    return this.token;

                case LEX_ASP : // seen <% so look for "%> "
                    if (c != '%')
                    {
                        continue;
                    }

                    // now look for '>'
                    c = this.in.readChar();

                    if (c != '>')
                    {
                        this.in.ungetChar(c);
                        continue;
                    }

                    this.lexsize -= 1;
                    this.txtend = this.lexsize;
                    this.lexbuf[this.lexsize] = (byte) '\0';
                    this.state = LEX_CONTENT;
                    this.waswhite = false;
                    this.token = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
                    return this.token;

                case LEX_JSTE : // seen <# so look for "#> "
                    if (c != '#')
                    {
                        continue;
                    }

                    // now look for '>'
                    c = this.in.readChar();

                    if (c != '>')
                    {
                        this.in.ungetChar(c);
                        continue;
                    }

                    this.lexsize -= 1;
                    this.txtend = this.lexsize;
                    this.lexbuf[this.lexsize] = (byte) '\0';
                    this.state = LEX_CONTENT;
                    this.waswhite = false;
                    this.token = newNode(Node.JSTE_TAG, this.lexbuf, this.txtstart, this.txtend);
                    return this.token;

                case LEX_PHP : // seen " <?php" so look for "?> "
                    if (c != '?')
                    {
                        continue;
                    }

                    // now look for '>'
                    c = this.in.readChar();

                    if (c != '>')
                    {
                        this.in.ungetChar(c);
                        continue;
                    }

                    this.lexsize -= 1;
                    this.txtend = this.lexsize;
                    this.lexbuf[this.lexsize] = (byte) '\0';
                    this.state = LEX_CONTENT;
                    this.waswhite = false;
                    this.token = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
                    return this.token;

                case LEX_XMLDECL : // seen "<?xml" so look for "?>"

                    if (TidyUtils.isWhite((char) c) && c != '?')
                    {
                        continue;
                    }

                    // get pseudo-attribute
                    if (c != '?')
                    {
                        String name;
                        Node[] asp = new Node[1];
                        Node[] php = new Node[1];
                        AttVal av = new AttVal();
                        int[] pdelim = new int[1];
                        isempty[0] = false;

                        this.in.ungetChar(c);

                        name = this.parseAttribute(isempty, asp, php);
                        av.attribute = name;

                        av.value = this.parseValue(name, true, isempty, pdelim);
                        av.delim = pdelim[0];
                        // prepend to the list collected so far
                        av.next = attributes;

                        attributes = av;
                        // continue;
                    }

                    // now look for '>'
                    c = this.in.readChar();

                    if (c != '>')
                    {
                        this.in.ungetChar(c);
                        continue;
                    }
                    this.lexsize -= 1;
                    this.txtend = this.txtstart;
                    this.lexbuf[this.txtend] = '\0';
                    this.state = LEX_CONTENT;
                    this.waswhite = false;
                    this.token = newNode(Node.XML_DECL, this.lexbuf, this.txtstart, this.txtend);
                    this.token.attributes = attributes;
                    return this.token;

                case LEX_SECTION : // seen " <![" so look for "]> "
                    if (c == '[')
                    {
                        if (this.lexsize == (this.txtstart + 6)
                            && (TidyUtils.getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
                        {
                            this.state = LEX_CDATA;
                            this.lexsize -= 6;
                            continue;
                        }
                    }

                    if (c != ']')
                    {
                        continue;
                    }

                    // now look for '>'
                    c = this.in.readChar();

                    if (c != '>')
                    {
                        this.in.ungetChar(c);
                        continue;
                    }

                    this.lexsize -= 1;
                    this.txtend = this.lexsize;
                    this.lexbuf[this.lexsize] = (byte) '\0';
                    this.state = LEX_CONTENT;
                    this.waswhite = false;
                    this.token = newNode(Node.SECTION_TAG, this.lexbuf, this.txtstart, this.txtend);
                    return this.token;

                case LEX_CDATA : // seen " <![CDATA[" so look for "]]> "
                    if (c != ']')
                    {
                        continue;
                    }

                    // now look for ']'
                    c = this.in.readChar();

                    if (c != ']')
                    {
                        this.in.ungetChar(c);
                        continue;
                    }

                    // now look for '>'
                    c = this.in.readChar();

                    if (c != '>')
                    {
                        this.in.ungetChar(c);
                        continue;
                    }

                    this.lexsize -= 1;
                    this.txtend = this.lexsize;
                    this.lexbuf[this.lexsize] = (byte) '\0';
                    this.state = LEX_CONTENT;
                    this.waswhite = false;
                    this.token = newNode(Node.CDATA_TAG, this.lexbuf, this.txtstart, this.txtend);
                    return this.token;

                default :
                    // should never reach here
                    break;
            }
        }

        // end of input reached: flush whatever is still pending
        if (this.state == LEX_CONTENT) // text string
        {
            this.txtend = this.lexsize;

            if (this.txtend > this.txtstart)
            {
                this.in.ungetChar(c);

                if (this.lexbuf[this.lexsize - 1] == (byte) ' ')
                {
                    this.lexsize -= 1;
                    this.txtend = this.lexsize;
                }

                this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
                return this.token;
            }
        }
        else if (this.state == LEX_COMMENT) // comment
        {
            if (c == StreamIn.END_OF_STREAM)
            {
                report.warning(this, null, null, Report.MALFORMED_COMMENT);
            }

            this.txtend = this.lexsize;
            this.lexbuf[this.lexsize] = (byte) '\0';
            this.state = LEX_CONTENT;
            this.waswhite = false;
            this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
            return this.token;
        }

        return null;
    }

    /**
     * Parser for ASP within start tags. Some people use ASP to customize attributes. Tidy isn't really well suited
     * to dealing with ASP. This is a workaround for attributes, but won't deal with the case where the ASP is used to
     * tailor the attribute value. Here is an example of a work around for using ASP in attribute values:
     * <code>href='<%=rsSchool.Fields("ID").Value%>'</code> where the ASP that generates the attribute value is
     * masked from Tidy by the quotemarks.
* @return parsed Node, or <code>null</code> when no content precedes the terminator
     */
    public Node parseAsp()
    {
        int c;
        Node asp = null;

        this.txtstart = this.lexsize;

        // buffer everything up to and including the closing "%>"
        while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
        {
            addCharToLexer(c);
            if (c != '%')
            {
                continue;
            }

            if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
            {
                break;
            }

            addCharToLexer(c);

            if (c == '>')
            {
                break;
            }
        }

        // Drop the two terminator bytes. When the input ends early fewer than two bytes may have
        // been buffered; never rewind below txtstart, which would discard earlier buffer content.
        this.lexsize = Math.max(this.txtstart, this.lexsize - 2);
        this.txtend = this.lexsize;

        if (this.txtend > this.txtstart)
        {
            asp = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
        }

        this.txtstart = this.txtend;
        return asp;
    }

    /**
     * PHP is like ASP but is based upon XML processing instructions, e.g. <code><?php ... ?></code>.
     * @return parsed Node, or <code>null</code> when no content precedes the terminator
     */
    public Node parsePhp()
    {
        int c;
        Node php = null;

        this.txtstart = this.lexsize;

        // buffer everything up to and including the closing "?>"
        while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
        {
            addCharToLexer(c);
            if (c != '?')
            {
                continue;
            }

            if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
            {
                break;
            }

            addCharToLexer(c);

            if (c == '>')
            {
                break;
            }
        }

        // Drop the two terminator bytes. When the input ends early fewer than two bytes may have
        // been buffered; never rewind below txtstart, which would discard earlier buffer content.
        this.lexsize = Math.max(this.txtstart, this.lexsize - 2);
        this.txtend = this.lexsize;

        if (this.txtend > this.txtstart)
        {
            php = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
        }

        this.txtstart = this.txtend;
        return php;
    }

    /**
     * consumes the '>' terminating start tags.
* @param isempty flag is passed as array so it can be modified; set to <code>true</code> when "/>" is read
     * @param asp asp Node, passed as array so it can be modified
     * @param php php Node, passed as array so it can be modified
     * @return parsed attribute name, or <code>null</code> when no name was read
     */
    public String parseAttribute(boolean[] isempty, Node[] asp, Node[] php)
    {
        int start = 0;
        String attr;
        int c = 0;
        int lastc = 0;

        asp[0] = null; // clear asp pointer
        php[0] = null; // clear php pointer
        // skip white space before the attribute

        for (;;)
        {
            c = this.in.readChar();

            if (c == '/')
            {
                c = this.in.readChar();

                if (c == '>')
                {
                    isempty[0] = true;
                    return null;
                }

                // a lone '/': keep it as the first character of the name
                this.in.ungetChar(c);
                c = '/';
                break;
            }

            if (c == '>')
            {
                return null;
            }

            if (c == '<')
            {
                c = this.in.readChar();

                if (c == '%')
                {
                    asp[0] = parseAsp();
                    return null;
                }
                else if (c == '?')
                {
                    php[0] = parsePhp();
                    return null;
                }

                this.in.ungetChar(c);
                if (this.state != LEX_XMLDECL) // FG fix for 532535
                {
                    this.in.ungetChar('<'); // fix for 433360
                }
                report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
                return null;
            }

            if (c == '=')
            {
                report.attrError(this, this.token, null, Report.UNEXPECTED_EQUALSIGN);
                continue;
            }

            if (c == '"' || c == '\'')
            {
                report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
                continue;
            }

            if (c == StreamIn.END_OF_STREAM)
            {
                report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
                this.in.ungetChar(c);
                return null;
            }

            if (!TidyUtils.isWhite((char) c))
            {
                break;
            }
        }

        // the name is collected in the lexer buffer starting at this offset
        start = this.lexsize;
        lastc = c;

        for (;;)
        {
            // but push back '=' for parseValue()
            if (c == '=' || c == '>')
            {
                this.in.ungetChar(c);
                break;
            }

            if (c == '<' || c == StreamIn.END_OF_STREAM)
            {
                this.in.ungetChar(c);
                break;
            }

            if (lastc == '-' && (c == '"' || c == '\''))
            {
                this.lexsize--;
                this.in.ungetChar(c);
                break;
            }

            if (TidyUtils.isWhite((char) c))
            {
                break;
            }

            // what should be done about non-namechar characters?
            // currently these are incorporated into the attr name

            if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
            {
                c = TidyUtils.toLower((char) c);
            }

            // ++len; #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
            addCharToLexer(c);

            lastc = c;
            c = this.in.readChar();
        }

        // #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
        int len = this.lexsize - start;
        attr = (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null);
        // release the buffer space used for the name
        this.lexsize = start;

        return attr;
    }

    /**
     * Invoked when < is seen in place of attribute value but terminates on whitespace if not ASP, PHP or Tango this
     * routine recognizes ' and " quoted strings.
     * @return delimiter: <code>'"'</code> by default, <code>'\''</code> once a double-quoted string has been read
     * inside the instruction, <code>0</code> when a quoted string is cut short by '>' or the end of input
     */
    public int parseServerInstruction()
    {
        int c, delim = '"';
        boolean isrule = false;

        c = this.in.readChar();
        addCharToLexer(c);

        // check for ASP, PHP or Tango
        if (c == '%' || c == '?' || c == '@')
        {
            isrule = true;
        }

        for (;;)
        {
            c = this.in.readChar();

            if (c == StreamIn.END_OF_STREAM)
            {
                break;
            }

            if (c == '>')
            {
                if (isrule)
                {
                    addCharToLexer(c);
                }
                else
                {
                    this.in.ungetChar(c);
                }

                break;
            }

            // if not recognized as ASP, PHP or Tango
            // then also finish value on whitespace
            if (!isrule)
            {
                if (TidyUtils.isWhite((char) c))
                {
                    break;
                }
            }

            addCharToLexer(c);

            if (c == '"')
            {
                // copy a double-quoted string verbatim
                do
                {
                    c = this.in.readChar();

                    if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
                    {
                        report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
                        this.in.ungetChar(c);
                        return 0;
                    }
                    if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01
                    {
                        this.in.ungetChar(c);
                        report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
                        return 0;
                    }

                    addCharToLexer(c);
                }
                while (c != '"');
                delim = '\'';
                continue;
            }

            if (c == '\'')
            {
                // copy a single-quoted string verbatim
                do
                {
                    c = this.in.readChar();

                    if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
                    {
                        report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
                        this.in.ungetChar(c);
                        return 0;
                    }
                    if (c == '>') // #427840 - fix by Terry Teague 30 Jun 01
                    {
                        this.in.ungetChar(c);
                        report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
                        return 0;
                    }

                    addCharToLexer(c);
                }
                while (c != '\'');
            }
        }

        return delim;
    }

    /**
     * Parse an attribute value.
     * @param name attribute name
     * @param foldCase fold case?
     * @param isempty is attribute empty? Passed as an array reference to allow modification
     * @param pdelim delimiter, passed as an array reference to allow modification
     * @return parsed value
     */
    public String parseValue(String name, boolean foldCase, boolean[] isempty, int[] pdelim)
    {
        // values start with "=" or " = " etc.
        // doesn't consume the ">" at end of start tag

        int len = 0;
        int start;
        boolean seenGt = false;
        boolean munge = true;
        int c = 0;
        int lastc, delim, quotewarning;
        String value;

        delim = 0;
        pdelim[0] = '"';

        // Henry Zrepa reports that some folk are using the embed element with script attributes where newlines are
        // significant and must be preserved

        if (this.configuration.literalAttribs)
        {
            munge = false;
        }

        // skip white space before the '='
        while (true)
        {
            c = this.in.readChar();

            if (c == StreamIn.END_OF_STREAM)
            {
                this.in.ungetChar(c);
                break;
            }

            if (!TidyUtils.isWhite((char) c))
            {
                break;
            }
        }

        // c should be '=' if there is a value other legal possibilities are white space, '/' and '>'

        if (c != '=' && c != '"' && c != '\'')
        {
            this.in.ungetChar(c);
            return null;
        }

        // skip white space after '='

        while (true)
        {
            c = this.in.readChar();

            if (c == StreamIn.END_OF_STREAM)
            {
                this.in.ungetChar(c);
                break;
            }

            if (!TidyUtils.isWhite((char) c))
            {
                break;
            }
        }

        // check for quote marks

        if (c == '"' || c == '\'')
        {
            delim = c;
        }
        else if (c == '<')
        {
            start = this.lexsize;
            addCharToLexer(c);
            pdelim[0] = parseServerInstruction();
            len = this.lexsize - start;
            this.lexsize = start;
            return (len > 0 ?
TidyUtils.getString(this.lexbuf, start, len) : null); } else { this.in.ungetChar(c); } // and read the value string check for quote mark if needed quotewarning = 0; start = this.lexsize; c = '\0'; while (true) { lastc = c; // track last character c = this.in.readChar(); if (c == StreamIn.END_OF_STREAM) { report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE); this.in.ungetChar(c); break; } if (delim == (char) 0) { if (c == '>') { this.in.ungetChar(c); break; } if (c == '"' || c == '\'') { int q = c; report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK); /* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */ /* this doesn't handle <a title=foo"/> which browsers treat as */ /* 'foo"/' nor <a title=foo" /> which browser treat as 'foo"' */ c = in.readChar(); if (c == '>') { addCharToLexer(q); in.ungetChar(c); break; } else { in.ungetChar(c); c = q; } } if (c == '<') { this.in.ungetChar(c); // fix for 433360 c = '>'; this.in.ungetChar(c); report.attrError(this, this.token, null, Report.UNEXPECTED_GT); break; } // For cases like <br clear=all/> need to avoid treating /> as part of the attribute value, however // care is needed to avoid so treating <a href=http://www.acme.com /> in this way, which would map the // <a> tag to <a href="http://www.acme.com"/> if (c == '/') { // peek ahead in case of /> c = this.in.readChar(); if (c == '>' && !AttributeTable.getDefaultAttributeTable().isUrl(name)) { isempty[0] = true; this.in.ungetChar(c); break; } // unget peeked char this.in.ungetChar(c); c = '/'; } } else { // delim is '\'' or '"' if (c == delim) { break; } // treat CRLF, CR and LF as single line break if (c == '\r') { c = this.in.readChar(); if (c != '\n') { this.in.ungetChar(c); } c = '\n'; } if (c == '\n' || c == '<' || c == '>') { ++quotewarning; } if (c == '>') { seenGt = true; } } if (c == '&') { // no entities in ID attributes if ("id".equalsIgnoreCase(name)) { report.attrError(this, null, null, Report.ENTITY_IN_ID); 
continue; } addCharToLexer(c); parseEntity((short) 0); continue; } // kludge for JavaScript attribute values with line continuations in string literals if (c == '\\') { c = this.in.readChar(); if (c != '\n') { this.in.ungetChar(c); c = '\\'; } } if (TidyUtils.isWhite((char) c)) { if (delim == (char) 0) { break; } if (munge) { // discard line breaks in quoted URLs // #438650 - fix by Randy Waki if (c == '\n' && AttributeTable.getDefaultAttributeTable().isUrl(name)) { // warn that we discard this newline report.attrError(this, this.token, null, Report.NEWLINE_IN_URI); continue; } c = ' '; if (lastc == ' ') { continue; } } } else if (foldCase && TidyUtils.isUpper((char) c)) { c = TidyUtils.toLower((char) c); } addCharToLexer(c); } if (quotewarning > 10 && seenGt && munge) { // there is almost certainly a missing trailing quote mark as we have see too many newlines, < or > // characters. an exception is made for Javascript attributes and the javascript URL scheme which may // legitimately include < and >, and for attributes starting with "<xml " as generated by Microsoft Office. 
if (!AttributeTable.getDefaultAttributeTable().isScript(name) && !(AttributeTable.getDefaultAttributeTable().isUrl(name) && "javascript:".equals(TidyUtils.getString( this.lexbuf, start, 11))) && !"<xml ".equals(TidyUtils.getString(this.lexbuf, start, 5))) // #500236 - fix by Klaus Johannes Rusch // 06 Jan 02 { report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE); } } len = this.lexsize - start; this.lexsize = start; if (len > 0 || delim != 0) { // ignore leading and trailing white space for all but title, alt, value and prompts attributes unless // --literal-attributes is set to yes // #994841 - Whitespace is removed from value attributes if (munge && !TidyUtils.isInValuesIgnoreCase(new String[]{"alt", "title", "value", "prompt"}, name)) { while (TidyUtils.isWhite((char) this.lexbuf[start + len - 1])) { --len; } while (TidyUtils.isWhite((char) this.lexbuf[start]) && start < len) { ++start; --len; } } value = TidyUtils.getString(this.lexbuf, start, len); } else { value = null; } // note delimiter if given if (delim != 0) { pdelim[0] = delim; } else { pdelim[0] = '"'; } return value; } /** * Check if attr is a valid name. * @param attr String to check, must be non-null * @return <code>true</code> if attr is a valid name. */ public static boolean isValidAttrName(String attr) { char c; int i; // first character should be a letter c = attr.charAt(0); if (!TidyUtils.isLetter(c)) { return false; } // remaining characters should be namechars for (i = 1; i < attr.length(); i++) { c = attr.charAt(i); if (TidyUtils.isNamechar(c)) { continue; } return false; } return true; } /** * In CSS1, selectors can contain only the characters A-Z, 0-9, and Unicode characters 161-255, plus dash (-); they * cannot start with a dash or a digit; they can also contain escaped characters and any Unicode character as a * numeric code (see next item). The backslash followed by at most four hexadecimal digits (0..9A..F) stands for the * Unicode character with that number. 
Any character except a hexadecimal digit can be escaped to remove its special * meaning, by putting a backslash in front. * @param buf css selector name * @return <code>true</code> if the given string is a valid css1 selector name */ public static boolean isCSS1Selector(String buf) { if (buf == null) { return false; } // #508936 - CSS class naming for -clean option boolean valid = true; int esclen = 0; char c; int pos; for (pos = 0; valid && pos < buf.length(); ++pos) { c = buf.charAt(pos); if (c == '\\') { esclen = 1; // ab\555\444 is 4 chars {'a', 'b', \555, \444} } else if (Character.isDigit(c)) { // Digit not 1st, unless escaped (Max length "\112F") if (esclen > 0) { valid = (++esclen < 6); } if (valid) { valid = (pos > 0 || esclen > 0); } } else { valid = (esclen > 0 // Escaped? Anything goes. || (pos > 0 && c == '-') // Dash cannot be 1st char || Character.isLetter(c) // a-z, A-Z anywhere || (c >= 161 && c <= 255)); // Unicode 161-255 anywhere esclen = 0; } } return valid; } /** * Parse tag attributes. * @param isempty is tag empty? 
* @return parsed attribute/value list */ public AttVal parseAttrs(boolean[] isempty) { AttVal av, list; String attribute, value; int[] delim = new int[1]; Node[] asp = new Node[1]; Node[] php = new Node[1]; list = null; while (!endOfInput()) { attribute = parseAttribute(isempty, asp, php); if (attribute == null) { // check if attributes are created by ASP markup if (asp[0] != null) { av = new AttVal(list, null, asp[0], null, '\0', null, null); list = av; continue; } // check if attributes are created by PHP markup if (php[0] != null) { av = new AttVal(list, null, null, php[0], '\0', null, null); list = av; continue; } break; } value = parseValue(attribute, false, isempty, delim); if (attribute != null && isValidAttrName(attribute)) { av = new AttVal(list, null, null, null, delim[0], attribute, value); av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av); list = av; } else { av = new AttVal(null, null, null, null, 0, attribute, value); // #427664 - fix by Gary Peskin 04 Aug 00; other fixes by Dave Raggett if (value != null) { report.attrError(this, this.token, av, Report.BAD_ATTRIBUTE_VALUE); } else if (TidyUtils.lastChar(attribute) == '"') { report.attrError(this, this.token, av, Report.MISSING_QUOTEMARK); } else { report.attrError(this, this.token, av, Report.UNKNOWN_ATTRIBUTE); } } } return list; } /** * Push a copy of an inline node onto stack but don't push if implicit or OBJECT or APPLET (implicit tags are ones * generated from the istack) One issue arises with pushing inlines when the tag is already pushed. 
For instance: * <code><p><em> text <p><em> more text</code> Shouldn't be mapped to * <code><p><em> text </em></p><p><em><em> more text </em></em></code> * @param node Node to be pushed */ public void pushInline(Node node) { IStack is; if (node.implicit) { return; } if (node.tag == null) { return; } if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE)) { return; } if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT)) { return; } if (node.tag != this.configuration.tt.tagFont && isPushed(node)) { return; } // make sure there is enough space for the stack is = new IStack(); is.tag = node.tag; is.element = node.element; if (node.attributes != null) { is.attributes = cloneAttributes(node.attributes); } this.istack.push(is); } /** * Pop a copy of an inline node from the stack. * @param node Node to be popped */ public void popInline(Node node) { IStack is; if (node != null) { if (node.tag == null) { return; } if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE)) { return; } if (TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT)) { return; } // if node is </a> then pop until we find an <a> if (node.tag == this.configuration.tt.tagA) { while (this.istack.size() > 0) { is = (IStack) this.istack.pop(); if (is.tag == this.configuration.tt.tagA) { break; } } if (this.insert >= this.istack.size()) { this.insert = -1; } return; } } if (this.istack.size() > 0) { is = (IStack) this.istack.pop(); if (this.insert >= this.istack.size()) { this.insert = -1; } } } /** * Is the node in the stack? * @param node Node * @return <code>true</code> is the node is found in the stack */ public boolean isPushed(Node node) { int i; IStack is; for (i = this.istack.size() - 1; i >= 0; --i) { is = (IStack) this.istack.elementAt(i); if (is.tag == node.tag) { return true; } } return false; } /** * This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P, * TD, TH, DIV, PRE etc. This procedure is called at the start of ParseBlock. 
When the inline stack is not empty, as * will be the case in: <code><i><h1>italic heading</h1></i></code> which is then treated as * equivalent to <code><h1><i>italic heading</i></h1></code> This is implemented by setting the lexer * into a mode where it gets tokens from the inline stack rather than from the input stream. * @param node original node * @return stack size */ public int inlineDup(Node node) { int n; n = this.istack.size() - this.istackbase; if (n > 0) { this.insert = this.istackbase; this.inode = node; } return n; } /** * @return */ public Node insertedToken() { Node node; IStack is; int n; // this will only be null if inode != null if (this.insert == -1) { node = this.inode; this.inode = null; return node; } // is this is the "latest" node then update the position, otherwise use current values if (this.inode == null) { this.lines = this.in.getCurline(); this.columns = this.in.getCurcol(); } node = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend); // GLP: Bugfix 126261. Remove when this change is fixed in istack.c in the original Tidy node.implicit = true; is = (IStack) this.istack.elementAt(this.insert); node.element = is.element; node.tag = is.tag; if (is.attributes != null) { node.attributes = cloneAttributes(is.attributes); } // advance lexer to next item on the stack n = this.insert; // and recover state if we have reached the end if (++n < this.istack.size()) { this.insert = n; } else { this.insert = -1; } return node; } /** * Can the given element be removed? 
* @param element node * @return <code>true</code> if he element can be removed */ public boolean canPrune(Node element) { if (element.type == Node.TEXT_NODE) { return true; } if (element.content != null) { return false; } if (element.tag == this.configuration.tt.tagA && element.attributes != null) { return false; } if (element.tag == this.configuration.tt.tagP && !this.configuration.dropEmptyParas) { return false; } if (element.tag == null) { return false; } if (TidyUtils.toBoolean(element.tag.model & Dict.CM_ROW)) { return false; } if (TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY)) { return false; } if (element.tag == this.configuration.tt.tagApplet) { return false; } if (element.tag == this.configuration.tt.tagObject) { return false; } if (element.tag == this.configuration.tt.tagScript && element.getAttrByName("src") != null) { return false; } // #540555 Empty title tag is trimmed if (element.tag == this.configuration.tt.tagTitle) { return false; } // #433359 - fix by Randy Waki 12 Mar 01 - Empty iframe is trimmed if (element.tag == this.configuration.tt.tagIframe) { return false; } if (element.getAttrByName("id") != null || element.getAttrByName("name") != null) { return false; } return true; } /** * duplicate name attribute as an id and check if id and name match. * @param node Node to check for name/it attributes */ public void fixId(Node node) { AttVal name = node.getAttrByName("name"); AttVal id = node.getAttrByName("id"); if (name != null) { if (id != null) { if (id.value != null && !id.value.equals(name.value)) { report.attrError(this, node, name, Report.ID_NAME_MISMATCH); } } else if (this.configuration.xmlOut) { node.addAttribute("id", name.value); } } } /** * Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated. */ public void deferDup() { this.insert = -1; this.inode = null; } /** * Constraint the html version in the document to the given one. 
* Everything is allowed in proprietary version of
     * HTML this is handled here rather than in the tag/attr dicts.
     * @param vers html version code
     */
    void constrainVersion(int vers)
    {
        // the proprietary bit is always part of the mask, so it is never cleared here
        this.versions &= (Dict.VERS_PROPRIETARY | vers);
    }

    /**
     * Is content acceptable for pre elements?
     * @param node content
     * @return <code>true</code> if node is acceptable in pre elements
     */
    protected boolean preContent(Node node)
    {
        // p is coerced to br's
        if (node.tag == this.configuration.tt.tagP)
        {
            return true;
        }

        // anything else needs a tag whose content model is inline or new
        return node.tag != null && TidyUtils.toBoolean(node.tag.model & (Dict.CM_INLINE | Dict.CM_NEW));
    }

    /**
     * document type: plain holder for the values describing one W3C version.
     */
    private static class W3CVersionInfo
    {

        /**
         * version name.
         */
        String name;

        /**
         * voyager (xhtml) name.
         */
        String voyagerName;

        /**
         * profile.
         */
        String profile;

        /**
         * unique code for this version info.
         */
        short code;

        /**
         * Instantiates a new W3CVersionInfo.
         * @param name version name
         * @param voyagerName voyager (xhtml) name
         * @param profile VOYAGER_STRICT | VOYAGER_LOOSE | VOYAGER_FRAMESET
         * @param code unique code for this version info
         */
        public W3CVersionInfo(String name, String voyagerName, String profile, short code)
        {
            this.code = code;
            this.profile = profile;
            this.voyagerName = voyagerName;
            this.name = name;
        }
    }

}