/** * Copyright (c) 2009 Juwi MacMillan Group GmbH * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * @(#)Lexer.java 1.11 2000/08/16 * */ package org.tizzit.util.tidy; /** * * Lexer for html parser * * (c) 1998-2000 (W3C) MIT, INRIA, Keio University * See Tidy.java for the copyright notice. * Derived from <a href="http://www.w3.org/People/Raggett/tidy"> * HTML Tidy Release 4 Aug 2000</a> * * @author Dave Raggett <dsr@w3.org> * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java) * @version 1.0, 1999/05/22 * @version 1.0.1, 1999/05/29 * @version 1.1, 1999/06/18 Java Bean * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999 * @version 1.4, 1999/09/04 DOM support * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000 */ /* Given a file stream fp it returns a sequence of tokens. GetToken(fp) gets the next token UngetToken(fp) provides one level undo The tags include an attribute list: - linked list of attribute/value nodes - each node has 2 null-terminated strings. 
- entities are replaced in attribute values white space is compacted if not in preformatted mode If not in preformatted mode then leading white space is discarded and subsequent white space sequences compacted to single space chars. If XmlTags is no then Tag names are folded to upper case and attribute names to lower case. Not yet done: - Doctype subset and marked sections */ import java.io.*; import java.util.*; import java.util.Vector; public class Lexer { public StreamIn in; /* file stream */ public PrintWriter errout; /* error output stream */ public short badAccess; /* for accessibility errors */ public short badLayout; /* for bad style errors */ public short badChars; /* for bad char encodings */ public short badForm; /* for mismatched/mispositioned form tags */ public short warnings; /* count of warnings in this document */ public short errors; /* count of errors */ public int lines; /* lines seen */ public int columns; /* at start of current token */ public boolean waswhite; /* used to collapse contiguous white space */ public boolean pushed; /* true after token has been pushed back */ public boolean insertspace; /* when space is moved after end tag */ public boolean excludeBlocks; /* Netscape compatibility */ public boolean exiled; /* true if moved out of table */ public boolean isvoyager; /* true if xmlns attribute on html element */ public short versions; /* bit vector of HTML versions */ public int doctype; /* version as given by doctype (if any) */ public boolean badDoctype; /* e.g. if html or PUBLIC is missing */ public int txtstart; /* start of current node */ public int txtend; /* end of current node */ public short state; /* state of lexer's finite state machine */ public Node token; /* lexer character buffer parse tree nodes span onto this buffer which contains the concatenated text contents of all of the elements. lexsize must be reset for each file. 
*/ public byte[] lexbuf; /* byte buffer of UTF-8 chars */ public int lexlength; /* allocated */ public int lexsize; /* used */ /* Inline stack for compatibility with Mosaic */ public Node inode; /* for deferring text node */ public int insert; /* for inferring inline tags */ public Stack istack; public int istackbase; /* start of frame */ public Style styles; /* used for cleaning up presentation markup */ public Configuration configuration; protected int seenBodyEndTag; /* used by parser */ private Vector nodeList; private static final String ENTITIES_RESOURCE = "/org/apache/xml/serialize/HTMLEntities.res"; private static Hashtable _byChar; private static Hashtable _byName; static { initialize(); } private static void initialize() { InputStream is = null; BufferedReader reader = null; int index; String name; String value; int code; String line; // Make sure not to initialize twice. if (_byName != null) return; try { _byName = new Hashtable(); _byChar = new Hashtable(); is = Lexer.class.getResourceAsStream(ENTITIES_RESOURCE); if (is == null) throw new RuntimeException("SER003 The resource [" + ENTITIES_RESOURCE + "] could not be found.\n" + ENTITIES_RESOURCE); reader = new BufferedReader(new InputStreamReader(is)); line = reader.readLine(); while (line != null) { if (line.length() == 0 || line.charAt(0) == '#') { line = reader.readLine(); continue; } index = line.indexOf(' '); if (index > 1) { name = line.substring(0, index); ++index; if (index < line.length()) { value = line.substring(index); index = value.indexOf(' '); if (index > 0) value = value.substring(0, index); code = Integer.parseInt(value); defineEntity(name, (char) code); } } line = reader.readLine(); } is.close(); } catch (Exception except) { throw new RuntimeException("SER003 The resource [" + ENTITIES_RESOURCE + "] could not load: " + except.toString() + "\n" + ENTITIES_RESOURCE + "\t" + except.toString()); } finally { if (is != null) { try { is.close(); } catch (Exception except) { } } } } private 
static void defineEntity(String name, char value) { if (_byName.get(name) == null) { _byName.put(name, new Integer(value)); _byChar.put(new Integer(value), name); } } public Lexer(StreamIn in, Configuration configuration) { this.in = in; this.lines = 1; this.columns = 1; this.state = LEX_CONTENT; this.badAccess = 0; this.badLayout = 0; this.badChars = 0; this.badForm = 0; this.warnings = 0; this.errors = 0; this.waswhite = false; this.pushed = false; this.insertspace = false; this.exiled = false; this.isvoyager = false; this.versions = Dict.VERS_EVERYTHING; this.doctype = Dict.VERS_UNKNOWN; this.badDoctype = false; this.txtstart = 0; this.txtend = 0; this.token = null; this.lexbuf = null; this.lexlength = 0; this.lexsize = 0; this.inode = null; this.insert = -1; this.istack = new Stack(); this.istackbase = 0; this.styles = null; this.configuration = configuration; this.seenBodyEndTag = 0; this.nodeList = new Vector(); } public Node newNode() { Node node = new Node(); nodeList.addElement(node); return node; } public Node newNode(short type, byte[] textarray, int start, int end) { Node node = new Node(type, textarray, start, end); nodeList.addElement(node); return node; } public Node newNode(short type, byte[] textarray, int start, int end, String element) { Node node = new Node(type, textarray, start, end, element, configuration.tt); nodeList.addElement(node); return node; } public Node cloneNode(Node node) { Node cnode = (Node) node.clone(); nodeList.addElement(cnode); for (AttVal att = cnode.attributes; att != null; att = att.next) { if (att.asp != null) nodeList.addElement(att.asp); if (att.php != null) nodeList.addElement(att.php); } return cnode; } public AttVal cloneAttributes(AttVal attrs) { AttVal cattrs = (AttVal) attrs.clone(); for (AttVal att = cattrs; att != null; att = att.next) { if (att.asp != null) nodeList.addElement(att.asp); if (att.php != null) nodeList.addElement(att.php); } return cattrs; } protected void updateNodeTextArrays(byte[] 
oldtextarray, byte[] newtextarray) { Node node; for (int i = 0; i < nodeList.size(); i++) { node = (Node) (nodeList.elementAt(i)); if (node.textarray == oldtextarray) node.textarray = newtextarray; } } /* used for creating preformatted text from Word2000 */ public Node newLineNode() { Node node = newNode(); node.textarray = this.lexbuf; node.start = this.lexsize; addCharToLexer((int) '\n'); node.end = this.lexsize; return node; } // Should always be able convert to/from UTF-8, so encoding exceptions are // converted to an Error to avoid adding throws declarations in // lots of methods. public static byte[] getBytes(String str) { try { return str.getBytes("UTF8"); } catch (java.io.UnsupportedEncodingException e) { throw new Error("string to UTF-8 conversion failed: " + e.getMessage()); } } public static String getString(byte[] bytes, int offset, int length) { try { return new String(bytes, offset, length, "UTF8"); } catch (java.io.UnsupportedEncodingException e) { throw new Error("UTF-8 to string conversion failed: " + e.getMessage()); } } public boolean endOfInput() { return this.in.isEndOfStream(); } public void addByte(int c) { if (this.lexsize + 1 >= this.lexlength) { while (this.lexsize + 1 >= this.lexlength) { if (this.lexlength == 0) this.lexlength = 8192; else this.lexlength = this.lexlength * 2; } byte[] temp = this.lexbuf; this.lexbuf = new byte[this.lexlength]; if (temp != null) { System.arraycopy(temp, 0, this.lexbuf, 0, temp.length); updateNodeTextArrays(temp, this.lexbuf); } } this.lexbuf[this.lexsize++] = (byte) c; this.lexbuf[this.lexsize] = (byte) '\0'; /* debug */ } public void changeChar(byte c) { if (this.lexsize > 0) { this.lexbuf[this.lexsize - 1] = c; } } /* store char c as UTF-8 encoded byte stream */ public void addCharToLexer(int c) { if (c < 128) addByte(c); else if (c <= 0x7FF) { addByte(0xC0 | (c >> 6)); addByte(0x80 | (c & 0x3F)); } else if (c <= 0xFFFF) { addByte(0xE0 | (c >> 12)); addByte(0x80 | ((c >> 6) & 0x3F)); addByte(0x80 | (c & 
0x3F)); } else if (c <= 0x1FFFFF) { addByte(0xF0 | (c >> 18)); addByte(0x80 | ((c >> 12) & 0x3F)); addByte(0x80 | ((c >> 6) & 0x3F)); addByte(0x80 | (c & 0x3F)); } else { addByte(0xF8 | (c >> 24)); addByte(0x80 | ((c >> 18) & 0x3F)); addByte(0x80 | ((c >> 12) & 0x3F)); addByte(0x80 | ((c >> 6) & 0x3F)); addByte(0x80 | (c & 0x3F)); } } public void addStringToLexer(String str) { for (int i = 0; i < str.length(); i++) { addCharToLexer((int) str.charAt(i)); } } /* No longer attempts to insert missing ';' for unknown enitities unless one was present already, since this gives unexpected results. For example: <a href="something.htm?foo&bar&fred"> was tidied to: <a href="something.htm?foo&bar;&fred;"> rather than: <a href="something.htm?foo&bar&fred"> My thanks for Maurice Buxton for spotting this. */ public void parseEntity(short mode) { short map; int start; boolean first = true; boolean semicolon = false; boolean numeric = false; int c, ch, startcol; String str; start = this.lexsize - 1; /* to start at "&" */ startcol = this.in.curcol - 1; while (true) { c = this.in.readChar(); if (c == StreamIn.EndOfStream) break; if (c == ';') { semicolon = true; break; } if (first && c == '#') { addCharToLexer(c); first = false; numeric = true; continue; } first = false; map = MAP((char) c); /* AQ: Added flag for numeric entities so that numeric entities with missing semi-colons are recognized. Eg. "rep..." 
is recognized as "rep" */
            if (numeric && ((c == 'x') || ((map & DIGIT) != 0))) {
                addCharToLexer(c);
                continue;
            }

            if (!numeric && ((map & NAMECHAR) != 0)) {
                addCharToLexer(c);
                continue;
            }

            /* otherwise put it back */
            this.in.ungetChar(c);
            break;
        }

        /* str is the entity text as buffered so far, starting at the "&" */
        str = getString(this.lexbuf, start, this.lexsize - start);
        ch = EntityTable.getDefaultEntityTable().entityCode(str);

        /* deal with unrecognized entities */
        if (ch <= 0) {
            /* set error position just before offending character */
            this.lines = this.in.curline;
            this.columns = startcol;

            if (this.lexsize > start + 1) {
                /* something followed the "&": keep the text as typed and only
                   re-add the ';' if the input actually had one */
                Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);

                if (semicolon) addCharToLexer(';');
            } else /* naked & */ {
                Report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
            }
        } else {
            if (c != ';') /* issue warning if not terminated by ';' */ {
                /* set error position just before offending character */
                this.lines = this.in.curline;
                this.columns = startcol;
                Report.entityError(this, Report.MISSING_SEMICOLON, str, c);
            }

            /* rewind the buffer to the "&" so the entity text is replaced */
            this.lexsize = start;

            /* code 160 becomes a plain space when the Preformatted bit is set in mode */
            if (ch == 160 && (mode & Preformatted) != 0) ch = ' ';

            Integer charIntVal = new Integer(ch);
            String retObj = (String) _byChar.get(charIntVal);
            if (retObj != null) {
                // we've found an entity in the File:
                // write it back as a decimal numeric reference "&#<code>;"
                addCharToLexer('&');
                addCharToLexer('#');
                char[] addArr = charIntVal.toString().toCharArray();
                for (int i = 0; i < addArr.length; i++)
                    addCharToLexer(addArr[i]);
                addCharToLexer(';');
            }
            /* A long hand-written if/else chain used to sit here, commented
               out: it mapped individual characters ('<', '>', an en dash,
               some accents, Greek letters and typographic quotes) to
               hard-coded entity or numeric references. The _byChar lookup
               above replaced it. */
            else {
                /* not in the table: store the character itself */
                addCharToLexer(ch);

                if (ch == '&' && this.configuration.QuoteAmpersand) {
                    addCharToLexer('a');
                    addCharToLexer('m');
                    addCharToLexer('p');
                    addCharToLexer(';');
                }
            }
        }
    }

    /*
      Reads the remainder of a tag name from the input into the lexer buffer.
      The first character is expected to be in the buffer already at txtstart.
      Unless configuration.XmlTags is set, upper case characters are folded to
      lower case. Sets txtend to the end of the name and returns the character
      that terminated it (the end-of-stream value cast to char at end of input).
    */
    public char parseTagName() {
        short map;
        int c;

        /* fold case of first char in buffer */
        c = this.lexbuf[this.txtstart];
        map = MAP((char) c);

        if (!this.configuration.XmlTags && (map &
UPPERCASE) != 0) { c += (int) ((int) 'a' - (int) 'A'); this.lexbuf[this.txtstart] = (byte) c; } while (true) { c = this.in.readChar(); if (c == StreamIn.EndOfStream) break; map = MAP((char) c); if ((map & NAMECHAR) == 0) break; /* fold case of subsequent chars */ if (!this.configuration.XmlTags && (map & UPPERCASE) != 0) c += (int) ((int) 'a' - (int) 'A'); addCharToLexer(c); } this.txtend = this.lexsize; return (char) c; } public void addStringLiteral(String str) { for (int i = 0; i < str.length(); i++) { addCharToLexer((int) str.charAt(i)); } } /* choose what version to use for new doctype */ public short HTMLVersion() { short versions; versions = this.versions; if ((versions & Dict.VERS_HTML20) != 0) return Dict.VERS_HTML20; if ((versions & Dict.VERS_HTML32) != 0) return Dict.VERS_HTML32; if ((versions & Dict.VERS_HTML40_STRICT) != 0) return Dict.VERS_HTML40_STRICT; if ((versions & Dict.VERS_HTML40_LOOSE) != 0) return Dict.VERS_HTML40_LOOSE; if ((versions & Dict.VERS_FRAMES) != 0) return Dict.VERS_FRAMES; return Dict.VERS_UNKNOWN; } public String HTMLVersionName() { short guessed; int j; guessed = apparentVersion(); for (j = 0; j < W3CVersion.length; ++j) { if (guessed == W3CVersion[j].code) { if (this.isvoyager) return W3CVersion[j].voyagerName; return W3CVersion[j].name; } } return null; } /* add meta element for Tidy */ public boolean addGenerator(Node root) { AttVal attval; Node node; Node head = root.findHEAD(configuration.tt); if (head != null) { for (node = head.content; node != null; node = node.next) { if (node.tag == configuration.tt.tagMeta) { attval = node.getAttrByName("name"); if (attval != null && attval.value != null && Lexer.wstrcasecmp(attval.value, "generator") == 0) { attval = node.getAttrByName("content"); if (attval != null && attval.value != null && attval.value.length() >= 9 && Lexer.wstrcasecmp(attval.value.substring(0, 9), "HTML Tidy") == 0) { return false; } } } } node = this.inferredTag("meta"); node.addAttribute("content", "HTML Tidy, 
see www.w3.org"); node.addAttribute("name", "generator"); Node.insertNodeAtStart(head, node); return true; } return false; } /* return true if substring s is in p and isn't all in upper case */ /* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */ /* len is how many chars to check in p */ private static boolean findBadSubString(String s, String p, int len) { int n = s.length(); int i = 0; String ps; while (n < len) { ps = p.substring(i, i + n); if (wstrcasecmp(s, ps) == 0) return (!ps.equals(s.substring(0, n))); ++i; --len; } return false; } public boolean checkDocTypeKeyWords(Node doctype) { int len = doctype.end - doctype.start; String s = getString(this.lexbuf, doctype.start, len); return !(findBadSubString("SYSTEM", s, len) || findBadSubString("PUBLIC", s, len) || findBadSubString("//DTD", s, len) || findBadSubString("//W3C", s, len) || findBadSubString("//EN", s, len)); } /* examine <!DOCTYPE> to identify version */ public short findGivenVersion(Node doctype) { String p, s; int i, j; int len; String str1; String str2; /* if root tag for doctype isn't html give up now */ str1 = getString(this.lexbuf, doctype.start, 5); if (wstrcasecmp(str1, "html ") != 0) return 0; if (!checkDocTypeKeyWords(doctype)) Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE); /* give up if all we are given is the system id for the doctype */ str1 = getString(this.lexbuf, doctype.start + 5, 7); if (wstrcasecmp(str1, "SYSTEM ") == 0) { /* but at least ensure the case is correct */ if (!str1.substring(0, 6).equals("SYSTEM")) System.arraycopy(getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6); return 0; /* unrecognized */ } if (wstrcasecmp(str1, "PUBLIC ") == 0) { if (!str1.substring(0, 6).equals("PUBLIC")) System.arraycopy(getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6); } else this.badDoctype = true; for (i = doctype.start; i < doctype.end; ++i) { if (this.lexbuf[i] == (byte) '"') { str1 = getString(this.lexbuf, i + 1, 12); str2 = 
getString(this.lexbuf, i + 1, 13); if (str1.equals("-//W3C//DTD ")) { /* compute length of identifier e.g. "HTML 4.0 Transitional" */ for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j); len = j - i - 13; p = getString(this.lexbuf, i + 13, len); for (j = 1; j < W3CVersion.length; ++j) { s = W3CVersion[j].name; if (len == s.length() && s.equals(p)) return W3CVersion[j].code; } /* else unrecognized version */ } else if (str2.equals("-//IETF//DTD ")) { /* compute length of identifier e.g. "HTML 2.0" */ for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j); len = j - i - 14; p = getString(this.lexbuf, i + 14, len); s = W3CVersion[0].name; if (len == s.length() && s.equals(p)) return W3CVersion[0].code; /* else unrecognized version */ } break; } } return 0; } public void fixHTMLNameSpace(Node root, String profile) { Node node; AttVal prev, attr; for (node = root.content; node != null && node.tag != configuration.tt.tagHtml; node = node.next); if (node != null) { prev = null; for (attr = node.attributes; attr != null; attr = attr.next) { if (attr.attribute.equals("xmlns")) break; prev = attr; } if (attr != null) { if (!attr.value.equals(profile)) { Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE); attr.value = profile; } } else { attr = new AttVal(node.attributes, null, (int) '"', "xmlns", profile); attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr); node.attributes = attr; } } } public boolean setXHTMLDocType(Node root) { String fpi = " "; String sysid = ""; String namespace = XHTML_NAMESPACE; Node doctype; doctype = root.findDocType(); if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) { if (doctype != null) Node.discardElement(doctype); return true; } if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) { /* see what flavor of XHTML this document matches */ if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) { /* use XHTML strict */ fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; 
sysid = voyager_strict; } else if ((this.versions & Dict.VERS_LOOSE) != 0) { fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; sysid = voyager_loose; } else if ((this.versions & Dict.VERS_FRAMES) != 0) { /* use XHTML frames */ fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN"; sysid = voyager_frameset; } else /* lets assume XHTML transitional */ { fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; sysid = voyager_loose; } } else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) { fpi = "-//W3C//DTD XHTML 1.0 Strict//EN"; sysid = voyager_strict; } else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) { fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN"; sysid = voyager_loose; } fixHTMLNameSpace(root, namespace); if (doctype == null) { doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0); doctype.next = root.content; doctype.parent = root; doctype.prev = null; root.content = doctype; } if (configuration.docTypeMode == Configuration.DOCTYPE_USER && configuration.docTypeStr != null) { fpi = configuration.docTypeStr; sysid = ""; } this.txtstart = this.lexsize; this.txtend = this.lexsize; /* add public identifier */ addStringLiteral("html PUBLIC "); /* check if the fpi is quoted or not */ if (fpi.charAt(0) == '"') addStringLiteral(fpi); else { addStringLiteral("\""); addStringLiteral(fpi); addStringLiteral("\""); } if (sysid.length() + 6 >= this.configuration.wraplen) addStringLiteral("\n\""); else addStringLiteral("\n \""); /* add system identifier */ addStringLiteral(sysid); addStringLiteral("\""); this.txtend = this.lexsize; doctype.start = this.txtstart; doctype.end = this.txtend; return false; } public short apparentVersion() { switch (this.doctype) { case Dict.VERS_UNKNOWN: return HTMLVersion(); case Dict.VERS_HTML20: if ((this.versions & Dict.VERS_HTML20) != 0) return Dict.VERS_HTML20; break; case Dict.VERS_HTML32: if ((this.versions & Dict.VERS_HTML32) != 0) return Dict.VERS_HTML32; break; /* to replace old version by new */ case 
Dict.VERS_HTML40_STRICT: if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) return Dict.VERS_HTML40_STRICT; break; case Dict.VERS_HTML40_LOOSE: if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0) return Dict.VERS_HTML40_LOOSE; break; /* to replace old version by new */ case Dict.VERS_FRAMES: if ((this.versions & Dict.VERS_FRAMES) != 0) return Dict.VERS_FRAMES; break; } Report.warning(this, null, null, Report.INCONSISTENT_VERSION); return this.HTMLVersion(); } /* fixup doctype if missing */ public boolean fixDocType(Node root) { Node doctype; int guessed = Dict.VERS_HTML40_STRICT, i; if (this.badDoctype) Report.warning(this, null, null, Report.MALFORMED_DOCTYPE); if (configuration.XmlOut) return true; doctype = root.findDocType(); if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) { if (doctype != null) Node.discardElement(doctype); return true; } if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) { Node.discardElement(doctype); doctype = null; guessed = Dict.VERS_HTML40_STRICT; } else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) { Node.discardElement(doctype); doctype = null; guessed = Dict.VERS_HTML40_LOOSE; } else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) { if (doctype != null) { if (this.doctype == Dict.VERS_UNKNOWN) return false; switch (this.doctype) { case Dict.VERS_UNKNOWN: return false; case Dict.VERS_HTML20: if ((this.versions & Dict.VERS_HTML20) != 0) return true; break; /* to replace old version by new */ case Dict.VERS_HTML32: if ((this.versions & Dict.VERS_HTML32) != 0) return true; break; /* to replace old version by new */ case Dict.VERS_HTML40_STRICT: if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) return true; break; /* to replace old version by new */ case Dict.VERS_HTML40_LOOSE: if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0) return true; break; /* to replace old version by new */ case Dict.VERS_FRAMES: if ((this.versions & Dict.VERS_FRAMES) != 0) return true; break; /* to replace 
old version by new */ } /* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */ } /* choose new doctype */ guessed = HTMLVersion(); } if (guessed == Dict.VERS_UNKNOWN) return false; /* for XML use the Voyager system identifier */ if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager) { if (doctype != null) Node.discardElement(doctype); for (i = 0; i < W3CVersion.length; ++i) { if (guessed == W3CVersion[i].code) { fixHTMLNameSpace(root, W3CVersion[i].profile); break; } } return true; } if (doctype == null) { doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0); doctype.next = root.content; doctype.parent = root; doctype.prev = null; root.content = doctype; } this.txtstart = this.lexsize; this.txtend = this.lexsize; /* use the appropriate public identifier */ addStringLiteral("html PUBLIC "); if (configuration.docTypeMode == Configuration.DOCTYPE_USER && configuration.docTypeStr != null) addStringLiteral(configuration.docTypeStr); else if (guessed == Dict.VERS_HTML20) addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\""); else { addStringLiteral("\"-//W3C//DTD "); for (i = 0; i < W3CVersion.length; ++i) { if (guessed == W3CVersion[i].code) { addStringLiteral(W3CVersion[i].name); break; } } addStringLiteral("//EN\""); } this.txtend = this.lexsize; doctype.start = this.txtstart; doctype.end = this.txtend; return true; } /* ensure XML document starts with <?XML version="1.0"?> */ public boolean fixXMLPI(Node root) { Node xml; int s; if (root.content != null && root.content.type == Node.ProcInsTag) { s = root.content.start; if (this.lexbuf[s] == (byte) 'x' && this.lexbuf[s + 1] == (byte) 'm' && this.lexbuf[s + 2] == (byte) 'l') return true; } xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0); xml.next = root.content; if (root.content != null) { root.content.prev = xml; xml.next = root.content; } root.content = xml; this.txtstart = this.lexsize; this.txtend = this.lexsize; addStringLiteral("xml version=\"1.0\""); if 
(this.configuration.CharEncoding == Configuration.LATIN1) addStringLiteral(" encoding=\"ISO-8859-1\"");
        this.txtend = this.lexsize;
        xml.start = this.txtstart;
        xml.end = this.txtend;
        return false;
    }

    /* create a start tag node for the named element, spanning the current
       txtstart..txtend, and mark it as implicit */
    public Node inferredTag(String name) {
        Node node;

        node = newNode(Node.StartTag, this.lexbuf, this.txtstart, this.txtend, name);
        node.implicit = true;
        return node;
    }

    /* true for a start tag that is either an unknown element or an element
       whose content model does not include CM_EMPTY; false for anything
       that is not a start tag */
    public static boolean expectsContent(Node node) {
        if (node.type != Node.StartTag) return false;

        /* unknown element? */
        if (node.tag == null) return true;

        if ((node.tag.model & Dict.CM_EMPTY) != 0) return false;

        return true;
    }

    /*
      create a text node for the contents of
      a CDATA element like style or script
      which ends with </foo> for some foo.

      Characters are copied into the lexer buffer until end of input or until
      a "</name>" is seen whose name matches container.element, ignoring
      case. The returned text node excludes that end tag. Returns null when no
      text was collected. A MISSING_ENDTAG_FOR warning is issued when the
      input ends first.
    */
    public Node getCDATA(Node container) {
        int c, lastc, start, len, i;
        String str;
        boolean endtag = false;

        this.lines = this.in.curline;
        this.columns = this.in.curcol;
        this.waswhite = false;
        this.txtstart = this.lexsize;
        this.txtend = this.lexsize;

        lastc = (int) '\0';
        start = -1; /* buffer offset of a candidate end tag name, -1 if none */

        while (true) {
            c = this.in.readChar();
            if (c == StreamIn.EndOfStream) break;

            /* "</" seen: the '/' itself is only buffered at the bottom of the
               loop, so lexsize + 1 is where the tag name will begin */
            if (c == (int) '/' && lastc == (int) '<') {
                if (endtag) {
                    /* a second "</" before the end of the element */
                    this.lines = this.in.curline;
                    this.columns = this.in.curcol - 3;

                    Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
                }

                start = this.lexsize + 1; /* to first letter */
                endtag = true;
            } else if (c == (int) '>' && start >= 0) {
                len = this.lexsize - start;
                if (len == container.element.length()) {
                    str = getString(this.lexbuf, start, len);
                    if (Lexer.wstrcasecmp(str, container.element) == 0) {
                        /* matching end tag: stop the text just before "</" */
                        this.txtend = start - 2;
                        break;
                    }
                }

                this.lines = this.in.curline;
                this.columns = this.in.curcol - 3;

                Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);

                /* if javascript insert backslash before / */

                if (ParserImpl.isJavaScript(container)) {
                    /* shift the buffered bytes from start - 1 onwards one
                       place to the right, then drop '\\' in front of the '/' */
                    for (i = this.lexsize; i > start - 1; --i)
                        this.lexbuf[i] = this.lexbuf[i - 1];

                    this.lexbuf[start - 1] = (byte) '\\';
                    this.lexsize++;
                }

                start = -1;
            } else if (c == (int) '\r') {
                /* treat \r\n as \n and \r as \n */
                c = this.in.readChar();

                if (c != (int) '\n') this.in.ungetChar(c);

                c = (int) '\n';
            }

            addCharToLexer((int) c);
            this.txtend = this.lexsize;
            lastc = c;
        }

        if (c == StreamIn.EndOfStream) Report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);

        if (this.txtend > this.txtstart) {
            this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend);
            return this.token;
        }

        return null;
    }

    /* mark the current token as pushed back; getToken() checks this flag */
    public void ungetToken() {
        this.pushed = true;
    }

    public static final short IgnoreWhitespace = 0;
    public static final short MixedContent = 1;
    public static final short Preformatted = 2;
    public static final short IgnoreMarkup = 3;

    /*
      modes for GetToken()

      MixedContent   -- for elements which don't accept PCDATA
      Preformatted   -- white space preserved as is
      IgnoreMarkup   -- for CDATA elements such as script, style
    */
    public Node getToken(short mode) {
        short map;
        int c = 0;
        int lastc;
        int badcomment = 0;
        MutableBoolean isempty = new MutableBoolean();
        AttVal attributes;

        if (this.pushed) {
            /* duplicate inlines in preference to pushed text nodes when appropriate */
            if (this.token.type != Node.TextNode || (this.insert == -1 && this.inode == null)) {
                this.pushed = false;
                return this.token;
            }
        }

        /* at start of block elements, unclosed inline
           elements are inserted into the token stream */

        if (this.insert != -1 || this.inode != null) return insertedToken();

        this.lines = this.in.curline;
        this.columns = this.in.curcol;
        this.waswhite = false;

        this.txtstart = this.lexsize;
        this.txtend = this.lexsize;

        while (true) {
            c = this.in.readChar();
            if (c == StreamIn.EndOfStream) break;
            if (this.insertspace && mode != IgnoreWhitespace) {
                addCharToLexer(' ');
                this.waswhite = true;
                this.insertspace = false;
            }

            /* treat \r\n as \n and \r as \n */

            if (c == '\r') {
                c = this.in.readChar();

                if (c != '\n') this.in.ungetChar(c);

                c = '\n';
            }

            addCharToLexer(c);

            switch (this.state) {
                case LEX_CONTENT: /* element content */
                    map = MAP((char) c);

                    /*
                     Discard white space if appropriate.
Its cheaper to do this here rather than in parser methods for elements that don't have mixed content. */ if (((map & WHITE) != 0) && (mode == IgnoreWhitespace) && this.lexsize == this.txtstart + 1) { --this.lexsize; this.waswhite = false; this.lines = this.in.curline; this.columns = this.in.curcol; continue; } if (c == '<') { this.state = LEX_GT; continue; } if ((map & WHITE) != 0) { /* was previous char white? */ if (this.waswhite) { if (mode != Preformatted && mode != IgnoreMarkup) { --this.lexsize; this.lines = this.in.curline; this.columns = this.in.curcol; } } else /* prev char wasn't white */ { this.waswhite = true; lastc = c; if (mode != Preformatted && mode != IgnoreMarkup && c != ' ') changeChar((byte) ' '); } continue; } else if (c == '&' && mode != IgnoreMarkup) parseEntity(mode); /* this is needed to avoid trimming trailing whitespace */ if (mode == IgnoreWhitespace) mode = MixedContent; this.waswhite = false; continue; case LEX_GT: /* < */ /* check for endtag */ if (c == '/') { c = this.in.readChar(); if (c == StreamIn.EndOfStream) { this.in.ungetChar(c); continue; } addCharToLexer(c); map = MAP((char) c); if ((map & LETTER) != 0) { this.lexsize -= 3; this.txtend = this.lexsize; this.in.ungetChar(c); this.state = LEX_ENDTAG; this.lexbuf[this.lexsize] = (byte) '\0'; /* debug */ this.in.curcol -= 2; /* if some text before the </ return it now */ if (this.txtend > this.txtstart) { /* trim space char before end tag */ if (mode == IgnoreWhitespace && this.lexbuf[this.lexsize - 1] == (byte) ' ') { this.lexsize -= 1; this.txtend = this.lexsize; } this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend); return this.token; } continue; /* no text so keep going */ } /* otherwise treat as CDATA */ this.waswhite = false; this.state = LEX_CONTENT; continue; } if (mode == IgnoreMarkup) { /* otherwise treat as CDATA */ this.waswhite = false; this.state = LEX_CONTENT; continue; } /* look out for comments, doctype or marked sections this isn't quite 
right, but its getting there ... */ if (c == '!') { c = this.in.readChar(); if (c == '-') { c = this.in.readChar(); if (c == '-') { this.state = LEX_COMMENT; /* comment */ this.lexsize -= 2; this.txtend = this.lexsize; /* if some text before < return it now */ if (this.txtend > this.txtstart) { this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend); return this.token; } this.txtstart = this.lexsize; continue; } Report.warning(this, null, null, Report.MALFORMED_COMMENT); } else if (c == 'd' || c == 'D') { this.state = LEX_DOCTYPE; /* doctype */ this.lexsize -= 2; this.txtend = this.lexsize; mode = IgnoreWhitespace; /* skip until white space or '>' */ for (;;) { c = this.in.readChar(); if (c == StreamIn.EndOfStream || c == '>') { this.in.ungetChar(c); break; } map = MAP((char) c); if ((map & WHITE) == 0) continue; /* and skip to end of whitespace */ for (;;) { c = this.in.readChar(); if (c == StreamIn.EndOfStream || c == '>') { this.in.ungetChar(c); break; } map = MAP((char) c); if ((map & WHITE) != 0) continue; this.in.ungetChar(c); break; } break; } /* if some text before < return it now */ if (this.txtend > this.txtstart) { this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend); return this.token; } this.txtstart = this.lexsize; continue; } else if (c == '[') { /* Word 2000 embeds <![if ...]> ... 
<![endif]> sequences */ this.lexsize -= 2; this.state = LEX_SECTION; this.txtend = this.lexsize; /* if some text before < return it now */ if (this.txtend > this.txtstart) { this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend); return this.token; } this.txtstart = this.lexsize; continue; } /* otherwise swallow chars up to and including next '>' */ while (true) { c = this.in.readChar(); if (c == '>') break; if (c == -1) { this.in.ungetChar(c); break; } } this.lexsize -= 2; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; continue; } /* processing instructions */ if (c == '?') { this.lexsize -= 2; this.state = LEX_PROCINSTR; this.txtend = this.lexsize; /* if some text before < return it now */ if (this.txtend > this.txtstart) { this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend); return this.token; } this.txtstart = this.lexsize; continue; } /* Microsoft ASP's e.g. <% ... server-code ... %> */ if (c == '%') { this.lexsize -= 2; this.state = LEX_ASP; this.txtend = this.lexsize; /* if some text before < return it now */ if (this.txtend > this.txtstart) { this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend); return this.token; } this.txtstart = this.lexsize; continue; } /* Netscapes JSTE e.g. <# ... server-code ... 
#> */ if (c == '#') { this.lexsize -= 2; this.state = LEX_JSTE; this.txtend = this.lexsize; /* if some text before < return it now */ if (this.txtend > this.txtstart) { this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend); return this.token; } this.txtstart = this.lexsize; continue; } map = MAP((char) c); /* check for start tag */ if ((map & LETTER) != 0) { this.in.ungetChar(c); /* push back letter */ this.lexsize -= 2; /* discard "<" + letter */ this.txtend = this.lexsize; this.state = LEX_STARTTAG; /* ready to read tag name */ /* if some text before < return it now */ if (this.txtend > this.txtstart) { this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend); return this.token; } continue; /* no text so keep going */ } /* otherwise treat as CDATA */ this.state = LEX_CONTENT; this.waswhite = false; continue; case LEX_ENDTAG: /* </letter */ this.txtstart = this.lexsize - 1; this.in.curcol += 2; c = parseTagName(); this.token = newNode(Node.EndTag, /* create endtag token */ this.lexbuf, this.txtstart, this.txtend, getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart)); this.lexsize = this.txtstart; this.txtend = this.txtstart; /* skip to '>' */ while (c != '>') { c = this.in.readChar(); if (c == StreamIn.EndOfStream) break; } if (c == StreamIn.EndOfStream) { this.in.ungetChar(c); continue; } this.state = LEX_CONTENT; this.waswhite = false; return this.token; /* the endtag token */ case LEX_STARTTAG: /* first letter of tagname */ this.txtstart = this.lexsize - 1; /* set txtstart to first letter */ c = parseTagName(); isempty.value = false; attributes = null; this.token = newNode((isempty.value ? 
Node.StartEndTag : Node.StartTag), this.lexbuf, this.txtstart, this.txtend, getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart)); /* parse attributes, consuming closing ">" */ if (c != '>') { if (c == '/') this.in.ungetChar(c); attributes = parseAttrs(isempty); } if (isempty.value) this.token.type = Node.StartEndTag; this.token.attributes = attributes; this.lexsize = this.txtstart; this.txtend = this.txtstart; /* swallow newline following start tag */ /* special check needed for CRLF sequence */ /* this doesn't apply to empty elements */ if (expectsContent(this.token) || this.token.tag == configuration.tt.tagBr) { c = this.in.readChar(); if (c == '\r') { c = this.in.readChar(); if (c != '\n') this.in.ungetChar(c); } else if (c != '\n' && c != '\f') this.in.ungetChar(c); this.waswhite = true; /* to swallow leading whitespace */ } else this.waswhite = false; this.state = LEX_CONTENT; if (this.token.tag == null) Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT); else if (!this.configuration.XmlTags) { this.versions &= this.token.tag.versions; if ((this.token.tag.versions & Dict.VERS_PROPRIETARY) != 0) { if (!this.configuration.MakeClean && (this.token.tag == configuration.tt.tagNobr || this.token.tag == configuration.tt.tagWbr)) Report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT); } if (this.token.tag.chkattrs != null) { this.token.checkUniqueAttributes(this); this.token.tag.chkattrs.check(this, this.token); } else this.token.checkAttributes(this); } return this.token; /* return start tag */ case LEX_COMMENT: /* seen <!-- so look for --> */ if (c != '-') continue; c = this.in.readChar(); addCharToLexer(c); if (c != '-') continue; end_comment: while (true) { c = this.in.readChar(); if (c == '>') { if (badcomment != 0) Report.warning(this, null, null, Report.MALFORMED_COMMENT); this.txtend = this.lexsize - 2; // AQ 8Jul2000 this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = 
newNode(Node.CommentTag, this.lexbuf, this.txtstart, this.txtend); /* now look for a line break */ c = this.in.readChar(); if (c == '\r') { c = this.in.readChar(); if (c != '\n') this.token.linebreak = true; } if (c == '\n') this.token.linebreak = true; else this.in.ungetChar(c); return this.token; } /* note position of first such error in the comment */ if (badcomment == 0) { this.lines = this.in.curline; this.columns = this.in.curcol - 3; } badcomment++; if (this.configuration.FixComments) this.lexbuf[this.lexsize - 2] = (byte) '='; addCharToLexer(c); /* if '-' then look for '>' to end the comment */ if (c != '-') break end_comment; } /* otherwise continue to look for --> */ this.lexbuf[this.lexsize - 2] = (byte) '='; continue; case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */ map = MAP((char) c); if ((map & WHITE) != 0) { if (this.waswhite) this.lexsize -= 1; this.waswhite = true; } else this.waswhite = false; if (c != '>') continue; this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.DocTypeTag, this.lexbuf, this.txtstart, this.txtend); /* make a note of the version named by the doctype */ this.doctype = findGivenVersion(this.token); return this.token; case LEX_PROCINSTR: /* seen <? so look for '>' */ /* check for PHP preprocessor instructions <?php ... 
?> */ if (this.lexsize - this.txtstart == 3) { if ((getString(this.lexbuf, this.txtstart, 3)).equals("php")) { this.state = LEX_PHP; continue; } } if (this.configuration.XmlPIs) /* insist on ?> as terminator */ { if (c != '?') continue; /* now look for '>' */ c = this.in.readChar(); if (c == StreamIn.EndOfStream) { Report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE); this.in.ungetChar(c); continue; } addCharToLexer(c); } if (c != '>') continue; this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.ProcInsTag, this.lexbuf, this.txtstart, this.txtend); return this.token; case LEX_ASP: /* seen <% so look for "%>" */ if (c != '%') continue; /* now look for '>' */ c = this.in.readChar(); if (c != '>') { this.in.ungetChar(c); continue; } this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.AspTag, this.lexbuf, this.txtstart, this.txtend); return this.token; case LEX_JSTE: /* seen <# so look for "#>" */ if (c != '#') continue; /* now look for '>' */ c = this.in.readChar(); if (c != '>') { this.in.ungetChar(c); continue; } this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.JsteTag, this.lexbuf, this.txtstart, this.txtend); return this.token; case LEX_PHP: /* seen "<?php" so look for "?>" */ if (c != '?') continue; /* now look for '>' */ c = this.in.readChar(); if (c != '>') { this.in.ungetChar(c); continue; } this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.PhpTag, this.lexbuf, this.txtstart, this.txtend); return this.token; case LEX_SECTION: /* seen "<![" so look for "]>" */ if (c == '[') { if (this.lexsize == 
(this.txtstart + 6) && (getString(this.lexbuf, this.txtstart, 6)).equals("CDATA[")) { this.state = LEX_CDATA; this.lexsize -= 6; continue; } } if (c != ']') continue; /* now look for '>' */ c = this.in.readChar(); if (c != '>') { this.in.ungetChar(c); continue; } this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.SectionTag, this.lexbuf, this.txtstart, this.txtend); return this.token; case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */ if (c != ']') continue; /* now look for ']' */ c = this.in.readChar(); if (c != ']') { this.in.ungetChar(c); continue; } /* now look for '>' */ c = this.in.readChar(); if (c != '>') { this.in.ungetChar(c); continue; } this.lexsize -= 1; this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.CDATATag, this.lexbuf, this.txtstart, this.txtend); return this.token; } } if (this.state == LEX_CONTENT) /* text string */ { this.txtend = this.lexsize; if (this.txtend > this.txtstart) { this.in.ungetChar(c); if (this.lexbuf[this.lexsize - 1] == (byte) ' ') { this.lexsize -= 1; this.txtend = this.lexsize; } this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend); return this.token; } } else if (this.state == LEX_COMMENT) /* comment */ { if (c == StreamIn.EndOfStream) Report.warning(this, null, null, Report.MALFORMED_COMMENT); this.txtend = this.lexsize; this.lexbuf[this.lexsize] = (byte) '\0'; this.state = LEX_CONTENT; this.waswhite = false; this.token = newNode(Node.CommentTag, this.lexbuf, this.txtstart, this.txtend); return this.token; } return null; } /* parser for ASP within start tags Some people use ASP for to customize attributes Tidy isn't really well suited to dealing with ASP This is a workaround for attributes, but won't deal with the case where the ASP is used to tailor the attribute value. 
Here is an example of a work around for using ASP in attribute values: href="<%=rsSchool.Fields("ID").Value%>" where the ASP that generates the attribute value is masked from Tidy by the quotemarks. */ public Node parseAsp() { int c; Node asp = null; this.txtstart = this.lexsize; for (;;) { c = this.in.readChar(); addCharToLexer(c); if (c != '%') continue; c = this.in.readChar(); addCharToLexer(c); if (c == '>') break; } this.lexsize -= 2; this.txtend = this.lexsize; if (this.txtend > this.txtstart) asp = newNode(Node.AspTag, this.lexbuf, this.txtstart, this.txtend); this.txtstart = this.txtend; return asp; } /* PHP is like ASP but is based upon XML processing instructions, e.g. <?php ... ?> */ public Node parsePhp() { int c; Node php = null; this.txtstart = this.lexsize; for (;;) { c = this.in.readChar(); addCharToLexer(c); if (c != '?') continue; c = this.in.readChar(); addCharToLexer(c); if (c == '>') break; } this.lexsize -= 2; this.txtend = this.lexsize; if (this.txtend > this.txtstart) php = newNode(Node.PhpTag, this.lexbuf, this.txtstart, this.txtend); this.txtstart = this.txtend; return php; } /* consumes the '>' terminating start tags */ public String parseAttribute(MutableBoolean isempty, MutableObject asp, MutableObject php) { int start = 0; // int len = 0; Removed by BUGFIX for 126265 short map; String attr; int c = 0; asp.setObject(null); /* clear asp pointer */ php.setObject(null); /* clear php pointer */ /* skip white space before the attribute */ for (;;) { c = this.in.readChar(); if (c == '/') { c = this.in.readChar(); if (c == '>') { isempty.value = true; return null; } this.in.ungetChar(c); c = '/'; break; } if (c == '>') return null; if (c == '<') { c = this.in.readChar(); if (c == '%') { asp.setObject(parseAsp()); return null; } else if (c == '?') { php.setObject(parsePhp()); return null; } this.in.ungetChar(c); Report.attrError(this, this.token, null, Report.UNEXPECTED_GT); return null; } if (c == '"' || c == '\'') { Report.attrError(this, 
this.token, null, Report.UNEXPECTED_QUOTEMARK); continue; } if (c == StreamIn.EndOfStream) { Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE); this.in.ungetChar(c); return null; } map = MAP((char) c); if ((map & WHITE) == 0) break; } start = this.lexsize; for (;;) { /* but push back '=' for parseValue() */ if (c == '=' || c == '>') { this.in.ungetChar(c); break; } if (c == '<' || c == StreamIn.EndOfStream) { this.in.ungetChar(c); break; } map = MAP((char) c); if ((map & WHITE) != 0) break; /* what should be done about non-namechar characters? */ /* currently these are incorporated into the attr name */ if (!this.configuration.XmlTags && (map & UPPERCASE) != 0) c += (int) ('a' - 'A'); // ++len; Removed by BUGFIX for 126265 addCharToLexer(c); c = this.in.readChar(); } // Following line added by GLP to fix BUG 126265. This is a temporary comment // and should be removed when Tidy is fixed. int len = this.lexsize - start; attr = (len > 0 ? getString(this.lexbuf, start, len) : null); this.lexsize = start; return attr; } /* invoked when < is seen in place of attribute value but terminates on whitespace if not ASP, PHP or Tango this routine recognizes ' and " quoted strings */ public int parseServerInstruction() { int c, map, delim = '"'; boolean isrule = false; c = this.in.readChar(); addCharToLexer(c); /* check for ASP, PHP or Tango */ if (c == '%' || c == '?' 
|| c == '@') isrule = true; for (;;) { c = this.in.readChar(); if (c == StreamIn.EndOfStream) break; if (c == '>') { if (isrule) addCharToLexer(c); else this.in.ungetChar(c); break; } /* if not recognized as ASP, PHP or Tango */ /* then also finish value on whitespace */ if (!isrule) { map = MAP((char) c); if ((map & WHITE) != 0) break; } addCharToLexer(c); if (c == '"') { do { c = this.in.readChar(); addCharToLexer(c); } while (c != '"'); delim = '\''; continue; } if (c == '\'') { do { c = this.in.readChar(); addCharToLexer(c); } while (c != '\''); } } return delim; } /* values start with "=" or " = " etc. */ /* doesn't consume the ">" at end of start tag */ public String parseValue(String name, boolean foldCase, MutableBoolean isempty, MutableInteger pdelim) { int len = 0; int start; short map; boolean seen_gt = false; boolean munge = true; int c = 0; int lastc, delim, quotewarning; String value; delim = 0; pdelim.value = (int) '"'; /* Henry Zrepa reports that some folk are using the embed element with script attributes where newlines are significant and must be preserved */ if (configuration.LiteralAttribs) munge = false; /* skip white space before the '=' */ for (;;) { c = this.in.readChar(); if (c == StreamIn.EndOfStream) { this.in.ungetChar(c); break; } map = MAP((char) c); if ((map & WHITE) == 0) break; } /* c should be '=' if there is a value other legal possibilities are white space, '/' and '>' */ if (c != '=') { this.in.ungetChar(c); return null; } /* skip white space after '=' */ for (;;) { c = this.in.readChar(); if (c == StreamIn.EndOfStream) { this.in.ungetChar(c); break; } map = MAP((char) c); if ((map & WHITE) == 0) break; } /* check for quote marks */ if (c == '"' || c == '\'') delim = c; else if (c == '<') { start = this.lexsize; addCharToLexer(c); pdelim.value = parseServerInstruction(); len = this.lexsize - start; this.lexsize = start; return (len > 0 ? 
getString(this.lexbuf, start, len) : null); } else this.in.ungetChar(c); /* and read the value string check for quote mark if needed */ quotewarning = 0; start = this.lexsize; c = '\0'; for (;;) { lastc = c; /* track last character */ c = this.in.readChar(); if (c == StreamIn.EndOfStream) { Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE); this.in.ungetChar(c); break; } if (delim == (char) 0) { if (c == '>') { this.in.ungetChar(c); break; } if (c == '"' || c == '\'') { Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK); break; } if (c == '<') { /* this.in.ungetChar(c); */ Report.attrError(this, this.token, null, Report.UNEXPECTED_GT); /* break; */ } /* For cases like <br clear=all/> need to avoid treating /> as part of the attribute value, however care is needed to avoid so treating <a href=http://www.acme.com/> in this way, which would map the <a> tag to <a href="http://www.acme.com"/> */ if (c == '/') { /* peek ahead in case of /> */ c = this.in.readChar(); if (c == '>' && !AttributeTable.getDefaultAttributeTable().isUrl(name)) { isempty.value = true; this.in.ungetChar(c); break; } /* unget peeked char */ this.in.ungetChar(c); c = '/'; } } else /* delim is '\'' or '"' */ { if (c == delim) break; /* treat CRLF, CR and LF as single line break */ if (c == '\r') { c = this.in.readChar(); if (c != '\n') this.in.ungetChar(c); c = '\n'; } if (c == '\n' || c == '<' || c == '>') ++quotewarning; if (c == '>') seen_gt = true; } if (c == '&') { addCharToLexer(c); parseEntity((short) 0); continue; } /* kludge for JavaScript attribute values with line continuations in string literals */ if (c == '\\') { c = this.in.readChar(); if (c != '\n') { this.in.ungetChar(c); c = '\\'; } } map = MAP((char) c); if ((map & WHITE) != 0) { if (delim == (char) 0) break; if (munge) { c = ' '; if (lastc == ' ') continue; } } else if (foldCase && (map & UPPERCASE) != 0) c += (int) ('a' - 'A'); addCharToLexer(c); } if (quotewarning > 10 && seen_gt && 
munge) { /* there is almost certainly a missing trailling quote mark as we have see too many newlines, < or > characters. an exception is made for Javascript attributes and the javascript URL scheme which may legitimately include < and > */ if (!AttributeTable.getDefaultAttributeTable().isScript(name) && !(AttributeTable.getDefaultAttributeTable().isUrl(name) && (getString(this.lexbuf, start, 11)) .equals("javascript:"))) Report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE); } len = this.lexsize - start; this.lexsize = start; if (len > 0 || delim != 0) value = getString(this.lexbuf, start, len); else value = null; /* note delimiter if given */ if (delim != 0) pdelim.value = delim; else pdelim.value = (int) '"'; return value; } /* attr must be non-null */ public static boolean isValidAttrName(String attr) { short map; char c; int i; /* first character should be a letter */ c = attr.charAt(0); map = MAP(c); if (!((map & LETTER) != 0)) return false; /* remaining characters should be namechars */ for (i = 1; i < attr.length(); i++) { c = attr.charAt(i); map = MAP(c); if ((map & NAMECHAR) != 0) continue; return false; } return true; } /* swallows closing '>' */ public AttVal parseAttrs(MutableBoolean isempty) { AttVal av, list; String attribute, value; MutableInteger delim = new MutableInteger(); MutableObject asp = new MutableObject(); MutableObject php = new MutableObject(); list = null; for (; !endOfInput();) { attribute = parseAttribute(isempty, asp, php); if (attribute == null) { /* check if attributes are created by ASP markup */ if (asp.getObject() != null) { av = new AttVal(list, null, (Node) asp.getObject(), null, '\0', null, null); list = av; continue; } /* check if attributes are created by PHP markup */ if (php.getObject() != null) { av = new AttVal(list, null, null, (Node) php.getObject(), '\0', null, null); list = av; continue; } break; } value = parseValue(attribute, false, isempty, delim); if (attribute != null && isValidAttrName(attribute)) { 
av = new AttVal(list, null, null, null, delim.value, attribute, value); av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av); list = av; } else { av = new AttVal(null, null, null, null, 0, attribute, value); Report.attrError(this, this.token, value, Report.BAD_ATTRIBUTE_VALUE); } } return list; } /* push a copy of an inline node onto stack but don't push if implicit or OBJECT or APPLET (implicit tags are ones generated from the istack) One issue arises with pushing inlines when the tag is already pushed. For instance: <p><em>text <p><em>more text Shouldn't be mapped to <p><em>text</em></p> <p><em><em>more text</em></em> */ public void pushInline(Node node) { IStack is; if (node.implicit) return; if (node.tag == null) return; if ((node.tag.model & Dict.CM_INLINE) == 0) return; if ((node.tag.model & Dict.CM_OBJECT) != 0) return; if (node.tag != configuration.tt.tagFont && isPushed(node)) return; // make sure there is enough space for the stack is = new IStack(); is.tag = node.tag; is.element = node.element; if (node.attributes != null) is.attributes = cloneAttributes(node.attributes); this.istack.push(is); } /* pop inline stack */ public void popInline(Node node) { AttVal av; IStack is; if (node != null) { if (node.tag == null) return; if ((node.tag.model & Dict.CM_INLINE) == 0) return; if ((node.tag.model & Dict.CM_OBJECT) != 0) return; // if node is </a> then pop until we find an <a> if (node.tag == configuration.tt.tagA) { while (this.istack.size() > 0) { is = (IStack) this.istack.pop(); if (is.tag == configuration.tt.tagA) { break; } } if (this.insert >= this.istack.size()) this.insert = -1; return; } } if (this.istack.size() > 0) { is = (IStack) this.istack.pop(); if (this.insert >= this.istack.size()) this.insert = -1; } } public boolean isPushed(Node node) { int i; IStack is; for (i = this.istack.size() - 1; i >= 0; --i) { is = (IStack) this.istack.elementAt(i); if (is.tag == node.tag) return true; } return false; } /* This has the effect of 
inserting "missing" inline elements around the contents of blocklevel elements such as P, TD, TH, DIV, PRE etc. This procedure is called at the start of ParseBlock. when the inline stack is not empty, as will be the case in: <i><h1>italic heading</h1></i> which is then treated as equivalent to <h1><i>italic heading</i></h1> This is implemented by setting the lexer into a mode where it gets tokens from the inline stack rather than from the input stream. */ public int inlineDup(Node node) { int n; n = this.istack.size() - this.istackbase; if (n > 0) { this.insert = this.istackbase; this.inode = node; } return n; } public Node insertedToken() { Node node; IStack is; int n; // this will only be null if inode != null if (this.insert == -1) { node = this.inode; this.inode = null; return node; } // is this is the "latest" node then update // the position, otherwise use current values if (this.inode == null) { this.lines = this.in.curline; this.columns = this.in.curcol; } node = newNode(Node.StartTag, this.lexbuf, this.txtstart, this.txtend); // GLP: Bugfix 126261. Remove when this change // is fixed in istack.c in the original Tidy node.implicit = true; is = (IStack) this.istack.elementAt(this.insert); node.element = is.element; node.tag = is.tag; if (is.attributes != null) node.attributes = cloneAttributes(is.attributes); // advance lexer to next item on the stack n = this.insert; // and recover state if we have reached the end if (++n < this.istack.size()) { this.insert = n; } else { this.insert = -1; } return node; } /* AQ: Try this for speed optimization */ public static int wstrcasecmp(String s1, String s2) { return (s1.equalsIgnoreCase(s2) ? 
0 : 1); } public static int wstrcaselexcmp(String s1, String s2) { char c; int i = 0; while (i < s1.length() && i < s2.length()) { c = s1.charAt(i); if (toLower(c) != toLower(s2.charAt(i))) { break; } i += 1; } if (i == s1.length() && i == s2.length()) { return 0; } else if (i == s1.length()) { return -1; } else if (i == s2.length()) { return 1; } else { return (s1.charAt(i) > s2.charAt(i) ? 1 : -1); } } public static boolean wsubstr(String s1, String s2) { int i; int len1 = s1.length(); int len2 = s2.length(); for (i = 0; i <= len1 - len2; ++i) { if (s2.equalsIgnoreCase(s1.substring(i))) return true; } return false; } public boolean canPrune(Node element) { if (element.type == Node.TextNode) return true; if (element.content != null) return false; if (element.tag == configuration.tt.tagA && element.attributes != null) return false; if (element.tag == configuration.tt.tagP && !this.configuration.DropEmptyParas) return false; if (element.tag == null) return false; if ((element.tag.model & Dict.CM_ROW) != 0) return false; if (element.tag == configuration.tt.tagApplet) return false; if (element.tag == configuration.tt.tagObject) return false; if (element.attributes != null && (element.getAttrByName("id") != null || element.getAttrByName("name") != null)) return false; return true; } /* duplicate name attribute as an id */ public void fixId(Node node) { AttVal name = node.getAttrByName("name"); AttVal id = node.getAttrByName("id"); if (name != null) { if (id != null) { if (!id.value.equals(name.value)) Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH); } else if (this.configuration.XmlOut) node.addAttribute("id", name.value); } } /* defer duplicates when entering a table or other element where the inlines shouldn't be duplicated */ public void deferDup() { this.insert = -1; this.inode = null; } /* Private methods and fields */ /* lexer char types */ private static final short DIGIT = 1; private static final short LETTER = 2; private static final short 
NAMECHAR = 4; private static final short WHITE = 8; private static final short NEWLINE = 16; private static final short LOWERCASE = 32; private static final short UPPERCASE = 64; /* lexer GetToken states */ private static final short LEX_CONTENT = 0; private static final short LEX_GT = 1; private static final short LEX_ENDTAG = 2; private static final short LEX_STARTTAG = 3; private static final short LEX_COMMENT = 4; private static final short LEX_DOCTYPE = 5; private static final short LEX_PROCINSTR = 6; private static final short LEX_ENDCOMMENT = 7; private static final short LEX_CDATA = 8; private static final short LEX_SECTION = 9; private static final short LEX_ASP = 10; private static final short LEX_JSTE = 11; private static final short LEX_PHP = 12; /* used to classify chars for lexical purposes */ private static short[] lexmap = new short[128]; private static void mapStr(String str, short code) { int j; for (int i = 0; i < str.length(); i++) { j = (int) str.charAt(i); lexmap[j] |= code; } } static { mapStr("\r\n\f", (short) (NEWLINE | WHITE)); mapStr(" \t", WHITE); mapStr("-.:_", NAMECHAR); mapStr("0123456789", (short) (DIGIT | NAMECHAR)); mapStr("abcdefghijklmnopqrstuvwxyz", (short) (LOWERCASE | LETTER | NAMECHAR)); mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", (short) (UPPERCASE | LETTER | NAMECHAR)); } private static short MAP(char c) { return ((int) c < 128 ? 
lexmap[(int) c] : 0); } private static boolean isWhite(char c) { short m = MAP(c); return (m & WHITE) != 0; } private static boolean isDigit(char c) { short m; m = MAP(c); return (m & DIGIT) != 0; } private static boolean isLetter(char c) { short m; m = MAP(c); return (m & LETTER) != 0; } private static char toLower(char c) { short m = MAP(c); if ((m & UPPERCASE) != 0) c = (char) ((int) c + (int) 'a' - (int) 'A'); return c; } private static char toUpper(char c) { short m = MAP(c); if ((m & LOWERCASE) != 0) c = (char) ((int) c + (int) 'A' - (int) 'a'); return c; } public static char foldCase(char c, boolean tocaps, boolean xmlTags) { short m; if (!xmlTags) { m = MAP(c); if (tocaps) { if ((m & LOWERCASE) != 0) c = (char) ((int) c + (int) 'A' - (int) 'a'); } else /* force to lower case */ { if ((m & UPPERCASE) != 0) c = (char) ((int) c + (int) 'a' - (int) 'A'); } } return c; } private static class W3CVersionInfo { String name; String voyagerName; String profile; short code; public W3CVersionInfo(String name, String voyagerName, String profile, short code) { this.name = name; this.voyagerName = voyagerName; this.profile = profile; this.code = code; } } /* the 3 URIs for the XHTML 1.0 DTDs */ private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"; private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"; private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"; private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"; private static Lexer.W3CVersionInfo[] W3CVersion = { new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", voyager_strict, Dict.VERS_HTML40_STRICT), new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", voyager_loose, Dict.VERS_HTML40_LOOSE), new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", voyager_frameset, Dict.VERS_FRAMES), new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", 
voyager_strict, Dict.VERS_HTML40_STRICT), new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", voyager_loose, Dict.VERS_HTML40_LOOSE), new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", voyager_frameset, Dict.VERS_FRAMES), new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", voyager_loose, Dict.VERS_HTML32), new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", voyager_strict, Dict.VERS_HTML20)}; }