/*
* Java HTML Tidy - JTidy
* HTML parser and pretty printer
*
* Copyright (c) 1998-2000 World Wide Web Consortium (Massachusetts
* Institute of Technology, Institut National de Recherche en
* Informatique et en Automatique, Keio University). All Rights
* Reserved.
*
* Contributing Author(s):
*
* Dave Raggett <dsr@w3.org>
* Andy Quick <ac.quick@sympatico.ca> (translation to Java)
* Gary L Peskin <garyp@firstech.com> (Java development)
* Sami Lempinen <sami@lempinen.net> (release management)
* Fabrizio Giustina <fgiust at users.sourceforge.net>
*
* The contributing author(s) would like to thank all those who
* helped with testing, bug fixes, and patience. This wouldn't
* have been possible without all of you.
*
* COPYRIGHT NOTICE:
*
* This software and documentation is provided "as is," and
* the copyright holders and contributing author(s) make no
* representations or warranties, express or implied, including
* but not limited to, warranties of merchantability or fitness
* for any particular purpose or that the use of the software or
* documentation will not infringe any third party patents,
* copyrights, trademarks or other rights.
*
* The copyright holders and contributing author(s) will not be
* liable for any direct, indirect, special or consequential damages
* arising out of any use of the software or documentation, even if
* advised of the possibility of such damage.
*
* Permission is hereby granted to use, copy, modify, and distribute
* this source code, or portions hereof, documentation and executables,
* for any purpose, without fee, subject to the following restrictions:
*
* 1. The origin of this source code must not be misrepresented.
* 2. Altered versions must be plainly marked as such and must
* not be misrepresented as being the original source.
* 3. This Copyright notice may not be removed or altered from any
* source or altered source distribution.
*
* The copyright holders and contributing author(s) specifically
* permit, without fee, and encourage the use of this source code
* as a component for supporting the Hypertext Markup Language in
* commercial products. If you use this source code in a product,
* acknowledgment is not required but would be appreciated.
*
*/
package org.w3c.tidy;
import java.io.PrintWriter;
import java.util.List;
import java.util.Stack;
import java.util.Vector;
/**
* Lexer for the HTML parser.
* <p>
* Given an input stream, the lexer returns a sequence of tokens: <code>getToken</code> gets the next token and
* <code>ungetToken</code> provides one level of undo.
* </p>
* <ul>
* <li>Tags include an attribute list: a linked list of attribute/value nodes, where each node has 2 null-terminated
* strings.</li>
* <li>Entities are replaced in attribute values.</li>
* <li>White space is compacted if not in preformatted mode: leading white space is discarded and subsequent white
* space sequences are compacted to single space chars.</li>
* <li>If XmlTags is "no", tag names and attribute names are folded to lower case.</li>
* </ul>
* <p>
* Not yet done: doctype subsets and marked sections.
* </p>
* @author Dave Raggett <a href="mailto:dsr@w3.org">dsr@w3.org </a>
* @author Andy Quick <a href="mailto:ac.quick@sympatico.ca">ac.quick@sympatico.ca </a> (translation to Java)
* @author Fabrizio Giustina
* @version $Revision: 1168 $ ($Author: aditsu $)
*/
public class Lexer
{
/**
* Lexer mode: ignore whitespace.
* <p>
* The four mode constants below are plain numbers (0..3), not independent bit flags. Note that
* {@link #parseEntity(short)} nevertheless tests <code>mode &amp; PREFORMATTED</code>, which is non-zero for both
* {@link #PREFORMATTED} (2) and {@link #IGNORE_MARKUP} (3).
* </p>
*/
public static final short IGNORE_WHITESPACE = 0;
/**
* Lexer mode: mixed content.
*/
public static final short MIXED_CONTENT = 1;
/**
* Lexer mode: preformatted.
*/
public static final short PREFORMATTED = 2;
/**
* Lexer mode: ignore markup.
*/
public static final short IGNORE_MARKUP = 3;
/**
* System identifier (DTD URI) for the XHTML 1.0 Transitional DTD.
*/
private static final String VOYAGER_LOOSE = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
/**
* System identifier (DTD URI) for the XHTML 1.0 Strict DTD.
*/
private static final String VOYAGER_STRICT = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
/**
* System identifier (DTD URI) for the XHTML 1.0 Frameset DTD.
*/
private static final String VOYAGER_FRAMESET = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";
/**
* System identifier (DTD URI) for the XHTML 1.1 DTD.
*/
private static final String VOYAGER_11 = "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd";
/*
* URI for XHTML Basic 1.0 (currently disabled).
*/
// private static final String VOYAGER_BASIC = "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd";
/**
* The XHTML namespace URI, written as the value of the <code>xmlns</code> attribute on the html element.
*/
private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";
/**
* Lists all the known versions. Each entry is built from (HTML name, XHTML name, DTD URI, version code).
* <p>
* Entry order matters to the code in this class: {@link #htmlVersionName()} and {@link #fixDocType(Node)} use the
* first entry whose code matches, while {@link #findGivenVersion(Node)} matches W3C identifiers by name starting at
* index 1. The name "HTML 4.01" appears twice (first and last entry) with different codes.
* </p>
*/
private static final Lexer.W3CVersionInfo[] W3CVERSION = {
new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML40_STRICT),
new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML40_LOOSE),
new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", VOYAGER_FRAMESET, Dict.VERS_FRAMESET),
new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
new W3CVersionInfo("HTML 3.2 Final", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
new W3CVersionInfo("HTML 3.2 Draft", "XHTML 1.0 Transitional", VOYAGER_LOOSE, Dict.VERS_HTML32),
new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", VOYAGER_STRICT, Dict.VERS_HTML20),
new W3CVersionInfo("HTML 4.01", "XHTML 1.1", VOYAGER_STRICT, Dict.VERS_XHTML11)};
/**
* getToken state: content. This is the state a new lexer starts in (see the constructor).
*/
private static final short LEX_CONTENT = 0;
/**
* getToken state: gt.
*/
private static final short LEX_GT = 1;
/**
* getToken state: endtag.
*/
private static final short LEX_ENDTAG = 2;
/**
* getToken state: start tag.
*/
private static final short LEX_STARTTAG = 3;
/**
* getToken state: comment.
*/
private static final short LEX_COMMENT = 4;
/**
* getToken state: doctype.
*/
private static final short LEX_DOCTYPE = 5;
/**
* getToken state: procinstr (processing instruction).
*/
private static final short LEX_PROCINSTR = 6;
/**
* getToken state: cdata. Note that the value 7 is not assigned to any state in this list.
*/
private static final short LEX_CDATA = 8;
/**
* getToken state: section.
*/
private static final short LEX_SECTION = 9;
/**
* getToken state: asp.
*/
private static final short LEX_ASP = 10;
/**
* getToken state: jste.
*/
private static final short LEX_JSTE = 11;
/**
* getToken state: php.
*/
private static final short LEX_PHP = 12;
/**
* getToken state: xml declaration.
*/
private static final short LEX_XMLDECL = 13;
/**
* Input stream the characters are read from.
*/
protected StreamIn in;
/**
* error output stream.
*/
protected PrintWriter errout;
/**
* for accessibility errors.
*/
protected short badAccess;
/**
* for bad style errors.
*/
protected short badLayout;
/**
* for bad char encodings.
*/
protected short badChars;
/**
* for mismatched/mispositioned form tags.
*/
protected short badForm;
/**
* count of warnings in this document.
*/
protected short warnings;
/**
* count of errors.
*/
protected short errors;
/**
* lines seen. Starts at 1; also overwritten with the error position before some reports are issued.
*/
protected int lines;
/**
* column at start of current token. Starts at 1; also overwritten with the error position before some reports are
* issued.
*/
protected int columns;
/**
* used to collapse contiguous white space.
*/
protected boolean waswhite;
/**
* true after token has been pushed back.
*/
protected boolean pushed;
/**
* when space is moved after end tag.
*/
protected boolean insertspace;
/**
* Netscape compatibility.
*/
protected boolean excludeBlocks;
/**
* true if moved out of table.
*/
protected boolean exiled;
/**
* true if xmlns attribute on html element.
*/
protected boolean isvoyager;
/**
* bit vector of HTML versions. Initialised to <code>Dict.VERS_ALL | Dict.VERS_PROPRIETARY</code> and tested
* against the individual <code>Dict.VERS_*</code> bits when a version has to be chosen.
*/
protected short versions;
/**
* version as given by doctype (if any). <code>Dict.VERS_UNKNOWN</code> until a doctype has been identified.
*/
protected int doctype;
/**
* set if html or PUBLIC is missing. {@link #findGivenVersion(Node)} sets it when the doctype has neither a
* SYSTEM nor a PUBLIC keyword; {@link #fixDocType(Node)} reports it.
*/
protected boolean badDoctype;
/**
* start of current node (offset into {@link #lexbuf}).
*/
protected int txtstart;
/**
* end of current node (offset into {@link #lexbuf}).
*/
protected int txtend;
/**
* state of lexer's finite state machine (one of the <code>LEX_*</code> constants).
*/
protected short state;
/**
* current node.
*/
protected Node token;
/**
* Lexer character buffer. Parse tree nodes span onto this buffer, which contains the concatenated text contents of
* all of the elements. Lexsize must be reset for each file. Byte buffer of UTF-8 chars. The array is replaced by a
* larger one when it fills up (see {@link #addByte(int)}), so nodes must not cache it without being registered in
* the node list.
*/
protected byte[] lexbuf;
/**
* number of bytes allocated for {@link #lexbuf}.
*/
protected int lexlength;
/**
* number of bytes of {@link #lexbuf} in use.
*/
protected int lexsize;
/**
* Inline stack for compatibility with Mosaic. For deferring text node.
*/
protected Node inode;
/**
* for inferring inline tags. Initialised to -1.
*/
protected int insert;
/**
* stack.
*/
protected Stack istack;
/**
* start of frame.
*/
protected int istackbase;
/**
* used for cleaning up presentation markup.
*/
protected Style styles;
/**
* configuration.
*/
protected Configuration configuration;
/**
* already seen end body tag?
*/
protected boolean seenEndBody;
/**
* already seen end html tag?
*/
protected boolean seenEndHtml;
/**
* report instance used to emit warnings and errors.
*/
protected Report report;
/**
* Root node is saved here.
*/
protected Node root;
/**
* List of every node created or cloned through this lexer, plus asp/php nodes of cloned attributes. It is walked by
* {@link #updateNodeTextArrays(byte[], byte[])} to re-point nodes when {@link #lexbuf} is reallocated.
*/
private List nodeList;
/**
 * Creates a lexer that reads from the given stream.
 * <p>
 * The lexer starts at line 1, column 1, in the content state, with every HTML version still considered possible
 * and no doctype identified yet.
 * </p>
 * @param in StreamIn the characters are read from
 * @param configuration configuration instance
 * @param report report instance, for reporting errors
 */
public Lexer(StreamIn in, Configuration configuration, Report report)
{
    // collaborators
    this.in = in;
    this.configuration = configuration;
    this.report = report;

    // position in the input
    this.lines = 1;
    this.columns = 1;

    // tokenizer and version-tracking state
    this.state = LEX_CONTENT;
    this.doctype = Dict.VERS_UNKNOWN;
    this.versions = (Dict.VERS_ALL | Dict.VERS_PROPRIETARY);

    // inline stack and node registry
    this.insert = -1;
    this.istack = new Stack();
    this.nodeList = new Vector();
}
/**
 * Creates an empty node and registers it in the node list, so that its text array is kept up to date when the
 * lexer buffer is reallocated.
 * @return the new Node
 */
public Node newNode()
{
    final Node created = new Node();
    this.nodeList.add(created);
    return created;
}
/**
 * Creates a node of the given type spanning part of a text array, and registers it in the node list.
 * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
 * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node.ASP_TAG |
 * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
 * @param textarray array of bytes contained in the Node
 * @param start start position
 * @param end end position
 * @return the new Node
 */
public Node newNode(short type, byte[] textarray, int start, int end)
{
    final Node created = new Node(type, textarray, start, end);
    this.nodeList.add(created);
    return created;
}
/**
 * Creates a named node of the given type spanning part of a text array, and registers it in the node list. The
 * configured tag table is handed to the node constructor together with the element name.
 * @param type node type: Node.ROOT_NODE | Node.DOCTYPE_TAG | Node.COMMENT_TAG | Node.PROC_INS_TAG | Node.TEXT_NODE |
 * Node.START_TAG | Node.END_TAG | Node.START_END_TAG | Node.CDATA_TAG | Node.SECTION_TAG | Node.ASP_TAG |
 * Node.JSTE_TAG | Node.PHP_TAG | Node.XML_DECL
 * @param textarray array of bytes contained in the Node
 * @param start start position
 * @param end end position
 * @param element tag name
 * @return the new Node
 */
public Node newNode(short type, byte[] textarray, int start, int end, String element)
{
    final Node created = new Node(type, textarray, start, end, element, this.configuration.tt);
    this.nodeList.add(created);
    return created;
}
/**
 * Clones a node (via <code>node.cloneNode(false)</code>) and registers the clone in the node list. Any asp or php
 * node hanging off the clone's attributes is registered as well.
 * @param node Node to clone
 * @return cloned Node
 */
public Node cloneNode(Node node)
{
    final Node copy = node.cloneNode(false);
    this.nodeList.add(copy);

    AttVal current = copy.attributes;
    while (current != null)
    {
        if (current.asp != null)
        {
            this.nodeList.add(current.asp);
        }
        if (current.php != null)
        {
            this.nodeList.add(current.php);
        }
        current = current.next;
    }
    return copy;
}
/**
 * Clones an attribute list and registers any asp or php node found in the cloned attributes in the node list.
 * @param attrs original AttVal; must not be <code>null</code>
 * @return cloned AttVal
 */
public AttVal cloneAttributes(AttVal attrs)
{
    final AttVal copy = (AttVal) attrs.clone();

    AttVal current = copy;
    while (current != null)
    {
        if (current.asp != null)
        {
            this.nodeList.add(current.asp);
        }
        if (current.php != null)
        {
            this.nodeList.add(current.php);
        }
        current = current.next;
    }
    return copy;
}
/**
 * Re-points every registered node that still references <code>oldtextarray</code> to <code>newtextarray</code>.
 * Nodes whose text array is a different array are left untouched.
 * @param oldtextarray previous text array (compared by identity)
 * @param newtextarray new text array
 */
protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray)
{
    final int total = this.nodeList.size();
    int index = 0;
    while (index < total)
    {
        final Node candidate = (Node) this.nodeList.get(index);
        if (candidate.textarray == oldtextarray)
        {
            candidate.textarray = newtextarray;
        }
        index++;
    }
}
/**
 * Appends a line feed to the lexer buffer and returns a new node spanning it. Used for creating preformatted
 * text from Word2000.
 * @return new line node
 */
public Node newLineNode()
{
    final int begin = this.lexsize;
    final Node lineBreak = newNode();
    // the node is registered in the node list, so a buffer reallocation below re-points its text array
    lineBreak.textarray = this.lexbuf;
    lineBreak.start = begin;
    addCharToLexer('\n');
    lineBreak.end = this.lexsize;
    return lineBreak;
}
/**
 * Tells whether the underlying input stream has been exhausted.
 * @return <code>true</code> if the end of the input stream has been reached
 */
public boolean endOfInput()
{
    final boolean exhausted = this.in.isEndOfStream();
    return exhausted;
}
/**
 * Appends one byte to the lexer buffer, growing the buffer when needed.
 * <p>
 * The buffer starts at 8192 bytes and doubles until it can hold the new byte plus a trailing zero byte. When a new
 * array is allocated the old contents are copied over and all registered nodes are re-pointed to it.
 * </p>
 * @param c byte to add (only the low 8 bits are stored)
 */
public void addByte(int c)
{
    final int required = this.lexsize + 1;
    if (required >= this.lexlength)
    {
        // grow the capacity first, then reallocate once
        do
        {
            this.lexlength = (this.lexlength == 0) ? 8192 : this.lexlength * 2;
        }
        while (required >= this.lexlength);

        final byte[] previous = this.lexbuf;
        this.lexbuf = new byte[this.lexlength];
        if (previous != null)
        {
            System.arraycopy(previous, 0, this.lexbuf, 0, previous.length);
            updateNodeTextArrays(previous, this.lexbuf);
        }
    }
    this.lexbuf[this.lexsize++] = (byte) c;
    this.lexbuf[this.lexsize] = (byte) '\0'; // debug
}
/**
 * Overwrites the last byte stored in the lexer buffer. Does nothing when the buffer is empty.
 * @param c new byte value
 */
public void changeChar(byte c)
{
    if (this.lexsize <= 0)
    {
        return;
    }
    this.lexbuf[this.lexsize - 1] = c;
}
/**
 * Stores the character <code>c</code> in the lexer buffer as a UTF-8 encoded byte sequence.
 * <p>
 * When producing XML or XHTML output, characters outside the XML <code>Char</code> production are silently
 * dropped. If the encoder reports an error, the replacement character U+FFFD is stored instead.
 * </p>
 * @param c char (code point) to store
 */
public void addCharToLexer(int c)
{
    // Allow only valid XML characters. See: http://www.w3.org/TR/2004/REC-xml-20040204/#NT-Char
    // Fix by Pablo Mayrgundter 17-08-2004
    final boolean xmlOutput = this.configuration.xmlOut || this.configuration.xHTML;
    if (xmlOutput)
    {
        final boolean legalXmlChar = (c >= 0x20 && c <= 0xD7FF) // common case first
            || c == 0x9
            || c == 0xA
            || c == 0xD // white space
            || (c >= 0xE000 && c <= 0xFFFD) // high-range unicode
            || (c >= 0x10000 && c <= 0x10FFFF);
        if (!legalXmlChar)
        {
            return;
        }
    }

    final int[] written = new int[]{0};
    final byte[] utf8 = new byte[10]; // unsigned char
    if (EncodingUtils.encodeCharToUTF8Bytes(c, utf8, null, written))
    {
        // replacement char 0xFFFD encoded as UTF-8
        utf8[0] = (byte) 0xEF;
        utf8[1] = (byte) 0xBF;
        utf8[2] = (byte) 0xBD;
        written[0] = 3;
    }

    int k = 0;
    while (k < written[0])
    {
        addByte(utf8[k]); // uint
        k++;
    }
}
/**
 * Appends every char of a string to the lexer buffer, one {@link #addCharToLexer(int)} call per char.
 * @param str String to add
 */
public void addStringToLexer(String str)
{
    final char[] chars = str.toCharArray();
    for (int k = 0; k < chars.length; k++)
    {
        addCharToLexer(chars[k]);
    }
}
/**
* Parses an html entity or numeric character reference whose leading "&amp;" is the last byte already stored in
* the lexer buffer.
* <p>
* Characters are read from the input until a ';' or a non-name character is found and appended to the buffer. The
* collected text is then looked up in the default entity table. A recognised reference is replaced in the buffer
* by the character it denotes; an unrecognised or invalid one is reported and, except for invalid numeric
* references in the range 128-159 (which are replaced or discarded), left in the buffer as typed.
* </p>
* @param mode lexer mode; when <code>mode &amp; PREFORMATTED</code> is non-zero a non-breaking space (160) is
* stored as a plain space
*/
public void parseEntity(short mode)
{
// No longer attempts to insert missing ';' for unknown
// entities unless one was present already, since this
// gives unexpected results.
//
// For example: <a href="something.htm?foo&bar&fred">
// was tidied to: <a href="something.htm?foo&bar;&fred;">
// rather than: <a href="something.htm?foo&bar&fred">
//
// My thanks for Maurice Buxton for spotting this.
//
// Also Randy Waki pointed out the following case for the
// 04 Aug 00 version (bug #433012):
//
// For example: <a href="something.htm?id=1&lang=en">
// was tidied to: <a href="something.htm?id=1&lang;=en">
// rather than: <a href="something.htm?id=1&lang=en">
//
// where "lang" is a known entity (#9001), but browsers would
// misinterpret "&lang;" because it had a value > 256.
//
// So the case of an apparently known entity with a value > 256 and
// missing a semicolon is handled specially.
//
// "ParseEntity" is also a bit of a misnomer - it handles entities and
// numeric character references. Invalid NCR's are now reported.
int start;
boolean first = true;
boolean semicolon = false;
int c, ch, startcol;
String str;
start = this.lexsize - 1; // to start at "&"
startcol = this.in.getCurcol() - 1;
// collect the entity name (or "#..." reference) into the lexer buffer
while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
{
if (c == ';')
{
// terminator is consumed but not stored; it is re-added below only if the text is kept
semicolon = true;
break;
}
if (first && c == '#')
{
// #431953 - start RJ
// numeric references are left untouched when NCR handling is off or for BIG5/SHIFTJIS input
if (!this.configuration.ncr
|| "BIG5".equals(this.configuration.getInCharEncodingName())
|| "SHIFTJIS".equals(this.configuration.getInCharEncodingName()))
{
this.in.ungetChar(c);
return;
}
// #431953 - end RJ
addCharToLexer(c);
first = false;
continue;
}
first = false;
if (TidyUtils.isNamechar((char) c))
{
addCharToLexer(c);
continue;
}
// otherwise put it back
this.in.ungetChar(c);
break;
}
// str is the reference as typed, including the leading "&" but without any ";"
str = TidyUtils.getString(this.lexbuf, start, this.lexsize - start);
if ("&apos".equals(str) && !configuration.xmlOut && !this.isvoyager && !configuration.xHTML)
{
report.entityError(this, Report.APOS_UNDEFINED, str, 39);
}
ch = EntityTable.getDefaultEntityTable().entityCode(str);
// drops invalid numeric entities from XML mode. Fix by Pablo Mayrgundter 17-08-2004
// if ((this.configuration.xmlOut || this.configuration.xHTML) // only for xml output
// && !((ch >= 0x20 && ch <= 0xD7FF) // Check the common-case first.
// || ch == 0x9 || ch == 0xA || ch == 0xD // Then white-space.
// || (ch >= 0xE000 && ch <= 0xFFFD)))
// {
// this.lexsize = start;
// return;
// }
// deal with unrecognized or invalid entities
// #433012 - fix by Randy Waki 17 Feb 01
// report invalid NCR's - Terry Teague 01 Sep 01
if (ch <= 0 || (ch >= 256 && c != ';'))
{
// set error position just before offending character
this.lines = this.in.getCurline();
this.columns = startcol;
if (this.lexsize > start + 1)
{
// something follows the "&" in the buffer
if (ch >= 128 && ch <= 159)
{
// invalid numeric character reference
int c1 = 0;
if ("WIN1252".equals(configuration.replacementCharEncoding))
{
c1 = EncodingUtils.decodeWin1252(ch);
}
else if ("MACROMAN".equals(configuration.replacementCharEncoding))
{
c1 = EncodingUtils.decodeMacRoman(ch);
}
// "or" DISCARDED_CHAR with the other errors if discarding char; otherwise default is replacing
int replaceMode = c1 != 0 ? Report.REPLACED_CHAR : Report.DISCARDED_CHAR;
if (c != ';') /* issue warning if not terminated by ';' */
{
report.entityError(this, Report.MISSING_SEMICOLON_NCR, str, c);
}
report.encodingError(this, (short) (Report.INVALID_NCR | replaceMode), ch);
if (c1 != 0)
{
// make the replacement
this.lexsize = start;
addCharToLexer(c1);
semicolon = false;
}
else
{
/* discard */
this.lexsize = start;
semicolon = false;
}
}
else
{
report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);
}
// keep the ';' the author typed when the reference text stays in the buffer
if (semicolon)
{
addCharToLexer(';');
}
}
else
{
// naked &
report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
}
}
else
{
// issue warning if not terminated by ';'
if (c != ';')
{
// set error position just before offending character
this.lines = this.in.getCurline();
this.columns = startcol;
report.entityError(this, Report.MISSING_SEMICOLON, str, c);
}
// drop the reference text and store the character it stands for
this.lexsize = start;
if (ch == 160 && TidyUtils.toBoolean(mode & PREFORMATTED))
{
ch = ' ';
}
addCharToLexer(ch);
// a decoded '&' is stored back in its escaped form "&amp;" when quoteAmpersand is off
if (ch == '&' && !this.configuration.quoteAmpersand)
{
addCharToLexer('a');
addCharToLexer('m');
addCharToLexer('p');
addCharToLexer(';');
}
}
}
/**
 * Parses a tag name whose first character is already stored in the lexer buffer at <code>txtstart</code>.
 * <p>
 * Name characters are read from the input and appended to the buffer; unless XML tags are enabled, upper case
 * characters (including the first one) are folded to lower case. On return <code>txtend</code> marks the end of
 * the name.
 * </p>
 * @return first char after the tag name (the end-of-stream marker cast to <code>char</code> if input ran out)
 */
public char parseTagName()
{
    final boolean foldCase = !this.configuration.xmlTags;

    // fold case of first char in buffer
    final int first = this.lexbuf[this.txtstart];
    if (foldCase && TidyUtils.isUpper((char) first))
    {
        this.lexbuf[this.txtstart] = (byte) TidyUtils.toLower((char) first);
    }

    int c = this.in.readChar();
    while (c != StreamIn.END_OF_STREAM && TidyUtils.isNamechar((char) c))
    {
        // fold case of subsequent chars
        if (foldCase && TidyUtils.isUpper((char) c))
        {
            c = TidyUtils.toLower((char) c);
        }
        addCharToLexer(c);
        c = this.in.readChar();
    }

    this.txtend = this.lexsize;
    return (char) c;
}
/**
 * Appends a string to the lexer buffer by calling {@link #addCharToLexer(int)} for each of its chars.
 * @param str input String
 */
public void addStringLiteral(String str)
{
    final int total = str.length();
    int pos = 0;
    while (pos < total)
    {
        addCharToLexer(str.charAt(pos));
        pos++;
    }
}
/**
 * Appends at most <code>len</code> leading chars of a string to the lexer buffer, calling
 * {@link #addCharToLexer(int)} for each of them. If the string is shorter than <code>len</code> the whole string
 * is added.
 * @param str input String
 * @param len length of the substring to be added
 */
void addStringLiteralLen(String str, int len)
{
    final int limit = Math.min(str.length(), len);
    int pos = 0;
    while (pos < limit)
    {
        addCharToLexer(str.charAt(pos));
        pos++;
    }
}
/**
 * Chooses the version to use for a new doctype from the set of versions still compatible with the document.
 * <p>
 * Candidates are tried in this order: HTML 2.0, HTML 3.2 (only when not producing XML, not using XML tags and no
 * XHTML namespace was seen), XHTML 1.1, HTML 4 strict, HTML 4 loose, frameset.
 * </p>
 * @return html version constant, or <code>Dict.VERS_UNKNOWN</code> if none of the candidates applies
 */
public short htmlVersion()
{
    if (TidyUtils.toBoolean(versions & Dict.VERS_HTML20))
    {
        return Dict.VERS_HTML20;
    }

    final boolean xmlFlavoured = this.configuration.xmlOut | this.configuration.xmlTags | this.isvoyager;
    if (!xmlFlavoured && TidyUtils.toBoolean(versions & Dict.VERS_HTML32))
    {
        return Dict.VERS_HTML32;
    }

    // remaining candidates, in order of preference
    final short[] candidates = {
        Dict.VERS_XHTML11,
        Dict.VERS_HTML40_STRICT,
        Dict.VERS_HTML40_LOOSE,
        Dict.VERS_FRAMESET};
    for (int k = 0; k < candidates.length; k++)
    {
        if (TidyUtils.toBoolean(versions & candidates[k]))
        {
            return candidates[k];
        }
    }
    return Dict.VERS_UNKNOWN;
}
/**
 * Returns the display name of the version the document appears to use. The version is determined by
 * {@link #apparentVersion()}; the first matching table entry supplies the name, using the XHTML name when the
 * XHTML namespace was seen on the html element.
 * @return html version name, or <code>null</code> if the version is not in the table
 */
public String htmlVersionName()
{
    final short code = apparentVersion();

    int idx = 0;
    while (idx < W3CVERSION.length)
    {
        final Lexer.W3CVersionInfo info = W3CVERSION[idx];
        if (info.code == code)
        {
            return this.isvoyager ? info.voyagerName : info.name;
        }
        ++idx;
    }
    return null;
}
/**
 * Adds a <code>meta name="generator"</code> element for Tidy at the start of the head. If a generator meta tag
 * whose content starts with "HTML Tidy" is already present, its content is updated with the current release date
 * instead.
 * @param root root node
 * @return <code>true</code> if the tag has been added; <code>false</code> if an existing tag was updated or the
 * document has no head
 */
public boolean addGenerator(Node root)
{
    final Node head = root.findHEAD(this.configuration.tt);
    if (head == null)
    {
        return false;
    }

    final String meta = "HTML Tidy for Java (vers. " + Report.RELEASE_DATE_STRING + "), see jtidy.sourceforge.net";

    for (Node child = head.content; child != null; child = child.next)
    {
        if (child.tag != this.configuration.tt.tagMeta)
        {
            continue;
        }
        final AttVal nameAttr = child.getAttrByName("name");
        if (nameAttr == null || nameAttr.value == null || !"generator".equalsIgnoreCase(nameAttr.value))
        {
            continue;
        }
        final AttVal contentAttr = child.getAttrByName("content");
        if (contentAttr != null
            && contentAttr.value != null
            && contentAttr.value.regionMatches(true, 0, "HTML Tidy", 0, 9))
        {
            // our own generator tag: refresh it in place
            contentAttr.value = meta;
            return false;
        }
    }

    final Node generator = this.inferredTag("meta");
    generator.addAttribute("content", meta);
    generator.addAttribute("name", "generator");
    head.insertNodeAtStart(generator);
    return true;
}
/**
 * Checks the doctype keywords SYSTEM, PUBLIC, //DTD, //W3C and //EN (keywords should be uppercase). The check is
 * delegated to <code>TidyUtils.findBadSubString</code> for each keyword in turn, stopping at the first hit.
 * @param doctype doctype node
 * @return true if doctype keywords are all uppercase
 */
public boolean checkDocTypeKeyWords(Node doctype)
{
    final String text = TidyUtils.getString(this.lexbuf, doctype.start, doctype.end - doctype.start);
    final String[] keywords = {"SYSTEM", "PUBLIC", "//DTD", "//W3C", "//EN"};

    for (int k = 0; k < keywords.length; k++)
    {
        if (TidyUtils.findBadSubString(keywords[k], text, text.length()))
        {
            return false;
        }
    }
    return true;
}
/**
 * Examines a DOCTYPE declaration to identify the HTML version it names.
 * <p>
 * The doctype text is read from the lexer buffer. Besides identifying the version, this method warns when the
 * doctype keywords are not upper case, rewrites a lower/mixed case SYSTEM or PUBLIC keyword in place, and sets
 * <code>badDoctype</code> when neither keyword follows the root element name.
 * </p>
 * @param doctype doctype node
 * @return version code, or 0 if the root element is not html, only a system identifier is given, or the public
 * identifier is not recognised
 */
public short findGivenVersion(Node doctype)
{
    String p, s;
    int i, j;
    int len;
    String str1;
    String str2;

    // if root tag for doctype isn't html give up now
    str1 = TidyUtils.getString(this.lexbuf, doctype.start, 5);
    if (!"html ".equalsIgnoreCase(str1))
    {
        return 0;
    }

    if (!checkDocTypeKeyWords(doctype))
    {
        report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
    }

    // give up if all we are given is the system id for the doctype
    str1 = TidyUtils.getString(this.lexbuf, doctype.start + 5, 7);
    if ("SYSTEM ".equalsIgnoreCase(str1))
    {
        // but at least ensure the case is correct
        if (!str1.substring(0, 6).equals("SYSTEM"))
        {
            System.arraycopy(TidyUtils.getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6);
        }
        return 0; // unrecognized
    }

    if ("PUBLIC ".equalsIgnoreCase(str1))
    {
        if (!str1.substring(0, 6).equals("PUBLIC"))
        {
            // only the 6 keyword bytes are overwritten
            System.arraycopy(TidyUtils.getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6);
        }
    }
    else
    {
        this.badDoctype = true;
    }

    // the public identifier is the first quoted string
    for (i = doctype.start; i < doctype.end; ++i)
    {
        if (this.lexbuf[i] == (byte) '"')
        {
            str1 = TidyUtils.getString(this.lexbuf, i + 1, 12);
            str2 = TidyUtils.getString(this.lexbuf, i + 1, 13);
            if (str1.equals("-//W3C//DTD "))
            {
                // compute length of identifier e.g. "HTML 4.0 Transitional"
                for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
                {
                    //
                }
                len = j - i - 13;
                p = TidyUtils.getString(this.lexbuf, i + 13, len);

                // NOTE(review): the scan starts at index 1, so a plain "HTML 4.01" identifier is matched by the
                // last table entry rather than the first - confirm this is intended before changing it.
                for (j = 1; j < W3CVERSION.length; ++j)
                {
                    s = W3CVERSION[j].name;
                    if (len == s.length() && s.equals(p))
                    {
                        return W3CVERSION[j].code;
                    }
                }
                // else unrecognized version
            }
            else if (str2.equals("-//IETF//DTD "))
            {
                // compute length of identifier e.g. "HTML 2.0"
                for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j)
                {
                    //
                }
                len = j - i - 14;
                p = TidyUtils.getString(this.lexbuf, i + 14, len);

                // The only IETF DTD in the table is HTML 2.0. Look it up by its version code instead of relying
                // on it being the first table entry (the first entry is "HTML 4.01", which never matched here).
                for (j = 0; j < W3CVERSION.length; ++j)
                {
                    if (W3CVERSION[j].code == Dict.VERS_HTML20)
                    {
                        s = W3CVERSION[j].name;
                        if (len == s.length() && s.equals(p))
                        {
                            return W3CVERSION[j].code;
                        }
                    }
                }
                // else unrecognized version
            }
            break;
        }
    }
    return 0;
}
/**
 * Makes sure the html element carries an <code>xmlns</code> attribute with the given value.
 * <p>
 * If the attribute exists with a different (or missing) value, a warning is reported and the value is replaced.
 * If it does not exist, it is added at the head of the attribute list. Nothing happens when the document has no
 * html element directly below the root.
 * </p>
 * @param root root Node
 * @param profile namespace URI to set
 */
public void fixHTMLNameSpace(Node root, String profile)
{
    Node node = root.content;
    while (node != null && node.tag != this.configuration.tt.tagHtml)
    {
        node = node.next;
    }
    if (node == null)
    {
        return;
    }

    AttVal attr;
    for (attr = node.attributes; attr != null; attr = attr.next)
    {
        // constant-first comparison: entries in the attribute list may have no name
        // (e.g. those that only carry an asp/php node), which must not raise a NullPointerException
        if ("xmlns".equals(attr.attribute))
        {
            break;
        }
    }

    if (attr != null)
    {
        // an xmlns attribute without a value is treated like one with the wrong value
        if (attr.value == null || !attr.value.equals(profile))
        {
            report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
            attr.value = profile;
        }
    }
    else
    {
        attr = new AttVal(node.attributes, null, '"', "xmlns", profile);
        attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr);
        node.attributes = attr;
    }
}
/**
 * Creates an empty DOCTYPE node and links it into the tree immediately before the <code>html</code> element, i.e.
 * between the <code>&lt;?xml version="1.0" ... ?&gt;</code> declaration, if any, and the <code>html</code> tag.
 * Should also work for any comments, etc. that may precede the <code>html</code> tag.
 * @param root root node
 * @return new doctype node, or <code>null</code> if the document has no html element
 */
Node newXhtmlDocTypeNode(Node root)
{
    final Node html = root.findHTML(this.configuration.tt);
    if (html == null)
    {
        return null;
    }

    final Node inserted = newNode();
    inserted.setType(Node.DOCTYPE_TAG);
    inserted.parent = root;
    inserted.next = html;

    if (html == root.content)
    {
        // No <?xml ... ?> declaration: the doctype becomes the first child.
        inserted.prev = null;
        root.content = inserted;
    }
    else
    {
        // we have an <?xml ... ?> declaration (or other nodes) in front of html.
        final Node before = html.prev;
        inserted.prev = before;
        before.next = inserted;
    }
    html.prev = inserted;
    return inserted;
}
/**
 * Sets (or replaces) the XHTML doctype of the document and fixes the namespace on the html element.
 * <p>
 * The public and system identifiers are chosen according to the configured doctype mode. In "auto" mode they are
 * derived from the versions the document is still compatible with; if none fits, an existing doctype is discarded
 * and nothing is written. An internal DTD subset found in the existing doctype is carried over when producing
 * XML or XHTML. The resulting declaration text is stored in the doctype node's own text array.
 * </p>
 * @param root root node
 * @return <code>true</code> only when the doctype mode is "omit"; <code>false</code> in every other case,
 * including when a doctype has been written (NOTE(review): this does not match the name of the method - confirm
 * what callers expect before changing it)
 */
public boolean setXHTMLDocType(Node root)
{
    String fpi = " ";
    String sysid = "";
    String namespace = XHTML_NAMESPACE;
    String dtdsub = null;
    Node doctype;
    int dtdlen = 0;

    doctype = root.findDocType();
    fixHTMLNameSpace(root, namespace); // #427839 - fix by Evan Lenz 05 Sep 00

    if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
    {
        if (doctype != null)
        {
            Node.discardElement(doctype);
        }
        return true;
    }

    if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
    {
        // see what flavor of XHTML this document matches
        if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
        {
            // use XHTML strict
            fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
            sysid = VOYAGER_STRICT;
        }
        else if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
        {
            // use XHTML frames
            fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
            sysid = VOYAGER_FRAMESET;
        }
        else if (TidyUtils.toBoolean(this.versions & Dict.VERS_LOOSE))
        {
            fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
            sysid = VOYAGER_LOOSE;
        }
        else if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
        {
            // use XHTML 1.1
            fpi = "-//W3C//DTD XHTML 1.1//EN";
            sysid = VOYAGER_11;
        }
        else
        {
            // proprietary
            fpi = null;
            sysid = "";
            if (doctype != null) // #473490 - fix by Björn Höhrmann 10 Oct 01
            {
                Node.discardElement(doctype);
            }
        }
    }
    else if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
    {
        fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
        sysid = VOYAGER_STRICT;
    }
    else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
    {
        fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
        sysid = VOYAGER_LOOSE;
    }

    // An empty user doctype string is treated like a missing one (as fixDocType does); otherwise the
    // fpi.charAt(0) check below would throw StringIndexOutOfBoundsException.
    if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER
        && this.configuration.docTypeStr != null
        && this.configuration.docTypeStr.length() > 0)
    {
        fpi = this.configuration.docTypeStr;
        sysid = "";
    }

    if (fpi == null)
    {
        return false;
    }

    if (doctype != null)
    {
        // Look for internal DTD subset
        if (configuration.xHTML || configuration.xmlOut)
        {
            int len = doctype.end - doctype.start + 1;
            String start = TidyUtils.getString(this.lexbuf, doctype.start, len);
            int dtdbeg = start.indexOf('[');
            if (dtdbeg >= 0)
            {
                // dtdend is relative to dtdbeg, so dtdlen covers "[" up to and including "]"
                int dtdend = start.substring(dtdbeg).indexOf(']');
                if (dtdend >= 0)
                {
                    dtdlen = dtdend + 1;
                    dtdsub = start.substring(dtdbeg);
                }
            }
        }
    }
    else
    {
        if ((doctype = newXhtmlDocTypeNode(root)) == null)
        {
            return false;
        }
    }

    // the declaration text is assembled at the end of the lexer buffer
    this.txtstart = this.lexsize;
    this.txtend = this.lexsize;

    // add public identifier
    addStringLiteral("html PUBLIC ");

    // check if the fpi is quoted or not (fpi is never empty at this point)
    if (fpi.charAt(0) == '"')
    {
        addStringLiteral(fpi);
    }
    else
    {
        addStringLiteral("\"");
        addStringLiteral(fpi);
        addStringLiteral("\"");
    }

    if (this.configuration.wraplen != 0 && sysid.length() + 6 >= this.configuration.wraplen)
    {
        addStringLiteral("\n\"");
    }
    else
    {
        // FG: don't wrap
        addStringLiteral(" \"");
    }

    // add system identifier
    addStringLiteral(sysid);
    addStringLiteral("\"");

    if (dtdlen > 0 && dtdsub != null)
    {
        addCharToLexer(' ');
        addStringLiteralLen(dtdsub, dtdlen);
    }

    this.txtend = this.lexsize;

    // give the doctype node a private copy of the text
    int length = this.txtend - this.txtstart;
    doctype.textarray = new byte[length];
    System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
    doctype.start = 0;
    doctype.end = length;

    return false;
}
/**
 * Returns the html version used in the document.
 * <p>
 * When no doctype was identified, the version is chosen by {@link #htmlVersion()}. When the version named by the
 * doctype is still compatible with the document content, that version is returned. Otherwise an "inconsistent
 * version" warning is reported (positioned at line 1, column 1) and the version chosen by
 * {@link #htmlVersion()} is returned.
 * </p>
 * @return version code
 */
public short apparentVersion()
{
    final int declared = this.doctype;
    if (declared == Dict.VERS_UNKNOWN)
    {
        return htmlVersion();
    }

    switch (declared)
    {
        case Dict.VERS_HTML20 :
        case Dict.VERS_HTML32 :
        case Dict.VERS_HTML40_STRICT :
        case Dict.VERS_HTML40_LOOSE :
        case Dict.VERS_FRAMESET :
        case Dict.VERS_XHTML11 :
            // the declared version is kept as long as the content is compatible with it
            if (TidyUtils.toBoolean(this.versions & declared))
            {
                return (short) declared;
            }
            break; // to replace old version by new
        default :
            // should never reach here
            break;
    }

    // kludge to avoid error appearing at end of file
    // it would be better to note the actual position
    // when first encountering the doctype declaration
    this.lines = 1;
    this.columns = 1;
    report.warning(this, null, null, Report.INCONSISTENT_VERSION);
    return this.htmlVersion();
}
/**
* Fixes up the doctype: adds one if missing, or replaces one that does not fit the document or the configured
* doctype mode.
* <p>
* Returns early with <code>true</code> when the doctype is to be omitted (an existing one is discarded), when
* producing XML output, or when in "auto" mode the existing doctype is consistent with the document content.
* Otherwise the declaration text is written at the end of the lexer buffer and copied into the doctype node's own
* text array.
* </p>
* @param root root node
* @return <code>false</code> if current version has not been identified (or no doctype node could be created),
* <code>true</code> otherwise
*/
public boolean fixDocType(Node root)
{
Node doctype;
int guessed = Dict.VERS_HTML40_STRICT, i;
if (this.badDoctype)
{
report.warning(this, null, null, Report.MALFORMED_DOCTYPE);
}
doctype = root.findDocType();
if (this.configuration.docTypeMode == Configuration.DOCTYPE_OMIT)
{
if (doctype != null)
{
Node.discardElement(doctype);
}
return true;
}
if (this.configuration.xmlOut)
{
return true;
}
// strict/loose modes always replace whatever doctype the document had
if (this.configuration.docTypeMode == Configuration.DOCTYPE_STRICT)
{
Node.discardElement(doctype);
doctype = null;
guessed = Dict.VERS_HTML40_STRICT;
}
else if (this.configuration.docTypeMode == Configuration.DOCTYPE_LOOSE)
{
Node.discardElement(doctype);
doctype = null;
guessed = Dict.VERS_HTML40_LOOSE;
}
else if (this.configuration.docTypeMode == Configuration.DOCTYPE_AUTO)
{
if (doctype != null)
{
// a doctype is present but its version was not recognised: leave it alone
if (this.doctype == Dict.VERS_UNKNOWN)
{
return false;
}
// keep the existing doctype if the content is still compatible with the version it declares
switch (this.doctype)
{
case Dict.VERS_UNKNOWN :
return false;
case Dict.VERS_HTML20 :
if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML20))
{
return true;
}
break; // to replace old version by new
case Dict.VERS_HTML32 :
if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML32))
{
return true;
}
break; // to replace old version by new
case Dict.VERS_HTML40_STRICT :
if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_STRICT))
{
return true;
}
break; // to replace old version by new
case Dict.VERS_HTML40_LOOSE :
if (TidyUtils.toBoolean(this.versions & Dict.VERS_HTML40_LOOSE))
{
return true;
}
break; // to replace old version by new
case Dict.VERS_FRAMESET :
if (TidyUtils.toBoolean(this.versions & Dict.VERS_FRAMESET))
{
return true;
}
break; // to replace old version by new
case Dict.VERS_XHTML11 :
if (TidyUtils.toBoolean(this.versions & Dict.VERS_XHTML11))
{
return true;
}
break; // to replace old version by new
default :
// should never reach here
break;
}
// INCONSISTENT_VERSION warning is now issued by ApparentVersion()
}
// choose new doctype
guessed = htmlVersion();
}
// in user mode none of the branches above ran, so guessed is still the HTML 4 strict default
if (guessed == Dict.VERS_UNKNOWN)
{
return false;
}
// for XML use the Voyager system identifier
if (this.configuration.xmlOut || this.configuration.xmlTags || this.isvoyager)
{
if (doctype != null)
{
Node.discardElement(doctype);
}
fixHTMLNameSpace(root, XHTML_NAMESPACE);
// Namespace is the same for all XHTML variants
// Also, don't return yet. Still need to add DOCTYPE declaration.
//
// for (i = 0; i < W3CVersion.length; ++i)
// {
// if (guessed == W3CVersion[i].code)
// {
// fixHTMLNameSpace(root, W3CVersion[i].profile);
// break;
// }
// }
// return true;
}
if (doctype == null)
{
if ((doctype = newXhtmlDocTypeNode(root)) == null)
{
return false;
}
}
// the declaration text is assembled at the end of the lexer buffer
this.txtstart = this.lexsize;
this.txtend = this.lexsize;
// use the appropriate public identifier
addStringLiteral("html PUBLIC ");
if (this.configuration.docTypeMode == Configuration.DOCTYPE_USER
&& this.configuration.docTypeStr != null
&& this.configuration.docTypeStr.length() > 0)
{
// check if the fpi is quoted or not
if (this.configuration.docTypeStr.charAt(0) == '"')
{
addStringLiteral(this.configuration.docTypeStr);
}
else
{
addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
addStringLiteral(this.configuration.docTypeStr);
addStringLiteral("\""); // #431889 - fix by Dave Bryan 04 Jan 2001
}
}
else if (guessed == Dict.VERS_HTML20)
{
addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
}
else
{
addStringLiteral("\"-//W3C//DTD ");
// the first table entry with a matching code supplies the version name
for (i = 0; i < W3CVERSION.length; ++i)
{
if (guessed == W3CVERSION[i].code)
{
addStringLiteral(W3CVERSION[i].name);
break;
}
}
addStringLiteral("//EN\"");
}
this.txtend = this.lexsize;
// give the doctype node a private copy of the text
int length = this.txtend - this.txtstart;
doctype.textarray = new byte[length];
System.arraycopy(this.lexbuf, this.txtstart, doctype.textarray, 0, length);
doctype.start = 0;
doctype.end = length;
return true;
}
/**
 * Ensures the document starts with an XML declaration carrying a <code>version="1.0"</code> pseudo-attribute.
 * An <code>encoding</code> pseudo-attribute is added when none is present and the output encoding is
 * ISO8859_1 or ISO2022; no encoding is added for UTF8 or any other output encoding.
 * @param root root node
 * @return always true
 */
public boolean fixXmlDecl(Node root)
{
    final Node xmlDecl;

    if (root.content == null || root.content.type != Node.XML_DECL)
    {
        // no declaration yet: create an empty one and make it the first child
        xmlDecl = newNode(Node.XML_DECL, this.lexbuf, 0, 0);
        root.insertNodeAtStart(xmlDecl);
    }
    else
    {
        xmlDecl = root.content;
    }

    final AttVal versionAttr = xmlDecl.getAttrByName("version");
    final AttVal encodingAttr = xmlDecl.getAttrByName("encoding");

    // We need to insert a check if declared encoding and output encoding mismatch
    // and fix the Xml declaration accordingly!!!
    if (encodingAttr == null)
    {
        if (!"UTF8".equals(this.configuration.getOutCharEncodingName()))
        {
            if ("ISO8859_1".equals(this.configuration.getOutCharEncodingName()))
            {
                xmlDecl.addAttribute("encoding", "iso-8859-1");
            }
            if ("ISO2022".equals(this.configuration.getOutCharEncodingName()))
            {
                xmlDecl.addAttribute("encoding", "iso-2022");
            }
        }
    }

    if (versionAttr == null)
    {
        xmlDecl.addAttribute("version", "1.0");
    }

    return true;
}
/**
 * Creates a start tag node for the given element name and flags it as implicit, i.e. as a tag that was
 * inferred rather than read from the input. The node refers to the current text range of the lexer buffer.
 * @param name tag name
 * @return generated node
 */
public Node inferredTag(String name)
{
    final Node inferred = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend, name);
    inferred.implicit = true;
    return inferred;
}
/**
 * getCDATA() scanner state: reading ordinary content, looking for the next '&lt;'.
 */
private static final int CDATA_INTERMEDIATE = 0;
/**
 * getCDATA() scanner state: reading a start tag name, entered after '&lt;' followed by a letter.
 */
private static final int CDATA_STARTTAG = 1;
/**
 * getCDATA() scanner state: reading an end tag name, entered after '&lt;/' (or the escaped form '&lt;\/')
 * followed by a letter.
 */
private static final int CDATA_ENDTAG = 2;
/**
 * Create a text node for the contents of a CDATA element like style or script which ends with &lt;/foo&gt; for
 * some foo.
 * <p>
 * The content is copied into the lexer buffer until an end tag whose name matches the container element is
 * found; that end tag is pushed back onto the input so it can be read as a regular token. Start tags with the
 * container's name that appear inside the content are counted, and a matching end tag only terminates the
 * content when no such nested start tag is outstanding. Any other end tag that is not already escaped with a
 * backslash is reported as BAD_CDATA_CONTENT and, for JavaScript containers, gets a backslash inserted before
 * the '/'.
 * </p>
 * @param container container node
 * @return cdata node, or <code>null</code> if a still empty script element with a src attribute is directly
 * followed by something that looks like a start tag
 */
public Node getCDATA(Node container)
{
// index in lexbuf of the first letter of the tag name currently being scanned
int start = 0;
// number of start tags with the container's name seen inside the content
int nested = 0;
int state = CDATA_INTERMEDIATE;
int c;
// true as long as only white space has been seen
boolean isEmpty = true;
boolean matches = false;
boolean hasSrc = container.getAttrByName("src") != null;
this.lines = this.in.getCurline();
this.columns = this.in.getCurcol();
this.waswhite = false;
this.txtstart = this.lexsize;
this.txtend = this.lexsize;
/* seen start tag, look for matching end tag */
while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM) {
addCharToLexer(c);
txtend = lexsize;
if (state == CDATA_INTERMEDIATE) {
if (c != '<') {
if (isEmpty && !TidyUtils.isWhite((char) c)) {
isEmpty = false;
}
continue;
}
// '<' seen: look at the following character to decide what kind of markup this is
c = in.readChar();
if (TidyUtils.isLetter((char) c)) {
/* <head><script src=foo><meta name=foo content=bar>*/
if (hasSrc && isEmpty && container.tag == configuration.tt.tagScript) {
/* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
// discard everything collected so far and hand the start tag back to the input
lexsize = txtstart;
in.ungetChar(c);
in.ungetChar('<');
return null;
}
addCharToLexer(c);
// the letter just added is the first character of the tag name
start = lexsize - 1;
state = CDATA_STARTTAG;
} else if (c == '/') {
addCharToLexer(c);
c = in.readChar();
if (!TidyUtils.isLetter((char) c)) {
in.ungetChar(c);
continue;
}
// the letter is pushed back and re-read by the main loop, so the name starts at lexsize
in.ungetChar(c);
start = lexsize;
state = CDATA_ENDTAG;
} else if (c == '\\') {
/* recognize document.write("<script><\/script>") */
addCharToLexer(c);
c = in.readChar();
if (c != '/') {
in.ungetChar(c);
continue;
}
addCharToLexer(c);
c = in.readChar();
if (!TidyUtils.isLetter((char) c)) {
in.ungetChar(c);
continue;
}
in.ungetChar(c);
start = lexsize;
state = CDATA_ENDTAG;
} else {
// not markup: let the main loop handle the character
in.ungetChar(c);
}
} else if (state == CDATA_STARTTAG) {
/* '<' + Letter found */
if (TidyUtils.isLetter((char) c)) {
continue;
}
// first non-letter after the name: compare the name with the container's element name
matches = container.element.equalsIgnoreCase(TidyUtils.getString(lexbuf, start,
container.element.length()));
if (matches) {
nested++;
}
state = CDATA_INTERMEDIATE;
} else if (state == CDATA_ENDTAG) {
/* '<' + '/' + Letter found */
if (TidyUtils.isLetter((char) c)) {
continue;
}
matches = container.element.equalsIgnoreCase(TidyUtils.getString(lexbuf, start,
container.element.length()));
if (isEmpty && !matches) {
/* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
// no content yet and a foreign end tag: push the end tag back onto the input and stop
for (int i = lexsize - 1; i >= start; --i) {
in.ungetChar(lexbuf[i]);
}
in.ungetChar('/');
in.ungetChar('<');
break;
}
// nested is decremented for every matching end tag, whether or not it ends the content
if (matches && nested-- <= 0) {
// the closing tag of the container: push it back and remove it from the buffer
for (int i = lexsize - 1; i >= start; --i) {
in.ungetChar(lexbuf[i]);
}
in.ungetChar('/');
in.ungetChar('<');
// drop the name (and the character following it) plus the two characters before the name
lexsize -= (lexsize - start) + 2;
break;
} else if (lexbuf[start - 2] != '\\') {
/* if the end tag is not already escaped using backslash */
lines = in.getCurline();
columns = in.getCurcol();
columns -= 3;
report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
/* if javascript insert backslash before / */
if (container.isJavaScript()) {
// shift the buffer content from the '/' onwards one position to the right
for (int i = lexsize; i > start-1; --i) {
lexbuf[i] = lexbuf[i-1];
}
lexbuf[start-1] = '\\';
lexsize++;
}
}
state = CDATA_INTERMEDIATE;
}
}
if (isEmpty) {
// only white space was seen: the returned node covers an empty range
lexsize = txtstart = txtend;
} else {
txtend = lexsize;
}
if (c == StreamIn.END_OF_STREAM) {
report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);
}
/* this was disabled for some reason... */
// #if 0
// if (lexer->txtend > lexer->txtstart)
// return TextToken(lexer);
// else
// return NULL;
// #else
return newNode(Node.TEXT_NODE, lexbuf, txtstart, txtend);
// #endif
}
/**
 * Marks the current token as pushed back, so that the next call to <code>getToken()</code> can hand out the
 * same token again instead of reading new input.
 */
public void ungetToken()
{
    pushed = true;
}
/**
 * Gets a token.
 * <p>
 * A token pushed back with <code>ungetToken()</code> is returned first, unless it is a text node and inline
 * elements are waiting to be inserted; in that case, and whenever inline elements are pending,
 * <code>insertedToken()</code> supplies the result. Otherwise characters are read from the input and fed
 * through a state machine (LEX_CONTENT, LEX_GT, LEX_STARTTAG, ...) until a complete token has been recognized.
 * Text that precedes markup is returned as a text node of its own; the markup is then recognized by the
 * following call, since the lexer state is kept between calls.
 * </p>
 * @param mode one of the following:
 * <ul>
 * <li><code>IgnoreWhitespace</code>-- leading white space is discarded</li>
 * <li><code>MixedContent</code>-- for elements which don't accept PCDATA</li>
 * <li><code>Preformatted</code>-- white space preserved as is</li>
 * <li><code>IgnoreMarkup</code>-- for CDATA elements such as script, style</li>
 * </ul>
 * @return next Node, or <code>null</code> if the input ended without yielding another token
 */
public Node getToken(short mode)
{
int c = 0;
// number of malformed "--" sequences found inside the current comment
int badcomment = 0;
// pass by reference
boolean[] isempty = new boolean[1];
// true while inside the [...] internal subset of a doctype declaration
boolean inDTDSubset = false;
AttVal attributes = null;
if (this.pushed)
{
// duplicate inlines in preference to pushed text nodes when appropriate
if (this.token.type != Node.TEXT_NODE || (this.insert == -1 && this.inode == null))
{
this.pushed = false;
return this.token;
}
}
// at start of block elements, unclosed inline
if (this.insert != -1 || this.inode != null)
{
return insertedToken();
}
// remember where the token starts, both in the input and in the lexer buffer
this.lines = this.in.getCurline();
this.columns = this.in.getCurcol();
this.waswhite = false;
this.txtstart = this.lexsize;
this.txtend = this.lexsize;
while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
{
// FG fix for [427846] different from tidy
// if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
if (this.insertspace && mode != IGNORE_WHITESPACE)
{
addCharToLexer(' ');
}
if (this.insertspace && (!TidyUtils.toBoolean(mode & IGNORE_WHITESPACE)))
{
this.waswhite = true;
this.insertspace = false;
}
// treat \r\n as \n and \r as \n
if (c == '\r')
{
c = this.in.readChar();
if (c != '\n')
{
this.in.ungetChar(c);
}
c = '\n';
}
// every character is appended first; the states below remove it again where needed
addCharToLexer(c);
switch (this.state)
{
case LEX_CONTENT :
// element content
// Discard white space if appropriate.
// Its cheaper to do this here rather than in parser methods for elements that
// don't have mixed content.
if (TidyUtils.isWhite((char) c) && (mode == IGNORE_WHITESPACE) && this.lexsize == this.txtstart + 1)
{
--this.lexsize;
this.waswhite = false;
this.lines = this.in.getCurline();
this.columns = this.in.getCurcol();
continue;
}
if (c == '<')
{
this.state = LEX_GT;
continue;
}
if (TidyUtils.isWhite((char) c))
{
// was previous char white?
if (this.waswhite)
{
// collapse runs of white space unless it has to be preserved
if (mode != PREFORMATTED && mode != IGNORE_MARKUP)
{
--this.lexsize;
this.lines = this.in.getCurline();
this.columns = this.in.getCurcol();
}
}
else
{
// prev char wasn't white
this.waswhite = true;
// normalize other white space characters to a plain space
if (mode != PREFORMATTED && mode != IGNORE_MARKUP && c != ' ')
{
changeChar((byte) ' ');
}
}
continue;
}
else if (c == '&' && mode != IGNORE_MARKUP)
{
parseEntity(mode);
}
// this is needed to avoid trimming trailing whitespace
if (mode == IGNORE_WHITESPACE)
{
mode = MIXED_CONTENT;
}
this.waswhite = false;
continue;
case LEX_GT :
// <
// check for endtag
if (c == '/')
{
c = this.in.readChar();
if (c == StreamIn.END_OF_STREAM)
{
this.in.ungetChar(c);
continue;
}
addCharToLexer(c);
if (TidyUtils.isLetter((char) c))
{
// remove '<', '/' and the letter from the buffer; the letter is pushed back
// and read again by the main loop once the state is LEX_ENDTAG
this.lexsize -= 3;
this.txtend = this.lexsize;
this.in.ungetChar(c);
this.state = LEX_ENDTAG;
this.lexbuf[this.lexsize] = (byte) '\0'; // debug
// changed from
// this.in.curcol -= 2;
this.columns -= 2;
// if some text before the </ return it now
if (this.txtend > this.txtstart)
{
// trim space char before end tag
if (mode == IGNORE_WHITESPACE && this.lexbuf[this.lexsize - 1] == (byte) ' ')
{
this.lexsize -= 1;
this.txtend = this.lexsize;
}
this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
return this.token;
}
continue; // no text so keep going
}
// otherwise treat as CDATA
this.waswhite = false;
this.state = LEX_CONTENT;
continue;
}
if (mode == IGNORE_MARKUP)
{
// otherwise treat as CDATA
this.waswhite = false;
this.state = LEX_CONTENT;
continue;
}
// look out for comments, doctype or marked sections this isn't quite right, but its getting there
if (c == '!')
{
c = this.in.readChar();
if (c == '-')
{
c = this.in.readChar();
if (c == '-')
{
this.state = LEX_COMMENT; // comment
// remove "<!" from the buffer
this.lexsize -= 2;
this.txtend = this.lexsize;
// if some text before < return it now
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
return this.token;
}
this.txtstart = this.lexsize;
continue;
}
// "<!-" not followed by a second '-': warn and skip the construct below
report.warning(this, null, null, Report.MALFORMED_COMMENT);
}
else if (c == 'd' || c == 'D')
{
this.state = LEX_DOCTYPE; // doctype
this.lexsize -= 2;
this.txtend = this.lexsize;
mode = IGNORE_WHITESPACE;
// skip until white space or '>'
for (;;)
{
c = this.in.readChar();
if (c == StreamIn.END_OF_STREAM || c == '>')
{
this.in.ungetChar(c);
break;
}
if (!TidyUtils.isWhite((char) c))
{
continue;
}
// and skip to end of whitespace
for (;;)
{
c = this.in.readChar();
if (c == StreamIn.END_OF_STREAM || c == '>')
{
this.in.ungetChar(c);
break;
}
if (TidyUtils.isWhite((char) c))
{
continue;
}
this.in.ungetChar(c);
break;
}
break;
}
// if some text before < return it now
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
return this.token;
}
this.txtstart = this.lexsize;
continue;
}
else if (c == '[')
{
// Word 2000 embeds <![if ...]> ... <![endif]> sequences
this.lexsize -= 2;
this.state = LEX_SECTION;
this.txtend = this.lexsize;
// if some text before < return it now
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
return this.token;
}
this.txtstart = this.lexsize;
continue;
}
// otherwise swallow chars up to and including next '>'
while (true)
{
c = this.in.readChar();
if (c == '>')
{
break;
}
if (c == -1)
{
this.in.ungetChar(c);
break;
}
}
// remove "<!" from the buffer and go on with regular content
this.lexsize -= 2;
this.lexbuf[this.lexsize] = (byte) '\0';
this.state = LEX_CONTENT;
continue;
}
// processing instructions
if (c == '?')
{
this.lexsize -= 2;
this.state = LEX_PROCINSTR;
this.txtend = this.lexsize;
// if some text before < return it now
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
return this.token;
}
this.txtstart = this.lexsize;
continue;
}
// Microsoft ASP's e.g. <% ... server-code ... %>
if (c == '%')
{
this.lexsize -= 2;
this.state = LEX_ASP;
this.txtend = this.lexsize;
// if some text before < return it now
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
return this.token;
}
this.txtstart = this.lexsize;
continue;
}
// Netscapes JSTE e.g. <# ... server-code ... #>
if (c == '#')
{
this.lexsize -= 2;
this.state = LEX_JSTE;
this.txtend = this.lexsize;
// if some text before < return it now
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
return this.token;
}
this.txtstart = this.lexsize;
continue;
}
// check for start tag
if (TidyUtils.isLetter((char) c))
{
this.in.ungetChar(c); // push back letter
this.lexsize -= 2; // discard " <" + letter
this.txtend = this.lexsize;
this.state = LEX_STARTTAG; // ready to read tag name
// if some text before < return it now
if (this.txtend > this.txtstart)
{
this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
return this.token;
}
continue; // no text so keep going
}
// otherwise treat as CDATA
this.state = LEX_CONTENT;
this.waswhite = false;
continue;
case LEX_ENDTAG :
// </letter
// the main loop has just appended the first letter of the name again
this.txtstart = this.lexsize - 1;
// changed from
// this.in.curcol -= 2;
this.columns -= 2;
c = parseTagName();
this.token = newNode(Node.END_TAG, // create endtag token
this.lexbuf,
this.txtstart,
this.txtend,
TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart));
// the tag name is no longer needed in the buffer
this.lexsize = this.txtstart;
this.txtend = this.txtstart;
// skip to '>'
while (TidyUtils.isWhite((char) c))
{
c = this.in.readChar();
}
if (c == StreamIn.END_OF_STREAM)
{
this.in.ungetChar(c);
report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
continue;
}
// should be at the '>' if we're not, assume one
if (c != '>') {
this.in.ungetChar(c);
c = '>';
report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
}
this.state = LEX_CONTENT;
this.waswhite = false;
return this.token; // the endtag token
case LEX_STARTTAG :
// first letter of tagname
this.txtstart = this.lexsize - 1; // set txtstart to first letter
c = parseTagName();
isempty[0] = false;
attributes = null;
// isempty[0] has just been cleared, so the node is created as START_TAG here;
// the type is corrected below once the attributes have been parsed
this.token = newNode(
(isempty[0] ? Node.START_END_TAG : Node.START_TAG),
this.lexbuf,
this.txtstart,
this.txtend,
TidyUtils.getString(this.lexbuf, this.txtstart, this.txtend - this.txtstart));
// parse attributes, consuming closing ">"
if (c != '>')
{
if (c == '/')
{
this.in.ungetChar(c);
}
attributes = parseAttrs(isempty);
}
if (isempty[0])
{
this.token.type = Node.START_END_TAG;
}
this.token.attributes = attributes;
this.lexsize = this.txtstart;
this.txtend = this.txtstart;
// swallow newline following start tag
// special check needed for CRLF sequence
// this doesn't apply to empty elements
// nor to preformatted content that needs escaping
if (
(mode != PREFORMATTED || preContent(this.token))
&& (this.token.expectsContent() || this.token.tag == this.configuration.tt.tagBr))
{
c = this.in.readChar();
if (c == '\r')
{
c = this.in.readChar();
if (c != '\n')
{
this.in.ungetChar(c);
}
}
else if (c != '\n' && c != '\f')
{
this.in.ungetChar(c);
}
this.waswhite = true; // to swallow leading whitespace
}
else
{
this.waswhite = false;
}
this.state = LEX_CONTENT;
if (this.token.tag == null)
{
report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
}
else if (!this.configuration.xmlTags)
{
constrainVersion(this.token.tag.versions);
if (TidyUtils.toBoolean(this.token.tag.versions & Dict.VERS_PROPRIETARY))
{
// #427810 - fix by Gary Deschaines 24 May 00
// with makeClean set, nobr and wbr are not reported as proprietary
if (this.configuration.makeClean && (this.token.tag != this.configuration.tt.tagNobr && //
this.token.tag != this.configuration.tt.tagWbr))
{
report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
}
// #427810 - fix by Terry Teague 2 Jul 01
else if (!this.configuration.makeClean)
{
report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
}
}
// use the tag specific attribute checker if there is one
if (this.token.tag.getChkattrs() != null)
{
this.token.tag.getChkattrs().check(this, this.token);
}
else
{
this.token.checkAttributes(this);
}
// should this be called before attribute checks?
this.token.repairDuplicateAttributes(this);
}
return this.token; // return start tag
case LEX_COMMENT :
// seen <!-- so look for -->
if (c != '-')
{
continue;
}
c = this.in.readChar();
addCharToLexer(c);
if (c != '-')
{
continue;
}
// "--" seen: it either ends the comment (when followed by '>') or is malformed
end_comment : while (true)
{
c = this.in.readChar();
if (c == '>')
{
if (badcomment != 0)
{
report.warning(this, null, null, Report.MALFORMED_COMMENT);
}
// exclude the trailing "--" from the comment text
this.txtend = this.lexsize - 2; // AQ 8Jul2000
this.lexbuf[this.lexsize] = (byte) '\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
// now look for a line break
c = this.in.readChar();
if (c == '\r')
{
c = this.in.readChar();
if (c != '\n')
{
this.token.linebreak = true;
}
}
if (c == '\n')
{
this.token.linebreak = true;
}
else
{
this.in.ungetChar(c);
}
return this.token;
}
// note position of first such error in the comment
if (badcomment == 0)
{
this.lines = this.in.getCurline();
this.columns = this.in.getCurcol() - 3;
}
badcomment++;
if (this.configuration.fixComments)
{
this.lexbuf[this.lexsize - 2] = (byte) '=';
}
addCharToLexer(c);
// if '-' then look for '>' to end the comment
if (c != '-')
{
break end_comment;
}
}
// otherwise continue to look for -->
this.lexbuf[this.lexsize - 2] = (byte) '=';
continue;
case LEX_DOCTYPE :
// seen <!d so look for '> ' munging whitespace
if (TidyUtils.isWhite((char) c))
{
// keep only the first character of a run of white space
if (this.waswhite)
{
this.lexsize -= 1;
}
this.waswhite = true;
}
else
{
this.waswhite = false;
}
// a '>' inside the [...] subset does not end the declaration
if (inDTDSubset)
{
if (c == ']')
{
inDTDSubset = false;
}
}
else if (c == '[')
{
inDTDSubset = true;
}
if (inDTDSubset || c != '>')
{
continue;
}
// drop the closing '>'
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte) '\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.DOCTYPE_TAG, this.lexbuf, this.txtstart, this.txtend);
// make a note of the version named by the doctype
this.doctype = findGivenVersion(this.token);
return this.token;
case LEX_PROCINSTR :
// seen <? so look for '> '
// check for PHP preprocessor instructions <?php ... ?>
if (this.lexsize - this.txtstart == 3)
{
if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("php"))
{
this.state = LEX_PHP;
continue;
}
}
// check for an XML declaration: "xml" followed by white space
if (this.lexsize - this.txtstart == 4)
{
if ((TidyUtils.getString(this.lexbuf, this.txtstart, 3)).equals("xml")
&& TidyUtils.isWhite((char) this.lexbuf[this.txtstart + 3]))
{
this.state = LEX_XMLDECL;
attributes = null;
continue;
}
}
if (this.configuration.xmlPIs) // insist on ?> as terminator
{
if (c != '?')
{
continue;
}
// now look for '>'
c = this.in.readChar();
if (c == StreamIn.END_OF_STREAM)
{
report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
this.in.ungetChar(c);
continue;
}
addCharToLexer(c);
}
if (c != '>')
{
continue;
}
// drop the closing '>'
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte) '\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.PROC_INS_TAG, this.lexbuf, this.txtstart, this.txtend);
return this.token;
case LEX_ASP :
// seen <% so look for "%> "
if (c != '%')
{
continue;
}
// now look for '>'
c = this.in.readChar();
if (c != '>')
{
this.in.ungetChar(c);
continue;
}
// the '>' was never added to the buffer, so this drops the '%'
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte) '\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
return this.token;
case LEX_JSTE :
// seen <# so look for "#> "
if (c != '#')
{
continue;
}
// now look for '>'
c = this.in.readChar();
if (c != '>')
{
this.in.ungetChar(c);
continue;
}
// the '>' was never added to the buffer, so this drops the '#'
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte) '\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.JSTE_TAG, this.lexbuf, this.txtstart, this.txtend);
return this.token;
case LEX_PHP :
// seen " <?php" so look for "?> "
if (c != '?')
{
continue;
}
// now look for '>'
c = this.in.readChar();
if (c != '>')
{
this.in.ungetChar(c);
continue;
}
// the '>' was never added to the buffer, so this drops the '?'
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte) '\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
return this.token;
case LEX_XMLDECL : // seen "<?xml" so look for "?>"
if (TidyUtils.isWhite((char) c) && c != '?')
{
continue;
}
// get pseudo-attribute
if (c != '?')
{
String name;
Node[] asp = new Node[1];
Node[] php = new Node[1];
AttVal av = new AttVal();
int[] pdelim = new int[1];
isempty[0] = false;
this.in.ungetChar(c);
name = this.parseAttribute(isempty, asp, php);
av.attribute = name;
av.value = this.parseValue(name, true, isempty, pdelim);
av.delim = pdelim[0];
// prepend the new pseudo-attribute to the list collected so far
av.next = attributes;
attributes = av;
// continue;
}
// now look for '>'
c = this.in.readChar();
if (c != '>')
{
this.in.ungetChar(c);
continue;
}
this.lexsize -= 1;
// the declaration node carries no text, only attributes
this.txtend = this.txtstart;
this.lexbuf[this.txtend] = '\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.XML_DECL, this.lexbuf, this.txtstart, this.txtend);
this.token.attributes = attributes;
return this.token;
case LEX_SECTION :
// seen " <![" so look for "]> "
if (c == '[')
{
// "CDATA[" directly after "<![" starts a CDATA section; the keyword is dropped from the buffer
if (this.lexsize == (this.txtstart + 6)
&& (TidyUtils.getString(this.lexbuf, this.txtstart, 6)).equals("CDATA["))
{
this.state = LEX_CDATA;
this.lexsize -= 6;
continue;
}
}
if (c != ']')
{
continue;
}
// now look for '>'
c = this.in.readChar();
if (c != '>')
{
this.in.ungetChar(c);
continue;
}
// the '>' was never added to the buffer, so this drops the ']'
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte) '\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.SECTION_TAG, this.lexbuf, this.txtstart, this.txtend);
return this.token;
case LEX_CDATA :
// seen " <![CDATA[" so look for "]]> "
if (c != ']')
{
continue;
}
// now look for ']'
c = this.in.readChar();
if (c != ']')
{
this.in.ungetChar(c);
continue;
}
// now look for '>'
c = this.in.readChar();
if (c != '>')
{
this.in.ungetChar(c);
continue;
}
// only the first ']' was added to the buffer; remove it again
this.lexsize -= 1;
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte) '\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.CDATA_TAG, this.lexbuf, this.txtstart, this.txtend);
return this.token;
default :
// should never reach here
break;
}
}
// the input is exhausted: return whatever has been collected so far
if (this.state == LEX_CONTENT) // text string
{
this.txtend = this.lexsize;
if (this.txtend > this.txtstart)
{
this.in.ungetChar(c);
// trim a single trailing space
if (this.lexbuf[this.lexsize - 1] == (byte) ' ')
{
this.lexsize -= 1;
this.txtend = this.lexsize;
}
this.token = newNode(Node.TEXT_NODE, this.lexbuf, this.txtstart, this.txtend);
return this.token;
}
}
else if (this.state == LEX_COMMENT) // comment
{
// unterminated comment: warn and return what has been read as the comment text
if (c == StreamIn.END_OF_STREAM)
{
report.warning(this, null, null, Report.MALFORMED_COMMENT);
}
this.txtend = this.lexsize;
this.lexbuf[this.lexsize] = (byte) '\0';
this.state = LEX_CONTENT;
this.waswhite = false;
this.token = newNode(Node.COMMENT_TAG, this.lexbuf, this.txtstart, this.txtend);
return this.token;
}
return null;
}
/**
 * Parser for ASP within start tags. Some people use ASP to customize attributes. Tidy isn't really well suited
 * to dealing with ASP. This is a workaround for attributes, but won't deal with the case where the ASP is used
 * to tailor the attribute value. Here is an example of a work around for using ASP in attribute values:
 * <code>href='&lt;%=rsSchool.Fields("ID").Value%&gt;'</code> where the ASP that generates the attribute value
 * is masked from Tidy by the quotemarks.
 * <p>
 * Reads up to and including the closing <code>%&gt;</code>; the terminator itself is not part of the returned
 * node. If the input ends before the terminator is found, everything read so far is kept as the ASP content.
 * </p>
 * @return parsed Node, or <code>null</code> if the ASP block is empty
 */
public Node parseAsp()
{
    int c;
    Node asp = null;
    boolean terminated = false;

    this.txtstart = this.lexsize;

    while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
    {
        addCharToLexer(c);
        if (c != '%')
        {
            continue;
        }

        if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
        {
            break;
        }

        if (c == '>')
        {
            addCharToLexer(c);
            terminated = true;
            break;
        }

        // Not the terminator: push the character back so that it is examined again by the loop. This way a
        // '%' directly followed by "%>" is still recognized, as in the LEX_ASP state of getToken().
        this.in.ungetChar(c);
    }

    // Only strip "%>" if it has really been read. Stripping unconditionally would cut off two bytes of
    // content (or move lexsize in front of txtstart) when the input ends inside the ASP block.
    if (terminated)
    {
        this.lexsize -= 2;
    }
    this.txtend = this.lexsize;

    if (this.txtend > this.txtstart)
    {
        asp = newNode(Node.ASP_TAG, this.lexbuf, this.txtstart, this.txtend);
    }

    this.txtstart = this.txtend;
    return asp;
}
/**
 * PHP is like ASP but is based upon XML processing instructions, e.g. <code>&lt;?php ... ?&gt;</code>.
 * <p>
 * Reads up to and including the closing <code>?&gt;</code>; the terminator itself is not part of the returned
 * node. If the input ends before the terminator is found, everything read so far is kept as the PHP content.
 * </p>
 * @return parsed Node, or <code>null</code> if the PHP block is empty
 */
public Node parsePhp()
{
    int c;
    Node php = null;
    boolean terminated = false;

    this.txtstart = this.lexsize;

    while ((c = this.in.readChar()) != StreamIn.END_OF_STREAM)
    {
        addCharToLexer(c);
        if (c != '?')
        {
            continue;
        }

        if ((c = this.in.readChar()) == StreamIn.END_OF_STREAM)
        {
            break;
        }

        if (c == '>')
        {
            addCharToLexer(c);
            terminated = true;
            break;
        }

        // Not the terminator: push the character back so that it is examined again by the loop. This way a
        // '?' directly followed by "?>" is still recognized, as in the LEX_PHP state of getToken().
        this.in.ungetChar(c);
    }

    // Only strip "?>" if it has really been read. Stripping unconditionally would cut off two bytes of
    // content (or move lexsize in front of txtstart) when the input ends inside the PHP block.
    if (terminated)
    {
        this.lexsize -= 2;
    }
    this.txtend = this.lexsize;

    if (this.txtend > this.txtstart)
    {
        php = newNode(Node.PHP_TAG, this.lexbuf, this.txtstart, this.txtend);
    }

    this.txtstart = this.txtend;
    return php;
}
/**
 * Reads the name of the next attribute of the tag that is currently being parsed.
 * <p>
 * White space, stray '=' signs and stray quote marks in front of the name are skipped (the latter two are
 * reported). No name is returned when the end of the tag is reached instead: "/&gt;" additionally sets
 * <code>isempty[0]</code>, "&lt;%" and "&lt;?" store the embedded ASP / PHP block in <code>asp[0]</code> /
 * <code>php[0]</code>, and any other '&lt;' or the end of the input is reported as an error. The name itself is
 * folded to lower case unless XML tags are being parsed, and the character that ends it is pushed back onto
 * the input unless it is white space.
 * </p>
 * @param isempty flag is passed as array so it can be modified
 * @param asp asp Node, passed as array so it can be modified
 * @param php php Node, passed as array so it can be modified
 * @return parsed attribute name, or <code>null</code> if there is none
 */
public String parseAttribute(boolean[] isempty, Node[] asp, Node[] php)
{
    // clear the out-parameters
    asp[0] = null;
    php[0] = null;

    int c;

    // skip white space before the attribute
    while (true)
    {
        c = this.in.readChar();

        if (c == '/')
        {
            final int following = this.in.readChar();
            if (following == '>')
            {
                isempty[0] = true;
                return null;
            }
            // a lone '/' becomes the first character of the name
            this.in.ungetChar(following);
            break;
        }

        if (c == '>')
        {
            return null;
        }

        if (c == '<')
        {
            final int following = this.in.readChar();
            if (following == '%')
            {
                asp[0] = parseAsp();
                return null;
            }
            if (following == '?')
            {
                php[0] = parsePhp();
                return null;
            }

            this.in.ungetChar(following);
            if (this.state != LEX_XMLDECL) // FG fix for 532535
            {
                this.in.ungetChar('<'); // fix for 433360
            }
            report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
            return null;
        }

        if (c == '=')
        {
            report.attrError(this, this.token, null, Report.UNEXPECTED_EQUALSIGN);
            continue;
        }

        if (c == '"' || c == '\'')
        {
            report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
            continue;
        }

        if (c == StreamIn.END_OF_STREAM)
        {
            report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
            this.in.ungetChar(c);
            return null;
        }

        if (!TidyUtils.isWhite((char) c))
        {
            break;
        }
    }

    // collect the name at the end of the lexer buffer
    final int nameStart = this.lexsize;
    int previous = c;

    while (true)
    {
        // these characters end the name and are pushed back, e.g. '=' for parseValue()
        final boolean endsName = c == '=' || c == '>' || c == '<' || c == StreamIn.END_OF_STREAM;
        if (endsName)
        {
            this.in.ungetChar(c);
            break;
        }

        // a quote mark directly after '-' also ends the name; the '-' is dropped
        if (previous == '-' && (c == '"' || c == '\''))
        {
            this.lexsize--;
            this.in.ungetChar(c);
            break;
        }

        if (TidyUtils.isWhite((char) c))
        {
            break;
        }

        // what should be done about non-namechar characters?
        // currently these are incorporated into the attr name
        if (!this.configuration.xmlTags && TidyUtils.isUpper((char) c))
        {
            c = TidyUtils.toLower((char) c);
        }

        addCharToLexer(c);
        previous = c;
        c = this.in.readChar();
    }

    // #427672 - handle attribute names with multibyte chars - fix by Randy Waki - 10 Aug 00
    final int length = this.lexsize - nameStart;
    final String attr = length > 0 ? TidyUtils.getString(this.lexbuf, nameStart, length) : null;

    // the name is returned as a string, so the buffer space can be released again
    this.lexsize = nameStart;
    return attr;
}
/**
 * Invoked when '&lt;' is seen in place of an attribute value. The text is copied into the lexer buffer up to
 * the next '&gt;'. If it does not start like an ASP, PHP or Tango instruction ('%', '?' or '@' after the
 * '&lt;'), it also ends at the first white space and the '&gt;' is pushed back instead of being copied. Strings
 * quoted with ' or " are recognized and copied as a whole.
 * @return delimiter: '"' by default, '\'' once a double quoted string has been copied, or 0 if the input ended
 * or a '&gt;' was found inside a quoted string
 */
public int parseServerInstruction()
{
    int delim = '"';

    int c = this.in.readChar();
    addCharToLexer(c);

    // check for ASP, PHP or Tango
    final boolean serverRule = c == '%' || c == '?' || c == '@';

    while (true)
    {
        c = this.in.readChar();

        if (c == StreamIn.END_OF_STREAM)
        {
            break;
        }

        if (c == '>')
        {
            if (serverRule)
            {
                addCharToLexer(c);
            }
            else
            {
                this.in.ungetChar(c);
            }
            break;
        }

        // if not recognized as ASP, PHP or Tango
        // then also finish value on whitespace
        if (!serverRule && TidyUtils.isWhite((char) c))
        {
            break;
        }

        addCharToLexer(c);

        if (c == '"' || c == '\'')
        {
            if (!copyQuotedServerText(c))
            {
                return 0;
            }
            // the value contains double quotes, so it has to be delimited by single quotes
            if (c == '"')
            {
                delim = '\'';
            }
        }
    }

    return delim;
}

/**
 * Copies the rest of a quoted string, including the closing quote mark, into the lexer buffer.
 * @param quote the quote mark that opened the string
 * @return <code>false</code> if the end of the input or a '&gt;' was hit before the closing quote mark
 */
private boolean copyQuotedServerText(int quote)
{
    int ch;
    do
    {
        ch = this.in.readChar();

        if (endOfInput()) // #427840 - fix by Terry Teague 30 Jun 01
        {
            report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
            this.in.ungetChar(ch);
            return false;
        }

        if (ch == '>') // #427840 - fix by Terry Teague 30 Jun 01
        {
            this.in.ungetChar(ch);
            report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
            return false;
        }

        addCharToLexer(ch);
    }
    while (ch != quote);

    return true;
}
/**
* Parse an attribute value.
* @param name attribute name
* @param foldCase fold case?
* @param isempty is attribute empty? Passed as an array reference to allow modification
* @param pdelim delimiter, passed as an array reference to allow modification
* @return parsed value
*/
public String parseValue(String name, boolean foldCase, boolean[] isempty, int[] pdelim)
{
    // Reads the value that follows an attribute name from the input stream.
    // values start with "=" or " = " etc.
    // doesn't consume the ">" at end of start tag
    int len = 0;
    int start;
    boolean seenGt = false;
    boolean munge = true;
    int c = 0;
    int lastc, delim, quotewarning;
    String value;
    delim = 0;
    // default delimiter handed back to the caller until a quote mark is actually seen
    pdelim[0] = '"';
    // Henry Zrepa reports that some folk are using the embed element with script attributes where newlines are
    // significant and must be preserved
    if (this.configuration.literalAttribs)
    {
        munge = false;
    }
    // skip white space before the '='
    while (true)
    {
        c = this.in.readChar();
        if (c == StreamIn.END_OF_STREAM)
        {
            this.in.ungetChar(c);
            break;
        }
        if (!TidyUtils.isWhite((char) c))
        {
            break;
        }
    }
    // c should be '=' if there is a value other legal possibilities are white space, '/' and '>'
    if (c != '=' && c != '"' && c != '\'')
    {
        // no value follows the name: push the character back and signal "no value" with null
        this.in.ungetChar(c);
        return null;
    }
    // skip white space after '='
    while (true)
    {
        c = this.in.readChar();
        if (c == StreamIn.END_OF_STREAM)
        {
            this.in.ungetChar(c);
            break;
        }
        if (!TidyUtils.isWhite((char) c))
        {
            break;
        }
    }
    // check for quote marks
    if (c == '"' || c == '\'')
    {
        delim = c;
    }
    else if (c == '<')
    {
        // the value starts with '<': let parseServerInstruction() collect it and supply the delimiter, then
        // return whatever was appended to lexbuf (the lexer buffer is rewound to its previous size)
        start = this.lexsize;
        addCharToLexer(c);
        pdelim[0] = parseServerInstruction();
        len = this.lexsize - start;
        this.lexsize = start;
        return (len > 0 ? TidyUtils.getString(this.lexbuf, start, len) : null);
    }
    else
    {
        // unquoted value: the character just read is its first character
        this.in.ungetChar(c);
    }
    // and read the value string check for quote mark if needed
    quotewarning = 0;
    start = this.lexsize;
    c = '\0';
    while (true)
    {
        lastc = c; // track last character
        c = this.in.readChar();
        if (c == StreamIn.END_OF_STREAM)
        {
            report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
            this.in.ungetChar(c);
            break;
        }
        if (delim == (char) 0)
        {
            // unquoted value
            if (c == '>')
            {
                this.in.ungetChar(c);
                break;
            }
            if (c == '"' || c == '\'')
            {
                int q = c;
                report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
                /* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */
                /* this doesn't handle <a title=foo"/> which browsers treat as */
                /* 'foo"/' nor <a title=foo" /> which browser treat as 'foo"' */
                c = in.readChar();
                if (c == '>')
                {
                    addCharToLexer(q);
                    in.ungetChar(c);
                    break;
                }
                else
                {
                    in.ungetChar(c);
                    c = q;
                }
            }
            if (c == '<')
            {
                this.in.ungetChar(c); // fix for 433360
                // push a '>' in front of the '<' so that the start tag is terminated here
                c = '>';
                this.in.ungetChar(c);
                report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
                break;
            }
            // For cases like <br clear=all/> need to avoid treating /> as part of the attribute value, however
            // care is needed to avoid so treating <a href=http://www.acme.com /> in this way, which would map the
            // <a> tag to <a href="http://www.acme.com"/>
            if (c == '/')
            {
                // peek ahead in case of />
                c = this.in.readChar();
                if (c == '>' && !AttributeTable.getDefaultAttributeTable().isUrl(name))
                {
                    isempty[0] = true;
                    this.in.ungetChar(c);
                    break;
                }
                // unget peeked char
                this.in.ungetChar(c);
                c = '/';
            }
        }
        else
        {
            // delim is '\'' or '"'
            if (c == delim)
            {
                break;
            }
            // treat CRLF, CR and LF as single line break
            if (c == '\r')
            {
                c = this.in.readChar();
                if (c != '\n')
                {
                    this.in.ungetChar(c);
                }
                c = '\n';
            }
            // count the characters that hint at a missing closing quote mark
            if (c == '\n' || c == '<' || c == '>')
            {
                ++quotewarning;
            }
            if (c == '>')
            {
                seenGt = true;
            }
        }
        if (c == '&')
        {
            // no entities in ID attributes
            if ("id".equalsIgnoreCase(name))
            {
                report.attrError(this, null, null, Report.ENTITY_IN_ID);
                continue;
            }
            addCharToLexer(c);
            parseEntity((short) 0);
            continue;
        }
        // kludge for JavaScript attribute values with line continuations in string literals
        if (c == '\\')
        {
            c = this.in.readChar();
            if (c != '\n')
            {
                this.in.ungetChar(c);
                c = '\\';
            }
        }
        if (TidyUtils.isWhite((char) c))
        {
            // white space ends an unquoted value
            if (delim == (char) 0)
            {
                break;
            }
            if (munge)
            {
                // discard line breaks in quoted URLs
                // #438650 - fix by Randy Waki
                if (c == '\n' && AttributeTable.getDefaultAttributeTable().isUrl(name))
                {
                    // warn that we discard this newline
                    report.attrError(this, this.token, null, Report.NEWLINE_IN_URI);
                    continue;
                }
                // collapse runs of white space into a single space
                c = ' ';
                if (lastc == ' ')
                {
                    continue;
                }
            }
        }
        else if (foldCase && TidyUtils.isUpper((char) c))
        {
            c = TidyUtils.toLower((char) c);
        }
        addCharToLexer(c);
    }
    if (quotewarning > 10 && seenGt && munge)
    {
        // there is almost certainly a missing trailing quote mark as we have see too many newlines, < or >
        // characters. an exception is made for Javascript attributes and the javascript URL scheme which may
        // legitimately include < and >, and for attributes starting with "<xml " as generated by Microsoft Office.
        if (!AttributeTable.getDefaultAttributeTable().isScript(name)
            && !(AttributeTable.getDefaultAttributeTable().isUrl(name) && "javascript:".equals(TidyUtils.getString(
                this.lexbuf,
                start,
                11)))
            && !"<xml ".equals(TidyUtils.getString(this.lexbuf, start, 5))) // #500236 - fix by Klaus Johannes Rusch
        // 06 Jan 02
        {
            report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
        }
    }
    // the value occupies lexbuf[start .. start + len - 1]; rewind the lexer buffer to where it was
    len = this.lexsize - start;
    this.lexsize = start;
    if (len > 0 || delim != 0)
    {
        // ignore leading and trailing white space for all but title, alt, value and prompts attributes unless
        // --literal-attributes is set to yes
        // #994841 - Whitespace is removed from value attributes
        if (munge && !TidyUtils.isInValuesIgnoreCase(new String[]{"alt", "title", "value", "prompt"}, name))
        {
            // Both loops are bounded by the remaining length: an empty quoted value (len == 0) must not read
            // lexbuf[start - 1], and the leading loop must not depend on the offset of the value in lexbuf.
            while (len > 0 && TidyUtils.isWhite((char) this.lexbuf[start + len - 1]))
            {
                --len;
            }
            while (len > 0 && TidyUtils.isWhite((char) this.lexbuf[start]))
            {
                ++start;
                --len;
            }
        }
        value = TidyUtils.getString(this.lexbuf, start, len);
    }
    else
    {
        value = null;
    }
    // note delimiter if given
    if (delim != 0)
    {
        pdelim[0] = delim;
    }
    else
    {
        pdelim[0] = '"';
    }
    return value;
}
/**
 * Check if attr is a valid name: a letter followed by zero or more name characters.
 * @param attr String to check, must be non-null
 * @return <code>true</code> if attr is a valid name; <code>false</code> otherwise, including for an empty string
 */
public static boolean isValidAttrName(String attr)
{
    // an empty string has no first character to inspect and can never be a valid name
    if (attr.length() == 0)
    {
        return false;
    }
    // first character should be a letter
    if (!TidyUtils.isLetter(attr.charAt(0)))
    {
        return false;
    }
    // remaining characters should be namechars
    for (int i = 1; i < attr.length(); i++)
    {
        if (!TidyUtils.isNamechar(attr.charAt(i)))
        {
            return false;
        }
    }
    return true;
}
/**
 * Checks whether a string can be used as a CSS1 selector name. In CSS1, selectors can contain only the characters
 * A-Z, 0-9, and Unicode characters 161-255, plus dash (-); they cannot start with a dash or a digit. A backslash
 * starts an escape: it may be followed by at most four digits, and any other character directly after it (or after
 * its digits) is accepted as is.
 * @param buf css selector name, may be <code>null</code>
 * @return <code>true</code> if the given string is a valid css1 selector name, <code>false</code> for
 * <code>null</code>
 */
public static boolean isCSS1Selector(String buf)
{
    if (buf == null)
    {
        return false;
    }
    // #508936 - CSS class naming for -clean option
    // 0 when not inside an escape, otherwise 1 for the backslash plus the number of digits seen so far
    int escapeLength = 0;
    final int length = buf.length();
    for (int index = 0; index < length; index++)
    {
        final char ch = buf.charAt(index);
        if (ch == '\\')
        {
            // ab\555\444 is 4 chars {'a', 'b', \555, \444}
            escapeLength = 1;
            continue;
        }
        if (Character.isDigit(ch))
        {
            if (escapeLength > 0)
            {
                // escaped digit: backslash plus digits must stay below 6 characters (max "\112F")
                escapeLength++;
                if (escapeLength >= 6)
                {
                    return false;
                }
            }
            else if (index == 0)
            {
                // an unescaped digit cannot be the first character
                return false;
            }
            continue;
        }
        final boolean accepted = escapeLength > 0 // escaped, anything goes
            || (index > 0 && ch == '-') // dash, but not as first character
            || Character.isLetter(ch) // letters anywhere
            || (ch >= 161 && ch <= 255); // Unicode 161-255 anywhere
        if (!accepted)
        {
            return false;
        }
        // any non-digit character ends the current escape
        escapeLength = 0;
    }
    return true;
}
/**
 * Parse the attributes of the current tag. Each attribute (or ASP/PHP node found among the attributes) is
 * prepended to the list, so the returned head is the item parsed last. Attributes whose name is not a valid name
 * are reported and left out of the list.
 * @param isempty is tag empty? Passed on to parseAttribute and parseValue
 * @return parsed attribute/value list, <code>null</code> if nothing was added
 */
public AttVal parseAttrs(boolean[] isempty)
{
    final int[] delimHolder = new int[1];
    final Node[] aspHolder = new Node[1];
    final Node[] phpHolder = new Node[1];
    AttVal head = null;
    while (!endOfInput())
    {
        String attrName = parseAttribute(isempty, aspHolder, phpHolder);
        if (attrName == null)
        {
            // no attribute name: check if attributes are created by ASP markup
            if (aspHolder[0] != null)
            {
                head = new AttVal(head, null, aspHolder[0], null, '\0', null, null);
                continue;
            }
            // ... or by PHP markup
            if (phpHolder[0] != null)
            {
                head = new AttVal(head, null, null, phpHolder[0], '\0', null, null);
                continue;
            }
            // nothing more to parse
            break;
        }
        String attrValue = parseValue(attrName, false, isempty, delimHolder);
        if (isValidAttrName(attrName))
        {
            AttVal accepted = new AttVal(head, null, null, null, delimHolder[0], attrName, attrValue);
            accepted.dict = AttributeTable.getDefaultAttributeTable().findAttribute(accepted);
            head = accepted;
            continue;
        }
        // invalid name: build a detached AttVal for the report only, it is not linked into the list
        AttVal rejected = new AttVal(null, null, null, null, 0, attrName, attrValue);
        // #427664 - fix by Gary Peskin 04 Aug 00; other fixes by Dave Raggett
        if (attrValue != null)
        {
            report.attrError(this, this.token, rejected, Report.BAD_ATTRIBUTE_VALUE);
        }
        else if (TidyUtils.lastChar(attrName) == '"')
        {
            report.attrError(this, this.token, rejected, Report.MISSING_QUOTEMARK);
        }
        else
        {
            report.attrError(this, this.token, rejected, Report.UNKNOWN_ATTRIBUTE);
        }
    }
    return head;
}
/**
 * Push a copy of an inline node onto the inline stack. Nothing is pushed when the node is implicit (implicit tags
 * are the ones generated from the istack), has no tag, is not an inline element, or is an OBJECT-like element
 * (CM_OBJECT). Apart from FONT, a tag that is already on the stack is not pushed again, so that
 * {@code <p><em> text <p><em> more text} is not mapped to
 * {@code <p><em> text </em></p><p><em><em> more text </em></em>}.
 * @param node Node to be pushed
 */
public void pushInline(Node node)
{
    if (node.implicit || node.tag == null)
    {
        return;
    }
    // only inline elements that are not object-like are tracked
    if (!TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE)
        || TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT))
    {
        return;
    }
    // font may be nested, every other tag is pushed at most once
    if (node.tag != this.configuration.tt.tagFont && isPushed(node))
    {
        return;
    }
    IStack entry = new IStack();
    entry.tag = node.tag;
    entry.element = node.element;
    if (node.attributes != null)
    {
        // the stack keeps its own copy of the attribute list
        entry.attributes = cloneAttributes(node.attributes);
    }
    this.istack.push(entry);
}
/**
 * Pop an inline entry from the stack. With a non-null node, nothing is popped unless the node is an inline,
 * non-object element; for an anchor, entries are popped until an anchor entry has been removed (or the stack is
 * empty). With a <code>null</code> node the top entry is popped unconditionally.
 * @param node Node to be popped, may be <code>null</code>
 */
public void popInline(Node node)
{
    if (node != null)
    {
        if (node.tag == null
            || !TidyUtils.toBoolean(node.tag.model & Dict.CM_INLINE)
            || TidyUtils.toBoolean(node.tag.model & Dict.CM_OBJECT))
        {
            return;
        }
        // if node is </a> then pop until we find an <a>
        if (node.tag == this.configuration.tt.tagA)
        {
            boolean anchorPopped = false;
            while (!anchorPopped && this.istack.size() > 0)
            {
                IStack popped = (IStack) this.istack.pop();
                anchorPopped = (popped.tag == this.configuration.tt.tagA);
            }
            // the insertion point may now lie beyond the end of the stack
            if (this.insert >= this.istack.size())
            {
                this.insert = -1;
            }
            return;
        }
    }
    if (this.istack.size() == 0)
    {
        return;
    }
    // the popped entry itself is not needed
    this.istack.pop();
    if (this.insert >= this.istack.size())
    {
        this.insert = -1;
    }
}
/**
 * Is there an entry on the inline stack with the same tag as the given node?
 * @param node Node
 * @return <code>true</code> if an entry with the node's tag is found in the stack
 */
public boolean isPushed(Node node)
{
    // scan from the top of the stack downwards
    int index = this.istack.size();
    while (--index >= 0)
    {
        IStack entry = (IStack) this.istack.elementAt(index);
        if (entry.tag == node.tag)
        {
            return true;
        }
    }
    return false;
}
/**
 * This has the effect of inserting "missing" inline elements around the contents of blocklevel elements such as P,
 * TD, TH, DIV, PRE etc. This procedure is called at the start of ParseBlock. When the inline stack is not empty, as
 * will be the case in {@code <i><h1>italic heading</h1></i>}, the markup is treated as equivalent to
 * {@code <h1><i>italic heading</i></h1>}. This is implemented by setting the lexer into a mode where it gets
 * tokens from the inline stack rather than from the input stream.
 * @param node original node, remembered when there is something to duplicate
 * @return number of inline stack entries above the current stack base
 */
public int inlineDup(Node node)
{
    final int pending = this.istack.size() - this.istackbase;
    if (pending <= 0)
    {
        // nothing to duplicate, the lexer state is left alone
        return pending;
    }
    this.insert = this.istackbase;
    this.inode = node;
    return pending;
}
/**
 * Produce the next token while the lexer is replaying the inline stack. While an insertion point is set, an
 * implicit start tag is built from the stack entry at that point and the insertion point is advanced; once no
 * insertion point is set, the remembered node is handed back and forgotten.
 * @return an implicit start tag copied from the inline stack, or the remembered node (possibly
 * <code>null</code>) when no insertion point is set
 */
public Node insertedToken()
{
    if (this.insert == -1)
    {
        // no insertion point: hand back the remembered node exactly once
        Node remembered = this.inode;
        this.inode = null;
        return remembered;
    }
    // is this is the "latest" node then update the position, otherwise use current values
    if (this.inode == null)
    {
        this.lines = this.in.getCurline();
        this.columns = this.in.getCurcol();
    }
    Node generated = newNode(Node.START_TAG, this.lexbuf, this.txtstart, this.txtend);
    // GLP: Bugfix 126261. Remove when this change is fixed in istack.c in the original Tidy
    generated.implicit = true;
    IStack entry = (IStack) this.istack.elementAt(this.insert);
    generated.element = entry.element;
    generated.tag = entry.tag;
    if (entry.attributes != null)
    {
        generated.attributes = cloneAttributes(entry.attributes);
    }
    // advance to the next item on the stack, or leave insertion mode when the end has been reached
    final int next = this.insert + 1;
    this.insert = (next < this.istack.size()) ? next : -1;
    return generated;
}
/**
 * Can the given element be removed? Text nodes always can; an element can only be removed when it has no content
 * and none of the exceptions checked below applies.
 * @param element node
 * @return <code>true</code> if the element can be removed
 */
public boolean canPrune(Node element)
{
    if (element.type == Node.TEXT_NODE)
    {
        return true;
    }
    if (element.content != null)
    {
        return false;
    }
    // anchors with attributes, paragraphs (unless empty paragraphs are dropped) and unknown tags are kept
    if ((element.tag == this.configuration.tt.tagA && element.attributes != null)
        || (element.tag == this.configuration.tt.tagP && !this.configuration.dropEmptyParas)
        || element.tag == null)
    {
        return false;
    }
    // table rows and elements declared empty are kept
    if (TidyUtils.toBoolean(element.tag.model & Dict.CM_ROW)
        || TidyUtils.toBoolean(element.tag.model & Dict.CM_EMPTY))
    {
        return false;
    }
    // applet, object, script with a src attribute,
    // title (#540555 Empty title tag is trimmed) and
    // iframe (#433359 - fix by Randy Waki 12 Mar 01 - Empty iframe is trimmed) are kept
    if (element.tag == this.configuration.tt.tagApplet
        || element.tag == this.configuration.tt.tagObject
        || (element.tag == this.configuration.tt.tagScript && element.getAttrByName("src") != null)
        || element.tag == this.configuration.tt.tagTitle
        || element.tag == this.configuration.tt.tagIframe)
    {
        return false;
    }
    // anything carrying an id or a name attribute is kept
    return element.getAttrByName("id") == null && element.getAttrByName("name") == null;
}
/**
 * Duplicate the name attribute as an id (xml output only) or, when both are present, check that id and name
 * match.
 * @param node Node to check for name/id attributes
 */
public void fixId(Node node)
{
    AttVal nameAttr = node.getAttrByName("name");
    AttVal idAttr = node.getAttrByName("id");
    if (nameAttr == null)
    {
        return;
    }
    if (idAttr == null)
    {
        // no id yet: copy the name, but only when producing xml
        if (this.configuration.xmlOut)
        {
            node.addAttribute("id", nameAttr.value);
        }
        return;
    }
    // both attributes present: an id without a value is not compared
    if (idAttr.value != null && !idAttr.value.equals(nameAttr.value))
    {
        report.attrError(this, node, nameAttr, Report.ID_NAME_MISMATCH);
    }
}
/**
 * Defer duplicates when entering a table or other element where the inlines shouldn't be duplicated. Clears both
 * the remembered node and the insertion point into the inline stack.
 */
public void deferDup()
{
    this.inode = null;
    this.insert = -1;
}
/**
 * Constrain the html versions still possible for the document to the given ones. The proprietary bit is never
 * cleared here: everything is allowed in proprietary versions of HTML, and this is handled here rather than in the
 * tag/attr dicts.
 * @param vers html version code
 */
void constrainVersion(int vers)
{
    final int allowed = vers | Dict.VERS_PROPRIETARY;
    this.versions &= allowed;
}
/**
 * Is content acceptable for pre elements? Paragraphs are accepted (p is coerced to br's); otherwise the node must
 * have a tag whose content model includes CM_INLINE or CM_NEW.
 * @param node content
 * @return <code>true</code> if node is acceptable in pre elements
 */
protected boolean preContent(Node node)
{
    // p is coerced to br's
    if (node.tag == this.configuration.tt.tagP)
    {
        return true;
    }
    return node.tag != null && TidyUtils.toBoolean(node.tag.model & (Dict.CM_INLINE | Dict.CM_NEW));
}
/**
 * Plain holder for the data describing one document type.
 */
private static class W3CVersionInfo
{
    /** version name. */
    String name;

    /** voyager (xhtml) name. */
    String voyagerName;

    /** profile. */
    String profile;

    /** unique code for this version info. */
    short code;

    /**
     * Creates a version info holding the given values.
     * @param name version name
     * @param voyagerName voyager (xhtml) name
     * @param profile VOYAGER_STRICT | VOYAGER_LOOSE | VOYAGER_FRAMESET
     * @param code unique code for this version info
     */
    public W3CVersionInfo(String name, String voyagerName, String profile, short code)
    {
        this.code = code;
        this.profile = profile;
        this.voyagerName = voyagerName;
        this.name = name;
    }
}
}