Lexer.java example

Explorer
tizzit-master
/**
 * Copyright (c) 2009 Juwi MacMillan Group GmbH
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *         http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/*
 * @(#)Lexer.java   1.11 2000/08/16
 *
 */

package org.tizzit.util.tidy;

/**
 *
 * Lexer for html parser
 *
 * (c) 1998-2000 (W3C) MIT, INRIA, Keio University
 * See Tidy.java for the copyright notice.
 * Derived from <a href="http://www.w3.org/People/Raggett/tidy">
 * HTML Tidy Release 4 Aug 2000</a>
 *
 * @author  Dave Raggett <dsr@w3.org>
 * @author  Andy Quick <ac.quick@sympatico.ca> (translation to Java)
 * @version 1.0, 1999/05/22
 * @version 1.0.1, 1999/05/29
 * @version 1.1, 1999/06/18 Java Bean
 * @version 1.2, 1999/07/10 Tidy Release 7 Jul 1999
 * @version 1.3, 1999/07/30 Tidy Release 26 Jul 1999
 * @version 1.4, 1999/09/04 DOM support
 * @version 1.5, 1999/10/23 Tidy Release 27 Sep 1999
 * @version 1.6, 1999/11/01 Tidy Release 22 Oct 1999
 * @version 1.7, 1999/12/06 Tidy Release 30 Nov 1999
 * @version 1.8, 2000/01/22 Tidy Release 13 Jan 2000
 * @version 1.9, 2000/06/03 Tidy Release 30 Apr 2000
 * @version 1.10, 2000/07/22 Tidy Release 8 Jul 2000
 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
 */

/*
 Given a file stream fp it returns a sequence of tokens.

 GetToken(fp) gets the next token
 UngetToken(fp) provides one level undo

 The tags include an attribute list:

 - linked list of attribute/value nodes
 - each node has 2 null-terminated strings.
 - entities are replaced in attribute values

 white space is compacted if not in preformatted mode
 If not in preformatted mode then leading white space
 is discarded and subsequent white space sequences
 compacted to single space chars.

 If XmlTags is no then Tag names are folded to upper
 case and attribute names to lower case.

 Not yet done:
 -   Doctype subset and marked sections
 */

import java.io.*;
import java.util.*;
import java.util.Vector;

public class Lexer {

	/* ---- input, diagnostics and lexer state ---- */
	public StreamIn in; /* file stream */
	public PrintWriter errout; /* error output stream */
	public short badAccess; /* for accessibility errors */
	public short badLayout; /* for bad style errors */
	public short badChars; /* for bad char encodings */
	public short badForm; /* for mismatched/mispositioned form tags */
	public short warnings; /* count of warnings in this document */
	public short errors; /* count of errors */
	public int lines; /* lines seen */
	public int columns; /* at start of current token */
	public boolean waswhite; /* used to collapse contiguous white space */
	public boolean pushed; /* true after token has been pushed back */
	public boolean insertspace; /* when space is moved after end tag */
	public boolean excludeBlocks; /* Netscape compatibility */
	public boolean exiled; /* true if moved out of table */
	public boolean isvoyager; /* true if xmlns attribute on html element */
	public short versions; /* bit vector of HTML versions */
	public int doctype; /* version as given by doctype (if any) */
	public boolean badDoctype; /* e.g. if html or PUBLIC is missing */
	public int txtstart; /* start of current node */
	public int txtend; /* end of current node */
	public short state; /* state of lexer's finite state machine */
	public Node token; /* most recent token; handed out again after ungetToken() */

	/*
	 lexer character buffer

	 parse tree nodes span onto this buffer
	 which contains the concatenated text
	 contents of all of the elements.

	 lexsize must be reset for each file.
	 */
	public byte[] lexbuf; /* byte buffer of UTF-8 chars */
	public int lexlength; /* allocated */
	public int lexsize; /* used */

	/* Inline stack for compatibility with Mosaic */
	public Node inode; /* for deferring text node */
	public int insert; /* for inferring inline tags */
	public Stack istack;
	public int istackbase; /* start of frame */

	public Style styles; /* used for cleaning up presentation markup */

	public Configuration configuration;
	protected int seenBodyEndTag; /* used by parser */
	/* every node created through this lexer; lets addByte() repoint
	 node.textarray when lexbuf is reallocated */
	private Vector nodeList;

	/* classpath location of the entity definition file read by initialize() */
	private static final String ENTITIES_RESOURCE = "/org/apache/xml/serialize/HTMLEntities.res";
	/* Integer character code -> entity name (filled by defineEntity) */
	private static Hashtable _byChar;
	/* entity name -> Integer character code (filled by defineEntity) */
	private static Hashtable _byName;

	/* Load the entity tables once, when the class itself is initialized. */
	static {
		initialize();
	}

	/**
	 * Loads the entity definitions from {@link #ENTITIES_RESOURCE} into the
	 * static name/code tables.
	 *
	 * Each non-empty line that does not start with '#' is expected to look like
	 * <code>name code [ignored...]</code>, where <code>code</code> is a decimal
	 * character code. Lines whose name is shorter than two characters or that
	 * carry no code are skipped. The method does nothing when the tables have
	 * already been created.
	 *
	 * @throws RuntimeException if the resource is missing or cannot be read or
	 *         parsed; the underlying exception is attached as the cause
	 */
	private static void initialize() {
		// Make sure not to initialize twice.
		if (_byName != null) return;

		InputStream is = null;
		BufferedReader reader = null;
		try {
			_byName = new Hashtable();
			_byChar = new Hashtable();
			is = Lexer.class.getResourceAsStream(ENTITIES_RESOURCE);
			if (is == null) {
				throw new RuntimeException("SER003 The resource [" + ENTITIES_RESOURCE + "] could not be found.\n"
						+ ENTITIES_RESOURCE);
			}
			// Only entity names and decimal digits are parsed here, so decode with a
			// fixed ASCII-compatible charset instead of the platform default.
			reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));

			String line;
			while ((line = reader.readLine()) != null) {
				// skip blank lines and comments
				if (line.length() == 0 || line.charAt(0) == '#') continue;

				int index = line.indexOf(' ');
				if (index <= 1) continue; // no separator, or name too short

				String name = line.substring(0, index);
				++index;
				if (index >= line.length()) continue; // nothing after the name

				String value = line.substring(index);
				index = value.indexOf(' ');
				if (index > 0) value = value.substring(0, index); // drop trailing fields
				defineEntity(name, (char) Integer.parseInt(value));
			}
		} catch (Exception except) {
			// keep the original exception as the cause so the stack trace is not lost
			throw new RuntimeException("SER003 The resource [" + ENTITIES_RESOURCE + "] could not load: "
					+ except.toString() + "\n" + ENTITIES_RESOURCE + "\t" + except.toString(), except);
		} finally {
			// Closing the reader also closes the wrapped stream; fall back to the raw
			// stream when the reader was never created.
			try {
				if (reader != null) {
					reader.close();
				} else if (is != null) {
					is.close();
				}
			} catch (Exception ignored) {
				// best effort: a failure to close must not mask the real outcome
			}
		}
	}

	/**
	 * Registers an entity in both lookup tables unless the name is already known.
	 * The first definition of a name wins; the reverse (code to name) entry is
	 * written whenever a new name is accepted.
	 *
	 * @param name  entity name
	 * @param value character the entity stands for
	 */
	private static void defineEntity(String name, char value) {
		if (_byName.get(name) != null) {
			return; // name already defined, keep the earlier mapping
		}
		final Integer forward = new Integer(value);
		_byName.put(name, forward);
		final Integer reverseKey = new Integer(value);
		_byChar.put(reverseKey, name);
	}

	/**
	 * Creates a lexer reading from the given stream.
	 *
	 * @param in            source of characters
	 * @param configuration settings consulted while lexing
	 */
	public Lexer(StreamIn in, Configuration configuration) {
		/* collaborators */
		this.in = in;
		this.configuration = configuration;
		this.styles = null;

		/* position and state machine */
		this.lines = 1;
		this.columns = 1;
		this.state = LEX_CONTENT;
		this.token = null;
		this.pushed = false;
		this.waswhite = false;
		this.insertspace = false;
		this.exiled = false;
		this.seenBodyEndTag = 0;

		/* diagnostics counters */
		this.warnings = 0;
		this.errors = 0;
		this.badAccess = 0;
		this.badLayout = 0;
		this.badChars = 0;
		this.badForm = 0;

		/* document version tracking */
		this.versions = Dict.VERS_EVERYTHING;
		this.doctype = Dict.VERS_UNKNOWN;
		this.badDoctype = false;
		this.isvoyager = false;

		/* character buffer starts out unallocated */
		this.lexbuf = null;
		this.lexlength = 0;
		this.lexsize = 0;
		this.txtstart = 0;
		this.txtend = 0;

		/* inline stack */
		this.istack = new Stack();
		this.istackbase = 0;
		this.inode = null;
		this.insert = -1;

		/* registry of created nodes */
		this.nodeList = new Vector();
	}

	/**
	 * Creates an empty node and records it in this lexer's node registry.
	 *
	 * @return the new node
	 */
	public Node newNode() {
		final Node created = new Node();
		this.nodeList.addElement(created);
		return created;
	}

	/**
	 * Creates a node spanning <code>start..end</code> of the given text array and
	 * records it in this lexer's node registry.
	 *
	 * @param type      node type
	 * @param textarray buffer holding the node text
	 * @param start     start offset within <code>textarray</code>
	 * @param end       end offset within <code>textarray</code>
	 * @return the new node
	 */
	public Node newNode(short type, byte[] textarray, int start, int end) {
		final Node created = new Node(type, textarray, start, end);
		this.nodeList.addElement(created);
		return created;
	}

	/**
	 * Creates a named element node (resolved against the configured tag table)
	 * and records it in this lexer's node registry.
	 *
	 * @param type      node type
	 * @param textarray buffer holding the node text
	 * @param start     start offset within <code>textarray</code>
	 * @param end       end offset within <code>textarray</code>
	 * @param element   element name
	 * @return the new node
	 */
	public Node newNode(short type, byte[] textarray, int start, int end, String element) {
		final Node created = new Node(type, textarray, start, end, element, this.configuration.tt);
		this.nodeList.addElement(created);
		return created;
	}

	/**
	 * Clones a node and registers the copy, together with any ASP/PHP nodes
	 * hanging off the copy's attributes, in the node registry.
	 *
	 * @param node node to clone
	 * @return the registered copy
	 */
	public Node cloneNode(Node node) {
		final Node copy = (Node) node.clone();
		this.nodeList.addElement(copy);

		AttVal current = copy.attributes;
		while (current != null) {
			if (current.asp != null) {
				this.nodeList.addElement(current.asp);
			}
			if (current.php != null) {
				this.nodeList.addElement(current.php);
			}
			current = current.next;
		}
		return copy;
	}

	/**
	 * Clones an attribute list and registers any ASP/PHP nodes attached to the
	 * cloned attributes in the node registry.
	 *
	 * @param attrs head of the attribute list to clone; may be <code>null</code>
	 * @return head of the cloned list, or <code>null</code> when there were no
	 *         attributes to clone
	 */
	public AttVal cloneAttributes(AttVal attrs) {
		// An element without attributes has a null list; nothing to clone.
		if (attrs == null) {
			return null;
		}

		AttVal cattrs = (AttVal) attrs.clone();
		for (AttVal att = cattrs; att != null; att = att.next) {
			if (att.asp != null) nodeList.addElement(att.asp);
			if (att.php != null) nodeList.addElement(att.php);
		}
		return cattrs;
	}

	/**
	 * Repoints every registered node that still refers to the old text array
	 * (compared by identity) at the new one.
	 *
	 * @param oldtextarray array being replaced
	 * @param newtextarray replacement array
	 */
	protected void updateNodeTextArrays(byte[] oldtextarray, byte[] newtextarray) {
		final int count = nodeList.size();
		int index = 0;
		while (index < count) {
			final Node candidate = (Node) nodeList.elementAt(index);
			if (candidate.textarray == oldtextarray) {
				candidate.textarray = newtextarray;
			}
			index++;
		}
	}

	/* used for creating preformatted text from Word2000 */
	/**
	 * Creates a registered node whose text is a single newline appended to the
	 * lexer buffer.
	 *
	 * @return node spanning the newly appended '\n'
	 */
	public Node newLineNode() {
		Node node = newNode();

		node.start = this.lexsize;
		addCharToLexer((int) '\n');
		node.end = this.lexsize;
		// Bind the buffer only after appending: when lexbuf is still null the
		// first append allocates it without repointing registered nodes, so a
		// reference captured beforehand would stay null.
		node.textarray = this.lexbuf;
		return node;
	}

	// Should always be able convert to/from UTF-8, so encoding exceptions are
	// converted to an Error to avoid adding throws declarations in
	// lots of methods.

	/**
	 * Encodes a string as UTF-8 bytes.
	 *
	 * @param str string to encode
	 * @return the UTF-8 encoded bytes
	 * @throws Error if the runtime does not support the UTF-8 encoding
	 */
	public static byte[] getBytes(String str) {
		final byte[] encoded;
		try {
			encoded = str.getBytes("UTF8");
		} catch (java.io.UnsupportedEncodingException e) {
			throw new Error("string to UTF-8 conversion failed: " + e.getMessage());
		}
		return encoded;
	}

	/**
	 * Decodes a slice of a byte array as UTF-8.
	 *
	 * @param bytes  source bytes
	 * @param offset index of the first byte to decode
	 * @param length number of bytes to decode
	 * @return the decoded string
	 * @throws Error if the runtime does not support the UTF-8 encoding
	 */
	public static String getString(byte[] bytes, int offset, int length) {
		final String decoded;
		try {
			decoded = new String(bytes, offset, length, "UTF8");
		} catch (java.io.UnsupportedEncodingException e) {
			throw new Error("UTF-8 to string conversion failed: " + e.getMessage());
		}
		return decoded;
	}

	/**
	 * Reports whether the underlying input stream is exhausted.
	 *
	 * @return whatever the input stream reports for end-of-stream
	 */
	public boolean endOfInput() {
		final boolean exhausted = in.isEndOfStream();
		return exhausted;
	}

	/**
	 * Appends one byte to the lexer buffer, growing the buffer when needed.
	 * Capacity starts at 8192 and doubles until the byte and a trailing zero
	 * fit. When an existing buffer is replaced, its contents are copied over
	 * and all registered nodes are repointed at the new array.
	 *
	 * @param c value to store (narrowed to a byte)
	 */
	public void addByte(int c) {
		final boolean needsRoom = this.lexsize + 1 >= this.lexlength;
		if (needsRoom) {
			int capacity = this.lexlength;
			do {
				capacity = (capacity == 0) ? 8192 : capacity * 2;
			} while (this.lexsize + 1 >= capacity);
			this.lexlength = capacity;

			final byte[] previous = this.lexbuf;
			this.lexbuf = new byte[capacity];
			if (previous != null) {
				System.arraycopy(previous, 0, this.lexbuf, 0, previous.length);
				updateNodeTextArrays(previous, this.lexbuf);
			}
		}

		this.lexbuf[this.lexsize] = (byte) c;
		this.lexsize++;
		this.lexbuf[this.lexsize] = (byte) '\0'; /* debug */
	}

	/**
	 * Overwrites the most recently stored byte; does nothing while the buffer
	 * is empty.
	 *
	 * @param c replacement byte
	 */
	public void changeChar(byte c) {
		if (this.lexsize <= 0) {
			return;
		}
		final int last = this.lexsize - 1;
		this.lexbuf[last] = c;
	}

	/* store char c as UTF-8 encoded byte stream */
	/**
	 * Stores a character code in the lexer buffer as a UTF-8 style byte
	 * sequence: one byte below 128, then two to five bytes for larger values
	 * (lead byte followed by 6-bit continuation bytes).
	 *
	 * @param c character code to store
	 */
	public void addCharToLexer(int c) {
		if (c < 128) {
			addByte(c);
			return;
		}

		final int lead;
		final int continuationBytes;
		if (c <= 0x7FF) {
			lead = 0xC0 | (c >> 6);
			continuationBytes = 1;
		} else if (c <= 0xFFFF) {
			lead = 0xE0 | (c >> 12);
			continuationBytes = 2;
		} else if (c <= 0x1FFFFF) {
			lead = 0xF0 | (c >> 18);
			continuationBytes = 3;
		} else {
			lead = 0xF8 | (c >> 24);
			continuationBytes = 4;
		}

		addByte(lead);
		/* emit the remaining payload six bits at a time, high bits first */
		for (int shift = 6 * (continuationBytes - 1); shift >= 0; shift -= 6) {
			addByte(0x80 | ((c >> shift) & 0x3F));
		}
	}

	/**
	 * Appends every char of the string to the lexer buffer, one at a time.
	 *
	 * @param str text to append
	 */
	public void addStringToLexer(String str) {
		final int count = str.length();
		int pos = 0;
		while (pos < count) {
			addCharToLexer((int) str.charAt(pos));
			pos++;
		}
	}

	/*
	 No longer attempts to insert missing ';' for unknown
	 entities unless one was present already, since this
	 gives unexpected results.

	 For example:   <a href="something.htm?foo&bar&fred">
	 was tidied to: <a href="something.htm?foo&bar;&fred;">
	 rather than:   <a href="something.htm?foo&bar&fred">

	 My thanks to Maurice Buxton for spotting this.
	 */
	/**
	 * Parses an entity reference whose leading '&amp;' is the last byte in the
	 * lexer buffer, and replaces it in the buffer with its resolved form.
	 *
	 * The entity text is read from the input and appended to the buffer, then
	 * looked up in the default entity table. Unrecognized entities are left in
	 * the buffer as written and reported; recognized ones are removed again
	 * and replaced either by a decimal numeric reference (when the character is
	 * listed in the table loaded from the entities resource) or by the
	 * character itself.
	 *
	 * @param mode token mode; when its Preformatted bit is set, a non-breaking
	 *             space (160) is turned into a plain space
	 */
	public void parseEntity(short mode) {
		short map;
		int start;
		boolean first = true;
		boolean semicolon = false;
		boolean numeric = false;
		int c, ch, startcol;
		String str;

		start = this.lexsize - 1; /* to start at "&" */
		startcol = this.in.curcol - 1;

		/* collect the entity name (or number) into the lexer buffer */
		while (true) {
			c = this.in.readChar();
			if (c == StreamIn.EndOfStream) break;
			if (c == ';') {
				/* the terminator itself is not stored in the buffer here */
				semicolon = true;
				break;
			}

			/* '#' directly after '&' marks a numeric entity */
			if (first && c == '#') {
				addCharToLexer(c);
				first = false;
				numeric = true;
				continue;
			}

			first = false;
			map = MAP((char) c);

			/* AQ: Added flag for numeric entities so that numeric entities
			 with missing semi-colons are recognized.
			 Eg. "&#114&#101&#112..." is recognized as "rep"
			 */
			if (numeric && ((c == 'x') || ((map & DIGIT) != 0))) {
				addCharToLexer(c);
				continue;
			}
			if (!numeric && ((map & NAMECHAR) != 0)) {
				addCharToLexer(c);
				continue;
			}

			/* otherwise put it back */

			this.in.ungetChar(c);
			break;
		}

		/* str is the entity as collected, starting at the '&' */
		str = getString(this.lexbuf, start, this.lexsize - start);
		ch = EntityTable.getDefaultEntityTable().entityCode(str);

		/* deal with unrecognized entities */
		if (ch <= 0) {
			/* set error position just before offending character */
			this.lines = this.in.curline;
			this.columns = startcol;

			if (this.lexsize > start + 1) {
				Report.entityError(this, Report.UNKNOWN_ENTITY, str, ch);

				/* keep the text as written, restoring ';' only if it was there */
				if (semicolon) addCharToLexer(';');
			} else /* naked & */
			{
				Report.entityError(this, Report.UNESCAPED_AMPERSAND, str, ch);
			}
		} else {
			if (c != ';') /* issue warning if not terminated by ';' */
			{
				/* set error position just before offending character */
				this.lines = this.in.curline;
				this.columns = startcol;
				Report.entityError(this, Report.MISSING_SEMICOLON, str, c);
			}

			/* rewind the buffer so the entity text is overwritten */
			this.lexsize = start;

			/* note: the bit test is also true for IgnoreMarkup (3), not only Preformatted (2) */
			if (ch == 160 && (mode & Preformatted) != 0) ch = ' ';

			Integer charIntVal = new Integer(ch);
			String retObj = (String) _byChar.get(charIntVal);
			if (retObj != null) {
				// the character is listed in the entities resource:
				// emit it as a decimal numeric reference, e.g. "&#160;"
				addCharToLexer('&');
				addCharToLexer('#');
				char[] addArr = charIntVal.toString().toCharArray();
				for (int i = 0; i < addArr.length; i++)
					addCharToLexer(addArr[i]);
				addCharToLexer(';');
			}
			/*
			 A hand-written chain of special cases used to sit here (commented
			 out): it emitted fixed references for '<', '>', a few dashes and
			 quotes and some Greek letters. The table lookup above supersedes it.
			 */
			else {
				/* not in the table: store the character itself */
				addCharToLexer(ch);
				if (ch == '&' && this.configuration.QuoteAmpersand) {
					addCharToLexer('a');
					addCharToLexer('m');
					addCharToLexer('p');
					addCharToLexer(';');
				}
			}
		}
	}

	/**
	 * Reads the remainder of a tag name from the input into the lexer buffer.
	 * Unless XML tags are configured, upper-case characters are folded to lower
	 * case, including the first character already stored at
	 * <code>txtstart</code>. Sets <code>txtend</code> to the end of the name.
	 *
	 * @return the character that ended the name (or the end-of-stream marker
	 *         narrowed to a char)
	 */
	public char parseTagName() {
		final int lowerShift = (int) 'a' - (int) 'A';

		/* fold case of first char in buffer */
		int ch = this.lexbuf[this.txtstart];
		short flags = MAP((char) ch);
		if (!this.configuration.XmlTags && (flags & UPPERCASE) != 0) {
			ch += lowerShift;
			this.lexbuf[this.txtstart] = (byte) ch;
		}

		for (ch = this.in.readChar(); ch != StreamIn.EndOfStream; ch = this.in.readChar()) {
			flags = MAP((char) ch);
			if ((flags & NAMECHAR) == 0) {
				break; /* first character that cannot be part of a name */
			}

			/* fold case of subsequent chars */
			if (!this.configuration.XmlTags && (flags & UPPERCASE) != 0) {
				ch += lowerShift;
			}

			addCharToLexer(ch);
		}

		this.txtend = this.lexsize;
		return (char) ch;
	}

	/**
	 * Appends the chars of a literal string to the lexer buffer in order.
	 *
	 * @param str text to append
	 */
	public void addStringLiteral(String str) {
		final char[] chars = str.toCharArray();
		for (int k = 0; k < chars.length; k++) {
			addCharToLexer((int) chars[k]);
		}
	}

	/* choose what version to use for new doctype */
	/**
	 * Chooses the version to use for a new doctype: the first entry, in a
	 * fixed order of preference, whose bit is still set in
	 * <code>versions</code>.
	 *
	 * @return the chosen version code, or <code>Dict.VERS_UNKNOWN</code> when
	 *         none of the candidates matches
	 */
	public short HTMLVersion() {
		final short remaining = this.versions;
		final short[] preference = {
			Dict.VERS_HTML20,
			Dict.VERS_HTML32,
			Dict.VERS_HTML40_STRICT,
			Dict.VERS_HTML40_LOOSE,
			Dict.VERS_FRAMES
		};

		for (int k = 0; k < preference.length; k++) {
			if ((remaining & preference[k]) != 0) {
				return preference[k];
			}
		}
		return Dict.VERS_UNKNOWN;
	}

	/**
	 * Looks up the display name for the document's apparent version.
	 *
	 * @return the matching entry's <code>voyagerName</code> when
	 *         <code>isvoyager</code> is set, otherwise its <code>name</code>;
	 *         <code>null</code> when no entry has the apparent version code
	 */
	public String HTMLVersionName() {
		final short apparent = apparentVersion();

		int idx = 0;
		while (idx < W3CVersion.length) {
			if (W3CVersion[idx].code == apparent) {
				return this.isvoyager ? W3CVersion[idx].voyagerName : W3CVersion[idx].name;
			}
			++idx;
		}

		return null;
	}

	/* add meta element for Tidy */
	/**
	 * Adds a <code>meta name="generator"</code> element for Tidy at the start
	 * of the document head, unless the head already contains a generator meta
	 * whose content starts with "HTML Tidy" (compared via
	 * <code>wstrcasecmp</code>).
	 *
	 * @param root document root
	 * @return <code>true</code> if the element was inserted; <code>false</code>
	 *         if there is no head or a Tidy generator meta already exists
	 */
	public boolean addGenerator(Node root) {
		final Node head = root.findHEAD(configuration.tt);
		if (head == null) {
			return false;
		}

		for (Node child = head.content; child != null; child = child.next) {
			if (child.tag != configuration.tt.tagMeta) {
				continue;
			}

			final AttVal nameAttr = child.getAttrByName("name");
			if (nameAttr == null || nameAttr.value == null) {
				continue;
			}
			if (Lexer.wstrcasecmp(nameAttr.value, "generator") != 0) {
				continue;
			}

			final AttVal contentAttr = child.getAttrByName("content");
			if (contentAttr == null || contentAttr.value == null || contentAttr.value.length() < 9) {
				continue;
			}
			if (Lexer.wstrcasecmp(contentAttr.value.substring(0, 9), "HTML Tidy") == 0) {
				return false; /* already tagged by Tidy */
			}
		}

		final Node meta = this.inferredTag("meta");
		meta.addAttribute("content", "HTML Tidy, see www.w3.org");
		meta.addAttribute("name", "generator");
		Node.insertNodeAtStart(head, meta);
		return true;
	}

	/* return true if substring s is in p and isn't all in upper case */
	/* this is used to check the case of SYSTEM, PUBLIC, DTD and EN */
	/* len is how many chars to check in p */
	/**
	 * Finds the first occurrence of keyword <code>s</code> in <code>p</code>
	 * (matched via <code>wstrcasecmp</code>) and reports whether that
	 * occurrence is spelled differently from <code>s</code> itself. Used to
	 * check the case of SYSTEM, PUBLIC, DTD and EN in a doctype.
	 *
	 * @param s   keyword in its expected spelling
	 * @param p   text to search
	 * @param len how many leading chars of <code>p</code> to examine; values
	 *            larger than <code>p.length()</code> are clamped
	 * @return <code>true</code> if the first match differs from <code>s</code>;
	 *         <code>false</code> if it is identical or there is no match
	 */
	private static boolean findBadSubString(String s, String p, int len) {
		final int n = s.length();
		// Callers pass a byte count, which can exceed the decoded string length
		// when the text holds multi-byte characters; never index past p.
		final int limit = Math.min(len, p.length());

		// "<=" so that a keyword sitting at the very end of the range is checked too.
		for (int i = 0; i + n <= limit; i++) {
			final String ps = p.substring(i, i + n);
			if (wstrcasecmp(s, ps) == 0) {
				return !ps.equals(s);
			}
		}

		return false;
	}

	/**
	 * Checks the spelling of the doctype keywords SYSTEM, PUBLIC, //DTD, //W3C
	 * and //EN in the text of a doctype node.
	 *
	 * @param doctype doctype node whose text lies in the lexer buffer
	 * @return <code>false</code> as soon as one keyword is reported as badly
	 *         spelled by <code>findBadSubString</code>, otherwise
	 *         <code>true</code>
	 */
	public boolean checkDocTypeKeyWords(Node doctype) {
		final int length = doctype.end - doctype.start;
		final String text = getString(this.lexbuf, doctype.start, length);
		final String[] keywords = { "SYSTEM", "PUBLIC", "//DTD", "//W3C", "//EN" };

		for (int k = 0; k < keywords.length; k++) {
			if (findBadSubString(keywords[k], text, length)) {
				return false;
			}
		}
		return true;
	}

	/* examine <!DOCTYPE> to identify version */
	/**
	 * Examines the text of a DOCTYPE node to identify the declared version.
	 *
	 * The doctype text is expected to start with "html " followed by either
	 * "SYSTEM " or "PUBLIC "; a mis-cased SYSTEM/PUBLIC keyword is rewritten in
	 * upper case directly in the lexer buffer. For a PUBLIC doctype the first
	 * quoted string is inspected: after "-//W3C//DTD " the identifier is
	 * matched against entries 1.. of <code>W3CVersion</code>, after
	 * "-//IETF//DTD " against entry 0.
	 *
	 * Side effects: may warn about keyword case, may patch the buffer, and sets
	 * <code>badDoctype</code> when neither SYSTEM nor PUBLIC follows "html ".
	 *
	 * @param doctype doctype node whose text lies in the lexer buffer
	 * @return the matching version code, or 0 when the doctype is not
	 *         recognized
	 */
	public short findGivenVersion(Node doctype) {
		String p, s;
		int i, j;
		int len;
		String str1;
		String str2;

		/* if root tag for doctype isn't html give up now */
		str1 = getString(this.lexbuf, doctype.start, 5);
		if (wstrcasecmp(str1, "html ") != 0) return 0;

		if (!checkDocTypeKeyWords(doctype)) Report.warning(this, doctype, null, Report.DTYPE_NOT_UPPER_CASE);

		/* give up if all we are given is the system id for the doctype */
		/* the keyword is read from the 7 bytes right after "html " */
		str1 = getString(this.lexbuf, doctype.start + 5, 7);
		if (wstrcasecmp(str1, "SYSTEM ") == 0) {
			/* but at least ensure the case is correct */
			if (!str1.substring(0, 6).equals("SYSTEM"))
					System.arraycopy(getBytes("SYSTEM"), 0, this.lexbuf, doctype.start + 5, 6);
			return 0; /* unrecognized */
		}

		if (wstrcasecmp(str1, "PUBLIC ") == 0) {
			/* only the 6 keyword bytes are overwritten; the following space stays */
			if (!str1.substring(0, 6).equals("PUBLIC"))
					System.arraycopy(getBytes("PUBLIC "), 0, this.lexbuf, doctype.start + 5, 6);
		} else
			this.badDoctype = true;

		/* look for the first double quote: the public identifier follows it */
		for (i = doctype.start; i < doctype.end; ++i) {
			if (this.lexbuf[i] == (byte) '"') {
				/* 12 bytes cover "-//W3C//DTD ", 13 bytes cover "-//IETF//DTD " */
				str1 = getString(this.lexbuf, i + 1, 12);
				str2 = getString(this.lexbuf, i + 1, 13);
				if (str1.equals("-//W3C//DTD ")) {
					/* compute length of identifier e.g. "HTML 4.0 Transitional" */
					for (j = i + 13; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j);
					len = j - i - 13;
					p = getString(this.lexbuf, i + 13, len);

					/* entry 0 is skipped here; it is only used for the IETF prefix below */
					for (j = 1; j < W3CVersion.length; ++j) {
						s = W3CVersion[j].name;
						if (len == s.length() && s.equals(p)) return W3CVersion[j].code;
					}

					/* else unrecognized version */
				} else if (str2.equals("-//IETF//DTD ")) {
					/* compute length of identifier e.g. "HTML 2.0" */
					for (j = i + 14; j < doctype.end && this.lexbuf[j] != (byte) '/'; ++j);
					len = j - i - 14;

					p = getString(this.lexbuf, i + 14, len);
					s = W3CVersion[0].name;
					if (len == s.length() && s.equals(p)) return W3CVersion[0].code;

					/* else unrecognized version */
				}
				/* only the first quoted string is examined */
				break;
			}
		}

		return 0;
	}

	/**
	 * Makes sure the html element among the root's children carries an
	 * <code>xmlns</code> attribute equal to the given profile. An existing
	 * attribute with a different (or missing) value is overwritten and a
	 * warning is issued; when there is no such attribute, one is added at the
	 * front of the attribute list. Does nothing if no html element is found.
	 *
	 * @param root    document root
	 * @param profile namespace value to enforce
	 */
	public void fixHTMLNameSpace(Node root, String profile) {
		Node node;
		AttVal attr;

		/* locate the html element among the root's children */
		for (node = root.content; node != null && node.tag != configuration.tt.tagHtml; node = node.next);

		if (node == null) {
			return;
		}

		for (attr = node.attributes; attr != null; attr = attr.next) {
			// constant-first comparison: tolerates an entry without an attribute name
			if ("xmlns".equals(attr.attribute)) break;
		}

		if (attr != null) {
			// null-safe comparison: a valueless xmlns attribute counts as inconsistent
			final boolean consistent = (attr.value == null) ? (profile == null) : attr.value.equals(profile);
			if (!consistent) {
				Report.warning(this, node, null, Report.INCONSISTENT_NAMESPACE);
				attr.value = profile;
			}
		} else {
			/* prepend a new xmlns attribute */
			attr = new AttVal(node.attributes, null, (int) '"', "xmlns", profile);
			attr.dict = AttributeTable.getDefaultAttributeTable().findAttribute(attr);
			node.attributes = attr;
		}
	}

	/**
	 * Writes an XHTML doctype for the document and enforces the XHTML
	 * namespace on the html element.
	 *
	 * The public and system identifiers are chosen from the configured doctype
	 * mode: omitted, auto-detected from <code>versions</code>, strict, loose,
	 * or user supplied. A doctype node is created at the start of the document
	 * when none exists; its text is appended to the lexer buffer.
	 *
	 * @param root document root
	 * @return <code>true</code> when the doctype mode is "omit" (any existing
	 *         doctype is discarded), <code>false</code> after a doctype has
	 *         been written
	 */
	public boolean setXHTMLDocType(Node root) {
		String fpi = " ";
		String sysid = "";
		String namespace = XHTML_NAMESPACE;
		Node doctype;

		doctype = root.findDocType();

		if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
			if (doctype != null) Node.discardElement(doctype);
			return true;
		}

		if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
			/* see what flavor of XHTML this document matches */
			if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) { /* use XHTML strict */
				fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
				sysid = voyager_strict;
			} else if ((this.versions & Dict.VERS_LOOSE) != 0) {
				fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
				sysid = voyager_loose;
			} else if ((this.versions & Dict.VERS_FRAMES) != 0) { /* use XHTML frames */
				fpi = "-//W3C//DTD XHTML 1.0 Frameset//EN";
				sysid = voyager_frameset;
			} else /* lets assume XHTML transitional */
			{
				fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
				sysid = voyager_loose;
			}
		} else if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
			fpi = "-//W3C//DTD XHTML 1.0 Strict//EN";
			sysid = voyager_strict;
		} else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
			fpi = "-//W3C//DTD XHTML 1.0 Transitional//EN";
			sysid = voyager_loose;
		}

		fixHTMLNameSpace(root, namespace);

		if (doctype == null) {
			doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
			doctype.next = root.content;
			doctype.parent = root;
			doctype.prev = null;
			// keep the sibling list doubly linked: the old first child must
			// point back at the new doctype
			if (root.content != null) root.content.prev = doctype;
			root.content = doctype;
		}

		if (configuration.docTypeMode == Configuration.DOCTYPE_USER && configuration.docTypeStr != null) {
			fpi = configuration.docTypeStr;
			sysid = "";
		}

		this.txtstart = this.lexsize;
		this.txtend = this.lexsize;

		/* add public identifier */
		addStringLiteral("html PUBLIC ");

		/* check if the fpi is quoted or not (an empty string counts as unquoted) */
		if (fpi.length() > 0 && fpi.charAt(0) == '"')
			addStringLiteral(fpi);
		else {
			addStringLiteral("\"");
			addStringLiteral(fpi);
			addStringLiteral("\"");
		}

		/* put the system identifier on its own line, indented when it fits */
		if (sysid.length() + 6 >= this.configuration.wraplen)
			addStringLiteral("\n\"");
		else
			addStringLiteral("\n    \"");

		/* add system identifier */
		addStringLiteral(sysid);
		addStringLiteral("\"");

		this.txtend = this.lexsize;

		// Rebind the buffer: a node created while lexbuf was still null is not
		// repointed when the buffer is first allocated.
		doctype.textarray = this.lexbuf;
		doctype.start = this.txtstart;
		doctype.end = this.txtend;

		return false;
	}

	/**
	 * Determines the version the document appears to be. When no doctype
	 * version is known, the choice is left to <code>HTMLVersion()</code>. When
	 * the declared version is one of the known codes and its bit is still set
	 * in <code>versions</code>, that code is returned. In every other case an
	 * INCONSISTENT_VERSION warning is issued and <code>HTMLVersion()</code>
	 * decides.
	 *
	 * @return the apparent version code
	 */
	public short apparentVersion() {
		if (this.doctype == Dict.VERS_UNKNOWN) {
			return HTMLVersion();
		}

		final short[] declarable = {
			Dict.VERS_HTML20,
			Dict.VERS_HTML32,
			Dict.VERS_HTML40_STRICT,
			Dict.VERS_HTML40_LOOSE,
			Dict.VERS_FRAMES
		};

		for (int k = 0; k < declarable.length; k++) {
			if (this.doctype != declarable[k]) {
				continue;
			}
			if ((this.versions & declarable[k]) != 0) {
				return declarable[k];
			}
			break; /* declared version contradicts the content seen */
		}

		Report.warning(this, null, null, Report.INCONSISTENT_VERSION);
		return this.HTMLVersion();
	}

	/* fixup doctype if missing */
	/**
	 * Fixes up the doctype of an HTML document according to the configured
	 * doctype mode.
	 *
	 * Depending on the mode the existing doctype is discarded, kept, or
	 * replaced by one matching the guessed version. For XML/voyager output no
	 * doctype is written; the existing one is discarded and the namespace of
	 * the html element is set from the matching <code>W3CVersion</code>
	 * profile instead.
	 *
	 * @param root document root
	 * @return <code>false</code> when no version could be determined (or, in
	 *         auto mode, when a doctype exists but its version is unknown);
	 *         <code>true</code> otherwise
	 */
	public boolean fixDocType(Node root) {
		Node doctype;
		int guessed = Dict.VERS_HTML40_STRICT, i;

		if (this.badDoctype) Report.warning(this, null, null, Report.MALFORMED_DOCTYPE);

		if (configuration.XmlOut) return true;

		doctype = root.findDocType();

		if (configuration.docTypeMode == Configuration.DOCTYPE_OMIT) {
			if (doctype != null) Node.discardElement(doctype);
			return true;
		}

		if (configuration.docTypeMode == Configuration.DOCTYPE_STRICT) {
			// the document may have no doctype at all; guard like the other call sites
			if (doctype != null) Node.discardElement(doctype);
			doctype = null;
			guessed = Dict.VERS_HTML40_STRICT;
		} else if (configuration.docTypeMode == Configuration.DOCTYPE_LOOSE) {
			if (doctype != null) Node.discardElement(doctype);
			doctype = null;
			guessed = Dict.VERS_HTML40_LOOSE;
		} else if (configuration.docTypeMode == Configuration.DOCTYPE_AUTO) {
			if (doctype != null) {
				/* keep the existing doctype when it agrees with the content seen */
				switch (this.doctype) {
					case Dict.VERS_UNKNOWN:
						return false;

					case Dict.VERS_HTML20:
						if ((this.versions & Dict.VERS_HTML20) != 0) return true;

						break; /* to replace old version by new */

					case Dict.VERS_HTML32:
						if ((this.versions & Dict.VERS_HTML32) != 0) return true;

						break; /* to replace old version by new */

					case Dict.VERS_HTML40_STRICT:
						if ((this.versions & Dict.VERS_HTML40_STRICT) != 0) return true;

						break; /* to replace old version by new */

					case Dict.VERS_HTML40_LOOSE:
						if ((this.versions & Dict.VERS_HTML40_LOOSE) != 0) return true;

						break; /* to replace old version by new */

					case Dict.VERS_FRAMES:
						if ((this.versions & Dict.VERS_FRAMES) != 0) return true;

						break; /* to replace old version by new */
				}

				/* INCONSISTENT_VERSION warning is now issued by ApparentVersion() */
			}

			/* choose new doctype */
			guessed = HTMLVersion();
		}

		if (guessed == Dict.VERS_UNKNOWN) return false;

		/* for XML use the Voyager system identifier */
		if (this.configuration.XmlOut || this.configuration.XmlTags || this.isvoyager) {
			if (doctype != null) Node.discardElement(doctype);

			for (i = 0; i < W3CVersion.length; ++i) {
				if (guessed == W3CVersion[i].code) {
					fixHTMLNameSpace(root, W3CVersion[i].profile);
					break;
				}
			}

			return true;
		}

		if (doctype == null) {
			doctype = newNode(Node.DocTypeTag, this.lexbuf, 0, 0);
			doctype.next = root.content;
			doctype.parent = root;
			doctype.prev = null;
			// keep the sibling list doubly linked: the old first child must
			// point back at the new doctype
			if (root.content != null) root.content.prev = doctype;
			root.content = doctype;
		}

		this.txtstart = this.lexsize;
		this.txtend = this.lexsize;

		/* use the appropriate public identifier */
		addStringLiteral("html PUBLIC ");

		if (configuration.docTypeMode == Configuration.DOCTYPE_USER && configuration.docTypeStr != null)
			addStringLiteral(configuration.docTypeStr);
		else if (guessed == Dict.VERS_HTML20)
			addStringLiteral("\"-//IETF//DTD HTML 2.0//EN\"");
		else {
			addStringLiteral("\"-//W3C//DTD ");

			for (i = 0; i < W3CVersion.length; ++i) {
				if (guessed == W3CVersion[i].code) {
					addStringLiteral(W3CVersion[i].name);
					break;
				}
			}

			addStringLiteral("//EN\"");
		}

		this.txtend = this.lexsize;

		// Rebind the buffer: a node created while lexbuf was still null is not
		// repointed when the buffer is first allocated.
		doctype.textarray = this.lexbuf;
		doctype.start = this.txtstart;
		doctype.end = this.txtend;

		return true;
	}

	/* ensure XML document starts with <?xml version="1.0"?> */
	/**
	 * Ensures the document starts with an XML declaration. If the first child
	 * of the root is a processing instruction whose text starts with "xml",
	 * nothing is changed. Otherwise a processing instruction
	 * <code>xml version="1.0"</code> (plus an ISO-8859-1 encoding for Latin-1
	 * output) is inserted as the first child.
	 *
	 * @param root document root
	 * @return <code>true</code> if a declaration was already present,
	 *         <code>false</code> if one had to be inserted
	 */
	public boolean fixXMLPI(Node root) {
		final Node first = root.content;

		if (first != null && first.type == Node.ProcInsTag) {
			final int s = first.start;

			// Only look at bytes that belong to this node; a shorter
			// instruction must not borrow bytes from whatever follows it.
			if (first.end - s >= 3 && this.lexbuf[s] == (byte) 'x' && this.lexbuf[s + 1] == (byte) 'm'
					&& this.lexbuf[s + 2] == (byte) 'l') return true;
		}

		final Node xml = newNode(Node.ProcInsTag, this.lexbuf, 0, 0);

		/* link the new node in as first child of the root */
		xml.parent = root;
		xml.next = first;
		if (first != null) {
			first.prev = xml;
		}
		root.content = xml;

		this.txtstart = this.lexsize;
		this.txtend = this.lexsize;
		addStringLiteral("xml version=\"1.0\"");
		if (this.configuration.CharEncoding == Configuration.LATIN1) addStringLiteral(" encoding=\"ISO-8859-1\"");
		this.txtend = this.lexsize;

		// Rebind the buffer: a node created while lexbuf was still null is not
		// repointed when the buffer is first allocated.
		xml.textarray = this.lexbuf;
		xml.start = this.txtstart;
		xml.end = this.txtend;
		return false;
	}

	/**
	 * Creates a start-tag node for the named element, spanning the current
	 * <code>txtstart..txtend</code>, and flags it as implicit.
	 *
	 * @param name element name
	 * @return the new implicit start tag
	 */
	public Node inferredTag(String name) {
		final Node inferred = newNode(Node.StartTag, this.lexbuf, this.txtstart, this.txtend, name);
		inferred.implicit = true;
		return inferred;
	}

	/**
	 * Tells whether a node is a start tag that may have content: either its
	 * element is unknown (no tag entry) or the tag's content model does not
	 * include <code>Dict.CM_EMPTY</code>.
	 *
	 * @param node node to test
	 * @return <code>true</code> if content is expected
	 */
	public static boolean expectsContent(Node node) {
		final boolean isStartTag = node.type == Node.StartTag;
		return isStartTag && (node.tag == null /* unknown element */
				|| (node.tag.model & Dict.CM_EMPTY) == 0);
	}

	/*
	 create a text node for the contents of
	 a CDATA element like style or script
	 which ends with </foo> for some foo.
	 */
	/**
	 * Reads raw text from the input up to the end tag that closes the given
	 * container and returns it as a text node.
	 *
	 * Every character is copied into the lexer buffer. A "&lt;/name&gt;"
	 * sequence ends the text only if <code>name</code> matches the container's
	 * element name (compared via <code>wstrcasecmp</code>); any other end tag
	 * is reported as bad CDATA content and, inside JavaScript, gets a backslash
	 * inserted before its '/'. Carriage returns and CR/LF pairs are stored as a
	 * single '\n'.
	 *
	 * @param container element whose content is being read
	 * @return a text node for the content (also stored in <code>token</code>),
	 *         or <code>null</code> when the content is empty
	 */
	public Node getCDATA(Node container) {
		int c, lastc, start, len, i;
		String str;
		boolean endtag = false;

		this.lines = this.in.curline;
		this.columns = this.in.curcol;
		this.waswhite = false;
		this.txtstart = this.lexsize;
		this.txtend = this.lexsize;

		lastc = (int) '\0';
		start = -1; /* buffer offset of the first letter of a candidate end tag, -1 if none */

		while (true) {
			c = this.in.readChar();
			if (c == StreamIn.EndOfStream) break;

			if (c == (int) '/' && lastc == (int) '<') {
				/* a second "</" before the previous candidate was closed */
				if (endtag) {
					this.lines = this.in.curline;
					this.columns = this.in.curcol - 3;

					Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);
				}

				start = this.lexsize + 1; /* to first letter */
				endtag = true;
			} else if (c == (int) '>' && start >= 0) {
				/* candidate end tag complete: does its name match the container? */
				len = this.lexsize - start;
				if (len == container.element.length()) {
					str = getString(this.lexbuf, start, len);
					if (Lexer.wstrcasecmp(str, container.element) == 0) {
						/* text ends just before the "</" */
						this.txtend = start - 2;
						break;
					}
				}

				this.lines = this.in.curline;
				this.columns = this.in.curcol - 3;

				Report.warning(this, null, null, Report.BAD_CDATA_CONTENT);

				/* if javascript insert backslash before / */

				if (ParserImpl.isJavaScript(container)) {
					/* shift the tail one byte to the right to make room at start - 1 */
					for (i = this.lexsize; i > start - 1; --i)
						this.lexbuf[i] = this.lexbuf[i - 1];

					this.lexbuf[start - 1] = (byte) '\\';
					this.lexsize++;
				}

				start = -1;
			} else if (c == (int) '\r') {
				/* treat \r\n as \n and \r as \n */
				c = this.in.readChar();

				if (c != (int) '\n') this.in.ungetChar(c);

				c = (int) '\n';
			}

			addCharToLexer((int) c);
			this.txtend = this.lexsize;
			lastc = c;
		}

		/* input ended before the closing tag was found */
		if (c == StreamIn.EndOfStream) Report.warning(this, container, null, Report.MISSING_ENDTAG_FOR);

		if (this.txtend > this.txtstart) {
			this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend);
			return this.token;
		}

		return null;
	}

	/**
	 * Pushes the current token back: raises the pushed flag, which getToken()
	 * checks before it reads any further input.
	 */
	public void ungetToken() {
		pushed = true;
	}

	/* getToken() mode: leading white space is discarded and a space before an end tag is trimmed */
	public static final short IgnoreWhitespace = 0;
	/* getToken() mode: runs of white space collapse to one space; IgnoreWhitespace turns into this once text is seen */
	public static final short MixedContent = 1;
	/* getToken() mode: white space is kept exactly as read */
	public static final short Preformatted = 2;
	/* getToken() mode: white space is kept, '&' is not parsed as an entity and only an end tag is recognised after '<' */
	public static final short IgnoreMarkup = 3;

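	/*
	 modes for getToken()

	 IgnoreWhitespace   -- for elements which don't accept PCDATA (leading white space is dropped)
	 MixedContent       -- for elements with mixed content (runs of white space collapse to one space)
	 Preformatted       -- white space preserved as is
	 IgnoreMarkup       -- for CDATA elements such as script, style
	 */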

	/**
	 * Returns the next token.
	 *
	 * A token pushed back with ungetToken() is returned again, unless it is a text
	 * node and inline elements are waiting to be inserted. When inline elements are
	 * waiting, the result comes from insertedToken(). Otherwise characters are read
	 * from the input and run through the state machine selected by this.state.
	 *
	 * @param mode one of IgnoreWhitespace, MixedContent, Preformatted or IgnoreMarkup;
	 *             the local copy may be changed while lexing
	 * @return the token (also stored in this.token), or null when the input ends
	 *         with no pending text or comment
	 */
	public Node getToken(short mode) {
		short map;
		int c = 0;
		int lastc;
		int badcomment = 0;
		MutableBoolean isempty = new MutableBoolean();
		AttVal attributes;

		if (this.pushed) {
			/* duplicate inlines in preference to pushed text nodes when appropriate */
			if (this.token.type != Node.TextNode || (this.insert == -1 && this.inode == null)) {
				this.pushed = false;
				return this.token;
			}
		}

		/* at start of block elements, unclosed inline
		 elements are inserted into the token stream */

		if (this.insert != -1 || this.inode != null) return insertedToken();

		this.lines = this.in.curline;
		this.columns = this.in.curcol;
		this.waswhite = false;

		this.txtstart = this.lexsize;
		this.txtend = this.lexsize;

		while (true) {
			c = this.in.readChar();
			if (c == StreamIn.EndOfStream) break;
			if (this.insertspace && mode != IgnoreWhitespace) {
				addCharToLexer(' ');
				this.waswhite = true;
				this.insertspace = false;
			}

			/* treat \r\n as \n and \r as \n */

			if (c == '\r') {
				c = this.in.readChar();

				if (c != '\n') this.in.ungetChar(c);

				c = '\n';
			}

			/* every character is buffered first; the states below remove what they do not keep */
			addCharToLexer(c);

			switch (this.state) {
				case LEX_CONTENT: /* element content */
					map = MAP((char) c);

					/*
					 Discard white space if appropriate. Its cheaper
					 to do this here rather than in parser methods
					 for elements that don't have mixed content.
					 */
					if (((map & WHITE) != 0) && (mode == IgnoreWhitespace) && this.lexsize == this.txtstart + 1) {
						--this.lexsize;
						this.waswhite = false;
						this.lines = this.in.curline;
						this.columns = this.in.curcol;
						continue;
					}

					if (c == '<') {
						this.state = LEX_GT;
						continue;
					}

					if ((map & WHITE) != 0) {
						/* was previous char white? */
						if (this.waswhite) {
							if (mode != Preformatted && mode != IgnoreMarkup) {
								--this.lexsize;
								this.lines = this.in.curline;
								this.columns = this.in.curcol;
							}
						} else /* prev char wasn't white */
						{
							this.waswhite = true;
							lastc = c;

							if (mode != Preformatted && mode != IgnoreMarkup && c != ' ') changeChar((byte) ' ');
						}

						continue;
					} else if (c == '&' && mode != IgnoreMarkup) parseEntity(mode);

					/* this is needed to avoid trimming trailing whitespace */
					if (mode == IgnoreWhitespace) mode = MixedContent;

					this.waswhite = false;
					continue;

				case LEX_GT: /* < */

					/* check for endtag */
					if (c == '/') {
						c = this.in.readChar();
						if (c == StreamIn.EndOfStream) {
							this.in.ungetChar(c);
							continue;
						}

						addCharToLexer(c);
						map = MAP((char) c);

						if ((map & LETTER) != 0) {
							/* drop the buffered '<', '/' and letter; the letter is pushed back and re-read */
							this.lexsize -= 3;
							this.txtend = this.lexsize;
							this.in.ungetChar(c);
							this.state = LEX_ENDTAG;
							this.lexbuf[this.lexsize] = (byte) '\0'; /* debug */
							this.in.curcol -= 2;

							/* if some text before the </ return it now */
							if (this.txtend > this.txtstart) {
								/* trim space char before end tag */
								if (mode == IgnoreWhitespace && this.lexbuf[this.lexsize - 1] == (byte) ' ') {
									this.lexsize -= 1;
									this.txtend = this.lexsize;
								}

								this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend);
								return this.token;
							}

							continue; /* no text so keep going */
						}

						/* otherwise treat as CDATA */
						this.waswhite = false;
						this.state = LEX_CONTENT;
						continue;
					}

					if (mode == IgnoreMarkup) {
						/* otherwise treat as CDATA */
						this.waswhite = false;
						this.state = LEX_CONTENT;
						continue;
					}

					/*
					 look out for comments, doctype or marked sections
					 this isn't quite right, but its getting there ...
					 */
					if (c == '!') {
						c = this.in.readChar();

						if (c == '-') {
							c = this.in.readChar();

							if (c == '-') {
								this.state = LEX_COMMENT; /* comment */
								/* drop the buffered "<!"; the two '-' were read without being buffered */
								this.lexsize -= 2;
								this.txtend = this.lexsize;

								/* if some text before < return it now */
								if (this.txtend > this.txtstart) {
									this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend);
									return this.token;
								}

								this.txtstart = this.lexsize;
								continue;
							}

							Report.warning(this, null, null, Report.MALFORMED_COMMENT);
						} else if (c == 'd' || c == 'D') {
							this.state = LEX_DOCTYPE; /* doctype */
							this.lexsize -= 2;
							this.txtend = this.lexsize;
							mode = IgnoreWhitespace;

							/* skip until white space or '>' */

							for (;;) {
								c = this.in.readChar();

								if (c == StreamIn.EndOfStream || c == '>') {
									this.in.ungetChar(c);
									break;
								}

								map = MAP((char) c);

								if ((map & WHITE) == 0) continue;

								/* and skip to end of whitespace */

								for (;;) {
									c = this.in.readChar();

									if (c == StreamIn.EndOfStream || c == '>') {
										this.in.ungetChar(c);
										break;
									}

									map = MAP((char) c);

									if ((map & WHITE) != 0) continue;

									this.in.ungetChar(c);
									break;
								}

								break;
							}

							/* if some text before < return it now */
							if (this.txtend > this.txtstart) {
								this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend);
								return this.token;
							}

							this.txtstart = this.lexsize;
							continue;
						} else if (c == '[') {
							/* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
							this.lexsize -= 2;
							this.state = LEX_SECTION;
							this.txtend = this.lexsize;

							/* if some text before < return it now */
							if (this.txtend > this.txtstart) {
								this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend);
								return this.token;
							}

							this.txtstart = this.lexsize;
							continue;
						}

						/* otherwise swallow chars up to and including next '>' */
						while (true) {
							c = this.in.readChar();
							if (c == '>') break;
							/* NOTE(review): literal -1 here, StreamIn.EndOfStream elsewhere in this method; confirm they are the same value */
							if (c == -1) {
								this.in.ungetChar(c);
								break;
							}
						}

						this.lexsize -= 2;
						this.lexbuf[this.lexsize] = (byte) '\0';
						this.state = LEX_CONTENT;
						continue;
					}

					/*
					 processing instructions
					 */

					if (c == '?') {
						this.lexsize -= 2;
						this.state = LEX_PROCINSTR;
						this.txtend = this.lexsize;

						/* if some text before < return it now */
						if (this.txtend > this.txtstart) {
							this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend);
							return this.token;
						}

						this.txtstart = this.lexsize;
						continue;
					}

					/* Microsoft ASP's e.g. <% ... server-code ... %> */
					if (c == '%') {
						this.lexsize -= 2;
						this.state = LEX_ASP;
						this.txtend = this.lexsize;

						/* if some text before < return it now */
						if (this.txtend > this.txtstart) {
							this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend);
							return this.token;
						}

						this.txtstart = this.lexsize;
						continue;
					}

					/* Netscapes JSTE e.g. <# ... server-code ... #> */
					if (c == '#') {
						this.lexsize -= 2;
						this.state = LEX_JSTE;
						this.txtend = this.lexsize;

						/* if some text before < return it now */
						if (this.txtend > this.txtstart) {
							this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend);
							return this.token;
						}

						this.txtstart = this.lexsize;
						continue;
					}

					map = MAP((char) c);

					/* check for start tag */
					if ((map & LETTER) != 0) {
						this.in.ungetChar(c); /* push back letter */
						this.lexsize -= 2; /* discard "<" + letter */
						this.txtend = this.lexsize;
						this.state = LEX_STARTTAG; /* ready to read tag name */

						/* if some text before < return it now */
						if (this.txtend > this.txtstart) {
							this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend);
							return this.token;
						}

						continue; /* no text so keep going */
					}

					/* otherwise treat as CDATA */
					this.state = LEX_CONTENT;
					this.waswhite = false;
					continue;

				case LEX_ENDTAG: /* </letter */
					/* the pushed-back letter was just buffered again, so it sits at lexsize - 1 */
					this.txtstart = this.lexsize - 1;
					this.in.curcol += 2;
					c = parseTagName();
					this.token = newNode(Node.EndTag, /* create endtag token */
					this.lexbuf, this.txtstart, this.txtend, getString(this.lexbuf, this.txtstart, this.txtend
							- this.txtstart));
					this.lexsize = this.txtstart;
					this.txtend = this.txtstart;

					/* skip to '>' */
					while (c != '>') {
						c = this.in.readChar();

						if (c == StreamIn.EndOfStream) break;
					}

					if (c == StreamIn.EndOfStream) {
						this.in.ungetChar(c);
						continue;
					}

					this.state = LEX_CONTENT;
					this.waswhite = false;
					return this.token; /* the endtag token */

				case LEX_STARTTAG: /* first letter of tagname */
					this.txtstart = this.lexsize - 1; /* set txtstart to first letter */
					c = parseTagName();
					/* isempty was just reset, so the node starts as StartTag; the type is corrected after parseAttrs */
					isempty.value = false;
					attributes = null;
					this.token = newNode((isempty.value ? Node.StartEndTag : Node.StartTag), this.lexbuf,
							this.txtstart, this.txtend, getString(this.lexbuf, this.txtstart, this.txtend
									- this.txtstart));

					/* parse attributes, consuming closing ">" */
					if (c != '>') {
						if (c == '/') this.in.ungetChar(c);

						attributes = parseAttrs(isempty);
					}

					if (isempty.value) this.token.type = Node.StartEndTag;

					this.token.attributes = attributes;
					this.lexsize = this.txtstart;
					this.txtend = this.txtstart;

					/* swallow newline following start tag */
					/* special check needed for CRLF sequence */
					/* this doesn't apply to empty elements */

					if (expectsContent(this.token) || this.token.tag == configuration.tt.tagBr) {

						c = this.in.readChar();

						if (c == '\r') {
							c = this.in.readChar();

							if (c != '\n') this.in.ungetChar(c);
						} else if (c != '\n' && c != '\f') this.in.ungetChar(c);

						this.waswhite = true; /* to swallow leading whitespace */
					} else
						this.waswhite = false;

					this.state = LEX_CONTENT;

					if (this.token.tag == null)
						Report.error(this, null, this.token, Report.UNKNOWN_ELEMENT);
					else if (!this.configuration.XmlTags) {
						/* narrow the set of HTML versions still possible for the document */
						this.versions &= this.token.tag.versions;

						if ((this.token.tag.versions & Dict.VERS_PROPRIETARY) != 0) {
							if (!this.configuration.MakeClean
									&& (this.token.tag == configuration.tt.tagNobr || this.token.tag == configuration.tt.tagWbr))
									Report.warning(this, null, this.token, Report.PROPRIETARY_ELEMENT);
						}

						if (this.token.tag.chkattrs != null) {
							this.token.checkUniqueAttributes(this);
							this.token.tag.chkattrs.check(this, this.token);
						} else
							this.token.checkAttributes(this);
					}

					return this.token; /* return start tag */

				case LEX_COMMENT: /* seen <!-- so look for --> */

					/* two consecutive '-' are needed before a closing '>' is considered */
					if (c != '-') continue;

					c = this.in.readChar();
					addCharToLexer(c);

					if (c != '-') continue;

					end_comment: while (true) {
						c = this.in.readChar();

						if (c == '>') {
							if (badcomment != 0) Report.warning(this, null, null, Report.MALFORMED_COMMENT);

							this.txtend = this.lexsize - 2; // AQ 8Jul2000
							this.lexbuf[this.lexsize] = (byte) '\0';
							this.state = LEX_CONTENT;
							this.waswhite = false;
							this.token = newNode(Node.CommentTag, this.lexbuf, this.txtstart, this.txtend);

							/* now look for a line break */

							c = this.in.readChar();

							if (c == '\r') {
								c = this.in.readChar();

								if (c != '\n') this.token.linebreak = true;
							}

							if (c == '\n')
								this.token.linebreak = true;
							else
								this.in.ungetChar(c);

							return this.token;
						}

						/* note position of first such error in the comment */
						if (badcomment == 0) {
							this.lines = this.in.curline;
							this.columns = this.in.curcol - 3;
						}

						badcomment++;
						if (this.configuration.FixComments) this.lexbuf[this.lexsize - 2] = (byte) '=';

						addCharToLexer(c);

						/* if '-' then look for '>' to end the comment */
						if (c != '-') break end_comment;

					}
					/* otherwise continue to look for --> */
					this.lexbuf[this.lexsize - 2] = (byte) '=';
					continue;

				case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */
					map = MAP((char) c);

					if ((map & WHITE) != 0) {
						if (this.waswhite) this.lexsize -= 1;

						this.waswhite = true;
					} else
						this.waswhite = false;

					if (c != '>') continue;

					this.lexsize -= 1;
					this.txtend = this.lexsize;
					this.lexbuf[this.lexsize] = (byte) '\0';
					this.state = LEX_CONTENT;
					this.waswhite = false;
					this.token = newNode(Node.DocTypeTag, this.lexbuf, this.txtstart, this.txtend);
					/* make a note of the version named by the doctype */
					this.doctype = findGivenVersion(this.token);
					return this.token;

				case LEX_PROCINSTR: /* seen <? so look for '>' */
					/* check for PHP preprocessor instructions <?php ... ?> */

					if (this.lexsize - this.txtstart == 3) {
						if ((getString(this.lexbuf, this.txtstart, 3)).equals("php")) {
							this.state = LEX_PHP;
							continue;
						}
					}

					if (this.configuration.XmlPIs) /* insist on ?> as terminator */
					{
						if (c != '?') continue;

						/* now look for '>' */
						c = this.in.readChar();

						if (c == StreamIn.EndOfStream) {
							Report.warning(this, null, null, Report.UNEXPECTED_END_OF_FILE);
							this.in.ungetChar(c);
							continue;
						}

						addCharToLexer(c);
					}

					if (c != '>') continue;

					this.lexsize -= 1;
					this.txtend = this.lexsize;
					this.lexbuf[this.lexsize] = (byte) '\0';
					this.state = LEX_CONTENT;
					this.waswhite = false;
					this.token = newNode(Node.ProcInsTag, this.lexbuf, this.txtstart, this.txtend);
					return this.token;

				case LEX_ASP: /* seen <% so look for "%>" */
					if (c != '%') continue;

					/* now look for '>' */
					c = this.in.readChar();

					if (c != '>') {
						this.in.ungetChar(c);
						continue;
					}

					this.lexsize -= 1;
					this.txtend = this.lexsize;
					this.lexbuf[this.lexsize] = (byte) '\0';
					this.state = LEX_CONTENT;
					this.waswhite = false;
					this.token = newNode(Node.AspTag, this.lexbuf, this.txtstart, this.txtend);
					return this.token;

				case LEX_JSTE: /* seen <# so look for "#>" */
					if (c != '#') continue;

					/* now look for '>' */
					c = this.in.readChar();

					if (c != '>') {
						this.in.ungetChar(c);
						continue;
					}

					this.lexsize -= 1;
					this.txtend = this.lexsize;
					this.lexbuf[this.lexsize] = (byte) '\0';
					this.state = LEX_CONTENT;
					this.waswhite = false;
					this.token = newNode(Node.JsteTag, this.lexbuf, this.txtstart, this.txtend);
					return this.token;

				case LEX_PHP: /* seen "<?php" so look for "?>" */
					if (c != '?') continue;

					/* now look for '>' */
					c = this.in.readChar();

					if (c != '>') {
						this.in.ungetChar(c);
						continue;
					}

					this.lexsize -= 1;
					this.txtend = this.lexsize;
					this.lexbuf[this.lexsize] = (byte) '\0';
					this.state = LEX_CONTENT;
					this.waswhite = false;
					this.token = newNode(Node.PhpTag, this.lexbuf, this.txtstart, this.txtend);
					return this.token;

				case LEX_SECTION: /* seen "<![" so look for "]>" */
					if (c == '[') {
						if (this.lexsize == (this.txtstart + 6)
								&& (getString(this.lexbuf, this.txtstart, 6)).equals("CDATA[")) {
							this.state = LEX_CDATA;
							this.lexsize -= 6;
							continue;
						}
					}

					if (c != ']') continue;

					/* now look for '>' */
					c = this.in.readChar();

					if (c != '>') {
						this.in.ungetChar(c);
						continue;
					}

					this.lexsize -= 1;
					this.txtend = this.lexsize;
					this.lexbuf[this.lexsize] = (byte) '\0';
					this.state = LEX_CONTENT;
					this.waswhite = false;
					this.token = newNode(Node.SectionTag, this.lexbuf, this.txtstart, this.txtend);
					return this.token;

				case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
					if (c != ']') continue;

					/* now look for ']' */
					c = this.in.readChar();

					if (c != ']') {
						this.in.ungetChar(c);
						continue;
					}

					/* now look for '>' */
					c = this.in.readChar();

					if (c != '>') {
						this.in.ungetChar(c);
						continue;
					}

					this.lexsize -= 1;
					this.txtend = this.lexsize;
					this.lexbuf[this.lexsize] = (byte) '\0';
					this.state = LEX_CONTENT;
					this.waswhite = false;
					this.token = newNode(Node.CDATATag, this.lexbuf, this.txtstart, this.txtend);
					return this.token;
			}
		}

		/* the read loop only exits on end of stream: flush pending text or an unfinished comment */
		if (this.state == LEX_CONTENT) /* text string */
		{
			this.txtend = this.lexsize;

			if (this.txtend > this.txtstart) {
				this.in.ungetChar(c);

				if (this.lexbuf[this.lexsize - 1] == (byte) ' ') {
					this.lexsize -= 1;
					this.txtend = this.lexsize;
				}

				this.token = newNode(Node.TextNode, this.lexbuf, this.txtstart, this.txtend);
				return this.token;
			}
		} else if (this.state == LEX_COMMENT) /* comment */
		{
			if (c == StreamIn.EndOfStream) Report.warning(this, null, null, Report.MALFORMED_COMMENT);

			this.txtend = this.lexsize;
			this.lexbuf[this.lexsize] = (byte) '\0';
			this.state = LEX_CONTENT;
			this.waswhite = false;
			this.token = newNode(Node.CommentTag, this.lexbuf, this.txtstart, this.txtend);
			return this.token;
		}

		return null;
	}

	/*
	 parser for ASP within start tags

	 Some people use ASP to customize attributes
	 Tidy isn't really well suited to dealing with ASP
	 This is a workaround for attributes, but won't
	 deal with the case where the ASP is used to tailor
	 the attribute value. Here is an example of a work
	 around for using ASP in attribute values:

	 href="<%=rsSchool.Fields("ID").Value%>"

	 where the ASP that generates the attribute value
	 is masked from Tidy by the quotemarks.

	 */

	/**
	 * Reads an ASP section found inside a start tag. parseAttribute() calls this
	 * after it has consumed the opening "&lt;%"; characters are copied to the lexer
	 * buffer up to the closing "%&gt;".
	 *
	 * The scan also stops at the end of the stream, so an unterminated section no
	 * longer loops forever; whatever was read up to that point becomes the node
	 * text. A '%' not followed by '&gt;' is ordinary content, and the character after
	 * it is examined again so that "%%&gt;" still closes the section.
	 *
	 * @return an AspTag node for the text before the terminator, or null when that text is empty
	 */
	public Node parseAsp() {
		int c;
		Node asp = null;
		boolean closed = false;

		this.txtstart = this.lexsize;

		for (;;) {
			c = this.in.readChar();

			/* unterminated section: stop rather than buffer EndOfStream endlessly */
			if (c == StreamIn.EndOfStream) {
				this.in.ungetChar(c);
				break;
			}

			addCharToLexer(c);

			if (c != '%') continue;

			c = this.in.readChar();

			if (c == '>') {
				closed = true;
				break;
			}

			/* not the terminator: push it back so the next pass sees it, as the LEX_ASP state in getToken() does */
			this.in.ungetChar(c);
		}

		/* only the '%' of the terminator was buffered; the '>' never was */
		if (closed) this.lexsize -= 1;
		this.txtend = this.lexsize;

		if (this.txtend > this.txtstart) asp = newNode(Node.AspTag, this.lexbuf, this.txtstart, this.txtend);

		this.txtstart = this.txtend;
		return asp;
	}

	/*
	 PHP is like ASP but is based upon XML
	 processing instructions, e.g. <?php ... ?>
	 */
	/**
	 * Reads a PHP section found inside a start tag. parseAttribute() calls this
	 * after it has consumed the opening "&lt;?"; characters are copied to the lexer
	 * buffer up to the closing "?&gt;".
	 *
	 * The scan also stops at the end of the stream, so an unterminated section no
	 * longer loops forever; whatever was read up to that point becomes the node
	 * text. A '?' not followed by '&gt;' is ordinary content, and the character after
	 * it is examined again so that "??&gt;" still closes the section.
	 *
	 * @return a PhpTag node for the text before the terminator, or null when that text is empty
	 */
	public Node parsePhp() {
		int c;
		Node php = null;
		boolean closed = false;

		this.txtstart = this.lexsize;

		for (;;) {
			c = this.in.readChar();

			/* unterminated section: stop rather than buffer EndOfStream endlessly */
			if (c == StreamIn.EndOfStream) {
				this.in.ungetChar(c);
				break;
			}

			addCharToLexer(c);

			if (c != '?') continue;

			c = this.in.readChar();

			if (c == '>') {
				closed = true;
				break;
			}

			/* not the terminator: push it back so the next pass sees it, as the LEX_PHP state in getToken() does */
			this.in.ungetChar(c);
		}

		/* only the '?' of the terminator was buffered; the '>' never was */
		if (closed) this.lexsize -= 1;
		this.txtend = this.lexsize;

		if (this.txtend > this.txtstart) php = newNode(Node.PhpTag, this.lexbuf, this.txtstart, this.txtend);

		this.txtstart = this.txtend;
		return php;
	}

	/* consumes the '>' terminating start tags */
	/**
	 * Reads the next attribute name inside a start tag.
	 *
	 * Leading white space is skipped. The name is then collected up to white
	 * space, '=', '&gt;', '&lt;' or the end of the stream; all of these except white
	 * space are pushed back for the caller. Upper-case characters are folded to
	 * lower case unless configuration.XmlTags is set.
	 *
	 * @param isempty set to true when "/&gt;" is found where a name was expected
	 * @param asp     cleared on entry; receives the node from parseAsp() when "&lt;%" is found
	 * @param php     cleared on entry; receives the node from parsePhp() when "&lt;?" is found
	 * @return the attribute name, or null when the tag ends, an ASP/PHP section
	 *         was read, an error was reported or no name characters were found
	 */
	public String parseAttribute(MutableBoolean isempty, MutableObject asp, MutableObject php) {
		int start = 0;
		// int len = 0;   Removed by BUGFIX for 126265
		short map;
		String attr;
		int c = 0;

		asp.setObject(null); /* clear asp pointer */
		php.setObject(null); /* clear php pointer */
		/* skip white space before the attribute */

		for (;;) {
			c = this.in.readChar();

			if (c == '/') {
				c = this.in.readChar();

				/* "/>" closes an empty element */
				if (c == '>') {
					isempty.value = true;
					return null;
				}

				/* a lone '/' becomes the first character of the name */
				this.in.ungetChar(c);
				c = '/';
				break;
			}

			if (c == '>') return null;

			if (c == '<') {
				c = this.in.readChar();

				if (c == '%') {
					asp.setObject(parseAsp());
					return null;
				} else if (c == '?') {
					php.setObject(parsePhp());
					return null;
				}

				this.in.ungetChar(c);
				Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
				return null;
			}

			/* stray quote marks are reported and skipped */
			if (c == '"' || c == '\'') {
				Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
				continue;
			}

			if (c == StreamIn.EndOfStream) {
				Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
				this.in.ungetChar(c);
				return null;
			}

			map = MAP((char) c);

			if ((map & WHITE) == 0) break;
		}

		/* the name is accumulated in the lexer buffer starting here and removed again below */
		start = this.lexsize;

		for (;;) {
			/* but push back '=' for parseValue() */
			if (c == '=' || c == '>') {
				this.in.ungetChar(c);
				break;
			}

			if (c == '<' || c == StreamIn.EndOfStream) {
				this.in.ungetChar(c);
				break;
			}

			map = MAP((char) c);

			if ((map & WHITE) != 0) break;

			/* what should be done about non-namechar characters? */
			/* currently these are incorporated into the attr name */

			if (!this.configuration.XmlTags && (map & UPPERCASE) != 0) c += (int) ('a' - 'A');

			//  ++len;    Removed by BUGFIX for 126265
			addCharToLexer(c);

			c = this.in.readChar();
		}

		// Following line added by GLP to fix BUG 126265.  This is a temporary comment
		// and should be removed when Tidy is fixed.
		int len = this.lexsize - start;
		attr = (len > 0 ? getString(this.lexbuf, start, len) : null);
		this.lexsize = start;

		return attr;
	}

	/*
	 invoked when < is seen in place of attribute value
	 but terminates on whitespace if not ASP, PHP or Tango
	 this routine recognizes ' and " quoted strings
	 */
	/**
	 * Copies a server instruction, met where an attribute value was expected,
	 * into the lexer buffer. parseValue() has already buffered the leading '&lt;'.
	 *
	 * For ASP, PHP or Tango ("&lt;%", "&lt;?", "&lt;@") the text runs up to and including
	 * the next '&gt;' outside quotes. Otherwise it stops at white space or before
	 * '&gt;', which is pushed back. Single- and double-quoted strings are copied
	 * verbatim.
	 *
	 * An unterminated quoted string used to spin forever once the stream was
	 * exhausted. It is now reported as Report.UNEXPECTED_END_OF_FILE and the scan stops.
	 *
	 * @return the quote character the caller should use around the value: '\''
	 *         when a completed double-quoted string was seen, otherwise '"'
	 */
	public int parseServerInstruction() {
		int c, map, delim = '"';
		boolean isrule = false;

		c = this.in.readChar();
		addCharToLexer(c);

		/* check for ASP, PHP or Tango */
		if (c == '%' || c == '?' || c == '@') isrule = true;

		for (;;) {
			c = this.in.readChar();

			if (c == StreamIn.EndOfStream) break;

			if (c == '>') {
				if (isrule)
					addCharToLexer(c);
				else
					this.in.ungetChar(c);

				break;
			}

			/* if not recognized as ASP, PHP or Tango */
			/* then also finish value on whitespace */
			if (!isrule) {
				map = MAP((char) c);

				if ((map & WHITE) != 0) break;
			}

			addCharToLexer(c);

			if (c == '"' || c == '\'') {
				int quote = c;

				/* copy the quoted string, closing quote included */
				do {
					c = this.in.readChar();

					/* input ended inside the string: give up instead of looping on EndOfStream */
					if (c == StreamIn.EndOfStream) {
						Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
						this.in.ungetChar(c);
						return delim;
					}

					addCharToLexer(c);
				} while (c != quote);

				/* the value contains double quotes, so it has to be wrapped in single quotes */
				if (quote == '"') delim = '\'';
			}
		}

		return delim;
	}

	/* values start with "=" or " = " etc. */
	/* doesn't consume the ">" at end of start tag */

	/**
	 * Reads the value that follows an attribute name, if there is one.
	 *
	 * White space around '=' is skipped. A value starting with '&lt;' is handed to
	 * parseServerInstruction(). Otherwise the value is read either up to the
	 * matching quote mark or, when unquoted, up to white space, '&gt;' or a quote mark.
	 *
	 * @param name     the attribute name; used only for the isUrl() and isScript() checks
	 * @param foldCase when true, upper-case characters in the value are folded to lower case
	 * @param isempty  set to true when an unquoted value is followed by "/&gt;" and
	 *                 name is not a URL attribute
	 * @param pdelim   receives the quote character found around the value, or '"' when there was none
	 * @return the value; null when no '=' follows the name or an unquoted value is empty
	 */
	public String parseValue(String name, boolean foldCase, MutableBoolean isempty, MutableInteger pdelim) {
		int len = 0;
		int start;
		short map;
		boolean seen_gt = false;
		boolean munge = true;
		int c = 0;
		int lastc, delim, quotewarning;
		String value;

		delim = 0;
		pdelim.value = (int) '"';

		/*
		 Henry Zrepa reports that some folk are using the
		 embed element with script attributes where newlines
		 are significant and must be preserved
		 */
		if (configuration.LiteralAttribs) munge = false;

		/* skip white space before the '=' */

		for (;;) {
			c = this.in.readChar();

			if (c == StreamIn.EndOfStream) {
				this.in.ungetChar(c);
				break;
			}

			map = MAP((char) c);

			if ((map & WHITE) == 0) break;
		}

		/*
		 c should be '=' if there is a value
		 other legal possibilities are white
		 space, '/' and '>'
		 */

		if (c != '=') {
			this.in.ungetChar(c);
			return null;
		}

		/* skip white space after '=' */

		for (;;) {
			c = this.in.readChar();

			if (c == StreamIn.EndOfStream) {
				this.in.ungetChar(c);
				break;
			}

			map = MAP((char) c);

			if ((map & WHITE) == 0) break;
		}

		/* check for quote marks */

		if (c == '"' || c == '\'')
			delim = c;
		else if (c == '<') {
			/* server instruction: buffer it, take the text and restore the buffer */
			start = this.lexsize;
			addCharToLexer(c);
			pdelim.value = parseServerInstruction();
			len = this.lexsize - start;
			this.lexsize = start;
			return (len > 0 ? getString(this.lexbuf, start, len) : null);
		} else
			this.in.ungetChar(c);

		/*
		 and read the value string
		 check for quote mark if needed
		 */

		quotewarning = 0;
		start = this.lexsize;
		c = '\0';

		for (;;) {
			lastc = c; /* track last character */
			c = this.in.readChar();

			if (c == StreamIn.EndOfStream) {
				Report.attrError(this, this.token, null, Report.UNEXPECTED_END_OF_FILE);
				this.in.ungetChar(c);
				break;
			}

			if (delim == (char) 0) {
				if (c == '>') {
					this.in.ungetChar(c);
					break;
				}

				if (c == '"' || c == '\'') {
					Report.attrError(this, this.token, null, Report.UNEXPECTED_QUOTEMARK);
					break;
				}

				/* a '<' in an unquoted value is reported but kept as part of the value */
				if (c == '<') {
					/* this.in.ungetChar(c); */
					Report.attrError(this, this.token, null, Report.UNEXPECTED_GT);
					/* break; */
				}

				/*
				 For cases like <br clear=all/> need to avoid treating /> as
				 part of the attribute value, however care is needed to avoid
				 so treating <a href=http://www.acme.com/> in this way, which
				 would map the <a> tag to <a href="http://www.acme.com"/>
				 */
				if (c == '/') {
					/* peek ahead in case of /> */
					c = this.in.readChar();

					if (c == '>' && !AttributeTable.getDefaultAttributeTable().isUrl(name)) {
						isempty.value = true;
						this.in.ungetChar(c);
						break;
					}

					/* unget peeked char */
					this.in.ungetChar(c);
					c = '/';
				}
			} else /* delim is '\'' or '"' */
			{
				if (c == delim) break;

				/* treat CRLF, CR and LF as single line break */

				if (c == '\r') {
					c = this.in.readChar();
					if (c != '\n') this.in.ungetChar(c);

					c = '\n';
				}

				/* count characters that hint at a missing closing quote */
				if (c == '\n' || c == '<' || c == '>') ++quotewarning;

				if (c == '>') seen_gt = true;
			}

			if (c == '&') {
				addCharToLexer(c);
				parseEntity((short) 0);
				continue;
			}

			/*
			 kludge for JavaScript attribute values
			 with line continuations in string literals
			 */
			if (c == '\\') {
				c = this.in.readChar();

				if (c != '\n') {
					this.in.ungetChar(c);
					c = '\\';
				}
			}

			map = MAP((char) c);

			if ((map & WHITE) != 0) {
				/* white space ends an unquoted value */
				if (delim == (char) 0) break;

				/* inside quotes, white space becomes a single space unless LiteralAttribs is set */
				if (munge) {
					c = ' ';

					if (lastc == ' ') continue;
				}
			} else if (foldCase && (map & UPPERCASE) != 0) c += (int) ('a' - 'A');

			addCharToLexer(c);
		}

		if (quotewarning > 10 && seen_gt && munge) {
			/*
			 there is almost certainly a missing trailling quote mark
			 as we have see too many newlines, < or > characters.

			 an exception is made for Javascript attributes and the
			 javascript URL scheme which may legitimately include < and >
			 */
			if (!AttributeTable.getDefaultAttributeTable().isScript(name)
					&& !(AttributeTable.getDefaultAttributeTable().isUrl(name) && (getString(this.lexbuf, start, 11))
							.equals("javascript:"))) Report.error(this, null, null, Report.SUSPECTED_MISSING_QUOTE);
		}

		/* take the value out of the lexer buffer again */
		len = this.lexsize - start;
		this.lexsize = start;

		/* a quoted empty value yields "", an unquoted empty value yields null */
		if (len > 0 || delim != 0)
			value = getString(this.lexbuf, start, len);
		else
			value = null;

		/* note delimiter if given */
		if (delim != 0)
			pdelim.value = delim;
		else
			pdelim.value = (int) '"';

		return value;
	}

	/* attr must be non-null */
	/**
	 * Checks that an attribute name starts with a letter and otherwise consists
	 * of name characters only.
	 *
	 * Callers are expected to pass a real name. A null or empty string is
	 * reported as invalid rather than failing on the first-character lookup.
	 *
	 * @param attr the attribute name to check
	 * @return true when the first character has the LETTER bit and every later
	 *         character has the NAMECHAR bit in the lexer character map
	 */
	public static boolean isValidAttrName(String attr) {
		/* nothing to inspect: charAt(0) would throw on an empty string */
		if (attr == null || attr.length() == 0) return false;

		/* first character should be a letter */
		if ((MAP(attr.charAt(0)) & LETTER) == 0) return false;

		/* remaining characters should be namechars */
		for (int i = 1; i < attr.length(); i++) {
			if ((MAP(attr.charAt(i)) & NAMECHAR) == 0) return false;
		}

		return true;
	}

	/* swallows closing '>' */

	/**
	 * Parses the attributes of the current start tag until parseAttribute()
	 * yields neither a name nor an ASP/PHP node, or the input is exhausted.
	 *
	 * @param isempty passed through to parseAttribute() and parseValue(), which
	 *                set it when the tag ends in "/&gt;"
	 * @return the most recently added AttVal, each one constructed with the
	 *         previous list as its first argument; null when nothing was added
	 */
	public AttVal parseAttrs(MutableBoolean isempty) {
		MutableInteger delim = new MutableInteger();
		MutableObject asp = new MutableObject();
		MutableObject php = new MutableObject();
		AttVal list = null;

		while (!endOfInput()) {
			String attribute = parseAttribute(isempty, asp, php);

			if (attribute != null) {
				String value = parseValue(attribute, false, isempty, delim);

				if (isValidAttrName(attribute)) {
					AttVal named = new AttVal(list, null, null, null, delim.value, attribute, value);
					named.dict = AttributeTable.getDefaultAttributeTable().findAttribute(named);
					list = named;
				} else {
					/* a detached AttVal is still constructed, but it is not added to the list */
					new AttVal(null, null, null, null, 0, attribute, value);
					Report.attrError(this, this.token, value, Report.BAD_ATTRIBUTE_VALUE);
				}

				continue;
			}

			/* no name: the "attribute" may have been produced by ASP markup */
			if (asp.getObject() != null) {
				list = new AttVal(list, null, (Node) asp.getObject(), null, '\0', null, null);
				continue;
			}

			/* or by PHP markup */
			if (php.getObject() != null) {
				list = new AttVal(list, null, null, (Node) php.getObject(), '\0', null, null);
				continue;
			}

			break;
		}

		return list;
	}

	/*
	 push a copy of an inline node onto stack
	 but don't push if implicit or OBJECT or APPLET
	 (implicit tags are ones generated from the istack)

	 One issue arises with pushing inlines when
	 the tag is already pushed. For instance:

	 <p><em>text
	 <p><em>more text

	 Shouldn't be mapped to

	 <p><em>text</em></p>
	 <p><em><em>more text</em></em>
	 */
	/**
	 * Records an inline element on the inline stack.
	 *
	 * Nothing is pushed for implicit nodes, unknown elements, elements that are
	 * not CM_INLINE, elements that are CM_OBJECT, or elements other than FONT
	 * whose tag is already on the stack.
	 *
	 * @param node the start tag to record
	 */
	public void pushInline(Node node) {
		if (node.implicit || node.tag == null) return;

		boolean plainInline = (node.tag.model & Dict.CM_INLINE) != 0 && (node.tag.model & Dict.CM_OBJECT) == 0;
		if (!plainInline) return;

		/* FONT may be stacked repeatedly; anything else only once */
		if (node.tag != configuration.tt.tagFont && isPushed(node)) return;

		IStack entry = new IStack();
		entry.tag = node.tag;
		entry.element = node.element;
		if (node.attributes != null) entry.attributes = cloneAttributes(node.attributes);
		this.istack.push(entry);
	}

	/* pop inline stack */
	/**
	 * Removes entries from the inline stack.
	 *
	 * With a null node the top entry is popped. With a node, nothing happens
	 * unless its element is known, is CM_INLINE and is not CM_OBJECT. For the A
	 * element, entries are popped until an A entry has been removed or the stack
	 * is empty; for any other inline element the top entry is popped. After
	 * popping, this.insert is reset to -1 when it no longer indexes the stack.
	 *
	 * @param node the end tag being processed, or null
	 */
	public void popInline(Node node) {
		if (node != null) {
			if (node.tag == null) return;

			if ((node.tag.model & Dict.CM_INLINE) == 0 || (node.tag.model & Dict.CM_OBJECT) != 0) return;

			// if node is </a> then pop until we find an <a>
			if (node.tag == configuration.tt.tagA) {
				boolean anchorFound = false;

				while (!anchorFound && this.istack.size() > 0) {
					IStack top = (IStack) this.istack.pop();
					anchorFound = (top.tag == configuration.tt.tagA);
				}

				if (this.insert >= this.istack.size()) this.insert = -1;
				return;
			}
		}

		if (this.istack.size() == 0) return;

		IStack discarded = (IStack) this.istack.pop();
		if (this.insert >= this.istack.size()) this.insert = -1;
	}

	/**
	 * Looks for an inline stack entry with the same tag as the given node.
	 *
	 * @param node the node whose tag is looked up
	 * @return true when some stack entry refers to the same tag object
	 */
	public boolean isPushed(Node node) {
		int idx = this.istack.size();

		/* walk from the top of the stack downwards */
		while (--idx >= 0) {
			IStack entry = (IStack) this.istack.elementAt(idx);
			if (entry.tag == node.tag) return true;
		}

		return false;
	}

	/*
	 This has the effect of inserting "missing" inline
	 elements around the contents of blocklevel elements
	 such as P, TD, TH, DIV, PRE etc. This procedure is
	 called at the start of ParseBlock when the inline
	 stack is not empty, as will be the case in:

	 <i><h1>italic heading</h1></i>

	 which is then treated as equivalent to

	 <h1><i>italic heading</i></h1>

	 This is implemented by setting the lexer into a mode
	 where it gets tokens from the inline stack rather than
	 from the input stream.
	 */
	/**
	 * Arms the lexer so that subsequent tokens are taken from the inline
	 * stack. When the stack holds entries above <code>istackbase</code>,
	 * the insertion cursor is set to <code>istackbase</code> and
	 * <code>node</code> is remembered in <code>inode</code>.
	 *
	 * @param node the node to store in <code>inode</code>
	 * @return the number of stack entries above <code>istackbase</code>;
	 *         no state is changed when this is not positive
	 */
	public int inlineDup(Node node) {
		final int pending = this.istack.size() - this.istackbase;

		if (pending <= 0) {
			return pending;
		}

		this.insert = this.istackbase;
		this.inode = node;
		return pending;
	}

	/**
	 * Produces the next token while the lexer is replaying the inline stack.
	 * <p>
	 * If the insertion cursor is -1 the node stored in <code>inode</code> is
	 * returned and the field is cleared. Otherwise an implicit start tag is
	 * built from the stack entry at the cursor (tag, element name and a
	 * clone of its attributes) and the cursor moves to the next entry, or
	 * back to -1 once the end of the stack is reached.
	 *
	 * @return the stored node (possibly null) or a newly created implicit
	 *         start tag node
	 */
	public Node insertedToken() {
		// nothing left to replay: hand back whatever was parked in inode
		if (this.insert == -1) {
			Node parked = this.inode;
			this.inode = null;
			return parked;
		}

		// with no parked node, refresh the recorded position from the input
		if (this.inode == null) {
			this.lines = this.in.curline;
			this.columns = this.in.curcol;
		}

		// GLP:  Bugfix 126261.  Remove when this change
		//       is fixed in istack.c in the original Tidy
		Node token = newNode(Node.StartTag, this.lexbuf, this.txtstart, this.txtend);
		token.implicit = true;

		IStack entry = (IStack) this.istack.elementAt(this.insert);
		token.element = entry.element;
		token.tag = entry.tag;
		if (entry.attributes != null) {
			token.attributes = cloneAttributes(entry.attributes);
		}

		// advance to the next stack item, or leave replay mode at the end
		final int next = this.insert + 1;
		this.insert = (next < this.istack.size()) ? next : -1;

		return token;
	}

	/* AQ: Try this for speed optimization */
	/**
	 * Case-insensitive equality test with a C-style result.
	 *
	 * @param s1 first string, must not be null
	 * @param s2 second string, may be null
	 * @return 0 if the strings are equal ignoring case, otherwise 1
	 */
	public static int wstrcasecmp(String s1, String s2) {
		if (s1.equalsIgnoreCase(s2)) {
			return 0;
		}
		return 1;
	}

	/**
	 * Case-insensitive lexical comparison of two strings.
	 * <p>
	 * Characters are folded with {@link #toLower(char)} before they are
	 * compared. The sign of the result is taken from the folded characters
	 * as well, so the ordering is consistent with the equality test (the
	 * previous code compared the raw characters at the first difference,
	 * which made "a" sort after "B" although "A" sorted before it).
	 *
	 * @param s1 first string, must not be null
	 * @param s2 second string, must not be null
	 * @return 0 if the strings are equal ignoring case, -1 if s1 sorts
	 *         before s2 (or is a proper prefix of it), otherwise 1
	 */
	public static int wstrcaselexcmp(String s1, String s2) {
		final int len1 = s1.length();
		final int len2 = s2.length();
		final int common = Math.min(len1, len2);

		for (int i = 0; i < common; i++) {
			char c1 = toLower(s1.charAt(i));
			char c2 = toLower(s2.charAt(i));
			if (c1 != c2) {
				return (c1 > c2 ? 1 : -1);
			}
		}

		// one string is a prefix of the other: the shorter one sorts first
		if (len1 == len2) {
			return 0;
		}
		return (len1 < len2 ? -1 : 1);
	}

	/**
	 * Case-insensitive substring test.
	 * <p>
	 * Each candidate position in <code>s1</code> is compared against
	 * <code>s2</code> over exactly <code>s2.length()</code> characters. The
	 * previous code compared <code>s2</code> with the whole remainder of
	 * <code>s1</code>, so it only matched when <code>s1</code> ended with
	 * <code>s2</code>.
	 *
	 * @param s1 the string to search in, must not be null
	 * @param s2 the string to look for, must not be null
	 * @return true if s2 occurs anywhere in s1, ignoring case; an empty s2
	 *         always matches
	 */
	public static boolean wsubstr(String s1, String s2) {
		final int len1 = s1.length();
		final int len2 = s2.length();

		for (int i = 0; i <= len1 - len2; ++i) {
			if (s1.regionMatches(true, i, s2, 0, len2)) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Decides whether a node may be pruned.
	 * <p>
	 * Text nodes are always prunable. Any other node is kept when it has
	 * content, when it is an anchor with attributes, when it is a paragraph
	 * and <code>DropEmptyParas</code> is off, when it has no tag, when its
	 * tag carries the CM_ROW flag, when it is an applet or object element,
	 * or when it carries an "id" or "name" attribute.
	 *
	 * @param element the node to examine
	 * @return true if the node may be pruned
	 */
	public boolean canPrune(Node element) {
		if (element.type == Node.TextNode) {
			return true;
		}

		// evaluated left to right; the first reason to keep the node wins
		boolean mustKeep = element.content != null
				|| (element.tag == configuration.tt.tagA && element.attributes != null)
				|| (element.tag == configuration.tt.tagP && !this.configuration.DropEmptyParas)
				|| element.tag == null
				|| (element.tag.model & Dict.CM_ROW) != 0
				|| element.tag == configuration.tt.tagApplet
				|| element.tag == configuration.tt.tagObject
				|| (element.attributes != null
						&& (element.getAttrByName("id") != null || element.getAttrByName("name") != null));

		return !mustKeep;
	}

	/* duplicate name attribute as an id */
	/**
	 * Reconciles the "name" and "id" attributes of a node.
	 * <p>
	 * Nothing happens without a "name" attribute. If both attributes are
	 * present and their values differ, an ID_NAME_MISMATCH attribute error
	 * is reported. If only "name" is present and <code>XmlOut</code> is
	 * set, an "id" attribute with the same value is added.
	 * <p>
	 * An "id" attribute whose value is null is skipped instead of causing
	 * a NullPointerException (presumably an attribute written without a
	 * value; confirm against AttVal).
	 *
	 * @param node the node whose attributes are checked
	 */
	public void fixId(Node node) {
		AttVal name = node.getAttrByName("name");
		AttVal id = node.getAttrByName("id");

		if (name == null) {
			return;
		}

		if (id == null) {
			if (this.configuration.XmlOut) {
				node.addAttribute("id", name.value);
			}
			return;
		}

		// guard against an id attribute that carries no value
		if (id.value != null && !id.value.equals(name.value)) {
			Report.attrError(this, node, "name", Report.ID_NAME_MISMATCH);
		}
	}

	/*
	 defer duplicates when entering a table or other
	 element where the inlines shouldn't be duplicated
	 */
	/**
	 * Cancels any pending replay of the inline stack by clearing the stored
	 * node and resetting the insertion cursor to -1.
	 */
	public void deferDup() {
		this.inode = null;
		this.insert = -1;
	}

	/* Private methods and fields */

	/* lexer char types */
	// Bit flags, one bit each, so that a table entry can combine several.
	/** set for the characters 0-9 */
	private static final short DIGIT = 1 << 0;
	/** set for a-z and A-Z */
	private static final short LETTER = 1 << 1;
	/** set for letters, digits and the characters - . : _ */
	private static final short NAMECHAR = 1 << 2;
	/** set for space, tab, CR, LF and form feed */
	private static final short WHITE = 1 << 3;
	/** set for CR, LF and form feed */
	private static final short NEWLINE = 1 << 4;
	/** set for a-z */
	private static final short LOWERCASE = 1 << 5;
	/** set for A-Z */
	private static final short UPPERCASE = 1 << 6;

	/* lexer GetToken states */

	// State codes numbered consecutively from 0 to 12. None of them is
	// referenced in this part of the file; presumably they drive the state
	// variable of the token scanner (confirm against getToken). The notes
	// on the individual states are inferred from their names only.
	private static final short LEX_CONTENT = 0; // presumably: ordinary content
	private static final short LEX_GT = 1; // presumably: scanning up to '>'
	private static final short LEX_ENDTAG = 2; // presumably: inside an end tag
	private static final short LEX_STARTTAG = 3; // presumably: inside a start tag
	private static final short LEX_COMMENT = 4; // presumably: inside a comment
	private static final short LEX_DOCTYPE = 5; // presumably: inside a doctype declaration
	private static final short LEX_PROCINSTR = 6; // presumably: inside a processing instruction
	private static final short LEX_ENDCOMMENT = 7; // presumably: at the end of a comment
	private static final short LEX_CDATA = 8; // presumably: inside a CDATA section
	private static final short LEX_SECTION = 9; // presumably: inside a marked section
	private static final short LEX_ASP = 10; // presumably: inside an ASP block
	private static final short LEX_JSTE = 11; // presumably: inside a JSTE block
	private static final short LEX_PHP = 12; // presumably: inside a PHP block

	/* used to classify chars for lexical purposes */
	/** character-class flags for the 128 character codes 0..127, indexed by code */
	private static short[] lexmap = new short[128];

	/**
	 * ORs the given flag bits into the table entry of every character of
	 * <code>str</code>. All characters must have a code below 128.
	 *
	 * @param str  the characters to classify
	 * @param code the flag bits to add
	 */
	private static void mapStr(String str, short code) {
		final int length = str.length();
		int pos = 0;

		while (pos < length) {
			lexmap[str.charAt(pos++)] |= code;
		}
	}

	// fill the classification table once, when the class is loaded
	static {
		final short lineBreak = (short) (NEWLINE | WHITE);
		final short digit = (short) (DIGIT | NAMECHAR);
		final short lowerLetter = (short) (LOWERCASE | LETTER | NAMECHAR);
		final short upperLetter = (short) (UPPERCASE | LETTER | NAMECHAR);

		mapStr("\r\n\f", lineBreak);
		mapStr(" \t", WHITE);
		mapStr("-.:_", NAMECHAR);
		mapStr("0123456789", digit);
		mapStr("abcdefghijklmnopqrstuvwxyz", lowerLetter);
		mapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", upperLetter);
	}

	/**
	 * Looks up the character-class flags of a character.
	 *
	 * @param c the character to classify
	 * @return the flags stored in the table for codes below 128, otherwise 0
	 */
	private static short MAP(char c) {
		if (c >= 128) {
			return 0;
		}
		return lexmap[c];
	}

	/**
	 * @param c the character to test
	 * @return true if the table marks <code>c</code> with the WHITE flag
	 */
	private static boolean isWhite(char c) {
		return (MAP(c) & WHITE) != 0;
	}

	/**
	 * @param c the character to test
	 * @return true if the table marks <code>c</code> with the DIGIT flag
	 */
	private static boolean isDigit(char c) {
		return (MAP(c) & DIGIT) != 0;
	}

	/**
	 * @param c the character to test
	 * @return true if the table marks <code>c</code> with the LETTER flag
	 */
	private static boolean isLetter(char c) {
		return (MAP(c) & LETTER) != 0;
	}

	/**
	 * Converts a character flagged UPPERCASE in the table to lower case.
	 *
	 * @param c the character to convert
	 * @return the lower-case counterpart, or <code>c</code> itself when it
	 *         is not flagged UPPERCASE
	 */
	private static char toLower(char c) {
		if ((MAP(c) & UPPERCASE) == 0) {
			return c;
		}
		return (char) (c + ('a' - 'A'));
	}

	/**
	 * Converts a character flagged LOWERCASE in the table to upper case.
	 *
	 * @param c the character to convert
	 * @return the upper-case counterpart, or <code>c</code> itself when it
	 *         is not flagged LOWERCASE
	 */
	private static char toUpper(char c) {
		if ((MAP(c) & LOWERCASE) == 0) {
			return c;
		}
		return (char) (c - ('a' - 'A'));
	}

	/**
	 * Folds the case of a character unless XML tag handling is in effect.
	 *
	 * @param c       the character to fold
	 * @param tocaps  true to fold to upper case, false to fold to lower case
	 * @param xmlTags true to leave the character untouched
	 * @return the folded character; only characters flagged LOWERCASE or
	 *         UPPERCASE in the table are ever changed
	 */
	public static char foldCase(char c, boolean tocaps, boolean xmlTags) {
		if (xmlTags) {
			return c;
		}
		return tocaps ? toUpper(c) : toLower(c);
	}

	/**
	 * Plain data holder for one row of the W3CVersion table. The fields are
	 * only assigned by the constructor within this class.
	 */
	private static class W3CVersionInfo {
		// HTML version label, e.g. "HTML 4.01"
		String name;
		// matching XHTML label, e.g. "XHTML 1.0 Strict"
		String voyagerName;
		// URI of the XHTML 1.0 DTD used for this version
		String profile;
		// version code; the table fills it with Dict.VERS_* constants
		short code;

		/**
		 * Stores the four values unchanged.
		 *
		 * @param name        HTML version label
		 * @param voyagerName XHTML version label
		 * @param profile     DTD URI
		 * @param code        version code
		 */
		public W3CVersionInfo(String name, String voyagerName, String profile, short code) {
			this.name = name;
			this.voyagerName = voyagerName;
			this.profile = profile;
			this.code = code;
		}
	}

	/* the 3 URIs  for the XHTML 1.0 DTDs */
	// XHTML 1.0 Transitional DTD
	private static final String voyager_loose = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
	// XHTML 1.0 Strict DTD
	private static final String voyager_strict = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd";
	// XHTML 1.0 Frameset DTD
	private static final String voyager_frameset = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd";

	// XHTML namespace URI; not referenced in this part of the file
	private static final String XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml";

	// Table of known W3C HTML versions. Each row pairs an HTML version label
	// with the corresponding XHTML 1.0 label, the URI of that XHTML DTD and
	// a Dict.VERS_* version code. The 4.01 and 4.0 rows map to the same
	// XHTML labels, DTDs and codes.
	private static Lexer.W3CVersionInfo[] W3CVersion = {
			new W3CVersionInfo("HTML 4.01", "XHTML 1.0 Strict", voyager_strict, Dict.VERS_HTML40_STRICT),
			new W3CVersionInfo("HTML 4.01 Transitional", "XHTML 1.0 Transitional", voyager_loose,
					Dict.VERS_HTML40_LOOSE),
			new W3CVersionInfo("HTML 4.01 Frameset", "XHTML 1.0 Frameset", voyager_frameset, Dict.VERS_FRAMES),
			new W3CVersionInfo("HTML 4.0", "XHTML 1.0 Strict", voyager_strict, Dict.VERS_HTML40_STRICT),
			new W3CVersionInfo("HTML 4.0 Transitional", "XHTML 1.0 Transitional", voyager_loose, Dict.VERS_HTML40_LOOSE),
			new W3CVersionInfo("HTML 4.0 Frameset", "XHTML 1.0 Frameset", voyager_frameset, Dict.VERS_FRAMES),
			new W3CVersionInfo("HTML 3.2", "XHTML 1.0 Transitional", voyager_loose, Dict.VERS_HTML32),
			new W3CVersionInfo("HTML 2.0", "XHTML 1.0 Strict", voyager_strict, Dict.VERS_HTML20)};

}