/*
* GNU LESSER GENERAL PUBLIC LICENSE Copyright (C) 2006 The Lobo Project
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
* Contact info: lobochief@users.sourceforge.net
*/
/*
* Created on Aug 28, 2005
*/
package com.nvarghese.beowulf.common.cobra.html.parser;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import com.nvarghese.beowulf.common.cobra.html.UserAgentContext;
import com.nvarghese.beowulf.common.cobra.html.io.WritableLineReader;
/**
* The <code>HtmlParser</code> class is an HTML DOM parser. This parser provides
* the functionality for the standard DOM parser implementation
* {@link com.nvarghese.beowulf.common.cobra.html.parser.DocumentBuilderImpl}.
* This parser class may be used directly when a different DOM implementation is
* preferred.
*/
public class HtmlParser {
private static final Logger logger = Logger.getLogger(HtmlParser.class.getName());
private final Document document;
private final UserAgentContext ucontext;
private final ErrorHandler errorHandler;
private final String publicId;
private final String systemId;
private static final Map ENTITIES = new HashMap(256);
private static final Map ELEMENT_INFOS = new HashMap(35);
/**
* A node <code>UserData</code> key used to tell nodes that their content
* may be about to be modified. Elements could use this to temporarily
* suspend notifications. The value set will be either
* <code>Boolean.TRUE</code> or <code>Boolean.FALSE</code>.
*/
public static final String MODIFYING_KEY = "cobra.suspend";
static {
Map entities = ENTITIES;
entities.put("amp", new Character('&'));
entities.put("lt", new Character('<'));
entities.put("gt", new Character('>'));
entities.put("quot", new Character('"'));
entities.put("nbsp", new Character((char) 160));
entities.put("lsquo", new Character('\u2018'));
entities.put("rsquo", new Character('\u2019'));
entities.put("frasl", new Character((char) 47));
entities.put("ndash", new Character((char) 8211));
entities.put("mdash", new Character((char) 8212));
entities.put("iexcl", new Character((char) 161));
entities.put("cent", new Character((char) 162));
entities.put("pound", new Character((char) 163));
entities.put("curren", new Character((char) 164));
entities.put("yen", new Character((char) 165));
entities.put("brvbar", new Character((char) 166));
entities.put("brkbar", new Character((char) 166));
entities.put("sect", new Character((char) 167));
entities.put("uml", new Character((char) 168));
entities.put("die", new Character((char) 168));
entities.put("copy", new Character((char) 169));
entities.put("ordf", new Character((char) 170));
entities.put("laquo", new Character((char) 171));
entities.put("not", new Character((char) 172));
entities.put("shy", new Character((char) 173));
entities.put("reg", new Character((char) 174));
entities.put("macr", new Character((char) 175));
entities.put("hibar", new Character((char) 175));
entities.put("deg", new Character((char) 176));
entities.put("plusmn", new Character((char) 177));
entities.put("sup2", new Character((char) 178));
entities.put("sup3", new Character((char) 179));
entities.put("acute", new Character((char) 180));
entities.put("micro", new Character((char) 181));
entities.put("para", new Character((char) 182));
entities.put("middot", new Character((char) 183));
entities.put("cedil", new Character((char) 184));
entities.put("sup1", new Character((char) 185));
entities.put("ordm", new Character((char) 186));
entities.put("raquo", new Character((char) 187));
entities.put("frac14", new Character((char) 188));
entities.put("frac12", new Character((char) 189));
entities.put("frac34", new Character((char) 190));
entities.put("iquest", new Character((char) 191));
entities.put("Agrave", new Character((char) 192));
entities.put("Aacute", new Character((char) 193));
entities.put("Acirc", new Character((char) 194));
entities.put("Atilde", new Character((char) 195));
entities.put("Auml", new Character((char) 196));
entities.put("Aring", new Character((char) 197));
entities.put("AElig", new Character((char) 198));
entities.put("Ccedil", new Character((char) 199));
entities.put("Egrave", new Character((char) 200));
entities.put("Eacute", new Character((char) 201));
entities.put("Ecirc", new Character((char) 202));
entities.put("Euml", new Character((char) 203));
entities.put("Igrave", new Character((char) 204));
entities.put("Iacute", new Character((char) 205));
entities.put("Icirc", new Character((char) 206));
entities.put("Iuml", new Character((char) 207));
entities.put("ETH", new Character((char) 208));
entities.put("Ntilde", new Character((char) 209));
entities.put("Ograve", new Character((char) 210));
entities.put("Oacute", new Character((char) 211));
entities.put("Ocirc", new Character((char) 212));
entities.put("Otilde", new Character((char) 213));
entities.put("Ouml", new Character((char) 214));
entities.put("times", new Character((char) 215));
entities.put("Oslash", new Character((char) 216));
entities.put("Ugrave", new Character((char) 217));
entities.put("Uacute", new Character((char) 218));
entities.put("Ucirc", new Character((char) 219));
entities.put("Uuml", new Character((char) 220));
entities.put("Yacute", new Character((char) 221));
entities.put("THORN", new Character((char) 222));
entities.put("szlig", new Character((char) 223));
entities.put("agrave", new Character((char) 224));
entities.put("aacute", new Character((char) 225));
entities.put("acirc", new Character((char) 226));
entities.put("atilde", new Character((char) 227));
entities.put("auml", new Character((char) 228));
entities.put("aring", new Character((char) 229));
entities.put("aelig", new Character((char) 230));
entities.put("ccedil", new Character((char) 231));
entities.put("egrave", new Character((char) 232));
entities.put("eacute", new Character((char) 233));
entities.put("ecirc", new Character((char) 234));
entities.put("euml", new Character((char) 235));
entities.put("igrave", new Character((char) 236));
entities.put("iacute", new Character((char) 237));
entities.put("icirc", new Character((char) 238));
entities.put("iuml", new Character((char) 239));
entities.put("eth", new Character((char) 240));
entities.put("ntilde", new Character((char) 241));
entities.put("ograve", new Character((char) 242));
entities.put("oacute", new Character((char) 243));
entities.put("ocirc", new Character((char) 244));
entities.put("otilde", new Character((char) 245));
entities.put("ouml", new Character((char) 246));
entities.put("divide", new Character((char) 247));
entities.put("oslash", new Character((char) 248));
entities.put("ugrave", new Character((char) 249));
entities.put("uacute", new Character((char) 250));
entities.put("ucirc", new Character((char) 251));
entities.put("uuml", new Character((char) 252));
entities.put("yacute", new Character((char) 253));
entities.put("thorn", new Character((char) 254));
entities.put("yuml", new Character((char) 255));
// symbols from http://de.selfhtml.org/html/referenz/zeichen.htm
// greek letters
entities.put("Alpha", new Character((char) 913));
entities.put("Beta", new Character((char) 914));
entities.put("Gamma", new Character((char) 915));
entities.put("Delta", new Character((char) 916));
entities.put("Epsilon", new Character((char) 917));
entities.put("Zeta", new Character((char) 918));
entities.put("Eta", new Character((char) 919));
entities.put("Theta", new Character((char) 920));
entities.put("Iota", new Character((char) 921));
entities.put("Kappa", new Character((char) 922));
entities.put("Lambda", new Character((char) 923));
entities.put("Mu", new Character((char) 924));
entities.put("Nu", new Character((char) 925));
entities.put("Xi", new Character((char) 926));
entities.put("Omicron", new Character((char) 927));
entities.put("Pi", new Character((char) 928));
entities.put("Rho", new Character((char) 929));
entities.put("Sigma", new Character((char) 930));
entities.put("Sigmaf", new Character((char) 931));
entities.put("Tau", new Character((char) 932));
entities.put("Upsilon", new Character((char) 933));
entities.put("Phi", new Character((char) 934));
entities.put("Chi", new Character((char) 935));
entities.put("Psi", new Character((char) 936));
entities.put("Omega", new Character((char) 937));
entities.put("alpha", new Character((char) 945));
entities.put("beta", new Character((char) 946));
entities.put("gamma", new Character((char) 947));
entities.put("delta", new Character((char) 948));
entities.put("epsilon", new Character((char) 949));
entities.put("zeta", new Character((char) 950));
entities.put("eta", new Character((char) 951));
entities.put("theta", new Character((char) 952));
entities.put("iota", new Character((char) 953));
entities.put("kappa", new Character((char) 954));
entities.put("lambda", new Character((char) 955));
entities.put("mu", new Character((char) 956));
entities.put("nu", new Character((char) 957));
entities.put("xi", new Character((char) 958));
entities.put("omicron", new Character((char) 959));
entities.put("pi", new Character((char) 960));
entities.put("rho", new Character((char) 961));
entities.put("sigma", new Character((char) 962));
entities.put("sigmaf", new Character((char) 963));
entities.put("tau", new Character((char) 964));
entities.put("upsilon", new Character((char) 965));
entities.put("phi", new Character((char) 966));
entities.put("chi", new Character((char) 967));
entities.put("psi", new Character((char) 968));
entities.put("omega", new Character((char) 969));
entities.put("thetasym", new Character((char) 977));
entities.put("upsih", new Character((char) 978));
entities.put("piv", new Character((char) 982));
// math symbols
entities.put("forall", new Character((char) 8704));
entities.put("part", new Character((char) 8706));
entities.put("exist", new Character((char) 8707));
entities.put("empty", new Character((char) 8709));
entities.put("nabla", new Character((char) 8711));
entities.put("isin", new Character((char) 8712));
entities.put("notin", new Character((char) 8713));
entities.put("ni", new Character((char) 8715));
entities.put("prod", new Character((char) 8719));
entities.put("sum", new Character((char) 8721));
entities.put("minus", new Character((char) 8722));
entities.put("lowast", new Character((char) 8727));
entities.put("radic", new Character((char) 8730));
entities.put("prop", new Character((char) 8733));
entities.put("infin", new Character((char) 8734));
entities.put("ang", new Character((char) 8736));
entities.put("and", new Character((char) 8743));
entities.put("or", new Character((char) 8744));
entities.put("cap", new Character((char) 8745));
entities.put("cup", new Character((char) 8746));
entities.put("int", new Character((char) 8747));
entities.put("there4", new Character((char) 8756));
entities.put("sim", new Character((char) 8764));
entities.put("cong", new Character((char) 8773));
entities.put("asymp", new Character((char) 8776));
entities.put("ne", new Character((char) 8800));
entities.put("equiv", new Character((char) 8801));
entities.put("le", new Character((char) 8804));
entities.put("ge", new Character((char) 8805));
entities.put("sub", new Character((char) 8834));
entities.put("sup", new Character((char) 8835));
entities.put("nsub", new Character((char) 8836));
entities.put("sube", new Character((char) 8838));
entities.put("supe", new Character((char) 8839));
entities.put("oplus", new Character((char) 8853));
entities.put("otimes", new Character((char) 8855));
entities.put("perp", new Character((char) 8869));
entities.put("sdot", new Character((char) 8901));
entities.put("loz", new Character((char) 9674));
// technical symbols
entities.put("lceil", new Character((char) 8968));
entities.put("rceil", new Character((char) 8969));
entities.put("lfloor", new Character((char) 8970));
entities.put("rfloor", new Character((char) 8971));
entities.put("lang", new Character((char) 9001));
entities.put("rang", new Character((char) 9002));
// arrow symbols
entities.put("larr", new Character((char) 8592));
entities.put("uarr", new Character((char) 8593));
entities.put("rarr", new Character((char) 8594));
entities.put("darr", new Character((char) 8595));
entities.put("harr", new Character((char) 8596));
entities.put("crarr", new Character((char) 8629));
entities.put("lArr", new Character((char) 8656));
entities.put("uArr", new Character((char) 8657));
entities.put("rArr", new Character((char) 8658));
entities.put("dArr", new Character((char) 8659));
entities.put("hArr", new Character((char) 8960));
// divers symbols
entities.put("bull", new Character((char) 8226));
entities.put("prime", new Character((char) 8242));
entities.put("Prime", new Character((char) 8243));
entities.put("oline", new Character((char) 8254));
entities.put("weierp", new Character((char) 8472));
entities.put("image", new Character((char) 8465));
entities.put("real", new Character((char) 8476));
entities.put("trade", new Character((char) 8482));
entities.put("euro", new Character((char) 8364));
entities.put("alefsym", new Character((char) 8501));
entities.put("spades", new Character((char) 9824));
entities.put("clubs", new Character((char) 9827));
entities.put("hearts", new Character((char) 9829));
entities.put("diams", new Character((char) 9830));
// ext lat symbols
entities.put("OElig", new Character((char) 338));
entities.put("oelig", new Character((char) 339));
entities.put("Scaron", new Character((char) 352));
entities.put("scaron", new Character((char) 353));
entities.put("fnof", new Character((char) 402));
// interpunction
entities.put("ensp", new Character((char) 8194));
entities.put("emsp", new Character((char) 8195));
entities.put("thinsp", new Character((char) 8201));
entities.put("zwnj", new Character((char) 8204));
entities.put("zwj", new Character((char) 8205));
entities.put("lrm", new Character((char) 8206));
entities.put("rlm", new Character((char) 8207));
entities.put("sbquo", new Character((char) 8218));
entities.put("ldquo", new Character((char) 8220));
entities.put("rdquo", new Character((char) 8221));
entities.put("bdquo", new Character((char) 8222));
entities.put("dagger", new Character((char) 8224));
entities.put("Dagger", new Character((char) 8225));
entities.put("hellip", new Character((char) 8230));
entities.put("permil", new Character((char) 8240));
entities.put("lsaquo", new Character((char) 8249));
entities.put("rsaquo", new Character((char) 8250));
// diacrit symb
entities.put("circ", new Character((char) 710));
entities.put("tilde", new Character((char) 732));
Map elementInfos = ELEMENT_INFOS;
elementInfos.put("NOSCRIPT", new ElementInfo(true, ElementInfo.END_ELEMENT_REQUIRED, null, true));
ElementInfo optionalEndElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL);
ElementInfo forbiddenEndElement = new ElementInfo(false, ElementInfo.END_ELEMENT_FORBIDDEN);
ElementInfo onlyTextDE = new ElementInfo(false, ElementInfo.END_ELEMENT_REQUIRED, true);
ElementInfo onlyText = new ElementInfo(false, ElementInfo.END_ELEMENT_REQUIRED, false);
Set tableCellStopElements = new HashSet();
tableCellStopElements.add("TH");
tableCellStopElements.add("TD");
tableCellStopElements.add("TR");
ElementInfo tableCellElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL, tableCellStopElements);
Set headStopElements = new HashSet();
headStopElements.add("BODY");
headStopElements.add("DIV");
headStopElements.add("SPAN");
headStopElements.add("TABLE");
ElementInfo headElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL, headStopElements);
Set optionStopElements = new HashSet();
optionStopElements.add("OPTION");
optionStopElements.add("SELECT");
ElementInfo optionElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL, optionStopElements);
Set paragraphStopElements = new HashSet();
paragraphStopElements.add("P");
paragraphStopElements.add("DIV");
paragraphStopElements.add("TABLE");
paragraphStopElements.add("PRE");
paragraphStopElements.add("UL");
paragraphStopElements.add("OL");
ElementInfo paragraphElement = new ElementInfo(true, ElementInfo.END_ELEMENT_OPTIONAL, paragraphStopElements);
// Set liStopElements = new HashSet();
// liStopElements.add("LI");
// liStopElements.add("UL");
// liStopElements.add("OL");
elementInfos.put("SCRIPT", onlyText);
elementInfos.put("STYLE", onlyText);
elementInfos.put("TEXTAREA", onlyTextDE);
elementInfos.put("IMG", forbiddenEndElement);
elementInfos.put("META", forbiddenEndElement);
elementInfos.put("LINK", forbiddenEndElement);
elementInfos.put("BASE", forbiddenEndElement);
elementInfos.put("INPUT", forbiddenEndElement);
elementInfos.put("FRAME", forbiddenEndElement);
elementInfos.put("BR", forbiddenEndElement);
elementInfos.put("HR", forbiddenEndElement);
elementInfos.put("EMBED", forbiddenEndElement);
elementInfos.put("SPACER", forbiddenEndElement);
elementInfos.put("P", paragraphElement);
elementInfos.put("LI", optionalEndElement);
elementInfos.put("DT", optionalEndElement);
elementInfos.put("DD", optionalEndElement);
elementInfos.put("TR", optionalEndElement);
elementInfos.put("TH", tableCellElement);
elementInfos.put("TD", tableCellElement);
elementInfos.put("HEAD", headElement);
elementInfos.put("OPTION", optionElement);
// Note: The specification states anchors have
// a required end element, but browsers generally behave
// as if it's optional.
elementInfos.put("A", optionalEndElement);
elementInfos.put("ANCHOR", optionalEndElement);
// TODO: Keep adding tags here
}
/**
* Constructs a <code>HtmlParser</code>.
*
* @param document
* A W3C Document instance.
* @param errorHandler
* The error handler.
* @param publicId
* The public ID of the document.
* @param systemId
* The system ID of the document.
* @deprecated UserAgentContext should be passed in constructor.
*/
public HtmlParser(Document document, ErrorHandler errorHandler, String publicId, String systemId) {
this.ucontext = null;
this.document = document;
this.errorHandler = errorHandler;
this.publicId = publicId;
this.systemId = systemId;
}
/**
* Constructs a <code>HtmlParser</code>.
*
* @param ucontext
* The user agent context.
* @param document
* An W3C Document instance.
* @param errorHandler
* The error handler.
* @param publicId
* The public ID of the document.
* @param systemId
* The system ID of the document.
*/
public HtmlParser(UserAgentContext ucontext, Document document, ErrorHandler errorHandler, String publicId, String systemId) {
this.ucontext = ucontext;
this.document = document;
this.errorHandler = errorHandler;
this.publicId = publicId;
this.systemId = systemId;
}
/**
* Constructs a <code>HtmlParser</code>.
*
* @param ucontext
* The user agent context.
* @param document
* A W3C Document instance.
*/
public HtmlParser(UserAgentContext ucontext, Document document) {
this.ucontext = ucontext;
this.document = document;
this.errorHandler = null;
this.publicId = null;
this.systemId = null;
}
public static boolean isDecodeEntities(String elementName) {
ElementInfo einfo = (ElementInfo) ELEMENT_INFOS.get(elementName.toUpperCase());
return einfo == null ? true : einfo.decodeEntities;
}
/**
* Parses HTML from an input stream, assuming the character set is
* ISO-8859-1.
*
* @param in
* The input stream.
* @throws IOException
* Thrown when there are errors reading the stream.
* @throws SAXException
* Thrown when there are parse errors.
*/
public void parse(InputStream in) throws IOException, SAXException, UnsupportedEncodingException {
this.parse(in, "ISO-8859-1");
}
/**
* Parses HTML from an input stream, using the given character set.
*
* @param in
* The input stream.
* @param charset
* The character set.
* @throws IOException
* Thrown when there's an error reading from the stream.
* @throws SAXException
* Thrown when there is a parser error.
* @throws UnsupportedEncodingException
* Thrown if the character set is not supported.
*/
public void parse(InputStream in, String charset) throws IOException, SAXException, UnsupportedEncodingException {
WritableLineReader reader = new WritableLineReader(new InputStreamReader(in, charset));
this.parse(reader);
}
/**
* Parses HTML given by a <code>Reader</code>. This method appends nodes to
* the document provided to the parser.
*
* @param reader
* An instance of <code>Reader</code>.
* @throws IOException
* Thrown if there are errors reading the input stream.
* @throws SAXException
* Thrown if there are parse errors.
*/
public void parse(Reader reader) throws IOException, SAXException {
this.parse(new LineNumberReader(reader));
}
public void parse(LineNumberReader reader) throws IOException, SAXException {
Document doc = this.document;
this.parse(reader, doc);
}
/**
* This method may be used when the DOM should be built under a given node,
* such as when <code>innerHTML</code> is used in Javascript.
*
* @param reader
* A document reader.
* @param parent
* The root node for the parsed DOM.
* @throws IOException
* @throws SAXException
*/
public void parse(Reader reader, Node parent) throws IOException, SAXException {
this.parse(new LineNumberReader(reader), parent);
}
/**
* This method may be used when the DOM should be built under a given node,
* such as when <code>innerHTML</code> is used in Javascript.
*
* @param reader
* A LineNumberReader for the document.
* @param parent
* The root node for the parsed DOM.
* @throws IOException
* @throws SAXException
*/
public void parse(LineNumberReader reader, Node parent) throws IOException, SAXException {
// Note: Parser does not clear document. It could be used incrementally.
try {
parent.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
try {
while (this.parseToken(parent, reader, null, new LinkedList()) != TOKEN_EOD) {
;
}
} catch (StopException se) {
throw new SAXException("Unexpected flow exception", se);
}
} finally {
parent.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
}
}
private static final int TOKEN_EOD = 0;
private static final int TOKEN_COMMENT = 1;
private static final int TOKEN_TEXT = 2;
private static final int TOKEN_BEGIN_ELEMENT = 3;
private static final int TOKEN_END_ELEMENT = 4;
private static final int TOKEN_FULL_ELEMENT = 5;
private static final int TOKEN_BAD = 6;
private String normalLastTag = null;
private boolean justReadTagBegin = false;
private boolean justReadTagEnd = false;
/**
* Only set when readAttribute returns false.
*/
private boolean justReadEmptyElement = false;
/**
* Parses text followed by one element.
*
* @param parent
* @param reader
* @param stopAtTagUC
* If this tag is encountered, the method throws StopException.
* @param stopTags
* If tags in this set are encountered, the method throws
* StopException.
* @return
* @throws IOException
* @throws StopException
* @throws SAXException
*/
private final int parseToken(Node parent, LineNumberReader reader, Set stopTags, LinkedList ancestors) throws IOException, StopException,
SAXException {
Document doc = this.document;
StringBuffer textSb = this.readUpToTagBegin(reader);
if (textSb == null) {
return TOKEN_EOD;
}
if (textSb.length() != 0) {
// int textLine = reader.getLineNumber();
StringBuffer decText = this.entityDecode(textSb);
Node textNode = doc.createTextNode(decText.toString());
try {
parent.appendChild(textNode);
} catch (DOMException de) {
if (parent.getNodeType() != Node.DOCUMENT_NODE || de.code != DOMException.HIERARCHY_REQUEST_ERR) {
logger.log(Level.WARNING, "parseToken(): Unable to append child to " + parent + ".", de);
}
}
}
if (this.justReadTagBegin) {
String tag = this.readTag(parent, reader);
if (tag == null) {
return TOKEN_EOD;
}
String normalTag = tag.toUpperCase();
try {
if (tag.startsWith("!")) {
if ("!--".equals(tag)) {
// int commentLine = reader.getLineNumber();
StringBuffer comment = this.passEndOfComment(reader);
StringBuffer decText = this.entityDecode(comment);
parent.appendChild(doc.createComment(decText.toString()));
return TOKEN_COMMENT;
} else {
// TODO: DOCTYPE node
this.passEndOfTag(reader);
return TOKEN_BAD;
}
} else if (tag.startsWith("/")) {
tag = tag.substring(1);
normalTag = normalTag.substring(1);
this.passEndOfTag(reader);
return TOKEN_END_ELEMENT;
} else if (tag.startsWith("?")) {
tag = tag.substring(1);
StringBuffer data = readProcessingInstruction(reader);
parent.appendChild(doc.createProcessingInstruction(tag, data.toString()));
return TOKEN_FULL_ELEMENT;
} else {
Element element = doc.createElement(tag);
element.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
try {
if (!this.justReadTagEnd) {
while (this.readAttribute(reader, element)) {
;
}
}
if (stopTags != null && stopTags.contains(normalTag)) {
// Throw before appending to parent.
// After attributes are set.
// After MODIFYING_KEY is set.
throw new StopException(element);
}
// Add element to parent before children are added.
// This is necessary for incremental rendering.
parent.appendChild(element);
if (!this.justReadEmptyElement) {
ElementInfo einfo = (ElementInfo) ELEMENT_INFOS.get(normalTag);
int endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED : einfo.endElementType;
if (endTagType != ElementInfo.END_ELEMENT_FORBIDDEN) {
boolean childrenOk = einfo == null ? true : einfo.childElementOk;
Set newStopSet = einfo == null ? null : einfo.stopTags;
if (newStopSet == null) {
if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
newStopSet = Collections.singleton(normalTag);
}
}
if (stopTags != null) {
if (newStopSet != null) {
Set newStopSet2 = new HashSet();
newStopSet2.addAll(stopTags);
newStopSet2.addAll(newStopSet);
newStopSet = newStopSet2;
} else {
newStopSet = endTagType == ElementInfo.END_ELEMENT_REQUIRED ? null : stopTags;
}
}
ancestors.addFirst(normalTag);
try {
for (;;) {
try {
int token;
if (einfo != null && einfo.noScriptElement) {
UserAgentContext ucontext = this.ucontext;
if (ucontext == null || ucontext.isScriptingEnabled()) {
token = this.parseForEndTag(parent, reader, tag, false, einfo.decodeEntities);
} else {
token = this.parseToken(element, reader, newStopSet, ancestors);
}
} else {
token = childrenOk ? this.parseToken(element, reader, newStopSet, ancestors) : this.parseForEndTag(
element, reader, tag, true, einfo.decodeEntities);
}
if (token == TOKEN_END_ELEMENT) {
String normalLastTag = this.normalLastTag;
if (normalTag.equals(normalLastTag)) {
return TOKEN_FULL_ELEMENT;
} else {
ElementInfo closeTagInfo = (ElementInfo) ELEMENT_INFOS.get(normalLastTag);
if (closeTagInfo == null || closeTagInfo.endElementType != ElementInfo.END_ELEMENT_FORBIDDEN) {
// TODO: Rather
// inefficient
// algorithm, but it's
// probably executed
// infrequently?
Iterator i = ancestors.iterator();
if (i.hasNext()) {
i.next();
while (i.hasNext()) {
String normalAncestorTag = (String) i.next();
if (normalLastTag.equals(normalAncestorTag)) {
normalTag = normalLastTag;
return TOKEN_END_ELEMENT;
}
}
}
}
// TODO: Working here
}
} else if (token == TOKEN_EOD) {
return TOKEN_EOD;
}
} catch (StopException se) {
// newElement does not have a
// parent.
Element newElement = se.getElement();
tag = newElement.getTagName();
normalTag = tag.toUpperCase();
// If a subelement throws
// StopException with
// a tag matching the current stop
// tag, the exception
// is rethrown (e.g.
// <TR><TD>blah<TR><TD>blah)
if (stopTags != null && stopTags.contains(normalTag)) {
throw se;
}
einfo = (ElementInfo) ELEMENT_INFOS.get(normalTag);
endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED : einfo.endElementType;
childrenOk = einfo == null ? true : einfo.childElementOk;
newStopSet = einfo == null ? null : einfo.stopTags;
if (newStopSet == null) {
if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
newStopSet = Collections.singleton(normalTag);
}
}
if (stopTags != null && newStopSet != null) {
Set newStopSet2 = new HashSet();
newStopSet2.addAll(stopTags);
newStopSet2.addAll(newStopSet);
newStopSet = newStopSet2;
}
ancestors.removeFirst();
ancestors.addFirst(normalTag);
// Switch element
element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
// newElement should have been
// suspended.
element = newElement;
// Add to parent
parent.appendChild(element);
if (this.justReadEmptyElement) {
return TOKEN_BEGIN_ELEMENT;
}
}
}
} finally {
ancestors.removeFirst();
}
}
}
return TOKEN_BEGIN_ELEMENT;
} finally {
// This can inform elements to continue with
// notifications.
// It can also cause Javascript to get processed.
element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
}
}
} finally {
this.normalLastTag = normalTag;
}
} else {
this.normalLastTag = null;
return TOKEN_TEXT;
}
}
/**
* Reads text until the beginning of the next tag. Leaves the reader offset
* past the opening angle bracket. Returns null only on EOF.
*/
private final StringBuffer readUpToTagBegin(LineNumberReader reader) throws IOException, SAXException {
StringBuffer sb = null;
int intCh;
while ((intCh = reader.read()) != -1) {
char ch = (char) intCh;
if (ch == '<') {
this.justReadTagBegin = true;
this.justReadTagEnd = false;
this.justReadEmptyElement = false;
if (sb == null) {
sb = new StringBuffer(0);
}
return sb;
}
if (sb == null) {
sb = new StringBuffer();
}
sb.append(ch);
}
this.justReadTagBegin = false;
this.justReadTagEnd = false;
this.justReadEmptyElement = false;
return sb;
}
/**
* Assumes that the content is completely made up of text, and parses until
* an ending tag is found.
*
* @param parent
* @param reader
* @param tagName
* @return
* @throws IOException
*/
private final int parseForEndTag(Node parent, LineNumberReader reader, String tagName, boolean addTextNode, boolean decodeEntities)
throws IOException, SAXException {
Document doc = this.document;
int intCh;
StringBuffer sb = new StringBuffer();
while ((intCh = reader.read()) != -1) {
char ch = (char) intCh;
if (ch == '<') {
intCh = reader.read();
if (intCh != -1) {
ch = (char) intCh;
if (ch == '/') {
StringBuffer tempBuffer = new StringBuffer();
INNER: while ((intCh = reader.read()) != -1) {
ch = (char) intCh;
if (ch == '>') {
String thisTag = tempBuffer.toString().trim();
if (thisTag.equalsIgnoreCase(tagName)) {
this.justReadTagBegin = false;
this.justReadTagEnd = true;
this.justReadEmptyElement = false;
this.normalLastTag = thisTag.toUpperCase();
if (addTextNode) {
if (decodeEntities) {
sb = this.entityDecode(sb);
}
String text = sb.toString();
if (text.length() != 0) {
Node textNode = doc.createTextNode(text);
parent.appendChild(textNode);
}
}
return HtmlParser.TOKEN_END_ELEMENT;
} else {
break INNER;
}
} else {
tempBuffer.append(ch);
}
}
sb.append("</");
sb.append(tempBuffer);
} else {
sb.append('<');
}
}
}
sb.append(ch);
}
this.justReadTagBegin = false;
this.justReadTagEnd = false;
this.justReadEmptyElement = false;
if (addTextNode) {
if (decodeEntities) {
sb = this.entityDecode(sb);
}
String text = sb.toString();
if (text.length() != 0) {
Node textNode = doc.createTextNode(text);
parent.appendChild(textNode);
}
}
return HtmlParser.TOKEN_EOD;
}
/**
* The reader offset should be
*
* @param reader
* @return
*/
private final String readTag(Node parent, LineNumberReader reader) throws IOException {
StringBuffer sb = new StringBuffer();
int chInt;
chInt = reader.read();
if (chInt != -1) {
boolean cont = true;
char ch;
LOOP: for (;;) {
ch = (char) chInt;
if (Character.isLetter(ch)) {
// Speed up normal case
break LOOP;
} else if (ch == '!') {
sb.append('!');
chInt = reader.read();
if (chInt != -1) {
ch = (char) chInt;
if (ch == '-') {
sb.append('-');
chInt = reader.read();
if (chInt != -1) {
ch = (char) chInt;
if (ch == '-') {
sb.append('-');
cont = false;
}
} else {
cont = false;
}
}
} else {
cont = false;
}
} else if (ch == '/') {
sb.append(ch);
chInt = reader.read();
if (chInt != -1) {
ch = (char) chInt;
} else {
cont = false;
}
} else if (ch == '<') {
StringBuffer ltText = new StringBuffer(3);
ltText.append('<');
while ((chInt = reader.read()) == '<') {
ltText.append('<');
}
Document doc = this.document;
Node textNode = doc.createTextNode(ltText.toString());
try {
parent.appendChild(textNode);
} catch (DOMException de) {
if (parent.getNodeType() != Node.DOCUMENT_NODE || de.code != DOMException.HIERARCHY_REQUEST_ERR) {
logger.log(Level.WARNING, "parseToken(): Unable to append child to " + parent + ".", de);
}
}
if (chInt == -1) {
cont = false;
} else {
continue LOOP;
}
} else if (Character.isWhitespace(ch)) {
StringBuffer ltText = new StringBuffer();
ltText.append('<');
ltText.append(ch);
while ((chInt = reader.read()) != -1) {
ch = (char) chInt;
if (ch == '<') {
chInt = reader.read();
break;
}
ltText.append(ch);
}
Document doc = this.document;
Node textNode = doc.createTextNode(ltText.toString());
try {
parent.appendChild(textNode);
} catch (DOMException de) {
if (parent.getNodeType() != Node.DOCUMENT_NODE || de.code != DOMException.HIERARCHY_REQUEST_ERR) {
logger.log(Level.WARNING, "parseToken(): Unable to append child to " + parent + ".", de);
}
}
if (chInt == -1) {
cont = false;
} else {
continue LOOP;
}
}
break LOOP;
}
if (cont) {
boolean lastCharSlash = false;
for (;;) {
if (Character.isWhitespace(ch)) {
break;
} else if (ch == '>') {
this.justReadTagEnd = true;
this.justReadTagBegin = false;
this.justReadEmptyElement = lastCharSlash;
String tag = sb.toString();
return tag;
} else if (ch == '/') {
lastCharSlash = true;
} else {
if (lastCharSlash) {
sb.append('/');
}
lastCharSlash = false;
sb.append(ch);
}
chInt = reader.read();
if (chInt == -1) {
break;
}
ch = (char) chInt;
}
}
}
if (sb.length() > 0) {
this.justReadTagEnd = false;
this.justReadTagBegin = false;
this.justReadEmptyElement = false;
}
String tag = sb.toString();
return tag;
}
private final StringBuffer passEndOfComment(LineNumberReader reader) throws IOException {
if (this.justReadTagEnd) {
return new StringBuffer(0);
}
StringBuffer sb = new StringBuffer();
OUTER: for (;;) {
int chInt = reader.read();
if (chInt == -1) {
break OUTER;
}
char ch = (char) chInt;
if (ch == '-') {
chInt = reader.read();
if (chInt == -1) {
sb.append(ch);
break OUTER;
}
ch = (char) chInt;
if (ch == '-') {
StringBuffer extra = null;
INNER: for (;;) {
chInt = reader.read();
if (chInt == -1) {
if (extra != null) {
sb.append(extra.toString());
}
break OUTER;
}
ch = (char) chInt;
if (ch == '>') {
this.justReadTagBegin = false;
this.justReadTagEnd = true;
return sb;
} else if (ch == '-') {
// Allow any number of dashes at the end
if (extra == null) {
extra = new StringBuffer();
extra.append("--");
}
extra.append("-");
} else if (Character.isWhitespace(ch)) {
if (extra == null) {
extra = new StringBuffer();
extra.append("--");
}
extra.append(ch);
} else {
if (extra != null) {
sb.append(extra.toString());
}
sb.append(ch);
break INNER;
}
}
} else {
sb.append('-');
sb.append(ch);
}
} else {
sb.append(ch);
}
}
if (sb.length() > 0) {
this.justReadTagBegin = false;
this.justReadTagEnd = false;
}
return sb;
}
private final void passEndOfTag(Reader reader) throws IOException {
if (this.justReadTagEnd) {
return;
}
boolean readSomething = false;
for (;;) {
int chInt = reader.read();
if (chInt == -1) {
break;
}
readSomething = true;
char ch = (char) chInt;
if (ch == '>') {
this.justReadTagEnd = true;
this.justReadTagBegin = false;
return;
}
}
if (readSomething) {
this.justReadTagBegin = false;
this.justReadTagEnd = false;
}
}
private final StringBuffer readProcessingInstruction(LineNumberReader reader) throws IOException {
StringBuffer pidata = new StringBuffer();
if (this.justReadTagEnd) {
return pidata;
}
int ch;
for (ch = reader.read(); ch != -1 && ch != '>'; ch = reader.read()) {
pidata.append((char) ch);
}
this.justReadTagBegin = false;
this.justReadTagEnd = ch != -1;
return pidata;
}
private final boolean readAttribute(LineNumberReader reader, Element element) throws IOException, SAXException {
if (this.justReadTagEnd) {
return false;
}
// Read attribute name up to '=' character.
// May read several attribute names without explicit values.
StringBuffer attributeName = null;
boolean blankFound = false;
boolean lastCharSlash = false;
for (;;) {
int chInt = reader.read();
if (chInt == -1) {
if (attributeName != null && attributeName.length() != 0) {
String attributeNameStr = attributeName.toString();
element.setAttribute(attributeNameStr, attributeNameStr);
attributeName.setLength(0);
}
this.justReadTagBegin = false;
this.justReadTagEnd = false;
this.justReadEmptyElement = false;
return false;
}
char ch = (char) chInt;
if (ch == '=') {
lastCharSlash = false;
blankFound = false;
break;
} else if (ch == '>') {
if (attributeName != null && attributeName.length() != 0) {
String attributeNameStr = attributeName.toString();
element.setAttribute(attributeNameStr, attributeNameStr);
}
this.justReadTagBegin = false;
this.justReadTagEnd = true;
this.justReadEmptyElement = lastCharSlash;
return false;
} else if (ch == '/') {
blankFound = true;
lastCharSlash = true;
} else if (Character.isWhitespace(ch)) {
lastCharSlash = false;
blankFound = true;
} else {
lastCharSlash = false;
if (blankFound) {
blankFound = false;
if (attributeName != null && attributeName.length() != 0) {
String attributeNameStr = attributeName.toString();
element.setAttribute(attributeNameStr, attributeNameStr);
attributeName.setLength(0);
}
}
if (attributeName == null) {
attributeName = new StringBuffer(6);
}
attributeName.append(ch);
}
}
// Read blanks up to open quote or first non-blank.
StringBuffer attributeValue = null;
int openQuote = -1;
for (;;) {
int chInt = reader.read();
if (chInt == -1) {
break;
}
char ch = (char) chInt;
if (ch == '>') {
if (attributeName != null && attributeName.length() != 0) {
String attributeNameStr = attributeName.toString();
element.setAttribute(attributeNameStr, attributeNameStr);
}
this.justReadTagBegin = false;
this.justReadTagEnd = true;
this.justReadEmptyElement = lastCharSlash;
return false;
} else if (ch == '/') {
lastCharSlash = true;
} else if (Character.isWhitespace(ch)) {
lastCharSlash = false;
} else {
lastCharSlash = false;
if (ch == '"') {
openQuote = '"';
} else if (ch == '\'') {
openQuote = '\'';
} else {
openQuote = -1;
if (attributeValue == null) {
attributeValue = new StringBuffer(6);
}
attributeValue.append(ch);
}
break;
}
}
// Read attribute value
for (;;) {
int chInt = reader.read();
if (chInt == -1) {
break;
}
char ch = (char) chInt;
if (openQuote != -1 && ch == openQuote) {
lastCharSlash = false;
if (attributeName != null) {
String attributeNameStr = attributeName.toString();
if (attributeValue == null) {
// Quotes are closed. There's a distinction
// between blank values and null in HTML, as
// processed by major browsers.
element.setAttribute(attributeNameStr, "");
} else {
StringBuffer actualAttributeValue = this.entityDecode(attributeValue);
element.setAttribute(attributeNameStr, actualAttributeValue.toString());
}
}
this.justReadTagBegin = false;
this.justReadTagEnd = false;
return true;
} else if (openQuote == -1 && ch == '>') {
if (attributeName != null) {
String attributeNameStr = attributeName.toString();
if (attributeValue == null) {
element.setAttribute(attributeNameStr, null);
} else {
StringBuffer actualAttributeValue = this.entityDecode(attributeValue);
element.setAttribute(attributeNameStr, actualAttributeValue.toString());
}
}
this.justReadTagBegin = false;
this.justReadTagEnd = true;
this.justReadEmptyElement = lastCharSlash;
return false;
} else if (openQuote == -1 && Character.isWhitespace(ch)) {
lastCharSlash = false;
if (attributeName != null) {
String attributeNameStr = attributeName.toString();
if (attributeValue == null) {
element.setAttribute(attributeNameStr, null);
} else {
StringBuffer actualAttributeValue = this.entityDecode(attributeValue);
element.setAttribute(attributeNameStr, actualAttributeValue.toString());
}
}
this.justReadTagBegin = false;
this.justReadTagEnd = false;
return true;
} else {
if (attributeValue == null) {
attributeValue = new StringBuffer(6);
}
if (lastCharSlash) {
attributeValue.append('/');
}
lastCharSlash = false;
attributeValue.append(ch);
}
}
this.justReadTagBegin = false;
this.justReadTagEnd = false;
if (attributeName != null) {
String attributeNameStr = attributeName.toString();
if (attributeValue == null) {
element.setAttribute(attributeNameStr, null);
} else {
StringBuffer actualAttributeValue = this.entityDecode(attributeValue);
element.setAttribute(attributeNameStr, actualAttributeValue.toString());
}
}
return false;
}
private final StringBuffer entityDecode(StringBuffer rawText) throws org.xml.sax.SAXException {
int startIdx = 0;
StringBuffer sb = null;
for (;;) {
int ampIdx = rawText.indexOf("&", startIdx);
if (ampIdx == -1) {
if (sb == null) {
return rawText;
} else {
sb.append(rawText.substring(startIdx));
return sb;
}
}
if (sb == null) {
sb = new StringBuffer();
}
sb.append(rawText.substring(startIdx, ampIdx));
int colonIdx = rawText.indexOf(";", ampIdx);
if (colonIdx == -1) {
sb.append('&');
startIdx = ampIdx + 1;
continue;
}
String spec = rawText.substring(ampIdx + 1, colonIdx);
if (spec.startsWith("#")) {
String number = spec.substring(1).toLowerCase();
int decimal;
try {
if (number.startsWith("x")) {
decimal = Integer.parseInt(number.substring(1), 16);
} else {
decimal = Integer.parseInt(number);
}
} catch (NumberFormatException nfe) {
logger.log(Level.WARNING, "entityDecode()", nfe);
decimal = 0;
}
sb.append((char) decimal);
} else {
int chInt = this.getEntityChar(spec);
if (chInt == -1) {
sb.append('&');
sb.append(spec);
sb.append(';');
} else {
sb.append((char) chInt);
}
}
startIdx = colonIdx + 1;
}
}
private final Locator getLocator(int lineNumber, int columnNumber) {
return new LocatorImpl(this.publicId, this.systemId, lineNumber, columnNumber);
}
private final int getEntityChar(String spec) {
// TODO: Declared entities
Character c = (Character) ENTITIES.get(spec);
if (c == null) {
String specTL = spec.toLowerCase();
c = (Character) ENTITIES.get(specTL);
if (c == null) {
return -1;
}
}
return (int) c.charValue();
}
}