HtmlParser.java example

Explorer
Loboevolution-master
/*
    GNU GENERAL LICENSE
    Copyright (C) 2006 The Lobo Project. Copyright (C) 2014 - 2017 Lobo Evolution

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public
    License as published by the Free Software Foundation; either
    verion 3 of the License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    General License for more details.

    You should have received a copy of the GNU General Public
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
    

    Contact info: lobochief@users.sourceforge.net; ivan.difrancesco@yahoo.it
 */
/*
 * Created on Aug 28, 2005
 */
package org.lobobrowser.html.parser;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import org.lobobrowser.html.HtmlMapping;
import org.lobobrowser.html.HtmlMappingChar;
import org.lobobrowser.html.info.ElementInfo;
import org.lobobrowser.html.io.WritableLineReader;
import org.lobobrowser.http.UserAgentContext;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;

/**
 * The <code>HtmlParser</code> class is an HTML DOM parser. This parser provides
 * the functionality for the standard DOM parser implementation
 * {@link org.lobobrowser.html.parser.DocumentBuilderImpl}. This parser class
 * may be used directly when a different DOM implementation is preferred.
 */
public class HtmlParser {

	/** The Constant logger. */
	private static final Logger logger = LogManager.getLogger(HtmlParser.class.getName());

	/** The document. */
	private final Document document;

	/** The ucontext. */
	private final UserAgentContext ucontext;

	/** The entities. */
	private static Map<String, Character> ENTITIES = new HashMap<String, Character>(256);

	/** The element infos. */
	private static Map<String, ElementInfo> ELEMENT_INFOS = new HashMap<String, ElementInfo>(35);
	
	/** The Constant TOKEN_EOD. */
	private static final int TOKEN_EOD = 0;

	/** The Constant TOKEN_COMMENT. */
	private static final int TOKEN_COMMENT = 1;

	/** The Constant TOKEN_TEXT. */
	private static final int TOKEN_TEXT = 2;

	/** The Constant TOKEN_BEGIN_ELEMENT. */
	private static final int TOKEN_BEGIN_ELEMENT = 3;

	/** The Constant TOKEN_END_ELEMENT. */
	private static final int TOKEN_END_ELEMENT = 4;

	/** The Constant TOKEN_FULL_ELEMENT. */
	private static final int TOKEN_FULL_ELEMENT = 5;

	/** The Constant TOKEN_BAD. */
	private static final int TOKEN_BAD = 6;

	/** The normal last tag. */
	private String normalLastTag = null;

	/** The just read tag begin. */
	private boolean justReadTagBegin = false;

	/** The just read tag end. */
	private boolean justReadTagEnd = false;

	/**
	 * Only set when readAttribute returns false.
	 */
	private boolean justReadEmptyElement = false;

	/**
	 * A node <code>UserData</code> key used to tell nodes that their content
	 * may be about to be modified. Elements could use this to temporarily
	 * suspend notifications. The value set will be either
	 * <code>Boolean.TRUE</code> or <code>Boolean.FALSE</code>.
	 */
	public static final String MODIFYING_KEY = "cobra.suspend";

	static {
		ENTITIES = HtmlMappingChar.mappingChar();
		ELEMENT_INFOS = HtmlMapping.mappingTag();
	}

	/**
	 * Constructs a <code>HtmlParser</code>.
	 *
	 * @param ucontext
	 *            The user agent context.
	 * @param document
	 *            An W3C Document instance.
	 * @param errorHandler
	 *            The error handler.
	 * @param publicId
	 *            The public ID of the document.
	 * @param systemId
	 *            The system ID of the document.
	 */
	public HtmlParser(UserAgentContext ucontext, Document document, ErrorHandler errorHandler, String publicId,
			String systemId) {
		this.ucontext = ucontext;
		this.document = document;
	}

	/**
	 * Constructs a <code>HtmlParser</code>.
	 *
	 * @param ucontext
	 *            The user agent context.
	 * @param document
	 *            A W3C Document instance.
	 */
	public HtmlParser(UserAgentContext ucontext, Document document) {
		this.ucontext = ucontext;
		this.document = document;
	}

	/**
	 * Checks if is decode entities.
	 *
	 * @param elementName
	 *            the element name
	 * @return true, if is decode entities
	 */
	public static boolean isDecodeEntities(String elementName) {
		ElementInfo einfo = ELEMENT_INFOS.get(elementName.toUpperCase());
		return einfo == null ? true : einfo.isDecodeEntities();
	}

	/**
	 * Parses HTML from an input stream, assuming the character set is UTF-8.
	 *
	 * @param in
	 *            The input stream.
	 * @throws IOException
	 *             Thrown when there are errors reading the stream.
	 * @throws SAXException
	 *             Thrown when there are parse errors.
	 * @throws UnsupportedEncodingException
	 *             the unsupported encoding exception
	 */
	public void parse(InputStream in) throws IOException, SAXException, UnsupportedEncodingException {
		this.parse(in, "UTF-8");
	}

	/**
	 * Parses HTML from an input stream, using the given character set.
	 *
	 * @param in
	 *            The input stream.
	 * @param charset
	 *            The character set.
	 * @throws IOException
	 *             Thrown when there's an error reading from the stream.
	 * @throws SAXException
	 *             Thrown when there is a parser error.
	 * @throws UnsupportedEncodingException
	 *             Thrown if the character set is not supported.
	 */
	public void parse(InputStream in, String charset) throws IOException, SAXException, UnsupportedEncodingException {
		WritableLineReader reader = new WritableLineReader(new InputStreamReader(in, charset));
		this.parse(reader);
	}

	/**
	 * Parses HTML given by a <code>Reader</code>. This method appends nodes to
	 * the document provided to the parser.
	 *
	 * @param reader
	 *            An instance of <code>Reader</code>.
	 * @throws IOException
	 *             Thrown if there are errors reading the input stream.
	 * @throws SAXException
	 *             Thrown if there are parse errors.
	 */
	public void parse(Reader reader) throws IOException, SAXException {
		this.parse(new LineNumberReader(reader));
	}

	/**
	 * Parses the.
	 *
	 * @param reader
	 *            the reader
	 * @throws IOException
	 *             Signals that an I/O exception has occurred.
	 * @throws SAXException
	 *             the SAX exception
	 */
	public void parse(LineNumberReader reader) throws IOException, SAXException {
		Document doc = this.document;
		this.parse(reader, doc);
	}

	/**
	 * This method may be used when the DOM should be built under a given node,
	 * such as when <code>innerHTML</code> is used in Javascript.
	 *
	 * @param reader
	 *            A document reader.
	 * @param parent
	 *            The root node for the parsed DOM.
	 * @throws IOException
	 *             Signals that an I/O exception has occurred.
	 * @throws SAXException
	 *             the SAX exception
	 */
	public void parse(Reader reader, Node parent) throws IOException, SAXException {
		this.parse(new LineNumberReader(reader), parent);
	}

	/**
	 * This method may be used when the DOM should be built under a given node,
	 * such as when <code>innerHTML</code> is used in Javascript.
	 *
	 * @param reader
	 *            A LineNumberReader for the document.
	 * @param parent
	 *            The root node for the parsed DOM.
	 * @throws IOException
	 *             Signals that an I/O exception has occurred.
	 * @throws SAXException
	 *             the SAX exception
	 */
	public void parse(LineNumberReader reader, Node parent) throws IOException, SAXException {
		// Note: Parser does not clear document. It could be used incrementally.
		try {
			parent.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
			try {
				while (this.parseToken(parent, reader, null, new LinkedList<String>()) != TOKEN_EOD) {
					;
				}
			} catch (StopException se) {
				throw new SAXException("Unexpected flow exception", se);
			}
		} finally {
			parent.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
		}
	}
	
	/**
	 * Parses text followed by one element.
	 *
	 * @param parent
	 *            the parent
	 * @param reader
	 *            the reader
	 * @param stopTags
	 *            If tags in this set are encountered, the method throws
	 *            StopException.
	 * @param ancestors
	 *            the ancestors
	 * @return the int
	 * @throws IOException
	 *             Signals that an I/O exception has occurred.
	 * @throws StopException
	 *             the stop exception
	 * @throws SAXException
	 *             the SAX exception
	 */
	private final int parseToken(Node parent, LineNumberReader reader, Set<String> stopTags,
			LinkedList<String> ancestors) throws IOException, StopException, SAXException {
		Document doc = this.document;
		StringBuffer textSb = this.readUpToTagBegin(reader);
		if (textSb == null) {
			return TOKEN_EOD;
		}
		if (textSb.length() != 0) {
			// int textLine = reader.getLineNumber();
			StringBuffer decText = this.entityDecode(textSb);
			Node textNode = doc.createTextNode(decText.toString());
			try {
				parent.appendChild(textNode);
			} catch (DOMException de) {
				if ((parent.getNodeType() != Node.DOCUMENT_NODE) || (de.code != DOMException.HIERARCHY_REQUEST_ERR)) {
					logger.error("parseToken(): Unable to append child to " + parent + ".", de);
				}
			}
		}
		if (this.justReadTagBegin) {
			String tag = this.readTag(parent, reader);
			if (tag == null) {
				return TOKEN_EOD;
			}
			String normalTag = tag.toUpperCase();
			try {
				if (tag.startsWith("!")) {
					if ("!--".equals(tag)) {
						// int commentLine = reader.getLineNumber();
						StringBuffer comment = this.passEndOfComment(reader);
						StringBuffer decText = this.entityDecode(comment);
						parent.appendChild(doc.createComment(decText.toString()));
						return TOKEN_COMMENT;
					} else {
						// TODO: DOCTYPE node
						this.passEndOfTag(reader);
						return TOKEN_BAD;
					}
				} else if (tag.startsWith("/")) {
					tag = tag.substring(1);
					normalTag = normalTag.substring(1);
					this.passEndOfTag(reader);
					return TOKEN_END_ELEMENT;
				} else if (tag.startsWith("?")) {
					tag = tag.substring(1);
					StringBuffer data = readProcessingInstruction(reader);
					parent.appendChild(doc.createProcessingInstruction(tag, data.toString()));
					return TOKEN_FULL_ELEMENT;
				} else {
					int localIndex = normalTag.indexOf(':');
					boolean tagHasPrefix = localIndex > 0;
					String localName = tagHasPrefix ? normalTag.substring(localIndex + 1) : normalTag;
					Element element = doc.createElement(localName);
					element.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
					try {
						if (!this.justReadTagEnd) {
							while (this.readAttribute(reader, element)) {
								;
							}
						}
						if ((stopTags != null) && stopTags.contains(normalTag)) {
							// Throw before appending to parent.
							// After attributes are set.
							// After MODIFYING_KEY is set.
							throw new StopException(element);
						}
						// Add element to parent before children are added.
						// This is necessary for incremental rendering.
						parent.appendChild(element);
						if (!this.justReadEmptyElement) {
							ElementInfo einfo = ELEMENT_INFOS.get(localName);
							int endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED : einfo.getEndElementType();
							if (endTagType != ElementInfo.END_ELEMENT_FORBIDDEN) {
								boolean childrenOk = einfo == null ? true : einfo.isChildElementOk();
								Set<String> newStopSet = einfo == null ? null : einfo.getStopTags();
								if (newStopSet == null) {
									if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
										newStopSet = Collections.singleton(normalTag);
									}
								}
								if (stopTags != null) {
									if (newStopSet != null) {
										Set<String> newStopSet2 = new HashSet<String>();
										newStopSet2.addAll(stopTags);
										newStopSet2.addAll(newStopSet);
										newStopSet = newStopSet2;
									} else {
										newStopSet = endTagType == ElementInfo.END_ELEMENT_REQUIRED ? null : stopTags;
									}
								}
								ancestors.addFirst(normalTag);
								try {
									for (;;) {
										try {
											int token;
											if ((einfo != null) && einfo.isNoScriptElement()) {
												UserAgentContext ucontext = this.ucontext;
												if ((ucontext == null) || ucontext.isScriptingEnabled()) {
													token = this.parseForEndTag(parent, reader, tag, false,
															einfo.isDecodeEntities());
												} else {
													token = this.parseToken(element, reader, newStopSet, ancestors);
												}
											} else {
												token = childrenOk
														? this.parseToken(element, reader, newStopSet, ancestors)
														: this.parseForEndTag(element, reader, tag, true,
																einfo.isDecodeEntities());
											}
											if (token == TOKEN_END_ELEMENT) {
												String normalLastTag = this.normalLastTag;
												if (normalTag.equals(normalLastTag)) {
													return TOKEN_FULL_ELEMENT;
												} else {
													ElementInfo closeTagInfo = ELEMENT_INFOS.get(normalLastTag);
													if ((closeTagInfo == null)
															|| (closeTagInfo.getEndElementType() != ElementInfo.END_ELEMENT_FORBIDDEN)) {
														// TODO: Rather
														// inefficient
														// algorithm, but it's
														// probably executed
														// infrequently?
														Iterator<String> i = ancestors.iterator();
														if (i.hasNext()) {
															i.next();
															while (i.hasNext()) {
																String normalAncestorTag = i.next();
																if (normalLastTag.equals(normalAncestorTag)) {
																	normalTag = normalLastTag;
																	return TOKEN_END_ELEMENT;
																}
															}
														}
													}
													// TODO: Working here
												}
											} else if (token == TOKEN_EOD) {
												return TOKEN_EOD;
											}
										} catch (StopException se) {
											// newElement does not have a
											// parent.
											Element newElement = se.getElement();
											tag = newElement.getTagName();
											normalTag = tag.toUpperCase();
											// If a subelement throws
											// StopException with
											// a tag matching the current stop
											// tag, the exception
											// is rethrown (e.g.
											// <TR><TD>blah<TR><TD>blah)
											if ((stopTags != null) && stopTags.contains(normalTag)) {
												throw se;
											}
											einfo = ELEMENT_INFOS.get(normalTag);
											endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED
													: einfo.getEndElementType();
											childrenOk = einfo == null ? true : einfo.isChildElementOk();
											newStopSet = einfo == null ? null : einfo.getStopTags();
											if (newStopSet == null) {
												if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
													newStopSet = Collections.singleton(normalTag);
												}
											}
											if ((stopTags != null) && (newStopSet != null)) {
												Set<String> newStopSet2 = new HashSet<String>();
												newStopSet2.addAll(stopTags);
												newStopSet2.addAll(newStopSet);
												newStopSet = newStopSet2;
											}
											ancestors.removeFirst();
											ancestors.addFirst(normalTag);
											// Switch element
											element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
											// newElement should have been
											// suspended.
											element = newElement;
											// Add to parent
											parent.appendChild(element);
											if (this.justReadEmptyElement) {
												return TOKEN_BEGIN_ELEMENT;
											}
										}
									}
								} finally {
									ancestors.removeFirst();
								}
							}
						}
						return TOKEN_BEGIN_ELEMENT;
					} finally {
						// This can inform elements to continue with
						// notifications.
						// It can also cause Javascript to get processed.
						element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
					}
				}
			} finally {
				this.normalLastTag = normalTag;
			}
		} else {
			this.normalLastTag = null;
			return TOKEN_TEXT;
		}
	}

	/**
	 * Reads text until the beginning of the next tag. Leaves the reader offset
	 * past the opening angle bracket. Returns null only on EOF.
	 *
	 * @param reader
	 *            the reader
	 * @return the string buffer
	 * @throws IOException
	 *             Signals that an I/O exception has occurred.
	 * @throws SAXException
	 *             the SAX exception
	 */
	private final StringBuffer readUpToTagBegin(LineNumberReader reader) throws IOException, SAXException {
		StringBuffer sb = null;
		int intCh;
		while ((intCh = reader.read()) != -1) {
			char ch = (char) intCh;
			if (ch == '<') {
				this.justReadTagBegin = true;
				this.justReadTagEnd = false;
				this.justReadEmptyElement = false;
				if (sb == null) {
					sb = new StringBuffer(0);
				}
				return sb;
			}
			if (sb == null) {
				sb = new StringBuffer();
			}
			sb.append(ch);
		}
		this.justReadTagBegin = false;
		this.justReadTagEnd = false;
		this.justReadEmptyElement = false;
		return sb;
	}

	/**
	 * Assumes that the content is completely made up of text, and parses until
	 * an ending tag is found.
	 *
	 * @param parent
	 *            the parent
	 * @param reader
	 *            the reader
	 * @param tagName
	 *            the tag name
	 * @param addTextNode
	 *            the add text node
	 * @param decodeEntities
	 *            the decode entities
	 * @return the int
	 * @throws IOException
	 *             Signals that an I/O exception has occurred.
	 * @throws SAXException
	 *             the SAX exception
	 */
	private final int parseForEndTag(Node parent, LineNumberReader reader, String tagName, boolean addTextNode,
			boolean decodeEntities) throws IOException, SAXException {
		Document doc = this.document;
		int intCh;
		StringBuffer sb = new StringBuffer();
		while ((intCh = reader.read()) != -1) {
			char ch = (char) intCh;
			if (ch == '<') {
				intCh = reader.read();
				if (intCh != -1) {
					ch = (char) intCh;
					if (ch == '/') {
						StringBuffer tempBuffer = new StringBuffer();
						INNER: while ((intCh = reader.read()) != -1) {
							ch = (char) intCh;
							if (ch == '>') {
								String thisTag = tempBuffer.toString().trim();
								if (thisTag.equalsIgnoreCase(tagName)) {
									this.justReadTagBegin = false;
									this.justReadTagEnd = true;
									this.justReadEmptyElement = false;
									this.normalLastTag = thisTag.toUpperCase();
									if (addTextNode) {
										if (decodeEntities) {
											sb = this.entityDecode(sb);
										}
										String text = sb.toString();
										if (text.length() != 0) {
											Node textNode = doc.createTextNode(text);
											parent.appendChild(textNode);
										}
									}
									return HtmlParser.TOKEN_END_ELEMENT;
								} else {
									break INNER;
								}
							} else {
								tempBuffer.append(ch);
							}
						}
						sb.append("</");
						sb.append(tempBuffer);
						sb.append(">");
					} else if (ch == '!') {
						final String nextSeven = readN(reader, 7);
						if ("[CDATA[".equals(nextSeven)) {
							readCData(reader, sb);
						} else {
							sb.append('!');
							if (nextSeven != null) {
								sb.append(nextSeven);
							}
						}
					} else {
						sb.append('<');
						sb.append(ch);
					}
				} else {
					sb.append('<');
				}
			} else {
				sb.append(ch);
			}
		}

		this.justReadTagBegin = false;
		this.justReadTagEnd = false;
		this.justReadEmptyElement = false;
		if (addTextNode) {
			if (decodeEntities) {
				sb = this.entityDecode(sb);
			}
			String text = sb.toString();
			if (text.length() != 0) {
				Node textNode = doc.createTextNode(text);
				parent.appendChild(textNode);
			}
		}
		return HtmlParser.TOKEN_EOD;
	}

	/**
	 * The reader offset should be.
	 *
	 * @param parent
	 *            the parent
	 * @param reader
	 *            the reader
	 * @return the string
	 * @throws IOException
	 *             Signals that an I/O exception has occurred.
	 */
	private final String readTag(Node parent, LineNumberReader reader) throws IOException {
		StringBuffer sb = new StringBuffer();
		int chInt;
		chInt = reader.read();
		if (chInt != -1) {
			boolean cont = true;
			char ch;
			LOOP: for (;;) {
				ch = (char) chInt;
				if (Character.isLetter(ch)) {
					// Speed up normal case
					break LOOP;
				} else if (ch == '!') {
					sb.append('!');
					chInt = reader.read();
					if (chInt != -1) {
						ch = (char) chInt;
						if (ch == '-') {
							sb.append('-');
							chInt = reader.read();
							if (chInt != -1) {
								ch = (char) chInt;
								if (ch == '-') {
									sb.append('-');
									cont = false;
								}
							} else {
								cont = false;
							}
						}
					} else {
						cont = false;
					}
				} else if (ch == '/') {
					sb.append(ch);
					chInt = reader.read();
					if (chInt != -1) {
						ch = (char) chInt;
					} else {
						cont = false;
					}
				} else if (ch == '<') {
					StringBuffer ltText = new StringBuffer(3);
					ltText.append('<');
					while ((chInt = reader.read()) == '<') {
						ltText.append('<');
					}
					Document doc = this.document;
					Node textNode = doc.createTextNode(ltText.toString());
					try {
						parent.appendChild(textNode);
					} catch (DOMException de) {
						if ((parent.getNodeType() != Node.DOCUMENT_NODE)
								|| (de.code != DOMException.HIERARCHY_REQUEST_ERR)) {
							logger.error("parseToken(): Unable to append child to " + parent + ".", de);
						}
					}
					if (chInt == -1) {
						cont = false;
					} else {
						continue LOOP;
					}
				} else if (Character.isWhitespace(ch)) {
					StringBuffer ltText = new StringBuffer();
					ltText.append('<');
					ltText.append(ch);
					while ((chInt = reader.read()) != -1) {
						ch = (char) chInt;
						if (ch == '<') {
							chInt = reader.read();
							break;
						}
						ltText.append(ch);
					}
					Document doc = this.document;
					Node textNode = doc.createTextNode(ltText.toString());
					try {
						parent.appendChild(textNode);
					} catch (DOMException de) {
						if ((parent.getNodeType() != Node.DOCUMENT_NODE)
								|| (de.code != DOMException.HIERARCHY_REQUEST_ERR)) {
							logger.error("parseToken(): Unable to append child to " + parent + ".", de);
						}
					}
					if (chInt == -1) {
						cont = false;
					} else {
						continue LOOP;
					}
				}
				break LOOP;
			}
			if (cont) {
				boolean lastCharSlash = false;
				for (;;) {
					if (Character.isWhitespace(ch)) {
						break;
					} else if (ch == '>') {
						this.justReadTagEnd = true;
						this.justReadTagBegin = false;
						this.justReadEmptyElement = lastCharSlash;
						String tag = sb.toString();
						return tag;
					} else if (ch == '/') {
						lastCharSlash = true;
					} else {
						if (lastCharSlash) {
							sb.append('/');
						}
						lastCharSlash = false;
						sb.append(ch);
					}
					chInt = reader.read();
					if (chInt == -1) {
						break;
					}
					ch = (char) chInt;
				}
			}
		}
		if (sb.length() > 0) {
			this.justReadTagEnd = false;
			this.justReadTagBegin = false;
			this.justReadEmptyElement = false;
		}
		String tag = sb.toString();
		return tag;
	}

	/**
	 * Pass end of comment.
	 *
	 * @param reader
	 *            the reader
	 * @return the string buffer
	 * @throws IOException
	 *             Signals that an I/O exception has occurred.
	 */
	private final StringBuffer passEndOfComment(LineNumberReader reader) throws IOException {
		if (this.justReadTagEnd) {
			return new StringBuffer(0);
		}
		StringBuffer sb = new StringBuffer();
		OUTER: for (;;) {
			int chInt = reader.read();
			if (chInt == -1) {
				break OUTER;
			}
			char ch = (char) chInt;
			if (ch == '-') {
				chInt = reader.read();
				if (chInt == -1) {
					sb.append(ch);
					break OUTER;
				}
				ch = (char) chInt;
				if (ch == '-') {
					StringBuffer extra = null;
					INNER: for (;;) {
						chInt = reader.read();
						if (chInt == -1) {
							if (extra != null) {
								sb.append(extra.toString());
							}
							break OUTER;
						}
						ch = (char) chInt;
						if (ch == '>') {
							this.justReadTagBegin = false;
							this.justReadTagEnd = true;
							return sb;
						} else if (ch == '-') {
							// Allow any number of dashes at the end
							if (extra == null) {
								extra = new StringBuffer();
								extra.append("--");
							}
							extra.append("-");
						} else if (Character.isWhitespace(ch)) {
							if (extra == null) {
								extra = new StringBuffer();
								extra.append("--");
							}
							extra.append(ch);
						} else {
							if (extra != null) {
								sb.append(extra.toString());
							}
							sb.append(ch);
							break INNER;
						}
					}
				} else {
					sb.append('-');
					sb.append(ch);
				}
			} else {
				sb.append(ch);
			}
		}
		if (sb.length() > 0) {
			this.justReadTagBegin = false;
			this.justReadTagEnd = false;
		}
		return sb;
	}

	/**
	 * Pass end of tag.
	 *
	 * @param reader
	 *            the reader
	 * @throws IOException
	 *             Signals that an I/O exception has occurred.
	 */
	private final void passEndOfTag(Reader reader) throws IOException {
		if (this.justReadTagEnd) {
			return;
		}
		boolean readSomething = false;
		for (;;) {
			int chInt = reader.read();
			if (chInt == -1) {
				break;
			}
			readSomething = true;
			char ch = (char) chInt;
			if (ch == '>') {
				this.justReadTagEnd = true;
				this.justReadTagBegin = false;
				return;
			}
		}
		if (readSomething) {
			this.justReadTagBegin = false;
			this.justReadTagEnd = false;
		}
	}

	/**
	 * Read processing instruction.
	 *
	 * @param reader
	 *            the reader
	 * @return the string buffer
	 * @throws IOException
	 *             Signals that an I/O exception has occurred.
	 */
	private final StringBuffer readProcessingInstruction(LineNumberReader reader) throws IOException {
		StringBuffer pidata = new StringBuffer();
		if (this.justReadTagEnd) {
			return pidata;
		}
		int ch;
		for (ch = reader.read(); (ch != -1) && (ch != '>'); ch = reader.read()) {
			pidata.append((char) ch);
		}
		this.justReadTagBegin = false;
		this.justReadTagEnd = ch != -1;
		return pidata;
	}

	/**
	 * Read attribute.
	 *
	 * @param reader
	 *            the reader
	 * @param element
	 *            the element
	 * @return true, if successful
	 * @throws IOException
	 *             Signals that an I/O exception has occurred.
	 * @throws SAXException
	 *             the SAX exception
	 */
	private final boolean readAttribute(LineNumberReader reader, Element element) throws IOException, SAXException {
		if (this.justReadTagEnd) {
			return false;
		}

		// Read attribute name up to '=' character.
		// May read several attribute names without explicit values.

		StringBuffer attributeName = null;
		boolean blankFound = false;
		boolean lastCharSlash = false;
		for (;;) {
			int chInt = reader.read();
			if (chInt == -1) {
				if ((attributeName != null) && (attributeName.length() != 0)) {
					String attributeNameStr = attributeName.toString();
					element.setAttribute(attributeNameStr, attributeNameStr);
					attributeName.setLength(0);
				}
				this.justReadTagBegin = false;
				this.justReadTagEnd = false;
				this.justReadEmptyElement = false;
				return false;
			}
			char ch = (char) chInt;
			if (ch == '=') {
				lastCharSlash = false;
				blankFound = false;
				break;
			} else if (ch == '>') {
				if ((attributeName != null) && (attributeName.length() != 0)) {
					String attributeNameStr = attributeName.toString();
					element.setAttribute(attributeNameStr, attributeNameStr);
				}
				this.justReadTagBegin = false;
				this.justReadTagEnd = true;
				this.justReadEmptyElement = lastCharSlash;
				return false;
			} else if (ch == '/') {
				blankFound = true;
				lastCharSlash = true;
			} else if (Character.isWhitespace(ch)) {
				lastCharSlash = false;
				blankFound = true;
			} else {
				lastCharSlash = false;
				if (blankFound) {
					blankFound = false;
					if ((attributeName != null) && (attributeName.length() != 0)) {
						String attributeNameStr = attributeName.toString();
						element.setAttribute(attributeNameStr, attributeNameStr);
						attributeName.setLength(0);
					}
				}
				if (attributeName == null) {
					attributeName = new StringBuffer(6);
				}
				attributeName.append(ch);
			}
		}
		// Read blanks up to open quote or first non-blank.
		StringBuffer attributeValue = null;
		int openQuote = -1;
		for (;;) {
			int chInt = reader.read();
			if (chInt == -1) {
				break;
			}
			char ch = (char) chInt;
			if (ch == '>') {
				if ((attributeName != null) && (attributeName.length() != 0)) {
					String attributeNameStr = attributeName.toString();
					element.setAttribute(attributeNameStr, attributeNameStr);
				}
				this.justReadTagBegin = false;
				this.justReadTagEnd = true;
				this.justReadEmptyElement = lastCharSlash;
				return false;
			} else if (ch == '/') {
				lastCharSlash = true;
			} else if (Character.isWhitespace(ch)) {
				lastCharSlash = false;
			} else {
				if (ch == '"') {
					openQuote = '"';
				} else if (ch == '\'') {
					openQuote = '\'';
				} else {
					openQuote = -1;
					if (attributeValue == null) {
						attributeValue = new StringBuffer(6);
					}
					if (lastCharSlash) {
						attributeValue.append('/');
					}
					attributeValue.append(ch);
				}
				lastCharSlash = false;
				break;
			}
		}

		// Read attribute value

		for (;;) {
			int chInt = reader.read();
			if (chInt == -1) {
				break;
			}
			char ch = (char) chInt;
			if ((openQuote != -1) && (ch == openQuote)) {
				lastCharSlash = false;
				if (attributeName != null) {
					String attributeNameStr = attributeName.toString();
					if (attributeValue == null) {
						// Quotes are closed. There's a distinction
						// between blank values and null in HTML, as
						// processed by major browsers.
						element.setAttribute(attributeNameStr, "");
					} else {
						StringBuffer actualAttributeValue = this.entityDecode(attributeValue);
						element.setAttribute(attributeNameStr, actualAttributeValue.toString());
					}
				}
				this.justReadTagBegin = false;
				this.justReadTagEnd = false;
				return true;
			} else if ((openQuote == -1) && (ch == '>')) {
				if (attributeName != null) {
					String attributeNameStr = attributeName.toString();
					if (attributeValue == null) {
						element.setAttribute(attributeNameStr, null);
					} else {
						StringBuffer actualAttributeValue = this.entityDecode(attributeValue);
						element.setAttribute(attributeNameStr, actualAttributeValue.toString());
					}
				}
				this.justReadTagBegin = false;
				this.justReadTagEnd = true;
				this.justReadEmptyElement = lastCharSlash;
				return false;
			} else if ((openQuote == -1) && Character.isWhitespace(ch)) {
				lastCharSlash = false;
				if (attributeName != null) {
					String attributeNameStr = attributeName.toString();
					if (attributeValue == null) {
						element.setAttribute(attributeNameStr, null);
					} else {
						StringBuffer actualAttributeValue = this.entityDecode(attributeValue);
						element.setAttribute(attributeNameStr, actualAttributeValue.toString());
					}
				}
				this.justReadTagBegin = false;
				this.justReadTagEnd = false;
				return true;
			} else {
				if (attributeValue == null) {
					attributeValue = new StringBuffer(6);
				}
				if (lastCharSlash) {
					attributeValue.append('/');
				}
				lastCharSlash = false;
				attributeValue.append(ch);
			}
		}
		this.justReadTagBegin = false;
		this.justReadTagEnd = false;
		if (attributeName != null) {
			String attributeNameStr = attributeName.toString();
			if (attributeValue == null) {
				element.setAttribute(attributeNameStr, null);
			} else {
				StringBuffer actualAttributeValue = this.entityDecode(attributeValue);
				element.setAttribute(attributeNameStr, actualAttributeValue.toString());
			}
		}
		return false;
	}

	/**
	 * Entity decode.
	 *
	 * @param rawText
	 *            the raw text
	 * @return the string buffer
	 * @throws SAXException
	 *             the SAX exception
	 */
	private final StringBuffer entityDecode(StringBuffer rawText) throws org.xml.sax.SAXException {
		int startIdx = 0;
		StringBuffer sb = null;
		for (;;) {
			int ampIdx = rawText.indexOf("&", startIdx);
			if (ampIdx == -1) {
				if (sb == null) {
					return rawText;
				} else {
					sb.append(rawText.substring(startIdx));
					return sb;
				}
			}
			if (sb == null) {
				sb = new StringBuffer();
			}
			sb.append(rawText.substring(startIdx, ampIdx));
			int colonIdx = rawText.indexOf(";", ampIdx);
			if (colonIdx == -1) {
				sb.append('&');
				startIdx = ampIdx + 1;
				continue;
			}
			String spec = rawText.substring(ampIdx + 1, colonIdx);
			if (spec.startsWith("#")) {
				String number = spec.substring(1).toLowerCase();
				int decimal;
				try {
					if (number.startsWith("x")) {
						decimal = Integer.parseInt(number.substring(1), 16);
					} else {
						decimal = Integer.parseInt(number);
					}
				} catch (NumberFormatException nfe) {
					logger.error("entityDecode()", nfe);
					decimal = 0;
				}
				sb.append((char) decimal);
			} else {
				int chInt = this.getEntityChar(spec);
				if (chInt == -1) {
					sb.append('&');
					sb.append(spec);
					sb.append(';');
				} else {
					sb.append((char) chInt);
				}
			}
			startIdx = colonIdx + 1;
		}
	}

	/**
	 * Gets the entity char.
	 *
	 * @param spec
	 *            the spec
	 * @return the entity char
	 */
	private final int getEntityChar(String spec) {
		// TODO: Declared entities
		Character c = ENTITIES.get(spec);
		if (c == null) {
			String specTL = spec.toLowerCase();
			c = ENTITIES.get(specTL);
			if (c == null) {
				return -1;
			}
		}
		return c.charValue();
	}

	/**
	 * read CData
	 *
	 * @param LineNumberReader
	 *            the reader
	 * @param StringBuffer
	 *            the sb
	 * @return void
	 */
	private static void readCData(LineNumberReader reader, StringBuffer sb) throws IOException {
		int next = reader.read();
		while (next >= 0) {
			final char nextCh = (char) next;
			if (nextCh == ']') {
				final String next2 = readN(reader, 2);
				if (next2 != null) {
					if ("]>".equals(next2)) {
						break;
					} else {
						sb.append(next2);
						next = reader.read();
					}
				} else {
					break;
				}
			} else {
				sb.append(nextCh);
				next = reader.read();
			}
		}
	}

	/**
	 * read N Tries to read at most n characters.
	 *
	 * @param LineNumberReader
	 *            the reader
	 * @param n
	 *            the sb
	 * @return String
	 */
	private static String readN(final LineNumberReader reader, final int n) {
		char[] chars = new char[n];
		int i = 0;
		while (i < n) {
			int ich = -1;
			try {
				ich = reader.read();
			} catch (IOException e) {
				break;
			}
			if (ich >= 0) {
				chars[i] = (char) ich;
				i += 1;
			} else {
				break;
			}
		}

		if (i == 0) {
			return null;
		} else {
			return new String(chars, 0, i);
		}
	}

}