/*
GNU GENERAL LICENSE
Copyright (C) 2006 The Lobo Project. Copyright (C) 2014 - 2017 Lobo Evolution
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public
License as published by the Free Software Foundation; either
version 3 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Contact info: lobochief@users.sourceforge.net; ivan.difrancesco@yahoo.it
*/
/*
* Created on Aug 28, 2005
*/
package org.lobobrowser.html.parser;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.apache.logging.log4j.Level;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.lobobrowser.html.HtmlMapping;
import org.lobobrowser.html.HtmlMappingChar;
import org.lobobrowser.html.info.ElementInfo;
import org.lobobrowser.html.io.WritableLineReader;
import org.lobobrowser.http.UserAgentContext;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
/**
* The <code>HtmlParser</code> class is an HTML DOM parser. This parser provides
* the functionality for the standard DOM parser implementation
* {@link org.lobobrowser.html.parser.DocumentBuilderImpl}. This parser class
* may be used directly when a different DOM implementation is preferred.
*/
public class HtmlParser {
/** The Constant logger. */
private static final Logger logger = LogManager.getLogger(HtmlParser.class.getName());
/** The document. */
private final Document document;
/** The ucontext. */
private final UserAgentContext ucontext;
/** The entities. */
private static Map<String, Character> ENTITIES = new HashMap<String, Character>(256);
/** The element infos. */
private static Map<String, ElementInfo> ELEMENT_INFOS = new HashMap<String, ElementInfo>(35);
/** The Constant TOKEN_EOD. */
private static final int TOKEN_EOD = 0;
/** The Constant TOKEN_COMMENT. */
private static final int TOKEN_COMMENT = 1;
/** The Constant TOKEN_TEXT. */
private static final int TOKEN_TEXT = 2;
/** The Constant TOKEN_BEGIN_ELEMENT. */
private static final int TOKEN_BEGIN_ELEMENT = 3;
/** The Constant TOKEN_END_ELEMENT. */
private static final int TOKEN_END_ELEMENT = 4;
/** The Constant TOKEN_FULL_ELEMENT. */
private static final int TOKEN_FULL_ELEMENT = 5;
/** The Constant TOKEN_BAD. */
private static final int TOKEN_BAD = 6;
/** The normal last tag. */
private String normalLastTag = null;
/** The just read tag begin. */
private boolean justReadTagBegin = false;
/** The just read tag end. */
private boolean justReadTagEnd = false;
/**
* Only set when readAttribute returns false.
*/
private boolean justReadEmptyElement = false;
/**
* A node <code>UserData</code> key used to tell nodes that their content
* may be about to be modified. Elements could use this to temporarily
* suspend notifications. The value set will be either
* <code>Boolean.TRUE</code> or <code>Boolean.FALSE</code>.
*/
public static final String MODIFYING_KEY = "cobra.suspend";
static {
ENTITIES = HtmlMappingChar.mappingChar();
ELEMENT_INFOS = HtmlMapping.mappingTag();
}
/**
 * Creates a parser that builds nodes with the given document.
 * <p>
 * The <code>errorHandler</code>, <code>publicId</code> and
 * <code>systemId</code> arguments are accepted for API compatibility but are
 * not stored or used by this constructor.
 *
 * @param ucontext
 *            The user agent context.
 * @param document
 *            A W3C Document instance.
 * @param errorHandler
 *            The error handler (unused).
 * @param publicId
 *            The public ID of the document (unused).
 * @param systemId
 *            The system ID of the document (unused).
 */
public HtmlParser(UserAgentContext ucontext, Document document, ErrorHandler errorHandler, String publicId,
        String systemId) {
    this(ucontext, document);
}
/**
 * Creates a parser that builds nodes with the given document.
 *
 * @param ucontext
 *            The user agent context.
 * @param document
 *            A W3C Document instance; it is used to create every node the
 *            parser produces.
 */
public HtmlParser(UserAgentContext ucontext, Document document) {
    this.document = document;
    this.ucontext = ucontext;
}
/**
 * Tells whether entity references inside the given element should be
 * decoded.
 * <p>
 * The element name is upper-cased with {@link java.util.Locale#ROOT} before
 * the lookup so that the result does not depend on the JVM default locale
 * (with a Turkish default locale, for example, "title" would otherwise be
 * upper-cased with a dotted capital I and never match).
 *
 * @param elementName
 *            the element name, in any case
 * @return the element's own setting, or <code>true</code> when the element
 *         is not known to the parser
 */
public static boolean isDecodeEntities(String elementName) {
    ElementInfo einfo = ELEMENT_INFOS.get(elementName.toUpperCase(java.util.Locale.ROOT));
    return einfo == null || einfo.isDecodeEntities();
}
/**
 * Parses HTML from an input stream, decoding the bytes as UTF-8.
 *
 * @param in
 *            The input stream.
 * @throws IOException
 *             Thrown when there are errors reading the stream.
 * @throws SAXException
 *             Thrown when there are parse errors.
 * @throws UnsupportedEncodingException
 *             Thrown if the character set is not supported.
 */
public void parse(InputStream in) throws IOException, SAXException, UnsupportedEncodingException {
    final String defaultCharset = "UTF-8";
    parse(in, defaultCharset);
}
/**
 * Parses HTML from an input stream, decoding the bytes with the given
 * character set.
 *
 * @param in
 *            The input stream.
 * @param charset
 *            The name of the character set.
 * @throws IOException
 *             Thrown when there's an error reading from the stream.
 * @throws SAXException
 *             Thrown when there is a parser error.
 * @throws UnsupportedEncodingException
 *             Thrown if the character set is not supported.
 */
public void parse(InputStream in, String charset) throws IOException, SAXException, UnsupportedEncodingException {
    // Decode the bytes first, then wrap the characters in the line reader.
    final InputStreamReader decoded = new InputStreamReader(in, charset);
    final WritableLineReader lineReader = new WritableLineReader(decoded);
    parse(lineReader);
}
/**
 * Parses HTML supplied by a <code>Reader</code>, appending the resulting
 * nodes to the document given to this parser. The reader is wrapped in a
 * <code>LineNumberReader</code>.
 *
 * @param reader
 *            An instance of <code>Reader</code>.
 * @throws IOException
 *             Thrown if there are errors reading the input.
 * @throws SAXException
 *             Thrown if there are parse errors.
 */
public void parse(Reader reader) throws IOException, SAXException {
    final LineNumberReader lineReader = new LineNumberReader(reader);
    parse(lineReader);
}
/**
 * Parses HTML from the given reader, appending the resulting nodes directly
 * to the document given to this parser.
 *
 * @param reader
 *            the reader
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 * @throws SAXException
 *             Thrown if there are parse errors.
 */
public void parse(LineNumberReader reader) throws IOException, SAXException {
    parse(reader, this.document);
}
/**
 * Parses HTML and builds the DOM under the given node rather than under the
 * document, such as when <code>innerHTML</code> is used in Javascript. The
 * reader is wrapped in a <code>LineNumberReader</code>.
 *
 * @param reader
 *            A document reader.
 * @param parent
 *            The root node for the parsed DOM.
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 * @throws SAXException
 *             Thrown if there are parse errors.
 */
public void parse(Reader reader, Node parent) throws IOException, SAXException {
    final LineNumberReader lineReader = new LineNumberReader(reader);
    parse(lineReader, parent);
}
/**
 * Parses HTML and builds the DOM under the given node, such as when
 * <code>innerHTML</code> is used in Javascript.
 * <p>
 * While parsing, the {@link #MODIFYING_KEY} user data of
 * <code>parent</code> is set to <code>Boolean.TRUE</code>; it is reset to
 * <code>Boolean.FALSE</code> when parsing ends, whether normally or with an
 * exception. Existing children of <code>parent</code> are not removed.
 *
 * @param reader
 *            A LineNumberReader for the document.
 * @param parent
 *            The root node for the parsed DOM.
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 * @throws SAXException
 *             Thrown if a stop-tag signal escapes to the top level.
 */
public void parse(LineNumberReader reader, Node parent) throws IOException, SAXException {
    // Note: Parser does not clear document. It could be used incrementally.
    try {
        parent.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
        int token;
        do {
            // Top level: no stop tags, and a fresh ancestor chain per token.
            token = this.parseToken(parent, reader, null, new LinkedList<String>());
        } while (token != TOKEN_EOD);
    } catch (StopException se) {
        throw new SAXException("Unexpected flow exception", se);
    } finally {
        parent.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
    }
}
/**
* Parses one token: any text that precedes the next tag, followed by that
* tag itself. For an element this includes, recursively, its content up to
* the end tag that closes it.
* <p>
* Text is entity-decoded and appended to <code>parent</code> as a text
* node. The tag is then handled according to its first character:
* <ul>
* <li><code>!--</code>: the comment is read and appended;</li>
* <li>any other <code>!</code> tag: skipped up to its end;</li>
* <li><code>/</code>: an end tag, skipped up to its end;</li>
* <li><code>?</code>: a processing instruction is read and appended;</li>
* <li>anything else: an element is created, its attributes are read, it is
* appended to <code>parent</code> and its content is parsed.</li>
* </ul>
* On every path that reads a tag, <code>normalLastTag</code> is left holding
* the upper-cased tag name (without a leading slash for end tags).
*
* @param parent
* the node that receives the parsed text and element
* @param reader
* the reader
* @param stopTags
* If tags in this set are encountered, the method throws
* StopException. May be null.
* @param ancestors
* upper-cased names of the elements currently being parsed,
* innermost first
* @return <code>TOKEN_EOD</code> at end of input; <code>TOKEN_TEXT</code>
* when text was read but no tag followed; <code>TOKEN_COMMENT</code>
* for a comment; <code>TOKEN_BAD</code> for another
* <code>&lt;!...&gt;</code> tag; <code>TOKEN_END_ELEMENT</code> for
* an end tag that this call does not consume itself;
* <code>TOKEN_FULL_ELEMENT</code> for an element closed by its own
* end tag or for a processing instruction;
* <code>TOKEN_BEGIN_ELEMENT</code> for an element that takes no
* content here (empty-element syntax or forbidden end tag)
* @throws IOException
* Signals that an I/O exception has occurred.
* @throws StopException
* when an element whose name is in <code>stopTags</code> is
* opened; the exception carries the new, not yet appended
* element
* @throws SAXException
* the SAX exception
*/
private final int parseToken(Node parent, LineNumberReader reader, Set<String> stopTags,
LinkedList<String> ancestors) throws IOException, StopException, SAXException {
Document doc = this.document;
StringBuffer textSb = this.readUpToTagBegin(reader);
if (textSb == null) {
// End of input with no pending text.
return TOKEN_EOD;
}
if (textSb.length() != 0) {
// int textLine = reader.getLineNumber();
StringBuffer decText = this.entityDecode(textSb);
Node textNode = doc.createTextNode(decText.toString());
try {
parent.appendChild(textNode);
} catch (DOMException de) {
// A hierarchy error on the document node itself is not logged;
// any other failure is.
if ((parent.getNodeType() != Node.DOCUMENT_NODE) || (de.code != DOMException.HIERARCHY_REQUEST_ERR)) {
logger.error("parseToken(): Unable to append child to " + parent + ".", de);
}
}
}
if (this.justReadTagBegin) {
String tag = this.readTag(parent, reader);
if (tag == null) {
return TOKEN_EOD;
}
String normalTag = tag.toUpperCase();
// The finally clause of this try records normalTag in normalLastTag on
// every exit path, including exceptions.
try {
if (tag.startsWith("!")) {
if ("!--".equals(tag)) {
// int commentLine = reader.getLineNumber();
StringBuffer comment = this.passEndOfComment(reader);
StringBuffer decText = this.entityDecode(comment);
parent.appendChild(doc.createComment(decText.toString()));
return TOKEN_COMMENT;
} else {
// TODO: DOCTYPE node
this.passEndOfTag(reader);
return TOKEN_BAD;
}
} else if (tag.startsWith("/")) {
// End tag: drop the slash so that normalLastTag holds the bare name.
tag = tag.substring(1);
normalTag = normalTag.substring(1);
this.passEndOfTag(reader);
return TOKEN_END_ELEMENT;
} else if (tag.startsWith("?")) {
tag = tag.substring(1);
StringBuffer data = readProcessingInstruction(reader);
parent.appendChild(doc.createProcessingInstruction(tag, data.toString()));
return TOKEN_FULL_ELEMENT;
} else {
// Start tag. A namespace-style prefix ("prefix:name") is dropped from
// the name used to create the element and to look up its ElementInfo.
int localIndex = normalTag.indexOf(':');
boolean tagHasPrefix = localIndex > 0;
String localName = tagHasPrefix ? normalTag.substring(localIndex + 1) : normalTag;
Element element = doc.createElement(localName);
element.setUserData(MODIFYING_KEY, Boolean.TRUE, null);
try {
if (!this.justReadTagEnd) {
// Consume attributes until the tag is closed or input ends.
while (this.readAttribute(reader, element)) {
;
}
}
if ((stopTags != null) && stopTags.contains(normalTag)) {
// Throw before appending to parent.
// After attributes are set.
// After MODIFYING_KEY is set.
throw new StopException(element);
}
// Add element to parent before children are added.
// This is necessary for incremental rendering.
parent.appendChild(element);
if (!this.justReadEmptyElement) {
ElementInfo einfo = ELEMENT_INFOS.get(localName);
// Unknown elements are treated as requiring an end tag and allowing children.
int endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED : einfo.getEndElementType();
if (endTagType != ElementInfo.END_ELEMENT_FORBIDDEN) {
boolean childrenOk = einfo == null ? true : einfo.isChildElementOk();
Set<String> newStopSet = einfo == null ? null : einfo.getStopTags();
if (newStopSet == null) {
// An element with an optional end tag is implicitly closed by
// another start tag of the same name.
if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
newStopSet = Collections.singleton(normalTag);
}
}
if (stopTags != null) {
if (newStopSet != null) {
// Inherit the caller's stop tags in addition to this element's own.
Set<String> newStopSet2 = new HashSet<String>();
newStopSet2.addAll(stopTags);
newStopSet2.addAll(newStopSet);
newStopSet = newStopSet2;
} else {
// An element that requires its end tag does not inherit stop tags.
newStopSet = endTagType == ElementInfo.END_ELEMENT_REQUIRED ? null : stopTags;
}
}
ancestors.addFirst(normalTag);
try {
// Parse child tokens until this element is closed, input ends,
// or a stop tag has to be propagated to the caller.
for (;;) {
try {
int token;
if ((einfo != null) && einfo.isNoScriptElement()) {
UserAgentContext ucontext = this.ucontext;
if ((ucontext == null) || ucontext.isScriptingEnabled()) {
// Scripting enabled (or unknown): the content is read up to the
// end tag and discarded (addTextNode is false).
token = this.parseForEndTag(parent, reader, tag, false,
einfo.isDecodeEntities());
} else {
token = this.parseToken(element, reader, newStopSet, ancestors);
}
} else {
// Elements that may not have child elements are read as raw text.
token = childrenOk
? this.parseToken(element, reader, newStopSet, ancestors)
: this.parseForEndTag(element, reader, tag, true,
einfo.isDecodeEntities());
}
if (token == TOKEN_END_ELEMENT) {
String normalLastTag = this.normalLastTag;
if (normalTag.equals(normalLastTag)) {
// Our own end tag.
return TOKEN_FULL_ELEMENT;
} else {
ElementInfo closeTagInfo = ELEMENT_INFOS.get(normalLastTag);
if ((closeTagInfo == null)
|| (closeTagInfo.getEndElementType() != ElementInfo.END_ELEMENT_FORBIDDEN)) {
// TODO: Rather
// inefficient
// algorithm, but it's
// probably executed
// infrequently?
// If the end tag matches an outer ancestor, close this element
// implicitly and let the ancestor handle the end tag.
Iterator<String> i = ancestors.iterator();
if (i.hasNext()) {
// Skip the first entry: it is this element itself.
i.next();
while (i.hasNext()) {
String normalAncestorTag = i.next();
if (normalLastTag.equals(normalAncestorTag)) {
normalTag = normalLastTag;
return TOKEN_END_ELEMENT;
}
}
}
}
// Otherwise the stray end tag is ignored and parsing continues.
// TODO: Working here
}
} else if (token == TOKEN_EOD) {
return TOKEN_EOD;
}
} catch (StopException se) {
// newElement does not have a
// parent.
Element newElement = se.getElement();
tag = newElement.getTagName();
normalTag = tag.toUpperCase();
// If a subelement throws
// StopException with
// a tag matching the current stop
// tag, the exception
// is rethrown (e.g.
// <TR><TD>blah<TR><TD>blah)
if ((stopTags != null) && stopTags.contains(normalTag)) {
throw se;
}
// Otherwise the new element becomes a sibling of the current one:
// recompute the per-element settings for it and keep looping.
einfo = ELEMENT_INFOS.get(normalTag);
endTagType = einfo == null ? ElementInfo.END_ELEMENT_REQUIRED
: einfo.getEndElementType();
childrenOk = einfo == null ? true : einfo.isChildElementOk();
newStopSet = einfo == null ? null : einfo.getStopTags();
if (newStopSet == null) {
if (endTagType == ElementInfo.END_ELEMENT_OPTIONAL) {
newStopSet = Collections.singleton(normalTag);
}
}
if ((stopTags != null) && (newStopSet != null)) {
Set<String> newStopSet2 = new HashSet<String>();
newStopSet2.addAll(stopTags);
newStopSet2.addAll(newStopSet);
newStopSet = newStopSet2;
}
// Replace the current element's entry in the ancestor chain.
ancestors.removeFirst();
ancestors.addFirst(normalTag);
// Switch element
element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
// newElement should have been
// suspended.
element = newElement;
// Add to parent
parent.appendChild(element);
if (this.justReadEmptyElement) {
return TOKEN_BEGIN_ELEMENT;
}
}
}
} finally {
ancestors.removeFirst();
}
}
}
return TOKEN_BEGIN_ELEMENT;
} finally {
// This can inform elements to continue with
// notifications.
// It can also cause Javascript to get processed.
element.setUserData(MODIFYING_KEY, Boolean.FALSE, null);
}
}
} finally {
this.normalLastTag = normalTag;
}
} else {
// Text was read but input ended before another tag.
this.normalLastTag = null;
return TOKEN_TEXT;
}
}
/**
 * Collects characters until the next '&lt;' or the end of input. When a
 * '&lt;' is found the reader is left just past it and
 * <code>justReadTagBegin</code> is set; otherwise it is cleared. The other
 * two tag-state flags are always cleared.
 *
 * @param reader
 *            the reader
 * @return the text read (possibly empty when a tag begins immediately), or
 *         <code>null</code> when the end of input is reached without
 *         reading any character
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 * @throws SAXException
 *             declared for callers; not thrown by this implementation
 */
private final StringBuffer readUpToTagBegin(LineNumberReader reader) throws IOException, SAXException {
    StringBuffer text = null;
    boolean foundTagOpen = false;
    for (int next = reader.read(); next != -1; next = reader.read()) {
        if (next == '<') {
            foundTagOpen = true;
            break;
        }
        if (text == null) {
            text = new StringBuffer();
        }
        text.append((char) next);
    }
    this.justReadTagBegin = foundTagOpen;
    this.justReadTagEnd = false;
    this.justReadEmptyElement = false;
    if (foundTagOpen && (text == null)) {
        // A tag starts right away: report "no text" rather than "end of input".
        text = new StringBuffer(0);
    }
    return text;
}
/**
 * Reads raw text content up to the end tag named <code>tagName</code>
 * (matched case-insensitively), treating every other tag-like sequence as
 * text. A <code>&lt;![CDATA[ ... ]]&gt;</code> section contributes its
 * content without the markers.
 * <p>
 * Fixes relative to the previous implementation:
 * <ul>
 * <li>text such as <code>&lt;!-- x</code> keeps its leading '&lt;' (it used
 * to be dropped);</li>
 * <li>an unterminated <code>&lt;/...</code> at end of input no longer gains
 * a '&gt;' that was never in the input;</li>
 * <li>a '&lt;' directly followed by another '&lt;' no longer hides an end
 * tag that starts at the second one.</li>
 * </ul>
 *
 * @param parent
 *            the node that receives the text node, if one is added
 * @param reader
 *            the reader
 * @param tagName
 *            the name of the end tag to stop at
 * @param addTextNode
 *            whether the collected text is appended to <code>parent</code>;
 *            when false the text is discarded
 * @param decodeEntities
 *            whether entity references in the collected text are decoded
 * @return <code>TOKEN_END_ELEMENT</code> when the end tag was found (its
 *         upper-cased name is stored in <code>normalLastTag</code>), or
 *         <code>TOKEN_EOD</code> when the input ended first
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 * @throws SAXException
 *             the SAX exception
 */
private final int parseForEndTag(Node parent, LineNumberReader reader, String tagName, boolean addTextNode,
        boolean decodeEntities) throws IOException, SAXException {
    StringBuffer sb = new StringBuffer();
    int intCh = reader.read();
    while (intCh != -1) {
        char ch = (char) intCh;
        if (ch != '<') {
            sb.append(ch);
            intCh = reader.read();
            continue;
        }
        intCh = reader.read();
        if (intCh == -1) {
            // Lone '<' at end of input.
            sb.append('<');
            break;
        }
        ch = (char) intCh;
        if (ch == '/') {
            // Candidate end tag: collect its name up to '>'.
            StringBuffer tempBuffer = new StringBuffer();
            boolean closed = false;
            while ((intCh = reader.read()) != -1) {
                ch = (char) intCh;
                if (ch == '>') {
                    closed = true;
                    break;
                }
                tempBuffer.append(ch);
            }
            if (closed) {
                String thisTag = tempBuffer.toString().trim();
                if (thisTag.equalsIgnoreCase(tagName)) {
                    this.justReadTagBegin = false;
                    this.justReadTagEnd = true;
                    this.justReadEmptyElement = false;
                    this.normalLastTag = thisTag.toUpperCase();
                    this.appendRawText(parent, sb, addTextNode, decodeEntities);
                    return HtmlParser.TOKEN_END_ELEMENT;
                }
            }
            // Some other end tag: keep it verbatim as text. The '>' is only
            // reproduced when it was actually read.
            sb.append("</");
            sb.append(tempBuffer);
            if (closed) {
                sb.append('>');
                intCh = reader.read();
            }
            // When not closed, intCh is -1 and the outer loop ends.
        } else if (ch == '!') {
            // NOTE(review): the look-ahead characters are consumed, so an end
            // tag that starts within them is not recognised.
            final String nextSeven = readN(reader, 7);
            if ("[CDATA[".equals(nextSeven)) {
                readCData(reader, sb);
            } else {
                sb.append("<!");
                if (nextSeven != null) {
                    sb.append(nextSeven);
                }
            }
            intCh = reader.read();
        } else {
            // Plain '<'. The following character is not consumed here; it is
            // examined again by the next iteration so that "<</tag>" still
            // finds the end tag.
            sb.append('<');
        }
    }
    this.justReadTagBegin = false;
    this.justReadTagEnd = false;
    this.justReadEmptyElement = false;
    this.appendRawText(parent, sb, addTextNode, decodeEntities);
    return HtmlParser.TOKEN_EOD;
}

/**
 * Appends the collected raw text to <code>parent</code> as a single text
 * node, optionally entity-decoding it first. Nothing is appended when
 * <code>addTextNode</code> is false or the text is empty.
 */
private void appendRawText(Node parent, StringBuffer raw, boolean addTextNode, boolean decodeEntities)
        throws SAXException {
    if (!addTextNode) {
        return;
    }
    StringBuffer content = decodeEntities ? this.entityDecode(raw) : raw;
    String text = content.toString();
    if (text.length() != 0) {
        parent.appendChild(this.document.createTextNode(text));
    }
}
/**
* Reads the name of a tag. The reader offset should be just past the
* opening '&lt;'.
* <p>
* The returned name keeps a leading '!', '/' or '?' marker; a comment
* opening is returned as <code>"!--"</code>. Reading stops at the first
* whitespace character, at '&gt;' or at the end of input. When the tag is
* closed by '&gt;', <code>justReadTagEnd</code> is set and
* <code>justReadEmptyElement</code> tells whether a '/' immediately preceded
* the '&gt;'.
* <p>
* A '&lt;' that is followed by another '&lt;' or by whitespace is not a tag
* start; such text is appended to <code>parent</code> as a text node and
* reading resumes at the next candidate tag.
*
* @param parent
* the node that receives stray '&lt;' text
* @param reader
* the reader
* @return the tag name as read (never null; possibly empty)
* @throws IOException
* Signals that an I/O exception has occurred.
*/
private final String readTag(Node parent, LineNumberReader reader) throws IOException {
StringBuffer sb = new StringBuffer();
int chInt;
chInt = reader.read();
if (chInt != -1) {
// cont: whether the name-reading loop below should run.
boolean cont = true;
char ch;
LOOP: for (;;) {
ch = (char) chInt;
if (Character.isLetter(ch)) {
// Speed up normal case
break LOOP;
} else if (ch == '!') {
sb.append('!');
chInt = reader.read();
if (chInt != -1) {
ch = (char) chInt;
if (ch == '-') {
sb.append('-');
chInt = reader.read();
if (chInt != -1) {
ch = (char) chInt;
if (ch == '-') {
// "!--": a comment opening; the name is complete.
sb.append('-');
cont = false;
}
} else {
cont = false;
}
}
} else {
cont = false;
}
} else if (ch == '/') {
// End tag: keep the slash and move on to the first name character.
sb.append(ch);
chInt = reader.read();
if (chInt != -1) {
ch = (char) chInt;
} else {
cont = false;
}
} else if (ch == '<') {
// A run of consecutive '<' characters: every '<' except the last one
// read is emitted as literal text.
StringBuffer ltText = new StringBuffer(3);
ltText.append('<');
while ((chInt = reader.read()) == '<') {
ltText.append('<');
}
Document doc = this.document;
Node textNode = doc.createTextNode(ltText.toString());
try {
parent.appendChild(textNode);
} catch (DOMException de) {
if ((parent.getNodeType() != Node.DOCUMENT_NODE)
|| (de.code != DOMException.HIERARCHY_REQUEST_ERR)) {
logger.error("parseToken(): Unable to append child to " + parent + ".", de);
}
}
if (chInt == -1) {
cont = false;
} else {
// Re-examine the character that ended the run.
continue LOOP;
}
} else if (Character.isWhitespace(ch)) {
// '<' followed by whitespace is text: emit everything up to the next
// '<' and then restart with the character after it.
StringBuffer ltText = new StringBuffer();
ltText.append('<');
ltText.append(ch);
while ((chInt = reader.read()) != -1) {
ch = (char) chInt;
if (ch == '<') {
chInt = reader.read();
break;
}
ltText.append(ch);
}
Document doc = this.document;
Node textNode = doc.createTextNode(ltText.toString());
try {
parent.appendChild(textNode);
} catch (DOMException de) {
if ((parent.getNodeType() != Node.DOCUMENT_NODE)
|| (de.code != DOMException.HIERARCHY_REQUEST_ERR)) {
logger.error("parseToken(): Unable to append child to " + parent + ".", de);
}
}
if (chInt == -1) {
cont = false;
} else {
continue LOOP;
}
}
break LOOP;
}
if (cont) {
// Read the name itself, starting with the character already in ch.
boolean lastCharSlash = false;
for (;;) {
if (Character.isWhitespace(ch)) {
break;
} else if (ch == '>') {
this.justReadTagEnd = true;
this.justReadTagBegin = false;
this.justReadEmptyElement = lastCharSlash;
String tag = sb.toString();
return tag;
} else if (ch == '/') {
// Held back: only part of the name if more name characters follow.
lastCharSlash = true;
} else {
if (lastCharSlash) {
sb.append('/');
}
lastCharSlash = false;
sb.append(ch);
}
chInt = reader.read();
if (chInt == -1) {
break;
}
ch = (char) chInt;
}
}
}
// The tag was not closed by '>' here; the flags are only reset when some
// name text was read.
if (sb.length() > 0) {
this.justReadTagEnd = false;
this.justReadTagBegin = false;
this.justReadEmptyElement = false;
}
String tag = sb.toString();
return tag;
}
/**
 * Reads the body of a comment up to and including its terminator and
 * returns the comment text. The reader is expected to be just past the
 * opening <code>&lt;!--</code>.
 * <p>
 * The terminator is "--", optionally followed by further dashes and
 * whitespace, followed by '&gt;'; none of it is part of the returned text.
 * A "--" that turns out not to start the terminator is kept in the text
 * (previously such a pair of dashes was silently dropped, so
 * <code>a--b</code> was returned as <code>ab</code>).
 *
 * @param reader
 *            the reader
 * @return the comment text; empty when the tag had already been closed
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 */
private final StringBuffer passEndOfComment(LineNumberReader reader) throws IOException {
    if (this.justReadTagEnd) {
        return new StringBuffer(0);
    }
    StringBuffer sb = new StringBuffer();
    OUTER: for (;;) {
        int chInt = reader.read();
        if (chInt == -1) {
            break OUTER;
        }
        char ch = (char) chInt;
        if (ch != '-') {
            sb.append(ch);
            continue OUTER;
        }
        chInt = reader.read();
        if (chInt == -1) {
            sb.append('-');
            break OUTER;
        }
        ch = (char) chInt;
        if (ch != '-') {
            sb.append('-');
            sb.append(ch);
            continue OUTER;
        }
        // Two dashes seen. They, and any dashes or whitespace after them, are
        // held back until it is known whether '>' follows.
        StringBuffer pending = new StringBuffer("--");
        for (;;) {
            chInt = reader.read();
            if (chInt == -1) {
                // Unterminated comment: the held-back characters are content.
                sb.append(pending);
                break OUTER;
            }
            ch = (char) chInt;
            if (ch == '>') {
                this.justReadTagBegin = false;
                this.justReadTagEnd = true;
                return sb;
            }
            if ((ch == '-') || Character.isWhitespace(ch)) {
                // Allow any number of dashes and blanks before the '>'.
                pending.append(ch);
                continue;
            }
            // Not a terminator after all: everything held back is content.
            sb.append(pending);
            sb.append(ch);
            break;
        }
    }
    if (sb.length() > 0) {
        this.justReadTagBegin = false;
        this.justReadTagEnd = false;
    }
    return sb;
}
/**
 * Skips the remainder of the current tag, up to and including its closing
 * '&gt;'. Does nothing when the tag has already been closed. When the input
 * ends before a '&gt;' is found, the tag-state flags are cleared only if at
 * least one character was consumed.
 *
 * @param reader
 *            the reader
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 */
private final void passEndOfTag(Reader reader) throws IOException {
    if (this.justReadTagEnd) {
        return;
    }
    int next = reader.read();
    if (next == -1) {
        // Nothing was consumed: leave the flags as they are.
        return;
    }
    while (next != -1) {
        if (next == '>') {
            this.justReadTagEnd = true;
            this.justReadTagBegin = false;
            return;
        }
        next = reader.read();
    }
    // Input ended inside the tag.
    this.justReadTagBegin = false;
    this.justReadTagEnd = false;
}
/**
 * Reads the data of a processing instruction: every character up to, but
 * not including, the next '&gt;'. Returns an empty buffer without reading
 * when the tag has already been closed.
 *
 * @param reader
 *            the reader
 * @return the characters read before the closing '&gt;' or the end of input
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 */
private final StringBuffer readProcessingInstruction(LineNumberReader reader) throws IOException {
    final StringBuffer data = new StringBuffer();
    if (this.justReadTagEnd) {
        return data;
    }
    int next;
    while (((next = reader.read()) != -1) && (next != '>')) {
        data.append((char) next);
    }
    this.justReadTagBegin = false;
    // The tag counts as closed only when a '>' was actually seen.
    this.justReadTagEnd = (next != -1);
    return data;
}
/**
* Reads one attribute with a value (<code>name=value</code>) from inside a
* start tag and sets it on the element. Attribute names without a value
* that are met on the way are set with their own name as the value.
* <p>
* The method works in three phases: it reads the name up to '=', skips
* blanks up to the opening quote or the first value character, then reads
* the value up to the closing quote or, for unquoted values, up to
* whitespace or '&gt;'. Values are entity-decoded.
* <p>
* When the tag is closed by '&gt;', <code>justReadTagEnd</code> is set and
* <code>justReadEmptyElement</code> tells whether a '/' immediately preceded
* the '&gt;'.
*
* @param reader
* the reader
* @param element
* the element that receives the attributes
* @return true if an attribute was read and the tag is still open; false
* when the tag was closed or the input ended
* @throws IOException
* Signals that an I/O exception has occurred.
* @throws SAXException
* the SAX exception
*/
private final boolean readAttribute(LineNumberReader reader, Element element) throws IOException, SAXException {
if (this.justReadTagEnd) {
return false;
}
// Read attribute name up to '=' character.
// May read several attribute names without explicit values.
StringBuffer attributeName = null;
boolean blankFound = false;
boolean lastCharSlash = false;
for (;;) {
int chInt = reader.read();
if (chInt == -1) {
// End of input: a pending name becomes a valueless attribute.
if ((attributeName != null) && (attributeName.length() != 0)) {
String attributeNameStr = attributeName.toString();
element.setAttribute(attributeNameStr, attributeNameStr);
attributeName.setLength(0);
}
this.justReadTagBegin = false;
this.justReadTagEnd = false;
this.justReadEmptyElement = false;
return false;
}
char ch = (char) chInt;
if (ch == '=') {
lastCharSlash = false;
blankFound = false;
break;
} else if (ch == '>') {
// Tag closed: a pending name becomes a valueless attribute.
if ((attributeName != null) && (attributeName.length() != 0)) {
String attributeNameStr = attributeName.toString();
element.setAttribute(attributeNameStr, attributeNameStr);
}
this.justReadTagBegin = false;
this.justReadTagEnd = true;
this.justReadEmptyElement = lastCharSlash;
return false;
} else if (ch == '/') {
// A slash separates names just like a blank does.
blankFound = true;
lastCharSlash = true;
} else if (Character.isWhitespace(ch)) {
lastCharSlash = false;
blankFound = true;
} else {
lastCharSlash = false;
if (blankFound) {
// A new name starts after a blank: the previous name had no value.
blankFound = false;
if ((attributeName != null) && (attributeName.length() != 0)) {
String attributeNameStr = attributeName.toString();
element.setAttribute(attributeNameStr, attributeNameStr);
attributeName.setLength(0);
}
}
if (attributeName == null) {
attributeName = new StringBuffer(6);
}
attributeName.append(ch);
}
}
// Read blanks up to open quote or first non-blank.
StringBuffer attributeValue = null;
// The opening quote character, or -1 for an unquoted value.
int openQuote = -1;
for (;;) {
int chInt = reader.read();
if (chInt == -1) {
break;
}
char ch = (char) chInt;
if (ch == '>') {
// "name=" directly followed by '>': treated like a valueless attribute.
if ((attributeName != null) && (attributeName.length() != 0)) {
String attributeNameStr = attributeName.toString();
element.setAttribute(attributeNameStr, attributeNameStr);
}
this.justReadTagBegin = false;
this.justReadTagEnd = true;
this.justReadEmptyElement = lastCharSlash;
return false;
} else if (ch == '/') {
lastCharSlash = true;
} else if (Character.isWhitespace(ch)) {
lastCharSlash = false;
} else {
if (ch == '"') {
openQuote = '"';
} else if (ch == '\'') {
openQuote = '\'';
} else {
// Unquoted value: this character is its first character. A slash
// held back just before it belongs to the value as well.
openQuote = -1;
if (attributeValue == null) {
attributeValue = new StringBuffer(6);
}
if (lastCharSlash) {
attributeValue.append('/');
}
attributeValue.append(ch);
}
lastCharSlash = false;
break;
}
}
// Read attribute value
for (;;) {
int chInt = reader.read();
if (chInt == -1) {
break;
}
char ch = (char) chInt;
if ((openQuote != -1) && (ch == openQuote)) {
// Closing quote of a quoted value.
lastCharSlash = false;
if (attributeName != null) {
String attributeNameStr = attributeName.toString();
if (attributeValue == null) {
// Quotes are closed. There's a distinction
// between blank values and null in HTML, as
// processed by major browsers.
element.setAttribute(attributeNameStr, "");
} else {
StringBuffer actualAttributeValue = this.entityDecode(attributeValue);
element.setAttribute(attributeNameStr, actualAttributeValue.toString());
}
}
this.justReadTagBegin = false;
this.justReadTagEnd = false;
return true;
} else if ((openQuote == -1) && (ch == '>')) {
// An unquoted value ends together with the tag.
if (attributeName != null) {
String attributeNameStr = attributeName.toString();
if (attributeValue == null) {
element.setAttribute(attributeNameStr, null);
} else {
StringBuffer actualAttributeValue = this.entityDecode(attributeValue);
element.setAttribute(attributeNameStr, actualAttributeValue.toString());
}
}
this.justReadTagBegin = false;
this.justReadTagEnd = true;
this.justReadEmptyElement = lastCharSlash;
return false;
} else if ((openQuote == -1) && Character.isWhitespace(ch)) {
// An unquoted value ends at whitespace; more attributes may follow.
lastCharSlash = false;
if (attributeName != null) {
String attributeNameStr = attributeName.toString();
if (attributeValue == null) {
element.setAttribute(attributeNameStr, null);
} else {
StringBuffer actualAttributeValue = this.entityDecode(attributeValue);
element.setAttribute(attributeNameStr, actualAttributeValue.toString());
}
}
this.justReadTagBegin = false;
this.justReadTagEnd = false;
return true;
} else {
if (attributeValue == null) {
attributeValue = new StringBuffer(6);
}
// A slash is held back for one character so that the slash of "/>"
// does not end up in an unquoted value.
if (lastCharSlash) {
attributeValue.append('/');
}
lastCharSlash = false;
attributeValue.append(ch);
}
}
// End of input inside the value: keep what was read.
this.justReadTagBegin = false;
this.justReadTagEnd = false;
if (attributeName != null) {
String attributeNameStr = attributeName.toString();
if (attributeValue == null) {
element.setAttribute(attributeNameStr, null);
} else {
StringBuffer actualAttributeValue = this.entityDecode(attributeValue);
element.setAttribute(attributeNameStr, actualAttributeValue.toString());
}
}
return false;
}
/**
 * Replaces entity references (<code>&amp;name;</code>,
 * <code>&amp;#123;</code>, <code>&amp;#x1F;</code>) in the given text with
 * the characters they stand for.
 * <p>
 * Behavior notes:
 * <ul>
 * <li>When the text contains no '&amp;', the argument itself is returned;
 * otherwise a new buffer is returned.</li>
 * <li>An '&amp;' with no later ';', or with another '&amp;' before that
 * ';', is copied literally; the later '&amp;' is then examined on its own
 * (previously it was swallowed, so <code>a &amp; b &amp;amp; c</code> was
 * left undecoded).</li>
 * <li>Unknown named entities are copied literally.</li>
 * <li>Numeric references are appended as full code points, so characters
 * above U+FFFF are no longer truncated to 16 bits. A number that is not a
 * valid code point yields U+FFFD.</li>
 * <li>A numeric reference that cannot be parsed is logged and yields
 * U+0000, as before.</li>
 * </ul>
 *
 * @param rawText
 *            the raw text
 * @return the decoded text
 * @throws SAXException
 *             declared for callers; not thrown by this implementation
 */
private final StringBuffer entityDecode(StringBuffer rawText) throws org.xml.sax.SAXException {
    int startIdx = 0;
    StringBuffer sb = null;
    for (;;) {
        int ampIdx = rawText.indexOf("&", startIdx);
        if (ampIdx == -1) {
            if (sb == null) {
                // Nothing to decode: hand back the input unchanged.
                return rawText;
            }
            sb.append(rawText, startIdx, rawText.length());
            return sb;
        }
        if (sb == null) {
            sb = new StringBuffer();
        }
        sb.append(rawText, startIdx, ampIdx);
        int semiIdx = rawText.indexOf(";", ampIdx);
        int nextAmpIdx = rawText.indexOf("&", ampIdx + 1);
        if ((semiIdx == -1) || ((nextAmpIdx != -1) && (nextAmpIdx < semiIdx))) {
            // Bare ampersand: an entity name cannot contain another '&'.
            sb.append('&');
            startIdx = ampIdx + 1;
            continue;
        }
        String spec = rawText.substring(ampIdx + 1, semiIdx);
        if (spec.startsWith("#")) {
            String number = spec.substring(1).toLowerCase();
            int codePoint;
            try {
                if (number.startsWith("x")) {
                    codePoint = Integer.parseInt(number.substring(1), 16);
                } else {
                    codePoint = Integer.parseInt(number);
                }
            } catch (NumberFormatException nfe) {
                logger.error("entityDecode()", nfe);
                codePoint = 0;
            }
            if (Character.isValidCodePoint(codePoint)) {
                sb.appendCodePoint(codePoint);
            } else {
                sb.append('\uFFFD');
            }
        } else {
            int chInt = this.getEntityChar(spec);
            if (chInt == -1) {
                sb.append('&');
                sb.append(spec);
                sb.append(';');
            } else {
                sb.append((char) chInt);
            }
        }
        startIdx = semiIdx + 1;
    }
}
/**
 * Looks up the character for a named entity. The name is tried as given
 * first and, failing that, in lower case.
 *
 * @param spec
 *            the entity name, without the leading '&amp;' and trailing ';'
 * @return the character value, or -1 when the entity is not known
 */
private final int getEntityChar(String spec) {
    // TODO: Declared entities
    Character mapped = ENTITIES.get(spec);
    if (mapped == null) {
        mapped = ENTITIES.get(spec.toLowerCase());
    }
    if (mapped == null) {
        return -1;
    }
    return mapped.charValue();
}
/**
 * Reads the content of a CDATA section into <code>sb</code>. The reader is
 * expected to be just past <code>&lt;![CDATA[</code>; on return it is just
 * past the terminating <code>]]&gt;</code>, or at the end of input when the
 * section is not terminated.
 * <p>
 * Closing brackets that are not part of the terminator are content and are
 * kept (previously a ']' that did not start "]]&gt;" was dropped, and a run
 * such as "]]]&gt;" was not recognised as one bracket plus the terminator).
 *
 * @param reader
 *            the reader
 * @param sb
 *            the buffer that receives the section content
 * @throws IOException
 *             Signals that an I/O exception has occurred.
 */
private static void readCData(LineNumberReader reader, StringBuffer sb) throws IOException {
    // Number of consecutive ']' characters read but not yet emitted.
    int pendingBrackets = 0;
    int next;
    while ((next = reader.read()) >= 0) {
        final char nextCh = (char) next;
        if (nextCh == ']') {
            pendingBrackets++;
            continue;
        }
        if ((nextCh == '>') && (pendingBrackets >= 2)) {
            // The last two brackets form the terminator; earlier ones are content.
            for (int i = pendingBrackets - 2; i > 0; i--) {
                sb.append(']');
            }
            return;
        }
        for (; pendingBrackets > 0; pendingBrackets--) {
            sb.append(']');
        }
        sb.append(nextCh);
    }
    // Unterminated section: trailing brackets are content.
    for (; pendingBrackets > 0; pendingBrackets--) {
        sb.append(']');
    }
}
/**
 * Reads up to <code>n</code> characters from the reader. Reading stops
 * early at the end of input or when the reader fails; an I/O failure is not
 * reported and whatever was read before it is returned.
 *
 * @param reader
 *            the reader
 * @param n
 *            the maximum number of characters to read
 * @return the characters read, or <code>null</code> when none could be read
 */
private static String readN(final LineNumberReader reader, final int n) {
    final char[] buffer = new char[n];
    int count = 0;
    try {
        for (; count < n; count++) {
            final int next = reader.read();
            if (next < 0) {
                break;
            }
            buffer[count] = (char) next;
        }
    } catch (IOException ignored) {
        // Best effort: keep what has been read so far.
    }
    return (count == 0) ? null : new String(buffer, 0, count);
}
}