package com.smartandroid.sa.tag.parser; import java.util.Iterator; import java.util.LinkedList; import com.smartandroid.sa.tag.helper.DescendableLinkedList; import com.smartandroid.sa.tag.helper.StringUtil; import com.smartandroid.sa.tag.nodes.Attribute; import com.smartandroid.sa.tag.nodes.Attributes; import com.smartandroid.sa.tag.nodes.Document; import com.smartandroid.sa.tag.nodes.DocumentType; import com.smartandroid.sa.tag.nodes.Element; import com.smartandroid.sa.tag.nodes.Node; /** * The Tree Builder's current state. Each state embodies the processing for the * state, and transitions to other states. */ enum HtmlTreeBuilderState { Initial { boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { return true; // ignore whitespace } else if (t.isComment()) { tb.insert(t.asComment()); } else if (t.isDoctype()) { // todo: parse error check on expected doctypes // todo: quirk state check on doctype ids Token.Doctype d = t.asDoctype(); DocumentType doctype = new DocumentType(d.getName(), d.getPublicIdentifier(), d.getSystemIdentifier(), tb.getBaseUri()); tb.getDocument().appendChild(doctype); if (d.isForceQuirks()) tb.getDocument().quirksMode(Document.QuirksMode.quirks); tb.transition(BeforeHtml); } else { // todo: check not iframe srcdoc tb.transition(BeforeHtml); return tb.process(t); // re-process token } return true; } }, BeforeHtml { boolean process(Token t, HtmlTreeBuilder tb) { if (t.isDoctype()) { tb.error(this); return false; } else if (t.isComment()) { tb.insert(t.asComment()); } else if (isWhitespace(t)) { return true; // ignore whitespace } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { tb.insert(t.asStartTag()); tb.transition(BeforeHead); } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br"))) { return anythingElse(t, tb); } else if (t.isEndTag()) { tb.error(this); return false; } else { return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, HtmlTreeBuilder tb) { tb.insert("html"); tb.transition(BeforeHead); return tb.process(t); } }, BeforeHead { boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { return true; } else if (t.isComment()) { tb.insert(t.asComment()); } else if (t.isDoctype()) { tb.error(this); return false; } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { return InBody.process(t, tb); // does not transition } else if (t.isStartTag() && t.asStartTag().name().equals("head")) { Element head = tb.insert(t.asStartTag()); tb.setHeadElement(head); tb.transition(InHead); } else if (t.isEndTag() && (StringUtil.in(t.asEndTag().name(), "head", "body", "html", "br"))) { tb.process(new Token.StartTag("head")); return tb.process(t); } else if (t.isEndTag()) { tb.error(this); return false; } else { tb.process(new Token.StartTag("head")); return tb.process(t); } return true; } }, InHead { boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); return true; } switch (t.type) { case Comment: tb.insert(t.asComment()); break; case Doctype: tb.error(this); return false; case StartTag: Token.StartTag start = t.asStartTag(); String name = start.name(); if (name.equals("html")) { return InBody.process(t, tb); } else if (StringUtil.in(name, "base", "basefont", "bgsound", "command", "link")) { Element el = tb.insertEmpty(start); // jsoup special: update base the frist time it is seen if (name.equals("base") && el.hasAttr("href")) tb.maybeSetBaseUri(el); } else if (name.equals("meta")) { Element meta = tb.insertEmpty(start); // todo: charset switches } else if (name.equals("title")) { handleRcData(start, tb); } else if (StringUtil.in(name, "noframes", "style")) { handleRawtext(start, tb); } else if (name.equals("noscript")) { // else if noscript && scripting flag = true: rawtext (jsoup // doesn't run script, to handle as noscript) tb.insert(start); tb.transition(InHeadNoscript); } else if (name.equals("script")) { // skips some script rules as won't execute them tb.tokeniser.transition(TokeniserState.ScriptData); tb.markInsertionMode(); tb.transition(Text); tb.insert(start); } else if (name.equals("head")) { tb.error(this); return false; } else { return anythingElse(t, tb); } break; case EndTag: Token.EndTag end = t.asEndTag(); name = end.name(); if (name.equals("head")) { tb.pop(); tb.transition(AfterHead); } else if (StringUtil.in(name, "body", "html", "br")) { return anythingElse(t, tb); } else { tb.error(this); return false; } break; default: return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, TreeBuilder tb) { tb.process(new Token.EndTag("head")); return tb.process(t); } }, InHeadNoscript { boolean process(Token t, HtmlTreeBuilder tb) { if (t.isDoctype()) { tb.error(this); } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { return tb.process(t, InBody); } else if (t.isEndTag() && t.asEndTag().name().equals("noscript")) { tb.pop(); tb.transition(InHead); } else if (isWhitespace(t) || t.isComment() || (t.isStartTag() && StringUtil.in(t.asStartTag().name(), "basefont", "bgsound", "link", "meta", "noframes", "style"))) { return tb.process(t, InHead); } else if (t.isEndTag() && t.asEndTag().name().equals("br")) { return anythingElse(t, tb); } else if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(), "head", "noscript")) || t.isEndTag()) { tb.error(this); return false; } else { return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, HtmlTreeBuilder tb) { tb.error(this); tb.process(new Token.EndTag("noscript")); return tb.process(t); } }, AfterHead { boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); } else if (t.isComment()) { tb.insert(t.asComment()); } else if (t.isDoctype()) { tb.error(this); } else if (t.isStartTag()) { Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); if (name.equals("html")) { return tb.process(t, InBody); } else if (name.equals("body")) { tb.insert(startTag); tb.framesetOk(false); tb.transition(InBody); } else if (name.equals("frameset")) { tb.insert(startTag); tb.transition(InFrameset); } else if (StringUtil.in(name, "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title")) { tb.error(this); Element head = tb.getHeadElement(); tb.push(head); tb.process(t, InHead); tb.removeFromStack(head); } else if (name.equals("head")) { tb.error(this); return false; } else { anythingElse(t, tb); } } else if (t.isEndTag()) { if (StringUtil.in(t.asEndTag().name(), "body", "html")) { anythingElse(t, tb); } else { tb.error(this); return false; } } else { anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, HtmlTreeBuilder tb) { tb.process(new Token.StartTag("body")); tb.framesetOk(true); return tb.process(t); } }, InBody { boolean process(Token t, HtmlTreeBuilder tb) { switch (t.type) { case Character: { Token.Character c = t.asCharacter(); if (c.getData().equals(nullString)) { // todo confirm that check tb.error(this); return false; } else if (tb.framesetOk() && isWhitespace(c)) { // don't check // if // whitespace // if frames // already // closed tb.reconstructFormattingElements(); tb.insert(c); } else { tb.reconstructFormattingElements(); tb.insert(c); tb.framesetOk(false); } break; } case Comment: { tb.insert(t.asComment()); break; } case Doctype: { tb.error(this); return false; } case StartTag: Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); if (name.equals("html")) { tb.error(this); // merge attributes onto real html Element html = tb.getStack().getFirst(); for (Attribute attribute : startTag.getAttributes()) { if (!html.hasAttr(attribute.getKey())) html.attributes().put(attribute); } } else if (StringUtil.in(name, Constants.InBodyStartToHead)) { return tb.process(t, InHead); } else if (name.equals("body")) { tb.error(this); LinkedList<Element> stack = tb.getStack(); if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName() .equals("body"))) { // only in fragment case return false; // ignore } else { tb.framesetOk(false); Element body = stack.get(1); for (Attribute attribute : startTag.getAttributes()) { if (!body.hasAttr(attribute.getKey())) body.attributes().put(attribute); } } } else if (name.equals("frameset")) { tb.error(this); LinkedList<Element> stack = tb.getStack(); if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nodeName() .equals("body"))) { // only in fragment case return false; // ignore } else if (!tb.framesetOk()) { return false; // ignore frameset } else { Element second = stack.get(1); if (second.parent() != null) second.remove(); // pop up to html element while (stack.size() > 1) stack.removeLast(); tb.insert(startTag); tb.transition(InFrameset); } } else if (StringUtil.in(name, Constants.InBodyStartPClosers)) { if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insert(startTag); } else if (StringUtil.in(name, Constants.Headings)) { if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } if (StringUtil.in(tb.currentElement().nodeName(), Constants.Headings)) { tb.error(this); tb.pop(); } tb.insert(startTag); } else if (StringUtil.in(name, Constants.InBodyStartPreListing)) { if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insert(startTag); // todo: ignore LF if next token tb.framesetOk(false); } else if (name.equals("form")) { if (tb.getFormElement() != null) { tb.error(this); return false; } if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insertForm(startTag, true); } else if (name.equals("li")) { tb.framesetOk(false); LinkedList<Element> stack = tb.getStack(); for (int i = stack.size() - 1; i > 0; i--) { Element el = stack.get(i); if (el.nodeName().equals("li")) { tb.process(new Token.EndTag("li")); break; } if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), Constants.InBodyStartLiBreakers)) break; } if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insert(startTag); } else if (StringUtil.in(name, Constants.DdDt)) { tb.framesetOk(false); LinkedList<Element> stack = tb.getStack(); for (int i = stack.size() - 1; i > 0; i--) { Element el = stack.get(i); if (StringUtil.in(el.nodeName(), Constants.DdDt)) { tb.process(new Token.EndTag(el.nodeName())); break; } if (tb.isSpecial(el) && !StringUtil.in(el.nodeName(), Constants.InBodyStartLiBreakers)) break; } if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insert(startTag); } else if (name.equals("plaintext")) { if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insert(startTag); tb.tokeniser.transition(TokeniserState.PLAINTEXT); // once // in, // never // gets // out } else if (name.equals("button")) { if (tb.inButtonScope("button")) { // close and reprocess tb.error(this); tb.process(new Token.EndTag("button")); tb.process(startTag); } else { tb.reconstructFormattingElements(); tb.insert(startTag); tb.framesetOk(false); } } else if (name.equals("a")) { if (tb.getActiveFormattingElement("a") != null) { tb.error(this); tb.process(new Token.EndTag("a")); // still on stack? Element remainingA = tb.getFromStack("a"); if (remainingA != null) { tb.removeFromActiveFormattingElements(remainingA); tb.removeFromStack(remainingA); } } tb.reconstructFormattingElements(); Element a = tb.insert(startTag); tb.pushActiveFormattingElements(a); } else if (StringUtil.in(name, Constants.Formatters)) { tb.reconstructFormattingElements(); Element el = tb.insert(startTag); tb.pushActiveFormattingElements(el); } else if (name.equals("nobr")) { tb.reconstructFormattingElements(); if (tb.inScope("nobr")) { tb.error(this); tb.process(new Token.EndTag("nobr")); tb.reconstructFormattingElements(); } Element el = tb.insert(startTag); tb.pushActiveFormattingElements(el); } else if (StringUtil.in(name, Constants.InBodyStartApplets)) { tb.reconstructFormattingElements(); tb.insert(startTag); tb.insertMarkerToFormattingElements(); tb.framesetOk(false); } else if (name.equals("table")) { if (tb.getDocument().quirksMode() != Document.QuirksMode.quirks && tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insert(startTag); tb.framesetOk(false); tb.transition(InTable); } else if (StringUtil.in(name, Constants.InBodyStartEmptyFormatters)) { tb.reconstructFormattingElements(); tb.insertEmpty(startTag); tb.framesetOk(false); } else if (name.equals("input")) { tb.reconstructFormattingElements(); Element el = tb.insertEmpty(startTag); if (!el.attr("type").equalsIgnoreCase("hidden")) tb.framesetOk(false); } else if (StringUtil.in(name, Constants.InBodyStartMedia)) { tb.insertEmpty(startTag); } else if (name.equals("hr")) { if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.insertEmpty(startTag); tb.framesetOk(false); } else if (name.equals("image")) { if (tb.getFromStack("svg") == null) return tb.process(startTag.name("img")); // change // <image> // to <img>, // unless in // svg else tb.insert(startTag); } else if (name.equals("isindex")) { // how much do we care about the early 90s? tb.error(this); if (tb.getFormElement() != null) return false; tb.tokeniser.acknowledgeSelfClosingFlag(); tb.process(new Token.StartTag("form")); if (startTag.attributes.hasKey("action")) { Element form = tb.getFormElement(); form.attr("action", startTag.attributes.get("action")); } tb.process(new Token.StartTag("hr")); tb.process(new Token.StartTag("label")); // hope you like english. String prompt = startTag.attributes.hasKey("prompt") ? startTag.attributes .get("prompt") : "This is a searchable index. Enter search keywords: "; tb.process(new Token.Character(prompt)); // input Attributes inputAttribs = new Attributes(); for (Attribute attr : startTag.attributes) { if (!StringUtil.in(attr.getKey(), Constants.InBodyStartInputAttribs)) inputAttribs.put(attr); } inputAttribs.put("name", "isindex"); tb.process(new Token.StartTag("input", inputAttribs)); tb.process(new Token.EndTag("label")); tb.process(new Token.StartTag("hr")); tb.process(new Token.EndTag("form")); } else if (name.equals("textarea")) { tb.insert(startTag); // todo: If the next token is a U+000A LINE FEED (LF) // character token, then ignore that token and move on to // the next one. (Newlines at the start of textarea elements // are ignored as an authoring convenience.) tb.tokeniser.transition(TokeniserState.Rcdata); tb.markInsertionMode(); tb.framesetOk(false); tb.transition(Text); } else if (name.equals("xmp")) { if (tb.inButtonScope("p")) { tb.process(new Token.EndTag("p")); } tb.reconstructFormattingElements(); tb.framesetOk(false); handleRawtext(startTag, tb); } else if (name.equals("iframe")) { tb.framesetOk(false); handleRawtext(startTag, tb); } else if (name.equals("noembed")) { // also handle noscript if script enabled handleRawtext(startTag, tb); } else if (name.equals("select")) { tb.reconstructFormattingElements(); tb.insert(startTag); tb.framesetOk(false); HtmlTreeBuilderState state = tb.state(); if (state.equals(InTable) || state.equals(InCaption) || state.equals(InTableBody) || state.equals(InRow) || state.equals(InCell)) tb.transition(InSelectInTable); else tb.transition(InSelect); } else if (StringUtil.in(name, Constants.InBodyStartOptions)) { if (tb.currentElement().nodeName().equals("option")) tb.process(new Token.EndTag("option")); tb.reconstructFormattingElements(); tb.insert(startTag); } else if (StringUtil.in(name, Constants.InBodyStartRuby)) { if (tb.inScope("ruby")) { tb.generateImpliedEndTags(); if (!tb.currentElement().nodeName().equals("ruby")) { tb.error(this); tb.popStackToBefore("ruby"); // i.e. close up to but // not include name } tb.insert(startTag); } } else if (name.equals("math")) { tb.reconstructFormattingElements(); // todo: handle A start tag whose tag name is "math" (i.e. // foreign, mathml) tb.insert(startTag); tb.tokeniser.acknowledgeSelfClosingFlag(); } else if (name.equals("svg")) { tb.reconstructFormattingElements(); // todo: handle A start tag whose tag name is "svg" (xlink, // svg) tb.insert(startTag); tb.tokeniser.acknowledgeSelfClosingFlag(); } else if (StringUtil.in(name, Constants.InBodyStartDrop)) { tb.error(this); return false; } else { tb.reconstructFormattingElements(); tb.insert(startTag); } break; case EndTag: Token.EndTag endTag = t.asEndTag(); name = endTag.name(); if (name.equals("body")) { if (!tb.inScope("body")) { tb.error(this); return false; } else { // todo: error if stack contains something not dd, dt, // li, optgroup, option, p, rp, rt, tbody, td, tfoot, // th, thead, tr, body, html tb.transition(AfterBody); } } else if (name.equals("html")) { boolean notIgnored = tb.process(new Token.EndTag("body")); if (notIgnored) return tb.process(endTag); } else if (StringUtil.in(name, Constants.InBodyEndClosers)) { if (!tb.inScope(name)) { // nothing to close tb.error(this); return false; } else { tb.generateImpliedEndTags(); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose(name); } } else if (name.equals("form")) { Element currentForm = tb.getFormElement(); tb.setFormElement(null); if (currentForm == null || !tb.inScope(name)) { tb.error(this); return false; } else { tb.generateImpliedEndTags(); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); // remove currentForm from stack. will shift anything // under up. tb.removeFromStack(currentForm); } } else if (name.equals("p")) { if (!tb.inButtonScope(name)) { tb.error(this); tb.process(new Token.StartTag(name)); // if no p to // close, // creates an // empty <p></p> return tb.process(endTag); } else { tb.generateImpliedEndTags(name); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose(name); } } else if (name.equals("li")) { if (!tb.inListItemScope(name)) { tb.error(this); return false; } else { tb.generateImpliedEndTags(name); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose(name); } } else if (StringUtil.in(name, Constants.DdDt)) { if (!tb.inScope(name)) { tb.error(this); return false; } else { tb.generateImpliedEndTags(name); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose(name); } } else if (StringUtil.in(name, Constants.Headings)) { if (!tb.inScope(Constants.Headings)) { tb.error(this); return false; } else { tb.generateImpliedEndTags(name); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose(Constants.Headings); } } else if (name.equals("sarcasm")) { // *sigh* return anyOtherEndTag(t, tb); } else if (StringUtil.in(name, Constants.InBodyEndAdoptionFormatters)) { // Adoption Agency Algorithm. OUTER: for (int i = 0; i < 8; i++) { Element formatEl = tb.getActiveFormattingElement(name); if (formatEl == null) return anyOtherEndTag(t, tb); else if (!tb.onStack(formatEl)) { tb.error(this); tb.removeFromActiveFormattingElements(formatEl); return true; } else if (!tb.inScope(formatEl.nodeName())) { tb.error(this); return false; } else if (tb.currentElement() != formatEl) tb.error(this); Element furthestBlock = null; Element commonAncestor = null; boolean seenFormattingElement = false; LinkedList<Element> stack = tb.getStack(); // the spec doesn't limit to < 64, but in degenerate // cases (9000+ stack depth) this prevents // run-aways final int stackSize = stack.size(); for (int si = 0; si < stackSize && si < 64; si++) { Element el = stack.get(si); if (el == formatEl) { commonAncestor = stack.get(si - 1); seenFormattingElement = true; } else if (seenFormattingElement && tb.isSpecial(el)) { furthestBlock = el; break; } } if (furthestBlock == null) { tb.popStackToClose(formatEl.nodeName()); tb.removeFromActiveFormattingElements(formatEl); return true; } // todo: Let a bookmark note the position of the // formatting element in the list of active formatting // elements relative to the elements on either side of // it in the list. // does that mean: int pos of format el in list? Element node = furthestBlock; Element lastNode = furthestBlock; INNER: for (int j = 0; j < 3; j++) { if (tb.onStack(node)) node = tb.aboveOnStack(node); if (!tb.isInActiveFormattingElements(node)) { // note // no // bookmark // check tb.removeFromStack(node); continue INNER; } else if (node == formatEl) break INNER; Element replacement = new Element(Tag.valueOf(node .nodeName()), tb.getBaseUri()); tb.replaceActiveFormattingElement(node, replacement); tb.replaceOnStack(node, replacement); node = replacement; if (lastNode == furthestBlock) { // todo: move the aforementioned bookmark to be // immediately after the new node in the list of // active formatting elements. // not getting how this bookmark both straddles // the element above, but is inbetween here... } if (lastNode.parent() != null) lastNode.remove(); node.appendChild(lastNode); lastNode = node; } if (StringUtil.in(commonAncestor.nodeName(), Constants.InBodyEndTableFosters)) { if (lastNode.parent() != null) lastNode.remove(); tb.insertInFosterParent(lastNode); } else { if (lastNode.parent() != null) lastNode.remove(); commonAncestor.appendChild(lastNode); } Element adopter = new Element(formatEl.tag(), tb.getBaseUri()); adopter.attributes().addAll(formatEl.attributes()); Node[] childNodes = furthestBlock.childNodes().toArray( new Node[furthestBlock.childNodeSize()]); for (Node childNode : childNodes) { adopter.appendChild(childNode); // append will // reparent. thus // the clone to // avoid concurrent // mod. } furthestBlock.appendChild(adopter); tb.removeFromActiveFormattingElements(formatEl); // todo: insert the new element into the list of active // formatting elements at the position of the // aforementioned bookmark. tb.removeFromStack(formatEl); tb.insertOnStackAfter(furthestBlock, adopter); } } else if (StringUtil.in(name, Constants.InBodyStartApplets)) { if (!tb.inScope("name")) { if (!tb.inScope(name)) { tb.error(this); return false; } tb.generateImpliedEndTags(); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose(name); tb.clearFormattingElementsToLastMarker(); } } else if (name.equals("br")) { tb.error(this); tb.process(new Token.StartTag("br")); return false; } else { return anyOtherEndTag(t, tb); } break; case EOF: // todo: error if stack contains something not dd, dt, li, p, // tbody, td, tfoot, th, thead, tr, body, html // stop parsing break; } return true; } boolean anyOtherEndTag(Token t, HtmlTreeBuilder tb) { String name = t.asEndTag().name(); DescendableLinkedList<Element> stack = tb.getStack(); Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element node = it.next(); if (node.nodeName().equals(name)) { tb.generateImpliedEndTags(name); if (!name.equals(tb.currentElement().nodeName())) tb.error(this); tb.popStackToClose(name); break; } else { if (tb.isSpecial(node)) { tb.error(this); return false; } } } return true; } }, Text { // in script, style etc. normally treated as data tags boolean process(Token t, HtmlTreeBuilder tb) { if (t.isCharacter()) { tb.insert(t.asCharacter()); } else if (t.isEOF()) { tb.error(this); // if current node is script: already started tb.pop(); tb.transition(tb.originalState()); return tb.process(t); } else if (t.isEndTag()) { // if: An end tag whose tag name is "script" -- scripting // nesting level, if evaluating scripts tb.pop(); tb.transition(tb.originalState()); } return true; } }, InTable { boolean process(Token t, HtmlTreeBuilder tb) { if (t.isCharacter()) { tb.newPendingTableCharacters(); tb.markInsertionMode(); tb.transition(InTableText); return tb.process(t); } else if (t.isComment()) { tb.insert(t.asComment()); return true; } else if (t.isDoctype()) { tb.error(this); return false; } else if (t.isStartTag()) { Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); if (name.equals("caption")) { tb.clearStackToTableContext(); tb.insertMarkerToFormattingElements(); tb.insert(startTag); tb.transition(InCaption); } else if (name.equals("colgroup")) { tb.clearStackToTableContext(); tb.insert(startTag); tb.transition(InColumnGroup); } else if (name.equals("col")) { tb.process(new Token.StartTag("colgroup")); return tb.process(t); } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { tb.clearStackToTableContext(); tb.insert(startTag); tb.transition(InTableBody); } else if (StringUtil.in(name, "td", "th", "tr")) { tb.process(new Token.StartTag("tbody")); return tb.process(t); } else if (name.equals("table")) { tb.error(this); boolean processed = tb.process(new Token.EndTag("table")); if (processed) // only ignored if in fragment return tb.process(t); } else if (StringUtil.in(name, "style", "script")) { return tb.process(t, InHead); } else if (name.equals("input")) { if (!startTag.attributes.get("type").equalsIgnoreCase( "hidden")) { return anythingElse(t, tb); } else { tb.insertEmpty(startTag); } } else if (name.equals("form")) { tb.error(this); if (tb.getFormElement() != null) return false; else { tb.insertForm(startTag, false); } } else { return anythingElse(t, tb); } return true; // todo: check if should return processed // http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intable } else if (t.isEndTag()) { Token.EndTag endTag = t.asEndTag(); String name = endTag.name(); if (name.equals("table")) { if (!tb.inTableScope(name)) { tb.error(this); return false; } else { tb.popStackToClose("table"); } tb.resetInsertionMode(); } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { tb.error(this); return false; } else { return anythingElse(t, tb); } return true; // todo: as above todo } else if (t.isEOF()) { if (tb.currentElement().nodeName().equals("html")) tb.error(this); return true; // stops parsing } return anythingElse(t, tb); } boolean anythingElse(Token t, HtmlTreeBuilder tb) { tb.error(this); boolean processed = true; if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { tb.setFosterInserts(true); processed = tb.process(t, InBody); tb.setFosterInserts(false); } else { processed = tb.process(t, InBody); } return processed; } }, InTableText { boolean process(Token t, HtmlTreeBuilder tb) { switch (t.type) { case Character: Token.Character c = t.asCharacter(); if (c.getData().equals(nullString)) { tb.error(this); return false; } else { tb.getPendingTableCharacters().add(c); } break; default: if (tb.getPendingTableCharacters().size() > 0) { for (Token.Character character : tb .getPendingTableCharacters()) { if (!isWhitespace(character)) { // InTable anything else section: tb.error(this); if (StringUtil.in(tb.currentElement().nodeName(), "table", "tbody", "tfoot", "thead", "tr")) { tb.setFosterInserts(true); tb.process(character, InBody); tb.setFosterInserts(false); } else { tb.process(character, InBody); } } else tb.insert(character); } tb.newPendingTableCharacters(); } tb.transition(tb.originalState()); return tb.process(t); } return true; } }, InCaption { boolean process(Token t, HtmlTreeBuilder tb) { if (t.isEndTag() && t.asEndTag().name().equals("caption")) { Token.EndTag endTag = t.asEndTag(); String name = endTag.name(); if (!tb.inTableScope(name)) { tb.error(this); return false; } else { tb.generateImpliedEndTags(); if (!tb.currentElement().nodeName().equals("caption")) tb.error(this); tb.popStackToClose("caption"); tb.clearFormattingElementsToLastMarker(); tb.transition(InTable); } } else if ((t.isStartTag() && StringUtil.in(t.asStartTag().name(), "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr") || t.isEndTag() && t.asEndTag().name().equals("table"))) { tb.error(this); boolean processed = tb.process(new Token.EndTag("caption")); if (processed) return tb.process(t); } else if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), "body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr")) { tb.error(this); return false; } else { return tb.process(t, InBody); } return true; } }, InColumnGroup { boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); return true; } switch (t.type) { case Comment: tb.insert(t.asComment()); break; case Doctype: tb.error(this); break; case StartTag: Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); if (name.equals("html")) return tb.process(t, InBody); else if (name.equals("col")) tb.insertEmpty(startTag); else return anythingElse(t, tb); break; case EndTag: Token.EndTag endTag = t.asEndTag(); name = endTag.name(); if (name.equals("colgroup")) { if (tb.currentElement().nodeName().equals("html")) { // frag // case tb.error(this); return false; } else { tb.pop(); tb.transition(InTable); } } else return anythingElse(t, tb); break; case EOF: if (tb.currentElement().nodeName().equals("html")) return true; // stop parsing; frag case else return anythingElse(t, tb); default: return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, TreeBuilder tb) { boolean processed = tb.process(new Token.EndTag("colgroup")); if (processed) // only ignored in frag case return tb.process(t); return true; } }, InTableBody { boolean process(Token t, HtmlTreeBuilder tb) { switch (t.type) { case StartTag: Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); if (name.equals("tr")) { tb.clearStackToTableBodyContext(); tb.insert(startTag); tb.transition(InRow); } else if (StringUtil.in(name, "th", "td")) { tb.error(this); tb.process(new Token.StartTag("tr")); return tb.process(startTag); } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead")) { return exitTableBody(t, tb); } else return anythingElse(t, tb); break; case EndTag: Token.EndTag endTag = t.asEndTag(); name = endTag.name(); if (StringUtil.in(name, "tbody", "tfoot", "thead")) { if (!tb.inTableScope(name)) { tb.error(this); return false; } else { tb.clearStackToTableBodyContext(); tb.pop(); tb.transition(InTable); } } else if (name.equals("table")) { return exitTableBody(t, tb); } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th", "tr")) { tb.error(this); return false; } else return anythingElse(t, tb); break; default: return anythingElse(t, tb); } return true; } private boolean exitTableBody(Token t, HtmlTreeBuilder tb) { if (!(tb.inTableScope("tbody") || tb.inTableScope("thead") || tb .inScope("tfoot"))) { // frag case tb.error(this); return false; } tb.clearStackToTableBodyContext(); tb.process(new Token.EndTag(tb.currentElement().nodeName())); // tbody, // tfoot, // thead return tb.process(t); } private boolean anythingElse(Token t, HtmlTreeBuilder tb) { return tb.process(t, InTable); } }, InRow { boolean process(Token t, HtmlTreeBuilder tb) { if (t.isStartTag()) { Token.StartTag startTag = t.asStartTag(); String name = startTag.name(); if (StringUtil.in(name, "th", "td")) { tb.clearStackToTableRowContext(); tb.insert(startTag); tb.transition(InCell); tb.insertMarkerToFormattingElements(); } else if (StringUtil.in(name, "caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr")) { return handleMissingTr(t, tb); } else { return anythingElse(t, tb); } } else if (t.isEndTag()) { Token.EndTag endTag = t.asEndTag(); String name = endTag.name(); if (name.equals("tr")) { if (!tb.inTableScope(name)) { tb.error(this); // frag return false; } tb.clearStackToTableRowContext(); tb.pop(); // tr tb.transition(InTableBody); } else if (name.equals("table")) { return handleMissingTr(t, tb); } else if (StringUtil.in(name, "tbody", "tfoot", "thead")) { if (!tb.inTableScope(name)) { tb.error(this); return false; } tb.process(new Token.EndTag("tr")); return tb.process(t); } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html", "td", "th")) { tb.error(this); return false; } else { return anythingElse(t, tb); } } else { return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, HtmlTreeBuilder tb) { return tb.process(t, InTable); } private boolean handleMissingTr(Token t, TreeBuilder tb) { boolean processed = tb.process(new Token.EndTag("tr")); if (processed) return tb.process(t); else return false; } }, InCell { boolean process(Token t, HtmlTreeBuilder tb) { if (t.isEndTag()) { Token.EndTag endTag = t.asEndTag(); String name = endTag.name(); if (StringUtil.in(name, "td", "th")) { if (!tb.inTableScope(name)) { tb.error(this); tb.transition(InRow); // might not be in scope if empty: // <td /> and processing fake // end tag return false; } tb.generateImpliedEndTags(); if (!tb.currentElement().nodeName().equals(name)) tb.error(this); tb.popStackToClose(name); tb.clearFormattingElementsToLastMarker(); tb.transition(InRow); } else if (StringUtil.in(name, "body", "caption", "col", "colgroup", "html")) { tb.error(this); return false; } else if (StringUtil.in(name, "table", "tbody", "tfoot", "thead", "tr")) { if (!tb.inTableScope(name)) { tb.error(this); return false; } closeCell(tb); return tb.process(t); } else { return anythingElse(t, tb); } } else if (t.isStartTag() && StringUtil.in(t.asStartTag().name(), "caption", "col", "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr")) { if (!(tb.inTableScope("td") || tb.inTableScope("th"))) { tb.error(this); return false; } closeCell(tb); return tb.process(t); } else { return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, HtmlTreeBuilder tb) { return tb.process(t, InBody); } private void closeCell(HtmlTreeBuilder tb) { if (tb.inTableScope("td")) tb.process(new Token.EndTag("td")); else tb.process(new Token.EndTag("th")); // only here if th or td in // scope } }, InSelect { boolean process(Token t, HtmlTreeBuilder tb) { switch (t.type) { case Character: Token.Character c = t.asCharacter(); if (c.getData().equals(nullString)) { tb.error(this); return false; } else { tb.insert(c); } break; case Comment: tb.insert(t.asComment()); break; case Doctype: tb.error(this); return false; case StartTag: Token.StartTag start = t.asStartTag(); String name = start.name(); if (name.equals("html")) return tb.process(start, InBody); else if (name.equals("option")) { tb.process(new Token.EndTag("option")); tb.insert(start); } else if (name.equals("optgroup")) { if (tb.currentElement().nodeName().equals("option")) tb.process(new Token.EndTag("option")); else if (tb.currentElement().nodeName().equals("optgroup")) tb.process(new Token.EndTag("optgroup")); tb.insert(start); } else if (name.equals("select")) { tb.error(this); return tb.process(new Token.EndTag("select")); } else if (StringUtil.in(name, "input", "keygen", "textarea")) { tb.error(this); if (!tb.inSelectScope("select")) return false; // frag tb.process(new Token.EndTag("select")); return tb.process(start); } else if (name.equals("script")) { return tb.process(t, InHead); } else { return anythingElse(t, tb); } break; case EndTag: Token.EndTag end = t.asEndTag(); name = end.name(); if (name.equals("optgroup")) { if (tb.currentElement().nodeName().equals("option") && tb.aboveOnStack(tb.currentElement()) != null && tb.aboveOnStack(tb.currentElement()).nodeName() .equals("optgroup")) tb.process(new Token.EndTag("option")); if (tb.currentElement().nodeName().equals("optgroup")) tb.pop(); else tb.error(this); } else if (name.equals("option")) { if (tb.currentElement().nodeName().equals("option")) tb.pop(); else tb.error(this); } else if (name.equals("select")) { if (!tb.inSelectScope(name)) { tb.error(this); return false; } else { tb.popStackToClose(name); tb.resetInsertionMode(); } } else return anythingElse(t, tb); break; case EOF: if (!tb.currentElement().nodeName().equals("html")) tb.error(this); break; default: return anythingElse(t, tb); } return true; } private boolean anythingElse(Token t, HtmlTreeBuilder tb) { tb.error(this); return false; } }, InSelectInTable { boolean process(Token t, HtmlTreeBuilder tb) { if (t.isStartTag() && StringUtil.in(t.asStartTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { tb.error(this); tb.process(new Token.EndTag("select")); return tb.process(t); } else if (t.isEndTag() && StringUtil.in(t.asEndTag().name(), "caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th")) { tb.error(this); if (tb.inTableScope(t.asEndTag().name())) { tb.process(new Token.EndTag("select")); return (tb.process(t)); } else return false; } else { return tb.process(t, InSelect); } } }, AfterBody { boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { return tb.process(t, InBody); } else if (t.isComment()) { tb.insert(t.asComment()); // into html node } else if (t.isDoctype()) { tb.error(this); return false; } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { return tb.process(t, InBody); } else if (t.isEndTag() && t.asEndTag().name().equals("html")) { if (tb.isFragmentParsing()) { tb.error(this); return false; } else { tb.transition(AfterAfterBody); } } else if (t.isEOF()) { // chillax! we're done } else { tb.error(this); tb.transition(InBody); return tb.process(t); } return true; } }, InFrameset { boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); } else if (t.isComment()) { tb.insert(t.asComment()); } else if (t.isDoctype()) { tb.error(this); return false; } else if (t.isStartTag()) { Token.StartTag start = t.asStartTag(); String name = start.name(); if (name.equals("html")) { return tb.process(start, InBody); } else if (name.equals("frameset")) { tb.insert(start); } else if (name.equals("frame")) { tb.insertEmpty(start); } else if (name.equals("noframes")) { return tb.process(start, InHead); } else { tb.error(this); return false; } } else if (t.isEndTag() && t.asEndTag().name().equals("frameset")) { if (tb.currentElement().nodeName().equals("html")) { // frag tb.error(this); return false; } else { tb.pop(); if (!tb.isFragmentParsing() && !tb.currentElement().nodeName() .equals("frameset")) { tb.transition(AfterFrameset); } } } else if (t.isEOF()) { if (!tb.currentElement().nodeName().equals("html")) { tb.error(this); return true; } } else { tb.error(this); return false; } return true; } }, AfterFrameset { boolean process(Token t, HtmlTreeBuilder tb) { if (isWhitespace(t)) { tb.insert(t.asCharacter()); } else if (t.isComment()) { tb.insert(t.asComment()); } else if (t.isDoctype()) { tb.error(this); return false; } else if (t.isStartTag() && t.asStartTag().name().equals("html")) { return tb.process(t, InBody); } else if (t.isEndTag() && t.asEndTag().name().equals("html")) { tb.transition(AfterAfterFrameset); } else if (t.isStartTag() && t.asStartTag().name().equals("noframes")) { return tb.process(t, InHead); } else if (t.isEOF()) { // cool your heels, we're complete } else { tb.error(this); return false; } return true; } }, AfterAfterBody { boolean process(Token t, HtmlTreeBuilder tb) { if (t.isComment()) { tb.insert(t.asComment()); } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().name().equals("html"))) { return tb.process(t, InBody); } else if (t.isEOF()) { // nice work chuck } else { tb.error(this); tb.transition(InBody); return tb.process(t); } return true; } }, AfterAfterFrameset { boolean process(Token t, HtmlTreeBuilder tb) { if (t.isComment()) { tb.insert(t.asComment()); } else if (t.isDoctype() || isWhitespace(t) || (t.isStartTag() && t.asStartTag().name().equals("html"))) { return tb.process(t, InBody); } else if (t.isEOF()) { // nice work chuck } else if (t.isStartTag() && t.asStartTag().name().equals("noframes")) { return tb.process(t, InHead); } else { tb.error(this); return false; } return true; } }, ForeignContent { boolean process(Token t, HtmlTreeBuilder tb) { return true; // todo: implement. Also; how do we get here? } }; private static String nullString = String.valueOf('\u0000'); abstract boolean process(Token t, HtmlTreeBuilder tb); private static boolean isWhitespace(Token t) { if (t.isCharacter()) { String data = t.asCharacter().getData(); // todo: this checks more than spec - "\t", "\n", "\f", "\r", " " for (int i = 0; i < data.length(); i++) { char c = data.charAt(i); if (!StringUtil.isWhitespace(c)) return false; } return true; } return false; } private static void handleRcData(Token.StartTag startTag, HtmlTreeBuilder tb) { tb.insert(startTag); tb.tokeniser.transition(TokeniserState.Rcdata); tb.markInsertionMode(); tb.transition(Text); } private static void handleRawtext(Token.StartTag startTag, HtmlTreeBuilder tb) { tb.insert(startTag); tb.tokeniser.transition(TokeniserState.Rawtext); tb.markInsertionMode(); tb.transition(Text); } // lists of tags to search through. A little harder to read here, but causes // less GC than dynamic varargs. // was contributing around 10% of parse GC load. private static final class Constants { private static final String[] InBodyStartToHead = new String[] { "base", "basefont", "bgsound", "command", "link", "meta", "noframes", "script", "style", "title" }; private static final String[] InBodyStartPClosers = new String[] { "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul" }; private static final String[] Headings = new String[] { "h1", "h2", "h3", "h4", "h5", "h6" }; private static final String[] InBodyStartPreListing = new String[] { "pre", "listing" }; private static final String[] InBodyStartLiBreakers = new String[] { "address", "div", "p" }; private static final String[] DdDt = new String[] { "dd", "dt" }; private static final String[] Formatters = new String[] { "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u" }; private static final String[] InBodyStartApplets = new String[] { "applet", "marquee", "object" }; private static final String[] InBodyStartEmptyFormatters = new String[] { "area", "br", "embed", "img", "keygen", "wbr" }; private static final String[] InBodyStartMedia = new String[] { "param", "source", "track" }; private static final String[] InBodyStartInputAttribs = new String[] { "name", "action", "prompt" }; private static final String[] InBodyStartOptions = new String[] { "optgroup", "option" }; private static final String[] InBodyStartRuby = new String[] { "rp", "rt" }; private static final String[] InBodyStartDrop = new String[] { "caption", "col", "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead", "tr" }; private static final String[] InBodyEndClosers = new String[] { "address", "article", "aside", "blockquote", "button", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "listing", "menu", "nav", "ol", "pre", "section", "summary", "ul" }; private static final String[] InBodyEndAdoptionFormatters = new String[] { "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" }; private static final String[] InBodyEndTableFosters = new String[] { "table", "tbody", "tfoot", "thead", "tr" }; } }