package com.smartandroid.sa.tag.parser; import java.util.ArrayList; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import com.smartandroid.sa.tag.helper.DescendableLinkedList; import com.smartandroid.sa.tag.helper.StringUtil; import com.smartandroid.sa.tag.helper.Validate; import com.smartandroid.sa.tag.nodes.Comment; import com.smartandroid.sa.tag.nodes.DataNode; import com.smartandroid.sa.tag.nodes.Document; import com.smartandroid.sa.tag.nodes.Element; import com.smartandroid.sa.tag.nodes.FormElement; import com.smartandroid.sa.tag.nodes.Node; import com.smartandroid.sa.tag.nodes.TextNode; import com.smartandroid.sa.tag.select.Elements; /** * HTML Tree Builder; creates a DOM from Tokens. */ class HtmlTreeBuilder extends TreeBuilder { // tag searches private static final String[] TagsScriptStyle = new String[] { "script", "style" }; public static final String[] TagsSearchInScope = new String[] { "applet", "caption", "html", "table", "td", "th", "marquee", "object" }; private static final String[] TagSearchList = new String[] { "ol", "ul" }; private static final String[] TagSearchButton = new String[] { "button" }; private static final String[] TagSearchTableScope = new String[] { "html", "table" }; private static final String[] TagSearchSelectScope = new String[] { "optgroup", "option" }; private static final String[] TagSearchEndTags = new String[] { "dd", "dt", "li", "option", "optgroup", "p", "rp", "rt" }; private static final String[] TagSearchSpecial = new String[] { "address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd", "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", "iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav", "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script", "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "ul", "wbr", "xmp" }; private HtmlTreeBuilderState state; // the current state private HtmlTreeBuilderState originalState; // original / marked state private boolean baseUriSetFromDoc = false; private Element headElement; // the current head element private FormElement formElement; // the current form element private Element contextElement; // fragment parse context -- could be null // even if fragment parsing private DescendableLinkedList<Element> formattingElements = new DescendableLinkedList<Element>(); // active // (open) // formatting // elements private List<Token.Character> pendingTableCharacters = new ArrayList<Token.Character>(); // chars // in // table // to // be // shifted // out private boolean framesetOk = true; // if ok to go into frameset private boolean fosterInserts = false; // if next inserts should be fostered private boolean fragmentParsing = false; // if parsing a fragment of html HtmlTreeBuilder() { } @Override Document parse(String input, String baseUri, ParseErrorList errors) { state = HtmlTreeBuilderState.Initial; return super.parse(input, baseUri, errors); } List<Node> parseFragment(String inputFragment, Element context, String baseUri, ParseErrorList errors) { // context may be null state = HtmlTreeBuilderState.Initial; initialiseParse(inputFragment, baseUri, errors); contextElement = context; fragmentParsing = true; Element root = null; if (context != null) { if (context.ownerDocument() != null) // quirks setup: doc.quirksMode(context.ownerDocument().quirksMode()); // initialise the tokeniser state: String contextTag = context.tagName(); if (StringUtil.in(contextTag, "title", "textarea")) tokeniser.transition(TokeniserState.Rcdata); else if (StringUtil.in(contextTag, "iframe", "noembed", "noframes", "style", "xmp")) tokeniser.transition(TokeniserState.Rawtext); else if (contextTag.equals("script")) tokeniser.transition(TokeniserState.ScriptData); else if (contextTag.equals(("noscript"))) tokeniser.transition(TokeniserState.Data); // if scripting // enabled, rawtext else if (contextTag.equals("plaintext")) tokeniser.transition(TokeniserState.Data); else tokeniser.transition(TokeniserState.Data); // default root = new Element(Tag.valueOf("html"), baseUri); doc.appendChild(root); stack.push(root); resetInsertionMode(); // setup form element to nearest form on context (up ancestor // chain). ensures form controls are associated // with form correctly Elements contextChain = context.parents(); contextChain.add(0, context); for (Element parent : contextChain) { if (parent instanceof FormElement) { formElement = (FormElement) parent; break; } } } runParser(); if (context != null) return root.childNodes(); else return doc.childNodes(); } @Override protected boolean process(Token token) { currentToken = token; return this.state.process(token, this); } boolean process(Token token, HtmlTreeBuilderState state) { currentToken = token; return state.process(token, this); } void transition(HtmlTreeBuilderState state) { this.state = state; } HtmlTreeBuilderState state() { return state; } void markInsertionMode() { originalState = state; } HtmlTreeBuilderState originalState() { return originalState; } void framesetOk(boolean framesetOk) { this.framesetOk = framesetOk; } boolean framesetOk() { return framesetOk; } Document getDocument() { return doc; } String getBaseUri() { return baseUri; } void maybeSetBaseUri(Element base) { if (baseUriSetFromDoc) // only listen to the first <base href> in parse return; String href = base.absUrl("href"); if (href.length() != 0) { // ignore <base target> etc baseUri = href; baseUriSetFromDoc = true; doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) // will get updated base, and to update all // descendants } } boolean isFragmentParsing() { return fragmentParsing; } void error(HtmlTreeBuilderState state) { if (errors.canAddError()) errors.add(new ParseError(reader.pos(), "Unexpected token [%s] when in state [%s]", currentToken .tokenType(), state)); } Element insert(Token.StartTag startTag) { // handle empty unknown tags // when the spec expects an empty tag, will directly hit insertEmpty, so // won't generate this fake end tag. if (startTag.isSelfClosing()) { Element el = insertEmpty(startTag); stack.add(el); tokeniser.transition(TokeniserState.Data); // handles <script />, // otherwise needs // breakout steps from // script data tokeniser.emit(new Token.EndTag(el.tagName())); // ensure we get out // of whatever state // we are in. // emitted for // yielded // processing return el; } Element el = new Element(Tag.valueOf(startTag.name()), baseUri, startTag.attributes); insert(el); return el; } Element insert(String startTagName) { Element el = new Element(Tag.valueOf(startTagName), baseUri); insert(el); return el; } void insert(Element el) { insertNode(el); stack.add(el); } Element insertEmpty(Token.StartTag startTag) { Tag tag = Tag.valueOf(startTag.name()); Element el = new Element(tag, baseUri, startTag.attributes); insertNode(el); if (startTag.isSelfClosing()) { if (tag.isKnownTag()) { if (tag.isSelfClosing()) tokeniser.acknowledgeSelfClosingFlag(); // if not acked, // promulagates // error } else { // unknown tag, remember this is self closing for output tag.setSelfClosing(); tokeniser.acknowledgeSelfClosingFlag(); // not an distinct error } } return el; } FormElement insertForm(Token.StartTag startTag, boolean onStack) { Tag tag = Tag.valueOf(startTag.name()); FormElement el = new FormElement(tag, baseUri, startTag.attributes); setFormElement(el); insertNode(el); if (onStack) stack.add(el); return el; } void insert(Token.Comment commentToken) { Comment comment = new Comment(commentToken.getData(), baseUri); insertNode(comment); } void insert(Token.Character characterToken) { Node node; // characters in script and style go in as datanodes, not text nodes String tagName = currentElement().tagName(); if (tagName.equals("script") || tagName.equals("style")) node = new DataNode(characterToken.getData(), baseUri); else node = new TextNode(characterToken.getData(), baseUri); currentElement().appendChild(node); // doesn't use insertNode, because // we don't foster these; and will // always have a stack. } private void insertNode(Node node) { // if the stack hasn't been set up yet, elements (doctype, comments) go // into the doc if (stack.size() == 0) doc.appendChild(node); else if (isFosterInserts()) insertInFosterParent(node); else currentElement().appendChild(node); // connect form controls to their form element if (node instanceof Element && ((Element) node).tag().isFormListed()) { if (formElement != null) formElement.addElement((Element) node); } } Element pop() { // todo - dev, remove validation check if (stack.peekLast().nodeName().equals("td") && !state.name().equals("InCell")) Validate.isFalse(true, "pop td not in cell"); if (stack.peekLast().nodeName().equals("html")) Validate.isFalse(true, "popping html!"); return stack.pollLast(); } void push(Element element) { stack.add(element); } DescendableLinkedList<Element> getStack() { return stack; } boolean onStack(Element el) { return isElementInQueue(stack, el); } private boolean isElementInQueue(DescendableLinkedList<Element> queue, Element element) { Iterator<Element> it = queue.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next == element) { return true; } } return false; } Element getFromStack(String elName) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next.nodeName().equals(elName)) { return next; } } return null; } boolean removeFromStack(Element el) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next == el) { it.remove(); return true; } } return false; } void popStackToClose(String elName) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next.nodeName().equals(elName)) { it.remove(); break; } else { it.remove(); } } } void popStackToClose(String... elNames) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (StringUtil.in(next.nodeName(), elNames)) { it.remove(); break; } else { it.remove(); } } } void popStackToBefore(String elName) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next.nodeName().equals(elName)) { break; } else { it.remove(); } } } void clearStackToTableContext() { clearStackToContext("table"); } void clearStackToTableBodyContext() { clearStackToContext("tbody", "tfoot", "thead"); } void clearStackToTableRowContext() { clearStackToContext("tr"); } private void clearStackToContext(String... nodeNames) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (StringUtil.in(next.nodeName(), nodeNames) || next.nodeName().equals("html")) break; else it.remove(); } } Element aboveOnStack(Element el) { assert onStack(el); Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next == el) { return it.next(); } } return null; } void insertOnStackAfter(Element after, Element in) { int i = stack.lastIndexOf(after); Validate.isTrue(i != -1); stack.add(i + 1, in); } void replaceOnStack(Element out, Element in) { replaceInQueue(stack, out, in); } private void replaceInQueue(LinkedList<Element> queue, Element out, Element in) { int i = queue.lastIndexOf(out); Validate.isTrue(i != -1); queue.remove(i); queue.add(i, in); } void resetInsertionMode() { boolean last = false; Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element node = it.next(); if (!it.hasNext()) { last = true; node = contextElement; } String name = node.nodeName(); if ("select".equals(name)) { transition(HtmlTreeBuilderState.InSelect); break; // frag } else if (("td".equals(name) || "td".equals(name) && !last)) { transition(HtmlTreeBuilderState.InCell); break; } else if ("tr".equals(name)) { transition(HtmlTreeBuilderState.InRow); break; } else if ("tbody".equals(name) || "thead".equals(name) || "tfoot".equals(name)) { transition(HtmlTreeBuilderState.InTableBody); break; } else if ("caption".equals(name)) { transition(HtmlTreeBuilderState.InCaption); break; } else if ("colgroup".equals(name)) { transition(HtmlTreeBuilderState.InColumnGroup); break; // frag } else if ("table".equals(name)) { transition(HtmlTreeBuilderState.InTable); break; } else if ("head".equals(name)) { transition(HtmlTreeBuilderState.InBody); break; // frag } else if ("body".equals(name)) { transition(HtmlTreeBuilderState.InBody); break; } else if ("frameset".equals(name)) { transition(HtmlTreeBuilderState.InFrameset); break; // frag } else if ("html".equals(name)) { transition(HtmlTreeBuilderState.BeforeHead); break; // frag } else if (last) { transition(HtmlTreeBuilderState.InBody); break; // frag } } } // todo: tidy up in specific scope methods private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) { return inSpecificScope(new String[] { targetName }, baseTypes, extraTypes); } private boolean inSpecificScope(String[] targetNames, String[] baseTypes, String[] extraTypes) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element el = it.next(); String elName = el.nodeName(); if (StringUtil.in(elName, targetNames)) return true; if (StringUtil.in(elName, baseTypes)) return false; if (extraTypes != null && StringUtil.in(elName, extraTypes)) return false; } Validate.fail("Should not be reachable"); return false; } boolean inScope(String[] targetNames) { return inSpecificScope(targetNames, TagsSearchInScope, null); } boolean inScope(String targetName) { return inScope(targetName, null); } boolean inScope(String targetName, String[] extras) { return inSpecificScope(targetName, TagsSearchInScope, extras); // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml // todo: in svg namespace: forignOjbect, desc, title } boolean inListItemScope(String targetName) { return inScope(targetName, TagSearchList); } boolean inButtonScope(String targetName) { return inScope(targetName, TagSearchButton); } boolean inTableScope(String targetName) { return inSpecificScope(targetName, TagSearchTableScope, null); } boolean inSelectScope(String targetName) { Iterator<Element> it = stack.descendingIterator(); while (it.hasNext()) { Element el = it.next(); String elName = el.nodeName(); if (elName.equals(targetName)) return true; if (!StringUtil.in(elName, TagSearchSelectScope)) // all elements // except return false; } Validate.fail("Should not be reachable"); return false; } void setHeadElement(Element headElement) { this.headElement = headElement; } Element getHeadElement() { return headElement; } boolean isFosterInserts() { return fosterInserts; } void setFosterInserts(boolean fosterInserts) { this.fosterInserts = fosterInserts; } FormElement getFormElement() { return formElement; } void setFormElement(FormElement formElement) { this.formElement = formElement; } void newPendingTableCharacters() { pendingTableCharacters = new ArrayList<Token.Character>(); } List<Token.Character> getPendingTableCharacters() { return pendingTableCharacters; } void setPendingTableCharacters(List<Token.Character> pendingTableCharacters) { this.pendingTableCharacters = pendingTableCharacters; } /** * 11.2.5.2 Closing elements that have implied end tags * <p/> * When the steps below require the UA to generate implied end tags, then, * while the current node is a dd element, a dt element, an li element, an * option element, an optgroup element, a p element, an rp element, or an rt * element, the UA must pop the current node off the stack of open elements. * * @param excludeTag * If a step requires the UA to generate implied end tags but * lists an element to exclude from the process, then the UA must * perform the above steps as if that element was not in the * above list. */ void generateImpliedEndTags(String excludeTag) { while ((excludeTag != null && !currentElement().nodeName().equals( excludeTag)) && StringUtil.in(currentElement().nodeName(), TagSearchEndTags)) pop(); } void generateImpliedEndTags() { generateImpliedEndTags(null); } boolean isSpecial(Element el) { // todo: mathml's mi, mo, mn // todo: svg's foreigObject, desc, title String name = el.nodeName(); return StringUtil.in(name, TagSearchSpecial); } // active formatting elements void pushActiveFormattingElements(Element in) { int numSeen = 0; Iterator<Element> iter = formattingElements.descendingIterator(); while (iter.hasNext()) { Element el = iter.next(); if (el == null) // marker break; if (isSameFormattingElement(in, el)) numSeen++; if (numSeen == 3) { iter.remove(); break; } } formattingElements.add(in); } private boolean isSameFormattingElement(Element a, Element b) { // same if: same namespace, tag, and attributes. Element.equals only // checks tag, might in future check children return a.nodeName().equals(b.nodeName()) && // a.namespace().equals(b.namespace()) && a.attributes().equals(b.attributes()); // todo: namespaces } void reconstructFormattingElements() { int size = formattingElements.size(); if (size == 0 || formattingElements.getLast() == null || onStack(formattingElements.getLast())) return; Element entry = formattingElements.getLast(); int pos = size - 1; boolean skip = false; while (true) { if (pos == 0) { // step 4. if none before, skip to 8 skip = true; break; } entry = formattingElements.get(--pos); // step 5. one earlier than // entry if (entry == null || onStack(entry)) // step 6 - neither marker nor // on stack break; // jump to 8, else continue back to 4 } while (true) { if (!skip) // step 7: on later than entry entry = formattingElements.get(++pos); Validate.notNull(entry); // should not occur, as we break at last // element // 8. create new element from element, 9 insert into current node, // onto stack skip = false; // can only skip increment from 4. Element newEl = insert(entry.nodeName()); // todo: avoid fostering // here? // newEl.namespace(entry.namespace()); // todo: namespaces newEl.attributes().addAll(entry.attributes()); // 10. replace entry with new entry formattingElements.add(pos, newEl); formattingElements.remove(pos + 1); // 11 if (pos == size - 1) // if not last entry in list, jump to 7 break; } } void clearFormattingElementsToLastMarker() { while (!formattingElements.isEmpty()) { Element el = formattingElements.peekLast(); formattingElements.removeLast(); if (el == null) break; } } void removeFromActiveFormattingElements(Element el) { Iterator<Element> it = formattingElements.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next == el) { it.remove(); break; } } } boolean isInActiveFormattingElements(Element el) { return isElementInQueue(formattingElements, el); } Element getActiveFormattingElement(String nodeName) { Iterator<Element> it = formattingElements.descendingIterator(); while (it.hasNext()) { Element next = it.next(); if (next == null) // scope marker break; else if (next.nodeName().equals(nodeName)) return next; } return null; } void replaceActiveFormattingElement(Element out, Element in) { replaceInQueue(formattingElements, out, in); } void insertMarkerToFormattingElements() { formattingElements.add(null); } void insertInFosterParent(Node in) { Element fosterParent = null; Element lastTable = getFromStack("table"); boolean isLastTableParent = false; if (lastTable != null) { if (lastTable.parent() != null) { fosterParent = lastTable.parent(); isLastTableParent = true; } else fosterParent = aboveOnStack(lastTable); } else { // no table == frag fosterParent = stack.get(0); } if (isLastTableParent) { Validate.notNull(lastTable); // last table cannot be null by this // point. lastTable.before(in); } else fosterParent.appendChild(in); } @Override public String toString() { return "TreeBuilder{" + "currentToken=" + currentToken + ", state=" + state + ", currentElement=" + currentElement() + '}'; } }