package core.framework.impl.template.parser;

import core.framework.api.util.Exceptions;
import core.framework.api.util.Sets;
import core.framework.impl.template.node.Attribute;
import core.framework.impl.template.node.Comment;
import core.framework.impl.template.node.ContainerNode;
import core.framework.impl.template.node.Document;
import core.framework.impl.template.node.Element;
import core.framework.impl.template.node.Text;
import core.framework.impl.template.source.TemplateSource;

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Builds a {@link Document} node tree from the tokens produced by {@link HTMLLexer}.
 * <p>
 * Only a strict subset of HTML is accepted: tag names must match a lower-case pattern,
 * void elements must not be closed or self-closed, non-void elements must not be self-closed,
 * boolean attributes must not carry a value, all other attributes must carry one, and
 * static resource URLs must be absolute or start with '/'.
 * <p>
 * A parser instance keeps its own lexer and open-element stack.
 *
 * @author neo
 */
public class HTMLParser {
    // compiled once and shared; the pattern and the lookup sets below are never modified after creation
    private static final Pattern TAG_NAME_PATTERN = Pattern.compile("[a-z]+[a-z0-9\\-]*");

    // elements that never get pushed on the open-element stack and must not have an end tag
    private static final Set<String> VOID_ELEMENTS = Sets.newHashSet("area", "base", "br", "col", "command", "embed", "hr", "img", "input", "keygen", "link", "meta", "param", "source", "track", "wbr");

    // loose checking to cover common cases; a precise check would be per tag, e.g. "checked" is a boolean attribute only on the input tag
    private static final Set<String> BOOLEAN_ATTRIBUTES = Sets.newHashSet("checked", "selected", "disabled", "readonly", "multiple", "ismap", "defer");

    private final HTMLLexer lexer;
    // open containers, innermost on top; the Document is always the bottom entry while parsing
    private final Deque<ContainerNode> stack = new ArrayDeque<>();

    /**
     * Creates a parser over the content of the given template source.
     *
     * @param source template to parse
     */
    public HTMLParser(TemplateSource source) {
        // NOTE(review): the first lexer argument was missing in the damaged source; restored as source.name() - confirm against HTMLLexer's constructor
        this.lexer = new HTMLLexer(source.name(), source.content());
    }

    /**
     * Consumes all tokens from the lexer and returns the resulting document.
     * Only a subset of HTML is supported, which means strict and consistent rules are enforced.
     *
     * @return the document holding every parsed node
     */
    public Document parse() {
        Document document = new Document();
        stack.push(document);

        end:
        while (true) {
            HTMLTokenType type = lexer.nextNodeToken();
            switch (type) {
                case EOF:
                    break end;
                case TEXT:
                    stack.peek().add(new Text(lexer.currentToken()));
                    break;
                case START_COMMENT:
                    // advance to the end-of-comment token; the lexer's current token is then used as the comment content
                    lexer.nextEndCommentToken();
                    stack.peek().add(new Comment(lexer.currentToken()));
                    break;
                case START_TAG:
                    // drop the first character of the token to get the tag name
                    String tagName = validateTagName(lexer.currentToken().substring(1));
                    parseElement(tagName);
                    break;
                case END_TAG:
                    String endTag = lexer.currentToken();
                    // drop the first two characters and the last one to get the tag name
                    String endTagName = validateTagName(endTag.substring(2, endTag.length() - 1));
                    if (VOID_ELEMENTS.contains(endTagName))
                        throw Exceptions.error("void element must not have close tag, tag={}, location={}", endTagName, lexer.currentLocation());
                    closeTag(endTagName);
                    break;
                default:
                    throw Exceptions.error("unexpected type, type={}, location={}", type, lexer.currentLocation());
            }
        }
        return document;
    }

    /**
     * Parses the remainder of a start tag (attributes and tag end), adds the new element to the
     * current container and, for non-void elements, makes it the current container.
     *
     * @param tagName validated name of the element being opened
     */
    private void parseElement(String tagName) {
        Element currentElement = new Element(tagName);
        stack.peek().add(currentElement);

        // attribute whose value (if any) has not been seen yet; validated once the next token shows it is complete
        Attribute currentAttribute = null;

        while (true) {
            HTMLTokenType type = lexer.nextElementToken();
            switch (type) {
                case EOF:
                    if (currentAttribute != null) validateAttribute(currentAttribute);
                    return;
                case START_TAG_END_CLOSE:
                    // always throws: self-closing is rejected for both void and non-void elements
                    validateSelfCloseTag(tagName);
                    return;
                case START_TAG_END:
                    if (currentAttribute != null) validateAttribute(currentAttribute);
                    if (!VOID_ELEMENTS.contains(tagName)) stack.push(currentElement);
                    // the element was created from tagName, so the local is used for the script/style check
                    if ("script".equals(tagName) || "style".equals(tagName)) {
                        // NOTE(review): the argument was missing in the damaged source; restored as the tag name - confirm against HTMLLexer.nextScriptToken
                        HTMLTokenType contentType = lexer.nextScriptToken(tagName);
                        if (contentType == HTMLTokenType.TEXT) stack.peek().add(new Text(lexer.currentToken()));
                    }
                    return;
                case ATTRIBUTE_NAME:
                    // a new attribute name means the previous attribute is complete
                    if (currentAttribute != null) validateAttribute(currentAttribute);
                    currentAttribute = new Attribute(lexer.currentToken(), tagName, lexer.currentLocation());
                    currentElement.attributes.add(currentAttribute);
                    break;
                case ATTRIBUTE_VALUE:
                    if (currentAttribute == null)
                        throw Exceptions.error("attribute syntax is invalid, location={}", lexer.currentLocation());
                    String attributeValue = lexer.currentToken();
                    if (attributeValue.startsWith("\"")) {
                        // strip the first and last character (the surrounding quotes) and remember the value was quoted
                        currentAttribute.value = attributeValue.substring(1, attributeValue.length() - 1);
                        currentAttribute.hasDoubleQuote = true;
                    } else if (!"".equals(attributeValue)) {
                        // an empty value is left as null, e.g. <p class=/>
                        currentAttribute.value = attributeValue;
                    }
                    break;
                default:
                    throw Exceptions.error("unexpected type, type={}, location={}", type, lexer.currentLocation());
            }
        }
    }

    /**
     * Pops open containers until the element matching the given end tag is found and marks it as
     * explicitly closed. Elements popped before the match are left without the end-tag flag.
     *
     * @param tagName validated name from the end tag
     */
    private void closeTag(String tagName) {
        while (true) {
            ContainerNode lastNode = stack.pop();
            // reaching the document means no open element matches this end tag
            if (lastNode instanceof Document)
                throw Exceptions.error("can not find matched tag to close, tagName={}, location={}", tagName, lexer.currentLocation());
            Element element = (Element) lastNode;
            // NOTE(review): the condition was missing in the damaged source; restored as a tag name match - confirm Element exposes its tag name as `name`
            if (tagName.equals(element.name)) {
                element.hasEndTag = true;
                return;
            }
        }
    }

    /**
     * Checks the tag name against the allowed pattern.
     *
     * @param name candidate tag name
     * @return the same name when it is valid
     */
    private String validateTagName(String name) {
        if (!TAG_NAME_PATTERN.matcher(name).matches())
            throw Exceptions.error("tag name must match {}, name={}, location={}", TAG_NAME_PATTERN.pattern(), name, lexer.currentLocation());
        return name;
    }

    /**
     * Rejects a self-closed tag; never returns normally. The message differs for void and non-void elements.
     *
     * @param tagName name of the self-closed element
     */
    private void validateSelfCloseTag(String tagName) {
        if (VOID_ELEMENTS.contains(tagName))
            throw Exceptions.error("we recommend not closing void element, tag={}, location={}", tagName, lexer.currentLocation());
        else
            throw Exceptions.error("non void element must not be self-closed, tag={}, location={}", tagName, lexer.currentLocation());
    }

    /**
     * Validates a completed attribute: boolean attributes must have no value, all others must have
     * one, and link href / script src / img src must pass the static resource URL check.
     *
     * @param attribute attribute to validate
     */
    private void validateAttribute(Attribute attribute) {
        // NOTE(review): every `attribute.name` below was missing in the damaged source; restored - confirm Attribute exposes its name as `name`
        boolean isBooleanAttribute = BOOLEAN_ATTRIBUTES.contains(attribute.name);
        if (!isBooleanAttribute && attribute.value == null)
            throw Exceptions.error("non boolean attribute must have value, attribute={}>{}, location={}", attribute.tagName, attribute.name, attribute.location);
        if (isBooleanAttribute && attribute.value != null)
            throw Exceptions.error("we recommend not putting value for boolean attribute, attribute={}>{}, location={}", attribute.tagName, attribute.name, attribute.location);

        // href and src are not boolean attributes, so the checks above guarantee a non-null value here
        if (("link".equals(attribute.tagName) && "href".equals(attribute.name))
                || ("script".equals(attribute.tagName) && "src".equals(attribute.name))
                || ("img".equals(attribute.tagName) && "src".equals(attribute.name))) {
            validateStaticResourceURL(attribute);
        }
    }

    /**
     * Requires the attribute value to start with "http://", "https://", "//" or "/".
     *
     * @param attribute attribute holding a non-null static resource URL
     */
    private void validateStaticResourceURL(Attribute attribute) {
        if (!attribute.value.startsWith("http://")
                && !attribute.value.startsWith("https://")
                && !attribute.value.startsWith("//")
                && !attribute.value.startsWith("/"))
            throw Exceptions.error("static resource url value must be either absolute or start with '/', attribute={}>{}, value={}, location={}", attribute.tagName, attribute.name, attribute.value, attribute.location);
    }
}