package com.mozz.htmlnative.parser; import android.os.SystemClock; import android.support.annotation.NonNull; import android.support.annotation.Nullable; import android.util.Log; import com.mozz.htmlnative.HNLog; import com.mozz.htmlnative.HNSegment; import com.mozz.htmlnative.HtmlTag; import com.mozz.htmlnative.Tracker; import com.mozz.htmlnative.css.Styles; import com.mozz.htmlnative.dom.HNDomTree; import com.mozz.htmlnative.dom.Meta; import com.mozz.htmlnative.exception.HNSyntaxError; import com.mozz.htmlnative.reader.TextReader; import com.mozz.htmlnative.script.ScriptInfo; import com.mozz.htmlnative.parser.token.Token; import com.mozz.htmlnative.parser.token.TokenType; import com.mozz.htmlnative.utils.ParametersUtils; import java.io.EOFException; import java.util.HashMap; import java.util.Map; import static com.mozz.htmlnative.HNEnvironment.PERFORMANCE_TAG; import static com.mozz.htmlnative.HtmlTag.isSwallowInnerTag; import static com.mozz.htmlnative.parser.token.TokenType.EndAngleBracket; import static com.mozz.htmlnative.parser.token.TokenType.Equal; import static com.mozz.htmlnative.parser.token.TokenType.Exclamation; import static com.mozz.htmlnative.parser.token.TokenType.Head; import static com.mozz.htmlnative.parser.token.TokenType.Html; import static com.mozz.htmlnative.parser.token.TokenType.Id; import static com.mozz.htmlnative.parser.token.TokenType.Inner; import static com.mozz.htmlnative.parser.token.TokenType.Meta; import static com.mozz.htmlnative.parser.token.TokenType.Script; import static com.mozz.htmlnative.parser.token.TokenType.Slash; import static com.mozz.htmlnative.parser.token.TokenType.StartAngleBracket; import static com.mozz.htmlnative.parser.token.TokenType.Style; import static com.mozz.htmlnative.parser.token.TokenType.Template; import static com.mozz.htmlnative.parser.token.TokenType.Title; /** * @author YangTao7 */ public final class Parser { private static final String ID = "id"; private static final String CLAZZ = "class"; private static final String TAG = Parser.class.getSimpleName(); @NonNull private final Lexer mLexer; /** * To handle the css part */ private final CssParser mCssParser; private int mLookFor; @Nullable private Token mCurToken; private boolean mReserved = false; private Map<String, Object> mStyleCache = new HashMap<>(); private Tracker mTracker; private static final int LK_StartArrowBracket = 1; private static final int LK_EndArrowBracket = 1 << 1; private static final int LK_ID = 1 << 2; private static final int LK_VALUE = 1 << 3; private static final int LK_SLASH = 1 << 4; private static final int LK_EQUAL = 1 << 5; private static final int LK_INT = 1 << 6; private static final int LK_DOUBLE = 1 << 7; private static final int LK_CODE = 1 << 8; private static final int LK_INNER = 1 << 9; private static final int LK_NUMBER = LK_INT | LK_DOUBLE; public Parser(TextReader reader) { mLexer = new Lexer(reader); mCssParser = new CssParser(mLexer, this); mTracker = new Tracker(); } public HNSegment process() throws HNSyntaxError { long processStartTime = SystemClock.currentThreadTimeMillis(); HNSegment segment = new HNSegment(); segment.setDom(new HNDomTree(segment.getInlineStyles(), null, 0, 0)); try { scanFor(StartAngleBracket); scan(true); /* * skip the HTML version information. see https://www.w3.org/TR/html4/struct/global * .html#h-7.2 */ if (mCurToken.type() == Exclamation) { mLexer.skipUntil('>'); // consume the reserved scan(); scanFor(EndAngleBracket); scanFor(StartAngleBracket); scan(true); } if (mCurToken.type() == Html) { scan(); scanFor(EndAngleBracket, StartAngleBracket); processHtmlInside(segment); } else { processHtmlInside(segment); } scanFor(StartAngleBracket, Slash, Html, EndAngleBracket); } catch (EOFException ignored) { Log.w(TAG, "Reach the end of file!"); } finally { mLexer.close(); mTracker.record("Parse Css + Html", SystemClock.currentThreadTimeMillis() - processStartTime); Log.i(PERFORMANCE_TAG, mTracker.dump()); return segment; } } @NonNull private void processHtmlInside(HNSegment segment) throws HNSyntaxError, EOFException { HNDomTree currentTree = segment.getDom(); // Look ahead to determine whether current is script or template scan(); switch (mCurToken.type()) { case Template: processTemplateThenScript(currentTree, segment); return; case Head: processHead(segment); scanFor(StartAngleBracket, Template); processTemplateThenScript(currentTree, segment); return; default: Log.e(TAG, "must init with <template> or <script>"); throw new HNSyntaxError("must init with <template> or <script>", mLexer.line(), mLexer.column()); } } private void processTemplateThenScript(HNDomTree tree, HNSegment segment) throws EOFException, HNSyntaxError { processTemplate(tree); scanFor(StartAngleBracket); scan(true); if (mCurToken.type() == Script) { processScript(segment); } else { scanFor(Slash, Html, EndAngleBracket); } scan(); } private void processScript(HNSegment segment) throws HNSyntaxError, EOFException { if (mCurToken.type() != Script) { Log.e(TAG, "Look for script, but " + mCurToken.toString()); throw new HNSyntaxError("Look for script, but " + mCurToken.toString(), mLexer.line() , mLexer.column()); } String attrName = null; lookFor(LK_ID | LK_EndArrowBracket); String type = null; while (true) { scan(); switch (mCurToken.type()) { case EndAngleBracket: { check(LK_EndArrowBracket); Token scriptToken = mLexer.scanScript(); if (scriptToken.type() != TokenType.ScriptCode) { throw new HNSyntaxError("Expect code, but meet " + scriptToken.type() .toString(), mLexer.line(), mLexer.column()); } segment.setScriptInfo(new ScriptInfo(scriptToken, type)); scanFor(StartAngleBracket, Slash, Script, EndAngleBracket); return; } case Id: check(LK_ID); attrName = mCurToken.stringValue(); lookFor(LK_EQUAL); break; case Equal: check(LK_EQUAL); lookFor(LK_VALUE); break; case Value: check(LK_VALUE); if (attrName.equals("type")) { type = mCurToken.stringValue(); } lookFor(LK_EndArrowBracket | LK_ID); break; } } } private void processHead(HNSegment segment) throws HNSyntaxError, EOFException { if (mCurToken.type() != TokenType.Head) { Log.e(TAG, "Look for \"head\", but " + mCurToken.toString()); throw new HNSyntaxError("Look for \"head\", but " + mCurToken.toString(), mLexer.line (), mLexer.column()); } while (true) { scan(); if (mCurToken.type() == Slash) { scanFor(EndAngleBracket); return; } else if (mCurToken.type() == Title) { processTitle(segment); } else if (mCurToken.type() == Style) { processStyle(segment); scanFor(Style, EndAngleBracket); } else if (mCurToken.type() == Meta) { processMeta(segment); } else if (mCurToken.type() == StartAngleBracket) { scan(true); if (mCurToken.type() == Slash) { scanFor(Slash, Head, EndAngleBracket); return; } } } } private void processStyle(HNSegment segment) throws EOFException, HNSyntaxError { long timeStart = SystemClock.currentThreadTimeMillis(); // Ignore the element that is written in <style> tag while (true) { scan(); switch (mCurToken.type()) { case Id: case Value: case Equal: case Int: case Double: continue; case EndAngleBracket: break; default: throw new HNSyntaxError("unknown " + mCurToken.toString() + " token in " + "<style>", mLexer.column(), mLexer.line()); } if (mCurToken.type() == EndAngleBracket) { break; } } try { mCssParser.process(segment); } catch (Exception e) { e.printStackTrace(); } mTracker.record("Parse Css", SystemClock.currentThreadTimeMillis() - timeStart); } private void processTitle(HNSegment segment) throws HNSyntaxError, EOFException { if (mCurToken.type() != Title) { Log.e(TAG, "Look for head, but " + mCurToken.toString()); throw new HNSyntaxError("Look for head, but " + mCurToken.toString(), mLexer.line(), mLexer.column()); } scanFor(EndAngleBracket); scanFor(Inner); String title = mCurToken.stringValue(); segment.getHead().setTitle(title); scanFor(StartAngleBracket, Slash, Title, EndAngleBracket); } private void processMeta(HNSegment segment) throws HNSyntaxError, EOFException { if (mCurToken.type() != Meta) { Log.e(TAG, "Look for meta, but " + mCurToken.toString()); throw new HNSyntaxError("Look for meta, but " + mCurToken.toString(), mLexer.line(), mLexer.column()); } Meta meta = new Meta(); String idCache = null; lookFor(LK_ID | LK_SLASH); while (true) { scan(); switch (mCurToken.type()) { case Id: check(LK_ID); idCache = mCurToken.stringValue(); scanFor(Equal); lookFor(LK_VALUE); break; case Value: check(LK_VALUE); if (com.mozz.htmlnative.dom.Meta.ID_NAME.equalsIgnoreCase(idCache)) { meta.setName(mCurToken.stringValue()); } else if (com.mozz.htmlnative.dom.Meta.ID_CONTENT.equals(idCache)) { meta.setName(mCurToken.stringValue()); } lookFor(LK_ID | LK_SLASH); break; case Slash: segment.getHead().putMeta(meta); check(LK_SLASH); scanFor(EndAngleBracket); return; default: Log.e(TAG, "Unknown token " + mCurToken.toString() + " when " + "parsing <meta>" + mCurToken.toString()); throw new HNSyntaxError("Unknown token " + mCurToken.toString() + " when " + "parsing <meta>" + mCurToken.toString(), mLexer.line(), mLexer.column ()); } } } private void processTemplate(HNDomTree tree) throws HNSyntaxError { long timeStart = SystemClock.currentThreadTimeMillis(); if (mCurToken.type() != Template) { Log.e(TAG, "Look for Template, but " + mCurToken.toString()); throw new HNSyntaxError("Look for Template, but " + mCurToken.toString(), mLexer.line (), mLexer.column()); } tree.setType(mCurToken.stringValue()); processInternal(tree); mTracker.record("Parse Html", SystemClock.currentThreadTimeMillis() - timeStart); } private void processInternal(@NonNull HNDomTree tree) throws HNSyntaxError { processInternal(tree, tree); } /** * parse the tree recursively * * @throws HNSyntaxError */ private void processInternal(@NonNull HNDomTree tree, @NonNull ParseCallback callback) throws HNSyntaxError { HNLog.d(HNLog.PARSER, "init to parse tree " + tree.getType()); int index = 0; int bracketPair = 1; lookFor(LK_ID | LK_EndArrowBracket | LK_SLASH); String attrName = null; boolean meetEndTag = false; int innerCount = 0; callback.onStartParse(); try { while (true) { scan(); switch (mCurToken.type()) { case StartAngleBracket: check(LK_StartArrowBracket); lookFor(LK_SLASH | LK_ID); scan(); if (mCurToken.type() == Slash) { meetEndTag = true; bracketPair++; check(LK_SLASH); scan(); // compare the tag string with tree.nodeName if (!tree.getType().equals(mCurToken.value())) { Log.e(TAG, "View tag should be in pairs, current " + "is<" + tree.getType() + "></" + mCurToken.value() + ">"); throw new HNSyntaxError("View tag should be in pairs, current " + "is<" + tree.getType() + "></" + mCurToken.value() + ">", mLexer.line(), mLexer.column()); } scan(); if (mCurToken.type() != EndAngleBracket) { Log.e(TAG, "View tag must be end with >"); throw new HNSyntaxError("View tag must be end with >", mLexer .line(), mLexer.column()); } bracketPair--; if (bracketPair != 0) { Log.e(TAG, "< > must be in pairs, " + ", current bracket" + " pair is " + bracketPair); throw new HNSyntaxError("< > must be in pairs, " + ", current " + "bracket" + " pair is " + bracketPair, mLexer.line(), mLexer.column()); } // here reach the end of the view tree, just return. callback.onLeaveParse(); return; } else if (mCurToken.type() == Id || mCurToken.type() == Script) { // "mCurToken.type() == Script" is to handle the <script> inside <body> check(LK_ID); String tag = mCurToken.stringValue(); // handle the <br/> tag if (HtmlTag.BR.equalsIgnoreCase(tag)) { if (isSwallowInnerTag(tree.getType())) { tree.appendText("\n"); } else { tree.last().appendText("\n"); } scanFor(TokenType.Slash, TokenType.EndAngleBracket); lookFor(LK_StartArrowBracket | LK_INNER); } else { HNDomTree child = new HNDomTree(tree, tag, index++); tree.addChild(child); processInternal(child); lookFor(LK_StartArrowBracket); } } break; case EndAngleBracket: check(LK_EndArrowBracket); lookFor(LK_StartArrowBracket | LK_INNER); bracketPair--; break; case Id: case Style: check(LK_ID); attrName = mCurToken.stringValue(); lookFor(LK_EQUAL); break; case Equal: check(LK_EQUAL); if (attrName == null) { Log.e(TAG, "attrName is null, please check the state"); throw new HNSyntaxError("attrName is null, please check the state", mLexer.line(), mLexer.column()); } lookFor(LK_VALUE | LK_NUMBER); break; case Value: check(LK_VALUE); parseValue(tree, attrName, mCurToken.stringValue()); lookFor(LK_ID | LK_EndArrowBracket | LK_SLASH); break; case Int: check(LK_INT); tree.addInlineStyle(attrName, mCurToken.intValue()); lookFor(LK_ID | LK_EndArrowBracket); break; case Double: check(LK_DOUBLE); tree.addInlineStyle(attrName, mCurToken.doubleValue()); lookFor(LK_ID | LK_EndArrowBracket); break; case Inner: check(LK_INNER); if (isSwallowInnerTag(tree.getType())) { tree.appendText(mCurToken.stringValue()); } else { HNDomTree innerChild = new HNDomTree(tree, HtmlTag.INNER_TREE_TAG, innerCount++); tree.addChild(innerChild); innerChild.appendText(mCurToken.stringValue()); } lookFor(LK_StartArrowBracket); break; // for <a/> case case Slash: check(LK_SLASH); lookFor(LK_EndArrowBracket); scan(); if (mCurToken.type() != EndAngleBracket) { Log.e(TAG, "unknown state, slash should be followed by " + ">, " + "but currently " + mCurToken.type()); throw new HNSyntaxError("unknown state, slash should be followed by " + ">, " + "but currently " + mCurToken.type(), mLexer.line(), mLexer .column()); } bracketPair--; if (bracketPair != 0) { Log.e(TAG, "< > must be in pairs, " + ", current bracket" + " pair is " + bracketPair); throw new HNSyntaxError("< > must be in pairs, " + ", current bracket" + " pair is " + bracketPair, mLexer.line(), mLexer.column()); } callback.onLeaveParse(); return; default: Log.e(TAG, "unknown token " + mCurToken.toString()); throw new HNSyntaxError("unknown token " + mCurToken.toString(), mLexer .line(), mLexer.column()); } } } catch (EOFException e) { if (meetEndTag) { Log.e(TAG, "View Tag should ends with </"); throw new HNSyntaxError("View Tag should ends with </", mLexer.line(), mLexer .column()); } } } private StringBuilder mStyleKeyCache = new StringBuilder(); private void parseStyle(@NonNull HNDomTree tree, @NonNull String styleString) { CssParser.parseInlineStyle(styleString, mStyleKeyCache, mStyleCache); for (Map.Entry<String, Object> entry : mStyleCache.entrySet()) { tree.addInlineStyle(entry.getKey(), entry.getValue()); } } private boolean isLookingFor(int status) { return (mLookFor & status) != 0; } private void lookFor(int status) { mLookFor = 0; mLookFor |= status; } private void scan() throws EOFException, HNSyntaxError { if (mReserved) { HNLog.d(HNLog.PARSER, "Reprocess token ->" + mCurToken); mReserved = false; return; } if (mCurToken != null) { mCurToken.recycle(); } mCurToken = mLexer.scan(); HNLog.d(HNLog.PARSER, "Process token ->" + mCurToken); } private void scan(boolean reserved) throws EOFException, HNSyntaxError { scan(); mReserved = reserved; } private void scanFor(@NonNull TokenType tokenType) throws EOFException, HNSyntaxError { scan(); if (mCurToken.type() != tokenType) { Log.e(TAG, "syntax error, should be " + tokenType.toString() + ", but current is " + mCurToken.toString()); throw new HNSyntaxError("syntax error, should be " + tokenType.toString() + ", but current is " + mCurToken.toString(), mLexer.line(), mLexer.column()); } } private void scanFor(@NonNull TokenType... tokenTypes) throws EOFException, HNSyntaxError { for (TokenType tokenType : tokenTypes) { scanFor(tokenType); } } private void check(int status) throws HNSyntaxError { if (!isLookingFor(status)) { Log.e(TAG, " Looking for " + lookForToString(status) + ", but " + "currently is " + lookForToString(mLookFor)); throw new HNSyntaxError(" Looking for " + lookForToString(status) + ", but " + "currently is " + lookForToString(mLookFor), mLexer.line(), mLexer.column()); } } private void parseValue(HNDomTree tree, String parameterName, String valueStr) { switch (parameterName) { case Styles.ATTR_STYLE: parseStyle(tree, valueStr); break; case ID: tree.setId(valueStr); break; case CLAZZ: tree.setClazz(ParametersUtils.splitByEmpty(valueStr)); break; default: tree.addInlineStyle(parameterName, valueStr); break; } } public Map<String, Object> getStyleCache() { return mStyleCache; } private static String lookForToString(int lookFor) { StringBuilder sb = new StringBuilder("[ "); if ((lookFor & LK_EndArrowBracket) != 0) { sb.append("> "); } if ((lookFor & LK_StartArrowBracket) != 0) { sb.append("< "); } if ((lookFor & LK_ID) != 0) { sb.append("id "); } if ((lookFor & LK_VALUE) != 0) { sb.append("value "); } if ((lookFor & LK_SLASH) != 0) { sb.append("/ "); } if ((lookFor & LK_EQUAL) != 0) { sb.append("= "); } if ((lookFor & LK_NUMBER) != 0) { sb.append("number "); } if ((lookFor & LK_CODE) != 0) { sb.append("code "); } if ((lookFor & LK_INNER) != 0) { sb.append("innerElement "); } sb.append("]"); return sb.toString(); } }