package com.mozz.htmlnative.parser;
import android.support.annotation.NonNull;
import android.support.annotation.Nullable;
import com.mozz.htmlnative.HNLog;
import com.mozz.htmlnative.HNSegment;
import com.mozz.htmlnative.common.CharQueue;
import com.mozz.htmlnative.exception.HNSyntaxError;
import com.mozz.htmlnative.reader.TextReader;
import com.mozz.htmlnative.parser.token.Token;
import com.mozz.htmlnative.parser.token.TokenType;
import java.io.EOFException;
import static com.mozz.htmlnative.HNLog.LEXER;
/**
 * Hand-written lexer that turns the characters delivered by a {@link TextReader} into
 * {@link Token}s.
 * <p>
 * The lexer keeps one character of look-ahead ({@link #peek()}) plus a small history of the most
 * recently read characters so that a few characters can be replayed (see {@code mReserved}) or
 * inspected after the fact (see {@code peekHistory}).
 */
class Lexer {

    /** Look-for state: nothing special is expected next. */
    private static final int LK_NOTHING = 1;

    /** Look-for state: a {@code '>'} was just read, so free text (an Inner token) may follow. */
    private static final int LK_INNER = 1 << 1;

    /** Number of recently read characters kept in {@link #mCacheQueue}. */
    private static final int CACHE_SIZE = 7;

    /**
     * Identifiers that map to a dedicated token type of the same name, in matching order.
     * {@code template} and {@code body} are handled separately because both map to
     * {@link TokenType#Template}.
     */
    private static final TokenType[] KEYWORDS = {
            TokenType.Script,
            TokenType.Head,
            TokenType.Meta,
            TokenType.Link,
            TokenType.Html,
            TokenType.Title,
            TokenType.Style,
    };

    private final TextReader mReader;

    /** Scratch buffer shared by the scanXxx methods; cleared at the start of each use. */
    @NonNull
    private final StringBuilder mBuffer = new StringBuilder();

    /** Current look-for state, one of the {@code LK_*} constants. */
    private int mLookFor = 0;

    /** History of the last {@link #CACHE_SIZE} characters read from {@link #mReader}. */
    private final CharQueue mCacheQueue;

    /**
     * Number of pending replays: while this is positive, {@link #next()} takes its character from
     * {@link #mCacheQueue} instead of reading from {@link #mReader}.
     */
    private int mReserved = 0;

    /** The look-ahead character returned by {@link #peek()}. */
    private char mCurrent = TextReader.INIT_CHAR;

    /** Toggled by {@link #scanId()} when a {@code style} identifier directly follows '<'. */
    private boolean mIsInStyle = false;

    /**
     * Progress counter used to recognise an opening {@code <script>} tag so that its content is
     * not tokenised as Inner text. It reaches 3 once {@code <}, {@code script} and {@code >} have
     * been seen:
     * <ul>
     * <li>{@code '<'} sets it to 1;</li>
     * <li>a {@code script} identifier increments it;</li>
     * <li>{@code '>'} increments it;</li>
     * <li>quoted values, numbers and every other punctuation token reset it to 0.</li>
     * </ul>
     */
    private int mLookForScript = 0;

    /**
     * Creates a lexer reading from {@code reader}.
     *
     * @param reader source of characters
     */
    public Lexer(TextReader reader) {
        mReader = reader;
        mCacheQueue = new CharQueue(CACHE_SIZE);
        lookFor(LK_NOTHING);
    }

    /**
     * Skips leading white space and returns the next token.
     *
     * @return the next token
     * @throws EOFException  propagated from {@link #next()}
     * @throws HNSyntaxError if the look-ahead character cannot start any token
     */
    @Nullable
    public Token scan() throws EOFException, HNSyntaxError {
        this.skipWhiteSpace();

        final char head = peek();
        switch (head) {
            case '<':
                mLookForScript = 1;
                lookFor(LK_NOTHING);
                next();
                return Token.obtainToken(TokenType.StartAngleBracket, mReader.line(),
                        mReader.column());
            case '>':
                mLookForScript++;
                lookFor(LK_INNER);
                next();
                return Token.obtainToken(TokenType.EndAngleBracket, mReader.line(),
                        mReader.column());
            case '"':
                next();
                mLookForScript = 0;
                return scanValue();
            default:
                break;
        }

        final TokenType punctuation = punctuationType(head);
        if (punctuation != null) {
            mLookForScript = 0;
            next();
            return Token.obtainToken(punctuation, mReader.line(), mReader.column());
        }

        // '<' has already been handled above, so only the state flags need checking here.
        if (isLookingFor(LK_INNER) && mLookForScript < 3 && !mIsInStyle) {
            return scanInner();
        }

        if (isDigit(peek()) || peek() == '-') {
            mLookForScript = 0;
            return scanNumber();
        }

        if (isLetter(peek()) || peek() == '_') {
            return scanId();
        }

        HNLog.e(LEXER, "unknown token " + peek() + " at " + line() + "," + column());
        throw new HNSyntaxError("unknown token " + peek(), line(), column());
    }

    /**
     * Maps a single punctuation character to its token type.
     *
     * @return the token type, or {@code null} if {@code ch} is not one of the simple punctuation
     * characters
     */
    @Nullable
    private static TokenType punctuationType(char ch) {
        switch (ch) {
            case '/':
                return TokenType.Slash;
            case '=':
                return TokenType.Equal;
            case '{':
                return TokenType.StartBrace;
            case '}':
                return TokenType.EndBrace;
            case '#':
                return TokenType.Hash;
            case '*':
                return TokenType.Star;
            case ',':
                return TokenType.Comma;
            case '.':
                return TokenType.Dot;
            case ':':
                return TokenType.Colon;
            case ';':
                return TokenType.Semicolon;
            case '(':
                return TokenType.StartParen;
            case ')':
                return TokenType.EndParen;
            case '!':
                return TokenType.Exclamation;
            default:
                return null;
        }
    }

    /**
     * Advances until the look-ahead character equals {@code c}; that character is not consumed.
     *
     * @param c character to stop at
     * @throws EOFException propagated from {@link #next()}
     */
    public void skipUntil(char c) throws EOFException {
        while (peek() != c) {
            next();
        }
    }

    /**
     * Scans an integer, a decimal, a percentage or a number with a decimal exponent.
     * <p>
     * Plain integers yield an Int token; everything else yields a Double token. Percentages are
     * divided by 100 and tagged with {@link Token#EXTRA_NUMBER_PERCENTAGE}.
     *
     * @throws EOFException  propagated from {@link #next()}
     * @throws HNSyntaxError if no digit follows the optional leading '-'
     */
    @Nullable
    private Token scanNumber() throws EOFException, HNSyntaxError {
        long startColumn = mReader.column();
        long line = mReader.line();

        boolean negative = false;
        if (peek() == '-') {
            negative = true;
            next();
        }

        if (!Lexer.isDigit(peek())) {
            HNLog.e(LEXER, "Illegal word " + peek() + " when reading Number!");
            throw new HNSyntaxError("Illegal word when reading Number!", line, startColumn);
        }

        // Integer part.
        int v = 0;
        do {
            v = 10 * v + (peek() - '0');
            next();
        } while (isDigit(peek()));

        if (peek() != '.' && peek() != 'E' && peek() != 'e' && peek() != '%') {
            return Token.obtainToken(TokenType.Int, negative ? -v : v, line, startColumn);
        }

        if (peek() == '%') {
            next();
            return Token.obtainToken(TokenType.Double, negative ? -v / 100.f : v / 100.f, line,
                    startColumn, Token.EXTRA_NUMBER_PERCENTAGE);
        }

        // Fraction part.
        double x = v, d = 10;
        if (peek() == '.') {
            for (; ; ) {
                next();
                if (!Lexer.isDigit(peek())) {
                    break;
                }
                x = x + (peek() - '0') / d;
                d = d * 10;
            }
        }

        if (peek() == '%') {
            next();
            return Token.obtainToken(TokenType.Double, negative ? -x / 100.f : x / 100.f, line,
                    startColumn, Token.EXTRA_NUMBER_PERCENTAGE);
        }

        if (peek() != 'e' && peek() != 'E') {
            // No replay is armed on this path: arming one here would turn the next call to
            // next() into a no-op and the character following the number would be lexed twice.
            return Token.obtainToken(TokenType.Double, negative ? -x : x, line, startColumn);
        }

        // Consume the 'e' / 'E'.
        next();
        if (!Lexer.isDigit(peek()) && peek() != '-') {
            // NOTE(review): the 'e' has already been consumed at this point, so input such as
            // "1em" produces Double(1) followed by the identifier "m".
            return Token.obtainToken(TokenType.Double, negative ? -x : x, line, startColumn);
        }

        boolean expIsNegative = false;
        if (peek() == '-') {
            expIsNegative = true;
            next();
        }

        int n = 0;
        do {
            n = 10 * n + (peek() - '0');
            next();
        } while (Lexer.isDigit(peek()));
        n = expIsNegative ? -n : n;

        double exp = Math.pow(10, n);
        return Token.obtainToken(TokenType.Double, negative ? (-x * exp) : (x * exp), line,
                startColumn);
    }

    /**
     * Scans an identifier made of letters, digits, '.', '-' and '_', and classifies it as a
     * keyword token where applicable.
     * <p>
     * Side effects: a {@code script} identifier advances {@link #mLookForScript}; a
     * {@code style} identifier updates {@link #mIsInStyle}.
     *
     * @throws EOFException propagated from {@link #next()}
     */
    @Nullable
    private Token scanId() throws EOFException {
        long startColumn = mReader.column();
        long line = mReader.line();

        clearBuf();
        do {
            mBuffer.append(peek());
            next();
        } while (isIdPart(peek()));

        String idStr = mBuffer.toString();
        TokenType type = keywordType(idStr);

        if (type == TokenType.Script) {
            mLookForScript++;
        } else if (type == TokenType.Style) {
            // The history now holds "<style" plus the character after it, so the character six
            // steps back is the one directly in front of "style".
            mIsInStyle = !mIsInStyle && peekHistory(6) == '<';
        }

        return Token.obtainToken(type, idStr, line, startColumn);
    }

    /** Returns whether {@code ch} may appear after the first character of an identifier. */
    private static boolean isIdPart(char ch) {
        return isLetter(ch) || isDigit(ch) || ch == '.' || ch == '-' || ch == '_';
    }

    /**
     * Resolves the token type for an identifier, comparing case-insensitively against the
     * keyword names; falls back to {@link TokenType#Id}.
     */
    private static TokenType keywordType(String id) {
        if (id.equalsIgnoreCase(TokenType.Template.toString())
                || id.equalsIgnoreCase(TokenType.Body.toString())) {
            return TokenType.Template;
        }
        for (TokenType keyword : KEYWORDS) {
            if (id.equalsIgnoreCase(keyword.toString())) {
                return keyword;
            }
        }
        return TokenType.Id;
    }

    /**
     * Scans the remainder of a double-quoted value; the opening quote has already been consumed
     * by {@link #scan()}. A backslash directly in front of a quote is dropped so that {@code \"}
     * contributes a literal quote.
     *
     * @throws EOFException propagated from {@link #next()}
     */
    @Nullable
    private Token scanValue() throws EOFException {
        long startColumn = mReader.column();
        long line = mReader.line();

        clearBuf();

        // Empty value: "".
        if (peek() == '"') {
            next();
            return Token.obtainToken(TokenType.Value, "", line, startColumn);
        }

        for (; ; ) {
            mBuffer.append(peek());
            next();

            if (peek() == '\\') {
                next();
                // Keep the backslash unless it escapes a quote.
                if (peek() != '"') {
                    mBuffer.append('\\');
                }
            } else if (peek() == '"') {
                break;
            }
        }

        // Consume the closing quote.
        next();
        return Token.obtainToken(TokenType.Value, mBuffer.toString(), line, startColumn);
    }

    /**
     * Scans free text between tags up to (not including) the next unescaped '<'. Runs of white
     * space are collapsed into a single space, and {@code \<} contributes a literal '<'.
     *
     * @throws EOFException propagated from {@link #next()}
     */
    @Nullable
    private Token scanInner() throws EOFException {
        long startColumn = mReader.column();
        long line = mReader.line();

        clearBuf();

        boolean escapedBracket;
        do {
            mBuffer.append(peek());
            next();

            escapedBracket = false;
            if (peek() == '\\') {
                next();
                if (peek() == '<') {
                    // Escaped '<': drop the backslash and let the next iteration append the
                    // bracket as ordinary text instead of ending the token.
                    escapedBracket = true;
                } else {
                    mBuffer.append('\\');
                }
            } else if (peek() == '<') {
                break;
            }

            // TODO consider other cases; for now any run of white space becomes one space.
            if (skipWhiteSpaceInner()) {
                mBuffer.append(' ');
            }
        } while (escapedBracket || peek() != '<');

        lookFor(LK_NOTHING);

        // Drop a single trailing line break, if any.
        char lastChar = mBuffer.charAt(mBuffer.length() - 1);
        if (lastChar == '\n' || lastChar == '\r') {
            mBuffer.deleteCharAt(mBuffer.length() - 1);
        }

        return Token.obtainToken(TokenType.Inner, mBuffer.toString(), line, startColumn);
    }

    /**
     * Called by {@link Parser#processScript(HNSegment)}, not by the lexer itself: only the parser
     * knows when the input is inside a script element.
     * <br/>
     * Reads the raw content of a "script" tag regardless of the script language. The end of the
     * script is detected by a '<' immediately followed by '/' outside of single- or double-quoted
     * strings. On return the lexer is positioned so that the next {@link #scan()} yields that '<'.
     *
     * @return a ScriptCode token holding the script text
     * @throws EOFException  propagated from {@link #next()}
     * @throws HNSyntaxError if fewer than {@link #CACHE_SIZE} characters have been read so far
     */
    public final Token scanScript() throws EOFException, HNSyntaxError {
        long startColumn = mReader.column();
        long line = mReader.line();

        if (currentPositionInFile() < CACHE_SIZE) {
            throw new HNSyntaxError("wrong status, too early for script.", line, startColumn);
        }

        clearBuf();

        /*
         * Handle the empty case <script></script>: here peek() already points at '<' before the
         * two next() calls below.
         */
        int meetEndTagFirst = 0;
        if (peek() == '<') {
            meetEndTagFirst++;
        }
        next();
        if (peek() == '/') {
            meetEndTagFirst++;
        }

        if (meetEndTagFirst == 2) {
            // Replay so that the look-ahead is '<' again.
            mReserved = 2;
            next();
            return Token.obtainToken(TokenType.ScriptCode, "", line, startColumn);
        }

        next();

        // 0: not inside a string, 1: inside "...", 2: inside '...'
        byte inQuotation = 0;
        // True when the previous character inside a string was an unescaped backslash.
        boolean escaped = false;

        while (true) {
            if (inQuotation == 0 && peekHistory(0) == '/' && peekHistory(1) == '<') {
                // Found "</": replay so that the look-ahead is '<' again.
                mReserved = 2;
                next();
                break;
            }

            // The character being emitted trails the newest read character by one.
            char ch = peekHistory(1);

            if (inQuotation == 0) {
                if (ch == '"') {
                    inQuotation = 1;
                } else if (ch == '\'') {
                    inQuotation = 2;
                }
            } else if (escaped) {
                // This character is escaped and cannot terminate the string.
                escaped = false;
            } else if (ch == '\\') {
                escaped = true;
            } else if ((inQuotation == 1 && ch == '"') || (inQuotation == 2 && ch == '\'')) {
                inQuotation = 0;
            }

            mBuffer.append(ch);
            next();
        }

        return Token.obtainToken(TokenType.ScriptCode, mBuffer.toString(), line, startColumn);
    }

    /**
     * Skips white space inside Inner text.
     *
     * @return {@code true} if at least one white-space character was skipped
     * @throws EOFException propagated from {@link #next()}
     */
    private boolean skipWhiteSpaceInner() throws EOFException {
        boolean meet = false;
        while (isWhiteSpace(peek())) {
            meet = true;
            next();
        }
        return meet;
    }

    /**
     * Advances past any white space at the current position.
     *
     * @throws EOFException propagated from {@link #next()}
     */
    public void skipWhiteSpace() throws EOFException {
        while (isWhiteSpace(peek())) {
            next();
        }
    }

    /** Returns whether {@code ch} is one of the characters this lexer treats as white space. */
    private static boolean isWhiteSpace(char ch) {
        return ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t' || ch == '\f' || ch == '\b';
    }

    /** Closes the underlying reader, if there is one. */
    public void close() {
        if (mReader != null) {
            mReader.close();
        }
    }

    /** @return the line reported by the underlying reader */
    public long line() {
        return mReader.line();
    }

    /** @return the column reported by the underlying reader */
    public long column() {
        return mReader.column();
    }

    /** @return the current look-ahead character, without consuming it */
    public char peek() {
        return mCurrent;
    }

    /** @return the number of characters the underlying reader reports as read */
    private long currentPositionInFile() {
        return mReader.countOfRead();
    }

    /**
     * Looks back into the history of read characters.
     *
     * @param historyBackCount how far to look back, 0 being the most recently read character;
     *                         must be non-negative and smaller than {@link Lexer#CACHE_SIZE}
     * @return history char saved in {@link Lexer#mCacheQueue}
     * @throws IllegalArgumentException if {@code historyBackCount} is out of range
     */
    private char peekHistory(int historyBackCount) {
        if (historyBackCount < 0 || historyBackCount >= CACHE_SIZE) {
            throw new IllegalArgumentException("HistoryBackCount must be smaller than CACHE_SIZE " +
                    "(" + CACHE_SIZE + ")");
        }
        return mCacheQueue.peek(CACHE_SIZE - historyBackCount - 1);
    }

    /**
     * Advances the look-ahead by one character. While replays are pending ({@link #mReserved}
     * is positive) the character is taken from the history; otherwise it is read from the reader
     * and recorded in the history.
     *
     * @throws EOFException declared for the read from the underlying reader
     */
    public void next() throws EOFException {
        if (mReserved > 0) {
            mCurrent = mCacheQueue.peek(CACHE_SIZE - mReserved);
            mReserved--;
            return;
        }

        this.mReader.nextCh();
        mCurrent = this.mReader.current();
        mCacheQueue.push(peek());

        HNLog.d(LEXER, "next-> " + peek());
    }

    /** Replaces the current look-for state with {@code status}. */
    private void lookFor(int status) {
        mLookFor = status;
    }

    /** Returns whether the current look-for state includes {@code status}. */
    private boolean isLookingFor(int status) {
        return (mLookFor & status) != 0;
    }

    /** @return whether {@code ch} is an ASCII decimal digit */
    public static boolean isDigit(char ch) {
        return ch >= '0' && ch <= '9';
    }

    /** @return whether {@code ch} is an ASCII letter */
    public static boolean isLetter(char ch) {
        return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
    }

    /** Empties the shared scratch buffer. */
    private void clearBuf() {
        mBuffer.setLength(0);
    }
}