package org.develnext.jphp.core.tokenizer; import org.develnext.jphp.core.common.TokenizeGrammarUtils; import org.develnext.jphp.core.tokenizer.token.*; import org.develnext.jphp.core.tokenizer.token.expr.ValueExprToken; import org.develnext.jphp.core.tokenizer.token.expr.value.StringExprToken; import org.develnext.jphp.core.tokenizer.token.stmt.EchoRawToken; import php.runtime.common.Directive; import php.runtime.common.Messages; import php.runtime.env.Context; import php.runtime.env.TraceInfo; import php.runtime.exceptions.ParseException; import java.io.IOException; import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; public class Tokenizer { protected Context context; protected TokenFinder tokenFinder; private int currentPosition; private int startRelativePosition; private int relativePosition; private int currentLine; private final int codeLength; protected final String code; protected Token prevToken; protected boolean rawMode; protected Map<String, Directive> directives = new HashMap<String, Directive>(); public Tokenizer(Context context) throws IOException { this.context = context; this.code = context.getContent(); this.codeLength = code.length(); this.rawMode = context.isLikeFile(); reset(); } public Tokenizer(String code, Context context){ this.context = context; this.currentPosition = -1; this.currentLine = 0; this.relativePosition = 0; this.code = code; this.codeLength = code.length(); this.tokenFinder = new TokenFinder(); this.rawMode = false; } public void reset(){ this.currentPosition = -1; this.currentLine = 0; this.relativePosition = -1; this.tokenFinder = new TokenFinder(); if (!rawMode) this.relativePosition = 0; this.directives.clear(); } public boolean hasDirective(String name){ return directives.get(name) != null; } public Directive getDirective(String name){ Directive value = directives.get(name); if (value == null) return null; else return value; } public String getCode() { return code; } protected final static Map<Class<?>, Constructor> tokenConstructors = new HashMap<Class<?>, Constructor>(); @SuppressWarnings("unchecked") protected <T extends Token> T buildToken(Class<T> clazz, TokenMeta meta){ Constructor<T> constructor = tokenConstructors.get(clazz); if (constructor == null){ try { tokenConstructors.put(clazz, constructor = clazz.getConstructor(TokenMeta.class)); } catch (NoSuchMethodException e) { throw new RuntimeException(e); } } try { return (T) (prevToken = constructor.newInstance(meta)); } catch (InvocationTargetException e) { throw new RuntimeException("Unable build " + clazz.getSimpleName() + " token: " + e.getTargetException()); } catch (Exception e) { throw new RuntimeException(e); } } protected TokenMeta buildMeta(int startPosition, int startLine){ String word = getWord(startPosition, currentPosition); if (word == null) return null; TokenMeta meta = new TokenMeta(word, startLine, currentLine, startRelativePosition, relativePosition); int length = word.length(); meta.setStartIndex(currentPosition - length); meta.setEndIndex(currentPosition); if (length == 1 && TokenizeGrammarUtils.isDelimiter(word.charAt(0))) { meta.setStartIndex(currentPosition); meta.setEndIndex(currentPosition + 1); } return meta; } protected String getWord(int startPosition, int endPosition){ if (endPosition < startPosition) return null; if (endPosition == startPosition){ if (startPosition >= codeLength) return null; return code.substring(startPosition, startPosition + 1); } if (endPosition > codeLength) endPosition = codeLength - 1; return code.substring(startPosition, endPosition); } protected Token tryNextToken(){ int len = 0; char ch; if (currentPosition + 1 < codeLength){ ch = code.charAt(currentPosition + 1); if (TokenizeGrammarUtils.isDelimiter(ch) && !TokenizeGrammarUtils.isSpace(ch)){ len = 1; if (currentPosition + 2 < codeLength){ ch = code.charAt(currentPosition + 2); if (TokenizeGrammarUtils.isDelimiter(ch) && !TokenizeGrammarUtils.isSpace(ch)){ len = 2; } } } } if (len == 0) return null; String word = getWord(currentPosition, currentPosition + len + 1); Class<? extends Token> tokenClass = tokenFinder.find(word); if (len == 2 && tokenClass == null){ len -= 1; word = getWord(currentPosition, currentPosition + len + 1); tokenClass = tokenFinder.find(word); } if (tokenClass != null){ int startPosition = currentPosition; currentPosition += len + 1; Token token = buildToken(tokenClass, buildMeta(startPosition, currentLine)); currentPosition -= 1; return token; } return null; } private void incCurrentPosition(int value){ if (value < 0){ while (currentPosition > 0 && value < 0){ currentPosition--; value++; if (TokenizeGrammarUtils.isNewline(code.charAt(currentPosition))) currentLine--; } } } protected boolean checkNewLine(char ch, boolean invert){ if (TokenizeGrammarUtils.isNewline(ch)){ if (invert) { currentLine--; } else { currentLine++; } relativePosition = 0; startRelativePosition = 0; return true; } return false; } protected boolean checkNewLine(char ch){ return checkNewLine(ch, false); } protected ValueExprToken readString(StringExprToken.Quote quote, int startPosition, int startLine){ int i = currentPosition + 1, pos = relativePosition + 1; StringExprToken.Quote ch_quote = null; boolean slash = false; StringBuilder sb = new StringBuilder(); boolean isMagic = quote != null && quote.isMagic(); String endString = null; int startIndex = currentPosition + 1; if (quote == StringExprToken.Quote.DOC){ StringBuilder tmp = new StringBuilder(); StringExprToken.Quote docType = null; for(; i < codeLength; i++){ char ch = code.charAt(i); pos++; if (docType == null && TokenizeGrammarUtils.isQuote(ch) != null) { docType = TokenizeGrammarUtils.isQuote(ch); } else if (docType != null && docType == TokenizeGrammarUtils.isQuote(ch)) { if (i + 1 >= codeLength || !TokenizeGrammarUtils.isNewline(code.charAt(i + 1))) { throw new ParseException( Messages.ERR_PARSE_UNEXPECTED_END_OF_STRING.fetch(), new TraceInfo(context, currentLine, currentLine, pos + 1, pos + 1) ); } i += 1; break; // nop } else if (tmp.length() == 0 && (ch == ' ' || ch == '\t')) { //nop } else if (TokenizeGrammarUtils.isEngLetter(ch) || ch == '_' || (tmp.length() != 0 && Character.isDigit(ch))){ tmp.append(ch); } else if (tmp.length() > 0 && checkNewLine(ch)){ pos = 0; break; } else { String error = Messages.ERR_PARSE_UNEXPECTED_X.fetch(ch); if (TokenizeGrammarUtils.isNewline(ch)) error = Messages.ERR_PARSE_UNEXPECTED_END_OF_STRING.fetch(); throw new ParseException( error, new TraceInfo(context, currentLine, currentLine, pos, pos) ); } } currentPosition = i; i += 1; // skip \n isMagic = (docType == null || docType.isMagic()); endString = tmp.toString(); } List<StringExprToken.Segment> segments = new ArrayList<StringExprToken.Segment>(); for(; i < codeLength; i++){ char ch = code.charAt(i); pos++; ch_quote = TokenizeGrammarUtils.isQuote(ch); if (endString == null && (ch_quote == quote && !slash)){ currentPosition = i; relativePosition = pos; break; } if (checkNewLine(ch)) { pos = 0; if (endString != null){ int end = i + 1 + endString.length(); if (end < codeLength){ if (code.substring(i + 1, end).equals(endString)) { if ((code.charAt(end) == ';' && TokenizeGrammarUtils.isNewline(code.charAt(end + 1))) || TokenizeGrammarUtils.isNewline(code.charAt(end))) { currentPosition = i + endString.length(); relativePosition = endString.length(); ch_quote = StringExprToken.Quote.DOC; break; } } } } } if (!isMagic){ switch (ch) { case '\\': if (!slash || endString != null) sb.append(ch); slash = !slash; break; case '\'': if (endString == null) sb.deleteCharAt(sb.length() - 1); // remove slash default: sb.append(ch); slash = false; } } else { int dynamic = 0; if (!slash){ if (ch == '$' && (i + 1 < codeLength && code.charAt(i + 1) == '{') ) { dynamic = 2; } if (ch == '{' && (i + 1 < codeLength && code.charAt(i + 1) == '$') ) dynamic = 1; } if (dynamic > 0) { if (dynamic == 2 || i + 1 < codeLength && code.charAt(i + 1) == '$') { slash = false; int opened = dynamic == 2 ? 0 : 1; int j; for(j = i + 1; j < codeLength; j++){ switch (code.charAt(j)){ case '{': opened++; break; case '}': opened--; break; } checkNewLine(code.charAt(j)); if (opened == 0) break; } if (opened != 0) throw new ParseException( Messages.ERR_PARSE_UNEXPECTED_END_OF_STRING.fetch(), new TraceInfo(context, startLine, 0, relativePosition, 0) ); String sub = code.substring(i, j + 1); segments.add(new StringExprToken.Segment( sb.length(), sb.length() + sub.length(), dynamic == 2 )); /*segments.add(new StringExprToken.Segment( i - currentPosition - 1, j - currentPosition, dynamic == 2 ));*/ sb.append(sub); i = j; continue; } } if (slash){ switch (ch){ case 'r': sb.append('\r'); slash = false; break; case 'n': sb.append('\n'); slash = false; break; case 't': sb.append('\t'); slash = false; break; case 'e': sb.append((char)0x1B); slash = false; break; case 'v': sb.append((char)0x0B); slash = false; break; case 'f': sb.append('\f'); slash = false; break; case '\\': sb.append(ch); slash = !slash; break; case '0':case '1':case '2':case '3':case '4':case '5':case '6':case '7': // \[0-7]{1,3} int k = i + 1; for(int j = 1; j < 3; j++){ k = i + j; if (k < codeLength){ char digit = code.charAt(k); if (digit >= '0' && digit <= '7'){ // nop } else break; } else break; } String s = code.substring(i, k); if (s.isEmpty()){ sb.append(ch); } else { int val = Integer.parseInt(s, 8); sb.append((char)val); } i = k - 1; slash = false; break; case 'x': int t = i + 1; for(int j = 1; j < 5; j++){ t = i + j; if (t < codeLength){ char digit = code.charAt(t); if (Character.isDigit(digit) || (digit >= 'A' && digit <= 'F') || (digit >= 'a' && digit <= 'f')){ // nop; } else { break; } } else break; } String s16 = code.substring(i + 1, t); if (s16.isEmpty()){ sb.append(ch); } else { int val16 = Integer.parseInt(s16, 16); sb.append((char)val16); } i = t - 1; slash = false; break; case '$': case '"': default: slash = false; sb.append(ch); break; } } else { switch (ch){ case '\\': slash = true; break; case '$': int k = i + 1; boolean done = false; int opened = 0; int complex = 0; if (k < codeLength) { char first = code.charAt(k); if (TokenizeGrammarUtils.isEngLetter(first) || first == '_'){ k++; done = true; for(; i < codeLength; k++){ if (k < codeLength){ first = code.charAt(k); if (Character.isDigit(first) || TokenizeGrammarUtils.isEngLetter(first) || first == '_') { // nop } else if (complex == 1 && TokenizeGrammarUtils.isVariableChar(first) && code.charAt(k - 1) == '[') { // nop } else if (complex == 0 && first == '[') { opened++; complex = 1; } else if (complex == 1 && opened != 0 && first == ']') { opened--; if (opened <= 0) { k++; break; } } else if (complex == 0 && first == '-'){ if (k + 1 < codeLength && code.charAt(k + 1) == '>'){ k++; complex = 2; } else break; } else break; } else break; } } } if (done){ if (opened != 0) throw new ParseException( Messages.ERR_PARSE_UNEXPECTED_END_OF_STRING.fetch(), new TraceInfo(context, startLine, 0, pos, 0) ); String s = code.substring(i, k); segments.add(new StringExprToken.Segment(sb.length(), sb.length() + s.length(), true)); sb.append(s); } else sb.append(ch); i = k - 1; break; default: sb.append(ch); } } } } if (ch_quote != quote || slash){ throw new ParseException( Messages.ERR_PARSE_UNEXPECTED_END_OF_STRING.fetch(), new TraceInfo(context, currentLine, currentLine, pos, pos) ); } TokenMeta meta = buildMeta(startPosition + 1, startLine); meta.setStartIndex(startIndex - 1); if (quote == StringExprToken.Quote.DOC) { meta.setEndIndex(currentPosition + 3); } else { meta.setEndIndex(currentPosition + 1); } meta.setWord(sb.toString()); StringExprToken expr = new StringExprToken(meta, quote); expr.setSegments(segments); return expr; } protected Token readComment(CommentToken.Kind kind, int startPosition, int startLine){ int i, pos = relativePosition, k = 0; boolean isOldComment = code.charAt(currentPosition) == '#'; for(i = currentPosition + 1; i < codeLength; i++, k++){ char ch = code.charAt(i); pos++; if (checkNewLine(ch)) pos = 0; char prev_ch = i > 0 ? code.charAt(i - 1) : '\0'; boolean closed = false; switch (kind){ case SIMPLE: closed = (TokenizeGrammarUtils.isNewline(ch)); if (TokenizeGrammarUtils.isCloseTag(prev_ch, ch)) { i -= 2; closed = true; } break; case DOCTYPE: case BLOCK: closed = k != 0 && (TokenizeGrammarUtils.isCloseComment(String.valueOf(new char[]{prev_ch, ch}))); break; } closed = closed || i == codeLength - 1; if (closed){ String text = code.substring( startPosition, kind == CommentToken.Kind.SIMPLE ? i : i - 1 ); TokenMeta meta = new TokenMeta( text, startLine, currentLine, startRelativePosition, relativePosition ); meta.setStartIndex(currentPosition - 1); if (isOldComment || kind == CommentToken.Kind.DOCTYPE) { meta.setStartIndex(currentPosition); } if (kind == CommentToken.Kind.BLOCK || kind == CommentToken.Kind.DOCTYPE) { meta.setEndIndex(i + 1); } else { meta.setEndIndex(i); } currentPosition = i; relativePosition = pos; Token result = buildToken(CommentToken.class, meta); if (kind == CommentToken.Kind.SIMPLE && text.startsWith("//")){ String directive = text.substring(2).trim(); if (directive.startsWith("@@")){ int p = directive.indexOf(' '); if (p != -1){ String name = directive.substring(2, p); String value = p + 1 < directive.length() ? directive.substring(p + 1) : ""; if (!directives.containsKey(name.toLowerCase())) directives.put(name.toLowerCase(), new Directive(value, result.toTraceInfo(context))); } } } return result; } } assert false; return null; } protected Token readNumber(int startPosition, int startLine){ int i; boolean dot = false; boolean e_char = false; i = currentPosition; boolean isHex = code.charAt(i) == '0' && (i < codeLength && Character.toLowerCase(code.charAt(i + 1)) == 'x'); boolean isBinary = code.charAt(i) == '0' && (i < codeLength && code.charAt(i + 1) == 'b'); if (isHex || isBinary) i += 2; for(; i < codeLength; i++){ char ch = code.charAt(i); if (!isHex && TokenizeGrammarUtils.isFloatDot(ch)){ if (dot) break; dot = true; } else if (!isHex && (ch == 'e' || ch == 'E')){ if (e_char) break; if (i + 1 >= codeLength){ break; } else { if (code.charAt(i + 1) == '-' || code.charAt(i + 1) == '+' || (i + 2 >= codeLength || Character.isDigit(code.charAt(i + 2)))) { if (i + 2 >= codeLength || !Character.isDigit(code.charAt(i + 2))) { break; } else { i++; } } } e_char = true; } else if (isHex && ((ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f'))) { // nop } else if (isBinary && (ch == '0' || ch == '1')) { // nop } else if (!Character.isDigit(ch)) break; } currentPosition = i; TokenMeta meta = buildMeta(startPosition, startLine); Class<? extends Token> tokenClazz = tokenFinder.find(meta); currentPosition -= 1; return buildToken(tokenClazz, meta); } public Token nextToken(){ boolean init = false; char ch = '\0'; char prev_ch = '\0'; int startPosition = currentPosition + 1; startRelativePosition = relativePosition; int startLine = currentLine; StringExprToken.Quote string = null; CommentToken.Kind comment = null; if (codeLength == 0) { return null; } boolean first = true; while (currentPosition < codeLength){ currentPosition++; relativePosition++; if (currentPosition == codeLength) { break; } ch = code.charAt(currentPosition); if (currentPosition > 0 && init) { prev_ch = code.charAt(currentPosition - 1); } checkNewLine(ch); if (rawMode){ if (TokenizeGrammarUtils.isOpenTag(prev_ch, ch)){ TokenMeta meta = new TokenMeta( code.substring(startPosition, currentPosition - 1), startLine, currentLine, startRelativePosition, relativePosition ); rawMode = false; startLine = currentLine; startRelativePosition = relativePosition; EchoRawToken token = buildToken(EchoRawToken.class, meta); if (codeLength >= currentPosition + 4 && code.substring(currentPosition + 1, currentPosition + 4).equals("php")){ relativePosition += 4; currentPosition += 3; token.setShort(false); } else { token.setShort(true); } return token; } else { init = true; first = true; continue; } } if (ch == '=' && prevToken != null && prevToken instanceof EchoRawToken && ((EchoRawToken) prevToken).isShort()){ return buildToken(OpenEchoTagToken.class, buildMeta(startPosition, startLine)); } if (first && (!init || prevToken == null)){ // numbers: integers, doubles, hex if (Character.isDigit(ch) || (ch == '.' && prevToken == null && currentPosition + 1 < codeLength && Character.isDigit(code.charAt(currentPosition + 1)))){ return readNumber(startPosition, startLine); } // comments comment = CommentToken.Kind.isComment(ch, prev_ch); if (comment != null) { return readComment(comment, startPosition, startLine); } // strings, herdoc, etc. string = TokenizeGrammarUtils.isQuote(ch); if (string != null) { return readString(string, startPosition, startLine); } } init = true; first = false; if (TokenizeGrammarUtils.isDelimiter(ch)) { if (startPosition == currentPosition && TokenizeGrammarUtils.isSpace(ch)){ startPosition = currentPosition + 1; startLine = currentLine; startRelativePosition = relativePosition; prevToken = null; first = true; continue; } if (startPosition == currentPosition){ Token token = tryNextToken(); if (token instanceof BreakToken){ rawMode = true; } if (token instanceof CommentToken){ comment = ((CommentToken)token).getKind(); return readComment(comment, startPosition, startLine); //continue; } if (token instanceof StringStartDocToken){ string = StringExprToken.Quote.DOC; return readString(string, startPosition, startLine); } if (token != null) { return token; } } break; } else if (TokenizeGrammarUtils.isVariableChar(ch)){ if (TokenizeGrammarUtils.isVariableChar(prev_ch)){ currentPosition -= 1; break; } } } TokenMeta meta = buildMeta(startPosition, startLine); if (currentPosition != startPosition && TokenizeGrammarUtils.isDelimiter(ch)) { checkNewLine(ch, true); currentPosition -= 1; relativePosition -= 1; } if (meta == null) return null; //currentPosition -= 1; Class<? extends Token> tokenClazz = rawMode ? EchoRawToken.class : tokenFinder.find(meta); if (tokenClazz == null){ return prevToken = new Token(meta, TokenType.T_J_CUSTOM); } else { return buildToken(tokenClazz, meta); } } public List<Token> fetchAll(){ List<Token> result = new ArrayList<Token>(); Token token; while ((token = nextToken()) != null) result.add(token); return result; } public Context getContext() { return context; } }