/* --------------------------------------------------------- * * __________ D E L T A S C R I P T * * (_________() * * / === / - A fast, dynamic scripting language * * | == | - Version 4.13.11.0 * * / === / - Developed by Adam R. Nelson * * | = = | - 2011-2013 * * / === / - Distributed under GNU LGPL v3 * * (________() - http://github.com/ar-nelson/deltascript * * * * --------------------------------------------------------- */ package com.sector91.delta.script.parser; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.nio.charset.Charset; import java.util.Iterator; import java.util.LinkedList; import com.sector91.delta.script.Operator; public class DScriptLexer implements Iterator<LexToken> { public static final Charset CHARSET = Charset.forName("UTF-8"); static final String KW_DEF = "def", KW_FIELD = "field", KW_IF = "if", KW_THEN = "then", KW_ELSEIF = "elif", KW_ELSE = "else", KW_ENDIF = "xi", KW_BRANCH = "branch", KW_CASE = "case", KW_DEFAULT = "otherwise", KW_ENDBRANCH = "xb", KW_FUNC = "func", KW_ENDFUNC = "xf", KW_ARROW = "->", KW_UNDERSCORE = "_", KW_SCOPE = "scope", KW_INCLUDE = "include", KW_ENDSCOPE = "xs", KW_BLOCK = "do", KW_ENDBLOCK = "xd", KW_LOOP = "loop", KW_WHILELOOP = "while", KW_UNTILLOOP = "until", KW_FROMLOOP = "from", KW_LOOPCOND = "where", KW_ENDLOOP = "xl", KW_FOR_COMPRH = "for", KW_RETURN = "return", KW_BREAK = "break", KW_CONTINUE = "continue", KW_TRUE = "true", KW_FALSE = "false", KW_BLANK = "blank", KW_NULL = "null", OCT_DIGITS = "01234567", HEX_DIGITS = "0123456789abcdefABCDEF", OP_CONTINUATION_CHARS = "|^&=*/%~"; static final char C_COMMENT = '#', C_TAG = '\'', C_COLON = ':', C_ESCAPE = '\\', C_STRING = '"', C_DOT = '.', C_COMMA = ',', C_SCOLON = ';', C_NEWLINE = '\n', C_OPAREN = '(', C_CPAREN = ')', C_OBRACKET = '[', C_CBRACKET = ']', C_OBRACE = '{', C_CBRACE = '}', C_OANGLE = '<', C_CANGLE = '>', C_MINUS = '-', C_NUMSEPARATOR = '_', C_EXP_LOWER = 'e', C_EXP_UPPER = 'E'; private char c; private LexToken lastToken; private boolean initialized, finished, expectingOperator, unclosedString, expectingSuffix; private Reader in; private int ch; private LinkedList<TokenType> stack = new LinkedList<TokenType>(); public DScriptLexer(String data) {this(new ByteArrayInputStream(data.getBytes(CHARSET)));} public DScriptLexer(InputStream stream) {in = new InputStreamReader(stream, CHARSET);} private void initialize() throws IOException { read(); ch = 0; finished = false; expectingOperator = false; skipCommentsAndWhitespace(); initialized = true; } public LexToken next() { try { if (!initialized) initialize(); return readNextToken(); } catch (Exception ex) { finished = true; return new ErrorLexToken(ex, ch, ch); } } public boolean hasNext() {return !finished;} public void remove() {throw new UnsupportedOperationException("Unsupported.");} public void close() throws IOException { if (finished) return; finished = true; in.close(); } public TokenType topUnclosedToken() { if (unclosedString) return TokenType.STRING; if (stack.isEmpty()) return null; return stack.getFirst(); } public int nestingDepth() {return unclosedString ? stack.size()+1 : stack.size();} private boolean read() throws IOException { int i = in.read(); while (i == '\r') i = in.read(); // Convert DOS-style line breaks. if (i < 0) { close(); return false; } else { c = (char)i; ch++; return true; } } private boolean charInArray(char chr, char[] arr) { for (char c2 : arr) if (chr == c2) return true; return false; } private void skipCommentsAndWhitespace() throws IOException { while (c == C_COMMENT || (Character.isWhitespace(c) && c != C_NEWLINE)) { expectingSuffix = false; if (c == C_COMMENT) while (read() && c != C_NEWLINE) {/* Do nothing. */} else read(); if (finished) return; } } private LexToken readNextToken() throws DScriptLexerException, IOException { if (finished) return null; final LexToken token; if (Character.isJavaIdentifierStart(c)) token = readAlphanumeric(); else if (Character.isDigit(c)) token = readNumeric(); else token = readSymbolic(); if (token == null) // A null response means keep reading. return readNextToken(); expectingOperator = token.type().followedByOperator; // Maintain a stack of opening/closing pairs of tokens (parens, angle // brackets, etc.) if (token.type().opens) stack.push(token.type()); else if (token.type().closes != null) { // Pop the stack if the last stack element was closed by this token. if (!stack.isEmpty() && stack.getFirst() == token.type().closes) stack.pop(); // Otherwise, this token is out of place; throw an exception. else { if (stack.isEmpty()) throw new DScriptLexerException("Unexpected " + token); else throw new DScriptLexerException("Unexpected " + token + ". Last unclosed token: " + stack.getFirst().friendlyDesc); } } lastToken = token; skipCommentsAndWhitespace(); return token; } private LexToken readAlphanumeric() throws DScriptLexerException,IOException { final int start = this.ch; // Check for '1e10'-style exponent strings. if (expectingSuffix && lastToken != null && ( lastToken.type() == TokenType.NUMBER || lastToken.type() == TokenType.DEC_SUFFIX || lastToken.type() == TokenType.EXP_SUFFIX)) { if (c == C_EXP_UPPER || c == C_EXP_LOWER) { read(); String numStr = ""; if (c == '-' || c == '+') { numStr += Character.toString(c); read(); } numStr += readDecimalNumberString(); return new LexToken(TokenType.EXP_SUFFIX, numStr, start, ch); } else if (Character.isJavaIdentifierStart(c)) { final String str = readAlphanumericString(); return new LexToken(TokenType.NTYPE, str, start, ch); } } final String str = readAlphanumericString(); // Check if the string is a keyword. if (KW_DEF.equals(str)) return new LexToken(TokenType.DEF, start, ch); else if (KW_FIELD.equals(str)) return new LexToken(TokenType.FIELD, start, ch); else if (KW_UNDERSCORE.equals(str)) return new LexToken(TokenType.UNDERSCORE, start, ch); else if (KW_IF.equals(str)) return new LexToken(TokenType.IF, start, ch); else if (KW_THEN.equals(str)) return new LexToken(TokenType.THEN, start, ch); else if (KW_ELSE.equals(str)) return new LexToken(TokenType.ELSE, start, ch); else if (KW_ELSEIF.equals(str)) return new LexToken(TokenType.ELSE_IF, start, ch); else if (KW_ENDIF.equals(str)) return new LexToken(TokenType.END_IF, KW_ENDIF, start, ch); else if (KW_BRANCH.equals(str)) return new LexToken(TokenType.BRANCH, start, ch); else if (KW_CASE.equals(str)) return new LexToken(TokenType.CASE, start, ch); else if (KW_DEFAULT.equals(str)) return new LexToken(TokenType.DEFAULT_CASE, start, ch); else if (KW_ENDBRANCH.equals(str)) return new LexToken(TokenType.END_BRANCH, KW_ENDBRANCH, start, ch); else if (KW_LOOP.equals(str)) return new LexToken(TokenType.LOOP, start, ch); else if (KW_FROMLOOP.equals(str)) return new LexToken(TokenType.LOOP_KW, KW_FROMLOOP, start, ch); else if (KW_WHILELOOP.equals(str)) return new LexToken(TokenType.LOOP_KW, KW_WHILELOOP, start, ch); else if (KW_UNTILLOOP.equals(str)) return new LexToken(TokenType.LOOP_KW, KW_UNTILLOOP, start, ch); else if (KW_LOOPCOND.equals(str)) return new LexToken(TokenType.LOOP_KW, KW_LOOPCOND, start, ch); else if (KW_ENDLOOP.equals(str)) return new LexToken(TokenType.END_LOOP, KW_ENDLOOP, start, ch); else if (KW_FUNC.equals(str)) return new LexToken(TokenType.FUNC, start, ch); else if (KW_ENDFUNC.equals(str)) return new LexToken(TokenType.END_FUNC, KW_ENDFUNC, start, ch); else if (KW_SCOPE.equals(str)) return new LexToken(TokenType.SCOPE, start, ch); else if (KW_ENDSCOPE.equals(str)) return new LexToken(TokenType.END_SCOPE, KW_ENDSCOPE, start, ch); else if (KW_BLOCK.equals(str)) return new LexToken(TokenType.BLOCK, start, ch); else if (KW_ENDBLOCK.equals(str)) return new LexToken(TokenType.END_BLOCK, KW_ENDBLOCK, start, ch); else if (KW_RETURN.equals(str)) return new LexToken(TokenType.RETURN, start, ch); else if (KW_BREAK.equals(str)) return new LexToken(TokenType.BREAK, start, ch); else if (KW_CONTINUE.equals(str)) return new LexToken(TokenType.CONTINUE, start, ch); else if (KW_TRUE.equals(str) || KW_FALSE.equals(str)) return new LexToken(TokenType.BOOLEAN, str, start, ch); else if (KW_BLANK.equals(str) || KW_NULL.equals(str)) return new LexToken(TokenType.BLANK, start, ch); else if (KW_INCLUDE.equals(str)) return new LexToken(TokenType.INCLUDE, start, ch); else if (KW_FOR_COMPRH.equals(str)) return new LexToken(TokenType.FOR_COMPRH, start, ch); // Check if the string is an operator. for (Operator op : Operator.values()) if (op.str.equals(str)) return new LexToken(TokenType.OPERATOR, str, start, ch); // If the string has no special meaning, it is an identifier. return new LexToken(TokenType.IDENTIFIER, str, start, ch); } private String readAlphanumericString() throws IOException { final StringBuilder sb = new StringBuilder(); do { if (!Character.isJavaIdentifierPart(c)) return sb.toString(); sb.append(c); } while (read()); return sb.toString(); } private LexToken readNumeric() throws DScriptLexerException, IOException { final int start = this.ch; if (lastToken != null && lastToken.type()==TokenType.DOT) { expectingSuffix = true; return new LexToken(TokenType.DEC_SUFFIX, readDecimalNumberString(), start, ch); } else if (c == '0') { if (read()) { if (c == 'x') { read(); final String hexStr = readHexNumberString(); return new LexToken(TokenType.HEX_NUMBER, hexStr, start, ch); } else if (c == 'b') { read(); final String binStr = readBinaryNumberString(); return new LexToken(TokenType.BIN_NUMBER, binStr, start, ch); } else if (Character.isDigit(c) || c == C_NUMSEPARATOR) { final String octStr = readOctalNumberString(); return new LexToken(TokenType.OCT_NUMBER, octStr, start, ch); } } return new LexToken(TokenType.NUMBER, "0", start, ch); } expectingSuffix = true; return new LexToken(TokenType.NUMBER, readDecimalNumberString(), start, ch); } private String readDecimalNumberString() throws IOException { final StringBuilder sb = new StringBuilder(); do { if (c == C_NUMSEPARATOR) continue; if (!Character.isDigit(c)) return sb.toString(); sb.append(c); } while (read()); return sb.toString(); } private String readOctalNumberString() throws IOException { final StringBuilder sb = new StringBuilder(); final char[] chars = OCT_DIGITS.toCharArray(); do { if (charInArray(c, chars)) sb.append(c); else if (c != C_NUMSEPARATOR) return sb.toString(); } while (read()); return sb.toString(); } private String readHexNumberString() throws IOException { final StringBuilder sb = new StringBuilder(); final char[] chars = HEX_DIGITS.toCharArray(); do { if (charInArray(c, chars)) sb.append(c); else if (c != C_NUMSEPARATOR) return sb.toString(); } while (read()); return sb.toString(); } private String readBinaryNumberString() throws IOException { final StringBuilder sb = new StringBuilder(); do { if (c == C_NUMSEPARATOR) continue; else if (c == '0' || c == '1') sb.append(c); else return sb.toString(); } while (read()); return sb.toString(); } private LexToken readSymbolic() throws DScriptLexerException, IOException { final int start = this.ch; switch (c) { case C_COMMA: case C_SCOLON: case C_NEWLINE: final String sep = Character.toString(c); read(); return new LexToken(TokenType.SEPARATOR, sep, start, ch); case C_OPAREN: case C_CPAREN: case C_OBRACKET: case C_CBRACKET: case C_OBRACE: case C_CBRACE: case C_COLON: final char lastC = c; read(); return new LexToken(singleCharToken(lastC), start, ch); case C_OANGLE: if (expectingOperator) { final String op = readOperatorString(); return new LexToken(TokenType.OPERATOR, op, start, ch); } read(); return new LexToken(TokenType.O_ANGLE, start, ch); case C_CANGLE: // If the top element on the stack is an opening angle bracket, // don't even try to parse the character '>' as an operator. if (expectingOperator && (stack.isEmpty() || stack.getFirst() != TokenType.O_ANGLE)) { final String op = readOperatorString(); return new LexToken(TokenType.OPERATOR, op, start, ch); } read(); return new LexToken(TokenType.C_ANGLE, start, ch); case C_DOT: if (read() && c == C_DOT) { if (read() && c == C_DOT) { read(); return new LexToken(TokenType.ELLIPSIS, start, ch); } return new LexToken(TokenType.STDLIB_CALL, start, ch); } return new LexToken(TokenType.DOT, start, ch); case C_TAG: read(); final String tagStr = readAlphanumericString(); return new LexToken(TokenType.TAG, tagStr, start, ch); case C_STRING: read(); final String strContents = readStringContents(); read(); return new LexToken(TokenType.STRING, strContents, start, ch); case C_ESCAPE: // TODO: Support number type tags. if (!read() || c != C_NEWLINE) throw new DScriptLexerException("The character '" + C_ESCAPE + "' is only valid outside a string literal if it precedes" + " a newline."); return null; default: if (expectingOperator) { final String op = readOperatorString(); if (Operator.INCREMENT.str.equals(op) || Operator.DECREMENT.str.equals(op)) return new LexToken(TokenType.POSTFIX_OP, op, start, ch); return new LexToken(TokenType.OPERATOR, op, start, ch); } else { final String op = Character.toString(c); read(); if ("-".equals(op) && C_CANGLE == c) { read(); return new LexToken(TokenType.ARROW_FUNC, start, ch); } return new LexToken(TokenType.PREFIX_OP, op, start, ch); } } } private TokenType singleCharToken(char tokenChar) { switch (tokenChar) { case C_OPAREN: return TokenType.O_PAREN; case C_CPAREN: return TokenType.C_PAREN; case C_OBRACKET: return TokenType.O_BRACKET; case C_CBRACKET: return TokenType.C_BRACKET; case C_OBRACE: return TokenType.O_BRACE; case C_CBRACE: return TokenType.C_BRACE; case C_COLON: return TokenType.COLON; default: throw new IllegalArgumentException( "Not a valid single-character token: " + tokenChar); } } private String readStringContents() throws DScriptLexerException,IOException { final StringBuilder sb = new StringBuilder(); do { if (c == C_ESCAPE) { read(); sb.append(readEscape()); } else if (c == C_STRING) return sb.toString(); else sb.append(c); } while (read()); unclosedString = true; return sb.toString(); } private String readEscape() throws DScriptLexerException, IOException { // TODO: Support numeric and Unicode escapes. switch (c) { case 't' : return "\t"; case 'b' : return "\b"; case 'n' : return "\n"; case 'r' : return "\r"; case 'f' : return "\f"; case '\'': return "'"; case '"' : return "\""; case '\n': return " "; case '\\': return "\\"; case 'u' : { final char[] chars = new char[4]; for (int i=0; i<chars.length; i++) { if (!read() || c == C_STRING) throw new DScriptLexerException( "Unicode escape sequence requires 4 hex digits."); chars[i] = c; } try { return Character.toString((char)Integer.parseInt( new String(chars), 16)); } catch (NumberFormatException ex) { throw new DScriptLexerException( "\"\\u" + new String(chars) + "\" is not a valid" + " hexadecimal Unicode escape sequence."); } } default: throw new DScriptLexerException( "Invalid escape sequence: \\" + c); } } private String readOperatorString() throws IOException { final char[] chars = OP_CONTINUATION_CHARS.toCharArray(); final char startChar = c; String opStr = Character.toString(c); while (read()) { if (c == startChar || charInArray(c, chars)) opStr += c; else return opStr; } return opStr; } }