/* --------------------------------------------------------- *
* __________ D E L T A S C R I P T *
* (_________() *
* / === / - A fast, dynamic scripting language *
* | == | - Version 4.13.11.0 *
* / === / - Developed by Adam R. Nelson *
* | = = | - 2011-2013 *
* / === / - Distributed under GNU LGPL v3 *
* (________() - http://github.com/ar-nelson/deltascript *
* *
* --------------------------------------------------------- */
package com.sector91.delta.script.parser;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.Iterator;
import java.util.LinkedList;
import com.sector91.delta.script.Operator;
/**
 * Streaming lexer for DeltaScript source text.
 *
 * <p>The lexer decodes its input as UTF-8 (see {@link #CHARSET}), keeps a
 * single character of look-ahead, and hands out one {@link LexToken} per call
 * to {@link #next()}. Lexing errors are not thrown to the caller; they are
 * reported as an {@link ErrorLexToken}, after which the lexer is finished.</p>
 *
 * <p>The lexer also tracks a stack of opening tokens that have not yet been
 * closed, exposed through {@link #topUnclosedToken()} and
 * {@link #nestingDepth()}.</p>
 */
public class DScriptLexer implements Iterator<LexToken>
{
    /** Character set used to decode source input. */
    public static final Charset CHARSET = Charset.forName("UTF-8");

    static final String
        KW_DEF        = "def",
        KW_FIELD      = "field",
        KW_IF         = "if",
        KW_THEN       = "then",
        KW_ELSEIF     = "elif",
        KW_ELSE       = "else",
        KW_ENDIF      = "xi",
        KW_BRANCH     = "branch",
        KW_CASE       = "case",
        KW_DEFAULT    = "otherwise",
        KW_ENDBRANCH  = "xb",
        KW_FUNC       = "func",
        KW_ENDFUNC    = "xf",
        KW_ARROW      = "->",
        KW_UNDERSCORE = "_",
        KW_SCOPE      = "scope",
        KW_INCLUDE    = "include",
        KW_ENDSCOPE   = "xs",
        KW_BLOCK      = "do",
        KW_ENDBLOCK   = "xd",
        KW_LOOP       = "loop",
        KW_WHILELOOP  = "while",
        KW_UNTILLOOP  = "until",
        KW_FROMLOOP   = "from",
        KW_LOOPCOND   = "where",
        KW_ENDLOOP    = "xl",
        KW_FOR_COMPRH = "for",
        KW_RETURN     = "return",
        KW_BREAK      = "break",
        KW_CONTINUE   = "continue",
        KW_TRUE       = "true",
        KW_FALSE      = "false",
        KW_BLANK      = "blank",
        KW_NULL       = "null",

        OCT_DIGITS = "01234567",
        HEX_DIGITS = "0123456789abcdefABCDEF",
        OP_CONTINUATION_CHARS = "|^&=*/%~";

    static final char
        C_COMMENT      = '#',
        C_TAG          = '\'',
        C_COLON        = ':',
        C_ESCAPE       = '\\',
        C_STRING       = '"',
        C_DOT          = '.',
        C_COMMA        = ',',
        C_SCOLON       = ';',
        C_NEWLINE      = '\n',
        C_OPAREN       = '(',
        C_CPAREN       = ')',
        C_OBRACKET     = '[',
        C_CBRACKET     = ']',
        C_OBRACE       = '{',
        C_CBRACE       = '}',
        C_OANGLE       = '<',
        C_CANGLE       = '>',
        C_MINUS        = '-',
        C_NUMSEPARATOR = '_',
        C_EXP_LOWER    = 'e',
        C_EXP_UPPER    = 'E';

    /** Binary digits accepted after a "0b" prefix. */
    private static final String BIN_DIGITS = "01";

    /** Current look-ahead character; stale once {@link #finished} is set. */
    private char c;
    /** Most recently emitted token, or null before the first token. */
    private LexToken lastToken;
    private boolean initialized, finished, expectingOperator, unclosedString,
                    expectingSuffix;
    private final Reader in;
    /** Count of characters read so far, not counting carriage returns. */
    private int ch;
    /** Opening token types that have not been closed yet, innermost first. */
    private final LinkedList<TokenType> stack = new LinkedList<TokenType>();

    /**
     * Creates a lexer over an in-memory source string.
     *
     * @param data the source text to tokenize
     */
    public DScriptLexer(String data)
        {this(new ByteArrayInputStream(data.getBytes(CHARSET)));}

    /**
     * Creates a lexer over a byte stream, decoded with {@link #CHARSET}.
     *
     * @param stream the source bytes; closed by {@link #close()} or when the
     *               end of input is reached
     */
    public DScriptLexer(InputStream stream)
        {in = new InputStreamReader(stream, CHARSET);}

    /**
     * Primes the one-character look-ahead and skips leading comments and
     * whitespace. Runs once, on the first call to {@link #next()}.
     */
    private void initialize() throws IOException
    {
        expectingOperator = false;
        // 'finished' is deliberately NOT reset here: on empty input the very
        // first read() reaches end of input and sets it, and clearing it again
        // would make the lexer read from an already-closed reader.
        read();
        ch = 0; // The first character sits at offset 0.
        skipCommentsAndWhitespace();
        initialized = true;
    }

    /**
     * Returns the next token.
     *
     * @return the next token; an {@link ErrorLexToken} wrapping the exception
     *         if lexing failed (the lexer is then finished); or {@code null}
     *         if the input was already exhausted
     */
    public LexToken next()
    {
        try
        {
            if (!initialized)
                initialize();
            return readNextToken();
        }
        catch (Exception ex)
        {
            finished = true;
            return new ErrorLexToken(ex, ch, ch);
        }
    }

    /**
     * @return {@code false} once the end of input was reached, an error
     *         occurred, or {@link #close()} was called
     */
    public boolean hasNext()
        {return !finished;}

    /** Not supported; always throws {@link UnsupportedOperationException}. */
    public void remove()
        {throw new UnsupportedOperationException("Unsupported.");}

    /**
     * Marks the lexer as finished and closes the underlying reader.
     *
     * <p>The reader is closed even if the lexer was already finished (for
     * example after an error token), so that the stream is never leaked;
     * closing a reader that is already closed has no effect.</p>
     *
     * @throws IOException if closing the reader fails
     */
    public void close() throws IOException
    {
        finished = true;
        in.close();
    }

    /**
     * @return {@link TokenType#STRING} if a string literal was left unclosed,
     *         otherwise the innermost unclosed opening token type, or
     *         {@code null} if nothing is open
     */
    public TokenType topUnclosedToken()
    {
        if (unclosedString)
            return TokenType.STRING;
        return stack.isEmpty() ? null : stack.getFirst();
    }

    /**
     * @return the number of unclosed opening tokens, plus one if a string
     *         literal was left unclosed
     */
    public int nestingDepth()
        {return unclosedString ? stack.size()+1 : stack.size();}

    /**
     * Advances the look-ahead by one character, dropping carriage returns.
     *
     * @return {@code true} if a character was read into {@link #c};
     *         {@code false} at end of input (the lexer is then closed and
     *         {@link #c} keeps its previous value)
     */
    private boolean read() throws IOException
    {
        // Never touch the reader again once it has been closed.
        if (finished)
            return false;
        int next = in.read();
        while (next == '\r') next = in.read(); // Convert DOS-style line breaks.
        if (next < 0)
        {
            close();
            return false;
        }
        c = (char)next;
        ch++;
        return true;
    }

    /**
     * Skips comments (from {@link #C_COMMENT} up to, but not including, the
     * next newline) and all whitespace except newlines. Skipping anything
     * clears {@link #expectingSuffix}.
     */
    private void skipCommentsAndWhitespace() throws IOException
    {
        while (!finished &&
               (c == C_COMMENT ||
                (Character.isWhitespace(c) && c != C_NEWLINE)))
        {
            expectingSuffix = false;
            if (c == C_COMMENT)
                while (read() && c != C_NEWLINE) {/* Do nothing. */}
            else
                read();
        }
    }

    /**
     * Reads one token, updates the operator expectation and the stack of
     * unclosed tokens, then skips trailing comments and whitespace.
     *
     * @return the token, or {@code null} if the input is exhausted
     * @throws DScriptLexerException if a closing token does not match the
     *         innermost unclosed opening token
     */
    private LexToken readNextToken() throws DScriptLexerException, IOException
    {
        LexToken token = null;
        // A null result from a reader means "nothing to emit, keep reading".
        while (token == null)
        {
            if (finished) return null;
            if (Character.isJavaIdentifierStart(c))
                token = readAlphanumeric();
            else if (Character.isDigit(c))
                token = readNumeric();
            else
                token = readSymbolic();
        }

        final TokenType type = token.type();
        expectingOperator = type.followedByOperator;

        // Maintain a stack of opening/closing pairs of tokens (parens, angle
        // brackets, etc.)
        if (type.opens)
            stack.push(type);
        else if (type.closes != null)
        {
            if (stack.isEmpty())
                throw new DScriptLexerException("Unexpected " + token);
            if (stack.getFirst() != type.closes)
                throw new DScriptLexerException("Unexpected " + token +
                        ". Last unclosed token: " +
                        stack.getFirst().friendlyDesc);
            stack.pop();
        }

        lastToken = token;
        skipCommentsAndWhitespace();
        return token;
    }

    /**
     * Reads a token that starts with an identifier character: a numeric
     * suffix, a keyword, a word operator, or an identifier.
     */
    private LexToken readAlphanumeric() throws DScriptLexerException,IOException
    {
        final int start = this.ch;

        // Check for '1e10'-style exponent strings and number type suffixes.
        if (expectingSuffix &&
            lastToken != null && (
                lastToken.type() == TokenType.NUMBER ||
                lastToken.type() == TokenType.DEC_SUFFIX ||
                lastToken.type() == TokenType.EXP_SUFFIX))
        {
            if (c == C_EXP_UPPER || c == C_EXP_LOWER)
            {
                read();
                final StringBuilder exponent = new StringBuilder();
                if (c == '-' || c == '+')
                {
                    exponent.append(c);
                    read();
                }
                exponent.append(readDecimalNumberString());
                return new LexToken(TokenType.EXP_SUFFIX, exponent.toString(),
                                    start, ch);
            }
            else if (Character.isJavaIdentifierStart(c))
            {
                final String suffix = readAlphanumericString();
                return new LexToken(TokenType.NTYPE, suffix, start, ch);
            }
        }

        final String str = readAlphanumericString();

        // Check if the string is a keyword.
        final LexToken keyword = keywordToken(str, start);
        if (keyword != null)
            return keyword;

        // Check if the string is an operator.
        for (Operator op : Operator.values())
            if (op.str.equals(str))
                return new LexToken(TokenType.OPERATOR, str, start, ch);

        // If the string has no special meaning, it is an identifier.
        return new LexToken(TokenType.IDENTIFIER, str, start, ch);
    }

    /**
     * Maps a word to its keyword token.
     *
     * @param str   the word that was just read
     * @param start offset at which the word started
     * @return the keyword token, or {@code null} if the word is no keyword
     */
    private LexToken keywordToken(String str, int start)
    {
        if (KW_DEF.equals(str))
            return new LexToken(TokenType.DEF, start, ch);
        if (KW_FIELD.equals(str))
            return new LexToken(TokenType.FIELD, start, ch);
        if (KW_UNDERSCORE.equals(str))
            return new LexToken(TokenType.UNDERSCORE, start, ch);
        if (KW_IF.equals(str))
            return new LexToken(TokenType.IF, start, ch);
        if (KW_THEN.equals(str))
            return new LexToken(TokenType.THEN, start, ch);
        if (KW_ELSE.equals(str))
            return new LexToken(TokenType.ELSE, start, ch);
        if (KW_ELSEIF.equals(str))
            return new LexToken(TokenType.ELSE_IF, start, ch);
        if (KW_ENDIF.equals(str))
            return new LexToken(TokenType.END_IF, KW_ENDIF, start, ch);
        if (KW_BRANCH.equals(str))
            return new LexToken(TokenType.BRANCH, start, ch);
        if (KW_CASE.equals(str))
            return new LexToken(TokenType.CASE, start, ch);
        if (KW_DEFAULT.equals(str))
            return new LexToken(TokenType.DEFAULT_CASE, start, ch);
        if (KW_ENDBRANCH.equals(str))
            return new LexToken(TokenType.END_BRANCH, KW_ENDBRANCH, start, ch);
        if (KW_LOOP.equals(str))
            return new LexToken(TokenType.LOOP, start, ch);
        if (KW_FROMLOOP.equals(str))
            return new LexToken(TokenType.LOOP_KW, KW_FROMLOOP, start, ch);
        if (KW_WHILELOOP.equals(str))
            return new LexToken(TokenType.LOOP_KW, KW_WHILELOOP, start, ch);
        if (KW_UNTILLOOP.equals(str))
            return new LexToken(TokenType.LOOP_KW, KW_UNTILLOOP, start, ch);
        if (KW_LOOPCOND.equals(str))
            return new LexToken(TokenType.LOOP_KW, KW_LOOPCOND, start, ch);
        if (KW_ENDLOOP.equals(str))
            return new LexToken(TokenType.END_LOOP, KW_ENDLOOP, start, ch);
        if (KW_FUNC.equals(str))
            return new LexToken(TokenType.FUNC, start, ch);
        if (KW_ENDFUNC.equals(str))
            return new LexToken(TokenType.END_FUNC, KW_ENDFUNC, start, ch);
        if (KW_SCOPE.equals(str))
            return new LexToken(TokenType.SCOPE, start, ch);
        if (KW_ENDSCOPE.equals(str))
            return new LexToken(TokenType.END_SCOPE, KW_ENDSCOPE, start, ch);
        if (KW_BLOCK.equals(str))
            return new LexToken(TokenType.BLOCK, start, ch);
        if (KW_ENDBLOCK.equals(str))
            return new LexToken(TokenType.END_BLOCK, KW_ENDBLOCK, start, ch);
        if (KW_RETURN.equals(str))
            return new LexToken(TokenType.RETURN, start, ch);
        if (KW_BREAK.equals(str))
            return new LexToken(TokenType.BREAK, start, ch);
        if (KW_CONTINUE.equals(str))
            return new LexToken(TokenType.CONTINUE, start, ch);
        if (KW_TRUE.equals(str) || KW_FALSE.equals(str))
            return new LexToken(TokenType.BOOLEAN, str, start, ch);
        if (KW_BLANK.equals(str) || KW_NULL.equals(str))
            return new LexToken(TokenType.BLANK, start, ch);
        if (KW_INCLUDE.equals(str))
            return new LexToken(TokenType.INCLUDE, start, ch);
        if (KW_FOR_COMPRH.equals(str))
            return new LexToken(TokenType.FOR_COMPRH, start, ch);
        return null;
    }

    /**
     * Reads the longest run of Java identifier-part characters starting at
     * the current character. May return an empty string.
     */
    private String readAlphanumericString() throws IOException
    {
        final StringBuilder sb = new StringBuilder();
        do
        {
            if (!Character.isJavaIdentifierPart(c))
                break;
            sb.append(c);
        } while (read());
        return sb.toString();
    }

    /**
     * Reads a numeric token: the fractional part after a dot, a prefixed
     * hexadecimal ("0x"), binary ("0b") or octal (leading "0") literal, or a
     * plain decimal number.
     */
    private LexToken readNumeric() throws DScriptLexerException, IOException
    {
        final int start = this.ch;

        if (lastToken != null && lastToken.type() == TokenType.DOT)
        {
            expectingSuffix = true;
            final String fraction = readDecimalNumberString();
            return new LexToken(TokenType.DEC_SUFFIX, fraction, start, ch);
        }

        if (c == '0')
        {
            if (read())
            {
                if (c == 'x')
                {
                    read();
                    final String hexStr = readDigitsFrom(HEX_DIGITS);
                    return new LexToken(TokenType.HEX_NUMBER, hexStr, start,
                                        ch);
                }
                else if (c == 'b')
                {
                    read();
                    final String binStr = readDigitsFrom(BIN_DIGITS);
                    return new LexToken(TokenType.BIN_NUMBER, binStr, start,
                                        ch);
                }
                else if (Character.isDigit(c) || c == C_NUMSEPARATOR)
                {
                    final String octStr = readDigitsFrom(OCT_DIGITS);
                    return new LexToken(TokenType.OCT_NUMBER, octStr, start,
                                        ch);
                }
            }
            // A lone zero is an ordinary decimal number, so it may carry an
            // exponent or type suffix just like any other decimal literal.
            expectingSuffix = true;
            return new LexToken(TokenType.NUMBER, "0", start, ch);
        }

        expectingSuffix = true;
        final String digits = readDecimalNumberString();
        return new LexToken(TokenType.NUMBER, digits, start, ch);
    }

    /**
     * Reads a run of decimal digits, silently dropping any
     * {@link #C_NUMSEPARATOR} characters. May return an empty string.
     */
    private String readDecimalNumberString() throws IOException
    {
        final StringBuilder sb = new StringBuilder();
        do
        {
            if (c != C_NUMSEPARATOR)
            {
                if (!Character.isDigit(c))
                    break;
                sb.append(c);
            }
        } while (read());
        return sb.toString();
    }

    /**
     * Reads a run of characters drawn from {@code digits}, silently dropping
     * any {@link #C_NUMSEPARATOR} characters. May return an empty string.
     *
     * @param digits the characters accepted as digits
     */
    private String readDigitsFrom(String digits) throws IOException
    {
        final StringBuilder sb = new StringBuilder();
        do
        {
            if (digits.indexOf(c) >= 0)
                sb.append(c);
            else if (c != C_NUMSEPARATOR)
                break;
        } while (read());
        return sb.toString();
    }

    /**
     * Reads a token that starts with a character that is neither an
     * identifier start nor a digit.
     *
     * @return the token, or {@code null} if only a line continuation was
     *         consumed and the caller should keep reading
     * @throws DScriptLexerException on a stray escape character or an
     *         unterminated string literal
     */
    private LexToken readSymbolic() throws DScriptLexerException, IOException
    {
        final int start = this.ch;
        switch (c)
        {
            case C_COMMA:
            case C_SCOLON:
            case C_NEWLINE:
            {
                final String sep = Character.toString(c);
                read();
                return new LexToken(TokenType.SEPARATOR, sep, start, ch);
            }

            case C_OPAREN:
            case C_CPAREN:
            case C_OBRACKET:
            case C_CBRACKET:
            case C_OBRACE:
            case C_CBRACE:
            case C_COLON:
            {
                final char symbol = c;
                read();
                return new LexToken(singleCharToken(symbol), start, ch);
            }

            case C_OANGLE:
                if (expectingOperator)
                {
                    final String op = readOperatorString();
                    return new LexToken(TokenType.OPERATOR, op, start, ch);
                }
                read();
                return new LexToken(TokenType.O_ANGLE, start, ch);

            case C_CANGLE:
                // If the top element on the stack is an opening angle bracket,
                // don't even try to parse the character '>' as an operator.
                if (expectingOperator && (stack.isEmpty() ||
                    stack.getFirst() != TokenType.O_ANGLE))
                {
                    final String op = readOperatorString();
                    return new LexToken(TokenType.OPERATOR, op, start, ch);
                }
                read();
                return new LexToken(TokenType.C_ANGLE, start, ch);

            case C_DOT:
                // One, two or three consecutive dots.
                if (read() && c == C_DOT)
                {
                    if (read() && c == C_DOT)
                    {
                        read();
                        return new LexToken(TokenType.ELLIPSIS, start, ch);
                    }
                    return new LexToken(TokenType.STDLIB_CALL, start, ch);
                }
                return new LexToken(TokenType.DOT, start, ch);

            case C_TAG:
            {
                read();
                final String tagStr = readAlphanumericString();
                return new LexToken(TokenType.TAG, tagStr, start, ch);
            }

            case C_STRING:
            {
                // Input ending right after the opening quote leaves a stale
                // look-ahead character, so it must be detected here.
                if (!read())
                    throw unterminatedString();
                final String strContents = readStringContents();
                read(); // Step past the closing quote.
                return new LexToken(TokenType.STRING, strContents, start, ch);
            }

            case C_ESCAPE:
                // TODO: Support number type tags.
                if (!read() || c != C_NEWLINE)
                    throw new DScriptLexerException("The character '" + C_ESCAPE +
                        "' is only valid outside a string literal if it precedes" +
                        " a newline.");
                // Line continuation: swallow the escaped newline (and any
                // indentation on the continued line) so that it is not
                // emitted as a separator.
                read();
                skipCommentsAndWhitespace();
                return null;

            default:
                if (expectingOperator)
                {
                    final String op = readOperatorString();
                    if (Operator.INCREMENT.str.equals(op) ||
                        Operator.DECREMENT.str.equals(op))
                        return new LexToken(TokenType.POSTFIX_OP, op, start, ch);
                    return new LexToken(TokenType.OPERATOR, op, start, ch);
                }
                else
                {
                    final String op = Character.toString(c);
                    read();
                    if ("-".equals(op) && C_CANGLE == c)
                    {
                        read();
                        return new LexToken(TokenType.ARROW_FUNC, start, ch);
                    }
                    return new LexToken(TokenType.PREFIX_OP, op, start, ch);
                }
        }
    }

    /**
     * Maps a bracket or colon character to its token type.
     *
     * @throws IllegalArgumentException if the character is none of these
     */
    private TokenType singleCharToken(char tokenChar)
    {
        switch (tokenChar)
        {
            case C_OPAREN:   return TokenType.O_PAREN;
            case C_CPAREN:   return TokenType.C_PAREN;
            case C_OBRACKET: return TokenType.O_BRACKET;
            case C_CBRACKET: return TokenType.C_BRACKET;
            case C_OBRACE:   return TokenType.O_BRACE;
            case C_CBRACE:   return TokenType.C_BRACE;
            case C_COLON:    return TokenType.COLON;
            default:
                throw new IllegalArgumentException(
                    "Not a valid single-character token: " + tokenChar);
        }
    }

    /**
     * Reads the body of a string literal, starting at the first character
     * after the opening quote and stopping ON the closing quote (the caller
     * steps past it). Escape sequences are replaced by their values.
     *
     * @throws DScriptLexerException if the input ends before the closing
     *         quote, or on an invalid escape sequence
     */
    private String readStringContents() throws DScriptLexerException,IOException
    {
        final StringBuilder sb = new StringBuilder();
        do
        {
            if (c == C_STRING)
                return sb.toString();
            if (c == C_ESCAPE)
            {
                // A backslash as the very last character cannot be an escape.
                if (!read())
                    throw unterminatedString();
                sb.append(readEscape());
            }
            else
                sb.append(c);
        } while (read());
        throw unterminatedString();
    }

    /**
     * Records that a string literal was left unclosed and builds the
     * exception that reports it.
     */
    private DScriptLexerException unterminatedString()
    {
        unclosedString = true;
        return new DScriptLexerException("Unterminated string literal.");
    }

    /**
     * Translates the escape sequence whose first character (the one after the
     * backslash) is the current character. On return the current character is
     * the last character of the escape sequence.
     *
     * @throws DScriptLexerException on an unknown escape, or on a 'u' escape
     *         that is not followed by exactly four hexadecimal digits
     */
    private String readEscape() throws DScriptLexerException, IOException
    {
        // TODO: Support numeric escapes.
        switch (c)
        {
            case 't' : return "\t";
            case 'b' : return "\b";
            case 'n' : return "\n";
            case 'r' : return "\r";
            case 'f' : return "\f";
            case '\'': return "'";
            case '"' : return "\"";
            case '\n': return " "; // An escaped line break becomes a space.
            case '\\': return "\\";
            case 'u' :
            {
                final char[] digits = new char[4];
                for (int i=0; i<digits.length; i++)
                {
                    if (!read() || c == C_STRING)
                        throw new DScriptLexerException(
                            "Unicode escape sequence requires 4 hex digits.");
                    digits[i] = c;
                }
                final String hex = new String(digits);
                // Check every character explicitly: Integer.parseInt alone
                // would also accept a leading sign.
                for (char digit : digits)
                    if (HEX_DIGITS.indexOf(digit) < 0)
                        throw new DScriptLexerException(
                            "\"\\u" + hex + "\" is not a valid" +
                            " hexadecimal Unicode escape sequence.");
                return Character.toString((char)Integer.parseInt(hex, 16));
            }
            default:
                throw new DScriptLexerException(
                    "Invalid escape sequence: \\" + c);
        }
    }

    /**
     * Reads a symbolic operator: the current character followed by any run
     * of characters that either repeat it or are listed in
     * {@link #OP_CONTINUATION_CHARS}.
     */
    private String readOperatorString() throws IOException
    {
        final char startChar = c;
        final StringBuilder op = new StringBuilder();
        op.append(startChar);
        while (read() &&
               (c == startChar || OP_CONTINUATION_CHARS.indexOf(c) >= 0))
            op.append(c);
        return op.toString();
    }
}