/* * This file is a part of Alchemy OS project. * Copyright (C) 2011-2014, Sergey Basalaev <sbasalaev@gmail.com> * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package alchemy.nec; import alchemy.io.UTFReader; import java.io.IOException; /** * Splits Ether source into sequence of tokens. * @author Sergey Basalaev */ class Tokenizer { static private final int EOF_CHAR = -1; static private final int NO_CHAR = -2; static private final int RANGE_CHAR = -3; private final UTFReader r; private final String filename; private final CompilerEnv env; private boolean pushedBack; private int nextch = NO_CHAR; private int linenumber = 1; public int ttype; public int ivalue; public long lvalue; public float fvalue; public double dvalue; public String svalue; private static final int WORDCHAR = 1; private static final int OPCHAR = 2; private static int[] chtypes = new int[128]; static { chtypes['_'] = WORDCHAR; for (int i='0'; i<='9'; i++) { chtypes[i] = WORDCHAR; } for (int i='a'; i<='z'; i++) { chtypes[i] = WORDCHAR; } for (int i='A'; i<='Z'; i++) { chtypes[i] = WORDCHAR; } chtypes['+'] = OPCHAR; chtypes['-'] = OPCHAR; chtypes['*'] = OPCHAR; chtypes['/'] = OPCHAR; chtypes['%'] = OPCHAR; chtypes['='] = OPCHAR; chtypes['<'] = OPCHAR; chtypes['>'] = OPCHAR; chtypes['|'] = OPCHAR; chtypes['&'] = OPCHAR; chtypes['^'] = OPCHAR; chtypes['!'] = OPCHAR; chtypes['~'] = OPCHAR; } /** * Creates new tokenizer to read from this buffer. * If <code>compat</code> is true then work in 2.1 compatibility mode. */ public Tokenizer(CompilerEnv env, String filename, UTFReader r) { this.r = r; this.filename = filename; this.env = env; } public void pushBack() { pushedBack = true; } public int nextToken() throws IOException, ParseException { if (pushedBack) { pushedBack = false; return ttype; } int ch = readChar(); //skipping whitespaces while (ch >= 0 && ch <= ' ') ch = readChar(); //EOF if (ch == EOF_CHAR) { return ttype = Token.EOF; } //range hack if (ch == RANGE_CHAR) { return ttype = Token.RANGE; } //character literal if (ch == '\'') { ch = readChar(); if (ch == '\n' || ch == EOF_CHAR) { throw new ParseException("Unclosed character literal"); } if (ch == '\\') ch = readEscape(); ivalue = ch; ch = readChar(); if (ch != '\'') { throw new ParseException("Unclosed character literal"); } return ttype = Token.CHAR; } //string literal if (ch == '"') { StringBuffer str = new StringBuffer(); ch = readChar(); while (ch != '"' && ch != EOF_CHAR && ch != '\n') { if (ch == '\\') ch = readEscape(); str.append((char)ch); ch = readChar(); } if (ch != '"') { throw new ParseException("Unclosed string literal"); } svalue = str.toString(); return ttype = Token.QUOTED; } // escaped identifier literal if (ch == '`') { StringBuffer id = new StringBuffer(); ch = readChar(); while (ch != '`' && ch != EOF_CHAR && ch != '\n') { id.append((char)ch); ch = readChar(); } if (ch != '`') { throw new ParseException("Unclosed identifier literal"); } svalue = id.toString(); return ttype = Token.WORD; } //dot, range and numbers if (ch >= '0' && ch <= '9' || ch == '.') { boolean dotseen = false; if (ch == '.') { ch = readChar(); if (ch == '.') { return ttype = Token.RANGE; } else if (ch < '0' || ch > '9') { nextch = ch; return ttype = '.'; } dotseen = true; } nextch = ch; StringBuffer number = new StringBuffer(); if (dotseen) number.append('.'); String dec = readDecimal(); //check for possible hex literal if (dec.equals("0") && !dotseen) { ch = readChar(); if (ch == 'x' || ch == 'X') { number.append(readHex()); ch = readChar(); if (ch == 'l' || ch == 'L') { try { lvalue = parseulong16(number.toString()); } catch (Exception nfe) { throw new ParseException("Integer number too large: "+number); } return ttype = Token.LONG; } else { nextch = ch; try { ivalue = parseuint16(number.toString()); } catch (Exception nfe) { throw new ParseException("Integer number too large: "+number); } return ttype = Token.INT; } } else { nextch = ch; } } number.append(dec); //check for possible fraction part if (!dotseen) { ch = readChar(); if (ch == '.') { ch = readChar(); if (ch == '.') { nextch = RANGE_CHAR; } else { dotseen = true; number.append('.'); nextch = ch; if (ch >= '0' && ch <= '9') { number.append(readDecimal()); } } } else { nextch = ch; } } //now all forms of nn, nn., .nn, nn.nn are read //checking for exponent ch = readChar(); if (ch == 'e' || ch == 'E') { number.append((char)ch); ch = readChar(); if (ch == '+' || ch == '-') { number.append((char)ch); ch = readChar(); } if (ch >= '0' && ch <= '9') { nextch = ch; number.append(readDecimal()); } else { throw new ParseException("Malformed floating point literal: "+number); } dotseen = true; } else { nextch = ch; } //checking for suffix ch = readChar(); if (ch == 'f' || ch == 'F') { try { fvalue = Float.parseFloat(number.toString()); } catch (Exception nfe) { throw new ParseException("Floating point number too large: "+number); } return ttype = Token.FLOAT; } if (ch == 'd' || ch == 'D') { try { dvalue = Double.parseDouble(number.toString()); } catch (Exception nfe) { throw new ParseException("Floating point number too large: "+number); } return ttype = Token.DOUBLE; } if (dotseen) { nextch = ch; try { dvalue = Double.parseDouble(number.toString()); } catch (Exception nfe) { throw new ParseException("Floating point number too large: "+number); } return ttype = Token.DOUBLE; } if (ch == 'l' || ch == 'L') { try { lvalue = Long.parseLong(number.toString(), 10); } catch (NumberFormatException nfe) { throw new ParseException("Integer number too large: "+number); } return ttype = Token.LONG; } else { nextch = ch; try { ivalue = Integer.parseInt(number.toString(), 10); } catch (Exception nfe) { throw new ParseException("Integer number too large: "+number); } return ttype = Token.INT; } } //identifiers and keywords if (ch <= 127 && chtypes[ch] == WORDCHAR) { StringBuffer idbuf = new StringBuffer(); idbuf.append((char)ch); ch = readChar(); while (ch != EOF_CHAR && ch <= 127 && chtypes[ch] == WORDCHAR) { idbuf.append((char)ch); ch = readChar(); } nextch = ch; String id = idbuf.toString(); svalue = id; int type = Token.WORD; switch (id.hashCode()) { case 0x059a58ff: if (id.equals("break")) { if (env.hasOption(CompilerEnv.F_COMPAT21)) { env.warn(filename, linenumber, CompilerEnv.W_DEPRECATED, "'break' will be a keyword in Ether 2.2"); } else { type = Token.BREAK; } } break; case 0x002e7b3f: if (id.equals("cast")) type = Token.CAST; break; case 0x05a0eebb: if (id.equals("catch")) type = Token.CATCH; break; case 0x05a73763: if (id.equals("const")) type = Token.CONST; break; case 0xde312ca7: if (id.equals("continue")) { if (env.hasOption(CompilerEnv.F_COMPAT21)) { env.warn(filename, linenumber, CompilerEnv.W_DEPRECATED, "'continue' will be a keyword in Ether 2.2"); } else { type = Token.CONTINUE; } } break; case 0x00018405: if (id.equals("def")) type = Token.DEF; break; case 0x00000c8b: if (id.equals("do")) type = Token.DO; break; case 0x002f8d39: if (id.equals("else")) type = Token.ELSE; break; case 0x05cb1923: if (id.equals("false")) type = Token.FALSE; break; case 0x00018cc9: if (id.equals("for")) type = Token.FOR; break; case 0x00000d1d: if (id.equals("if")) type = Token.IF; break; case 0x00000d25: if (id.equals("in")) { if (env.hasOption(CompilerEnv.F_COMPAT21)) { env.warn(filename, linenumber, CompilerEnv.W_DEPRECATED, "'in' will be a keyword in Ether 2.2"); } else { type = Token.IN; } } break; case 0x0001a9a0: if (id.equals("new")) type = Token.NEW; break; case 0x0033c587: if (id.equals("null")) type = Token.NULL; break; case 0xc84e3d30: if (id.equals("return")) { if (env.hasOption(CompilerEnv.F_COMPAT21)) { env.warn(filename, linenumber, CompilerEnv.W_DEPRECATED, "'return' will be a keyword in Ether 2.2"); } else { type = Token.RETURN; } } break; case 0x68b6f7b: if (id.equals("super")) type = Token.SUPER; break; case 0xcafbb734: if (id.equals("switch")) type = Token.SWITCH; break; case 0x0693a6e6: if (id.equals("throw")) { if (env.hasOption(CompilerEnv.F_COMPAT21)) { env.warn(filename, linenumber, CompilerEnv.W_DEPRECATED, "'throw' will be a keyword in Ether 2.2"); } else { type = Token.THROW; } } break; case 0x0036758e: if (id.equals("true")) type = Token.TRUE; break; case 0x0001c1bb: if (id.equals("try")) type = Token.TRY; break; case 0x00368f3a: if (id.equals("type")) type = Token.TYPE; break; case 0x0001c587: if (id.equals("use")) type = Token.USE; break; case 0x0001c727: if (id.equals("var")) type = Token.VAR; break; case 0x06bdcb31: if (id.equals("while")) type = Token.WHILE; break; } return ttype = type; } //operators and comments if (ch <= 127 && chtypes[ch] == OPCHAR) { int ch2 = readChar(); if (ch2 <= 127 && chtypes[ch2] == OPCHAR) { if (ch == '=' && ch2 == '=') return ttype = Token.EQEQ; if (ch == '<' && ch2 == '=') return ttype = Token.LTEQ; if (ch == '>' && ch2 == '=') return ttype = Token.GTEQ; if (ch == '<' && ch2 == '<') { ch = readChar(); if (ch == '=') { return ttype = Token.LTLTEQ; } else { nextch = ch; return ttype = Token.LTLT; } } if (ch == '>' && ch2 == '>') { ch = readChar(); if (ch == '>') { ch = readChar(); if (ch == '=') { return ttype = Token.GTGTGTEQ; } else { nextch = ch; return ttype = Token.GTGTGT; } } else if (ch == '=') { return ttype = Token.GTGTEQ; } else { nextch = ch; return ttype = Token.GTGT; } } if (ch == '!' && ch2 == '=') return ttype = Token.NOTEQ; if (ch == '+' && ch2 == '=') return ttype = Token.PLUSEQ; if (ch == '-' && ch2 == '=') return ttype = Token.MINUSEQ; if (ch == '*' && ch2 == '=') return ttype = Token.STAREQ; if (ch == '/' && ch2 == '=') return ttype = Token.SLASHEQ; if (ch == '%' && ch2 == '=') return ttype = Token.PERCENTEQ; if (ch == '&' && ch2 == '=') return ttype = Token.AMPEQ; if (ch == '|' && ch2 == '=') return ttype = Token.BAREQ; if (ch == '&' && ch2 == '&') return ttype = Token.AMPAMP; if (ch == '|' && ch2 == '|') return ttype = Token.BARBAR; //line comment if (ch == '/' && ch2 == '/') { do ch = readChar(); while (ch != '\n' && ch != EOF_CHAR); return nextToken(); } //block comment if (ch == '/' && ch2 == '*') { ch = readChar(); ch2 = readChar(); while (ch2 != EOF_CHAR && (ch != '*' || ch2 != '/')) { ch = ch2; ch2 = readChar(); } if (ch2 == EOF_CHAR) { throw new ParseException("Unclosed comment"); } return nextToken(); } } // to this point second character is separate token nextch = ch2; } return ttype = ch; } private int readChar() throws IOException { int ch; if (nextch == NO_CHAR) { ch = r.read(); if (ch == '\n') linenumber++; } else { ch = nextch; nextch = NO_CHAR; } return ch; } private int readEscape() throws IOException, ParseException { int ch = readChar(); switch (ch) { case '\\': return '\\'; case '\'': return '\''; case '"': return '"'; case 'n': return '\n'; case 't': return '\t'; case 'r': return '\r'; case 'b': return '\b'; case 'f': return '\f'; case 'u': { //four hex digits must follow int u1 = hexdigit(readChar()); int u2 = hexdigit(readChar()); int u3 = hexdigit(readChar()); int u4 = hexdigit(readChar()); if ((u1|u2|u3|u4) < 0) { throw new ParseException("Illegal unicode escape"); } return (u1 << 12) | (u2 << 8) | (u3 << 4) | u4; } case '0': //octals case '1': case '2': case '3': case '4': case '5': case '6': case '7': { int octal = ch-'0'; if (ch <= '3') { ch = readChar(); if (ch >= '0' && ch <= '7') { octal <<= 3; octal |= ch-'0'; } else { nextch = ch; return octal; } } ch = readChar(); if (ch >= '0' && ch <= '7') { octal <<= 3; octal |= ch-'0'; } else { nextch = ch; } return octal; } } throw new ParseException("Illegal escape sequence"); } /** * Helper function to read unicode escape. */ private int hexdigit(int ch) { if (ch >= '0' && ch <= '9') return ch-'0'; if (ch >= 'a' && ch <= 'f') return ch-'a'+10; if (ch >= 'A' && ch <= 'F') return ch-'A'+10; return -1; } private String readDecimal() throws IOException { StringBuffer decimal = new StringBuffer(); int ch = readChar(); while (ch >= '0' && ch <= '9') { decimal.append((char)ch); ch = readChar(); } nextch = ch; return decimal.toString(); } private String readHex() throws IOException { StringBuffer hex = new StringBuffer(); int ch = readChar(); while ((ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F')) { hex.append((char)ch); ch = readChar(); } nextch = ch; return hex.toString(); } /** * Returns string representation of the current token. */ public String toString() { switch (ttype) { case Token.EOF: return "<EOF>"; case Token.INT: return String.valueOf(ivalue); case Token.LONG: return String.valueOf(lvalue); case Token.FLOAT: return String.valueOf(fvalue); case Token.DOUBLE: return String.valueOf(dvalue); case Token.EQEQ: return "=="; case Token.GTEQ: return ">="; case Token.GTGT: return ">>"; case Token.GTGTGT: return ">>>"; case Token.LTEQ: return "<="; case Token.LTLT: return "<<"; case Token.NOTEQ: return "!="; case Token.AMPAMP: return "&&"; case Token.BARBAR: return "||"; case Token.PLUSEQ: return "+="; case Token.MINUSEQ: return "-="; case Token.STAREQ: return "*="; case Token.SLASHEQ: return "/="; case Token.PERCENTEQ: return "%="; case Token.BAREQ: return "|="; case Token.AMPEQ: return "&="; case Token.HATEQ: return "^="; case Token.LTLTEQ: return "<<="; case Token.GTGTEQ: return ">>="; case Token.GTGTGTEQ: return ">>>="; default: return (ttype < 0) ? svalue : String.valueOf((char)ttype); } } /** * Returns number of the current line. */ public int lineNumber() { return linenumber; } private int parseuint16(String hex) { hex = hex.toLowerCase(); if (hex.length() > 8) throw new NumberFormatException("Integer number too large: "+hex); int num=0; for (int i=0; i<hex.length(); i++) { num <<= 4; char ch = hex.charAt(i); if (ch >= '0' && ch <= '9') num |= ch-'0'; else if (ch >= 'a' && ch <= 'f') num |= ch-'a'+10; } return num; } private long parseulong16(String hex) { hex = hex.toLowerCase(); if (hex.length() > 16) throw new NumberFormatException("Integer number too large: "+hex); long num=0; for (int i=0; i<hex.length(); i++) { num <<= 4; char ch = hex.charAt(i); if (ch >= '0' && ch <= '9') num |= ch-'0'; else if (ch >= 'a' && ch <= 'f') num |= ch-'a'+10; } return num; } }