package de.fuberlin.projecta.lexer; import de.fuberlin.commons.lexer.ILexer; import de.fuberlin.commons.lexer.TokenType; import de.fuberlin.projecta.lexer.io.ICharStream; public class Lexer implements ILexer { // denotes the end of any token, hard-coded private static final String DELIMITER_REGEXP = "[\\s(=)+-/;\\*{}\\[]"; private int line; private ICharStream is; /** * Instantiate lexer object * * @param is Input character stream */ public Lexer(ICharStream is) { this.line = 1; // start at line 1 this.is = is; } public Token getNextToken() { skipWhiteSpaceAndCommentary(); if (is.isEmpty()) { // End of input stream - nothing more to read return new Token(TokenType.EOF, null, this.line, is.getOffset()); } String peek = is.getNextChars(1); Token t; if ((t = reservedAndTerminals()) != null) { return t; } if ((t = identifier()) != null) { return t; } if ((t = stringConstant()) != null) { return t; } if ((t = numConstant()) != null) { return t; } // if no rule could be applied there is something wrong! throw new SyntaxErrorException("Undefined something at line " + line + " near " + peek); } /** * Go through whitespace and comments, skip them */ private void skipWhiteSpaceAndCommentary() { String peek; if(is.isEmpty()) return; do { do { peek = is.getNextChars(1); if (peek.matches("\\n")) { this.line += 1; is.resetOffset(); } if (peek.matches("\\s")) is.consumeChars(1); } while (peek.matches("\\s")); peek = is.getNextChars(2); if (peek.equals("//")) { do { is.consumeChars(1); peek = is.getNextChars(2); } while (!peek.endsWith("\n")); is.consumeChars(1); this.line += 1; is.resetOffset(); } else if (peek.equals("/*")) { do { is.consumeChars(1); peek = is.getNextChars(2); if (peek.endsWith("\n")) { this.line += 1; is.resetOffset(); } } while (!peek.equals("*/")); is.consumeChars(2); } peek = is.getNextChars(2); } while (peek.matches("\\s.") || peek.equals("//") || peek.equals("/*")); } /** * Parser num constants * - Handles INT/REAL literals * * @return Token (numeric literals) * @throws SyntaxErrorException */ private Token numConstant() throws SyntaxErrorException { String peek = is.getNextChars(1); String result = ""; if(!peek.matches("[\\d\\.]")){ return null; } while (peek.matches("\\d")) { result += peek; is.consumeChars(1); peek = is.getNextChars(1); } // We now have read a series of digits // if we read a dot or an e/E now we have a real value // every other character will indicate that we have an int value if (!peek.matches("[eE\\.]")) { final int value = Integer.parseInt(result); return new Token(TokenType.INT_LITERAL, value, line, is.getOffset()); } if (peek.matches("\\.")) { result += peek; is.consumeChars(1); peek = is.getNextChars(1); while (peek.matches("\\d")) { result += peek; is.consumeChars(1); peek = is.getNextChars(1); } } if (peek.matches("[eE]")) { result += peek; is.consumeChars(1); peek = is.getNextChars(1); if (peek.matches("[+-]")) { // optional sign result += peek; is.consumeChars(1); peek = is.getNextChars(1); } if (peek.matches("\\d")) { while (peek.matches("\\d")) { result += peek; is.consumeChars(1); peek = is.getNextChars(1); } } else { throw new SyntaxErrorException( "Malformed real value at line: " + this.line + " near: " + is.getOffset()); } final double value = Double.parseDouble(result); return new Token(TokenType.REAL_LITERAL, value, this.line, is.getOffset()); } final double value = Double.parseDouble(result); return new Token(TokenType.REAL_LITERAL, value, this.line, is.getOffset()); } /** * Parses identifiers (note: call this after having parsed reserved keywords) * * @return Token (ID) * @throws SyntaxErrorException */ private Token identifier() throws SyntaxErrorException { String peek = is.getNextChars(1); String id = ""; final int offset = is.getOffset(); if (peek.matches("[A-Za-z]")) while (!is.isEmpty() && peek.matches("[A-Za-z0-9_]")) { id += peek; is.consumeChars(1); peek = is.getNextChars(1); } if (!id.isEmpty()) return new Token(TokenType.ID, id, this.line, offset); return null; } /** * Parse string literals * * @return Token * @throws SyntaxErrorException */ private Token stringConstant() throws SyntaxErrorException { String peek = is.getNextChars(1); String delimiter = "'"; final int offset = is.getOffset(); switch (peek.charAt(0)) { case '\'': break; case '"': delimiter = "\""; break; default: return null; } is.consumeChars(1); String result = ""; while (true) { peek = is.getNextChars(1); is.consumeChars(1); if (peek.matches("\\s") && peek.charAt(0) != ' ') { // throw new // SyntaxErrorException("Unallowed whitespace in string in line " // + this.lineNumber); return null; } if (peek.startsWith(delimiter)) { return new Token(TokenType.STRING_LITERAL, result, this.line, offset); } result += peek; if (is.isEmpty()) { return new Token(TokenType.STRING_LITERAL, result, this.line, offset); } } } /** * Parses reserved words * - Handles keywords: if then else while do break return print * - Handles all terminals * * @return Token * @throws SyntaxErrorException */ private Token reservedAndTerminals() throws SyntaxErrorException { final int offset = is.getOffset(); String s = is.getNextChars(1); if (s.equals("i")) { if (is.getNextChars(3).matches("if" + DELIMITER_REGEXP)) { is.consumeChars(2); return new Token(TokenType.IF, null, this.line, offset); } if (is.getNextChars(4).matches("int" + DELIMITER_REGEXP)) { is.consumeChars(3); return new Token(TokenType.BASIC, "int", this.line, offset); } } if (s.equals("t")) { if (is.getNextChars(5).matches("then" + DELIMITER_REGEXP)) { is.consumeChars(4); return new Token(TokenType.THEN, null, this.line, offset); } if (is.getNextChars(5).matches("true" + DELIMITER_REGEXP)) { is.consumeChars(4); return new Token(TokenType.BOOL_LITERAL, true, this.line, offset); } } if (s.equals("e") && is.getNextChars(5).matches("else" + DELIMITER_REGEXP)) { is.consumeChars(4); return new Token(TokenType.ELSE, null, this.line, offset); } if(s.equals("v") && is.getNextChars(5).matches("void" + DELIMITER_REGEXP)){ is.consumeChars(4); return new Token(TokenType.BASIC, "void", this.line, offset); } if (s.equals("w") && is.getNextChars(6).matches("while" + DELIMITER_REGEXP)) { is.consumeChars(5); return new Token(TokenType.WHILE, null, this.line, offset); } if (s.equals("d")) { if (is.getNextChars(3).matches("do" + DELIMITER_REGEXP)) { is.consumeChars(2); return new Token(TokenType.DO, null, this.line, offset); } else if (is.getNextChars(4).matches("def ")) { is.consumeChars(3); return new Token(TokenType.DEF, null, this.line, offset); } } if (s.equals("r")) { if (is.getNextChars(7).matches("return" + DELIMITER_REGEXP)) { is.consumeChars(6); return new Token(TokenType.RETURN, null, this.line, offset); } if (is.getNextChars(5).equals("real ")) { is.consumeChars(4); return new Token(TokenType.BASIC, "real", this.line, offset); } if (is.getNextChars(7).matches("record" + DELIMITER_REGEXP)) { is.consumeChars(6); return new Token(TokenType.RECORD, null, this.line, offset); } } if (s.equals("s")) { if (is.getNextChars(7).matches("string" + DELIMITER_REGEXP)) { is.consumeChars(6); return new Token(TokenType.BASIC, "string", this.line, offset); } } if (s.equals("b")) { if (is.getNextChars(6).matches("break" + DELIMITER_REGEXP)) { is.consumeChars(5); return new Token(TokenType.BREAK, null, this.line, offset); } if(is.getNextChars(5).matches("bool" + DELIMITER_REGEXP)){ is.consumeChars(4); return new Token(TokenType.BASIC, "bool", line, offset); } } if (s.equals("p") && is.getNextChars(6).matches("print" + DELIMITER_REGEXP)) { is.consumeChars(5); return new Token(TokenType.PRINT, null, this.line, offset); } if (s.equals("f") && is.getNextChars(6).matches("false" + DELIMITER_REGEXP)) { is.consumeChars(5); return new Token(TokenType.BOOL_LITERAL, false, this.line, offset); } if (s.equals("+")) { is.consumeChars(1); return new Token(TokenType.OP_ADD, null, this.line, offset); } if (s.equals("-")) { is.consumeChars(1); return new Token(TokenType.OP_MINUS, null, this.line, offset); } if (s.equals("*")) { is.consumeChars(1); return new Token(TokenType.OP_MUL, null, this.line, offset); } if (s.equals("/")) { is.consumeChars(1); return new Token(TokenType.OP_DIV, null, this.line, offset); } if (s.equals("&")) { if (is.getNextChars(2).equals("&&")) { is.consumeChars(2); return new Token(TokenType.OP_AND, "AND", this.line, offset); } } if (s.equals("|")) { if (is.getNextChars(2).equals("||")) { is.consumeChars(2); return new Token(TokenType.OP_OR, null, this.line, offset); } } if (s.equals("!")) { s = is.getNextChars(2); if (s.equals("!=")) { is.consumeChars(2); return new Token(TokenType.OP_NE, null, this.line, offset); } else { is.consumeChars(1); return new Token(TokenType.OP_NOT, null, this.line, offset); } } if (s.equals("<")) { s = is.getNextChars(2); if (s.equals("<=")) { is.consumeChars(2); return new Token(TokenType.OP_LE, null, this.line, offset); } else { is.consumeChars(1); return new Token(TokenType.OP_LT, null, this.line, offset); } } if (s.equals(">")) { s = is.getNextChars(2); if (s.equals(">=")) { is.consumeChars(2); return new Token(TokenType.OP_GE, null, this.line, offset); } else { is.consumeChars(1); return new Token(TokenType.OP_GT, null, this.line, offset); } } if (s.equals("=")) { s = is.getNextChars(2); if (s.equals("==")) { is.consumeChars(2); return new Token(TokenType.OP_EQ, null, this.line, offset); } else { is.consumeChars(1); return new Token(TokenType.OP_ASSIGN, null, this.line, offset); } } if (s.equals("(")) { is.consumeChars(1); return new Token(TokenType.LPAREN, null, this.line, offset); } if (s.equals(")")) { is.consumeChars(1); return new Token(TokenType.RPAREN, null, this.line, offset); } if (s.equals("[")) { is.consumeChars(1); return new Token(TokenType.LBRACKET, null, this.line, offset); } if (s.equals("]")) { is.consumeChars(1); return new Token(TokenType.RBRACKET, null, this.line, offset); } if (s.equals("{")) { is.consumeChars(1); return new Token(TokenType.LBRACE, null, this.line, offset); } if (s.equals("}")) { is.consumeChars(1); return new Token(TokenType.RBRACE, null, this.line, offset); } if (s.equals(";")) { is.consumeChars(1); return new Token(TokenType.OP_SEMIC, null, this.line, offset); } if (s.equals(",")) { is.consumeChars(1); return new Token(TokenType.OP_COMMA, null, this.line, offset); } if (s.equals(".")) { is.consumeChars(1); return new Token(TokenType.OP_DOT, null, this.line, offset); } return null; } @Override public void reset() { is.resetOffset(); line = 1; } }