/******************************************************************************* * * Copyright (c) 2008 Fujitsu Services Ltd. * * Author: Nick Battle * * This file is part of VDMJ. * * VDMJ is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * VDMJ is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with VDMJ. If not, see <http://www.gnu.org/licenses/>. * ******************************************************************************/ package org.overture.parser.lex; import java.io.File; import java.util.List; import java.util.Stack; import java.util.Vector; import org.overture.ast.intf.lex.ILexLocation; import org.overture.ast.lex.Dialect; import org.overture.ast.lex.LexBooleanToken; import org.overture.ast.lex.LexCharacterToken; import org.overture.ast.lex.LexIdentifierToken; import org.overture.ast.lex.LexIntegerToken; import org.overture.ast.lex.LexKeywordToken; import org.overture.ast.lex.LexLocation; import org.overture.ast.lex.LexNameToken; import org.overture.ast.lex.LexQuoteToken; import org.overture.ast.lex.LexRealToken; import org.overture.ast.lex.LexStringToken; import org.overture.ast.lex.LexToken; import org.overture.ast.lex.VDMToken; import org.overture.parser.config.Properties; /** * The main lexical analyser class. */ public class LexTokenReader extends BacktrackInputReader { /** The filename used for console expressions. */ public static String consoleFileName = "console"; /** The current module name, if parsing a module. */ public String currentModule = ""; /** The current file name. */ public final File file; /** The VDM language dialect we're parsing. */ public final Dialect dialect; /** The current line, starting at line 1. */ private int linecount; /** The current character position on the line, starting at 1. */ private int charpos; /** The number of chars read since the last push. */ private int charsread; /** The number of tokens read since the last push. */ private int tokensread; /** The current offset */ private int offset; /** The next character to process. */ private char ch; /** The last token returned. */ private LexToken last = null; /** True if ch is a quoted double quote, ie. \" */ private boolean quotedQuote = false; /** Added to fix location on the traces **/ private ILexLocation location = null; /** * An inner class to hold all the position details that need to be saved and restored on push/pop. */ private class Position { public int lc; public int cc; public int cr; public int tr; public char c; public LexToken l; /** * Create a Position from the outer class' current position details. */ @SuppressWarnings("synthetic-access") public Position() { lc = linecount; cc = charpos; cr = charsread; tr = tokensread; c = ch; l = last; } /** * Set the outer class position details to those contained in this. */ @SuppressWarnings("synthetic-access") public void set() { linecount = lc; charpos = cc; charsread = cr; tokensread = tr; ch = c; last = l; } } /** A stack of Positions for backtracking. */ private Stack<Position> stack = new Stack<Position>(); /** An end of file symbol. */ private static final char EOF = (char) -1; /** The assumed tab stop, for calculating positions. */ public static/* final */int TABSTOP = 4; /** * Create a LexTokenReader for the filename passed. * * @param file * The filename to parse. * @param dialect * if VDM-SL or VDM++ tokens should be processed. */ public LexTokenReader(File file, Dialect dialect) { super(file); this.file = file; this.dialect = dialect; init(); } /** * Create a LexTokenReader for the filename and charset passed. * * @param file * The filename to parse. * @param dialect * if VDM-SL or VDM++ tokens should be processed. * @param charset * The charset for the file. */ public LexTokenReader(File file, Dialect dialect, String charset) { super(file, charset); this.file = file; this.dialect = dialect; init(); } /** * Create a LexTokenReader for the string passed. * * @param expression * The string (expression) to parse. * @param dialect * Parse VDM++ or VDM-SL tokens. */ public LexTokenReader(String expression, Dialect dialect) { super(expression); this.file = new File(consoleFileName); this.dialect = dialect; init(); } /** * Create a LexTokenReader for the string and charset passed. * * @param expression * The string (expression) to parse. * @param dialect * Parse VDM++ or VDM-SL tokens. * @param charset * The charset to use. */ public LexTokenReader(String expression, Dialect dialect, String charset) { super(expression, charset); this.file = new File(consoleFileName); this.dialect = dialect; init(); } /** * Create a LexTokenReader to read content which originates from a file which is not yet saved and enable the source * of the file to be set. This is used in the IDE to provide while typing outline and parse error info. * * @param content * @param dialect * @param file */ public LexTokenReader(String content, Dialect dialect, File file) { super(content); this.file = file; this.dialect = dialect; init(); } /** * Create a LexTokenReader to read content which originates from a file which is not yet saved and enable the source * of the file to be set. This is used in the IDE to provide while typing outline and parse error info. * * @param content * @param dialect * @param file * @param charset * @param streamReaderType */ public LexTokenReader(String content, Dialect dialect, File file, String charset, ReaderType streamReaderType) { super(content, charset, file, streamReaderType); this.file = file; this.dialect = dialect; init(); } /** * Added to fix the location on traces * * @param content * @param dialect * @param location */ public LexTokenReader(String content, Dialect dialect, ILexLocation location) { super(content); this.file = location.getFile(); this.dialect = dialect; this.location = location; init(); } /** * A string representation of the current location. */ @Override public String toString() { return "Last token [" + last + "], last char [" + ch + "] in " + file + ":" + linecount + ":" + charpos; } /** * Initialize the position counters for a new file/string. */ private void init() { linecount = 1; charpos = 0; rdCh(); charsread = 0; tokensread = 0; } /** * Throw a {@link LexException} with the given message and details of the current file and position appended. * * @param number * The error number. * @param msg * The basic error message. * @throws LexException */ private void throwMessage(int number, String msg) throws LexException { throwMessage(number, linecount, charpos, msg); } private void throwMessage(int number, int line, int pos, String msg) throws LexException { throw new LexException(number, msg, new LexLocation(file, currentModule, line, pos, line, pos, -1, -1));// } /** * Check the next character is as expected. If the character is not as expected, throw the message as a * {@link LexException}. * * @param c * The expected next character. * @param number * The number of the error message. * @param message * The error message. * @throws LexException */ private void checkFor(char c, int number, String message) throws LexException { if (ch == c) { rdCh(); } else { throwMessage(number, message); } } /** * @see org.overture.parser.lex.BacktrackInputReader#push() */ @Override public void push() { super.push(); stack.push(new Position()); charsread = 0; tokensread = 0; // Restored on pop } /** * @see org.overture.parser.lex.BacktrackInputReader#pop() */ @Override public void pop() { super.pop(); stack.pop().set(); LexLocation.clearAfter(file, linecount, charpos); } /** * @see org.overture.parser.lex.BacktrackInputReader#unpush() */ @Override public void unpush() { super.unpush(); Position p = stack.pop(); // Note, don't set the position charsread = charsread + p.cr; tokensread = tokensread + p.tr; } /** * Go back to the mark, and re-mark it. This just calls pop() followed by push(). */ public void retry() { pop(); push(); } /** * @return the number of characters read since the last push(). */ public int getCharsRead() { return charsread; } /** * @return the number of tokens read since the last push(). */ public int getTokensRead() { return tokensread; } /** * Read the next character from the stream. The position details are updated, accounting for newlines and tab stops. * The next character is set in the "ch" field, as well as being returned for convenience. * * @return the next character. */ private char rdCh() { char c = super.readCh(); if (c == '\n') { linecount++; charpos = 0; } else if (c == '\t') { charpos += Properties.parser_tabstop - charpos % Properties.parser_tabstop; } else if (c != (char) -1) { charpos++; } ch = c; charsread++; offset = getCurrentRawReadOffset(); // if(ch == '\r') // { // ch = rdCh(); // }else // { // ch = c; // } return ch; } /** * Read a backslash quoted character from the stream. This method is used when parsing strings and individual quoted * characters, which may include things like "\n". * * @return The actual character value (eg. "\n" returns 10). * @throws LexException */ private char rdQuotedCh() throws LexException { quotedQuote = false; char c = rdCh(); if (c == '\\') { rdCh(); switch (ch) { case 'r': ch = '\r'; break; case 'n': ch = '\n'; break; case 't': ch = '\t'; break; case 'f': ch = '\f'; break; case 'e': ch = '\033'; break; case 'a': ch = '\007'; break; case '\'': ch = '\''; break; case '\"': ch = '\"'; quotedQuote = true; break; case '\\': ch = '\\'; break; case 'x': ch = (char) (value(rdCh()) * 16 + value(rdCh())); break; case 'c': ch = (char) (rdCh() - 'A' + 1); // eg. CTRL-A = 1 break; case 'u': ch = (char) (value(rdCh()) * 4096 + value(rdCh()) * 256 + value(rdCh()) * 16 + value(rdCh())); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': ch = (char) (value(ch) * 64 + value(rdCh()) * 8 + value(rdCh())); break; default: throwMessage(1000, "Malformed quoted character"); } } return ch; } /** * Return the value of a character for parsing numbers. The ASCII characters 0-9 are turned into decimal 0-9, while * a-f and A-F are turned into the hex values 10-15. Characters outside these ranges return -1. * * @param c * The ASCII value to convert. * @return The converted value. * @see #rdNumber */ private int value(char c) { switch (c) { case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return c - '0'; case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': return c - 'a' + 10; case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': return c - 'A' + 10; default: return -1; } } /** * Read a number of the given base. Parsing terminates when a character not within the number base is read. * * @param base * The base of the number (eg. 10 for reading decimals) * @return The integer value of the number read. * @throws LexException */ private String rdNumber(int base) throws LexException { StringBuilder v = new StringBuilder(); int n = value(ch); // long v = n; v.append(ch); if (n < 0 || n >= base) { throwMessage(1001, "Invalid char [" + ch + "] in base " + base + " number"); } while (true) { rdCh(); n = value(ch); if (n < 0 || n >= base) { return v.toString(); } // v = (v * base) + n; v.append(ch); } } /** * Read the next complete token from the input stream. Whitespace is skipped, and the start line and position of the * token are noted from the current stream position. Then the next character to process is used to drive a large * switch statement to produce the right {@link VDMToken}. The lastToken field is updated and the result returned. * * @return The next token, or a LexToken of type EOF. * @throws LexException */ public LexToken nextToken() throws LexException { while (Character.isWhitespace(ch)) { rdCh(); } int tokline = linecount; int tokpos = charpos; int tokOffset = offset; VDMToken type = null; last = null; boolean rdch = true; switch (ch) { case EOF: type = VDMToken.EOF; break; case '-': if (rdCh() == '-') { while (ch != '\n' && ch != EOF) { rdCh(); } return nextToken(); } else if (ch == '>') { type = VDMToken.ARROW; } else { rdch = false; type = VDMToken.MINUS; } break; case '+': if (rdCh() == '>') { type = VDMToken.TOTAL_FUNCTION; } else if (ch == '+') { type = VDMToken.PLUSPLUS; } else { rdch = false; type = VDMToken.PLUS; } break; case ':': if (rdCh() == '-') { if (rdCh() == '>') { type = VDMToken.RANGERESBY; } else { rdch = false; type = VDMToken.EQABST; } } else if (ch == '>') { type = VDMToken.RANGERESTO; } else if (ch == '=') { type = VDMToken.ASSIGN; } else if (ch == ':') { type = VDMToken.COLONCOLON; } else { rdch = false; type = VDMToken.COLON; } break; case '|': if (rdCh() == '-') { if (rdCh() == '>') { type = VDMToken.MAPLET; } else { throwMessage(1002, "Expecting '|->'"); } } else if (ch == '|') { type = VDMToken.PIPEPIPE; } else { rdch = false; type = VDMToken.PIPE; } break; case '.': if (rdCh() == '.') { if (rdCh() == '.') { type = VDMToken.RANGE; break; } throwMessage(1003, "Expecting '...'"); } else { rdch = false; type = VDMToken.POINT; } break; case '=': if (rdCh() == '=') { if (rdCh() == '>') { type = VDMToken.OPDEF; } else { rdch = false; type = VDMToken.EQUALSEQUALS; } } else if (ch == '>') { type = VDMToken.IMPLIES; } else { rdch = false; type = VDMToken.EQUALS; } break; case '*': if (rdCh() == '*') { type = VDMToken.STARSTAR; } else { rdch = false; type = VDMToken.TIMES; } break; case '<': push(); if (rdCh() == '=') { unpush(); if (rdCh() == '>') { type = VDMToken.EQUIVALENT; } else { rdch = false; type = VDMToken.LE; } } else if (ch == ':') { unpush(); type = VDMToken.DOMRESTO; } else if (ch == '-') { unpush(); if (rdCh() == ':') { type = VDMToken.DOMRESBY; } else { throwMessage(1004, "Expecting '<-:'"); } } else if (ch == '>') { unpush(); type = VDMToken.NE; } else if (startOfName(ch)) { // <QuoteLiteral> or <x String name = rdIdentifier(); if (ch == '>') { unpush(); last = new LexQuoteToken(name, new LexLocation(file, currentModule, tokline, tokpos, linecount, charpos + 1, tokOffset, offset + 1)); // location(tokline, tokpos, tokOffset, offset)); type = VDMToken.QUOTE; } else { pop(); type = VDMToken.LT; } } else { unpush(); rdch = false; type = VDMToken.LT; } break; case '>': if (rdCh() == '=') { type = VDMToken.GE; } else { rdch = false; type = VDMToken.GT; } break; case '"': rdQuotedCh(); StringBuffer msg = new StringBuffer(); while ((ch != '"' || quotedQuote == true) && ch != EOF) { msg.append(ch); rdQuotedCh(); } checkFor('\"', 1005, "Expecting close double quote"); last = new LexStringToken(msg.toString(), location(tokline, tokpos, tokOffset, offset)); rdch = false; break; case '\'': last = new LexCharacterToken(rdQuotedCh(), location(tokline, tokpos, tokOffset, offset)); rdCh(); checkFor('\'', 1006, "Expecting close quote after character"); rdch = false; break; case '#': if (Character.isLetter(rdCh())) { StringBuilder tag = new StringBuilder(); tag.append('#'); do { tag.append(ch); rdCh(); } while (Character.isLetter(ch)); type = VDMToken.lookup(tag.toString(), dialect); if (type == null) { throwMessage(1007, "Unexpected tag after '#'"); } rdch = false; } else { type = VDMToken.HASH; rdch = false; } break; case '/': if (rdCh() == '*') // Block comments { while (ch != '/' && ch != EOF) { while (ch != '*' && ch != EOF) { rdCh(); } if (ch == EOF) { throwMessage(1011, tokline, tokpos, "Unterminated block comment"); } rdCh(); } rdCh(); return nextToken(); } else { type = VDMToken.DIVIDE; rdch = false; } break; case ',': type = VDMToken.COMMA; break; case ';': type = VDMToken.SEMICOLON; break; case '?': type = VDMToken.QMARK; break; case '@': type = VDMToken.AT; break; case '&': type = VDMToken.AMPERSAND; break; case '^': type = VDMToken.CONCATENATE; break; case '(': type = VDMToken.BRA; break; case ')': type = VDMToken.KET; break; case '{': type = VDMToken.SET_OPEN; break; case '}': type = VDMToken.SET_CLOSE; break; case '[': type = VDMToken.SEQ_OPEN; break; case ']': type = VDMToken.SEQ_CLOSE; break; case '\\': type = VDMToken.SETDIFF; break; case '0': push(); rdCh(); if (ch == 'x' || ch == 'X') { unpush(); rdCh(); String decimal = String.valueOf(Long.parseLong(rdNumber(16), 16)); last = new LexIntegerToken(decimal, location(tokline, tokpos, tokOffset, offset)); } else { pop(); last = rdReal(tokline, tokpos, tokOffset); } rdch = false; break; default: if (ch >= '0' && ch <= '9') { last = rdReal(tokline, tokpos, tokOffset); rdch = false; } else if (startOfName(ch)) { List<String> name = rdName(); // module`name parts rdch = false; switch (name.size()) { case 1: type = VDMToken.lookup(name.get(0), dialect); if (type == null) { last = new LexIdentifierToken(name.get(0), ch == '~', location(tokline, tokpos, tokOffset, offset)); rdch = ch == '~'; } else { switch (type) { case TRUE: case FALSE: last = new LexBooleanToken(type, location(tokline, tokpos, tokOffset, offset)); break; default: last = new LexKeywordToken(type, location(tokline, tokpos, tokOffset, offset)); break; } } break; case 2: last = new LexNameToken(name.get(0), name.get(1), location(tokline, tokpos, tokOffset, offset), false, true); break; default: throwMessage(1008, "Malformed module`name"); } } else { char badch = ch; rdCh(); throwMessage(1009, "Unexpected character '" + badch + "' (code 0x" + Integer.toHexString(badch) + ")"); } break; } if (rdch) { rdCh(); } if (last == null) { last = new LexKeywordToken(type, location(tokline, tokpos, tokOffset, offset)); } tokensread++; return last; } /** * Create a {@link LexLocation} object from the current stream position. * * @param tokline * The token start line. * @param tokpos * The token start position. * @param endOffset * @return A new LexLocation. */ private ILexLocation location(int tokline, int tokpos, int startOffset, int endOffset) { // Fix for location on traces if (this.location != null) { return this.location; } else { return new LexLocation(file, currentModule, tokline, tokpos, linecount, charpos, startOffset, endOffset); } } /** * Read a decimal floating point number. * * @param tokline * The start line of the number. * @param tokpos * The start position of the number. * @param tokOffset * @return Either a LexRealToken or a LexIntegerToken. * @throws LexException */ private LexToken rdReal(int tokline, int tokpos, int tokOffset) throws LexException { String floatSyntax = "Expecting <digits>[.<digits>][e<+-><digits>]"; String value = rdNumber(10); String fraction = null; String exponent = null; boolean negative = false; push(); if (ch == '.') { rdCh(); if (ch >= '0' && ch <= '9') { fraction = rdNumber(10); exponent = "0"; } else { // Somthing like rec.#1.field, so just return the integer pop(); return new LexIntegerToken(value, location(tokline, tokpos, tokOffset, offset)); } } unpush(); if (ch == 'e' || ch == 'E') { if (fraction == null) { fraction = "0"; } switch (rdCh()) { case '+': { rdCh(); exponent = rdNumber(10); break; } case '-': { rdCh(); exponent = rdNumber(10); negative = true; break; } case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { exponent = rdNumber(10); break; } default: throwMessage(1010, floatSyntax); } } if (fraction != null) { String real = "+" + value + "." + fraction + "e" + (negative ? "-" : "+") + exponent; return new LexRealToken(real, location(tokline, tokpos, tokOffset, offset)); } return new LexIntegerToken(value, location(tokline, tokpos, tokOffset, offset)); } /** * Get the last token returned from the reader. * * @return The last token, or the first token if none read yet. * @throws LexException */ public LexToken getLast() throws LexException { if (last == null) { nextToken(); } return last; } /** * @return True if the character passed can be the start of a variable name. */ private boolean startOfName(char c) { if (c < 0x0100) { return Character.isLetter(c) || c == '$'; } else { switch (Character.getType(c)) { case Character.CONTROL: case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR: case Character.SPACE_SEPARATOR: case Character.SURROGATE: case Character.UNASSIGNED: case Character.DECIMAL_DIGIT_NUMBER: case Character.CONNECTOR_PUNCTUATION: return false; default: return true; } } } /** * @return True if the character passed can be part of a variable name. */ private boolean restOfName(char c) { if (c < 0x0100) { return Character.isLetterOrDigit(c) || c == '$' || c == '_' || c == '\''; } else { switch (Character.getType(c)) { case Character.CONTROL: case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR: case Character.SPACE_SEPARATOR: case Character.SURROGATE: case Character.UNASSIGNED: return false; default: return true; } } } /** * Read a fully qualified module`name. * * @return A list of one or two name parts. */ private List<String> rdName() { List<String> names = new Vector<String>(); names.add(rdIdentifier()); if (ch == '`') { if (startOfName(rdCh())) { names.add(rdIdentifier()); } } if (names.size() == 2) { // We have the strange mk_Mod`name case... String first = names.get(0); if (first.startsWith("mk_") || first.startsWith("is_")) { List<String> one = new Vector<String>(); one.add(first + "`" + names.get(1)); names = one; } } return names; } /** * Read a simple identifier without a module name prefix. * * @return a simple name. */ private String rdIdentifier() { StringBuilder id = new StringBuilder(); id.append(ch); while (restOfName(rdCh())) { id.append(ch); } return id.toString(); } }