/******************************************************************************** * * * (c) Copyright 2010 Verizon Communications USA and The Open University UK * * * * This software is freely distributed in accordance with * * the GNU Lesser General Public (LGPL) license, version 3 or later * * as published by the Free Software Foundation. * * For details see LGPL: http://www.fsf.org/licensing/licenses/lgpl.html * * and GPL: http://www.fsf.org/licensing/licenses/gpl-3.0.html * * * * This software is provided by the copyright holders and contributors "as is" * * and any express or implied warranties, including, but not limited to, the * * implied warranties of merchantability and fitness for a particular purpose * * are disclaimed. In no event shall the copyright owner or contributors be * * liable for any direct, indirect, incidental, special, exemplary, or * * consequential damages (including, but not limited to, procurement of * * substitute goods or services; loss of use, data, or profits; or business * * interruption) however caused and on any theory of liability, whether in * * contract, strict liability, or tort (including negligence or otherwise) * * arising in any way out of the use of this software, even if advised of the * * possibility of such damage. * * * ********************************************************************************/ package com.compendium.io.questmap; import java.io.FileInputStream; import java.io.IOException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import com.compendium.ProjectCompendium; /** * class Lexer * * Description: * Lexer is a lexical analyser for Brahms models. The lexer is * used by the Yacc parser to retrieve tokens from the input. * The lexer skips white spaces. The lexer can return the * following tokens: * INT token for an integer * STRING token for a string starting with '"' and ending with '"' * ID token for identifiers * num token for keywords and single character symbols (like ';') * each token type has a unique code associated with it. Each keyword and * character symbol is also uniquely identified with a code. The codes for * keywords and characters symbols are loaded in the tables KeywordTable * and CharSymbolTable. The token codes are loaded in the table TokenTable. * * @see com.compendium.io.questmap.TokenTable * @see com.compendium.io.questmap.KeywordTable * @see com.compendium.io.questmap.CharSymbolTable * @see com.compendium.io.questmap.TokenTable * @author Ron van Hoof */ public class Lexer { static final Logger log = LoggerFactory.getLogger(Lexer.class); // constants protected static final int EOF_CHAR = -1; protected static final int EOF = 0; protected static final int ERROR = -1; // attributes protected TokenTable tokens = null; // tokens INT, DOUBLE, ID, STRING, BOOLEAN, RELOP protected KeywordTable keywords = null; // all keywords protected CharSymbolTable charSymbols = null; // character symbols ('(', ')', etc) protected String file = ""; // file to be analysed protected FileInputStream input = null; protected int nextChar = EOF_CHAR; // first lookahead character protected int nextChar2 = EOF_CHAR; // second lookahead character protected int line = 1; // line currently analysed protected int pos = 1; // character position in line protected String yytext = ""; // The last token read protected int yyleng = 0; // the length of yytext /** * Lexer(TokenTable tkns, * KeywordTable kt, * CharSymbolTable cst, * String fileName, * ErrorLog el) throws IOException * * Description: * Creates a new lexical analyser to analyse the given file using * the given tables. Errors will be added to the given ErrorLog. */ public Lexer( TokenTable tkns, KeywordTable kt, CharSymbolTable cst, String fileName) throws IOException { // initialize attributes tokens = tkns; keywords = kt; charSymbols = cst; file = fileName; // initialize the file init(); } // Lexer public void close() { try { input.close(); } catch (IOException e) { ProjectCompendium.APP.displayError( "Error closing file: "+file + "." + e.getMessage()); } // end try } // close /** * void init() throws IOException * * Description: * Opens the file for analysis and reads the lookahead characters */ protected void init() throws IOException { // open the file to analyse input = new FileInputStream(file); // read two lookahead characters nextChar = input.read(); if (nextChar == EOF_CHAR) { nextChar2 = EOF_CHAR; } else { nextChar2 = input.read(); } // end if } // init /** * void advance() throws IOException * * Description: * Reads the next character from the input file and updates * the position counters for the line and character position */ protected void advance() throws IOException { int curChar; // update the two lookahead characters, the second lookahead will // contain the new character. curChar = nextChar; nextChar = nextChar2; if (nextChar == EOF_CHAR) { nextChar2 = EOF_CHAR; close(); } else { nextChar2 = input.read(); } // end if // set the file position pos++; if (curChar == '\n') { line++; pos = 1; // log.addMessage(new Message("Parser line: "+line)); } // end if } // advance /** * int getLine() * * Description: * Returns the line number of the line being analysed. */ public int getLine() { return line; } // getLine /** * int getPos() * * Description: * Returns the character position in the line being analysed. */ public int getPos() { return pos; } // getPos public String getYYText() { return yytext; } // getYYText public int getYYLeng() { return yyleng; } // getYYLeng /** * int nextToken() throws IOException * * Description: * Returns the next token in the input. This method skips white spaces * and comments and can return one of the following tokens: * INT * DOUBLE * BOOLEAN * STRING * ID * num // representing a keyword or character */ public int nextToken(Union yylval) throws IOException { int symNum; // symbol number for(;;) { // skip white spaces if (isWhiteSpace(nextChar)) { skipWhiteSpace(); } // end if // check for string if (nextChar == '"') { return readString(yylval); } // end if // check for numbers (unsigned, integer, float) if (isDigit(nextChar) || ((nextChar == '+' || nextChar == '-' || nextChar == '.') && isDigit(nextChar2))) { return readNumber(yylval); } // end if // check for a single character symbol symNum = charSymbols.isCharSymbol(nextChar); if (symNum != -1) { // it is a character symbol, advance and return token yytext = (new Character((char)nextChar)).toString(); yyleng = 1; advance(); return symNum; } // end if // check for keyword, identifier if (isLetter(nextChar)) { return readIdentifier(yylval); } // end if // check for EOF if (nextChar == EOF_CHAR) { return EOF; } // end if ProjectCompendium.APP.displayError("Error reading file: " + file + " (Cant Recognize Format)", "File Import.. "); // unidentified character, generate error and ignore it log.info("Unidentified character '" + new Character((char)nextChar) + "'(" + nextChar + ")" + " line:" + line + " pos:" + pos + " file:" + file); advance(); } // end for } // nextToken /** * boolean isWhiteSpace(int ch) * * Description: * Returns whether or not the given character is a white space, meaning * one of space, linefeed, carriage return, newline, or tab */ protected boolean isWhiteSpace(int ch) { return (ch == ' ' || ch == '\n' || ch == '\f' || ch == '\t' || ch == '\r'); } // isWhiteSpace /** * boolean isLetter(int ch) * * Description: * Returns whether or not the given character is a letter, meaning * one of 'a'-'z', 'A'-'Z', or '_' */ protected boolean isLetter(int ch) { return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch == '_'); } // isLetter /** * boolean isDigit(int ch) * * Description: * Returns whether or not the given character is a digit, meaning * one of '0'-'9'. */ protected boolean isDigit(int ch) { return (ch >= '0' && ch <= '9'); } // isDigit /** * boolean isIdChar(int ch) * * Description: * Returns whether or not the given character is allowed to be * in an identifier or keyword. Id characters can be a * letter (see isLetter), digit (see isDigit), or '-' * * @see #isLetter * @see #isDigit */ protected boolean isIdChar(int ch) { return isLetter(ch) || isDigit(ch) || ch == '-'; } // isIdChar /** * void skipWhiteSpace() throws IOException * * Description: * Reads and skips all white space characters until a non * white space character is read. */ protected void skipWhiteSpace() throws IOException { // at entrance of this method nextChar == <whitespace> // read all whitespace do { advance(); } while (isWhiteSpace(nextChar)); } // skipWhiteSpace /** * int readString() throws IOException * * Description: * Reads a string starting and ending with '"'. Returns a * STRING token if it succeeded. It can return a token * representing an EOF when the end of file is read and no * closing '"' is found. */ protected int readString(Union yylval) throws IOException { StringBuffer str = new StringBuffer(); int startLine; int startPos; // read past the string opener '"' advance(); startLine = line; startPos = pos; // store the string characters until the end of string marker while (!(nextChar == '"')) { // if we read an EOF, then no end of string, error if (nextChar == EOF_CHAR) { log.info("String not terminated at end of file" + " line:" + startLine + " pos:" + startPos + " file: " + file); yylval.sval = ""; return EOF; } // end if // check if we are dealing a double quote that is part of the string '\"' if (nextChar == '\\' && nextChar2 == '"') { str.append((char)nextChar); str.append((char)nextChar2); advance(); advance(); } else { // valid character, append to string, read next str.append((char)nextChar); advance(); } // end if } // end while // read past the string closer '"' advance(); // return the string token yylval.sval = str.toString(); yytext = str.toString(); yyleng = yytext.length(); return tokens.getCode("STRING"); } // readString /** * int readNumber() throws IOException * * Description: * Reads a number. Must be an integer. * integer ::= {+|-}[digit]+ * Returns either an INT token or a token representing an * error (ERROR). */ protected int readNumber(Union yylval) throws IOException { StringBuffer num = new StringBuffer(); // at entrance nextChar is one of <digit>, '+', '-', or '.' // first read the '+' or '-' sign if one exists if (nextChar == '+' || nextChar == '-') { num.append((char)nextChar); advance(); } // end if // read digits if there are any while (isDigit(nextChar)) { num.append((char)nextChar); advance(); } // end while // now we'll check if a white space follows, if not we // have an error in the number. We'll read the characters // till a whitespace or EOF is encountered. if (isWhiteSpace(nextChar) || nextChar == EOF_CHAR || charSymbols.isCharSymbol(nextChar) >= 0) { // return int token yytext = num.toString(); yyleng = yytext.length(); yylval.ival = (new Integer(num.toString())).intValue(); return tokens.getCode("INT"); } else { // error, other characters found in number log.info("Invalid character in integer" + " line:" + line + " pos:" + pos + " file:" + file); // skip invalid characters while (!isWhiteSpace(nextChar) && nextChar != EOF_CHAR && charSymbols.isCharSymbol(nextChar) == -1) { advance(); } // end while return ERROR; } // end if } // readNumber /** * int readIdentifier() throws IOException * * Description: * Reads an identifier, which can either be a keyword or * a real identifier. * identifier ::= [letter][letter|digit|'-']* * Returns either an ID token (in case of an identifier) or an integer * representing the keyword. */ protected int readIdentifier(Union yylval) throws IOException { StringBuffer str = new StringBuffer(); String id; int keywordNum; // at entrance of this method nextChar is a letter str.append((char)nextChar); advance(); // read identifier characters while(isIdChar(nextChar)) { str.append((char)nextChar); advance(); } // end while // check if identifier is a keyword id = str.toString(); keywordNum = keywords.isKeyword(id); if (keywordNum >= 0) { // keyword yytext = id; yyleng = id.length(); return keywordNum; } else { // is an identifier yytext = id; yyleng = id.length(); yylval.sval = id; return tokens.getCode("ID"); } // end if } // readIdentifier }