/*************************************************************************** * Copyright (C) 2006-2012 by Fabrizio Montesi <famontesi@gmail.com> * * * * This program is free software; you can redistribute it and/or modify * * it under the terms of the GNU Library General Public License as * * published by the Free Software Foundation; either version 2 of the * * License, or (at your option) any later version. * * * * This program is distributed in the hope that it will be useful, * * but WITHOUT ANY WARRANTY; without even the implied warranty of * * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * * GNU General Public License for more details. * * * * You should have received a copy of the GNU Library General Public * * License along with this program; if not, write to the * * Free Software Foundation, Inc., * * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * * * * For details about the authors of this software, see the AUTHORS file. * ***************************************************************************/ package jolie.lang.parse; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.net.URI; import java.util.HashMap; import java.util.Map; import jolie.lang.NativeType; /** * Scanner implementation for the Jolie language parser. * * @author Fabrizio Montesi * */ public class Scanner { /** Token types */ public enum TokenType { EOF, ///< End Of File ID, ///< [a-z][a-zA-Z0-9]* COMMA, ///< , DOT, ///< . INT, ///< [0-9]+ BYTE, ///< [0-9]+B TRUE, ///< true FALSE, ///< false LONG, ///< [0-9]+L INT16, ///< [0-9]+S UINT16, ///< [0-9]+US UINT32, ///< [0-9]+U UINT64, ///< [0-9]+UL DOUBLE, ///< [0-9]*"."[0-9]+(e|E)[0-9]+ LPAREN, ///< ( RPAREN, ///< ) LSQUARE, ///< [ RSQUARE, ///< ] LCURLY, ///< { RCURLY, ///< } //DOLLAR, ///< $ STRING, ///< "[[:graph:]]*" INCREMENT, ///< ++ MINUS, ///< The minus sign - ASTERISK, ///< * DIVIDE, ///< / ASSIGN, ///< = PLUS, ///< + ADD_ASSIGN, ///< += MINUS_ASSIGN, ///< -= MULTIPLY_ASSIGN, ///< *= DIVIDE_ASSIGN, ///< %= SEQUENCE, ///< ; IF, ///< if ELSE, ///< else LANGLE, ///< < RANGLE, ///< > AT, ///< @ LINKIN, ///< linkIn LINKOUT, ///< linkOut INSTANCE_OF, ///< instanceof EQUAL, ///< == AND, ///< && OR, ///< || PARALLEL, ///< | NOT, ///< ! CARET, ///< ^ COLON, ///< : OP_OW, ///< OneWay OP_RR, ///< RequestResponse DEFINE, ///< define MAJOR_OR_EQUAL, ///< >= MINOR_OR_EQUAL, ///< <= NOT_EQUAL, ///< != NULL_PROCESS, ///< nullProcess WHILE, ///< while EXECUTION, ///< execution THROW, ///< throw DOCUMENTATION_COMMENT, INSTALL, ///< install SCOPE, ///< scope SPAWN, ///< spawn THIS, ///< this COMPENSATE, ///< comp EXIT, ///< exit INCLUDE, ///< include CONSTANTS, ///< constants POINTS_TO, ///< -> QUESTION_MARK, ///< ? ARROW, ///< => DEEP_COPY_LEFT, ///< << RUN, ///< run UNDEF, ///< undef HASH, ///< # PERCENT_SIGN, ///< % FOR, ///< for FOREACH, ///< foreach WITH, ///< with DECREMENT, ///< -- IS_STRING, ///< is_string IS_INT, ///< is_int IS_DOUBLE, ///< is_double IS_BOOL, ///< is_bool IS_LONG, ///< is_long IS_DEFINED, ///< is_defined CAST_INT, ///< int CAST_STRING, ///< string CAST_DOUBLE, ///< double CAST_BOOL, ///< bool CAST_LONG, ///< long SYNCHRONIZED, ///< synchronized THROWS, ///< throws CURRENT_HANDLER, ///< cH INIT, ///< init ERROR ///< Scanner error } /* * Map of unreserved keywords, * which can be considered as IDs in certain places (e.g. variables). */ private static final Map< String, TokenType > unreservedKeywords = new HashMap< String, TokenType >(); static { // Initialise the unreserved keywords map. unreservedKeywords.put( "OneWay", TokenType.OP_OW ); unreservedKeywords.put( "RequestResponse", TokenType.OP_RR ); unreservedKeywords.put( "linkIn", TokenType.LINKIN ); unreservedKeywords.put( "linkOut", TokenType.LINKOUT ); unreservedKeywords.put( "if", TokenType.IF ); unreservedKeywords.put( "else", TokenType.ELSE ); unreservedKeywords.put( "include", TokenType.INCLUDE ); unreservedKeywords.put( "define", TokenType.DEFINE ); unreservedKeywords.put( "nullProcess", TokenType.NULL_PROCESS ); unreservedKeywords.put( "while", TokenType.WHILE ); unreservedKeywords.put( "execution", TokenType.EXECUTION ); unreservedKeywords.put( "install", TokenType.INSTALL ); unreservedKeywords.put( "this", TokenType.THIS ); unreservedKeywords.put( "synchronized", TokenType.SYNCHRONIZED ); unreservedKeywords.put( "throw", TokenType.THROW ); unreservedKeywords.put( "scope", TokenType.SCOPE ); unreservedKeywords.put( "spawn", TokenType.SPAWN ); unreservedKeywords.put( "comp", TokenType.COMPENSATE ); unreservedKeywords.put( "exit", TokenType.EXIT ); unreservedKeywords.put( "constants", TokenType.CONSTANTS ); unreservedKeywords.put( "undef", TokenType.UNDEF ); unreservedKeywords.put( "for", TokenType.FOR ); unreservedKeywords.put( "foreach", TokenType.FOREACH ); unreservedKeywords.put( "is_defined", TokenType.IS_DEFINED ); unreservedKeywords.put( "is_string", TokenType.IS_STRING ); unreservedKeywords.put( "is_int", TokenType.IS_INT ); unreservedKeywords.put( "is_bool", TokenType.IS_BOOL ); unreservedKeywords.put( "is_long", TokenType.IS_LONG ); unreservedKeywords.put( "is_double", TokenType.IS_DOUBLE ); unreservedKeywords.put( "instanceof", TokenType.INSTANCE_OF ); unreservedKeywords.put( NativeType.INT.id(), TokenType.CAST_INT ); unreservedKeywords.put( NativeType.STRING.id(), TokenType.CAST_STRING ); unreservedKeywords.put( NativeType.BOOL.id(), TokenType.CAST_BOOL ); unreservedKeywords.put( NativeType.DOUBLE.id(), TokenType.CAST_DOUBLE ); unreservedKeywords.put( "throws", TokenType.THROWS ); unreservedKeywords.put( "cH", TokenType.CURRENT_HANDLER ); unreservedKeywords.put( "init", TokenType.INIT ); unreservedKeywords.put( "with", TokenType.WITH ); unreservedKeywords.put( "true", TokenType.TRUE ); unreservedKeywords.put( "false", TokenType.FALSE ); } /** * This class represents an input token read by the Scanner class. * * @see Scanner * @author Fabrizio Montesi * @version 1.0 * */ public static class Token { private final TokenType type; private final String content; private final boolean isUnreservedKeyword; /** * Constructor. The content of the token will be set to "". * @param type the type of this token */ public Token( TokenType type ) { this.type = type; this.content = ""; this.isUnreservedKeyword = false; } /** * Constructor. * @param type the type of this token * @param content the content of this token */ public Token( TokenType type, String content ) { this.type = type; this.content = content; this.isUnreservedKeyword = false; } /** * Constructor. * @param type the type of this token * @param content the content of this token * @param isUnreservedKeyword specifies whether this token is an unreserved keyword */ public Token( TokenType type, String content, boolean isUnreservedKeyword ) { this.type = type; this.content = content; this.isUnreservedKeyword = isUnreservedKeyword; } /** * Returns the content of this token. * @return the content of this token */ public String content() { return content; } /** * Returns the type of this token. * @return the type of this token */ public TokenType type() { return type; } /** * Returns <code>true</code> if this token can be considered as a valid * value for a constant, <code>false</code> otherwise. * @return <code>true</code> if this token can be considered as a valid * value for a constant, <code>false</code> otherwise */ public boolean isValidConstant() { return type == TokenType.STRING || type == TokenType.INT || type == TokenType.ID || type == TokenType.LONG || type == TokenType.TRUE || type == TokenType.FALSE || type == TokenType.DOUBLE; } /** * Equivalent to <code>is(TokenType.EOF)</code> * @return <code>true</code> if this token has type <code>TokenType.EOF</code>, false otherwise */ public boolean isEOF() { return ( type == TokenType.EOF ); } /** * Returns <code>true</code> if this token has the passed type, <code>false</code> otherwise. * @param compareType the type to compare the type of this token with * @return <code>true</code> if this token has the passed type, <code>false</code> otherwise */ public boolean is( TokenType compareType ) { return ( type == compareType ); } /** * Returns <code>true</code> if this token has a different type from the passed one, <code>false</code> otherwise. * @param compareType the type to compare the type of this token with * @return <code>true</code> if this token has a different type from the passed one, <code>false</code> otherwise */ public boolean isNot( TokenType compareType ) { return ( type != compareType ); } /** * Returns <code>true</code> if this token has type <code>TokenType.ID</code> * and its content is equal to the passed parameter, <code>false</code> otherwise. * @param keyword the keyword to check the content of this token against * @return <code>true</code> if this token has type <code>TokenType.ID</code> * and its content is equal to the passed parameter, <code>false</code> otherwise */ public boolean isKeyword( String keyword ) { return( type == TokenType.ID && content.equals( keyword ) ); } /** * Returns <code>true</code> if this token has type <code>TokenType.ID</code> * or is a token for an unreserved keyword, <code>false</code> otherwise. * @return <code>true</code> if this token has type <code>TokenType.ID</code> * or is a token for an unreserved keyword, <code>false</code> otherwise. */ public boolean isIdentifier() { return( type == TokenType.ID || isUnreservedKeyword ); } /** * This method behaves as {@link #isKeyword(java.lang.String) isKeyword}, except that * it is case insensitive. * @param keyword the keyword to check the content of this token against * @return */ public boolean isKeywordIgnoreCase( String keyword ) { return( type == TokenType.ID && content.equalsIgnoreCase( keyword ) ); } } private final InputStream stream; // input stream private final Reader reader; // data input protected char ch; // current character protected int currInt; // current stream int protected int state; // current state private int line; // current line private final URI source; // source name /** * Constructor * @param stream the <code>InputStream</code> to use for input reading * @param sourceName an arbitrary name * @throws java.io.IOException if the input reading initialization fails */ public Scanner( InputStream stream, URI source ) throws IOException { this.stream = stream; this.reader = new InputStreamReader( stream ); this.source = source; line = 1; readChar(); } public String readWord() throws IOException { return readWord( true ); } public String readWord( boolean readChar ) throws IOException { StringBuilder buffer = new StringBuilder(); if ( readChar ) { readChar(); } do { buffer.append( ch ); readChar(); } while( !isSeparator( ch ) ); return buffer.toString(); } public String readLine() throws IOException { StringBuilder buffer = new StringBuilder(); readChar(); while( !isNewLineChar( ch ) ) { buffer.append( ch ); readChar(); } return buffer.toString(); } public static String addStringTerminator( String str ) { return str + -1; } public InputStream inputStream() { return stream; } /** * Returns the current line the scanner is reading. * * @return the current line the scanner is reading. */ public int line() { return line; } /** * Returns the source URI the scanner is reading. * * @return the source URI the scanner is reading */ public URI source() { return source; } public String sourceName() { return source.getSchemeSpecificPart(); } /** * Eats all separators (whitespace) until the next input. * @throws IOException */ public void eatSeparators() throws IOException { while( isSeparator( ch ) ) { readChar(); } } public void eatSeparatorsUntilEOF() throws IOException { while( isSeparator( ch ) && stream.available() > 0 ) { readChar(); } } /** * Checks whether a character is a separator (whitespace). * @param c the character to check as a whitespace * @return <code>true</code> if <code>c</code> is a separator (whitespace) */ public static boolean isSeparator( int c ) { return isNewLineChar( c ) || c == '\t' || c == ' '; } /** * Checks whether a character is a newline character. * @param c the character to check * @return <code>true</code> if <code>c</code> is a newline character */ public static boolean isNewLineChar( int c ) { return ( c == '\n' || c == '\r' ); } /** * Reads the next character and loads it into the scanner local state. * @throws IOException if the source cannot be read */ public final void readChar() throws IOException { currInt = reader.read(); ch = (char)currInt; if ( ch == '\n' ) { line++; } } /** * Returns the current character in the scanner local state. * @return the current character in the scanner local state */ public char currentCharacter() { return ch; } /** * Consumes characters from the source text and returns its corresponding token. * @return the token corresponding to the consumed characters * @throws IOException if not enough characters can be read from the source */ public Token getToken() throws IOException { boolean keepRun = true; state = 1; while ( currInt != -1 && isSeparator( ch ) ) { readChar(); } if ( currInt == -1 ) { return new Token( TokenType.EOF ); } boolean stopOneChar = false; Token retval = null; StringBuilder builder = new StringBuilder(); while ( keepRun ) { if ( currInt == -1 && retval == null ) { keepRun = false; // We *need* a token at this point } switch( state ) { /* When considering multi-characters tokens (states > 1), * remember to read another character in case of a * specific character (==) check. */ case 1: // First character if ( Character.isLetter( ch ) || ch == '_' ) { state = 2; } else if ( Character.isDigit( ch ) ) { state = 3; } else if ( ch == '"' ) { state = 4; } else if ( ch == '+' ) { state = 5; } else if ( ch == '*' ) { state = 23; } else if ( ch == '=' ) { state = 6; } else if ( ch == '|' ) { state = 7; } else if ( ch == '&' ) { state = 8; } else if ( ch == '<' ) { state = 9; } else if ( ch == '>' ) { state = 10; } else if ( ch == '!' ) { state = 11; } else if ( ch == '/' ) { state = 12; } else if ( ch == '-' ) { state = 14; } else if ( ch == '.') { // DOT or REAL state = 16; } else { // ONE CHARACTER TOKEN if ( ch == '(' ) { retval = new Token( TokenType.LPAREN ); } else if ( ch == ')' ) { retval = new Token( TokenType.RPAREN ); } else if ( ch == '[' ) { retval = new Token( TokenType.LSQUARE ); } else if ( ch == ']' ) { retval = new Token( TokenType.RSQUARE ); } else if ( ch == '{' ) { retval = new Token( TokenType.LCURLY ); } else if ( ch == '}' ) { retval = new Token( TokenType.RCURLY ); } else if ( ch == '@' ) { retval = new Token( TokenType.AT ); } else if ( ch == ':' ) { retval = new Token( TokenType.COLON ); } else if ( ch == ',' ) { retval = new Token( TokenType.COMMA ); } else if ( ch == ';' ) { retval = new Token( TokenType.SEQUENCE ); } else if ( ch == '%' ) { retval = new Token( TokenType.PERCENT_SIGN ); //else if ( ch == '.' ) //retval = new Token( TokenType.DOT ); } else if ( ch == '#' ) { retval = new Token( TokenType.HASH ); } else if ( ch == '^' ) { retval = new Token( TokenType.CARET ); } else if ( ch == '?' ) { retval = new Token( TokenType.QUESTION_MARK ); } /*else if ( ch == '$' ) retval = new Token( TokenType.DOLLAR );*/ readChar(); } break; case 2: // ID (or unreserved keyword) if ( !Character.isLetterOrDigit( ch ) && ch != '_' ) { String str = builder.toString(); TokenType tt = unreservedKeywords.get( str ); if ( tt != null ) { // It is an unreserved keyword retval = new Token( tt, str, true ); } else { // It is a normal ID, not corresponding to any keyword retval = new Token( TokenType.ID, str ); } } break; case 3: // INT (or LONG, or DOUBLE or UIN32 or UINT16 or INT16 or BYTE or UINT64) if ( ch == 'e'|| ch == 'E' ){ state = 19; } else if ( !Character.isDigit( ch ) && ch != '.' ) { if ( ch == 'L' ) { retval = new Token( TokenType.LONG, builder.toString() ); readChar(); } else if ( ch == 'U'){ readChar(); if(ch == 'L'){ retval = new Token(TokenType.UINT64, builder.toString() ); readChar(); } else if (ch == 'S'){ retval = new Token(TokenType.UINT16, builder.toString() ); readChar(); } else { retval = new Token( TokenType.UINT32, builder.toString() ); } } else if ( ch == 'S' ) { retval = new Token( TokenType.INT16, builder.toString() ); readChar(); } else if ( ch == 'B'){ retval = new Token ( TokenType.BYTE, builder.toString() ); readChar(); } else { retval = new Token( TokenType.INT, builder.toString() ); } } else if ( ch == '.' ) { builder.append( ch ); readChar(); if ( !Character.isDigit( ch ) ) { retval = new Token( TokenType.ERROR, builder.toString() ); } else state = 17; // recognized a DOUBLE } break; case 4: // STRING if ( ch == '"' ) { retval = new Token( TokenType.STRING, builder.toString().substring( 1 ) ); readChar(); } else if ( ch == '\\' ) { // Parse special characters readChar(); if ( ch == '\\' ) builder.append( '\\' ); else if ( ch == 'n' ) builder.append( '\n' ); else if ( ch == 't' ) builder.append( '\t' ); else if ( ch == 'r' ) builder.append( '\r' ); else if ( ch == '"' ) builder.append( '"' ); else if ( ch == 'u' ) builder.append( 'u' ); else throw new IOException( "malformed string: bad \\ usage" ); stopOneChar = true; readChar(); } break; case 5: // PLUS OR CHOICE if ( ch == '=' ) { retval = new Token( TokenType.ADD_ASSIGN ); readChar(); } else if ( ch == '+' ) { retval = new Token( TokenType.INCREMENT ); readChar(); } else { retval = new Token( TokenType.PLUS ); } break; case 23: // MULTIPLY or MULTIPLY_ASSIGN if ( ch == '=' ) { retval = new Token( TokenType.MULTIPLY_ASSIGN ); readChar(); } else { retval = new Token( TokenType.ASTERISK, "*" ); } break; case 6: // ASSIGN OR EQUAL if ( ch == '=' ) { retval = new Token( TokenType.EQUAL ); readChar(); } else if ( ch == '>' ) { retval = new Token( TokenType.ARROW ); readChar(); } else retval = new Token( TokenType.ASSIGN ); break; case 7: // PARALLEL OR LOGICAL OR if ( ch == '|' ) { retval = new Token( TokenType.OR ); readChar(); } else retval = new Token( TokenType.PARALLEL ); break; case 8: // LOGICAL AND if ( ch == '&' ) { retval = new Token( TokenType.AND ); readChar(); } break; case 9: // LANGLE OR MINOR_OR_EQUAL OR DEEP_COPY_LEFT if ( ch == '=' ) { retval = new Token( TokenType.MINOR_OR_EQUAL ); readChar(); } else if ( ch == '<' ) { retval = new Token( TokenType.DEEP_COPY_LEFT ); readChar(); } else retval = new Token( TokenType.LANGLE ); break; case 10: // RANGLE OR MINOR_OR_EQUAL if ( ch == '=' ) { retval = new Token( TokenType.MAJOR_OR_EQUAL ); readChar(); } else retval = new Token( TokenType.RANGLE ); break; case 11: // NOT OR NOT_EQUAL if ( ch == '=' ) { retval = new Token( TokenType.NOT_EQUAL ); readChar(); } else retval = new Token( TokenType.NOT ); break; case 12: // DIVIDE OR BEGIN_COMMENT OR LINE_COMMENT if ( ch == '*' ) { state = 13; } else if ( ch == '/' ) { state = 15; } else if( ch == '=' ) { retval = new Token( TokenType.DIVIDE_ASSIGN ); readChar(); } else retval = new Token( TokenType.DIVIDE ); break; case 13: // WAITING FOR END_COMMENT if ( ch == '*' ) { readChar(); stopOneChar = true; if ( ch == '/' ) { readChar(); retval = getToken(); } else if ( ch == '!' ) { builder = new StringBuilder(); readChar(); state = 21; } } break; case 14: // MINUS OR (negative) NUMBER OR POINTS_TO if ( Character.isDigit( ch ) ) state = 3; else if ( ch == '-' ) { retval = new Token( TokenType.DECREMENT ); readChar(); } else if ( ch == '>' ) { retval = new Token( TokenType.POINTS_TO ); readChar(); } else if ( ch == '=' ) { retval = new Token( TokenType.MINUS_ASSIGN ); readChar(); } else if ( ch == '.' ) { builder.append( ch ); readChar(); if ( !Character.isDigit( ch ) ) retval = new Token( TokenType.ERROR, "-." ); else state = 17; } else retval = new Token( TokenType.MINUS ); break; case 15: // LINE_COMMENT: waiting for end of line if ( isNewLineChar( ch ) ) { readChar(); retval = getToken(); } break; case 16: // DOT if ( !Character.isDigit( ch ) ) retval = new Token( TokenType.DOT ); else state = 17; // It's a REAL break; case 17: // REAL "."[0-9]+ if ( ch == 'E' || ch == 'e' ) state = 18; else if ( !Character.isDigit( ch ) ) retval = new Token( TokenType.DOUBLE, builder.toString() ); break; case 18: // Scientific notation, first char after 'E' if ( ch == '-' || ch == '+' ) state = 19; else if ( Character.isDigit( ch ) ) state = 20; else retval = new Token( TokenType.ERROR ); break; case 19: // Scientific notation, first exp. digit if ( !Character.isDigit( ch ) ) retval = new Token( TokenType.ERROR ); else state = 20; break; case 20: // Scientific notation: from second digit to end if ( !Character.isDigit( ch ) ) retval = new Token( TokenType.DOUBLE, builder.toString() ); break; case 21: // Documentation comment if ( ch == '*' ) { readChar(); stopOneChar = true; if ( ch == '/' ) { readChar(); retval = new Token( TokenType.DOCUMENTATION_COMMENT, builder.toString() ); } } break; default: retval = new Token( TokenType.ERROR, builder.toString() ); break; } if ( retval == null ) { if ( stopOneChar ) stopOneChar = false; else { builder.append( ch ); readChar(); } } else { keepRun = false; // Ok, we are done. } } if ( retval == null ) { retval = new Token( TokenType.ERROR ); } return retval; } }