// Copyright (c) 2011, David J. Pearce (djp@ecs.vuw.ac.nz) // All rights reserved. // // This software may be modified and distributed under the terms // of the BSD license. See the LICENSE file for details. package wyc.io; import java.io.BufferedReader; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.Reader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import wybs.lang.Attribute; import wybs.lang.SyntacticElement; import wybs.lang.SyntaxError; import wyc.lang.WhileyFile; import wyfs.lang.Path; /** * Split a source file into a list of tokens. These tokens can then be fed into * the parser in order to generate an Abstract Syntax Tree (AST). * * @author David J. Pearce * */ public class WhileyFileLexer { private final Path.Entry<WhileyFile> entry; private StringBuilder input; private int pos; public WhileyFileLexer(Path.Entry<WhileyFile> entry) throws IOException { this.entry = entry; Reader reader = new InputStreamReader(entry.inputStream()); BufferedReader in = new BufferedReader(reader); StringBuilder text = new StringBuilder(); int len = 0; char[] buf = new char[1024]; while ((len = in.read(buf)) != -1) { text.append(buf, 0, len); } input = text; } /** * Scan all characters from the input stream and generate a corresponding * list of tokens, whilst discarding all whitespace and comments. * * @return */ public List<Token> scan() { ArrayList<Token> tokens = new ArrayList<>(); pos = 0; while (pos < input.length()) { char c = input.charAt(pos); if (Character.isDigit(c)) { tokens.add(scanNumericConstant()); } else if (c == '"') { tokens.add(scanStringConstant()); } else if (c == '\'') { tokens.add(scanCharacterConstant()); } else if (isOperatorStart(c)) { tokens.add(scanOperator()); } else if (Character.isLetter(c) || c == '_') { tokens.add(scanIdentifier()); } else if (Character.isWhitespace(c)) { scanWhiteSpace(tokens); } else { syntaxError("unknown token encountered",pos); } } return tokens; } /** * Scan a numeric constant. That is a sequence of digits which gives either * an integer constant, or a real constant (if it includes a dot) or a byte * (if it ends in a 'b'). * * @return */ public Token scanNumericConstant() { int start = pos; while (pos < input.length() && Character.isDigit(input.charAt(pos))) { pos = pos + 1; } if (pos < input.length() && input.charAt(pos) == '.') { pos = pos + 1; if (pos < input.length() && input.charAt(pos) == '.') { // this is case for range e.g. 0..1 pos = pos - 1; return new Token(Token.Kind.IntValue, input.substring(start, pos), start); } while (pos < input.length() && Character.isDigit(input.charAt(pos))) { pos = pos + 1; } return new Token(Token.Kind.RealValue, input.substring(start, pos), start); } else if(pos < input.length() && input.charAt(pos) == 'b') { pos = pos + 1; return new Token(Token.Kind.ByteValue, input.substring(start, pos), start); } else { return new Token(Token.Kind.IntValue, input.substring(start, pos), start); } } /** * Scan a character constant, such as e.g. 'c'. Observe that care must be * taken to properly handle escape codes. For example, '\n' is a single * character constant which is made up from two characters in the input * string. * * @return */ public Token scanCharacterConstant() { int start = pos; pos++; char c = input.charAt(pos++); if (c == '\\') { // escape code switch (input.charAt(pos++)) { case 'b': c = '\b'; break; case 't': c = '\t'; break; case 'n': c = '\n'; break; case 'f': c = '\f'; break; case 'r': c = '\r'; break; case '"': c = '\"'; break; case '\'': c = '\''; break; case '\\': c = '\\'; break; default: syntaxError("unrecognised escape character", pos); } } if (input.charAt(pos) != '\'') { syntaxError("unexpected end-of-character", pos); } pos = pos + 1; return new Token(Token.Kind.CharValue, input.substring(start, pos), start); } public Token scanStringConstant() { int start = pos; boolean escaped = false; pos++; while (pos < input.length()) { char c = input.charAt(pos); if (c == '"' && !escaped) { String v = input.substring(start, ++pos); return new Token(Token.Kind.StringValue, v, start); } else if(c == '\\' && !escaped) { escaped = true; } else { escaped = false; } pos = pos + 1; } syntaxError("unexpected end-of-string", pos - 1); return null; } public static final char UC_FORALL = '\u2200'; public static final char UC_EXISTS = '\u2203'; public static final char UC_EMPTYSET = '\u2205'; public static final char UC_SUBSET = '\u2282'; public static final char UC_SUBSETEQ = '\u2286'; public static final char UC_SUPSET = '\u2283'; public static final char UC_SUPSETEQ = '\u2287'; public static final char UC_SETUNION = '\u222A'; public static final char UC_SETINTERSECTION = '\u2229'; public static final char UC_LESSEQUALS = '\u2264'; public static final char UC_GREATEREQUALS = '\u2265'; public static final char UC_ELEMENTOF = '\u2208'; public static final char UC_LOGICALAND = '\u2227'; public static final char UC_LOGICALOR = '\u2228'; static final char[] opStarts = { ',', '(', ')', '[', ']', '{', '}', '+', '-', '*', '/', '%', '^', '!', '?', '=', '<', '>', ':', ';', '&', '|', '.', '~', // Unicode operators UC_FORALL, UC_EXISTS, UC_EMPTYSET, UC_SUBSET, UC_SUBSETEQ, UC_SUPSET, UC_SUPSETEQ, UC_SETUNION, UC_SETINTERSECTION, UC_LESSEQUALS, UC_GREATEREQUALS, UC_ELEMENTOF }; public boolean isOperatorStart(char c) { for (char o : opStarts) { if (c == o) { return true; } } return false; } public Token scanOperator() { char c = input.charAt(pos); switch (c) { case '.': if ((pos + 1) < input.length() && input.charAt(pos + 1) == '.') { pos = pos + 2; if (pos < input.length() && input.charAt(pos) == '.') { return new Token(Token.Kind.DotDotDot, "..", pos++); } else { return new Token(Token.Kind.DotDot, "..", pos); } } else { return new Token(Token.Kind.Dot, ".", pos++); } case ',': return new Token(Token.Kind.Comma, ",", pos++); case ';': return new Token(Token.Kind.SemiColon, ";", pos++); case ':': return new Token(Token.Kind.Colon, ":", pos++); case '|': if (pos + 1 < input.length() && input.charAt(pos + 1) == '|') { pos += 2; return new Token(Token.Kind.LogicalOr, "||", pos - 2); } else { return new Token(Token.Kind.VerticalBar, "|", pos++); } case '(': return new Token(Token.Kind.LeftBrace, "(", pos++); case ')': return new Token(Token.Kind.RightBrace, ")", pos++); case '[': return new Token(Token.Kind.LeftSquare, "[", pos++); case ']': return new Token(Token.Kind.RightSquare, "]", pos++); case '{': return new Token(Token.Kind.LeftCurly, "{", pos++); case '}': return new Token(Token.Kind.RightCurly, "}", pos++); case '+': if ((pos + 1) < input.length() && input.charAt(pos + 1) == '+') { pos = pos + 2; return new Token(Token.Kind.PlusPlus, "++", pos); } else { return new Token(Token.Kind.Plus, "+", pos++); } case '-': if (pos + 1 < input.length() && input.charAt(pos + 1) == '>') { pos += 2; return new Token(Token.Kind.MinusGreater, "->", pos - 2); } else { return new Token(Token.Kind.Minus, "-", pos++); } case '*': return new Token(Token.Kind.Star, "*", pos++); case '&': if (pos + 1 < input.length() && input.charAt(pos + 1) == '&') { pos += 2; return new Token(Token.Kind.LogicalAnd, "&&", pos - 2); } else { return new Token(Token.Kind.Ampersand, "&", pos++); } case '/': if((pos+1) < input.length() && input.charAt(pos+1) == '/') { return scanLineComment(); } else if((pos+1) < input.length() && input.charAt(pos+1) == '*') { return scanBlockComment(); } else { return new Token(Token.Kind.RightSlash, "/", pos++); } case '%': return new Token(Token.Kind.Percent, "%", pos++); case '^': return new Token(Token.Kind.Caret, "^", pos++); case '~': return new Token(Token.Kind.Tilde, "~", pos++); case '!': if ((pos + 1) < input.length() && input.charAt(pos + 1) == '=') { pos += 2; return new Token(Token.Kind.NotEquals, "!=", pos - 2); } else { return new Token(Token.Kind.Shreak, "!", pos++); } case '=': if ((pos + 1) < input.length() && input.charAt(pos + 1) == '=') { pos += 2; if (pos < input.length() && input.charAt(pos) == '>') { pos++; return new Token(Token.Kind.LogicalImplication, "==>", pos - 3); } else { return new Token(Token.Kind.EqualsEquals, "==", pos - 2); } } else if ((pos + 1) < input.length() && input.charAt(pos + 1) == '>') { pos += 2; return new Token(Token.Kind.EqualsGreater, "=>", pos - 2); } else { return new Token(Token.Kind.Equals, "=", pos++); } case '<': if ((pos + 1) < input.length() && input.charAt(pos + 1) == '=') { pos += 2; if ((pos+1) < input.length() && input.charAt(pos) == '=' && input.charAt(pos+1) == '>') { pos += 2; return new Token(Token.Kind.LogicalIff, "<==>", pos - 4); } else { return new Token(Token.Kind.LessEquals, "<=", pos - 2); } } else if ((pos + 1) < input.length() && input.charAt(pos + 1) == '<') { pos += 2; return new Token(Token.Kind.LeftAngleLeftAngle, "<<", pos - 2); } else{ return new Token(Token.Kind.LeftAngle, "<", pos++); } case '>': if ((pos + 1) < input.length() && input.charAt(pos + 1) == '=') { pos += 2; return new Token(Token.Kind.GreaterEquals, ">=", pos - 2); } else if ((pos + 1) < input.length() && input.charAt(pos + 1) == '>') { pos += 2; return new Token(Token.Kind.RightAngleRightAngle, ">>", pos - 2); } else { return new Token(Token.Kind.RightAngle, ">", pos++); } // ================================================================= // // ================================================================= case UC_LESSEQUALS: return new Token(Token.Kind.LessEquals, "" + c, pos++); case UC_GREATEREQUALS: return new Token(Token.Kind.GreaterEquals, "" + c, pos++); case UC_SETUNION: return new Token(Token.Kind.SetUnion, "" + c, pos++); case UC_SETINTERSECTION: return new Token(Token.Kind.SetIntersection, "" + c, pos++); case UC_ELEMENTOF: return new Token(Token.Kind.ElementOf, "" + c, pos++); case UC_SUBSET: return new Token(Token.Kind.Subset, "" + c, pos++); case UC_SUBSETEQ: return new Token(Token.Kind.SubsetEquals, "" + c, pos++); case UC_SUPSET: return new Token(Token.Kind.Superset, "" + c, pos++); case UC_SUPSETEQ: return new Token(Token.Kind.SupersetEquals, "" + c, pos++); case UC_EMPTYSET: return new Token(Token.Kind.EmptySet, "" + c, pos++); case UC_LOGICALOR: return new Token(Token.Kind.LogicalOr, "" + c, pos++); case UC_LOGICALAND: return new Token(Token.Kind.LogicalAnd, "" + c, pos++); } syntaxError("unknown operator encountered: " + c, pos); return null; } public Token scanIdentifier() { int start = pos; while (pos < input.length() && (input.charAt(pos) == '_' || Character.isLetterOrDigit(input .charAt(pos)))) { pos++; } String text = input.substring(start, pos); // now, check for keywords Token.Kind kind = keywords.get(text); if (kind == null) { // not a keyword, so just a regular identifier. kind = Token.Kind.Identifier; } return new Token(kind, text, start); } public void scanWhiteSpace(List<Token> tokens) { while (pos < input.length() && Character.isWhitespace(input.charAt(pos))) { if (input.charAt(pos) == ' ' || input.charAt(pos) == '\t') { tokens.add(scanIndent()); } else if (input.charAt(pos) == '\n') { tokens.add(new Token(Token.Kind.NewLine, input.substring(pos, pos + 1), pos)); pos = pos + 1; } else if (input.charAt(pos) == '\r' && (pos + 1) < input.length() && input.charAt(pos + 1) == '\n') { tokens.add(new Token(Token.Kind.NewLine, input.substring(pos, pos + 2), pos)); pos = pos + 2; } else { syntaxError("unknown whitespace character encounterd: \"" + input.charAt(pos), pos); } } } /** * Scan one or more spaces or tab characters, combining them to form an * "indent". * * @return */ public Token scanIndent() { int start = pos; while (pos < input.length() && (input.charAt(pos) == ' ' || input.charAt(pos) == '\t')) { pos++; } return new Token(Token.Kind.Indent, input.substring(start, pos), start); } public Token scanLineComment() { int start = pos; while (pos < input.length() && input.charAt(pos) != '\n') { pos++; } return new Token(Token.Kind.LineComment, input.substring(start, pos), start); } public Token scanBlockComment() { int start = pos; while((pos+1) < input.length() && (input.charAt(pos) != '*' || input.charAt(pos+1) != '/')) { pos++; } pos++; pos++; return new Token(Token.Kind.BlockComment,input.substring(start,pos),start); } /** * Skip over any whitespace at the current index position in the input * string. * * @param tokens */ public void skipWhitespace(List<Token> tokens) { while (pos < input.length() && (input.charAt(pos) == '\n' || input.charAt(pos) == '\t')) { pos++; } } /** * Raise a syntax error with a given message at given index. * * @param msg * --- message to raise. * @param index * --- index position to associate the error with. */ private void syntaxError(String msg, int index) { // FIXME: this is clearly not a sensible approach SyntacticElement unknown = new SyntacticElement.Impl() {}; unknown.attributes().add(new Attribute.Source(index, index, -1)); throw new SyntaxError(msg, entry, unknown); } /** * A map from identifier strings to the corresponding token kind. */ public static final HashMap<String, Token.Kind> keywords = new HashMap<String, Token.Kind>() { { // types put("void", Token.Kind.Void); put("any", Token.Kind.Any); put("null", Token.Kind.Null); put("bool", Token.Kind.Bool); put("byte", Token.Kind.Byte); put("int", Token.Kind.Int); put("real", Token.Kind.Real); // constants put("true", Token.Kind.True); put("false", Token.Kind.False); // statements put("assert", Token.Kind.Assert); put("assume", Token.Kind.Assume); put("break", Token.Kind.Break); put("case", Token.Kind.Case); put("catch", Token.Kind.Catch); put("continue", Token.Kind.Continue); put("debug", Token.Kind.Debug); put("default", Token.Kind.Default); put("do", Token.Kind.Do); put("else", Token.Kind.Else); put("ensures", Token.Kind.Ensures); put("fail", Token.Kind.Fail); put("for", Token.Kind.For); put("if", Token.Kind.If); put("new", Token.Kind.New); put("return", Token.Kind.Return); put("requires", Token.Kind.Requires); put("skip", Token.Kind.Skip); put("switch", Token.Kind.Switch); put("throw", Token.Kind.Throw); put("throws", Token.Kind.Throws); put("try", Token.Kind.Try); put("while", Token.Kind.While); // expressions put("all", Token.Kind.All); put("no", Token.Kind.No); put("some", Token.Kind.Some); put("is", Token.Kind.Is); put("in", Token.Kind.In); put("where", Token.Kind.Where); // declarations put("import", Token.Kind.Import); put("function", Token.Kind.Function); put("method", Token.Kind.Method); put("property", Token.Kind.Property); // modifiers put("public", Token.Kind.Public); put("private", Token.Kind.Private); put("native", Token.Kind.Native); put("export", Token.Kind.Export); put("package", Token.Kind.Package); // lifetimes put("this", Token.Kind.This); } }; /** * The base class for all tokens. * * @author David J. Pearce * */ public static class Token { public enum Kind { Identifier, // Constants True("true"), False("false"), ByteValue, RealValue, IntValue, CharValue, StringValue, // Types Null("null"), Void("void"), Any("any"), Bool("bool"), Byte("byte"), Int("int"), Real("real"), Char("char"), String("string"), // Statements Assert("assert"), Assume("assume"), Break("break"), Case("case"), Catch("catch"), Continue("continue"), Debug("debug"), Default("default"), Do("do"), Else("else"), Ensures("ensures"), For("for"), Fail("for"), If("if"), New("new"), Return("return"), Requires("requires"), Skip("skip"), Switch("switch"), Throw("throw"), Throws("throws"), Try("try"), While("while"), // Declarations Package("package"), Import("import"), Public("public"), Private("private"), Native("native"), Export("export"), Function("function"), Method("method"), Property("property"), // Lifetimes This("this"), // Expressions All("all"), No("no"), Some("some"), Is("is"), In("in"), Where("where"), Comma(","), SemiColon(";"), Colon(":"), Ampersand("&"), VerticalBar("|"), LeftBrace("("), RightBrace(")"), LeftSquare("["), RightSquare("]"), LeftAngleLeftAngle("<<"), LeftAngle("<"), RightAngleRightAngle(">>"), RightAngle(">"), LeftCurly("{"), RightCurly("}"), PlusPlus("++"), Plus("+"), Minus("-"), Star("*"), LeftSlash("\\"), RightSlash("//"), Percent("%"), Shreak("!"), Caret("^"), Tilde("~"), Dot("."), DotDot(".."), DotDotDot("..."), Equals("="), EqualsEquals("=="), NotEquals("!="), LessEquals("<="), GreaterEquals(">="), EqualsGreater("=>"), MinusGreater("->"), LogicalAnd("&&"), LogicalOr("||"), LogicalImplication("==>"), LogicalIff("<==>"), SetUnion("" + UC_SETUNION), SetIntersection("" + UC_SETINTERSECTION), ElementOf("" + UC_ELEMENTOF), EmptySet("" + UC_EMPTYSET), Subset("" + UC_SUBSET), SubsetEquals("" + UC_SUBSETEQ), Superset("" + UC_SUPSETEQ), SupersetEquals("" + UC_SUPSETEQ), // Other NewLine, Indent, LineComment, BlockComment; private final String displayString; private Kind() { this.displayString = null; // Use default toString } private Kind(String displayString) { this.displayString = displayString; } @Override public String toString() { // Use displayString if present, otherwise default toString return (displayString != null) ? displayString : super.toString(); } } public final Kind kind; public final String text; public final int start; public Token(Kind kind, String text, int pos) { this.kind = kind; this.text = text; this.start = pos; } public int end() { return start + text.length() - 1; } } }