package net.sourceforge.mayfly.parser;

import net.sourceforge.mayfly.MayflyException;
import net.sourceforge.mayfly.MayflyInternalException;
import net.sourceforge.mayfly.util.ImmutableByteArray;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

/**
 * Splits SQL text into a list of tokens.
 *
 * A lexer either lexes a single command supplied as a String (in which
 * case every token location carries that command text), or reads from a
 * Reader and keeps track of the text and location of each
 * semicolon-separated command, attaching the matching command text to
 * every token once lexing is complete.
 *
 * Line and column numbers start at 1.
 */
public class Lexer {

    private static final int SIZE_OF_HEX_CONSTANT_TO_DISPLAY_IN_MESSAGES = 18;
    private static final int END_OF_FILE_CHARACTER = -1;

    private final Reader sql;

    /** Position of the next character to be read. */
    private int currentLine;
    private int currentColumn;

    /** Position of the character most recently read (held in {@link #current}). */
    private int previousLine = -1;
    private int previousColumn = -1;

    /** Position at which the token currently being lexed starts. */
    private int tokenLine;
    private int tokenColumn;

    /**
     * Command, if we are just lexing a single command.
     */
    private final String command;

    /*
     * Start machinery used to keep track of commands if we are lexing from
     * a Reader.  Both lists are null when commands are not being tracked.
     */
    private StringBuilder currentCommand;
    private final List<String> commands;
    private final List<Location> commandLocations;
    private int commandLine;
    private int commandColumn;
    // End command-tracking machinery

    /** Character most recently read, or END_OF_FILE_CHARACTER. */
    private int current;

    private List<Token> tokens;

    /**
     * Create a lexer for a single command.
     */
    public Lexer(String sql) {
        this(new StringReader(sql), sql);
    }

    /**
     * Create a lexer which reads input from a Reader and keeps track
     * of the individual commands it finds.
     * The caller is responsible for closing the Reader.
     */
    public Lexer(Reader sql) {
        this(sql, null, true);
    }

    /**
     * Create a lexer which reads input from a Reader and reports
     * the given command text in every location; individual commands
     * are not tracked.
     */
    public Lexer(Reader sql, String command) {
        this(sql, command, false);
    }

    Lexer() {
        this((Reader) null);
    }

    private Lexer(Reader sql, String command, boolean trackCommands) {
        this.sql = sql;
        this.command = command;
        this.currentLine = 1;
        this.currentColumn = 1;
        this.commands = trackCommands ? new ArrayList<String>() : null;
        this.commandLocations =
            trackCommands ? new ArrayList<Location>() : null;
    }

    /**
     * Lex the whole input.
     *
     * @return the tokens, ending with an end-of-file token.  If this
     * lexer tracks commands, each token has been passed through
     * {@code withCommand} with the text of the command containing it
     * (or null if no tracked command contains it).
     * @throws MayflyException if the input is not lexically valid or
     * cannot be read
     */
    public List<Token> tokens() {
        startCommand();
        List<Token> lexed = lex();
        if (commands == null) {
            return lexed;
        }
        return attachCommandsToEachToken(lexed);
    }

    private List<Token> attachCommandsToEachToken(List<Token> lexed) {
        List<Token> result = new ArrayList<Token>(lexed.size());
        for (Token token : lexed) {
            result.add(
                token.withCommand(
                    locationToCommand(
                        token.startLineNumber(), token.startColumn())));
        }
        return result;
    }

    /**
     * Main loop: dispatch on the current character until end of file.
     *
     * Each branch leaves {@link #current} holding the first character
     * which has not yet been consumed.
     */
    private List<Token> lex() {
        tokens = new ArrayList<Token>();
        current = nextCharacter();
        markTokenStart();

        characterLoop:
        while (true) {
            switch (current) {
            case '.':
                current = nextCharacter();
                addToken(TokenType.PERIOD, ".");
                break;
            case ';':
                // Must come before reading the next character, so that
                // the command ends just before the semicolon.
                endOfCommand();
                current = nextCharacter();
                addToken(TokenType.SEMICOLON, ";");
                break;
            case ',':
                current = nextCharacter();
                addToken(TokenType.COMMA, ",");
                break;
            case '+':
                current = nextCharacter();
                addToken(TokenType.PLUS, "+");
                break;
            case '-':
                current = nextCharacter();
                if (current == '-') {
                    skipLineComment();
                } else {
                    addToken(TokenType.MINUS, "-");
                }
                break;
            case '/':
                current = nextCharacter();
                if (current == '*') {
                    skipBlockComment();
                } else {
                    addToken(TokenType.DIVIDE, "/");
                }
                break;
            case '*':
                current = nextCharacter();
                addToken(TokenType.ASTERISK, "*");
                break;
            case '(':
                current = nextCharacter();
                addToken(TokenType.OPEN_PAREN, "(");
                break;
            case ')':
                current = nextCharacter();
                addToken(TokenType.CLOSE_PAREN, ")");
                break;
            case '?':
                current = nextCharacter();
                addToken(TokenType.PARAMETER, "?");
                break;
            case '<':
                current = nextCharacter();
                if (current == '>') {
                    current = nextCharacter();
                    addToken(TokenType.LESS_GREATER, "<>");
                } else if (current == '=') {
                    current = nextCharacter();
                    addToken(TokenType.LESS_EQUAL, "<=");
                } else {
                    addToken(TokenType.LESS, "<");
                }
                break;
            case '>':
                current = nextCharacter();
                if (current == '=') {
                    current = nextCharacter();
                    addToken(TokenType.GREATER_EQUAL, ">=");
                } else {
                    addToken(TokenType.GREATER, ">");
                }
                break;
            case '=':
                current = nextCharacter();
                addToken(TokenType.EQUAL, "=");
                break;
            case '!':
                current = nextCharacter();
                if (current == '=') {
                    current = nextCharacter();
                    addToken(TokenType.BANG_EQUAL, "!=");
                } else {
                    throw new MayflyException("expected '=' but got " +
                        describeCharacter(current));
                }
                break;
            case '|':
                current = nextCharacter();
                if (current == '|') {
                    current = nextCharacter();
                    addToken(TokenType.CONCATENATE, "||");
                } else {
                    throw new MayflyException("expected '|' but got " +
                        describeCharacter(current));
                }
                break;
            case '0': case '1': case '2': case '3': case '4':
            case '5': case '6': case '7': case '8': case '9':
                lexNumber();
                break;
            case '\"':
                lexQuotedIdentifier();
                break;
            case ' ':
            case '\t':
            case '\n':
            case '\r':
                current = nextCharacter();
                markTokenStart();
                break;
            case END_OF_FILE_CHARACTER:
                addEndOfFile();
                endOfCommand();
                break characterLoop;
            case '\'':
                lexString();
                break;
            default:
                if (isIdentifierStart(current)) {
                    lexIdentifierOrHex();
                } else {
                    throw new MayflyException("unexpected character " +
                        describeCharacter(current));
                }
                break;
            }
        }
        return tokens;
    }

    /**
     * Skip the rest of a "--" comment.  On entry, current is the
     * second hyphen.  The comment runs to the next newline or to
     * end of file; end of file is left for the main loop to handle.
     */
    private void skipLineComment() {
        while (true) {
            current = nextCharacter();
            if (current == '\n') {
                current = nextCharacter();
                markTokenStart();
                return;
            } else if (current == END_OF_FILE_CHARACTER) {
                markTokenStart();
                return;
            }
        }
    }

    /**
     * Skip the rest of a slash-star comment.  On entry, current is
     * the asterisk which opens the comment.
     *
     * @throws MayflyException if end of file comes before the comment
     * is closed
     */
    private void skipBlockComment() {
        boolean gotStar = false;
        while (true) {
            current = nextCharacter();
            if (current == '*') {
                gotStar = true;
            } else if (gotStar && current == '/') {
                current = nextCharacter();
                markTokenStart();
                return;
            } else if (current == END_OF_FILE_CHARACTER) {
                throw new MayflyException(
                    "unclosed comment", tokenLocation());
            } else {
                gotStar = false;
            }
        }
    }

    /** Lex a run of decimal digits.  On entry, current is the first digit. */
    private void lexNumber() {
        StringBuilder text = new StringBuilder();
        while (current >= '0' && current <= '9') {
            text.append((char) current);
            current = nextCharacter();
        }
        addToken(TokenType.NUMBER, text.toString());
    }

    /**
     * Lex a double-quoted identifier.  On entry, current is the
     * opening quote.  The token text does not include the quotes.
     *
     * @throws MayflyException if end of file comes before the closing quote
     */
    private void lexQuotedIdentifier() {
        StringBuilder text = new StringBuilder();
        current = nextCharacter();
        while (current != '\"') {
            // Check before appending, so that the end-of-file marker
            // never ends up in the text.
            if (current == END_OF_FILE_CHARACTER) {
                throw new MayflyException("unterminated quoted identifier");
            }
            text.append((char) current);
            current = nextCharacter();
        }
        current = nextCharacter();
        addToken(TokenType.IDENTIFIER, text.toString());
    }

    /**
     * Lex a single-quoted string.  On entry, current is the opening
     * quote.  The token text includes the surrounding quotes, and a
     * doubled quote inside the string is kept doubled.
     *
     * @throws MayflyException if end of file comes before the closing quote
     */
    private void lexString() {
        StringBuilder text = new StringBuilder();
        text.append("'");
        current = nextCharacter();
        while (true) {
            if (current == END_OF_FILE_CHARACTER) {
                throw new MayflyException("unterminated string");
            }

            if (current == '\'') {
                current = nextCharacter();
                if (current == '\'') {
                    current = nextCharacter();
                    text.append("''");
                } else {
                    break;
                }
            } else {
                text.append((char) current);
                current = nextCharacter();
            }
        }
        text.append("'");
        addToken(TokenType.QUOTED_STRING, text.toString());
    }

    /**
     * Lex a keyword, an identifier, or (if the text is an x or X
     * immediately followed by a single quote) a hex constant.
     * On entry, current is a letter.
     */
    private void lexIdentifierOrHex() {
        StringBuilder textBuilder = new StringBuilder();

        if (current == 'x' || current == 'X') {
            textBuilder.append((char) current);
            current = nextCharacter();
            if (current == '\'') {
                lexHexConstant(textBuilder);
                return;
            }
        }

        /*
         * Inlined call to isIdentifierCharacter(int),
         * based on profiler data.
         */
        while (((current >= 'a' && current <= 'z') ||
                (current >= 'A' && current <= 'Z'))
            || (current >= '0' && current <= '9')
            || current == '_') {
            textBuilder.append((char) current);
            current = nextCharacter();
        }
        String text = textBuilder.toString();
        addToken(keywordOrIdentifier(text), text);
    }

    /**
     * Lex the remainder of a hex constant.  On entry, current is the
     * opening quote and textBuilder holds the x or X before it.
     *
     * @param textBuilder collects the start of the constant, for use
     * in error messages only
     * @throws MayflyException if the constant contains a character which
     * is not a hex digit (including end of file), or has an odd number
     * of digits
     */
    private void lexHexConstant(StringBuilder textBuilder) {
        textBuilder.append((char) current);
        current = nextCharacter();

        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        while (current != '\'') {
            int first = nextHexDigit(textBuilder);
            if (current == '\'') {
                throw new MayflyException(
                    "hex constant " +
                    displayForHexConstant(textBuilder) +
                    " must have an even number of digits",
                    currentCharacter());
            }
            int second = nextHexDigit(textBuilder);
            bytes.write(combineHexDigits(first, second));
        }
        current = nextCharacter();
        Token newToken = new BinaryToken(
            new ImmutableByteArray(bytes.toByteArray()),
            tokenLocation());
        addToken(newToken);
    }

    /**
     * Finish off the (possibly truncated) text of a hex constant for
     * display in a message.  Note that this appends to textBuilder.
     */
    private String displayForHexConstant(StringBuilder textBuilder) {
        if (textBuilder.length()
            == SIZE_OF_HEX_CONSTANT_TO_DISPLAY_IN_MESSAGES) {
            textBuilder.append("'...");
        } else {
            textBuilder.append("'");
        }
        return textBuilder.toString();
    }

    /**
     * Parse the current character as a hex digit and advance past it.
     * The character is recorded in textBuilder unless textBuilder has
     * already reached the size limit for messages.
     */
    private int nextHexDigit(StringBuilder textBuilder) {
        int digit = parseHex(current);
        if (textBuilder.length()
            < SIZE_OF_HEX_CONSTANT_TO_DISPLAY_IN_MESSAGES) {
            textBuilder.append((char) current);
        }
        current = nextCharacter();
        return digit;
    }

    /** Combine a high and a low hex digit into one byte value. */
    int combineHexDigits(int first, int second) {
        return (first << 4) + second;
    }

    /**
     * @return the value, 0 to 15, of a hex digit (either case)
     * @throws MayflyException if the character is not a hex digit
     */
    int parseHex(int character) {
        if (character >= '0' && character <= '9') {
            return character - '0';
        }
        if (character >= 'a' && character <= 'f') {
            return character - 'a' + 10;
        }
        if (character >= 'A' && character <= 'F') {
            return character - 'A' + 10;
        }
        /* Not desirable to give the whole string constant,
           as it might be quite long.  */
        throw new MayflyException(
            "invalid character " + describeCharacter(character) +
            " in hex constant",
            currentCharacter());
    }

    /** Location of the character most recently read. */
    private Location currentCharacter() {
        return new Location(
            previousLine, previousColumn,
            currentLine, currentColumn,
            command);
    }

    /**
     * @internal
     * Usage is to call nextCharacter() and then call this
     * method.  In other words, the character most recently
     * read by nextCharacter is <i>not</i> part of the token
     * we are adding here; the character before that is the
     * last character of the token.
     *
     * This method is a performance bottleneck, which is why many
     * of the functions which it calls have been inlined.
     */
    private void addToken(TokenType tokenType, String text) {
        TextToken newToken = new TextToken(
            tokenType, text,
            // Inlined tokenLocation()
            new Location(tokenLine, tokenColumn,
                previousLine, previousColumn, command)
        );
        tokens.add(newToken);

        // Inlined markTokenStart()
        tokenLine = previousLine;
        tokenColumn = previousColumn;
    }

    /** Add an already-built token; the next token starts where this one ended. */
    private void addToken(Token newToken) {
        tokens.add(newToken);
        markTokenStart();
    }

    /** Record that a token starts at the character most recently read. */
    private void markTokenStart() {
        tokenLine = previousLine;
        tokenColumn = previousColumn;
    }

    /**
     * Location from the start of the current token up to (not
     * including) the character most recently read.
     */
    private Location tokenLocation() {
        return new Location(
            tokenLine, tokenColumn, previousLine, previousColumn, command);
    }

    /**
     * Key difference with {@link #addToken(TokenType, String)}
     * is that we haven't called nextCharacter (since we are at the
     * end of file).
     */
    private void addEndOfFile() {
        tokens.add(
            new EndOfFileToken(previousLine, previousColumn, command)
        );
    }

    /**
     * Describe a character for use in an error message: end of file
     * and the single quote are named, characters in the ranges
     * 0x00-0x1f and 0x7f-0xa0 are shown in hex, and anything else
     * is shown between single quotes.
     */
    String describeCharacter(int character) {
        if (character == END_OF_FILE_CHARACTER) {
            return "end of file";
        } else if (character >= 0 && character <= 0x1f
            || character >= 0x7f && character <= 0xa0) {
            return "0x" + Integer.toHexString(character);
        } else if (character == '\'') {
            return "single quote";
        }
        // Doesn't work for surrogate pairs.
        return "'" + (char) character + "'";
    }

    private TokenType keywordOrIdentifier(String text) {
        TokenType type = TokenType.lookupKeyword(text);
        return type != null ? type : TokenType.IDENTIFIER;
    }

    /**
     * Read one character, update the line/column bookkeeping, and
     * (if tracking commands) append the character to the current
     * command.
     *
     * @return the character read, or END_OF_FILE_CHARACTER
     * @throws MayflyException wrapping any IOException from the Reader
     */
    private int nextCharacter() {
        try {
            previousLine = currentLine;
            previousColumn = currentColumn;

            int character = sql.read();
            if (character != END_OF_FILE_CHARACTER && commands != null) {
                currentCommand.append((char) character);
            }

            if (character == '\n') {
                currentColumn = 1;
                ++currentLine;
            } else {
                ++currentColumn;
            }
            return character;
        } catch (IOException e) {
            throw new MayflyException(e);
        }
    }

    /** @return whether the character is an ASCII letter */
    public static boolean isIdentifierStart(int current) {
        return (current >= 'a' && current <= 'z') ||
            (current >= 'A' && current <= 'Z');
    }

    /** @return whether the character is an ASCII letter, a digit or an underscore */
    public static boolean isIdentifierCharacter(int current) {
        return isIdentifierStart(current)
            || (current >= '0' && current <= '9')
            || current == '_';
    }

    /**
     * @return the number of commands recorded so far, or zero if this
     * lexer is not tracking commands
     * @throws MayflyInternalException if the command texts and command
     * locations have gotten out of step
     */
    public int commandCount() {
        if (commands == null) {
            return 0;
        }
        if (commands.size() != commandLocations.size()) {
            throw new MayflyInternalException(
                "confused about command tracking");
        }
        return commands.size();
    }

    /**
     * @return the text of a recorded command, without the semicolon
     * which ended it
     * @throws IndexOutOfBoundsException if there is no such command
     * (always the case if this lexer is not tracking commands)
     */
    public String command(int index) {
        if (commands == null) {
            throw new IndexOutOfBoundsException(
                "not tracking commands; index: " + index);
        }
        return commands.get(index);
    }

    /**
     * @return the text of the first recorded command whose location
     * contains the given position, or null if there is none
     */
    public String locationToCommand(int line, int column) {
        int count = commandCount();
        for (int i = 0; i < count; ++i) {
            Location location = commandLocations.get(i);
            if (location.contains(line, column)) {
                return command(i);
            }
        }
        /* This happens for semicolons and end-of-file tokens.
           Anything else?  */
        return null;
    }

    /**
     * If tracking commands, record the command which ends at the
     * character most recently read (dropping that character from the
     * recorded text if it is a semicolon), and start a new command.
     */
    private void endOfCommand() {
        if (commands == null) {
            return;
        }

        int length = currentCommand.length();
        if (length > 0 && currentCommand.charAt(length - 1) == ';') {
            currentCommand.setLength(length - 1);
        }
        commands.add(currentCommand.toString());
        commandLocations.add(new Location(
            commandLine, commandColumn, previousLine, previousColumn));
        startCommand();
    }

    /** Begin collecting a new command at the current position. */
    private void startCommand() {
        currentCommand = new StringBuilder();
        commandLine = currentLine;
        commandColumn = currentColumn;
    }

}