SQLLexer.java example

Explorer
voltdb-master
/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb.parser;

import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Provides an API for performing various lexing operations on SQL/DML/DDL text.
 * Ideally it shouldn't be doing "parsing", i.e. language-aware token processing.
 * In reality the code is not split cleanly and lexing and parsing overlap a bit.
 *
 * Keep the regular expressions private and just expose methods needed for parsing.
 *
 * Avoid external dependencies since this is linked with the client.
 */
public class SQLLexer extends SQLPatternFactory
{
    //===== Fundamental (not derived) parsing data

    private static class VerbToken
    {
        final String token;
        final boolean supported;

        VerbToken(String token, boolean supported)
        {
            this.token = token;
            this.supported = supported;
        }
    };

    private final static VerbToken[] VERB_TOKENS = {
        // Supported verbs
        new VerbToken("alter", true),
        new VerbToken("create", true),
        new VerbToken("drop", true),
        new VerbToken("export", true),
        new VerbToken("partition", true),
        new VerbToken("dr", true),
        new VerbToken("set", true),
        // Unsupported verbs
        new VerbToken("import", false)
    };

    private static class ObjectToken
    {
        final String token;
        final boolean renameable;

        ObjectToken(String token, boolean renameable)
        {
            this.token = token;
            this.renameable = renameable;
        }
    };

    private final static ObjectToken[] OBJECT_TOKENS = {
        // Rename-able objects
        new ObjectToken("table", true),
        new ObjectToken("stream", true),
        new ObjectToken("column", true),
        new ObjectToken("index", true),
        // Non-rename-able objects
        new ObjectToken("view", false),
        new ObjectToken("procedure", false),
        new ObjectToken("role", false),
        new ObjectToken("function", false)
    };

    private final static String[] MODIFIER_TOKENS = {
        "assumeunique", "unique"
    };

    static final char BLOCK_DELIMITER_CHAR = '#';
    static final String BLOCK_DELIMITER = "###";

    //===== Special non-DDL/DML/SQL patterns

    // Match single-line comments
    private static final Pattern PAT_SINGLE_LINE_COMMENT = Pattern.compile(
            "^\\s*" + // start of line, 0 or more whitespace
            "--" + // start of comment
            ".*$"); // everything to end of line

    private static final Pattern PAT_STRIP_CSTYLE_COMMENTS = Pattern.compile(
            "/\\*(.|\\n)*?\\*/"
            );

    //===== Derived parsing data (populated in static block on first demand)

    // Simplest possible SQL DDL token lexer. (set in static block)
    private static Pattern PAT_ANY_DDL_FIRST_TOKEN = null;

    // All handled patterns. (set in static block)
    private static CheckedPattern[] WHITELISTS = null;

    // All rejected patterns. (set in static block)
    private static CheckedPattern[] BLACKLISTS = null;

    // Extracts the table or stream name for DDL batch conflicting command checks.
    private static final Pattern PAT_TABLE_DDL_PREAMBLE =
        SPF.statementLeader(
            SPF.capture(SPF.tokenAlternatives("create", "drop")),   // DDL commands we're looking for
            SPF.tokenAlternatives("table", "stream"),               // target is table or stream
            SPF.capture(SPF.databaseObjectName())                   // table name (captured)
        ).compile("PAT_TABLE_DDL_PREAMBLE");

    // Matches the start of a SELECT statement
    private static final Pattern PAT_SELECT_STATEMENT_PREAMBLE =
        SPF.statementLeader(
            SPF.token("select")
        ).compile("PAT_SELECT_STATEMENT_PREAMBLE");

    // Capture group number defns for regex below.
    // Don't use capture labels because it is not supported in 1.6
    // and this class needs to compile in 1.6.
    private static final int PARENTTYPE_GROUP=1;
    @SuppressWarnings("unused") // We don't get this group as of now
    private static final int PARENTNAME_GROUP=2;
    private static final int CHILDTYPE_GROUP=3;
    @SuppressWarnings("unused") // We don't get this group as of now
    private static final int CHILDNAME_GROUP=4;
    // Pattern for plausible ALTER...RENAME statements.
    // Keep the matching loose in order to support clear messaging.
    private static final Pattern PAT_ALTER_RENAME =
        SPF.statementLeader(
            SPF.token("alter"),
            SPF.capture(SPF.databaseObjectTypeName()),
            SPF.capture(SPF.databaseObjectName()),
            SPF.optional(
                SPF.clause(
                    SPF.token("alter"),
                    SPF.capture(SPF.databaseObjectTypeName()),
                    SPF.capture(SPF.databaseObjectName())
                )
            ),
            SPF.token("rename"), SPF.token("to")
        ).compile("PAT_ALTER_RENAME");

    //========== Public Methods ==========

    /**
     * Check if a SQL string is a comment.
     * @param sql  SQL string
     * @return     true if it's a comment
     */
    public static boolean isComment(String sql)
    {
        Matcher commentMatcher = PAT_SINGLE_LINE_COMMENT.matcher(sql);
        return commentMatcher.matches();
    }

    /**
     * Test if character is block delimiter
     * @param c  character to test
     * @return   true if c is block delimiter
     */
    public static boolean isBlockDelimiter(char c)
    {
        return c == BLOCK_DELIMITER_CHAR;
    }

    /**
     * Get the DDL token, if any, at the start of this statement.
     * @return returns token, or null if it wasn't DDL
     */
    public static String extractDDLToken(String sql)
    {
        String ddlToken = null;
        Matcher ddlMatcher = PAT_ANY_DDL_FIRST_TOKEN.matcher(sql);
        if (ddlMatcher.find()) {
            ddlToken = ddlMatcher.group(1).toLowerCase();
        }
        return ddlToken;
    }

    /** Remove c-style comments globally and -- comments from the end of lines */
    public static String stripComments(String ddl) {
        ddl = removeCStyleComments(ddl);
        StringBuilder sb = new StringBuilder();
        String[] ddlLines = ddl.split("\n");
        for (String ddlLine : ddlLines) {
            sb.append(stripCommentFromLine(ddlLine)).append(' ');
        }
        return sb.toString();
    }

    /** Strip -- comments from the end of a single line */
    public static String stripCommentFromLine(String ddlLine) {
        boolean inQuote = false;
        char quoteChar = ' '; // will be written before use
        boolean lastCharWasDash = false;
        int length = ddlLine.length();

        for (int i = 0; i < length; i++) {
            char c = ddlLine.charAt(i);
            if (inQuote) {
                if (quoteChar == c) {
                    inQuote = false;
                }
            }
            else {
                if (c == '-') {
                    if (lastCharWasDash) {
                        return ddlLine.substring(0, i - 1);
                    }
                    else {
                        lastCharWasDash = true;
                    }
                }
                else {
                    lastCharWasDash = false;
                    if (c == '\"' || c == '\'') {
                        inQuote = true;
                        quoteChar = c;
                    }
                }
            }
        }

        return ddlLine;
    }

    /**
     * Get the table name for a CREATE or DROP DDL statement.
     * @return returns token, or null if the DDL isn't (CREATE|DROP) TABLE
     */
    public static String extractDDLTableName(String sql)
    {
        Matcher matcher = PAT_TABLE_DDL_PREAMBLE.matcher(sql);
        if (matcher.find()) {
            return matcher.group(2).toLowerCase();
        }
        return null;
    }

    /**
     * Naive filtering for stuff we haven't implemented yet.
     * Hopefully this gets whittled away and eventually disappears.
     *
     * @param sql  statement to check
     * @return     rejection explanation string or null if accepted
     */
    public static String checkPermitted(String sql)
    {
        /*
         *  IMPORTANT: Black-lists are checked first because they know more about
         * what they don't like about a statement and can provide a better message.
         * It requires that black-lists patterns be very selective and that they
         * don't mind seeing statements that wouldn't pass the white-lists.
         */

        //=== Check against blacklists, must not be rejected by any.

        for (CheckedPattern cp : BLACKLISTS) {
            CheckedPattern.Result result = cp.check(sql);
            if (result.matcher != null) {
                return String.format("%s, in statement: %s", result.explanation, sql);
            }
        }

        //=== Check against whitelists, must be accepted by at least one.

        boolean hadWLMatch = false;
        for (CheckedPattern cp : WHITELISTS) {
            if (cp.matches(sql)) {
                hadWLMatch = true;
                break;
            }
        }
        if (!hadWLMatch) {
            return String.format("AdHoc DDL contains an unsupported statement: %s", sql);
        }

        // The statement is permitted.
        return null;
    }

    /**
     * Split SQL statements on semi-colons with quoted string and comment support.
     *
     * Degenerate formats such as escape as the last character or unclosed strings are ignored and
     * left to the SQL parser to complain about. This is a simple string splitter that errs on the
     * side of not splitting.
     *
     * Regexes are avoided.
     *
     * Handle single and double quoted strings and backslash escapes. Backslashes escape a single
     * character.
     *
     * Handle double-dash (single line) and C-style (muli-line) comments. Nested C-style comments
     * are not supported.
     *
     * @param sql raw SQL text to split
     * @return list of individual SQL statements
     */
    public static List<String> splitStatements(final String sql) {
        List<String> statements = new ArrayList<>();
        // Use a character array for efficient character-at-a-time scanning.
        char[] buf = sql.toCharArray();
        // Set to null outside of quoted segments or the quote character inside them.
        Character cQuote = null;
        // Set to null outside of comments or to the string that ends the comment.
        String sCommentEnd = null;
        // Index to start of current statement.
        int iStart = 0;
        // Index to current character.
        // IMPORTANT: The loop is structured in a way that requires all if/else/... blocks to bump
        // iCur appropriately. Failure of a corner case to bump iCur will cause an infinite loop.
        boolean statementIsComment = false;
        boolean inStatement = false;
        int iCur = 0;
        while (iCur < buf.length) {
            // Eat up whitespace outside of a statement
            if (!inStatement) {
                if (Character.isWhitespace(buf[iCur])) {
                    iCur++;
                    iStart = iCur;
                }
                else {
                    inStatement = true;
                }
            }
            else if (sCommentEnd != null) {
                // Processing the interior of a comment. Check if at the comment or buffer end.
                if (iCur >= buf.length - sCommentEnd.length()) {
                    // Exit
                    iCur = buf.length;
                } else if (String.copyValueOf(buf, iCur, sCommentEnd.length()).equals(sCommentEnd)) {
                    // Move past the comment end.
                    iCur += sCommentEnd.length();
                    sCommentEnd = null;
                    // If the comment is the whole of the statement so far, terminate it
                    if (statementIsComment) {
                        String statement = String.copyValueOf(buf, iStart, iCur - iStart).trim();
                        if (!statement.isEmpty()) {
                            statements.add(statement);
                        }
                        iStart = iCur;
                        statementIsComment = false;
                        inStatement = false;
                    }
                } else {
                    // Keep going inside the comment.
                    iCur++;
                }
            } else if (cQuote != null) {
                // Processing the interior of a quoted string.
                if (buf[iCur] == '\\') {
                    // Skip the '\' escape and the trailing single escaped character.
                    // Doesn't matter if iCur is beyond the end, it won't be used in that case.
                    iCur += 2;
                } else if (buf[iCur] == cQuote) {
                    // Look at the next character to distinguish a double escaped quote
                    // from the end of the quoted string.
                    iCur++;
                    if (iCur < buf.length) {
                        if (buf[iCur] != cQuote) {
                            // Not a double escaped quote - end of quoted string.
                            cQuote = null;
                        } else {
                            // Move past the double escaped quote.
                            iCur++;
                        }
                    }
                } else {
                    // Move past an ordinary character.
                    iCur++;
                }
            } else {
                // Outside of a quoted string - watch for the next separator, quote or comment.
                if (buf[iCur] == ';') {
                    // Add terminated statement (if not empty after trimming).
                    String statement = String.copyValueOf(buf, iStart, iCur - iStart).trim();
                    if (!statement.isEmpty()) {
                        statements.add(statement);
                    }
                    iStart = iCur + 1;
                    iCur = iStart;
                    inStatement = false;
                } else if (buf[iCur] == '"' || buf[iCur] == '\'') {
                    // Start of quoted string.
                    cQuote = buf[iCur];
                    iCur++;
                } else if (iCur <= buf.length - 2) {
                    // Comment (double-dash or C-style)?
                    if (buf[iCur] == '-' && buf[iCur+1] == '-') {
                        // One line double-dash comment start.
                        sCommentEnd = "\n"; // Works for *IX (\n) and Windows (\r\n).
                        if (iCur == iStart) {
                            statementIsComment = true;
                        }
                        iCur += 2;
                    } else if (buf[iCur] == '/' && buf[iCur+1] == '*') {
                        // Multi-line C-style comment start.
                        sCommentEnd = "*/";
                        if (iCur == iStart) {
                            statementIsComment = true;
                        }
                        iCur += 2;
                    } else {
                        // Not a comment start, move past this character.
                        iCur++;
                    }
                } else {
                    // Move past a non-quote/non-separator character.
                    iCur++;
                }
            }
        }
        // Get the last statement, if any.
        if (iStart < buf.length) {
            String statement = String.copyValueOf(buf, iStart, iCur - iStart).trim();
            if (!statement.isEmpty()) {
                statements.add(statement);
            }
        }
        return statements;
    }

    /**
     * Check if a statement is a SELECT.
     * @param statement  statement to check
     * @return           true if it's a SELECT statement
     */
    public static boolean isSelect(String statement)
    {
        return PAT_SELECT_STATEMENT_PREAMBLE.matcher(statement).matches();
    }

    //========== Private ==========

    /**
     * Initialize derived data
     */
    static
    {
        // Simplest possible SQL DDL token lexer
        String[] verbsAll = new String[VERB_TOKENS.length];
        for (int i = 0; i < VERB_TOKENS.length; ++i) {
            verbsAll[i] = VERB_TOKENS[i].token;
        }
        PAT_ANY_DDL_FIRST_TOKEN =
            SPF.statementLeader(
                SPF.capture(SPF.tokenAlternatives(verbsAll)),
                SPF.anyClause()
            ).compile("PAT_ANY_DDL_FIRST_TOKEN");

        // Whitelists for acceptable statement preambles.
        WHITELISTS = new CheckedPattern[] {
            new WhitelistSupportedPreamblePattern(),
            new CheckedPattern(SQLParser.SET_GLOBAL_PARAM_FOR_WHITELIST) {
                @Override
                String explainMatch(Matcher matcher) {
                    return null;
                }
            }
        };

        BLACKLISTS = new CheckedPattern[] {
            new BlacklistUnsupportedPreamblePattern(),
            new BlacklistRenamePattern()
        };
    }

    /** Remove c-style comments from a string aggressively */
    private static String removeCStyleComments(String ddl)
    {
        // Avoid Apache commons StringUtils.join() to minimize client dependencies.
        StringBuilder sb = new StringBuilder();
        for (String part : PAT_STRIP_CSTYLE_COMMENTS.split(ddl)) {
            sb.append(part);
        }
        return sb.toString();
    }

    /**
     * Find information about an object type token, if it's a known object type.
     * @param objectTypeName  object type name to look up
     * @return                object token information or null if it wasn't found
     */
    private static ObjectToken findObjectToken(String objectTypeName)
    {
        if (objectTypeName != null) {
            for (ObjectToken ot : OBJECT_TOKENS) {
                if (ot.token.equalsIgnoreCase(objectTypeName)) {
                    return ot;
                }
            }
        }
        return null;
    }

    /**
     * Abstract base for whitelists and blacklists
     */
    private static abstract class CheckedPattern
    {
        Pattern pattern;

        CheckedPattern(Pattern pattern)
        {
            this.pattern = pattern;
        }

        static class Result
        {
            // non-null Matcher with groups() set if it matched
            Matcher matcher = null;
            // optional explanation, e.g. blacklist rejection message, or null if it didn't match
            String explanation = null;
        }

        /**
         * Check if statement matches.
         * @param statement  statement to match against
         * @return           result object with m
         */
        Result check(String statement)
        {
            Result result = new Result();
            Matcher matcher = this.pattern.matcher(statement);
            if (matcher.matches()) {
                result.matcher = matcher;
                result.explanation = this.explainMatch(matcher);
            }
            return result;
        }

        /**
         * Simplified yes/no match check
         * @param statement  statement to match against
         * @return           true if it matches
         */
        boolean matches(String statement)
        {
            return this.check(statement).matcher != null;
        }

        // Override to provide an explanation, e.g. for blacklist rejection.
        abstract String explainMatch(Matcher matcher);
    }

    /**
     * Whitelist matcher for supported two token preambles.
     *
     * Provides no explanation.
     */
    private static class WhitelistSupportedPreamblePattern extends CheckedPattern
    {
        private static Pattern initPattern()
        {
            // All handled (white-listed) patterns.
            String[] secondTokens = new String[OBJECT_TOKENS.length + MODIFIER_TOKENS.length];
            for (int i = 0; i < OBJECT_TOKENS.length; ++i) {
                secondTokens[i] = OBJECT_TOKENS[i].token;
            }
            // Modifier tokens are supported in the place of object tokens following
            // a verb to allow the "verb modifier object" pattern like "CREATE UNIQUE INDEX".
            // For simplicity, "CREATE UNIQUE" et. al. are considered sufficient evidence that
            // the statement is a permitted white-listed DDL statement.
            // We seem to be more concerned about accidentally permitting
            // "CREATE <non-permitted-object> ..."
            // than "CREATE UNIQUE <non-permitted-object> ...".
            // Otherwise, we'd require the modifiers to be part of a nested
            // "modifier object" subpattern.
            for (int j = 0; j < MODIFIER_TOKENS.length; ++j) {
                secondTokens[OBJECT_TOKENS.length + j] = MODIFIER_TOKENS[j];
            }
            int supportedVerbCount = 0;
            for (int i = 0; i < VERB_TOKENS.length; ++i) {
                if (VERB_TOKENS[i].supported) {
                    supportedVerbCount++;
                }
            }
            String[] verbsSupported = new String[supportedVerbCount];
            supportedVerbCount = 0;     // Reuse to build supported verb array.
            for (int i = 0; i < VERB_TOKENS.length; ++i) {
                if (VERB_TOKENS[i].supported) {
                    verbsSupported[supportedVerbCount++] = VERB_TOKENS[i].token;
                }
            }
            Pattern whitelistPattern =
                SPF.statementLeader(
                    SPF.clause(
                        SPF.tokenAlternatives(verbsSupported),
                        SPF.tokenAlternatives(secondTokens)
                    )
                ).compile("PAT_WHITELISTS-PREAMBLES");
            return whitelistPattern;
        }

        WhitelistSupportedPreamblePattern()
        {
            super(initPattern());
        }

        // Whitelist match provides no explanation.
        @Override
        String explainMatch(Matcher matcher)
        {
            return null;
        }
    }

    /**
     * Blacklists known unsupported statement preambles and explains rejections.
     */
    private static class BlacklistUnsupportedPreamblePattern extends CheckedPattern
    {
        private static Pattern initPattern()
        {
            int unsupportedVerbCount = 0;
            for (int i = 0; i < VERB_TOKENS.length; ++i) {
                if (!VERB_TOKENS[i].supported) {
                    unsupportedVerbCount++;
                }
            }
            String[] verbsNotSupported = new String[unsupportedVerbCount];
            unsupportedVerbCount = 0;   // Reuse to build unsupported verb array.
            for (int i = 0; i < VERB_TOKENS.length; ++i) {
                if (!VERB_TOKENS[i].supported) {
                    verbsNotSupported[unsupportedVerbCount++] = VERB_TOKENS[i].token;
                }
            }
            Pattern blacklistPattern =
                SPF.statementLeader(
                    SPF.capture(SPF.tokenAlternatives(verbsNotSupported))
                ).compile("PAT_BLACKLISTS-PREAMBLES");
            return blacklistPattern;
        }

        BlacklistUnsupportedPreamblePattern()
        {
            super(initPattern());
        }

        /**
         * Provide a match explanation, assuming it's a rejection.
         */
        @Override
        String explainMatch(Matcher matcher)
        {
            return String.format("Statement is not supported: %s", matcher.group(1).toUpperCase());
        }
    }

    /**
     * Blacklists ALTER/RENAME and provides focused rejection explanations.
     */
    private static class BlacklistRenamePattern extends CheckedPattern
    {
        BlacklistRenamePattern()
        {
            super(PAT_ALTER_RENAME);
        }

        /**
         * (Internal)
         * See if there's something to say about a parent or child target object type.
         * @param typeName   object type name to check
         * @param isParent   true when a child object is available and this is the parent
         * @return           explanation string or null when a child still needs checking
         */
        private static String getExplanation(String typeName, boolean isParent)
        {
            assert typeName != null;
            ObjectToken token = findObjectToken(typeName);
            if (token == null) {
                return String.format("AdHoc DDL ALTER/RENAME refers to an unknown object type '%s'", typeName);
            }
            if (isParent) {
                // The parent is okay, still need to check the child.
                return null;
            }
            if (!token.renameable) {
                return String.format("AdHoc DDL ALTER/RENAME is not supported for object type '%s'", typeName);
            }
            return "AdHoc DDL ALTER/RENAME is not yet supported";
        }

        /**
         * Provide a match explanation, assuming it's a rejection.
         */
        @Override
        String explainMatch(Matcher matcher)
        {
            String parentType = matcher.group(PARENTTYPE_GROUP);
            String childType = matcher.group(CHILDTYPE_GROUP);
            // See if there's something to say about the parent object type.
            String explanation = getExplanation(parentType, childType != null);
            // If not see if there's something to say about the child type, when applicable.
            if (explanation == null) {
                explanation = getExplanation(childType, false);
            }
            return explanation;
        }
    }
}