/* * Copyright (C) 2014 Indeed Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except * in compliance with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software distributed under the * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either * express or implied. See the License for the specific language governing permissions and * limitations under the License. */ package com.indeed.imhotep.sql.parser; import com.google.common.collect.Lists; import com.indeed.imhotep.sql.ast2.QueryParts; import org.codehaus.jparsec.*; import org.codehaus.jparsec.functors.Map5; import org.codehaus.jparsec.misc.Mapper; import org.codehaus.jparsec.pattern.Patterns; import java.util.List; import java.util.Set; /** * @author vladimir */ public class QuerySplitter { private static final String[] KEYWORDS = new String[] { "select", "from", "where", "group", "by", "limit", "=", ":", //query }; public static final Parser<String> wordParser = Scanners.pattern(Patterns.regex("[^\\s]+"), "word").source(); public static final Parser<String> tokensParser = Parsers.or(Terminals.StringLiteral.SINGLE_QUOTE_TOKENIZER, Terminals.StringLiteral.DOUBLE_QUOTE_TOKENIZER, wordParser).many1().source(); private static final Terminals TERMS = Terminals.caseInsensitive(tokensParser, new String[0], KEYWORDS); public static QueryParts splitQuery(String query) { Parser<?> termsTokenizer = TERMS.tokenizer(); Parser<Void> nonTokensParser = Scanners.WHITESPACES; Parser<List<Token>> tokenizer = termsTokenizer.lexer(nonTokensParser); Parser<QueryParts> querySQLParser = getQuerySQLParser(); Parser<QueryParts> queryLINQParser = getQueryLINQParser(); Parser<QueryParts> queryFragmentsParser = Parsers.or(queryLINQParser, querySQLParser); // combine Tokenizer with the token/fragment level Parser Parser<QueryParts> queryParser = queryFragmentsParser.from(tokenizer); // we probably could store this parser statically instead of recreating on each call // finally parse the query string return queryParser.parse(query); } private static Parser<QueryParts> getQueryLINQParser() { Parser<Token> selectLINQParser = TERMS.token("select").next(getContentParser("limit")); Parser<Token> fromLINQParser = TERMS.token("from").next(getContentParser("where", "group", "select", "limit")); Parser<Token> whereLINQParser = TERMS.token("where").next(getContentParser(true, "group", "select", "limit")); Parser<Token> groupByLINQParser = TERMS.phrase("group", "by").next(getContentParser("select", "limit")); Parser<Token> limitParser = TERMS.token("limit").next(getContentParser()); return Parsers.sequence(fromLINQParser, whereLINQParser.optional(), groupByLINQParser.optional(), selectLINQParser.optional(), limitParser.optional(), new Map5<Token, Token, Token, Token, Token, QueryParts>() { @Override public QueryParts map(Token from, Token where, Token groupBy, Token select, Token limit) { return new QueryParts(from, where, groupBy, select, limit); } }); } private static Parser<QueryParts> getQuerySQLParser() { Parser<Token> selectSQLParser = TERMS.token("select").next(getContentParser("from")); Parser<Token> fromSQLParser = TERMS.token("from").next(getContentParser("where", "group", "limit")); Parser<Token> whereSQLParser = TERMS.token("where").next(getContentParser(true, "group", "limit")); Parser<Token> groupBySQLParser = TERMS.phrase("group", "by").next(getContentParser("limit")); Parser<Token> limitParser = TERMS.token("limit").next(getContentParser()); return Parsers.sequence(selectSQLParser.optional(), fromSQLParser, whereSQLParser.optional(), groupBySQLParser.optional(), limitParser.optional(), new Map5<Token, Token, Token, Token, Token, QueryParts>() { @Override public QueryParts map(Token select, Token from, Token where, Token groupBy, Token limit) { return new QueryParts(from, where, groupBy, select, limit); } }); } private static Parser<Token> getContentParser(String... excludedTerms) { return getContentParser(false, excludedTerms); } private static Parser<Token> getContentParser(boolean allowExcludedAfterEq, String... excludedTerms) { List<Parser<?>> alternatives = Lists.newArrayList(); if(allowExcludedAfterEq) { for(String term : excludedTerms) { alternatives.add(Mapper._(TERMS.phrase("=", term))); alternatives.add(Mapper._(TERMS.phrase(":", term))); alternatives.add(Mapper._(TERMS.phrase(term, "="))); alternatives.add(Mapper._(TERMS.phrase(term, ":"))); } } // consume any token as long as it is not one of the excluded ones alternatives.add(TERMS.token(excludedTerms).not().next(Parsers.ANY_TOKEN)); return Parsers.sequence(Parsers.ANY_TOKEN.optional(), Parsers.or(alternatives).many()).source().token(); } static void runBenchmark() { runBenchmarkJParsec(); System.out.println("Warm up done"); long start = System.currentTimeMillis(); runBenchmarkJParsec(); // 5.3s this jparsec implementation, 3.7s custom implementation System.out.println("done in " + (System.currentTimeMillis()-start)); } private static void runBenchmarkJParsec() { for (int i = 0; i < 100000; i++) { splitQuery("select count() from ramsaccess \"2012-02-01T00:00:00\" \"2013-02-09T00:00:00\" where searches=1 and fields in (\"affiliate\", \"affchan\", \"affchannel\", \"affshr\", \"affshare\", \"agent\", \"asecmp\", \"ascompany\", \"asettl\", \"astitle\", \"emailuser\", \"emaildomain\", \"adschn\", \"adschan\", \"adschannel\", \"language\", \"ch\", \"chn\", \"spon\", \"clkcnt\", \"clk\", \"rq\", \"conversion\") group by fields"); } } }