package com.tesora.dve.sql.parser;
/*
* #%L
* Tesora Inc.
* Database Virtualization Engine
* %%
* Copyright (C) 2011 - 2014 Tesora Inc.
* %%
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License, version 3,
* as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
* #L%
*/
import java.util.ArrayList;
import java.util.List;
import com.tesora.dve.sql.transform.execution.ExecutionType;
// this is not really a parser
// what we're going to do is scan the input string left to right and build two objects:
// [1] a query with all literals removed and replaced with parameters
// [2] a list of literals
//
// [1] has to be sufficiently unique to be a good cache key
// [2] has to be the same as we would get from the full parser
//
// The rules are - numbers can only start if the previous character is not a letter (or '_');
// numbers are composed of digits + '.'
// String literals are enclosed in single or double quotes, and the previous character
// cannot be a letter (or '_') to start one
// (hex literals are only recognized in the x'...' / X'...' form, via the HEXLIT states)
// When accumulating a string literal, keep track of escaped single quotes
//
// the purpose of this thing is to be really fast - so if we get confused just give up
// we gain if we handle 80% of the cases, we don't need to handle all of them
/**
 * Fast, best-effort literal extractor used to build a cache key for a statement.
 *
 * <p>The input is scanned once, left to right, by a small state machine. Every literal that
 * is recognized (string, integral, decimal, hex) is replaced by {@code ?} in the "shrunk"
 * text and recorded, in order, in the literal list. Runs of whitespace outside of literals
 * are collapsed to a single space. Whenever the scanner gets confused it gives up (the
 * shrink methods report failure) rather than guess.
 *
 * <p>Known limits that follow directly from the scanner below:
 * <ul>
 * <li>a doubled quote ({@code ''}) is not an escape: it closes one string literal and the
 * second quote immediately opens the next one;</li>
 * <li>a hex literal is only recognized as {@code x'..'} / {@code X'..'} preceded by a space;</li>
 * <li>a digit directly after {@code letter + quote} ("tagged literal") makes the scan give up.</li>
 * </ul>
 *
 * <p>An instance is meant for a single {@link #shrink()} call: the accumulator and the literal
 * list are never reset.
 */
public class CandidateParser {

    private static final char quote1 = '\'';
    private static final char quote2 = '"';
    private static final char escape = '\\';
    private static final char tab = '\t';
    private static final char space = ' ';
    private static final char cr = '\n';

    // states
    private static final int ERROR = -1;
    private static final int NONE = 0;
    private static final int STRINGLIT = 1;
    private static final int INTLIT = 2;
    private static final int DECLIT = 3;
    // some people like storing hex literals, no idea why
    private static final int HEXLIT = 4;
    // fake state - we saw ' ' followed by 'x'/'X' - is the next character a quote?
    private static final int START_HEXLIT = 5;
    // sysbench specific - we go into identifier state
    // when we see some characters, and leave it upon whitespace, or nonintegral character
    private static final int IDENTIFIER = 6;

    /** The trimmed input statement. */
    private final String in;
    /** The shrunk statement; stays null until a scan completes successfully. */
    private String shrunk;
    /** Literals in the order they were encountered. */
    private final List<ExtractedLiteral> literals = new ArrayList<ExtractedLiteral>();
    private int state = NONE;
    /** Index in {@link #in} of the first character of the literal being accumulated, or -1. */
    private int litBegin = -1;
    /** Accumulates the shrunk statement. */
    private final StringBuilder acc;
    /** Previous character (after whitespace normalization). */
    private char pth;
    /** Current character. */
    private char ith;

    /**
     * @param given the statement text; it is trimmed before scanning
     */
    public CandidateParser(String given) {
        in = given.trim();
        acc = new StringBuilder(in.length());
    }

    /** @return the literals extracted so far, in input order */
    public List<ExtractedLiteral> getLiterals() { return literals; }

    /** @return the shrunk statement, or null if no scan has completed successfully */
    public String getShrunk() { return shrunk; }

    /**
     * Shrinks arbitrary text without requiring a recognized statement prefix.
     *
     * @param in the text to shrink
     * @return the shrunk text, the empty string for blank input, or null if {@code in} is
     *         null or the scanner gave up
     */
    public static String shrinkAnything(String in) {
        if (in == null) return null;
        CandidateParser cp = new CandidateParser(in);
        int len = cp.in.length();
        if (len == 0) return "";
        // The scan needs a "previous" character, so it starts at index 1. The first character
        // is copied verbatim up front so that it is not lost from the shrunk text.
        cp.acc.append(cp.in.charAt(0));
        return cp.onlyShrink(1, len) ? cp.getShrunk() : null;
    }

    /**
     * Runs the state machine over {@code in[i, len)}.
     *
     * @param i   index of the first character to scan; must be at least 1
     * @param len exclusive end index
     * @return true if the scan completed, false if the scanner gave up
     */
    private boolean onlyShrink(int i, int len) {
        // ok, so we've detected the type - regular dml
        // let's see if we can find literals
        pth = in.charAt(i - 1);
        while (i < len && state != ERROR) {
            ith = in.charAt(i);
            int stateWas = state;
            switch (state) {
            case NONE:
                maybeStartLiteral(i);
                break;
            case IDENTIFIER:
                continueIdentifier(i);
                break;
            case STRINGLIT:
                maybeAccStringLit(i);
                break;
            case INTLIT:
                maybeAccIntLit(i);
                break;
            case DECLIT:
                maybeAccDecLit(i);
                break;
            case HEXLIT:
                maybeAccHexLit(i);
                break;
            case START_HEXLIT:
                maybeStartHexLit(i);
                break;
            default:
                break;
            }
            // Plain text is only copied when we were, and still are, outside of any literal.
            // normalizeWhitespace() may rewrite ith, so it must be evaluated last.
            boolean wasPlain = stateWas == NONE || stateWas == IDENTIFIER;
            boolean isPlain = state == NONE || state == IDENTIFIER;
            if (wasPlain && isPlain && !normalizeWhitespace())
                acc.append(ith);
            pth = ith;
            i++;
        }
        finish(i);
        if (state == ERROR) return false;
        shrunk = acc.toString();
        return true;
    }

    /**
     * In the NONE state, turns tab/newline into a space.
     *
     * @return true if the current character is a space directly following a space, i.e. it
     *         should be dropped
     */
    private boolean normalizeWhitespace() {
        if (state != NONE) return false;
        if (ith == tab || ith == cr)
            ith = space;
        return ith == space && pth == space;
    }

    /**
     * Shrinks the statement given to the constructor, provided it starts with one of the
     * recognized prefixes. The prefix itself is copied verbatim.
     *
     * @return true on success (see {@link #getShrunk()} and {@link #getLiterals()}), false if
     *         the statement type is not recognized or the scanner gave up
     */
    public boolean shrink() {
        Prefix type = detectType(in);
        if (type == null) return false;
        // detectType guarantees the input is at least as long as the prefix
        int prefixLen = type.characters.length;
        acc.append(in, 0, prefixLen);
        return onlyShrink(prefixLen, in.length());
    }

    /**
     * Handles end of input: a pending numeric literal is emitted, any other pending literal
     * state (unterminated string/hex literal, dangling hex start) is an error.
     *
     * @param i exclusive end index of the scanned input
     */
    private void finish(int i) {
        if (state == ERROR || state == NONE || state == IDENTIFIER) return;
        if (state != INTLIT && state != DECLIT) {
            // didn't see the closing quote (or never got past the hex start), we're confused, give up
            state = ERROR;
            return;
        }
        // out of characters, the number runs to the end of the input
        String lit = in.substring(litBegin, i);
        if (state == INTLIT)
            literals.add(ExtractedLiteral.makeIntegralLiteral(lit, acc.length()));
        else
            literals.add(ExtractedLiteral.makeDecimalLiteral(lit, acc.length()));
        acc.append("?");
    }

    /**
     * Inside a string literal: closes it when the current character is the same quote
     * character that opened it and is not backslash-escaped.
     */
    private void maybeAccStringLit(int i) {
        // only the quote character that opened the literal can close it
        if (ith != in.charAt(litBegin)) return;
        // The quote is escaped iff it is preceded by an odd number of backslashes; an even
        // run is a sequence of escaped backslashes (this is what captures the string '\\').
        int backslashes = 0;
        for (int j = i - 1; j > litBegin && in.charAt(j) == escape; j--)
            backslashes++;
        if ((backslashes & 1) == 1) return;

        String stringlit = in.substring(litBegin, i + 1);
        literals.add(ExtractedLiteral.makeStringLiteral(stringlit, acc.length()));
        state = NONE;
        litBegin = -1;
        acc.append("?");
    }

    /** Inside a hex literal: a single quote closes it, a space is an error. */
    private void maybeAccHexLit(int i) {
        // no need to worry about \\ - so if ith is quote then the hexlit is done
        if (ith == quote1) {
            String hexlit = in.substring(litBegin, i + 1);
            literals.add(ExtractedLiteral.makeHexLiteral(hexlit, acc.length()));
            state = NONE;
            litBegin = -1;
            acc.append("?");
        } else if (ith == space) {
            // hex literals can't have spaces
            state = ERROR;
        }
    }

    /**
     * Directly after a candidate 'x'/'X': a single quote confirms the hex literal, anything
     * else means the x was just the start of an identifier.
     */
    private void maybeStartHexLit(int i) {
        if (ith == quote1) {
            state = HEXLIT;
            // don't change the litBegin
        } else {
            state = IDENTIFIER;
            // the x was held back when we entered START_HEXLIT, so emit it now together with ith
            acc.append(pth);
            acc.append(ith);
            litBegin = -1;
        }
    }

    /** Inside a decimal literal: the first non-digit ends it and is copied after the '?'. */
    private void maybeAccDecLit(int i) {
        if (Character.isDigit(ith)) return;
        String declit = in.substring(litBegin, i);
        literals.add(ExtractedLiteral.makeDecimalLiteral(declit, acc.length()));
        state = NONE;
        litBegin = -1;
        acc.append("?").append(ith);
    }

    /**
     * Inside an integral literal: '.' turns it into a decimal literal, any other non-digit
     * ends it and is copied after the '?'.
     */
    private void maybeAccIntLit(int i) {
        if ('.' == ith) {
            state = DECLIT;
        } else if (!Character.isDigit(ith)) {
            String intlit = in.substring(litBegin, i);
            literals.add(ExtractedLiteral.makeIntegralLiteral(intlit, acc.length()));
            state = NONE;
            litBegin = -1;
            acc.append("?").append(ith);
        }
    }

    /**
     * In the NONE state: decides whether the current character starts an identifier, a hex
     * literal candidate, a string literal or a number.
     */
    private void maybeStartLiteral(int i) {
        // neither string literals nor numeric literals can start right after a letter
        // (this shape gave a slight edge over 13m calls; since this is hot path stuff keep it flat)
        if (Character.isLetter(pth)) return;

        if (Character.isLetter(ith)) {
            // hex literal candidate: space, then x or X (either case)
            if (pth == space && (ith == 'X' || ith == 'x')) {
                state = START_HEXLIT;
                litBegin = i;
            } else {
                state = IDENTIFIER;
            }
            return;
        }

        // a literal cannot start directly after an underscore either
        if (pth == '_') return;

        if (quote1 == ith || quote2 == ith) {
            state = STRINGLIT;
            litBegin = i;
        } else if (Character.isDigit(ith)) {
            // furthermore, disallow letter-quote-number; with fewer than two preceding
            // characters there is nothing that could tag the literal
            boolean tagged = i >= 2
                    && (quote1 == pth || quote2 == pth)
                    && Character.isLetter(in.charAt(i - 2));
            if (tagged) {
                // tagged literal, ignore for now
                state = ERROR;
            } else {
                litBegin = i;
                state = INTLIT;
            }
        }
    }

    /** Inside an identifier: letters, digits and '.' keep us here, anything else leaves. */
    private void continueIdentifier(int i) {
        // we can only leave this state if we see something that isn't a character and isn't a number
        // but not for '.'
        if (ith == '.') return;
        if (!Character.isLetter(ith) && !Character.isDigit(ith))
            state = NONE;
    }

    /** A recognized leading keyword (lower case) and the execution type it maps to. */
    public static class Prefix {
        public final char[] characters;
        public final ExecutionType type;

        public Prefix(String v, ExecutionType et) {
            characters = v.toCharArray();
            type = et;
        }
    }

    private static final Prefix[] recognized = new Prefix[] {
        new Prefix("select", ExecutionType.SELECT),
        new Prefix("insert", ExecutionType.INSERT),
        new Prefix("delete", ExecutionType.DELETE),
        new Prefix("update", ExecutionType.UPDATE),
        new Prefix("commit", ExecutionType.TRANSACTION),
        new Prefix("begin", ExecutionType.TRANSACTION),
        new Prefix("set", ExecutionType.SESSION)
    };

    /**
     * @param in the statement text (not trimmed here)
     * @return true if the text starts with the insert prefix
     */
    public static boolean isInsert(String in) {
        Prefix p = detectType(in);
        if (p == null) return false;
        return p.type == ExecutionType.INSERT;
    }

    /**
     * Matches the start of {@code in}, case-insensitively, against the recognized prefixes.
     * No trimming is done and no word boundary is required after the prefix.
     *
     * @param in the statement text
     * @return the matching prefix, or null if {@code in} is null, shorter than the prefix, or
     *         does not start with any recognized prefix
     */
    public static Prefix detectType(String in) {
        if (in == null) return null;
        for (int d = 0; d < recognized.length; d++) {
            if (startsWithIgnoreCase(in, recognized[d].characters))
                return recognized[d];
        }
        // not dml
        return null;
    }

    /** True if {@code in} is at least as long as {@code prefix} and starts with it, ignoring case. */
    private static boolean startsWithIgnoreCase(String in, char[] prefix) {
        if (in.length() < prefix.length) return false;
        for (int i = 0; i < prefix.length; i++) {
            if (Character.toLowerCase(in.charAt(i)) != prefix[i])
                return false;
        }
        return true;
    }

    /** Ad-hoc timing harness: shrinks one large multi-tuple insert and prints the elapsed time. */
    public static void main(String[] ignored) {
        int tuples = 100000;
        StringBuilder buf = new StringBuilder();
        // specifically from sysbench
        buf.append("INSERT INTO sbtest(k, c, pad) values ");
        for (int i = 0; i < tuples; i++) {
            if (i > 0) buf.append(", ");
            buf.append("(0, ' ', 'qqqqqqqqqqwwwwwwwwwweeeeeeeeeerrrrrrrrrtttttttttt')");
        }
        try {
            CandidateParser cp = new CandidateParser(buf.toString());
            long startAt = System.nanoTime();
            cp.shrink();
            long delta = System.nanoTime() - startAt;
            System.out.println(tuples + " tuples took " + delta/1000 + " microseconds, with " + cp.getLiterals().size() + " literals");
        } catch (Throwable t) {
            t.printStackTrace();
        }
    }
}