/** * VMware Continuent Tungsten Replicator * Copyright (C) 2015 VMware, Inc. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * Initial developer(s): Robert Hodges * Contributor(s): */ package com.continuent.tungsten.common.parsing.bytes; import java.io.UnsupportedEncodingException; /** * Utility class to translate MySQL statements from a native charset to Java * Unicode strings accounting for introducers for binary and alternative * character sets. Syntax for MySQL strings is described in <a * href="http://dev.mysql.com/doc/refman/5.1/en/string-syntax.html"> MySQL * online documentation</a>. A typical example looks like the following: * <p> * <code><pre> * INSERT INTO `storage'_binary'` VALUES (25, 'col_binary', _binary'\0\0\0\0') * </pre></code> This string illustrates some of the potential ambiguities in * string translation. To avoid confusion we implement a full tokenizer that * ignores data embedded in normal strings or comments. We thus translate the * preceding string into the following, where binary data are replaced by a * hexadecimal string format: * <p> * <code><pre> * INSERT INTO `storage'_binary'` VALUES (25, 'col_binary', _binary x'00000000') * </pre></code> The translation is based on state machines according to the * following principles. * <p> * <ul> * <li>Binary and alternative charset strings denoted by introducers of the form * _<introducer>'value' or _<introducer>"value" are converted to hex * strings that can translate safely to Unicode.</li> * <li>All bytes outside of such strings are translated using the character set * assigned when creating the MySQLStatementTranslator instance.</li> * <li>Values within ordinary strings starting with backtick (`), single, and * double quotes excluded from parsing for introducers. The same applies to * values within comments.</li> * </ul> * Performance is an important consideration in the translation algorithm as * binary strings in particular are potentially quite large. The parsing and * translation processing uses pointers into the byte string to minimize object * creation. The translation values for byte strings are pre-computed strings. * The performance overhead of parsing + translation is about 10% over unparsed * string translation. * <p> * Finally, it should be noted that translation into the safe hex format doubles * the size of binary strings. Users should expect to double memory allocations * accordingly, including MySQL specific settings like max_packet_size, which * sets the maximum size of a single client request. * * @author <a href="mailto:robert.hodges@continuent.com">Robert Hodges</a> * @version 1.0 */ public class MySQLStatementTranslator { // Character set names used for introducers. private static String charsetNames[] = {"armscii8", "ascii", "big5", "binary", "cp1250", "cp1251", "cp1256", "cp1257", "cp850", "cp852", "cp866", "cp932", "dec8", "eucjpms", "euckr", "gb2312", "gbk", "geostd8", "greek", "hebrew", "hp8", "keybcs2", "koi8r", "koi8u", "latin1", "latin2", "latin5", "latin7", "macce", "macroman", "sjis", "swe7", "tis620", "ucs2", "ujis", "utf8"}; // Character string and comment fragments. private static final String UNDERSCORE = "_"; private static final String SINGLE_QUOTE = "'"; private static final String DOUBLE_QUOTE = "\""; private static final String BACKTICK = "`"; private static final String ESCAPE = "\\"; private static final String COMMENT_START = "/*"; private static final String COMMENT_END = "*/"; // Tokens private static final int TOK_INTRO_SINGLE_QUOTE = 1; private static final int TOK_INTRO_DOUBLE_QUOTE = 2; private static final int TOK_SINGLE_QUOTE = 3; private static final int TOK_DOUBLE_QUOTE = 4; private static final int TOK_BACKTICK = 5; private static final int TOK_COMMENT_START = 6; private static final int TOK_COMMENT_END = 7; // Charset state machines. private String charset; private ByteTranslationStateMachine textFsm; private ByteTranslationStateMachine embeddedStringFsm; private ByteTranslationStateMachine normalStringFsm; private ByteTranslationStateMachine commentFsm; // Escape characters. private byte char_0; private byte char_b; private byte char_n; private byte char_r; private byte char_t; private byte char_Z; // Character lengths. private int singleQuoteLength; private int doubleQuoteLength; // Translation table for byte values. private String[] byteXlationTable = new String[256]; public MySQLStatementTranslator(String charset) throws UnsupportedEncodingException { this.charset = charset; setup(); } // Set up state machines. private void setup() throws UnsupportedEncodingException { // Set up text translation machine with single and double quote // introducers, start characters for normal strings, and comment // start sequence. textFsm = new ByteTranslationStateMachine(); for (String charsetName : charsetNames) { // Add single and double quote versions of next introducer. String introSingleQuote = UNDERSCORE + charsetName + SINGLE_QUOTE; String introDoubleQuote = UNDERSCORE + charsetName + DOUBLE_QUOTE; String substitute = UNDERSCORE + charsetName; textFsm.load(introSingleQuote.getBytes(charset), TOK_INTRO_SINGLE_QUOTE, substitute.getBytes(charset), false); textFsm.load(introDoubleQuote.getBytes(charset), TOK_INTRO_DOUBLE_QUOTE, substitute.getBytes(charset), false); } textFsm.load(SINGLE_QUOTE.getBytes(charset), TOK_SINGLE_QUOTE, null, false); textFsm.load(DOUBLE_QUOTE.getBytes(charset), TOK_DOUBLE_QUOTE, null, false); textFsm.load(BACKTICK.getBytes(charset), TOK_BACKTICK, null, false); textFsm.load(COMMENT_START.getBytes(charset), TOK_COMMENT_START, null, false); // Set up embedded string state machine with quote characters and // escape sequences. embeddedStringFsm = new ByteTranslationStateMachine(); embeddedStringFsm.load(SINGLE_QUOTE.getBytes(charset), TOK_SINGLE_QUOTE, SINGLE_QUOTE.getBytes(charset), false); embeddedStringFsm.load(DOUBLE_QUOTE.getBytes(charset), TOK_DOUBLE_QUOTE, DOUBLE_QUOTE.getBytes(charset), false); embeddedStringFsm.load(ESCAPE.getBytes(charset), -1, null, true); // Set up state machine for normal strings. normalStringFsm = new ByteTranslationStateMachine(); normalStringFsm.load(SINGLE_QUOTE.getBytes(charset), TOK_SINGLE_QUOTE, null, false); normalStringFsm.load(DOUBLE_QUOTE.getBytes(charset), TOK_DOUBLE_QUOTE, null, false); normalStringFsm.load(BACKTICK.getBytes(charset), TOK_BACKTICK, null, false); normalStringFsm.load(ESCAPE.getBytes(charset), -1, null, true); commentFsm = new ByteTranslationStateMachine(); commentFsm.load(COMMENT_END.getBytes(charset), TOK_COMMENT_END, null, false); commentFsm.load(ESCAPE.getBytes(charset), -1, null, true); // Following are standard escape sequences for embedded strings. char_0 = toSingleByte(charset, "0"); char_b = toSingleByte(charset, "b"); char_n = toSingleByte(charset, "n"); char_r = toSingleByte(charset, "r"); char_t = toSingleByte(charset, "t"); char_Z = toSingleByte(charset, "Z"); // Populate byte string translation table. for (int i = 0; i < 256; i++) { byte b = (byte) i; byteXlationTable[i] = String.format("%02X", b); } // Compute lengths of single and double quote byte strings in current // charset. singleQuoteLength = new String(SINGLE_QUOTE).getBytes(charset).length; doubleQuoteLength = new String(DOUBLE_QUOTE).getBytes(charset).length; } public String toJavaString(byte[] bytes, int offset, int length) throws UnsupportedEncodingException { // Set up translation buffer data. CharacterTranslationBuffer ctb = new CharacterTranslationBuffer(); ctb.load(bytes, offset, length, charset); textFsm.init(); // Loop. while (ctb.hasNext()) { ByteState state = textFsm.add(ctb.next()); if (state == ByteState.ACCEPTED) { int token = textFsm.getToken(); switch (token) { case TOK_INTRO_SINGLE_QUOTE : // Translate up to single quote char and process // embedded string. ctb.translateAndAppendPending(singleQuoteLength); processEmbeddedString(ctb, embeddedStringFsm, charset, TOK_SINGLE_QUOTE); break; case TOK_INTRO_DOUBLE_QUOTE : // Translate up to double quote char and process // embedded string. ctb.translateAndAppendPending(doubleQuoteLength); processEmbeddedString(ctb, embeddedStringFsm, charset, TOK_DOUBLE_QUOTE); break; case TOK_SINGLE_QUOTE : case TOK_DOUBLE_QUOTE : case TOK_BACKTICK : ctb.translateAndAppendPending(0); processNormalString(ctb, normalStringFsm, charset, token); break; case TOK_COMMENT_START : ctb.translateAndAppendPending(0); processComment(ctb, commentFsm, charset, TOK_COMMENT_END); break; } } } ctb.translateAndAppendPending(0); // Return the translated output. return ctb.getOutput(); } // Process an embedded string, handling escape characters and correctly // recognizing the terminating string. private void processEmbeddedString(CharacterTranslationBuffer ctb, ByteTranslationStateMachine stringFsm, String charset, int terminatingToken) throws UnsupportedEncodingException { stringFsm.init(); ctb.append(" x'"); while (ctb.hasNext()) { byte c = ctb.next(); ByteState state = stringFsm.add(c); if (state == ByteState.ACCEPTED) { int token = stringFsm.getToken(); if (token == terminatingToken) { ctb.appendAndClearPending(new String(stringFsm .getSubstitute(), charset)); break; } else if (stringFsm.isSubstitute()) { for (byte b : stringFsm.getSubstitute()) ctb.appendAndClearPending(byteToHexString(b)); } else { ctb.appendAndClearPending(byteToHexString(c)); } } else if (state == ByteState.ESCAPE) { // Process escape sequences for MySQL. String escapedValue; if (c == char_0) escapedValue = "00"; // null; else if (c == char_b) escapedValue = "08"; // backspace else if (c == char_n) escapedValue = "0A"; // new line else if (c == char_r) escapedValue = "0D"; // carriage return else if (c == char_t) escapedValue = "09"; // tab else if (c == char_Z) escapedValue = "1A"; // ^Z, means EOF on Windows else escapedValue = byteToHexString(c); ctb.appendAndClearPending(escapedValue); } } } // Process a normal string or comment. We just ignore all characters // through the end of the string. private void processNormalString(CharacterTranslationBuffer ctb, ByteTranslationStateMachine normalStringFsm, String charset, int terminatingToken) { normalStringFsm.init(); while (ctb.hasNext()) { byte c = ctb.next(); ByteState state = normalStringFsm.add(c); if (state == ByteState.ACCEPTED && normalStringFsm.getToken() == terminatingToken) { break; } } } // Process a comment. We just ignore all characters // through the end of the comment. private void processComment(CharacterTranslationBuffer ctb, ByteTranslationStateMachine commentFsm, String charset, int terminatingToken) { commentFsm.init(); while (ctb.hasNext()) { byte c = ctb.next(); ByteState state = commentFsm.add(c); if (state == ByteState.ACCEPTED && commentFsm.getToken() == terminatingToken) { break; } } } // Translate a byte to hex representation. private String byteToHexString(byte b) { return byteXlationTable[(int) b & 0xFF]; } // Convert String to single byte. private byte toSingleByte(String charset, String c) throws UnsupportedEncodingException { byte[] bytes = c.getBytes(charset); if (bytes.length > 1) throw new UnsupportedEncodingException( "Escape character must be single byte: " + c); else return bytes[0]; } }