/* * Copyright (c) 2007 BUSINESS OBJECTS SOFTWARE LIMITED * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * * Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * * * Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * * Neither the name of Business Objects nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * StringEncoder.java * Creation date: (January 10, 2001 3:12:59 PM) * By: Bo Ilic */ package org.openquark.cal.compiler; /** * A utility class for converting encoded strings and characters to the underlying string or character. * The encoding assumed is the encoding for string and character literals as specified in the Java Language * Specification. This is also the syntax used for string and character literals in CAL. * Creation date: (January 10, 2001 3:12:59 PM) * @author Bo Ilic */ public final class StringEncoder { /** ASCII characters have codes 0-127. */ private final static int MAX_ASCII_CODE = 127; static private class CharIntPair { private final char c; private final int i; CharIntPair(char c, int i) { this.c = c; this.i = i; } public char getChar() { return c; } public int getInt() { return i; } } private StringEncoder() {} /** * Create a textual representation of a String value that can be parsed in CAL or Java source code as a String literal. * * In particular, the special characters newline, carriage return, horizontal tab, backspace, form feed, double quote, * single quote, and backslash are output in their escaped format. * Non- ASCII characters i.e. having code >= 128 are output in hex escaped form e.g. "\u1234" etc. * ISO-control characters (0-31 and 127-159) within the ASCII range are also escaped. * * Does the opposite of unencodeString. * Eg: Turns 'a' + 'b' + '\n' into '"' + 'a' + 'b' + '\' + 'n' + '"' * Creation date: (06/04/01 8:33:00 AM) * @param unencodedString String * @return includes the starting and ending double quote characters. */ public static String encodeString(String unencodedString) { StringBuilder sbEncoded = new StringBuilder("\""); // Examine each char in the unencodedString, and if it's a special character, encode it and add it to sbEncoded. // If it's just a plain ol' regular run o' the mill char, then just add it to sbEncoded. for (int i = 0, unencodedStringLength = unencodedString.length(); i < unencodedStringLength; i++) { char c = unencodedString.charAt(i); sbEncoded.append(charToEncodedCharFragment(c)); } // Need to add ending enclosing double quotes. sbEncoded.append('\"'); return sbEncoded.toString(); } /** * Create a textual representation of a char value that can be parsed in CAL or Java source code as a char literal. * * In particular, the special characters newline, carriage return, horizontal tab, backspace, form feed, double quote, * single quote, and backslash are output in their escaped format. * Non- ASCII characters i.e. having code >= 128 are output in escaped form e.g. "\u1234" etc. * ISO-control characters (0-31 and 127-159) within the ASCII range are also escaped. * * @param c * @return - The textual representation for the escape sequence representing the character. * eg. '\n', or '\uA123', or 'w'. Note that the start and end single quote are included. What is returned * can directly be parsed as a char literal in CAL or Java source. */ public static String encodeChar(char c) { return new StringBuilder ("\'").append(charToEncodedCharFragment(c)).append('\'').toString(); } private static String charToEncodedCharFragment(char c) { switch (c) { case '\n' : //newline return "\\n"; case '\r' : //carriage return return "\\r"; case '\t' : //horizontal tab return "\\t"; case '\b' : //backspace return "\\b"; case '\f' : //form feed return "\\f"; case '\"' : //double quote return "\\\""; case '\'' : //single quote return "\\\'"; case '\\' : //back slash return "\\\\"; default : { if (c > MAX_ASCII_CODE || Character.isISOControl(c)) { String hs = Integer.toHexString(c); switch (hs.length()) { case 1: return "\\u000" + hs; case 2: return "\\u00" + hs; case 3: return "\\u0" + hs; case 4: return "\\u" + hs; default: assert (false); return null; } } else { return Character.toString(c); } } } } /** * Is the character an octal digit? * Creation date: (1/10/01 3:39:26 PM) * @return boolean * @param c char */ static private boolean isOctalDigit(char c) { return c >= '0' && c <= '7'; } /** * Converts a char literal (as obtained from the lexer) from its quoted and escaped form * to its underlying string value. * e.g. "'\n'" is converted to '\n'. * * Creation date: (1/10/01 2:35:01 PM) * @return char * @param encodedChar String * @exception IllegalArgumentException The exception description. */ public static char unencodeChar(String encodedChar) throws IllegalArgumentException { int encodedCharLength = encodedChar.length(); if (encodedCharLength <= 2 || encodedChar.charAt(0) != '\'' || encodedChar.charAt(encodedCharLength - 1) != '\'') { throw new IllegalArgumentException(); } char c = encodedChar.charAt(1); switch (c) { case '\'' : throw new IllegalArgumentException(); case '\\' : { //escaped character CharIntPair charInt = unencodeEscape(encodedChar.substring(2)); if (encodedCharLength > charInt.getInt() + 3) { throw new IllegalArgumentException(); } return charInt.getChar(); } default : return c; } } /** * Attempts to parse a character from an escape string and returns the pair consisting of the * parsed character and the number of characters consumed from the escapeString. * For example, "naa" returns ('\n', 1) and "u1234abc" returns ('\u1234', 5). * Creation date: (1/10/01 1:53:28 PM) * @return CharIntPair the escape character, followed by the number of character consumed from escapeString * @param escapeString String * @exception IllegalArgumentException thrown if a char can not be extracted */ private static CharIntPair unencodeEscape(String escapeString) throws IllegalArgumentException { int escapeStringLength = escapeString.length(); if (escapeStringLength == 0) { throw new IllegalArgumentException(); } char c = escapeString.charAt(0); char resultChar; int nCharsConsumed = 1; switch (c) { case 'n' : resultChar = '\n'; break; case 'r' : resultChar = '\r'; break; case 't' : resultChar = '\t'; break; case 'b' : resultChar = '\b'; break; case 'f' : resultChar = '\f'; break; case '"' : resultChar = '"'; break; case '\'' : resultChar = '\''; break; case '\\' : resultChar = '\\'; break; case 'u' : { //hex control character //('u')+ HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT int firstHexDigitIndex = 1; int lastHexDigitIndexPlusOne = firstHexDigitIndex + 4; while (escapeString.charAt(firstHexDigitIndex) == 'u') { firstHexDigitIndex++; lastHexDigitIndexPlusOne++; if (escapeStringLength < lastHexDigitIndexPlusOne) { throw new IllegalArgumentException(); } } String hexString = escapeString.substring(firstHexDigitIndex, lastHexDigitIndexPlusOne); int controlChar = 0; try { controlChar = Integer.parseInt(hexString, 16); } catch (NumberFormatException e) { throw new IllegalArgumentException(); } resultChar = (char)controlChar; nCharsConsumed = lastHexDigitIndexPlusOne; break; } case '0' : case '1' : case '2' : case '3' : case '4' : case '5' : case '6' : case '7' : { //octal control character //(ZeroToThree OctalDigit OctalDigit) | (OctalDigit OctalDigit) | (OctalDigit) int octalLength = 1; if (escapeStringLength >= 2 && isOctalDigit(escapeString.charAt(1))) { if (escapeStringLength >= 3 && (c >= '0' || c <= '3') && isOctalDigit(escapeString.charAt(2))) { octalLength = 3; } else { octalLength = 2; } } String octalString = escapeString.substring(0, octalLength); int controlChar = 0; try { controlChar = Integer.parseInt(octalString, 8); } catch (NumberFormatException e) { throw new IllegalArgumentException(); } resultChar = (char) controlChar; nCharsConsumed = octalLength; break; } default : throw new IllegalArgumentException(); } return new CharIntPair(resultChar, nCharsConsumed); } /** * Converts a string literal (as obtained from the lexer) from its quoted and escaped form * to its underlying string value. * e.g. '"' + 'a' + '\\' + 'n' + 'b' + '"' is converted to 'a' + '\n' + 'b' * * Creation date: (1/10/01 9:43:02 AM) * @return String * @param encodedString String * @exception IllegalArgumentException if the string is not a valid encoded string */ public static String unencodeString(String encodedString) throws IllegalArgumentException { int encodedStringLength = encodedString.length(); if (encodedStringLength < 2 || encodedString.charAt(0) != '"' || encodedString.charAt(encodedStringLength - 1) != '"') { throw new IllegalArgumentException(); } StringBuilder sb = new StringBuilder(); for (int i = 1; i < encodedStringLength - 1; ++i) { char c = encodedString.charAt(i); switch (c) { case '"' : throw new IllegalArgumentException(); case '\\' : { //escaped character CharIntPair charInt = unencodeEscape(encodedString.substring(i + 1)); sb.append(charInt.getChar()); i += charInt.getInt(); break; } default : sb.append(c); break; } } return sb.toString(); } }