/*
* Copyright (c) 2007 BUSINESS OBJECTS SOFTWARE LIMITED
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* * Neither the name of Business Objects nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* StringEncoder.java
* Creation date: (January 10, 2001 3:12:59 PM)
* By: Bo Ilic
*/
package org.openquark.cal.compiler;
/**
* A utility class for converting encoded strings and characters to the underlying string or character.
* The encoding assumed is the encoding for string and character literals as specified in the Java Language
* Specification. This is also the syntax used for string and character literals in CAL.
* Creation date: (January 10, 2001 3:12:59 PM)
* @author Bo Ilic
*/
public final class StringEncoder {
/** ASCII characters have codes 0-127. */
private final static int MAX_ASCII_CODE = 127;
static private class CharIntPair {
private final char c;
private final int i;
CharIntPair(char c, int i) {
this.c = c;
this.i = i;
}
public char getChar() {
return c;
}
public int getInt() {
return i;
}
}
private StringEncoder() {}
/**
* Create a textual representation of a String value that can be parsed in CAL or Java source code as a String literal.
*
* In particular, the special characters newline, carriage return, horizontal tab, backspace, form feed, double quote,
* single quote, and backslash are output in their escaped format.
* Non- ASCII characters i.e. having code >= 128 are output in hex escaped form e.g. "\u1234" etc.
* ISO-control characters (0-31 and 127-159) within the ASCII range are also escaped.
*
* Does the opposite of unencodeString.
* Eg: Turns 'a' + 'b' + '\n' into '"' + 'a' + 'b' + '\' + 'n' + '"'
* Creation date: (06/04/01 8:33:00 AM)
* @param unencodedString String
* @return includes the starting and ending double quote characters.
*/
public static String encodeString(String unencodedString) {
StringBuilder sbEncoded = new StringBuilder("\"");
// Examine each char in the unencodedString, and if it's a special character, encode it and add it to sbEncoded.
// If it's just a plain ol' regular run o' the mill char, then just add it to sbEncoded.
for (int i = 0, unencodedStringLength = unencodedString.length(); i < unencodedStringLength; i++) {
char c = unencodedString.charAt(i);
sbEncoded.append(charToEncodedCharFragment(c));
}
// Need to add ending enclosing double quotes.
sbEncoded.append('\"');
return sbEncoded.toString();
}
/**
* Create a textual representation of a char value that can be parsed in CAL or Java source code as a char literal.
*
* In particular, the special characters newline, carriage return, horizontal tab, backspace, form feed, double quote,
* single quote, and backslash are output in their escaped format.
* Non- ASCII characters i.e. having code >= 128 are output in escaped form e.g. "\u1234" etc.
* ISO-control characters (0-31 and 127-159) within the ASCII range are also escaped.
*
* @param c
* @return - The textual representation for the escape sequence representing the character.
* eg. '\n', or '\uA123', or 'w'. Note that the start and end single quote are included. What is returned
* can directly be parsed as a char literal in CAL or Java source.
*/
public static String encodeChar(char c) {
return new StringBuilder ("\'").append(charToEncodedCharFragment(c)).append('\'').toString();
}
private static String charToEncodedCharFragment(char c) {
switch (c) {
case '\n' : //newline
return "\\n";
case '\r' : //carriage return
return "\\r";
case '\t' : //horizontal tab
return "\\t";
case '\b' : //backspace
return "\\b";
case '\f' : //form feed
return "\\f";
case '\"' : //double quote
return "\\\"";
case '\'' : //single quote
return "\\\'";
case '\\' : //back slash
return "\\\\";
default :
{
if (c > MAX_ASCII_CODE || Character.isISOControl(c)) {
String hs = Integer.toHexString(c);
switch (hs.length()) {
case 1:
return "\\u000" + hs;
case 2:
return "\\u00" + hs;
case 3:
return "\\u0" + hs;
case 4:
return "\\u" + hs;
default:
assert (false);
return null;
}
} else {
return Character.toString(c);
}
}
}
}
/**
* Is the character an octal digit?
* Creation date: (1/10/01 3:39:26 PM)
* @return boolean
* @param c char
*/
static private boolean isOctalDigit(char c) {
return c >= '0' && c <= '7';
}
/**
* Converts a char literal (as obtained from the lexer) from its quoted and escaped form
* to its underlying string value.
* e.g. "'\n'" is converted to '\n'.
*
* Creation date: (1/10/01 2:35:01 PM)
* @return char
* @param encodedChar String
* @exception IllegalArgumentException The exception description.
*/
public static char unencodeChar(String encodedChar) throws IllegalArgumentException {
int encodedCharLength = encodedChar.length();
if (encodedCharLength <= 2 || encodedChar.charAt(0) != '\'' || encodedChar.charAt(encodedCharLength - 1) != '\'') {
throw new IllegalArgumentException();
}
char c = encodedChar.charAt(1);
switch (c) {
case '\'' :
throw new IllegalArgumentException();
case '\\' :
{
//escaped character
CharIntPair charInt = unencodeEscape(encodedChar.substring(2));
if (encodedCharLength > charInt.getInt() + 3) {
throw new IllegalArgumentException();
}
return charInt.getChar();
}
default :
return c;
}
}
/**
* Attempts to parse a character from an escape string and returns the pair consisting of the
* parsed character and the number of characters consumed from the escapeString.
* For example, "naa" returns ('\n', 1) and "u1234abc" returns ('\u1234', 5).
* Creation date: (1/10/01 1:53:28 PM)
* @return CharIntPair the escape character, followed by the number of character consumed from escapeString
* @param escapeString String
* @exception IllegalArgumentException thrown if a char can not be extracted
*/
private static CharIntPair unencodeEscape(String escapeString) throws IllegalArgumentException {
int escapeStringLength = escapeString.length();
if (escapeStringLength == 0) {
throw new IllegalArgumentException();
}
char c = escapeString.charAt(0);
char resultChar;
int nCharsConsumed = 1;
switch (c) {
case 'n' :
resultChar = '\n';
break;
case 'r' :
resultChar = '\r';
break;
case 't' :
resultChar = '\t';
break;
case 'b' :
resultChar = '\b';
break;
case 'f' :
resultChar = '\f';
break;
case '"' :
resultChar = '"';
break;
case '\'' :
resultChar = '\'';
break;
case '\\' :
resultChar = '\\';
break;
case 'u' :
{
//hex control character
//('u')+ HEX_DIGIT HEX_DIGIT HEX_DIGIT HEX_DIGIT
int firstHexDigitIndex = 1;
int lastHexDigitIndexPlusOne = firstHexDigitIndex + 4;
while (escapeString.charAt(firstHexDigitIndex) == 'u') {
firstHexDigitIndex++;
lastHexDigitIndexPlusOne++;
if (escapeStringLength < lastHexDigitIndexPlusOne) {
throw new IllegalArgumentException();
}
}
String hexString = escapeString.substring(firstHexDigitIndex, lastHexDigitIndexPlusOne);
int controlChar = 0;
try {
controlChar = Integer.parseInt(hexString, 16);
} catch (NumberFormatException e) {
throw new IllegalArgumentException();
}
resultChar = (char)controlChar;
nCharsConsumed = lastHexDigitIndexPlusOne;
break;
}
case '0' :
case '1' :
case '2' :
case '3' :
case '4' :
case '5' :
case '6' :
case '7' :
{
//octal control character
//(ZeroToThree OctalDigit OctalDigit) | (OctalDigit OctalDigit) | (OctalDigit)
int octalLength = 1;
if (escapeStringLength >= 2 && isOctalDigit(escapeString.charAt(1))) {
if (escapeStringLength >= 3 && (c >= '0' || c <= '3') && isOctalDigit(escapeString.charAt(2))) {
octalLength = 3;
} else {
octalLength = 2;
}
}
String octalString = escapeString.substring(0, octalLength);
int controlChar = 0;
try {
controlChar = Integer.parseInt(octalString, 8);
} catch (NumberFormatException e) {
throw new IllegalArgumentException();
}
resultChar = (char) controlChar;
nCharsConsumed = octalLength;
break;
}
default :
throw new IllegalArgumentException();
}
return new CharIntPair(resultChar, nCharsConsumed);
}
/**
* Converts a string literal (as obtained from the lexer) from its quoted and escaped form
* to its underlying string value.
* e.g. '"' + 'a' + '\\' + 'n' + 'b' + '"' is converted to 'a' + '\n' + 'b'
*
* Creation date: (1/10/01 9:43:02 AM)
* @return String
* @param encodedString String
* @exception IllegalArgumentException if the string is not a valid encoded string
*/
public static String unencodeString(String encodedString) throws IllegalArgumentException {
int encodedStringLength = encodedString.length();
if (encodedStringLength < 2 || encodedString.charAt(0) != '"' || encodedString.charAt(encodedStringLength - 1) != '"') {
throw new IllegalArgumentException();
}
StringBuilder sb = new StringBuilder();
for (int i = 1; i < encodedStringLength - 1; ++i) {
char c = encodedString.charAt(i);
switch (c) {
case '"' :
throw new IllegalArgumentException();
case '\\' :
{
//escaped character
CharIntPair charInt = unencodeEscape(encodedString.substring(i + 1));
sb.append(charInt.getChar());
i += charInt.getInt();
break;
}
default :
sb.append(c);
break;
}
}
return sb.toString();
}
}