/*******************************************************************************
* Copyright (c) 2012, 2016 Mathias Kunter and others.
* All rights reserved. This program and the accompanying materials
* are made available under the terms of the Eclipse Public License v1.0
* which accompanies this distribution, and is available at
* http://www.eclipse.org/legal/epl-v10.html
*
* Contributors:
* Mathias Kunter - Initial Implementation (Bug 307311)
*******************************************************************************/
package org.eclipse.cdt.dsf.mi.service.command.output;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.text.ParseException;
import java.util.EnumSet;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;
/**
 * The MIStringHandler class provides several static functions to handle C and / or MI strings:
 * parsing of escape notations, re-escaping of non-printable characters, and a best-effort
 * Latin-1 to UTF-8 transcoding.
 * @since 4.1
 */
public class MIStringHandler {
    /**
     * A map of special characters which are used within escape notations to represent a
     * corresponding Unicode code point (i.e. character code).
     */
    // Use a LinkedHashMap to preserve order, so as to get 'e' and not 'E' on reverse lookups.
    private static final Map<Character, Integer> fSpecialCharactersToCodePointMap = new LinkedHashMap<Character, Integer>();
    static {
        fSpecialCharactersToCodePointMap.put('a', 0x07); // Alert (bell) character
        fSpecialCharactersToCodePointMap.put('b', 0x08); // Backspace character
        fSpecialCharactersToCodePointMap.put('e', 0x1B); // GNU extension: Escape character
        fSpecialCharactersToCodePointMap.put('E', 0x1B); // same as 'e'
        fSpecialCharactersToCodePointMap.put('f', 0x0C); // Form feed character
        fSpecialCharactersToCodePointMap.put('n', 0x0A); // New line character
        fSpecialCharactersToCodePointMap.put('r', 0x0D); // Carriage return character
        fSpecialCharactersToCodePointMap.put('t', 0x09); // Horizontal tabulation character
        fSpecialCharactersToCodePointMap.put('v', 0x0B); // Vertical tabulation character
        fSpecialCharactersToCodePointMap.put('\'', 0x27); // Single quotation mark
        fSpecialCharactersToCodePointMap.put('"', 0x22); // Double quotation mark
        fSpecialCharactersToCodePointMap.put('\\', 0x5C); // Backslash
        fSpecialCharactersToCodePointMap.put('?', 0x3F); // Literal question mark
    }

    /** Largest value which may be expressed by an octal escape notation. */
    private static final int MAX_OCTAL_VALUE = 0xFF;

    /**
     * An internal helper enumeration which holds the current status while parsing an escaped
     * text sequence.
     */
    private enum EscapeStatus {
        /** Not inside an escape sequence. */
        NONE,
        /** The initial backslash has been read, the kind of escape is not yet known. */
        BEGIN,
        /** Reading the digits of an octal escape notation. */
        OCTAL_NUMBER,
        /** Reading the digits of a hexadecimal escape notation. */
        HEX_NUMBER,
        /** Reading the 4 digits of a short Unicode escape notation. */
        UNICODE_SHORT_NUMBER,
        /** Reading the 8 digits of a long Unicode escape notation. */
        UNICODE_LONG_NUMBER,
        /** The escape sequence ended and was decoded successfully. */
        VALID,
        /** The escape sequence ended and could not be decoded; it is copied over verbatim. */
        INVALID
    }

    /**
     * An enumeration defining the escape sequences which should be parsed.
     */
    public enum ParseFlags {
        SPECIAL_CHARS,
        OCTAL_NUMBERS,
        HEX_NUMBERS,
        UNICODE_SHORT_NUMBERS,
        UNICODE_LONG_NUMBERS
    }

    /**
     * Translates the given C string into a string suitable for display. This includes handling
     * of escaped characters and different string encodings. This is necessary in order to correctly
     * deal with non-ASCII strings.
     * @param str The C string to translate.
     * @param escapeChars Defines whether non-printable characters should be escaped within
     * the translated string, or not.
     * @return The translated string.
     */
    public static String translateCString(String str, boolean escapeChars) {
        if (escapeChars) {
            // Don't parse the special character escape notations here. We can do this here because
            // we want to keep them in their escaped form anyway, and because the following string
            // transcoding process isn't affected by escaped special chars. By doing so we avoid
            // caring about some nasty details of the special character escaping process: for
            // example, single quotation marks are commonly only escaped within character constants,
            // while double quotation marks are commonly only escaped within string constants. By
            // not parsing the special character escape notations at all here, we just keep the
            // original special character escaping provided by the given MI string.
            str = parseString(str, EnumSet.complementOf(EnumSet.of(ParseFlags.SPECIAL_CHARS)));
        } else {
            // Parse all escaped characters.
            str = parseString(str);
        }
        // Transcode the string in order to handle non-ASCII strings correctly.
        str = transcodeString(str);
        if (escapeChars) {
            // Escape any non-printable characters again, as we want to be able to display them.
            // However, don't escape any printable special chars, as they haven't been parsed before.
            return escapeString(str, false);
        }
        // No escaping necessary here. We however have to make sure that we use the correct line
        // separation character sequence.
        return str.replace("\n", System.getProperty("line.separator", "\n")); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
    }

    /**
     * Returns whether the given character is a special character, or not.
     * @param c The character to test.
     * @return The test result.
     */
    public static boolean isSpecialChar(char c) {
        return fSpecialCharactersToCodePointMap.containsKey(c);
    }

    /**
     * Returns whether the given Unicode code point is a special code point, or not.
     * @param codePoint The Unicode code point to test.
     * @return The test result.
     */
    public static boolean isSpecialCodePoint(int codePoint) {
        return fSpecialCharactersToCodePointMap.containsValue(codePoint);
    }

    /**
     * Parses the given special character into an Unicode code point.
     * @param c The special character to parse.
     * @return The parsed Unicode code point.
     * @throws ParseException Thrown when the given character can't be parsed. This happens when it's
     * not a special character.
     */
    public static int parseSpecialChar(char c) throws ParseException {
        Integer codePoint = fSpecialCharactersToCodePointMap.get(c);
        if (codePoint == null) {
            throw new ParseException("The given character '" + c + "' is not a special character.", 0); //$NON-NLS-1$ //$NON-NLS-2$
        }
        return codePoint;
    }

    /**
     * Parses the given special Unicode code point into a character. If several special characters
     * map to the same code point, the one which was registered first is returned (i.e. 'e' rather
     * than 'E' for the escape character).
     * @param codePoint The special Unicode code point to parse.
     * @return The parsed character.
     * @throws ParseException Thrown when the given Unicode code point can't be parsed. This happens
     * when it's not a special code point.
     */
    public static char parseSpecialCodePoint(int codePoint) throws ParseException {
        for (Entry<Character, Integer> entry : fSpecialCharactersToCodePointMap.entrySet()) {
            if (entry.getValue().intValue() == codePoint) {
                return entry.getKey();
            }
        }
        throw new ParseException("The given Unicode code point " + codePoint + " is not a special code point.", 0); //$NON-NLS-1$ //$NON-NLS-2$
    }

    /**
     * This is an overloaded function which parses all supported escape notations. See the Javadoc
     * of the other function overload for details.
     * @param str The string which should be parsed.
     * @return The parsed string.
     */
    public static String parseString(String str) {
        return parseString(str, EnumSet.allOf(ParseFlags.class));
    }

    /**
     * Parses any escaped characters and replaces them with their corresponding Unicode code points.
     * This function parses all escape notations which are supported by gcc and / or gdb. Those are:
     *
     * <ul>
     * <li>Special char escape notations: a backslash followed by one of the characters
     * a, b, e, E, f, n, r, t, v, single quote, double quote, backslash, or question mark.</li>
     *
     * <li>Octal escape notation: An initial backslash, followed by 1, 2, or 3 octal digits. Values
     * above 0xFF are ignored. Octal escape notations may not use more than 3 octal digits.</li>
     *
     * <li>Hexadecimal escape notation: An initial backslash, followed by an "x" and 1 or more
     * hexadecimal digits. Hexadecimal escape notations may not use more than 4 hexadecimal digits
     * (although gcc accepts hexadecimal escape notations of any arbitrary length).</li>
     *
     * <li>Short Unicode escape notation: An initial backslash, followed by an "u" and exactly 4
     * hexadecimal digits.</li>
     *
     * <li>Long Unicode escape notation: An initial backslash, followed by an "U" and exactly 8
     * hexadecimal digits.</li>
     * </ul>
     *
     * Escape sequences which are malformed, not enabled by the given flags, or which denote a value
     * outside of the allowed range are copied to the result without modification.
     * @param str The string which should be parsed.
     * @param parseFlags The set of escape notations which should be parsed.
     * @return The parsed string.
     */
    public static String parseString(String str, EnumSet<ParseFlags> parseFlags) {
        StringBuilder buffer = new StringBuilder();
        // Holds the raw text of the escape sequence which is currently being read, including the
        // initial backslash, so that it can be copied over verbatim if it turns out to be invalid.
        StringBuilder escapeBuffer = new StringBuilder();
        EscapeStatus escStatus = EscapeStatus.NONE;
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            boolean consumeChar = true;
            boolean isLastChar = i == str.length() - 1;
            switch (escStatus) {
            case NONE:
                if (c == '\\') {
                    // Escaping begins. Reset the escape buffer.
                    escapeBuffer.setLength(0);
                    escapeBuffer.append(c);
                    escStatus = EscapeStatus.BEGIN;
                }
                break;
            case BEGIN: {
                Integer specialCodePoint = parseFlags.contains(ParseFlags.SPECIAL_CHARS)
                        ? fSpecialCharactersToCodePointMap.get(c)
                        : null;
                if (specialCodePoint != null) {
                    buffer.appendCodePoint(specialCodePoint);
                    escStatus = EscapeStatus.VALID;
                } else if (parseFlags.contains(ParseFlags.OCTAL_NUMBERS) && isOctalDigit(c)) {
                    escStatus = EscapeStatus.OCTAL_NUMBER;
                    // Don't consume this character right now - as this wouldn't work if it's the last character.
                    consumeChar = false;
                } else if (parseFlags.contains(ParseFlags.HEX_NUMBERS) && c == 'x') {
                    escStatus = EscapeStatus.HEX_NUMBER;
                } else if (parseFlags.contains(ParseFlags.UNICODE_SHORT_NUMBERS) && c == 'u') {
                    escStatus = EscapeStatus.UNICODE_SHORT_NUMBER;
                } else if (parseFlags.contains(ParseFlags.UNICODE_LONG_NUMBERS) && c == 'U') {
                    escStatus = EscapeStatus.UNICODE_LONG_NUMBER;
                } else {
                    escStatus = EscapeStatus.INVALID;
                }
                if (consumeChar) {
                    escapeBuffer.append(c);
                }
                break;
            }
            case HEX_NUMBER:
                // Only consume this character if it belongs to the escape sequence.
                consumeChar = isHexDigit(c);
                if (consumeChar) {
                    escapeBuffer.append(c);
                }
                // A length of 6 means: backslash, 'x', and the maximum of 4 hexadecimal digits.
                if (!consumeChar || isLastChar || escapeBuffer.length() == 6) {
                    // The escape sequence is terminated. It's invalid unless at least one digit
                    // has been read and the number can be decoded.
                    escStatus = EscapeStatus.INVALID;
                    if (escapeBuffer.length() > 2) {
                        escStatus = decodeNumber(buffer, escapeBuffer.substring(2), 16, Character.MAX_CODE_POINT);
                    }
                }
                break;
            case UNICODE_SHORT_NUMBER:
            case UNICODE_LONG_NUMBER: {
                // Only consume this character if it belongs to the escape sequence.
                consumeChar = isHexDigit(c);
                if (consumeChar) {
                    escapeBuffer.append(c);
                }
                // Backslash, 'u' or 'U', and exactly 4 or 8 hexadecimal digits.
                int finalLength = escStatus == EscapeStatus.UNICODE_SHORT_NUMBER ? 6 : 10;
                if (escapeBuffer.length() == finalLength) {
                    // The escape sequence is terminated. Decode the hexadecimal number.
                    escStatus = decodeNumber(buffer, escapeBuffer.substring(2), 16, Character.MAX_CODE_POINT);
                } else if (!consumeChar || isLastChar) {
                    // The escape sequence is terminated too early and therefore invalid.
                    escStatus = EscapeStatus.INVALID;
                }
                break;
            }
            case OCTAL_NUMBER:
                // Only consume this character if it belongs to the escape sequence.
                consumeChar = isOctalDigit(c);
                if (consumeChar) {
                    escapeBuffer.append(c);
                }
                // A length of 4 means: backslash and the maximum of 3 octal digits.
                if (!consumeChar || isLastChar || escapeBuffer.length() == 4) {
                    // The escape sequence is terminated. It's invalid unless at least one digit
                    // has been read and the number can be decoded.
                    escStatus = EscapeStatus.INVALID;
                    if (escapeBuffer.length() > 1) {
                        escStatus = decodeNumber(buffer, escapeBuffer.substring(1), 8, MAX_OCTAL_VALUE);
                    }
                }
                break;
            default:
                // VALID and INVALID are always reset to NONE before the next iteration starts.
                break;
            }
            if (escStatus == EscapeStatus.NONE) {
                // Current character isn't escaped - copy it over to the destination buffer.
                buffer.append(c);
            } else if (escStatus == EscapeStatus.VALID) {
                // The decoded code point has already been appended.
                escStatus = EscapeStatus.NONE;
            } else if (escStatus == EscapeStatus.INVALID) {
                // Keep the invalid escape sequence in its original form.
                buffer.append(escapeBuffer);
                escStatus = EscapeStatus.NONE;
            }
            if (!consumeChar) {
                // Don't consume the current character - process it again with the new status.
                i--;
            }
        }
        // Check for non-finished escape sequences at the end of the string.
        if (escStatus != EscapeStatus.NONE) {
            buffer.append(escapeBuffer);
        }
        // Convert the buffer into a string and return it.
        return buffer.toString();
    }

    /**
     * Returns whether the given character is a hexadecimal digit (0-9, a-f, A-F).
     */
    private static boolean isHexDigit(char c) {
        return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
    }

    /**
     * Returns whether the given character is an octal digit (0-7).
     */
    private static boolean isOctalDigit(char c) {
        return c >= '0' && c <= '7';
    }

    /**
     * Decodes the given digits and appends the resulting code point to the given buffer, provided
     * that the number is parseable and doesn't exceed the given maximum.
     * @return {@link EscapeStatus#VALID} if a code point was appended, {@link EscapeStatus#INVALID}
     * otherwise (in which case the buffer is left untouched).
     */
    private static EscapeStatus decodeNumber(StringBuilder buffer, String digits, int radix, int maxCodePoint) {
        try {
            int codePoint = Integer.parseInt(digits, radix);
            if (codePoint <= maxCodePoint) {
                buffer.appendCodePoint(codePoint);
                return EscapeStatus.VALID;
            }
        } catch (NumberFormatException e) {
            // The number doesn't fit into an int (e.g. 8 hexadecimal digits with the top bit set).
            // It can't be a valid code point then, so the sequence is reported as invalid below.
        }
        return EscapeStatus.INVALID;
    }

    /**
     * Transcodes the given string. This is done as follows:
     * <p>
     * 1) The given string is encoded into a binary byte buffer using Latin-1.
     * <p>
     * 2) It's tested whether this binary byte buffer is valid UTF-8 (which includes plain ASCII).
     * If this is the case, the binary byte buffer is decoded back into a string and this string is
     * returned. If the string can't be encoded as Latin-1, or the bytes aren't valid UTF-8, the
     * given string is returned without modification because its encoding can't be reliably
     * determined in this case.
     * <p>
     * The most important use case of this function is to transcode a string which is actually UTF-8 but has
     * been incorrectly decoded as Latin-1 instead.
     * @param str The string to transcode. May be <code>null</code>, in which case <code>null</code>
     * is returned.
     * @return The transcoded string.
     */
    public static String transcodeString(String str) {
        if (str == null) {
            return null;
        }
        // Try to transcode the string from Latin-1 to UTF-8 (ASCII doesn't need to be explicitly
        // considered here since Latin-1 is backwards compatible with ASCII). The transcoding will
        // almost certainly only succeed if the string actually *is* encoded in UTF-8. If the
        // transcoding fails, the string is simply left unchanged.
        try {
            // First, try to encode the string as Latin-1 in order to obtain the binary byte
            // representation of the string. This fails for characters above 0xFF.
            CharsetEncoder latin1Encoder = Charset.forName("ISO-8859-1").newEncoder(); //$NON-NLS-1$
            ByteBuffer stringBytes = latin1Encoder.encode(CharBuffer.wrap(str));
            // Next, try to decode the bytes as UTF-8. This fails for malformed byte sequences.
            CharsetDecoder utf8Decoder = Charset.forName("UTF-8").newDecoder(); //$NON-NLS-1$
            return utf8Decoder.decode(stringBytes).toString();
        } catch (CharacterCodingException e) {
            // Expected whenever the string isn't UTF-8 which was misinterpreted as Latin-1.
            return str;
        }
    }

    /**
     * Escapes any non-printable characters as well as the printable special characters single quotation
     * mark, double quotation mark, backslash, and literal question mark within the given string. Supports
     * the entire Unicode code space.
     * @param str The string which should be escaped.
     * @return The escaped string.
     */
    public static String escapeString(String str) {
        return escapeString(str, true);
    }

    /**
     * Escapes any non-printable characters within the given string. Supports the entire Unicode code space.
     * <p>
     * Non-printable characters are written using the special character notation if one exists,
     * otherwise as a 3-digit octal number (up to 0xFF), a 4-digit short Unicode number (up to
     * 0xFFFF), or an 8-digit long Unicode number. The null character is written in its short
     * single-digit form, unless it is directly followed by an octal digit: in that case the full
     * 3-digit form is used, since the following digit would otherwise be read as a part of the
     * escape sequence.
     * @param str The string which should be escaped.
     * @param escapePrintableSpecialChars Defines whether the printable special characters single
     * quotation mark, double quotation mark, backslash, and literal question mark should be
     * escaped as well, or not.
     * @return The escaped string.
     */
    public static String escapeString(String str, boolean escapePrintableSpecialChars) {
        StringBuilder buffer = new StringBuilder();
        int length = str.length();
        int index = 0;
        while (index < length) {
            // Get the current character code point. Note that using the Java "char" data type isn't
            // sufficient here, as it can't handle all Unicode characters.
            int codePoint = str.codePointAt(index);
            // Advance past the code point; index now refers to the following character (if any).
            index += Character.charCount(codePoint);
            // Check the code point type of the character in order to determine whether it's
            // printable or not.
            switch (Character.getType(codePoint)) {
            case Character.LINE_SEPARATOR:
            case Character.PARAGRAPH_SEPARATOR:
            case Character.CONTROL:
            case Character.PRIVATE_USE:
            case Character.SURROGATE:
            case Character.UNASSIGNED:
                // Non-printable character.
                boolean octalDigitFollows = index < length && isOctalDigit(str.charAt(index));
                appendNonPrintable(buffer, codePoint, octalDigitFollows);
                break;
            default:
                // Printable character.
                if (escapePrintableSpecialChars && isSpecialCodePoint(codePoint)) {
                    appendSpecialEscape(buffer, codePoint);
                } else {
                    // Don't escape.
                    buffer.appendCodePoint(codePoint);
                }
                break;
            }
        }
        return buffer.toString();
    }

    /**
     * Appends the escape notation of the given non-printable code point to the given buffer.
     * @param octalDigitFollows Whether the character following the code point within the source
     * string is an octal digit. Only relevant for the null character.
     */
    private static void appendNonPrintable(StringBuilder buffer, int codePoint, boolean octalDigitFollows) {
        if (isSpecialCodePoint(codePoint)) {
            // Escape by using the special character escape notation.
            appendSpecialEscape(buffer, codePoint);
        } else if (codePoint == 0x00 && !octalDigitFollows) {
            // Escape the null character separately - don't use leading zeros. This short form is
            // only unambiguous if no octal digit follows.
            buffer.append("\\0"); //$NON-NLS-1$
        } else if (codePoint <= 0xFF) {
            // Escape by using the octal escape notation. Always use 3 digits, so that a following
            // digit can never be mistaken as a part of the escape sequence.
            buffer.append(String.format("\\%03o", codePoint)); //$NON-NLS-1$
        } else if (codePoint <= 0xFFFF) {
            // Escape by using the short Unicode escape notation.
            buffer.append(String.format("\\u%04x", codePoint)); //$NON-NLS-1$
        } else {
            // Escape by using the long Unicode escape notation.
            buffer.append(String.format("\\U%08x", codePoint)); //$NON-NLS-1$
        }
    }

    /**
     * Appends a backslash followed by the special character which represents the given special
     * code point to the given buffer.
     */
    private static void appendSpecialEscape(StringBuilder buffer, int codePoint) {
        buffer.append('\\');
        try {
            buffer.append(parseSpecialCodePoint(codePoint));
        } catch (ParseException e) {
            // Only reachable if the caller didn't check isSpecialCodePoint() before. Fall back to
            // the plain code point in this case.
            buffer.appendCodePoint(codePoint);
        }
    }
}