package org.jrdf.util; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.log4j.Logger; /** * A utility which applies N-Triples escaping. * * @author Andrew Newman * @author Paula Gearon * @version $Revision: 624 $ */ public class EscapeUtil { /** Logger. */ private static final Logger logger = Logger.getLogger(EscapeUtil.class.getName()); /** * A regular expression to pick out characters needing escape from Unicode to * ASCII. A different regular expression is used depending on which version of the JDK is detected - Java 1.4 has * different character support compared with 1.5 and above. * <p/> * This is used by the {@link #escape} method. */ private static Pattern pattern; static { try { if (System.getProperty("java.version").indexOf("1.4") >= 0) { pattern = Pattern.compile("[\\uD800-\\uDBFF][\\uDC00-\\uDFFF]" + "|" + "[\\x00-\\x1F\\x22\\\\\\x7F-\\uFFFF]"); } else { pattern = Pattern.compile("[\uD800\uDC00-\uDBFF\uDFFF]" + "|" + "[\\x00-\\x1F\\x22\\\\\\x7F-\\uFFFF]"); } } catch (Exception e) { logger.error("Unable to initialize Regex pattern", e); } } /** * Base UTF Code point. */ private static final int UTF_BASE_CODEPOINT = 0x10000; /** * How shift to get UTF-16 to character codes. */ private static final int CHARACTER_CODE_OFFSET = 0x3FF; /** * How many characters at a time to decode for 8 bit encoding. */ private static final int CHARACTER_LENGTH_8_BIT = 11; /** * How many characters at a time to decode for 16 bit encoding. */ private static final int CHARACTER_LENGTH_16_BIT = 7; private EscapeUtil() { } /** * Escapes a string literal to a string that is N-Triple escaped. * * @param string a string to escape, never <code>null</code>. * @return a version of the <var>string</var> with N-Triples escapes applied. */ public static final String escape(String string) { assert null != string; // Obtain a fresh matcher Matcher matcher = pattern.matcher(string); // Try to short-circuit the whole process -- maybe nothing needs escaping? if (!matcher.find()) { return string; } // Perform escape character substitutions on each match found by the // matcher, accumulating the escaped text into a stringBuffer StringBuffer stringBuffer = new StringBuffer(); do { // The escape text with which to replace the current match String escapeString; // Depending of the character sequence we're escaping, determine an // appropriate replacement String groupString = matcher.group(); switch (groupString.length()) { case 1: // 16-bit characters requiring escaping switch (groupString.charAt(0)) { case '\t': // tab escapeString = "\\\\t"; break; case '\n': // newline escapeString = "\\\\n"; break; case '\r': // carriage return escapeString = "\\\\r"; break; case '"': // quote escapeString = "\\\\\\\""; break; case '\\': // backslash escapeString = "\\\\\\\\"; break; default: // other characters use 4-digit hex escapes String hexString = Integer.toHexString(groupString.charAt(0)).toUpperCase(); escapeString = "\\\\u0000".substring(0, CHARACTER_LENGTH_16_BIT - hexString.length()) + hexString; assert CHARACTER_LENGTH_16_BIT == escapeString.length(); assert escapeString.startsWith("\\\\u"); break; } break; case 2: // surrogate pairs are represented as 8-digit hex escapes assert Character.SURROGATE == Character.getType(groupString.charAt(0)); assert Character.SURROGATE == Character.getType(groupString.charAt(1)); int highSurrogate = ((groupString.charAt(0) & CHARACTER_CODE_OFFSET) << 10); int lowSurrogate = (groupString.charAt(1) & CHARACTER_CODE_OFFSET); String hexString = Integer.toHexString(highSurrogate + lowSurrogate + UTF_BASE_CODEPOINT). toUpperCase(); escapeString = "\\\\U00000000".substring(0, CHARACTER_LENGTH_8_BIT - hexString.length()) + hexString; assert CHARACTER_LENGTH_8_BIT == escapeString.length(); assert escapeString.startsWith("\\\\U00") : "Expected a start of \\\\U00, but got " + escapeString; break; default: throw new Error("Escape sequence " + groupString + " has no handler"); } assert null != escapeString; // Having determined an appropriate escapeString, add it to the // stringBuffer matcher.appendReplacement(stringBuffer, escapeString); } while (matcher.find()); // Finish off by appending any remaining text that didn't require escaping, // and return the assembled buffer matcher.appendTail(stringBuffer); return stringBuffer.toString(); } /** * Escapes a string which contains a UTF-8 encoding in the internal array of char. * If a UTF-8 encoding is found to be invalid, then this will drop back to * escaping the data as a normal string. Escaping is performed with the NTriples * encoding recommendation: * <a href="http://www.w3.org/TR/2004/REC-rdf-testcases-20040210/#ntrip_strings">§3.2</a> * @param string The string to escape. * @return An escaped version of the string. */ public static final String escapeUTF8(String string) { assert null != string; // Perform escape character substitutions on each match found by the // matcher, accumulating the escaped text into a stringBuilder StringBuilder buffer = new StringBuilder(); try { int i = 0; while (i < string.length()) { char c = string.charAt(i); int bytes = getByteCount(c); if (bytes == 4) { int codepoint = getCodepoint(string, i, c); buffer.append(String.format("\\U%08X", codepoint)); } else { if (bytes != 1) c = getChar(string, i, bytes, c); switch (c) { case 0x9: buffer.append("\\t"); break; case 0xA: buffer.append("\\n"); break; case 0xD: buffer.append("\\r"); break; case 0x22: buffer.append("\\\""); break; case 0x5C: buffer.append("\\\\"); break; default: if (c <= 0x1F || c >= 0x7F) { buffer.append(String.format("\\u%04X", (int)c)); } else { buffer.append(c); } } } i += bytes; } return buffer.toString(); } catch (Exception e) { // This is not a sequence of UTF-8 characters. Fall back to the old escape algorithm. return escape(string); } } /** * Determine the number of characters in a UTF-8 sequence, based on the start of the sequence. * @param c The first byte from the sequence, held in a char. * @return The number of bytes in the sequence. * @throws IllegalArgumentException If the bit pattern in the character does not represent a valid sequence. */ static final int getByteCount(char c) { if ((c & 0xFF80) == 0) return 1; if ((c & 0xFFE0) == 0xC0) return 2; if ((c & 0xFFF0) == 0xE0) return 3; if ((c & 0xFFF8) != 0xF0) throw new IllegalArgumentException("Not a character from a UTF-8 sequence."); return 4; } /** * Calculate the codepoint (a character that doesn't fit into a char) represented * by a 4 byte UTF-8 encoding. * @param s The string containing the encoding. Each char in the string contains * a single byte from the sequence. * @param offset The start of the 4 byte sequence. * @param startChar The first byte (retrieved as a char) in the sequence. * This is identical to s.charAt(offset) but this was already called * for {@link #getByteCount(char)}, so we reuse it here. * @return The Unicode codepoint represented by the 4 byte sequence. * @throws IllegalArgumentException If the bit pattern in the character does not represent a valid sequence. */ static final int getCodepoint(String s, int offset, char startChar) { int secondChar = s.charAt(offset + 1); int thirdChar = s.charAt(offset + 2); int fourthChar = s.charAt(offset + 3); // byte sequence is: 11110zzz, 10zzyyyy, 10yyyyxx, 10xxxxxx // check that the trailing bytes all start correctly if ((secondChar & 0xC0) != 0x80 || (thirdChar & 0xC0) != 0x80 || (fourthChar & 0xC0) != 0x80) { throw new IllegalArgumentException("Not a character from a UTF-8 sequence."); } int x = fourthChar & 0x3F; int yx = thirdChar & 0x3F; int zy = secondChar & 0x3F; int z = (startChar & 0x07) << 2 | zy >> 4; x |= (yx & 0x03) << 6; int y = yx >> 2 | (zy & 0x0F) << 4; return (z << 16) | (y << 8) | x; } /** * Calculate the character represented by a 2 byte or 3 byte UTF-8 encoding. * @param s The string containing the encoding. Each char in the string contains * a single byte from the sequence. * @param offset The start of the 2 or 3 byte sequence. * @param count The number of bytes in the sequence * (already determined through {@link #getByteCount(char)}). * @param startChar The first byte (retrieved as a char) in the sequence. * This is identical to s.charAt(offset) but this was already called * for {@link #getByteCount(char)}, so we reuse it here. * @return The Unicode character represented by the 2 or 3 byte sequence. */ static final char getChar(String s, int offset, int count, char startChar) { assert count == 2 || count == 3; int lastPos = offset + count - 1; int lastChar = s.charAt(lastPos); // check that the last byte matches 10xxxxxx if ((lastChar & 0xC0) != 0x80) throw new IllegalArgumentException("Not a character from a UTF-8 sequence."); int x = lastChar & 0x3F; int yx; int y; if (count == 2) { // 2 byte sequence. First byte is 110yyyxx, second is 10xxxxxx yx = startChar & 0x3F; y = yx >> 2; } else { // 3 byte sequence. First byte is 1110yyyy, Second byte is 10yyyyxx int secondChar = s.charAt(offset + 1); // check that second byte starts correctly if ((secondChar & 0xC0) != 0x80) throw new IllegalArgumentException("Not a character from a UTF-8 sequence."); yx = secondChar & 0x3F; y = (yx >> 2) | (startChar & 0x0F) << 4; } x |= (yx & 0x03) << 6; return (char)(y << 8 | x); } }