package org.rdfhdt.hdt.util.string;

import java.io.IOException;

/**
 * Static helpers for escaping RDF terms to the all-ASCII N-Triples form and
 * for decoding such escaped strings again.
 *
 * <p>In this file's documentation the backslash of a Unicode escape is written
 * as {@code &#92;} because a raw backslash followed by {@code u} is interpreted
 * by the Java compiler as a Unicode escape, even inside comments.
 */
public class UnicodeEscape {

    /** Not instantiable: this class only hosts static helpers. */
    private UnicodeEscape() {}

    /**
     * Checks whether the supplied character is a letter or number
     * according to the N-Triples specification.
     *
     * @param c the character (or code point) to test
     * @return {@code true} if {@code c} is in A-Z, a-z or 0-9
     * @see #isLetter
     * @see #isNumber
     */
    public static boolean isLetterOrNumber(int c) {
        return isLetter(c) || isNumber(c);
    }

    /**
     * Checks whether the supplied character is a letter according to
     * the N-Triples specification. N-Triples letters are A - Z and a - z.
     *
     * @param c the character (or code point) to test
     * @return {@code true} if {@code c} is an ASCII letter
     */
    public static boolean isLetter(int c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }

    /**
     * Checks whether the supplied character is a number according to
     * the N-Triples specification. N-Triples numbers are 0 - 9.
     *
     * @param c the character (or code point) to test
     * @return {@code true} if {@code c} is an ASCII digit
     */
    public static boolean isNumber(int c) {
        return c >= '0' && c <= '9';
    }

    /**
     * Escapes a Unicode string to an all-ASCII character sequence. Any special
     * characters are escaped using backslashes ({@code "} becomes
     * {@code &#92;"}, etc.), and non-ASCII/non-printable characters are escaped
     * using Unicode escapes ({@code &#92;uXXXX} and {@code &#92;UXXXXXXXX}).
     *
     * @param label the term to escape; see
     *        {@link #escapeString(String, Appendable)} for how delimiters
     *        and literal suffixes are treated
     * @return the escaped term
     */
    public static String escapeString(String label) {
        try {
            StringBuilder sb = new StringBuilder(2 * label.length());
            escapeString(label, sb);
            return sb.toString();
        } catch (IOException e) {
            // StringBuilder.append does not declare IOException, so this
            // should be unreachable; keep the cause in case it ever is not.
            throw new AssertionError(e);
        }
    }

    /**
     * Escapes a Unicode string to an all-ASCII character sequence and writes
     * the result to {@code appendable}.
     *
     * <p>Three input shapes are recognised:
     * <ul>
     * <li>{@code <...>}: both angle brackets are written unchanged and the
     *     text between them is escaped;</li>
     * <li>a string starting with {@code "}: the opening quote is written, the
     *     text up to the last {@code "} is escaped, and everything from that
     *     last quote on (closing quote plus language tag or datatype) is
     *     written unchanged;</li>
     * <li>anything else: the whole string is escaped.</li>
     * </ul>
     * An empty string produces no output.
     *
     * <p>Code points above U+FFFF are written as a single
     * {@code &#92;UXXXXXXXX} escape; everything else outside printable ASCII
     * is written as {@code &#92;uXXXX}.
     *
     * @param label the term to escape
     * @param appendable the sink receiving the escaped term
     * @throws IOException if {@code appendable} fails while being written to
     */
    public static void escapeString(String label, Appendable appendable) throws IOException {
        int length = label.length();
        if (length == 0) {
            return;
        }

        int first = 0;
        int last = length;
        char head = label.charAt(0);

        if (length > 1 && head == '<' && label.charAt(length - 1) == '>') {
            // Keep both delimiters: the closing one is emitted by the
            // trailing copy below, so the opening one is emitted here.
            appendable.append('<');
            first = 1;
            last = length - 1;
        } else if (head == '"') {
            appendable.append('"');
            first = 1;
            // The language tag or datatype can only follow the last quote,
            // so that quote is the end of the lexical form.
            int closingQuote = label.lastIndexOf('"');
            if (closingQuote > 0) {
                last = closingQuote;
            }
        }

        // Walk by code point so that surrogate pairs are escaped as one unit.
        // The character at 'last' is either '>' / '"' or the end of the
        // string, so codePointAt never pairs across the boundary.
        int i = first;
        while (i < last) {
            int codePoint = label.codePointAt(i);
            appendEscaped(codePoint, appendable);
            i += Character.charCount(codePoint);
        }

        appendable.append(label, last, length);
    }

    /** Writes one code point to {@code out}, escaped if it is not printable ASCII. */
    private static void appendEscaped(int codePoint, Appendable out) throws IOException {
        switch (codePoint) {
        case '\\':
            out.append("\\\\");
            break;
        case '"':
            out.append("\\\"");
            break;
        case '\n':
            out.append("\\n");
            break;
        case '\r':
            out.append("\\r");
            break;
        case '\t':
            out.append("\\t");
            break;
        default:
            if (codePoint >= 0x10000) {
                out.append("\\U");
                out.append(toHexString(codePoint, 8));
            } else if (codePoint < 0x20 || codePoint >= 0x7F) {
                out.append("\\u");
                out.append(toHexString(codePoint, 4));
            } else {
                out.append((char) codePoint);
            }
            break;
        }
    }

    /**
     * Unescapes an escaped Unicode string. Any Unicode sequences
     * ({@code &#92;uXXXX} and {@code &#92;UXXXXXXXX}) are restored to the
     * code point indicated by the hexadecimal argument and any
     * backslash-escapes ({@code &#92;"}, {@code &#92;&#92;}, {@code &#92;t},
     * {@code &#92;r}, {@code &#92;n}) are decoded to their original form.
     *
     * @param s An escaped Unicode string.
     * @return The unescaped string; {@code s} itself if it contains no backslash.
     * @throws IllegalArgumentException If the supplied string is not a
     *         correctly escaped N-Triples string: a dangling or unknown
     *         backslash escape, a truncated Unicode escape, non-hexadecimal
     *         digits, or a value above U+10FFFF.
     */
    public static String unescapeString(String s) {
        int backSlashIdx = s.indexOf('\\');
        if (backSlashIdx == -1) {
            // No escaped characters found
            return s;
        }

        int startIdx = 0;
        int sLength = s.length();
        StringBuilder sb = new StringBuilder(sLength);

        while (backSlashIdx != -1) {
            sb.append(s, startIdx, backSlashIdx);

            if (backSlashIdx + 1 >= sLength) {
                throw new IllegalArgumentException("Unescaped backslash in: " + s);
            }

            char c = s.charAt(backSlashIdx + 1);
            switch (c) {
            case 't':
                sb.append('\t');
                startIdx = backSlashIdx + 2;
                break;
            case 'r':
                sb.append('\r');
                startIdx = backSlashIdx + 2;
                break;
            case 'n':
                sb.append('\n');
                startIdx = backSlashIdx + 2;
                break;
            case '"':
                sb.append('"');
                startIdx = backSlashIdx + 2;
                break;
            case '\\':
                sb.append('\\');
                startIdx = backSlashIdx + 2;
                break;
            case 'u':
                // four hex digits
                startIdx = appendUnicodeEscape(s, backSlashIdx, 4, sb);
                break;
            case 'U':
                // eight hex digits
                startIdx = appendUnicodeEscape(s, backSlashIdx, 8, sb);
                break;
            default:
                throw new IllegalArgumentException("Unescaped backslash in: " + s);
            }

            backSlashIdx = s.indexOf('\\', startIdx);
        }

        sb.append(s, startIdx, sLength);
        return sb.toString();
    }

    /**
     * Decodes the Unicode escape whose backslash is at {@code backSlashIdx}
     * and appends the resulting code point to {@code sb}.
     *
     * @return the index of the first character after the escape
     */
    private static int appendUnicodeEscape(String s, int backSlashIdx, int digits, StringBuilder sb) {
        int hexStart = backSlashIdx + 2;
        int hexEnd = hexStart + digits;
        if (hexEnd > s.length()) {
            throw new IllegalArgumentException("Incomplete Unicode escape sequence in: " + s);
        }

        int codePoint = 0;
        boolean valid = true;
        for (int i = hexStart; i < hexEnd; i++) {
            int digit = hexValue(s.charAt(i));
            if (digit < 0) {
                valid = false;
                break;
            }
            codePoint = (codePoint << 4) | digit;
        }

        // Eight digits can overflow into a negative int; isValidCodePoint
        // rejects those as well as anything above U+10FFFF.
        if (!valid || !Character.isValidCodePoint(codePoint)) {
            throw new IllegalArgumentException(
                    "Illegal Unicode escape sequence '\\" + s.charAt(backSlashIdx + 1)
                    + s.substring(hexStart, hexEnd) + "' in: " + s);
        }

        sb.appendCodePoint(codePoint);
        return hexEnd;
    }

    /** Returns the value of an ASCII hexadecimal digit, or -1 if {@code ch} is not one. */
    private static int hexValue(char ch) {
        if (ch >= '0' && ch <= '9') {
            return ch - '0';
        }
        if (ch >= 'a' && ch <= 'f') {
            return ch - 'a' + 10;
        }
        if (ch >= 'A' && ch <= 'F') {
            return ch - 'A' + 10;
        }
        return -1;
    }

    /**
     * Converts a decimal value to an upper-case hexadecimal string
     * representation, left-padded with zeros to the specified length.
     * The result is never truncated: if the hexadecimal form is longer than
     * {@code stringLength} it is returned in full.
     *
     * @param decimal A decimal value.
     * @param stringLength The minimum length of the resulting string.
     * @return the zero-padded, upper-case hexadecimal form of {@code decimal}
     */
    public static String toHexString(int decimal, int stringLength) {
        // Locale.ROOT keeps the result independent of the default locale.
        String hexVal = Integer.toHexString(decimal).toUpperCase(java.util.Locale.ROOT);

        StringBuilder sb = new StringBuilder(Math.max(stringLength, hexVal.length()));

        // insert zeros if hexVal has less than stringLength characters:
        for (int i = hexVal.length(); i < stringLength; i++) {
            sb.append('0');
        }

        sb.append(hexVal);
        return sb.toString();
    }
}