/* * Copyright (C) 2009 Archie L. Cobbs. All rights reserved. * Provided as is, as stated in http://jira.codehaus.org/browse/JIBX-346 */ package de.foltin; /** * Encodes/decodes XML-invalid characters in Java strings so they may be * included as XML text. */ public final class StringEncoder { private static final String HEXDIGITS = "0123456789abcdef"; private StringEncoder() { } /** * Encode a string, escaping any invalid XML characters. * * <p> * Invalid characters are escaped using <code>\uNNNN</code> notation * like Java unicode characters, e.g., <code>0x001f</code> would appear in * the encoded string as <code>\u001f</code>. Backslash characters are * themselves encoded with a double backslash. * * @param value * string to encode (possibly null) * @return the encoded version of {@code value}, or {@code null} if * {@code value} was {@code null} * @see #decode */ public static String encode(String value) { if (value == null) return value; StringBuilder buf = new StringBuilder(value.length() + 4); final int limit = value.length(); for (int i = 0; i < limit; i++) { final char ch = value.charAt(i); // Handle escape character if (ch == '\\') { buf.append('\\'); buf.append('\\'); continue; } // If character is an otherwise valid XML character, pass it through // unchanged if (isValidXMLChar(ch)) { buf.append(ch); continue; } // Escape it buf.append('\\'); buf.append('u'); for (int shift = 12; shift >= 0; shift -= 4) buf.append(HEXDIGITS.charAt((ch >> shift) & 0x0f)); } return buf.toString(); } /** * Decode a string encoded by {@link #encode}. * * <p> * The parsing is strict; any ill-formed backslash escape sequence (i.e., * not of the form <code>\uNNNN</code> or <code>\\</code>) will cause an * exception to be thrown. * * @param text * string to decode (possibly null) * @return the decoded version of {@code text}, or {@code null} if * {@code text} was {@code null} * @throws IllegalArgumentException * if {@code text} contains an invalid escape sequence * @see #encode */ public static String decode(String text) { if (text == null) return null; StringBuilder buf = new StringBuilder(text.length()); final int limit = text.length(); for (int i = 0; i < limit; i++) { char ch = text.charAt(i); // Handle unescaped characters if (ch != '\\') { buf.append(ch); continue; } // Get next char if (++i >= limit) throw new IllegalArgumentException( "illegal trailing '\\' in encoded string"); ch = text.charAt(i); // Check for backslash escape if (ch == '\\') { buf.append(ch); continue; } // Must be unicode escape if (ch != 'u') throw new IllegalArgumentException( "illegal escape sequence '\\" + ch + "' in encoded string"); // Decode hex value int value = 0; for (int j = 0; j < 4; j++) { if (++i >= limit) throw new IllegalArgumentException( "illegal truncated '\\u' escape sequence in encoded string"); int nibble = Character.digit(text.charAt(i), 16); if (nibble == -1) { throw new IllegalArgumentException( "illegal escape sequence '" + text.substring(i - j - 2, i - j + 4) + "' in encoded string"); } // assert nibble >= 0 && nibble <= 0xf; value = (value << 4) | nibble; } // Append decodec character buf.append((char) value); } return buf.toString(); } /** * Determine if the given character is a valid XML character according to * the XML 1.0 specification. * * @see <a href="http://www.w3.org/TR/REC-xml/#charsets">The XML 1.0 * Specification</a> */ public static boolean isValidXMLChar(char ch) { return (ch >= '\u0020' && ch <= '\ud7ff') || (ch >= '\ue000' && ch <= '\ufffd'); } }