/******************************************************************************* * Copyright (c) 2012, 2015 Oracle and/or its affiliates. All rights reserved. * This program and the accompanying materials are made available under the * terms of the Eclipse Public License v1.0 and Eclipse Distribution License v. 1.0 * which accompanies this distribution. * The Eclipse Public License is available at http://www.eclipse.org/legal/epl-v10.html * and the Eclipse Distribution License is available at * http://www.eclipse.org/org/documents/edl-v10.php. * * Contributors: * Oracle - initial API and implementation * ******************************************************************************/ package org.eclipse.persistence.jpa.jpql.tools.utility; import java.util.HashMap; import java.util.Map; /** * This converter handles references when dealing with text or markup in an XML document. Those * references (escape characters) are defined in ISO-8859-1 Reference. * <p> * The conversion supports both converting a numeric character reference (&#nnnn; where nnnn is * the code point in decimal form or &xhhhh; where hhhh is the code point in hexadecimal point) * and a character entity reference (&name; where name is the case-sensitive name of the entity). * * @version 2.5 * @since 2.5 * @author Pascal Filion */ @SuppressWarnings("nls") public final class XmlEscapeCharacterConverter { /** * The entity name for ampersand: <b>&</b>. */ public static final String AMPERSAND_ENTITY_NAME = "&"; /** * The entity name for apostrophe: <b>&apos;</b>. */ public static final String APOSTROPHE_ENTITY_NAME = "'"; /** * The map of symbol mapped to the unicode character. */ private static final Map<String, String> dictionary = buildDictionary(); /** * The entity name for greater-than symbol: <b>></b>. */ public static final String GREATER_THAN_ENTITY_NAME = ">"; /** * The entity name for less-than symbol: <b><</b>. */ public static final String LESS_THAN_ENTITY_NAME = "<"; /** * The entity name for quotation mark: <b>"</b>. */ public static final String QUOTATION_MARK_NAME = """; /** * Cannot instantiate <code>XmlEscapeCharacterConverter</code>. */ private XmlEscapeCharacterConverter() { super(); } private static Map<String, String> buildDictionary() { Map<String, String> dictionary = new HashMap<String, String>(); // Reserved characters dictionary.put("quot", "\""); // Quotation Mark dictionary.put("apos", "'"); // Apostrophe dictionary.put("amp", "&"); // Ampersand dictionary.put("lt", "<"); // Less Than Symbol dictionary.put("gt", ">"); // Greater Than Symbol // ISO-8859-1 symbols dictionary.put("nbsp", "\u00A0"); // Nonbreaking space dictionary.put("iexcl", "\u00A1"); // Inverted Exclamation Point dictionary.put("cent", "\u00A2"); // Cent Sign dictionary.put("pound", "\u00A3"); // Pound Sterling dictionary.put("curren", "\u00A4"); // General Currency Sign dictionary.put("yen", "\u00A5"); // Yen Sign dictionary.put("brvbar", "\u00A6"); // Broken Vertical Bar dictionary.put("sect", "\u00A7"); // Section Sign dictionary.put("uml", "\u00A8"); // Umlaut dictionary.put("copy", "\u00A9"); // Copyright dictionary.put("ordf", "\u00AA"); // Feminine Ordinal dictionary.put("laquo", "\u00AB"); // Left Angle Quote dictionary.put("not", "\u00AC"); // Not Sign dictionary.put("shy", "\u00AD"); // Soft Hyphen dictionary.put("reg", "\u00AE"); // Registered Trademark dictionary.put("macr", "\u00AF"); // Macron Accent dictionary.put("deg", "\u00B0"); // Degree Sign dictionary.put("plusmn", "\u00B1"); // Plus or Minus dictionary.put("sup2", "\u00B2"); // Superscript Two dictionary.put("sup3", "\u00B3"); // Superscript Three dictionary.put("acute", "\u00B4"); // Acute Accent dictionary.put("micro", "\u00B5"); // Micro Sign dictionary.put("para", "\u00B6"); // Paragraph Sign dictionary.put("middot", "\u00B7"); // Middle Dot dictionary.put("cedil", "\u00B8"); // Cedilla dictionary.put("sup1", "\u00B9"); // Superscript One dictionary.put("ordm", "\u00BA"); // Masculine Ordinal dictionary.put("raquo", "\u00BB"); // Right Angle Quote dictionary.put("frac14", "\u00BC"); // Fraction One-Forth dictionary.put("frac12", "\u00BD"); // Fraction One-Half dictionary.put("frac34", "\u00BE"); // Fraction Three-Fourths dictionary.put("iquest", "\u00BF"); // Inverted Question Mark dictionary.put("times", "\u00D7"); // Multiplication dictionary.put("divide", "\u00F7"); // Division // ISO-8859-1 characters dictionary.put("Agrave", "\u00C0"); // Latin capital letter A with grave accent dictionary.put("Aacute", "\u00C1"); // Latin capital letter A with acute accent dictionary.put("Acirc", "\u00C2"); // Latin capital letter A with circumflex dictionary.put("Atilde", "\u00C3"); // Latin capital letter A with tilde dictionary.put("Auml", "\u00C4"); // Latin capital letter A with diaeresis dictionary.put("Aring", "\u00C5"); // Latin capital letter A with ring above dictionary.put("AElig", "\u00C6"); // Latin capital letter AE dictionary.put("Ccedil", "\u00C7"); // Latin capital letter C with cedilla dictionary.put("Egrave", "\u00C8"); // Latin capital letter E with grave accent dictionary.put("Eacute", "\u00C9"); // Latin capital letter E with acute accent dictionary.put("Ecirc", "\u00CA"); // Latin capital letter E with circumflex dictionary.put("Euml", "\u00CB"); // Latin capital letter E with diaeresis dictionary.put("Igrave", "\u00CC"); // Latin capital letter I with grave accent dictionary.put("Iacute", "\u00CD"); // Latin capital letter I with acute accent dictionary.put("Icirc", "\u00CE"); // Latin capital letter I with circumflex dictionary.put("Iuml", "\u00CF"); // Latin capital letter I with diaeresis dictionary.put("ETH", "\u00D0"); // Latin capital letter Eth dictionary.put("Ntilde", "\u00D1"); // Latin capital letter N with tilde dictionary.put("Ograve", "\u00D2"); // Latin capital letter O with grave accent dictionary.put("Oacute", "\u00D3"); // Latin capital letter O with acute accent dictionary.put("Ocirc", "\u00D4"); // Latin capital letter O with circumflex dictionary.put("Otilde", "\u00D5"); // Latin capital letter O with tilde dictionary.put("Ouml", "\u00D6"); // Latin capital letter O with diaeresis dictionary.put("Oslash", "\u00D8"); // Latin capital letter O with stroke dictionary.put("Ugrave", "\u00D9"); // Latin capital letter U with grave accent dictionary.put("Uacute", "\u00DA"); // Latin capital letter U with acute accent dictionary.put("Ucirc", "\u00DB"); // Latin capital letter U with circumflex dictionary.put("Uuml", "\u00DC"); // Latin capital letter U with diaeresis dictionary.put("Yacute", "\u00DD"); // Latin capital letter Y with acute accent dictionary.put("THORN", "\u00DE"); // Latin capital letter THORN dictionary.put("szlig", "\u00DF"); // Latin small letter sharp s dictionary.put("agrave", "\u00E0"); // Latin small letter a with grave accent dictionary.put("aacute", "\u00E1"); // Latin small letter a with acute accent dictionary.put("acirc", "\u00E2"); // Latin small letter a with circumflex dictionary.put("atilde", "\u00E3"); // Latin small letter a with tilde dictionary.put("auml", "\u00E4"); // Latin small letter a with diaeresis dictionary.put("aring", "\u00E5"); // Latin small letter a with ring above dictionary.put("aelig", "\u00E6"); // Latin small letter ae dictionary.put("ccedil", "\u00E7"); // Latin small letter c with cedilla dictionary.put("egrave", "\u00E8"); // Latin small letter e with grave accent dictionary.put("eacute", "\u00E9"); // Latin small letter e with acute accent dictionary.put("ecirc", "\u00EA"); // Latin small letter e with circumflex dictionary.put("euml", "\u00EB"); // Latin small letter e with diaeresis dictionary.put("igrave", "\u00EC"); // Latin small letter i with grave accent dictionary.put("iacute", "\u00ED"); // Latin small letter i with acute accent dictionary.put("icirc", "\u00EE"); // Latin small letter i with circumflex dictionary.put("iuml", "\u00EF"); // Latin small letter i with diaeresis dictionary.put("eth", "\u00F0"); // Latin small letter eth dictionary.put("ntilde", "\u00F1"); // Latin small letter n with tilde dictionary.put("ograve", "\u00F2"); // Latin small letter o with grave accent dictionary.put("oacute", "\u00F3"); // Latin small letter o with acute accent dictionary.put("ocirc", "\u00F4"); // Latin small letter o with circumflex dictionary.put("otilde", "\u00F5"); // Latin small letter o with tilde dictionary.put("ouml", "\u00F6"); // Latin small letter o with diaeresis dictionary.put("oslash", "\u00F8"); // Latin small letter o with stroke dictionary.put("ugrave", "\u00F9"); // Latin small letter u with grave accent dictionary.put("uacute", "\u00FA"); // Latin small letter u with acute accent dictionary.put("ucirc", "\u00FB"); // Latin small letter u with circumflex dictionary.put("uuml", "\u00FC"); // Latin small letter u with diaeresis dictionary.put("yacute", "\u00FD"); // Latin small letter y with acute accent dictionary.put("thorn", "\u00FE"); // Latin small letter thorn dictionary.put("yuml", "\u00FF"); // Latin small letter y with diaeresis // Math Symbols dictionary.put("forall", "\u2200"); // For all dictionary.put("part", "\u2202"); // Partial differential dictionary.put("exist", "\u2203"); // There exists dictionary.put("empty", "\u2205"); // Empty set; Null Set; Diameter dictionary.put("nabla", "\u2207"); // Nabla; Backward difference dictionary.put("isin", "\u2208"); // Element of dictionary.put("notin", "\u2209"); // Not an element of dictionary.put("ni", "\u220B"); // Contains as member dictionary.put("prod", "\u220F"); // N-ary product; Product sign dictionary.put("sum", "\u2211"); // N-ary sumation dictionary.put("minus", "\u2212"); // Minus sign dictionary.put("lowast", "\u2217"); // Asterisk operator dictionary.put("radic", "\u221A"); // Square root; Radical sign dictionary.put("prop", "\u221D"); // Proportional to dictionary.put("infin", "\u221E"); // Infinity dictionary.put("ang", "\u2220"); // Angle dictionary.put("and", "\u2227"); // Logical and; Wedge dictionary.put("or", "\u2228"); // Logical or; Vee dictionary.put("cap", "\u2229"); // Intersection; Cap dictionary.put("cup", "\u222A"); // Union; Cup dictionary.put("int", "\u222B"); // Integral dictionary.put("there4", "\u2234"); // Therefore dictionary.put("sim", "\u223C"); // Tilde operator; Varies with; Similar to dictionary.put("cong", "\u2245"); // Approximately equal to dictionary.put("asymp", "\u2248"); // Almost equal to; Asymptotic to dictionary.put("ne", "\u2260"); // Not equal to dictionary.put("equiv", "\u2261"); // Identical to dictionary.put("le", "\u2264"); // Less-than or equal to dictionary.put("ge", "\u2265"); // Greater-than or equal to dictionary.put("sub", "\u2282"); // Subset of dictionary.put("sup", "\u2283"); // Superset of dictionary.put("nsub", "\u2284"); // Not a subset of dictionary.put("sube", "\u2286"); // Subset of or equal to dictionary.put("supe", "\u2287"); // Superset of or equal to dictionary.put("oplus", "\u2295"); // Circled plus; Direct sum dictionary.put("otimes", "\u2297"); // Circled times; Vector product dictionary.put("perp", "\u22A5"); // Up tack; Orthogonal to; Perpendicular dictionary.put("sdot", "\u22C5"); // Dot operator // Arrows dictionary.put("larr", "\u2190"); // Leftwards arrow dictionary.put("uarr", "\u2191"); // Upwards arrow dictionary.put("rarr", "\u2192"); // Rightwards arrow dictionary.put("darr", "\u2193"); // Downwards arrow dictionary.put("harr", "\u2194"); // Left right arrow dictionary.put("crarr", "\u21B5"); // Downwards arrow with corner leftwards; Carriage return symbol dictionary.put("lArr", "\u21D0"); // Leftwards double arrow dictionary.put("uArr", "\u21D1"); // Upwards double arrow dictionary.put("rArr", "\u21D2"); // Rightwards double arrow dictionary.put("dArr", "\u21D3"); // Downwards double arrow dictionary.put("hArr", "\u21D4"); // Left right double arrow // Greek Capital Letters dictionary.put("Alpha", "\u0391"); // Greek capital letter alpha dictionary.put("Beta", "\u0392"); // Greek capital letter beta dictionary.put("Gamma", "\u0393"); // Greek capital letter gamma dictionary.put("Delta", "\u0394"); // Greek capital letter delta dictionary.put("Epsilon", "\u0395"); // Greek capital letter epsilon dictionary.put("Zeta", "\u0396"); // Greek capital letter zeta dictionary.put("Eta", "\u0397"); // Greek capital letter eta dictionary.put("Theta", "\u0398"); // Greek capital letter theta dictionary.put("Iota", "\u0399"); // Greek capital letter iota dictionary.put("Kappa", "\u039A"); // Greek capital letter kappa dictionary.put("Lambda", "\u039B"); // Greek capital letter lambda dictionary.put("Mu", "\u039C"); // Greek capital letter mu dictionary.put("Nu", "\u039D"); // Greek capital letter nu dictionary.put("Xi", "\u039E"); // Greek capital letter xi dictionary.put("Omicron", "\u039F"); // Greek capital letter omicron dictionary.put("Pi", "\u03A0"); // Greek capital letter pi dictionary.put("Rho", "\u03A1"); // Greek capital letter rho dictionary.put("Sigma", "\u03A3"); // Greek capital letter sigma dictionary.put("Tau", "\u03A4"); // Greek capital letter tau dictionary.put("Upsilon", "\u03A5"); // Greek capital letter upsilon dictionary.put("Phi", "\u03A6"); // Greek capital letter phi dictionary.put("Chi", "\u03A7"); // Greek capital letter chi dictionary.put("Psi", "\u03A8"); // Greek capital letter psi dictionary.put("Omega", "\u03A9"); // Greek capital letter omega // Greek Small Letters dictionary.put("alpha", "\u03B1"); // Greek small letter alpha dictionary.put("beta", "\u03B2"); // Greek small letter beta dictionary.put("gamma", "\u03B3"); // Greek small letter gamma dictionary.put("delta", "\u03B4"); // Greek small letter delta dictionary.put("epsilon", "\u03B5"); // Greek small letter epsilon dictionary.put("zeta", "\u03B6"); // Greek small letter zeta dictionary.put("eta", "\u03B7"); // Greek small letter eta dictionary.put("theta", "\u03B8"); // Greek small letter theta dictionary.put("iota", "\u03B9"); // Greek small letter iota dictionary.put("kappa", "\u03BA"); // Greek small letter kappa dictionary.put("lambda", "\u03BB"); // Greek small letter lambda dictionary.put("mu", "\u03BC"); // Greek small letter mu dictionary.put("nu", "\u03BD"); // Greek small letter nu dictionary.put("xi", "\u03BE"); // Greek small letter xi dictionary.put("omicron", "\u03BF"); // Greek small letter omicron dictionary.put("pi", "\u03C0"); // Greek small letter pi dictionary.put("rho", "\u03C1"); // Greek small letter rho dictionary.put("sigmaf", "\u03C2"); // Greek small letter final sigma dictionary.put("sigma", "\u03C3"); // Greek small letter sigma dictionary.put("tau", "\u03C4"); // Greek small letter tau dictionary.put("upsilon", "\u03C5"); // Greek small letter upsilon dictionary.put("phi", "\u03C6"); // Greek small letter phi dictionary.put("chi", "\u03C7"); // Greek small letter chi dictionary.put("psi", "\u03C8"); // Greek small letter psi dictionary.put("omega", "\u03C9"); // Greek small letter omega dictionary.put("theta", "\u03D1"); // Greek small letter theta symbol dictionary.put("upsih", "\u03D2"); // Greek upsilon with hook symbol dictionary.put("piv", "\u03D6"); // Greek pi symbol // Latin Extended-A and Letterlike Symbols dictionary.put("OElig", "\u0152"); // Latin capital ligature oe dictionary.put("oelig", "\u0153"); // Latin small ligature oe dictionary.put("Scaron", "\u0160"); // Latin capital letter s with caron dictionary.put("scaron", "\u0161"); // Latin small letter s with caron dictionary.put("Yuml", "\u0178"); // Latin capital letter y with diaeresis dictionary.put("fnof", "\u0192"); // Latin small f with hook dictionary.put("weierp", "\u2118"); // Script capital P; Power set; Weierstrass p dictionary.put("image", "\u2111"); // Blackletter capital I; Imaginary part dictionary.put("real", "\u211C"); // Blackletter capital R; Real part symbol dictionary.put("trade", "\u2122"); // Trade mark sign dictionary.put("alefsym", "\u2135"); // Alef symbol; First transfinite cardinal // Miscellaneous Shapes dictionary.put("spades", "\u2660"); // Black spade suit dictionary.put("clubs", "\u2663"); // Black club suit; Shamrock dictionary.put("hearts", "\u2665"); // Black heart suit; Valentine dictionary.put("diams", "\u2666"); // Black diamond suit dictionary.put("loz", "\u25CA"); // Lozenge // Miscellaneous Technical Symbols dictionary.put("lceil", "\u2308"); // Left ceiling; Apl upstile dictionary.put("rceil", "\u2309"); // Right ceiling dictionary.put("lfloor", "\u230A"); // Left floor; Apl downstile dictionary.put("rfloor", "\u230B"); // Right floor dictionary.put("lang", "\u2329"); // Left-pointing angle bracket dictionary.put("rang", "\u232A"); // Right-pointing angle bracket // Spacing Modifier Characters and Bi-directional Characters dictionary.put("circ", "\u02C6"); // Modifier letter circumflex accent dictionary.put("tilde", "\u02DC"); // Small tilde dictionary.put("zwnj", "\u200C"); // Zero width non-joiner dictionary.put("zwj", "\u200D"); // Zero width joiner dictionary.put("lrm", "\u200E"); // Left-to-right mark dictionary.put("rlm", "\u200F"); // Right-to-left mark // General Punctuation Set 1 dictionary.put("bull", "\u2022"); // Bullet; Black small circle dictionary.put("hellip", "\u2026"); // Horizontal ellipsis; Three dot leader dictionary.put("prime", "\u2032"); // Prime; Minutes; Feet dictionary.put("Prime", "\u2033"); // Double prime; Seconds; Inches dictionary.put("oline", "\u203E"); // Overline; Spacing overscore dictionary.put("frasl", "\u2044"); // Fraction slash // General Punctuation Set 2 dictionary.put("ensp", "\u2002"); // En space dictionary.put("emsp", "\u2003"); // Em space dictionary.put("thinsp", "\u2009"); // Thin space dictionary.put("zwnj", "\u200C"); // Zero width non-joiner dictionary.put("zwj", "\u200D"); // Zero width joiner dictionary.put("lrm", "\u200E"); // Left-to-right mark dictionary.put("rlm", "\u200F"); // Right-to-left mark dictionary.put("ndash", "\u2013"); // En dash dictionary.put("mdash", "\u2014"); // Em dash dictionary.put("lsquo", "\u2018"); // Left single quotation mark dictionary.put("rsquo", "\u2019"); // Right single quotation mark dictionary.put("sbquo", "\u201A"); // Single low-9 quotation mark dictionary.put("ldquo", "\u201C"); // Left double quotation mark dictionary.put("rdquo", "\u201D"); // Right double quotation mark dictionary.put("bdquo", "\u201E"); // Double low-9 quotation mark dictionary.put("dagger", "\u2020"); // Dagger dictionary.put("Dagger", "\u2021"); // Double dagger dictionary.put("permil", "\u2030"); // Per mille sign dictionary.put("lsaquo", "\u2039"); // Single left-pointing angle quotation mark dictionary.put("rsaquo", "\u203A"); // Single right-pointing angle quotation mark dictionary.put("euro", "\u20AC"); // Euro return dictionary; } /** * Converts the characters that are reserved in an XML document the given string may have into * their corresponding references (escape characters) using the character entity reference. * * @param value A string that may contain characters that need to be escaped * @param positions This array of length one or two can be used to adjust the position of the * cursor or a text range within the string during the conversion of the reserved characters * @return The given string with any reserved characters converted into the escape characters */ public static String escape(String value, int[] positions) { if ((value == null) || (value.length() == 0)) { return value; } StringBuilder sb = new StringBuilder(value.length()); int startPosition = positions[0]; int endPosition = (positions.length > 1) ? positions[1] : -1; for (int index = 0, count = value.length(); index < count; index++) { char character = value.charAt(index); // The character is one of the reserved character if (isReserved(character)) { // Retrieve the corresponding entity name String name = getEscapeCharacter(character); sb.append(name); // Adjust the position if (startPosition > index) { // -1 for the character itself that is replaced by the entity name positions[0] += (name.length() - 1); } if ((endPosition > -1) && (index < endPosition)) { // -1 for the character itself that is replaced by the entity name positions[1] += (name.length() - 1); } } else { sb.append(character); } } return sb.toString(); } /** * Returns the Unicode character for the given reference (which is either a numeric character * reference or a character entity reference). * * @param reference The numeric character or character entity reference stripped of the leading * ampersand and trailing semi-colon * @return The Unicode character mapped to the given reference or <code>null</code> if the * reference is invalid or unknown */ public static String getCharacter(String reference) { if (reference == null) { return null; } int length = reference.length(); if (length == 0) { return null; } // Character reference if (reference.charAt(0) == '#') { if (length == 1) { return null; } // Parse the numeric value String value; int radix; // Hexadecimal if (reference.charAt(1) == 'x') { radix = 16; value = reference.substring(2); } // Decimal else { radix = 10; value = reference.substring(1); } // No minus accepted if ((value.length() == 0) || (value.charAt(0) == '-')) { return null; } // Convert the numeric value into the actual character char character = 0; try { character = (char) Integer.parseInt(value, radix); } catch (NumberFormatException ex) { // Simply ignore } // The null character � is not permitted if (character == 0) { return null; } return String.valueOf(character); } // Entity reference return dictionary.get(reference); } /** * Returns the escaped character for the given reserved character. * * @param character The reserved character to retrieve its escape character with the entity name * @return The escape character with the entity name of the given character if it is a reserved * character; otherwise returns <code>null</code> */ public static String getEscapeCharacter(char character) { switch (character) { case '<': return LESS_THAN_ENTITY_NAME; case '>': return GREATER_THAN_ENTITY_NAME; case '&': return AMPERSAND_ENTITY_NAME; case '\'': return APOSTROPHE_ENTITY_NAME; case '\"': return QUOTATION_MARK_NAME; default: return null; } } /** * Determines if the given character is one of the XML/HTML reserved characters. * * @param character The character to verify if it's one of the reserved characters * @return <code>true</code> if the given character is defined as a reserved characters; * <code>false</code> otherwise */ public static boolean isReserved(char character) { switch (character) { case '<': case '>': case '&': case '\'': case '\"': return true; default: return false; } } /** * Re-adjusts the given positions, which is based on the non-escaped version of the given * <em>query</em>, by making sure it is pointing at the same position within <em>query</em>, * which contains references (escape characters). * <p> * The escape characters are either the character entity references or the numeric character * references used in an XML document. * <p> * <b>Important:</b> The given query should contain the exact same amount of whitespace than the * query used to calculate the given positions. * * @param query The query that may contain escape characters * @param positions The position within the non-escaped version of the given query, which is * either a single element position or two positions that is used as a text range. After execution * contains the adjusted positions by moving it based on the difference between the escape and * non-escaped versions of the query * @since 2.5 */ public static void reposition(CharSequence query, int[] positions) { if ((query == null) || (query.length() == 0)) { return; } StringBuilder sb = new StringBuilder(query); for (int index = 0, count = sb.length(); index < count; index++) { char character = sb.charAt(index); // The beginning of the escape character if ((character == '&') && (index + 1 < count)) { // Find the ending of the escape character int semiColonIndex = sb.indexOf(";", index + 1); if (semiColonIndex > -1) { // Retrieve the reference value String reference = sb.substring(index + 1, semiColonIndex); if (reference.length() > 0) { // Retrieve the character mapped to the entity name String unicodeCharacter = XmlEscapeCharacterConverter.getCharacter(reference); if (unicodeCharacter != null) { // length = '&' + 'reference' + ';' - 'Unicode character' int length = (semiColonIndex - index); // Translate both positions because a Unicode // character is written with its escape character if (index < positions[0]) { positions[0] += length; positions[1] += length; } // Only translate the end position because the start // position is before the current index else if (index < positions[1]) { positions[1] += length; } index = semiColonIndex; } } } } } } /** * Converts the references (escape characters) the given string may have into their corresponding * Unicode characters. * * <ul> * <li>Character entity reference: <b>&copy;</b> for <b>©</b></li> * <li>Numeric character reference (decimal value): <b>&#169;</b> for <b>©</b></li> * <li>Numeric character reference (hexadecimal value): <b>&#xA9;</b> for <b>©</b></li> * </ul> * * @param value A string that may contain escape characters * @param position This array of length one can be used to adjust the position of the cursor * within the string during the conversion of the escape characters * @return The given string with any escape characters converted into the actual Unicode characters */ public static String unescape(String value, int[] position) { if ((value == null) || (value.length() == 0)) { return value; } StringBuilder sb = new StringBuilder(value); for (int index = 0, count = sb.length(); index < count; index++) { char character = sb.charAt(index); // The beginning of the escape character if ((character == '&') && (index + 1 < count)) { // Find the ending of the escape character int semiColonIndex = sb.indexOf(";", index + 1); if (semiColonIndex > -1) { // Retrieve the reference String reference = sb.substring(index + 1, semiColonIndex); if (reference.length() > 0) { // Retrieve the character mapped to the reference String specialCharacter = getCharacter(reference); if (specialCharacter != null) { // Replace the reference by the Unicode character sb.replace(index, semiColonIndex + 1, specialCharacter); // Make sure the count is updated count -= (semiColonIndex - index); // "& + reference + ; - Unicode character" int length = (1 + reference.length()); // Adjust the position // Case 1: The cursor is within the escape character, move it to the beginning if ((position[0] >= index) && (position[0] <= index + length)) { position[0] = index; } // Case 2: the cursor is after the escape character, just do an adjustment as // if it was a single character else if (position[0] > index + length) { position[0] -= length; } } } } } } return sb.toString(); } }