/* * Copyright 2010 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.template.soy.internal.base; import com.google.common.collect.ImmutableMap; /** * Utilities for unescaping strings from context-specific formats. * */ public class UnescapeUtils { private UnescapeUtils() {} /** * Unescapes a Javascript string. Throws an IllegalArgumentException if the string contains bad * escaping. */ public static String unescapeJs(String s) { StringBuilder sb = new StringBuilder(s.length()); for (int i = 0; i < s.length(); ) { char c = s.charAt(i); if (c == '\\') { i = unescapeJsHelper(s, i + 1, sb); } else { sb.append(c); i++; } } return sb.toString(); } /** * Looks for an escape code starting at index i of s, and appends it to sb. * * @return the index of the first character in s after the escape code. * @throws IllegalArgumentException if the escape code is invalid */ private static int unescapeJsHelper(String s, int i, StringBuilder sb) { if (i >= s.length()) { throw new IllegalArgumentException("End-of-string after escape character in [" + s + "]"); } char c = s.charAt(i++); switch (c) { case 'n': sb.append('\n'); break; case 'r': sb.append('\r'); break; case 't': sb.append('\t'); break; case 'b': sb.append('\b'); break; case 'f': sb.append('\f'); break; case '\\': case '\"': case '\'': case '>': sb.append(c); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': --i; // backup to first octal digit int nOctalDigits = 1; int digitLimit = c < '4' ? 3 : 2; while (nOctalDigits < digitLimit && i + nOctalDigits < s.length() && isOctal(s.charAt(i + nOctalDigits))) { ++nOctalDigits; } sb.append((char) Integer.parseInt(s.substring(i, i + nOctalDigits), 8)); i += nOctalDigits; break; case 'x': case 'u': String hexCode; int nHexDigits = (c == 'u' ? 4 : 2); try { hexCode = s.substring(i, i + nHexDigits); } catch (IndexOutOfBoundsException ioobe) { throw new IllegalArgumentException( "Invalid unicode sequence [" + s.substring(i) + "] at index " + i + " in [" + s + "]"); } int unicodeValue; try { unicodeValue = Integer.parseInt(hexCode, 16); } catch (NumberFormatException nfe) { throw new IllegalArgumentException( "Invalid unicode sequence [" + hexCode + "] at index " + i + " in [" + s + "]"); } sb.append((char) unicodeValue); i += nHexDigits; break; default: throw new IllegalArgumentException( "Unknown escape code [" + c + "] at index " + i + " in [" + s + "]"); } return i; } private static boolean isOctal(char c) { return (c >= '0') && (c <= '7'); } /** * Replace all the occurrences of HTML entities with the appropriate code-points. * * @param s HTML. * @return Plain text. */ public static String unescapeHtml(String s) { int amp = s.indexOf('&'); if (amp < 0) { // Fast path. return s; } int n = s.length(); StringBuilder sb = new StringBuilder(n); int pos = 0; do { // All numeric entities and all named entities can be represented in less than 12 chars, so // avoid any O(n**2) problem on "&&&&&&&&&" by not looking for ; more than 12 chars out. int end = -1; int entityLimit = Math.min(n, amp + 12); for (int i = amp + 1; i < entityLimit; ++i) { if (s.charAt(i) == ';') { end = i + 1; break; } } int cp = -1; if (end == -1) { cp = -1; } else { if (s.charAt(amp + 1) == '#') { // Decode a numeric entity char ch = s.charAt(amp + 2); try { if (ch == 'x' || ch == 'X') { // hex // & # x A B C D ; // ^ ^ ^ ^ ^ // amp + 0 1 2 3 end - 1 cp = Integer.parseInt(s.substring(amp + 3, end - 1), 16); } else { // decimal // & # 1 6 0 ; // ^ ^ ^ ^ // amp + 0 1 2 end - 1 cp = Integer.parseInt(s.substring(amp + 2, end - 1), 10); } } catch (NumberFormatException ex) { cp = -1; // Malformed numeric entity } } else { // & q u o t ; // ^ ^ // amp end Integer cpI = HTML_ENTITY_TO_CODEPOINT.get(s.substring(amp, end)); cp = cpI != null ? cpI.intValue() : -1; } } if (cp == -1) { // Don't decode end = amp + 1; } else { sb.append(s, pos, amp); sb.appendCodePoint(cp); pos = end; } amp = s.indexOf('&', end); } while (amp >= 0); return sb.append(s, pos, n).toString(); } // Reverse of map used in com.google.common.html.HtmlEscapers.htmlCharEscaper() private static final ImmutableMap<String, Integer> HTML_ENTITY_TO_CODEPOINT = ImmutableMap.<String, Integer>builder() .put(""", (int) '"') .put("'", (int) '\'') .put("&", (int) '&') .put("<", (int) '<') .put(">", (int) '>') .put(" ", (int) '\u00A0') .put("¡", (int) '\u00A1') .put("¢", (int) '\u00A2') .put("£", (int) '\u00A3') .put("¤", (int) '\u00A4') .put("¥", (int) '\u00A5') .put("¦", (int) '\u00A6') .put("§", (int) '\u00A7') .put("¨", (int) '\u00A8') .put("©", (int) '\u00A9') .put("ª", (int) '\u00AA') .put("«", (int) '\u00AB') .put("¬", (int) '\u00AC') .put("­", (int) '\u00AD') .put("®", (int) '\u00AE') .put("¯", (int) '\u00AF') .put("°", (int) '\u00B0') .put("±", (int) '\u00B1') .put("²", (int) '\u00B2') .put("³", (int) '\u00B3') .put("´", (int) '\u00B4') .put("µ", (int) '\u00B5') .put("¶", (int) '\u00B6') .put("·", (int) '\u00B7') .put("¸", (int) '\u00B8') .put("¹", (int) '\u00B9') .put("º", (int) '\u00BA') .put("»", (int) '\u00BB') .put("¼", (int) '\u00BC') .put("½", (int) '\u00BD') .put("¾", (int) '\u00BE') .put("¿", (int) '\u00BF') .put("À", (int) '\u00C0') .put("Á", (int) '\u00C1') .put("Â", (int) '\u00C2') .put("Ã", (int) '\u00C3') .put("Ä", (int) '\u00C4') .put("Å", (int) '\u00C5') .put("Æ", (int) '\u00C6') .put("Ç", (int) '\u00C7') .put("È", (int) '\u00C8') .put("É", (int) '\u00C9') .put("Ê", (int) '\u00CA') .put("Ë", (int) '\u00CB') .put("Ì", (int) '\u00CC') .put("Í", (int) '\u00CD') .put("Î", (int) '\u00CE') .put("Ï", (int) '\u00CF') .put("Ð", (int) '\u00D0') .put("Ñ", (int) '\u00D1') .put("Ò", (int) '\u00D2') .put("Ó", (int) '\u00D3') .put("Ô", (int) '\u00D4') .put("Õ", (int) '\u00D5') .put("Ö", (int) '\u00D6') .put("×", (int) '\u00D7') .put("Ø", (int) '\u00D8') .put("Ù", (int) '\u00D9') .put("Ú", (int) '\u00DA') .put("Û", (int) '\u00DB') .put("Ü", (int) '\u00DC') .put("Ý", (int) '\u00DD') .put("Þ", (int) '\u00DE') .put("ß", (int) '\u00DF') .put("à", (int) '\u00E0') .put("á", (int) '\u00E1') .put("â", (int) '\u00E2') .put("ã", (int) '\u00E3') .put("ä", (int) '\u00E4') .put("å", (int) '\u00E5') .put("æ", (int) '\u00E6') .put("ç", (int) '\u00E7') .put("è", (int) '\u00E8') .put("é", (int) '\u00E9') .put("ê", (int) '\u00EA') .put("ë", (int) '\u00EB') .put("ì", (int) '\u00EC') .put("í", (int) '\u00ED') .put("î", (int) '\u00EE') .put("ï", (int) '\u00EF') .put("ð", (int) '\u00F0') .put("ñ", (int) '\u00F1') .put("ò", (int) '\u00F2') .put("ó", (int) '\u00F3') .put("ô", (int) '\u00F4') .put("õ", (int) '\u00F5') .put("ö", (int) '\u00F6') .put("÷", (int) '\u00F7') .put("ø", (int) '\u00F8') .put("ù", (int) '\u00F9') .put("ú", (int) '\u00FA') .put("û", (int) '\u00FB') .put("ü", (int) '\u00FC') .put("ý", (int) '\u00FD') .put("þ", (int) '\u00FE') .put("ÿ", (int) '\u00FF') .put("Œ", (int) '\u0152') .put("œ", (int) '\u0153') .put("Š", (int) '\u0160') .put("š", (int) '\u0161') .put("Ÿ", (int) '\u0178') .put("ƒ", (int) '\u0192') .put("ˆ", (int) '\u02C6') .put("˜", (int) '\u02DC') .put("Α", (int) '\u0391') .put("Β", (int) '\u0392') .put("Γ", (int) '\u0393') .put("Δ", (int) '\u0394') .put("Ε", (int) '\u0395') .put("Ζ", (int) '\u0396') .put("Η", (int) '\u0397') .put("Θ", (int) '\u0398') .put("Ι", (int) '\u0399') .put("Κ", (int) '\u039A') .put("Λ", (int) '\u039B') .put("Μ", (int) '\u039C') .put("Ν", (int) '\u039D') .put("Ξ", (int) '\u039E') .put("Ο", (int) '\u039F') .put("Π", (int) '\u03A0') .put("Ρ", (int) '\u03A1') .put("Σ", (int) '\u03A3') .put("Τ", (int) '\u03A4') .put("Υ", (int) '\u03A5') .put("Φ", (int) '\u03A6') .put("Χ", (int) '\u03A7') .put("Ψ", (int) '\u03A8') .put("Ω", (int) '\u03A9') .put("α", (int) '\u03B1') .put("β", (int) '\u03B2') .put("γ", (int) '\u03B3') .put("δ", (int) '\u03B4') .put("ε", (int) '\u03B5') .put("ζ", (int) '\u03B6') .put("η", (int) '\u03B7') .put("θ", (int) '\u03B8') .put("ι", (int) '\u03B9') .put("κ", (int) '\u03BA') .put("λ", (int) '\u03BB') .put("μ", (int) '\u03BC') .put("ν", (int) '\u03BD') .put("ξ", (int) '\u03BE') .put("ο", (int) '\u03BF') .put("π", (int) '\u03C0') .put("ρ", (int) '\u03C1') .put("ς", (int) '\u03C2') .put("σ", (int) '\u03C3') .put("τ", (int) '\u03C4') .put("υ", (int) '\u03C5') .put("φ", (int) '\u03C6') .put("χ", (int) '\u03C7') .put("ψ", (int) '\u03C8') .put("ω", (int) '\u03C9') .put("ϑ", (int) '\u03D1') .put("ϒ", (int) '\u03D2') .put("ϖ", (int) '\u03D6') .put(" ", (int) '\u2002') .put(" ", (int) '\u2003') .put(" ", (int) '\u2009') .put("‌", (int) '\u200C') .put("‍", (int) '\u200D') .put("‎", (int) '\u200E') .put("‏", (int) '\u200F') .put("–", (int) '\u2013') .put("—", (int) '\u2014') .put("‘", (int) '\u2018') .put("’", (int) '\u2019') .put("‚", (int) '\u201A') .put("“", (int) '\u201C') .put("”", (int) '\u201D') .put("„", (int) '\u201E') .put("†", (int) '\u2020') .put("‡", (int) '\u2021') .put("•", (int) '\u2022') .put("…", (int) '\u2026') .put("‰", (int) '\u2030') .put("′", (int) '\u2032') .put("″", (int) '\u2033') .put("‹", (int) '\u2039') .put("›", (int) '\u203A') .put("‾", (int) '\u203E') .put("⁄", (int) '\u2044') .put("€", (int) '\u20AC') .put("ℑ", (int) '\u2111') .put("℘", (int) '\u2118') .put("ℜ", (int) '\u211C') .put("™", (int) '\u2122') .put("ℵ", (int) '\u2135') .put("←", (int) '\u2190') .put("↑", (int) '\u2191') .put("→", (int) '\u2192') .put("↓", (int) '\u2193') .put("↔", (int) '\u2194') .put("↵", (int) '\u21B5') .put("⇐", (int) '\u21D0') .put("⇑", (int) '\u21D1') .put("⇒", (int) '\u21D2') .put("⇓", (int) '\u21D3') .put("⇔", (int) '\u21D4') .put("∀", (int) '\u2200') .put("∂", (int) '\u2202') .put("∃", (int) '\u2203') .put("∅", (int) '\u2205') .put("∇", (int) '\u2207') .put("∈", (int) '\u2208') .put("∉", (int) '\u2209') .put("∋", (int) '\u220B') .put("∏", (int) '\u220F') .put("∑", (int) '\u2211') .put("−", (int) '\u2212') .put("∗", (int) '\u2217') .put("√", (int) '\u221A') .put("∝", (int) '\u221D') .put("∞", (int) '\u221E') .put("∠", (int) '\u2220') .put("∧", (int) '\u2227') .put("∨", (int) '\u2228') .put("∩", (int) '\u2229') .put("∪", (int) '\u222A') .put("∫", (int) '\u222B') .put("∴", (int) '\u2234') .put("∼", (int) '\u223C') .put("≅", (int) '\u2245') .put("≈", (int) '\u2248') .put("≠", (int) '\u2260') .put("≡", (int) '\u2261') .put("≤", (int) '\u2264') .put("≥", (int) '\u2265') .put("⊂", (int) '\u2282') .put("⊃", (int) '\u2283') .put("⊄", (int) '\u2284') .put("⊆", (int) '\u2286') .put("⊇", (int) '\u2287') .put("⊕", (int) '\u2295') .put("⊗", (int) '\u2297') .put("⊥", (int) '\u22A5') .put("⋅", (int) '\u22C5') .put("⌈", (int) '\u2308') .put("⌉", (int) '\u2309') .put("⌊", (int) '\u230A') .put("⌋", (int) '\u230B') .put("⟨", (int) '\u2329') .put("⟩", (int) '\u232A') .put("◊", (int) '\u25CA') .put("♠", (int) '\u2660') .put("♣", (int) '\u2663') .put("♥", (int) '\u2665') .put("♦", (int) '\u2666') .build(); }