/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.template.soy.internal.base;
import com.google.common.collect.ImmutableMap;
/**
* Utilities for unescaping strings from context-specific formats.
*
*/
public class UnescapeUtils {
private UnescapeUtils() {}
/**
* Unescapes a Javascript string. Throws an IllegalArgumentException if the string contains bad
* escaping.
*/
public static String unescapeJs(String s) {
StringBuilder sb = new StringBuilder(s.length());
for (int i = 0; i < s.length(); ) {
char c = s.charAt(i);
if (c == '\\') {
i = unescapeJsHelper(s, i + 1, sb);
} else {
sb.append(c);
i++;
}
}
return sb.toString();
}
/**
* Looks for an escape code starting at index i of s, and appends it to sb.
*
* @return the index of the first character in s after the escape code.
* @throws IllegalArgumentException if the escape code is invalid
*/
private static int unescapeJsHelper(String s, int i, StringBuilder sb) {
if (i >= s.length()) {
throw new IllegalArgumentException("End-of-string after escape character in [" + s + "]");
}
char c = s.charAt(i++);
switch (c) {
case 'n':
sb.append('\n');
break;
case 'r':
sb.append('\r');
break;
case 't':
sb.append('\t');
break;
case 'b':
sb.append('\b');
break;
case 'f':
sb.append('\f');
break;
case '\\':
case '\"':
case '\'':
case '>':
sb.append(c);
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
--i; // backup to first octal digit
int nOctalDigits = 1;
int digitLimit = c < '4' ? 3 : 2;
while (nOctalDigits < digitLimit
&& i + nOctalDigits < s.length()
&& isOctal(s.charAt(i + nOctalDigits))) {
++nOctalDigits;
}
sb.append((char) Integer.parseInt(s.substring(i, i + nOctalDigits), 8));
i += nOctalDigits;
break;
case 'x':
case 'u':
String hexCode;
int nHexDigits = (c == 'u' ? 4 : 2);
try {
hexCode = s.substring(i, i + nHexDigits);
} catch (IndexOutOfBoundsException ioobe) {
throw new IllegalArgumentException(
"Invalid unicode sequence ["
+ s.substring(i)
+ "] at index "
+ i
+ " in ["
+ s
+ "]");
}
int unicodeValue;
try {
unicodeValue = Integer.parseInt(hexCode, 16);
} catch (NumberFormatException nfe) {
throw new IllegalArgumentException(
"Invalid unicode sequence [" + hexCode + "] at index " + i + " in [" + s + "]");
}
sb.append((char) unicodeValue);
i += nHexDigits;
break;
default:
throw new IllegalArgumentException(
"Unknown escape code [" + c + "] at index " + i + " in [" + s + "]");
}
return i;
}
private static boolean isOctal(char c) {
return (c >= '0') && (c <= '7');
}
/**
* Replace all the occurrences of HTML entities with the appropriate code-points.
*
* @param s HTML.
* @return Plain text.
*/
public static String unescapeHtml(String s) {
int amp = s.indexOf('&');
if (amp < 0) { // Fast path.
return s;
}
int n = s.length();
StringBuilder sb = new StringBuilder(n);
int pos = 0;
do {
// All numeric entities and all named entities can be represented in less than 12 chars, so
// avoid any O(n**2) problem on "&&&&&&&&&" by not looking for ; more than 12 chars out.
int end = -1;
int entityLimit = Math.min(n, amp + 12);
for (int i = amp + 1; i < entityLimit; ++i) {
if (s.charAt(i) == ';') {
end = i + 1;
break;
}
}
int cp = -1;
if (end == -1) {
cp = -1;
} else {
if (s.charAt(amp + 1) == '#') { // Decode a numeric entity
char ch = s.charAt(amp + 2);
try {
if (ch == 'x' || ch == 'X') { // hex
// & # x A B C D ;
// ^ ^ ^ ^ ^
// amp + 0 1 2 3 end - 1
cp = Integer.parseInt(s.substring(amp + 3, end - 1), 16);
} else { // decimal
// & # 1 6 0 ;
// ^ ^ ^ ^
// amp + 0 1 2 end - 1
cp = Integer.parseInt(s.substring(amp + 2, end - 1), 10);
}
} catch (NumberFormatException ex) {
cp = -1; // Malformed numeric entity
}
} else {
// & q u o t ;
// ^ ^
// amp end
Integer cpI = HTML_ENTITY_TO_CODEPOINT.get(s.substring(amp, end));
cp = cpI != null ? cpI.intValue() : -1;
}
}
if (cp == -1) { // Don't decode
end = amp + 1;
} else {
sb.append(s, pos, amp);
sb.appendCodePoint(cp);
pos = end;
}
amp = s.indexOf('&', end);
} while (amp >= 0);
return sb.append(s, pos, n).toString();
}
// Reverse of map used in com.google.common.html.HtmlEscapers.htmlCharEscaper()
private static final ImmutableMap<String, Integer> HTML_ENTITY_TO_CODEPOINT =
ImmutableMap.<String, Integer>builder()
.put(""", (int) '"')
.put("'", (int) '\'')
.put("&", (int) '&')
.put("<", (int) '<')
.put(">", (int) '>')
.put(" ", (int) '\u00A0')
.put("¡", (int) '\u00A1')
.put("¢", (int) '\u00A2')
.put("£", (int) '\u00A3')
.put("¤", (int) '\u00A4')
.put("¥", (int) '\u00A5')
.put("¦", (int) '\u00A6')
.put("§", (int) '\u00A7')
.put("¨", (int) '\u00A8')
.put("©", (int) '\u00A9')
.put("ª", (int) '\u00AA')
.put("«", (int) '\u00AB')
.put("¬", (int) '\u00AC')
.put("", (int) '\u00AD')
.put("®", (int) '\u00AE')
.put("¯", (int) '\u00AF')
.put("°", (int) '\u00B0')
.put("±", (int) '\u00B1')
.put("²", (int) '\u00B2')
.put("³", (int) '\u00B3')
.put("´", (int) '\u00B4')
.put("µ", (int) '\u00B5')
.put("¶", (int) '\u00B6')
.put("·", (int) '\u00B7')
.put("¸", (int) '\u00B8')
.put("¹", (int) '\u00B9')
.put("º", (int) '\u00BA')
.put("»", (int) '\u00BB')
.put("¼", (int) '\u00BC')
.put("½", (int) '\u00BD')
.put("¾", (int) '\u00BE')
.put("¿", (int) '\u00BF')
.put("À", (int) '\u00C0')
.put("Á", (int) '\u00C1')
.put("Â", (int) '\u00C2')
.put("Ã", (int) '\u00C3')
.put("Ä", (int) '\u00C4')
.put("Å", (int) '\u00C5')
.put("Æ", (int) '\u00C6')
.put("Ç", (int) '\u00C7')
.put("È", (int) '\u00C8')
.put("É", (int) '\u00C9')
.put("Ê", (int) '\u00CA')
.put("Ë", (int) '\u00CB')
.put("Ì", (int) '\u00CC')
.put("Í", (int) '\u00CD')
.put("Î", (int) '\u00CE')
.put("Ï", (int) '\u00CF')
.put("Ð", (int) '\u00D0')
.put("Ñ", (int) '\u00D1')
.put("Ò", (int) '\u00D2')
.put("Ó", (int) '\u00D3')
.put("Ô", (int) '\u00D4')
.put("Õ", (int) '\u00D5')
.put("Ö", (int) '\u00D6')
.put("×", (int) '\u00D7')
.put("Ø", (int) '\u00D8')
.put("Ù", (int) '\u00D9')
.put("Ú", (int) '\u00DA')
.put("Û", (int) '\u00DB')
.put("Ü", (int) '\u00DC')
.put("Ý", (int) '\u00DD')
.put("Þ", (int) '\u00DE')
.put("ß", (int) '\u00DF')
.put("à", (int) '\u00E0')
.put("á", (int) '\u00E1')
.put("â", (int) '\u00E2')
.put("ã", (int) '\u00E3')
.put("ä", (int) '\u00E4')
.put("å", (int) '\u00E5')
.put("æ", (int) '\u00E6')
.put("ç", (int) '\u00E7')
.put("è", (int) '\u00E8')
.put("é", (int) '\u00E9')
.put("ê", (int) '\u00EA')
.put("ë", (int) '\u00EB')
.put("ì", (int) '\u00EC')
.put("í", (int) '\u00ED')
.put("î", (int) '\u00EE')
.put("ï", (int) '\u00EF')
.put("ð", (int) '\u00F0')
.put("ñ", (int) '\u00F1')
.put("ò", (int) '\u00F2')
.put("ó", (int) '\u00F3')
.put("ô", (int) '\u00F4')
.put("õ", (int) '\u00F5')
.put("ö", (int) '\u00F6')
.put("÷", (int) '\u00F7')
.put("ø", (int) '\u00F8')
.put("ù", (int) '\u00F9')
.put("ú", (int) '\u00FA')
.put("û", (int) '\u00FB')
.put("ü", (int) '\u00FC')
.put("ý", (int) '\u00FD')
.put("þ", (int) '\u00FE')
.put("ÿ", (int) '\u00FF')
.put("Œ", (int) '\u0152')
.put("œ", (int) '\u0153')
.put("Š", (int) '\u0160')
.put("š", (int) '\u0161')
.put("Ÿ", (int) '\u0178')
.put("ƒ", (int) '\u0192')
.put("ˆ", (int) '\u02C6')
.put("˜", (int) '\u02DC')
.put("Α", (int) '\u0391')
.put("Β", (int) '\u0392')
.put("Γ", (int) '\u0393')
.put("Δ", (int) '\u0394')
.put("Ε", (int) '\u0395')
.put("Ζ", (int) '\u0396')
.put("Η", (int) '\u0397')
.put("Θ", (int) '\u0398')
.put("Ι", (int) '\u0399')
.put("Κ", (int) '\u039A')
.put("Λ", (int) '\u039B')
.put("Μ", (int) '\u039C')
.put("Ν", (int) '\u039D')
.put("Ξ", (int) '\u039E')
.put("Ο", (int) '\u039F')
.put("Π", (int) '\u03A0')
.put("Ρ", (int) '\u03A1')
.put("Σ", (int) '\u03A3')
.put("Τ", (int) '\u03A4')
.put("Υ", (int) '\u03A5')
.put("Φ", (int) '\u03A6')
.put("Χ", (int) '\u03A7')
.put("Ψ", (int) '\u03A8')
.put("Ω", (int) '\u03A9')
.put("α", (int) '\u03B1')
.put("β", (int) '\u03B2')
.put("γ", (int) '\u03B3')
.put("δ", (int) '\u03B4')
.put("ε", (int) '\u03B5')
.put("ζ", (int) '\u03B6')
.put("η", (int) '\u03B7')
.put("θ", (int) '\u03B8')
.put("ι", (int) '\u03B9')
.put("κ", (int) '\u03BA')
.put("λ", (int) '\u03BB')
.put("μ", (int) '\u03BC')
.put("ν", (int) '\u03BD')
.put("ξ", (int) '\u03BE')
.put("ο", (int) '\u03BF')
.put("π", (int) '\u03C0')
.put("ρ", (int) '\u03C1')
.put("ς", (int) '\u03C2')
.put("σ", (int) '\u03C3')
.put("τ", (int) '\u03C4')
.put("υ", (int) '\u03C5')
.put("φ", (int) '\u03C6')
.put("χ", (int) '\u03C7')
.put("ψ", (int) '\u03C8')
.put("ω", (int) '\u03C9')
.put("ϑ", (int) '\u03D1')
.put("ϒ", (int) '\u03D2')
.put("ϖ", (int) '\u03D6')
.put(" ", (int) '\u2002')
.put(" ", (int) '\u2003')
.put(" ", (int) '\u2009')
.put("", (int) '\u200C')
.put("", (int) '\u200D')
.put("", (int) '\u200E')
.put("", (int) '\u200F')
.put("–", (int) '\u2013')
.put("—", (int) '\u2014')
.put("‘", (int) '\u2018')
.put("’", (int) '\u2019')
.put("‚", (int) '\u201A')
.put("“", (int) '\u201C')
.put("”", (int) '\u201D')
.put("„", (int) '\u201E')
.put("†", (int) '\u2020')
.put("‡", (int) '\u2021')
.put("•", (int) '\u2022')
.put("…", (int) '\u2026')
.put("‰", (int) '\u2030')
.put("′", (int) '\u2032')
.put("″", (int) '\u2033')
.put("‹", (int) '\u2039')
.put("›", (int) '\u203A')
.put("‾", (int) '\u203E')
.put("⁄", (int) '\u2044')
.put("€", (int) '\u20AC')
.put("ℑ", (int) '\u2111')
.put("℘", (int) '\u2118')
.put("ℜ", (int) '\u211C')
.put("™", (int) '\u2122')
.put("ℵ", (int) '\u2135')
.put("←", (int) '\u2190')
.put("↑", (int) '\u2191')
.put("→", (int) '\u2192')
.put("↓", (int) '\u2193')
.put("↔", (int) '\u2194')
.put("↵", (int) '\u21B5')
.put("⇐", (int) '\u21D0')
.put("⇑", (int) '\u21D1')
.put("⇒", (int) '\u21D2')
.put("⇓", (int) '\u21D3')
.put("⇔", (int) '\u21D4')
.put("∀", (int) '\u2200')
.put("∂", (int) '\u2202')
.put("∃", (int) '\u2203')
.put("∅", (int) '\u2205')
.put("∇", (int) '\u2207')
.put("∈", (int) '\u2208')
.put("∉", (int) '\u2209')
.put("∋", (int) '\u220B')
.put("∏", (int) '\u220F')
.put("∑", (int) '\u2211')
.put("−", (int) '\u2212')
.put("∗", (int) '\u2217')
.put("√", (int) '\u221A')
.put("∝", (int) '\u221D')
.put("∞", (int) '\u221E')
.put("∠", (int) '\u2220')
.put("∧", (int) '\u2227')
.put("∨", (int) '\u2228')
.put("∩", (int) '\u2229')
.put("∪", (int) '\u222A')
.put("∫", (int) '\u222B')
.put("∴", (int) '\u2234')
.put("∼", (int) '\u223C')
.put("≅", (int) '\u2245')
.put("≈", (int) '\u2248')
.put("≠", (int) '\u2260')
.put("≡", (int) '\u2261')
.put("≤", (int) '\u2264')
.put("≥", (int) '\u2265')
.put("⊂", (int) '\u2282')
.put("⊃", (int) '\u2283')
.put("⊄", (int) '\u2284')
.put("⊆", (int) '\u2286')
.put("⊇", (int) '\u2287')
.put("⊕", (int) '\u2295')
.put("⊗", (int) '\u2297')
.put("⊥", (int) '\u22A5')
.put("⋅", (int) '\u22C5')
.put("⌈", (int) '\u2308')
.put("⌉", (int) '\u2309')
.put("⌊", (int) '\u230A')
.put("⌋", (int) '\u230B')
.put("〈", (int) '\u2329')
.put("〉", (int) '\u232A')
.put("◊", (int) '\u25CA')
.put("♠", (int) '\u2660')
.put("♣", (int) '\u2663')
.put("♥", (int) '\u2665')
.put("♦", (int) '\u2666')
.build();
}