/* Copyright (c) 2008 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.gdata.util.common.base; import static com.google.gdata.util.common.base.Preconditions.checkNotNull; import java.io.IOException; /** * Utility functions for dealing with {@code CharEscaper}s, and some commonly * used {@code CharEscaper} instances. * * * */ public final class CharEscapers { private CharEscapers() {} // For each xxxEscaper method, please add links to external // reference pages that we consider authoritative for what // that escaper should exactly be doing. /** * Performs no escaping. */ private static final CharEscaper NULL_ESCAPER = new CharEscaper() { @Override public String escape(String string) { checkNotNull(string); return string; } @Override public Appendable escape(final Appendable out) { checkNotNull(out); // we can't simply return out because the CharEscaper contract says that // the returned Appendable will throw a NullPointerException if asked to // append null. return new Appendable() { public Appendable append(CharSequence csq) throws IOException { checkNotNull(csq); out.append(csq); return this; } public Appendable append(CharSequence csq, int start, int end) throws IOException { checkNotNull(csq); out.append(csq, start, end); return this; } public Appendable append(char c) throws IOException { out.append(c); return this; } }; } @Override protected char[] escape(char c) { return null; } }; /** * Returns a {@link CharEscaper} that does no escaping. */ public static CharEscaper nullEscaper() { return NULL_ESCAPER; } /** * Returns a {@link CharEscaper} instance that escapes special characters in a * string so it can safely be included in an XML document in either element * content or attribute values. * * <p><b>Note</b></p>: silently removes null-characters and control * characters, as there is no way to represent them in XML. */ public static CharEscaper xmlEscaper() { return XML_ESCAPER; } /** * Escapes special characters from a string so it can safely be included in an * XML document in either element content or attribute values. Also removes * null-characters and control characters, as there is no way to represent * them in XML. */ private static final CharEscaper XML_ESCAPER = newBasicXmlEscapeBuilder() .addEscape('"', """) .addEscape('\'', "'") .toEscaper(); /** * Returns a {@link CharEscaper} instance that escapes special characters in a * string so it can safely be included in an XML document in element content. * * <p><b>Note</b></p>: double and single quotes are not escaped, so it is not * safe to use this escaper to escape attribute values. Use the * {@link #xmlEscaper()} escaper to escape attribute values or if you are * unsure. Also silently removes non-whitespace control characters, as there * is no way to represent them in XML. */ public static CharEscaper xmlContentEscaper() { return XML_CONTENT_ESCAPER; } /** * Escapes special characters from a string so it can safely be included in an * XML document in element content. Note that quotes are <em>not</em> * escaped, so <em>this is not safe for use in attribute values</em>. Use * {@link #XML_ESCAPER} for attribute values, or if you are unsure. Also * removes non-whitespace control characters, as there is no way to represent * them in XML. */ private static final CharEscaper XML_CONTENT_ESCAPER = newBasicXmlEscapeBuilder().toEscaper(); /** * Returns a {@link CharEscaper} instance that escapes special characters in a * string so it can safely be included in an HTML document in either element * content or attribute values. * * <p><b>Note</b></p>: alters non-ASCII and control characters. * * The entity list was taken from: * <a href="http://www.w3.org/TR/html4/sgml/entities.html">here</a> */ public static CharEscaper htmlEscaper() { return HtmlEscaperHolder.HTML_ESCAPER; } /** * A lazy initialization holder for HTML_ESCAPER. */ private static class HtmlEscaperHolder { private static final CharEscaper HTML_ESCAPER = new HtmlCharEscaper(new CharEscaperBuilder() .addEscape('"', """) .addEscape('\'', "'") .addEscape('&', "&") .addEscape('<', "<") .addEscape('>', ">") .addEscape('\u00A0', " ") .addEscape('\u00A1', "¡") .addEscape('\u00A2', "¢") .addEscape('\u00A3', "£") .addEscape('\u00A4', "¤") .addEscape('\u00A5', "¥") .addEscape('\u00A6', "¦") .addEscape('\u00A7', "§") .addEscape('\u00A8', "¨") .addEscape('\u00A9', "©") .addEscape('\u00AA', "ª") .addEscape('\u00AB', "«") .addEscape('\u00AC', "¬") .addEscape('\u00AD', "­") .addEscape('\u00AE', "®") .addEscape('\u00AF', "¯") .addEscape('\u00B0', "°") .addEscape('\u00B1', "±") .addEscape('\u00B2', "²") .addEscape('\u00B3', "³") .addEscape('\u00B4', "´") .addEscape('\u00B5', "µ") .addEscape('\u00B6', "¶") .addEscape('\u00B7', "·") .addEscape('\u00B8', "¸") .addEscape('\u00B9', "¹") .addEscape('\u00BA', "º") .addEscape('\u00BB', "»") .addEscape('\u00BC', "¼") .addEscape('\u00BD', "½") .addEscape('\u00BE', "¾") .addEscape('\u00BF', "¿") .addEscape('\u00C0', "À") .addEscape('\u00C1', "Á") .addEscape('\u00C2', "Â") .addEscape('\u00C3', "Ã") .addEscape('\u00C4', "Ä") .addEscape('\u00C5', "Å") .addEscape('\u00C6', "Æ") .addEscape('\u00C7', "Ç") .addEscape('\u00C8', "È") .addEscape('\u00C9', "É") .addEscape('\u00CA', "Ê") .addEscape('\u00CB', "Ë") .addEscape('\u00CC', "Ì") .addEscape('\u00CD', "Í") .addEscape('\u00CE', "Î") .addEscape('\u00CF', "Ï") .addEscape('\u00D0', "Ð") .addEscape('\u00D1', "Ñ") .addEscape('\u00D2', "Ò") .addEscape('\u00D3', "Ó") .addEscape('\u00D4', "Ô") .addEscape('\u00D5', "Õ") .addEscape('\u00D6', "Ö") .addEscape('\u00D7', "×") .addEscape('\u00D8', "Ø") .addEscape('\u00D9', "Ù") .addEscape('\u00DA', "Ú") .addEscape('\u00DB', "Û") .addEscape('\u00DC', "Ü") .addEscape('\u00DD', "Ý") .addEscape('\u00DE', "Þ") .addEscape('\u00DF', "ß") .addEscape('\u00E0', "à") .addEscape('\u00E1', "á") .addEscape('\u00E2', "â") .addEscape('\u00E3', "ã") .addEscape('\u00E4', "ä") .addEscape('\u00E5', "å") .addEscape('\u00E6', "æ") .addEscape('\u00E7', "ç") .addEscape('\u00E8', "è") .addEscape('\u00E9', "é") .addEscape('\u00EA', "ê") .addEscape('\u00EB', "ë") .addEscape('\u00EC', "ì") .addEscape('\u00ED', "í") .addEscape('\u00EE', "î") .addEscape('\u00EF', "ï") .addEscape('\u00F0', "ð") .addEscape('\u00F1', "ñ") .addEscape('\u00F2', "ò") .addEscape('\u00F3', "ó") .addEscape('\u00F4', "ô") .addEscape('\u00F5', "õ") .addEscape('\u00F6', "ö") .addEscape('\u00F7', "÷") .addEscape('\u00F8', "ø") .addEscape('\u00F9', "ù") .addEscape('\u00FA', "ú") .addEscape('\u00FB', "û") .addEscape('\u00FC', "ü") .addEscape('\u00FD', "ý") .addEscape('\u00FE', "þ") .addEscape('\u00FF', "ÿ") .addEscape('\u0152', "Œ") .addEscape('\u0153', "œ") .addEscape('\u0160', "Š") .addEscape('\u0161', "š") .addEscape('\u0178', "Ÿ") .addEscape('\u0192', "ƒ") .addEscape('\u02C6', "ˆ") .addEscape('\u02DC', "˜") .addEscape('\u0391', "Α") .addEscape('\u0392', "Β") .addEscape('\u0393', "Γ") .addEscape('\u0394', "Δ") .addEscape('\u0395', "Ε") .addEscape('\u0396', "Ζ") .addEscape('\u0397', "Η") .addEscape('\u0398', "Θ") .addEscape('\u0399', "Ι") .addEscape('\u039A', "Κ") .addEscape('\u039B', "Λ") .addEscape('\u039C', "Μ") .addEscape('\u039D', "Ν") .addEscape('\u039E', "Ξ") .addEscape('\u039F', "Ο") .addEscape('\u03A0', "Π") .addEscape('\u03A1', "Ρ") .addEscape('\u03A3', "Σ") .addEscape('\u03A4', "Τ") .addEscape('\u03A5', "Υ") .addEscape('\u03A6', "Φ") .addEscape('\u03A7', "Χ") .addEscape('\u03A8', "Ψ") .addEscape('\u03A9', "Ω") .addEscape('\u03B1', "α") .addEscape('\u03B2', "β") .addEscape('\u03B3', "γ") .addEscape('\u03B4', "δ") .addEscape('\u03B5', "ε") .addEscape('\u03B6', "ζ") .addEscape('\u03B7', "η") .addEscape('\u03B8', "θ") .addEscape('\u03B9', "ι") .addEscape('\u03BA', "κ") .addEscape('\u03BB', "λ") .addEscape('\u03BC', "μ") .addEscape('\u03BD', "ν") .addEscape('\u03BE', "ξ") .addEscape('\u03BF', "ο") .addEscape('\u03C0', "π") .addEscape('\u03C1', "ρ") .addEscape('\u03C2', "ς") .addEscape('\u03C3', "σ") .addEscape('\u03C4', "τ") .addEscape('\u03C5', "υ") .addEscape('\u03C6', "φ") .addEscape('\u03C7', "χ") .addEscape('\u03C8', "ψ") .addEscape('\u03C9', "ω") .addEscape('\u03D1', "ϑ") .addEscape('\u03D2', "ϒ") .addEscape('\u03D6', "ϖ") .addEscape('\u2002', " ") .addEscape('\u2003', " ") .addEscape('\u2009', " ") .addEscape('\u200C', "‌") .addEscape('\u200D', "‍") .addEscape('\u200E', "‎") .addEscape('\u200F', "‏") .addEscape('\u2013', "–") .addEscape('\u2014', "—") .addEscape('\u2018', "‘") .addEscape('\u2019', "’") .addEscape('\u201A', "‚") .addEscape('\u201C', "“") .addEscape('\u201D', "”") .addEscape('\u201E', "„") .addEscape('\u2020', "†") .addEscape('\u2021', "‡") .addEscape('\u2022', "•") .addEscape('\u2026', "…") .addEscape('\u2030', "‰") .addEscape('\u2032', "′") .addEscape('\u2033', "″") .addEscape('\u2039', "‹") .addEscape('\u203A', "›") .addEscape('\u203E', "‾") .addEscape('\u2044', "⁄") .addEscape('\u20AC', "€") .addEscape('\u2111', "ℑ") .addEscape('\u2118', "℘") .addEscape('\u211C', "ℜ") .addEscape('\u2122', "™") .addEscape('\u2135', "ℵ") .addEscape('\u2190', "←") .addEscape('\u2191', "↑") .addEscape('\u2192', "→") .addEscape('\u2193', "↓") .addEscape('\u2194', "↔") .addEscape('\u21B5', "↵") .addEscape('\u21D0', "⇐") .addEscape('\u21D1', "⇑") .addEscape('\u21D2', "⇒") .addEscape('\u21D3', "⇓") .addEscape('\u21D4', "⇔") .addEscape('\u2200', "∀") .addEscape('\u2202', "∂") .addEscape('\u2203', "∃") .addEscape('\u2205', "∅") .addEscape('\u2207', "∇") .addEscape('\u2208', "∈") .addEscape('\u2209', "∉") .addEscape('\u220B', "∋") .addEscape('\u220F', "∏") .addEscape('\u2211', "∑") .addEscape('\u2212', "−") .addEscape('\u2217', "∗") .addEscape('\u221A', "√") .addEscape('\u221D', "∝") .addEscape('\u221E', "∞") .addEscape('\u2220', "∠") .addEscape('\u2227', "∧") .addEscape('\u2228', "∨") .addEscape('\u2229', "∩") .addEscape('\u222A', "∪") .addEscape('\u222B', "∫") .addEscape('\u2234', "∴") .addEscape('\u223C', "∼") .addEscape('\u2245', "≅") .addEscape('\u2248', "≈") .addEscape('\u2260', "≠") .addEscape('\u2261', "≡") .addEscape('\u2264', "≤") .addEscape('\u2265', "≥") .addEscape('\u2282', "⊂") .addEscape('\u2283', "⊃") .addEscape('\u2284', "⊄") .addEscape('\u2286', "⊆") .addEscape('\u2287', "⊇") .addEscape('\u2295', "⊕") .addEscape('\u2297', "⊗") .addEscape('\u22A5', "⊥") .addEscape('\u22C5', "⋅") .addEscape('\u2308', "⌈") .addEscape('\u2309', "⌉") .addEscape('\u230A', "⌊") .addEscape('\u230B', "⌋") .addEscape('\u2329', "⟨") .addEscape('\u232A', "⟩") .addEscape('\u25CA', "◊") .addEscape('\u2660', "♠") .addEscape('\u2663', "♣") .addEscape('\u2665', "♥") .addEscape('\u2666', "♦") .toArray()); } /** * Returns a {@link CharEscaper} instance that escapes special characters in a * string so it can safely be included in an HTML document in either element * content or attribute values. * * <p><b>Note</b></p>: does not alter non-ASCII and control characters. */ public static CharEscaper asciiHtmlEscaper() { return ASCII_HTML_ESCAPER; } /** * Escapes special characters from a string so it can safely be included in an * HTML document in either element content or attribute values. Does * <em>not</em> alter non-ASCII characters or control characters. */ private static final CharEscaper ASCII_HTML_ESCAPER = new CharEscaperBuilder() .addEscape('"', """) .addEscape('\'', "'") .addEscape('&', "&") .addEscape('<', "<") .addEscape('>', ">") .toEscaper(); /** * Returns an {@link Escaper} instance that escapes Java chars so they can be * safely included in URIs. For details on escaping URIs, see section 2.4 of * <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>. * * <p>When encoding a String, the following rules apply: * <ul> * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" * through "9" remain the same. * <li>The special characters ".", "-", "*", and "_" remain the same. * <li>The space character " " is converted into a plus sign "+". * <li>All other characters are converted into one or more bytes using UTF-8 * encoding and each byte is then represented by the 3-character string * "%XY", where "XY" is the two-digit, uppercase, hexadecimal * representation of the byte value. * <ul> * * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase * hexidecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt"> * RFC 3986</a>:<br> * <i>"URI producers and normalizers should use uppercase hexadecimal digits * for all percent-encodings."</i> * * <p>This escaper has identical behavior to (but is potentially much faster * than): * <ul> * <li>{@link com.google.gdata.util.httputil.FastURLEncoder#encode(String)} * <li>{@link com.google.gdata.util.httputil.FastURLEncoder#encode(String,String)} * with the encoding name "UTF-8" * <li>{@link com.google.gdata.util.common.net.UriEncoder#encode(String)} * <li>{@link com.google.gdata.util.common.net.UriEncoder#encode(String,java.nio.charset.Charset)} * with the UTF_8 Charset * <li>{@link java.net.URLEncoder#encode(String, String)} * with the encoding name "UTF-8" * </ul> * * <p>This method is equivalent to {@code uriEscaper(true)}. */ public static Escaper uriEscaper() { return uriEscaper(true); } /** * Returns an {@link Escaper} instance that escapes Java chars so they can be * safely included in URI path segments. For details on escaping URIs, see * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>. * * <p>When encoding a String, the following rules apply: * <ul> * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" * through "9" remain the same. * <li>The unreserved characters ".", "-", "~", and "_" remain the same. * <li>The general delimiters "@" and ":" remain the same. * <li>The subdelimiters "!", "$", "&", "'", "(", ")", "*", ",", ";", * and "=" remain the same. * <li>The space character " " is converted into %20. * <li>All other characters are converted into one or more bytes using UTF-8 * encoding and each byte is then represented by the 3-character string * "%XY", where "XY" is the two-digit, uppercase, hexadecimal * representation of the byte value. * </ul> * * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase * hexidecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt"> * RFC 3986</a>:<br> * <i>"URI producers and normalizers should use uppercase hexadecimal digits * for all percent-encodings."</i> */ public static Escaper uriPathEscaper() { return URI_PATH_ESCAPER; } /** * Returns an {@link Escaper} instance that escapes Java chars so they can be * safely included in URI query string segments. When the query string * consists of a sequence of name=value pairs separated by &, the names * and values should be individually encoded. If you escape an entire query * string in one pass with this escaper, then the "=" and "&" characters * used as separators will also be escaped. * * <p>This escaper is also suitable for escaping fragment identifiers. * * <p>For details on escaping URIs, see * section 2.4 of <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>. * * <p>When encoding a String, the following rules apply: * <ul> * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" * through "9" remain the same. * <li>The unreserved characters ".", "-", "~", and "_" remain the same. * <li>The general delimiters "@" and ":" remain the same. * <li>The path delimiters "/" and "?" remain the same. * <li>The subdelimiters "!", "$", "'", "(", ")", "*", ",", and ";", * remain the same. * <li>The space character " " is converted into %20. * <li>The equals sign "=" is converted into %3D. * <li>The ampersand "&" is converted into %26. * <li>All other characters are converted into one or more bytes using UTF-8 * encoding and each byte is then represented by the 3-character string * "%XY", where "XY" is the two-digit, uppercase, hexadecimal * representation of the byte value. * </ul> * * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase * hexidecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt"> * RFC 3986</a>:<br> * <i>"URI producers and normalizers should use uppercase hexadecimal digits * for all percent-encodings."</i> */ public static Escaper uriQueryStringEscaper() { return URI_QUERY_STRING_ESCAPER; } /** * Returns a {@link Escaper} instance that escapes Java characters so they can * be safely included in URIs. For details on escaping URIs, see section 2.4 * of <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>. * * <p>When encoding a String, the following rules apply: * <ul> * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" * through "9" remain the same. * <li>The special characters ".", "-", "*", and "_" remain the same. * <li>If {@code plusForSpace} was specified, the space character " " is * converted into a plus sign "+". Otherwise it is converted into "%20". * <li>All other characters are converted into one or more bytes using UTF-8 * encoding and each byte is then represented by the 3-character string * "%XY", where "XY" is the two-digit, uppercase, hexadecimal * representation of the byte value. * </ul> * * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase * hexidecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt"> * RFC 3986</a>:<br> * <i>"URI producers and normalizers should use uppercase hexadecimal digits * for all percent-encodings."</i> * * @param plusForSpace if {@code true} space is escaped to {@code +} otherwise * it is escaped to {@code %20}. Although common, the escaping of * spaces as plus signs has a very ambiguous status in the relevant * specifications. You should prefer {@code %20} unless you are doing * exact character-by-character comparisons of URLs and backwards * compatibility requires you to use plus signs. * * @see #uriEscaper() */ public static Escaper uriEscaper(boolean plusForSpace) { return plusForSpace ? URI_ESCAPER : URI_ESCAPER_NO_PLUS; } private static final Escaper URI_ESCAPER = new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, true); private static final Escaper URI_ESCAPER_NO_PLUS = new PercentEscaper(PercentEscaper.SAFECHARS_URLENCODER, false); private static final Escaper URI_PATH_ESCAPER = new PercentEscaper(PercentEscaper.SAFEPATHCHARS_URLENCODER, false); private static final Escaper URI_QUERY_STRING_ESCAPER = new PercentEscaper(PercentEscaper.SAFEQUERYSTRINGCHARS_URLENCODER, false); /** * Returns a {@link Escaper} instance that escapes Java characters in a manner * compatible with the C++ webutil/url URL class (the {@code kGoogle1Escape} * set). * * <p>When encoding a String, the following rules apply: * <ul> * <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" * through "9" remain the same. * <li>The special characters "!", "(", ")", "*", "-", ".", "_", "~", ",", "/" * and ":" remain the same. * <li>The space character " " is converted into a plus sign "+". * <li>All other characters are converted into one or more bytes using UTF-8 * encoding and each byte is then represented by the 3-character string * "%XY", where "XY" is the two-digit, uppercase, hexadecimal * representation of the byte value. * </ul> * * <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase * hexidecimal sequences. From <a href="http://www.ietf.org/rfc/rfc3986.txt"> * RFC 3986</a>:<br> * <i>"URI producers and normalizers should use uppercase hexadecimal digits * for all percent-encodings."</i> * * <p><b>Note</b>: This escaper is a special case and is <em>not * compliant</em> with <a href="http://www.ietf.org/rfc/rfc2396.txt"> * RFC 2396</a>. Specifically it will not escape "/", ":" and ",". This is * only provided for certain limited use cases and you should favor using * {@link #uriEscaper()} whenever possible. */ public static Escaper cppUriEscaper() { return CPP_URI_ESCAPER; } // Based on comments from FastURLEncoder: // These octets mimic the ones escaped by the C++ webutil/url URL class -- // the kGoogle1Escape set. // To produce the same escaping as C++, use this set with the plusForSpace // option. // WARNING: Contrary to RFC 2396 ",", "/" and ":" are listed as safe here. private static final Escaper CPP_URI_ESCAPER = new PercentEscaper("!()*-._~,/:", true); /** * Returns a {@link CharEscaper} instance that escapes special characters in a * string so it can safely be included in a Java string literal. * * <p><b>Note</b></p>: does not escape single quotes, so use the escaper * returned by {@link #javaCharEscaper()} if you are generating char * literals or if you are unsure. */ public static CharEscaper javaStringEscaper() { return JAVA_STRING_ESCAPER; } /** * Escapes special characters from a string so it can safely be included in a * Java string literal. Does <em>not</em> escape single-quotes, so use * JAVA_CHAR_ESCAPE if you are generating char literals, or if you are unsure. * * <p>Note that non-ASCII characters will be octal or Unicode escaped. */ private static final CharEscaper JAVA_STRING_ESCAPER = new JavaCharEscaper(new CharEscaperBuilder() .addEscape('\b', "\\b") .addEscape('\f', "\\f") .addEscape('\n', "\\n") .addEscape('\r', "\\r") .addEscape('\t', "\\t") .addEscape('\"', "\\\"") .addEscape('\\', "\\\\") .toArray()); /** * Returns a {@link CharEscaper} instance that escapes special characters in a * string so it can safely be included in a Java char or string literal. The * behavior of this escaper is the same as that of the * {@link #javaStringEscaper()}, except it also escapes single quotes. */ public static CharEscaper javaCharEscaper() { return JAVA_CHAR_ESCAPER; } /** * Escapes special characters from a string so it can safely be included in a * Java char literal or string literal. * * <p>Note that non-ASCII characters will be octal or Unicode escaped. * * <p>This is the same as {@link #JAVA_STRING_ESCAPER}, except that it escapes * single quotes. */ private static final CharEscaper JAVA_CHAR_ESCAPER = new JavaCharEscaper(new CharEscaperBuilder() .addEscape('\b', "\\b") .addEscape('\f', "\\f") .addEscape('\n', "\\n") .addEscape('\r', "\\r") .addEscape('\t', "\\t") .addEscape('\'', "\\'") .addEscape('\"', "\\\"") .addEscape('\\', "\\\\") .toArray()); /** * Returns a {@link CharEscaper} instance that replaces non-ASCII characters * in a string with their Unicode escape sequences ({@code \\uxxxx} where * {@code xxxx} is a hex number). Existing escape sequences won't be affected. */ public static CharEscaper javaStringUnicodeEscaper() { return JAVA_STRING_UNICODE_ESCAPER; } /** * Escapes each non-ASCII character in with its Unicode escape sequence * {@code \\uxxxx} where {@code xxxx} is a hex number. Existing escape * sequences won't be affected. */ private static final CharEscaper JAVA_STRING_UNICODE_ESCAPER = new CharEscaper() { @Override protected char[] escape(char c) { if (c <= 127) { return null; } char[] r = new char[6]; r[5] = HEX_DIGITS[c & 15]; c >>>= 4; r[4] = HEX_DIGITS[c & 15]; c >>>= 4; r[3] = HEX_DIGITS[c & 15]; c >>>= 4; r[2] = HEX_DIGITS[c & 15]; r[1] = 'u'; r[0] = '\\'; return r; } }; /** * Returns a {@link CharEscaper} instance that escapes special characters from * a string so it can safely be included in a Python string literal. Does not * have any special handling for non-ASCII characters. */ public static CharEscaper pythonEscaper() { return PYTHON_ESCAPER; } /** * Escapes special characters in a string so it can safely be included in a * Python string literal. Does not have any special handling for non-ASCII * characters. */ private static final CharEscaper PYTHON_ESCAPER = new CharEscaperBuilder() .addEscape('\n', "\\n") .addEscape('\r', "\\r") .addEscape('\t', "\\t") .addEscape('\\', "\\\\") .addEscape('\"', "\\\"") .addEscape('\'', "\\\'") .toEscaper(); /** * Returns a {@link CharEscaper} instance that escapes non-ASCII characters in * a string so it can safely be included in a Javascript string literal. * Non-ASCII characters are replaced with their ASCII javascript escape * sequences (e.g., \\uhhhh or \xhh). */ public static CharEscaper javascriptEscaper() { return JAVASCRIPT_ESCAPER; } /** * {@code CharEscaper} to escape javascript strings. Turns all non-ASCII * characters into ASCII javascript escape sequences (e.g., \\uhhhh or \xhh). */ private static final CharEscaper JAVASCRIPT_ESCAPER = new JavascriptCharEscaper(new CharEscaperBuilder() .addEscape('\'', "\\x27") .addEscape('"', "\\x22") .addEscape('<', "\\x3c") .addEscape('=', "\\x3d") .addEscape('>', "\\x3e") .addEscape('&', "\\x26") .addEscape('\b', "\\b") .addEscape('\t', "\\t") .addEscape('\n', "\\n") .addEscape('\f', "\\f") .addEscape('\r', "\\r") .addEscape('\\', "\\\\") .toArray()); private static CharEscaperBuilder newBasicXmlEscapeBuilder() { return new CharEscaperBuilder() .addEscape('&', "&") .addEscape('<', "<") .addEscape('>', ">") .addEscapes(new char[] { '\000', '\001', '\002', '\003', '\004', '\005', '\006', '\007', '\010', '\013', '\014', '\016', '\017', '\020', '\021', '\022', '\023', '\024', '\025', '\026', '\027', '\030', '\031', '\032', '\033', '\034', '\035', '\036', '\037'}, ""); } /** * Returns a composite {@link CharEscaper} instance that tries to escape * characters using a primary {@code CharEscaper} first and falls back to a * secondary one if there is no escaping. * * <p>The returned escaper will attempt to escape each character using the * primary escaper, and if the primary escaper has no escaping for that * character, it will use the secondary escaper. If the secondary escaper has * no escaping for a character either, the original character will be used. * If the primary escaper has an escape for a character, the secondary escaper * will not be used at all for that character; the escaped output of the * primary is not run through the secondary. For a case where you would like * to first escape with one escaper, and then with another, it is recommended * that you call each escaper in order. * * @param primary The primary {@code CharEscaper} to use * @param secondary The secondary {@code CharEscaper} to use if the first one * has no escaping rule for a character * @throws NullPointerException if any of the arguments is null */ public static CharEscaper fallThrough(CharEscaper primary, CharEscaper secondary) { checkNotNull(primary); checkNotNull(secondary); return new FallThroughCharEscaper(primary, secondary); } /** * A fast {@link CharEscaper} that uses an array of replacement characters and * a range of safe characters. It overrides {@link #escape(String)} to improve * performance. Rough benchmarking shows that this almost doubles the speed * when processing strings that do not require escaping (providing the escape * test itself is efficient). */ private static abstract class FastCharEscaper extends CharEscaper { protected final char[][] replacements; protected final int replacementLength; protected final char safeMin; protected final char safeMax; public FastCharEscaper(char[][] replacements, char safeMin, char safeMax) { this.replacements = replacements; this.replacementLength = replacements.length; this.safeMin = safeMin; this.safeMax = safeMax; } /** Overridden for performance (see {@link FastCharEscaper}). */ @Override public String escape(String s) { int slen = s.length(); for (int index = 0; index < slen; index++) { char c = s.charAt(index); if ((c < replacementLength && replacements[c] != null) || c < safeMin || c > safeMax) { return escapeSlow(s, index); } } return s; } } /** * Escaper for Java character escaping, contains both an array and a * backup function. We're not overriding the array decorator because we * want to keep this as fast as possible, so no calls to super.escape first. */ private static class JavaCharEscaper extends FastCharEscaper { public JavaCharEscaper(char[][] replacements) { super(replacements, ' ', '~'); } @Override protected char[] escape(char c) { // First check if our array has a valid escaping. if (c < replacementLength) { char[] r = replacements[c]; if (r != null) { return r; } } // This range is un-escaped. if (safeMin <= c && c <= safeMax) { return null; } if (c <= 0xFF) { // Convert c to an octal-escaped string. // Equivalent to String.format("\\%03o", (int)c); char[] r = new char[4]; r[0] = '\\'; r[3] = HEX_DIGITS[c & 7]; c >>>= 3; r[2] = HEX_DIGITS[c & 7]; c >>>= 3; r[1] = HEX_DIGITS[c & 7]; return r; } // Convert c to a hex-escaped string. // Equivalent to String.format("\\u%04x", (int)c); char[] r = new char[6]; r[0] = '\\'; r[1] = 'u'; r[5] = HEX_DIGITS[c & 15]; c >>>= 4; r[4] = HEX_DIGITS[c & 15]; c >>>= 4; r[3] = HEX_DIGITS[c & 15]; c >>>= 4; r[2] = HEX_DIGITS[c & 15]; return r; } } /** * Escaper for javascript character escaping, contains both an array and a * backup function. We're not overriding the array decorator because we * want to keep this as fast as possible, so no calls to super.escape first. */ private static class JavascriptCharEscaper extends FastCharEscaper { public JavascriptCharEscaper(char[][] replacements) { super(replacements, ' ', '~'); } @Override protected char[] escape(char c) { // First check if our array has a valid escaping. if (c < replacementLength) { char[] r = replacements[c]; if (r != null) { return r; } } // This range is unescaped. if (safeMin <= c && c <= safeMax) { return null; } // we can do a 2 digit hex escape for chars less that 0x100 if (c < 0x100) { char[] r = new char[4]; r[3] = HEX_DIGITS[c & 0xf]; c >>>= 4; r[2] = HEX_DIGITS[c & 0xf]; r[1] = 'x'; r[0] = '\\'; return r; } // 4 digit hex escape everything else char[] r = new char[6]; r[5] = HEX_DIGITS[c & 0xf]; c >>>= 4; r[4] = HEX_DIGITS[c & 0xf]; c >>>= 4; r[3] = HEX_DIGITS[c & 0xf]; c >>>= 4; r[2] = HEX_DIGITS[c & 0xf]; r[1] = 'u'; r[0] = '\\'; return r; } } /** * Escaper for HTML character escaping, contains both an array and a * backup function. We're not overriding the array decorator because we * want to keep this as fast as possible, so no calls to super.escape first. */ private static class HtmlCharEscaper extends FastCharEscaper { public HtmlCharEscaper(char[][] replacements) { super(replacements, Character.MIN_VALUE, '~'); } @Override protected char[] escape(char c) { // First check if our array has a valid escaping. if (c < replacementLength) { char[] r = replacements[c]; if (r != null) { return r; } } // ~ is ASCII 126, the highest value char that does not need // to be escaped if (c <= safeMax) { return null; } int index; if (c < 1000) { index = 4; } else if (c < 10000) { index = 5; } else { index = 6; } char[] result = new char[index + 2]; result[0] = '&'; result[1] = '#'; result[index + 1] = ';'; // to avoid the division and modulo operators. int intValue = c; for (; index > 1; index--) { result[index] = HEX_DIGITS[intValue % 10]; intValue /= 10; } return result; } } /** * A composite {@code CharEscaper} object that tries to escape characters * using a primary {@code CharEscaper} first and falls back to a secondary * one if there is no escaping. */ private static class FallThroughCharEscaper extends CharEscaper { private final CharEscaper primary; private final CharEscaper secondary; public FallThroughCharEscaper(CharEscaper primary, CharEscaper secondary) { this.primary = primary; this.secondary = secondary; } @Override protected char[] escape(char c) { char result[] = primary.escape(c); if (result == null) { result = secondary.escape(c); } return result; } } private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray(); }