/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.template.soy.shared.restricted;
import static java.nio.charset.StandardCharsets.UTF_8;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.escape.Escaper;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import javax.annotation.ParametersAreNonnullByDefault;
/**
* Definitions of escaping functions that behave consistently in JavaScript and Java that implement
* the escaping directives as in <code>{print $x <b>|escapeJsString</b>}</code>.
*
* <p>An escaping convention is defined in terms of
*
* <ol>
* <li>An optional filter predicate that all valid inputs must match.
* <li>An optional function name from the closure JavaScript library that already implements the
* escaping convention.
* <li>A required mapping from characters to escaping strings.
* </ol>
*
* <p>Escaping functions are exposed as {@link Escaper}s in Java and via a JavaScript code
* generating ant task for JavaScript.
*
*/
@ParametersAreNonnullByDefault
public final class EscapingConventions {
// Below we take advantage of lazy class loading to avoid doing the work of initializing maps
// or loading code for escaping conventions never used by the Java runtime.
// We first define a base class that collects the information above, and that allows enumeration
// over escaped characters.
// Each escaping convention is its own public interface to java code, and the JavaScript code
// generator uses a public accessor that ties them all together.
/** The list of potential languages which are used by the escapers. */
public static enum EscapingLanguage {
JAVASCRIPT,
PYTHON
}
/**
* A mapping from a plain text character to the escaped text in the target language. We define a
* character below as a code unit, not a codepoint as none of the target languages treat
* supplementary codepoints as special.
*/
public static final class Escape implements Comparable<Escape> {
private final char plainText;
private final String escaped;
public Escape(char plainText, String escaped) {
this.plainText = plainText;
this.escaped = escaped;
}
/** A character in the input language. */
public char getPlainText() {
return plainText;
}
/**
* A string in the output language that corresponds to {@link #getPlainText} in the input
* language.
*/
public String getEscaped() {
return escaped;
}
@Override
public int compareTo(Escape b) {
return this.plainText - b.plainText;
}
}
/**
* A transformation on strings that preserves some correctness or safety properties. Subclasses
* come in three varieties:
*
* <dl>
* <dt>Escaper
* <dd>A mapping from strings in an input language to strings in an output language that
* preserves the content. E.g. the plain text string {@code 1 < 2} can be escaped to the
* equivalent HTML string {@code 1 < 2}.
* <dt>Normalizer
* <dd>A mapping from strings in a language to equivalent strings in the same language but that
* can be more easily embedded in another language. E.g. the URI {@code
* http://www.google.com/search?q=O'Reilly} is equivalent to {@code
* http://www.google.com/search?q=O%27Reilly} but the latter can be safely embedded in a
* single quoted HTML attribute.
* <dt>Filter
* <dd>A mapping from strings in a language to the same value or to an innocuous value. E.g. the
* string {@code h1} might pass an html identifier filter but the string {@code
* ><script>alert('evil')</script>} should not and could be replaced by an innocuous value
* like {@code zzz}.
* </dl>
*/
public abstract static class CrossLanguageStringXform extends Escaper {
private final String directiveName;
private final @Nullable Pattern valueFilter;
private final ImmutableList<Escape> escapes;
/**
* A dense mapping mirroring escapes. I.e. for each element of {@link #escapes} {@code e} such
* that {@code e.plainText < 0x80}, {@code escapesByCodeUnit[e.plainText] == e.escaped}.
*/
private final String[] escapesByCodeUnit;
/** Keys in a sparse mapping for the non ASCII {@link #escapes}. */
private final char[] nonAsciiCodeUnits;
/** Values in a sparse mapping corresponding to {@link #nonAsciiCodeUnits}. */
private final String[] nonAsciiEscapes;
/** @see #getNonAsciiPrefix */
private final @Nullable String nonAsciiPrefix;
/**
* @param valueFilter {@code null} if the directive accepts all strings as inputs. Otherwise a
* regular expression that accepts only strings that can be escaped by this directive.
* @param nonAsciiPrefix An escaping prefix in {@code "%", "\\u", "\\"} which specifies how to
* escape non-ASCII code units not in the sparse mapping. If null, then non-ASCII code units
* outside the sparse map can appear unescaped.
*/
protected CrossLanguageStringXform(
@Nullable Pattern valueFilter, @Nullable String nonAsciiPrefix) {
String simpleName = getClass().getSimpleName();
// EscapeHtml -> |escapeHtml
this.directiveName =
("|" + Character.toLowerCase(simpleName.charAt(0)) + simpleName.substring(1));
this.valueFilter = valueFilter;
this.escapes = defineEscapes();
// Now create the maps used by the escape methods. The below depends on defineEscapes()
// returning sorted escapes. EscapeListBuilder.build() sorts its escapes.
int numEscapes = escapes.size();
int numAsciiEscapes = escapes.size();
while (numAsciiEscapes > 0 && escapes.get(numAsciiEscapes - 1).plainText >= 0x80) {
--numAsciiEscapes;
}
// Create the dense ASCII map.
if (numAsciiEscapes != 0) {
escapesByCodeUnit = new String[escapes.get(numAsciiEscapes - 1).plainText + 1];
for (Escape escape : escapes.subList(0, numAsciiEscapes)) {
escapesByCodeUnit[escape.plainText] = escape.escaped;
}
} else {
escapesByCodeUnit = new String[0];
}
// Create the sparse non-ASCII map.
if (numEscapes != numAsciiEscapes) {
int numNonAsciiEscapes = numEscapes - numAsciiEscapes;
nonAsciiCodeUnits = new char[numNonAsciiEscapes];
nonAsciiEscapes = new String[numNonAsciiEscapes];
for (int i = 0; i < numNonAsciiEscapes; ++i) {
Escape esc = escapes.get(numAsciiEscapes + i);
nonAsciiCodeUnits[i] = esc.plainText;
nonAsciiEscapes[i] = esc.escaped;
}
} else {
nonAsciiCodeUnits = new char[0];
nonAsciiEscapes = new String[0];
}
// The fallback mode if neither the ASCII nor non-ASCII escaping maps contain a mapping.
this.nonAsciiPrefix = nonAsciiPrefix;
}
/** Returns the escapes used for this escaper. */
protected abstract ImmutableList<Escape> defineEscapes();
/**
* The name of the directive associated with this escaping function.
*
* @return E.g. {@code |escapeHtml}
*/
public String getDirectiveName() {
return directiveName;
}
/**
* An escaping prefix in {@code "%", "\\u", "\\"} which specifies how to escape non-ASCII code
* units not in the sparse mapping. If null, then non-ASCII code units outside the sparse map
* can appear unescaped.
*/
public final @Nullable String getNonAsciiPrefix() {
return nonAsciiPrefix;
}
/**
* Null if the escaper accepts all strings as inputs, or otherwise a regular expression that
* accepts only strings that can be escaped by this escaper.
*/
public final @Nullable Pattern getValueFilter() {
return valueFilter;
}
/** The escapes need to translate the input language to the output language. */
public final ImmutableList<Escape> getEscapes() {
return escapes;
}
/**
* The names of existing language builtins or available library functions (such as Google
* Closure) that implement the escaping convention.
*
* @param language The language being escaped.
* @return {@code null} if there is no such function.
*/
public List<String> getLangFunctionNames(EscapingLanguage language) {
return ImmutableList.<String>of();
}
/** Returns an innocuous string in this context that can be used when filtering. */
public String getInnocuousOutput() {
return INNOCUOUS_OUTPUT;
}
// Methods that satisfy the Escaper interface.
@Override
public final String escape(String string) {
// We pass null so that we don't unnecessarily allocate (and zero) or copy char arrays.
StringBuilder sb = maybeEscapeOnto(string, null);
return sb != null ? sb.toString() : string;
}
// TODO(lukes): consider eliminating this method, it was removed from the Escaper interface.
public final Appendable escape(final Appendable out) {
return new Appendable() {
@Override
public Appendable append(CharSequence csq) throws IOException {
maybeEscapeOnto(csq, out, 0, csq.length());
return this;
}
@Override
public Appendable append(CharSequence csq, int start, int end) throws IOException {
maybeEscapeOnto(csq, out, start, end);
return this;
}
@Override
public Appendable append(char c) throws IOException {
if (c < escapesByCodeUnit.length) { // Use the dense map.
String esc = escapesByCodeUnit[c];
if (esc != null) {
out.append(esc);
return this;
}
} else if (c >= 0x80) {
int index = Arrays.binarySearch(nonAsciiCodeUnits, c);
if (index >= 0) { // Found in the sparse map.
out.append(nonAsciiEscapes[index]);
return this;
}
if (nonAsciiPrefix != null) { // Fallback for non-ASCII code units.
escapeUsingPrefix(c, out);
return this;
}
}
out.append(c);
return this;
}
};
}
/**
* Escapes the given char sequence onto the given buffer iff it contains characters that need to
* be escaped.
*
* @return null if no output buffer was passed in, and s contains no characters that need
* escaping. Otherwise out, or a StringBuilder if one needed to be allocated.
*/
private @Nullable StringBuilder maybeEscapeOnto(CharSequence s, @Nullable StringBuilder out) {
try {
return (StringBuilder) maybeEscapeOnto(s, out, 0, s.length());
} catch (IOException ex) {
// StringBuilders should not throw IOExceptions.
throw new AssertionError(ex);
}
}
/**
* Escapes the given range of the given sequence onto the given buffer iff it contains
* characters that need to be escaped.
*
* @return null if no output buffer was passed in, and s contains no characters that need
* escaping. Otherwise out, or a StringBuilder if one needed to be allocated.
*/
private @Nullable Appendable maybeEscapeOnto(
CharSequence s, @Nullable Appendable out, int start, int end) throws IOException {
int pos = start;
for (int i = start; i < end; ++i) {
char c = s.charAt(i);
if (c < escapesByCodeUnit.length) { // Use the dense map.
String esc = escapesByCodeUnit[c];
if (esc != null) {
if (out == null) {
// Create a new buffer if we need to escape a character in s.
// We add 32 to the size to leave a decent amount of space for escape characters.
out = new StringBuilder(end - start + 32);
}
out.append(s, pos, i).append(esc);
pos = i + 1;
}
} else if (c >= 0x80) { // Use the sparse map.
int index = Arrays.binarySearch(nonAsciiCodeUnits, c);
if (index >= 0) {
if (out == null) {
out = new StringBuilder(end - start + 32);
}
out.append(s, pos, i).append(nonAsciiEscapes[index]);
pos = i + 1;
} else if (nonAsciiPrefix != null) { // Fallback to the prefix based escaping.
if (out == null) {
out = new StringBuilder(end - start + 32);
}
out.append(s, pos, i);
escapeUsingPrefix(c, out);
pos = i + 1;
}
}
}
if (out != null) {
out.append(s, pos, end);
}
return out;
}
/**
* Appends a hex representation of the given code unit to out preceded by the {@link
* #nonAsciiPrefix}.
*
* @param c A code unit greater than or equal to 0x80.
* @param out written to.
*/
private void escapeUsingPrefix(char c, Appendable out) throws IOException {
if ("%".equals(nonAsciiPrefix)) { // Use a UTF-8
if (c < 0x800) {
out.append('%');
appendHexPair(((c >>> 6) & 0x1f) | 0xc0, out);
} else {
out.append('%');
appendHexPair(((c >>> 12) & 0xf) | 0xe0, out);
out.append('%');
appendHexPair(((c >>> 6) & 0x3f) | 0x80, out);
}
out.append('%');
appendHexPair((c & 0x3f) | 0x80, out);
} else {
out.append(nonAsciiPrefix);
appendHexPair((c >>> 8) & 0xff, out);
appendHexPair(c & 0xff, out);
if ("\\".equals(nonAsciiPrefix)) {
// Append with a space so that CSS escape doesn't pull in any hex digits following.
out.append(' ');
}
}
}
private static final char[] HEX_DIGITS = {
'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
};
/** Given {@code 0x20} appends {@code "20"} to the given output buffer. */
private void appendHexPair(int b, Appendable out) throws IOException {
out.append(HEX_DIGITS[b >>> 4]);
out.append(HEX_DIGITS[b & 0xf]);
}
}
/** A builder for lists of escapes. */
private abstract static class EscapeListBuilder {
private final List<Escape> escapes = Lists.newArrayList();
/**
* Computes the numeric escape in the output language for the given codepoint in the input
* language. E.g. in C, the numeric escape for space is {@code \x20}.
*/
abstract String getNumericEscapeFor(char plainText);
/** Adds an escape for the given code unit in the input language to the given escaped text. */
final EscapeListBuilder escape(char plainText, String escaped) {
escapes.add(new Escape(plainText, escaped));
return this;
}
/**
* Adds an escape for the given code unit in the input language using the numeric escaping
* scheme.
*/
final EscapeListBuilder escape(char plainText) {
escapes.add(new Escape(plainText, getNumericEscapeFor(plainText)));
return this;
}
/** Adds a numeric escape for each code unit in the input string. */
final EscapeListBuilder escapeAll(String plainTextCodeUnits) {
int numCodeUnits = plainTextCodeUnits.length();
for (int i = 0; i < numCodeUnits; ++i) {
escape(plainTextCodeUnits.charAt(i));
}
return this;
}
/** Adds numeric escapes for each code unit in the given range not in the exclusion set. */
final EscapeListBuilder escapeAllInRangeExcept(
int startInclusive, int endExclusive, char... notEscaped) {
notEscaped = notEscaped.clone();
Arrays.sort(notEscaped);
int k = 0;
int numNotEscaped = notEscaped.length;
for (int i = startInclusive; i < endExclusive; ++i) {
while (k < numNotEscaped && notEscaped[k] < i) {
++k;
}
if (k < numNotEscaped && notEscaped[k] == i) {
continue;
}
escape((char) i);
}
return this;
}
/** The list of all escapes defined thus far. */
final ImmutableList<Escape> build() {
Collections.sort(escapes);
return ImmutableList.copyOf(escapes);
}
}
/** Escapes using HTML/XML numeric entities : {@code 'A' -> "A"}. */
private static final class HtmlEscapeListBuilder extends EscapeListBuilder {
@Override
String getNumericEscapeFor(char plainText) {
return "" + ((int) plainText) + ";";
}
}
// Implementations of particular escapers.
// These names follow the convention defined in Escaper's constructor above where
// class EscapeFoo
// is the concrete definition for
// |escapeFoo
// Each also provides a singleton INSTANCE member.
/** Implements the {@code |escapeHtml} directive. */
public static final class EscapeHtml extends CrossLanguageStringXform {
/** Implements the {@code |escapeHtml} directive. */
public static final EscapeHtml INSTANCE = new EscapeHtml();
private EscapeHtml() {
super(null, null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return new HtmlEscapeListBuilder()
.escape('&', "&")
.escape('<', "<")
.escape('>', ">")
.escape('"', """)
// It escapes ' to ' instead of ' which is not standardized in XML.
.escapeAll("\0'")
.build();
}
@Override
public List<String> getLangFunctionNames(EscapingLanguage language) {
if (language == EscapingLanguage.JAVASCRIPT) {
return ImmutableList.<String>of("goog.string.htmlEscape");
}
return super.getLangFunctionNames(language);
}
}
/**
* A directive that encodes any HTML special characters that can appear in RCDATA unescaped but
* that can be escaped without changing semantics. From <a
* href="http://www.w3.org/TR/html5/tokenization.html#rcdata-state">HTML 5</a>:
*
* <blockquote>
*
* <h4>8.2.4.3 RCDATA state</h4>
*
* Consume the next input character:
*
* <ul>
* <li>U+0026 AMPERSAND (&) <br>
* Switch to the character reference in RCDATA state.
* <li>U+003C LESS-THAN SIGN (<) <br>
* Switch to the RCDATA less-than sign state.
* <li>EOF <br>
* Emit an end-of-file token.
* <li>Anything else <br>
* Emit the current input character as a character token.
* </ul>
*
* </blockquote>
*
* So all HTML special characters can be escaped, except ampersand, since escaping that would lead
* to overescaping of legitimate HTML entities.
*/
public static final class NormalizeHtml extends CrossLanguageStringXform {
/** Implements the {@code |normalizeHtml} directive. */
public static final NormalizeHtml INSTANCE = new NormalizeHtml();
private NormalizeHtml() {
super(null, null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
ImmutableList.Builder<Escape> escapes = ImmutableList.builder();
for (Escape esc : EscapeHtml.INSTANCE.getEscapes()) {
if (esc.plainText != '&') {
escapes.add(esc);
}
}
return escapes.build();
}
}
/**
* Implements the {@code |escapeHtmlNoSpace} directive which allows arbitrary content to be
* included in the value of an unquoted HTML attribute.
*/
public static final class EscapeHtmlNospace extends CrossLanguageStringXform {
/** Implements the {@code |escapeHtmlNospace} directive. */
public static final EscapeHtmlNospace INSTANCE = new EscapeHtmlNospace();
private EscapeHtmlNospace() {
super(null, null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return new HtmlEscapeListBuilder()
.escape('&', "&")
.escape('<', "<")
.escape('>', ">")
.escape('"', """)
// The below list of characters are all those that need to be encode to prevent unquoted
// value splitting.
//
// From the XML spec,
// [3] S ::= (#x20 | #x9 | #xD | #xA)+
// From section 2.4.1 of the HTML5 draft,
// The space characters, for the purposes of this specification, are
// U+0020 SPACE, U+0009 CHARACTER TABULATION (tab), U+000A LINE FEED (LF),
// U+000C FORM FEED (FF), and U+000D CARRIAGE RETURN (CR).
// The White_Space characters are those that have the Unicode property
// "White_Space" in the Unicode PropList.txt data file.
// From XML processing notes:
// [XML1.1] also normalizes NEL (U+0085) and U+2028 LINE SEPARATOR, but
// U+2029 PARAGRAPH SEPARATOR is not treated that way.
// Those newline characters are described at
// http://unicode.org/reports/tr13/tr13-9.html
//
// Empirically, we need to quote
// U+0009 - U+000d, U+0020, double quote, single quote, '>', and back quote.
// based on running
// <body>
// <div id=d></div>
// <script>
// var d = document.getElementById('d');
//
// for (var i = 0x0; i <= 0xffff; ++i) {
// var unsafe = false;
//
// var ch = String.fromCharCode(i);
//
// d.innerHTML = '<input title=foo' + ch + 'checked>';
// var inp = d.getElementsByTagName('INPUT')[0];
// if (inp && (inp.getAttribute('title') === 'foo' || inp.checked)) {
// unsafe = true;
// } else { // Try it as a quoting character.
// d.innerHTML = '<input title=' + ch + 'foo' + ch + 'checked>';
// inp = d.getElementsByTagName('INPUT')[0];
// unsafe = !!(inp && (inp.getAttribute('title') === 'foo' || inp.checked));
// }
// if (unsafe) {
// var fourhex = i.toString(16);
// fourhex = "0000".substring(fourhex.length) + fourhex;
// document.write('\\u' + fourhex + '<br>');
// }
// }
// </script>
// in a variety of browsers.
//
// We supplement that set with the quotes and equal sign which have special
// meanings in attributes, and with the XML normalized spaces.
.escapeAll("\u0000\u0009\n\u000B\u000C\r '-/=\u0060\u0085\u00a0\u2028\u2029")
.build();
}
}
/**
* A directive that encodes any HTML special characters and unquoted attribute terminators that
* can appear in RCDATA unescaped but that can be escaped without changing semantics.
*/
public static final class NormalizeHtmlNospace extends CrossLanguageStringXform {
/** Implements the {@code |normalizeHtml} directive. */
public static final NormalizeHtmlNospace INSTANCE = new NormalizeHtmlNospace();
private NormalizeHtmlNospace() {
super(null, null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
ImmutableList.Builder<Escape> escapes = ImmutableList.builder();
for (Escape esc : EscapeHtmlNospace.INSTANCE.getEscapes()) {
if (esc.plainText != '&') {
escapes.add(esc);
}
}
return escapes.build();
}
}
/** Escapes using hex escapes since octal are non-standard. 'A' -> "\\x41" */
private static final class JsEscapeListBuilder extends EscapeListBuilder {
@Override
String getNumericEscapeFor(char plainText) {
return String.format(plainText < 0x100 ? "\\x%02x" : "\\u%04x", (int) plainText);
}
}
/**
* Implements the {@code |escapeJsString} directive which allows arbitrary content to be included
* inside a quoted JavaScript string.
*/
public static final class EscapeJsString extends CrossLanguageStringXform {
/** Implements the {@code |escapeJsString} directive. */
public static final EscapeJsString INSTANCE = new EscapeJsString();
private EscapeJsString() {
super(null, null); // TODO(msamuel): Maybe use goog.string.quote
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return new JsEscapeListBuilder()
// Some control characters.
.escape('\u0000')
.escape('\b') // \\b means word-break inside RegExps.
.escape('\t', "\\t")
.escape('\n', "\\n")
.escape('\u000b') // \\v not consistently supported on IE.
.escape('\f', "\\f")
.escape('\r', "\\r")
.escape('\\', "\\\\")
// Quoting characters. / is also instrumental in </script>.
.escape('"')
.escape('\'')
.escape('/', "\\/")
.escapeAll("\u2028\u2029") // JavaScript newlines
.escape('\u0085') // A JavaScript newline according to at least one draft spec.
// HTML special characters. Note, that this provides added protection against problems
// with </script> <![CDATA[, ]]>, <!--, -->, etc.
.escapeAll("<>&=")
.build();
}
}
/**
* Implements the {@code |escapeJsRegex} directive which allows arbitrary content to be included
* inside a JavaScript regular expression.
*/
public static final class EscapeJsRegex extends CrossLanguageStringXform {
/** Implements the {@code |escapeJsRegex} directive. */
public static final EscapeJsRegex INSTANCE = new EscapeJsRegex();
private EscapeJsRegex() {
// TODO(msamuel): maybe use goog.string.regExpEscape after fixing it to escape
// [\r\n\u2028\u2029]
super(null, null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return new JsEscapeListBuilder()
// Some control characters.
.escape('\u0000')
.escape('\b') // \\b means word-break inside RegExps.
.escape('\t', "\\t")
.escape('\n', "\\n")
.escape('\u000b') // \\v not consistently supported on IE.
.escape('\f', "\\f")
.escape('\r', "\\r")
.escape('\\', "\\\\") // Escape prefix
.escapeAll("\u2028\u2029") // JavaScript newlines
.escape('\u0085') // A JavaScript newline according to at least one draft spec.
// Quoting characters. / is also instrumental in </script>.
.escape('"')
.escape('\'')
.escape('/', "\\/")
// HTML special characters. Note, that this provides added protection against problems
// with </script> <![CDATA[, ]]>, <!--, -->, etc.
.escapeAll("<>&=")
// Special in regular expressions. / is also special, but is escaped above.
.escapeAll("$()*+-.:?[]^{|},")
.build();
}
}
/**
* Escapes using CSS hex escapes with a space at the end in case a hex digit is the next character
* : {@code 'A' => "\41 "}
*/
private static final class CssEscapeListBuilder extends EscapeListBuilder {
@Override
String getNumericEscapeFor(char plainText) {
return String.format("\\%x ", (int) plainText);
}
}
/**
* Implements the {@code |escapeCssString} directive which allows arbitrary content to be included
* in a CSS quoted string or identifier.
*/
public static final class EscapeCssString extends CrossLanguageStringXform {
/** Implements the {@code |escapeCssString} directive. */
public static final EscapeCssString INSTANCE = new EscapeCssString();
private EscapeCssString() {
super(null, null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return new CssEscapeListBuilder()
// Escape newlines and similar control characters, quotes, HTML special characters, and
// CSS punctuation that might cause CSS error recovery code to restart parsing in the
// middle of a string.
// Semicolons, close curlies, and @ (which precedes top-level directives like @media),
// and slashes in comment delimiters are all good places for CSS error recovery code to
// skip to.
// Quotes and parentheses are used as string and URL delimiters.
// Angle brackets and slashes appear in escaping text spans allowed in HTML5 <style>
// that might affect the parsing of subsequent content, and < appears in
// </style> which could prematurely close a style element.
// Newlines are disallowed in strings, so not escaping them can trigger CSS error
// recovery.
.escapeAll("\u0000\b\t\n\u000b\f\r\u0085\u00a0\u2028\u2029\"\'\\<>&{};:()@/=*")
.build();
}
}
/**
* Implements the {@code |filterCssValue} directive which filters out strings that are not valid
* CSS property names, keyword values, quantities, hex colors, or ID or class literals.
*/
public static final class FilterCssValue extends CrossLanguageStringXform {
/**
* Matches a CSS token that can appear unquoted as part of an ID, class, font-family-name, or
* CSS keyword value.
*/
public static final Pattern CSS_WORD =
Pattern.compile(
// See http://www.owasp.org/index.php/XSS_(Cross_Site_Scripting)_Prevention_Cheat_Sheet
// #RULE_.234_-_CSS_Escape_Before_Inserting_Untrusted_Data_into_HTML_Style_Property_Values
// for an explanation of why expression and moz-binding are bad.
"^(?!-*(?:expression|(?:moz-)?binding))"
+ // Should not start with spaces. Since we allow spaces between sub-values,
// we need this condition to disable space-only values.
"(?!\\s+)(?:"
+ // A latin class name or ID, CSS identifier, hex color or unicode range.
"[.#]?-?(?:[_a-z0-9-]+)(?:-[_a-z0-9-]+)*-?|"
+ // A non-hex color
"(?:rgb|hsl)a?\\([0-9.%, ]+\\)|"
+ // A quantity
"-?(?:[0-9]+(?:\\.[0-9]*)?|\\.[0-9]+)(?:[a-z]{1,2}|%)?|"
+ // The special value !important.
"!important|"
+ // Spaces.
"\\s+"
+ ")*\\z",
Pattern.CASE_INSENSITIVE);
/** Implements the {@code |filterCssValue} directive. */
public static final FilterCssValue INSTANCE = new FilterCssValue();
private FilterCssValue() {
super(CSS_WORD, null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return ImmutableList.<Escape>of();
}
}
/** Escapes using URI percent encoding : {@code 'A' => "%41"} */
private static final class UriEscapeListBuilder extends EscapeListBuilder {
@Override
String getNumericEscapeFor(char plainText) {
// URI encoding is different from the other escaping schemes.
// The others are transformations on strings of UTF-16 code units, but URIs are composed of
// strings of bytes. We assume UTF-8 as the standard way to convert between bytes and code
// units below.
byte[] bytes = Character.toString(plainText).getBytes(UTF_8);
int numBytes = bytes.length;
StringBuilder sb = new StringBuilder(numBytes * 3);
for (int i = 0; i < numBytes; ++i) {
// Use uppercase escapes for consistency with CharEscapers.uriEscaper().
sb.append(String.format("%%%02X", bytes[i]));
}
return sb.toString();
}
}
/**
* Implements the {@code |normalizeUri} directive which allows arbitrary content to be included in
* a URI regardless of the string delimiters of the the surrounding language. This normalizes, but
* does not escape, so it does not affect URI special characters, but instead escapes HTML, CSS,
* and JS delimiters.
*/
public static final class NormalizeUri extends CrossLanguageStringXform {
/** Implements the {@code |normalizeUri} directive. */
public static final NormalizeUri INSTANCE = new NormalizeUri();
private NormalizeUri() {
super(null, null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return new UriEscapeListBuilder()
// Escape all ASCII control characters.
.escapeAll("\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007")
.escapeAll("\u0008\u0009\n\u000B\u000C\r\u000E\u000F")
.escapeAll("\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017")
.escapeAll("\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F")
.escape('\u007f')
// Escape non-special URI characters that might prematurely close an unquoted CSS URI or
// HTML attribute.
// Parentheses and single quote are technically sub-delims, but not in HTTP or HTTPS,
// only appearing in the obsolete mark rule in section D.2. of RFC 3986.
// It is important to encode parentheses to prevent CSS URIs from being broken as in:
// background: {lb} background-image: url( /foo/{print $x}.png ) {rb}
// It is important to encode both quote characters to prevent broken CSS URIs and HTML
// attributes as in:
// background: {lb} background-image: url('/foo/{print $x}.png') {rb}
// and
// <img src="/foo/{print $x}.png">
.escapeAll(" (){}\"\'\\<>")
// More spaces and newlines.
.escapeAll("\u0085\u00A0\u2028\u2029")
// Make sure that full-width versions of reserved characters are escaped.
// Some user-agents treat full-width characters in URIs entered in the URL bar the same
// as the ASCII version so that URLs copied and pasted from written Chinese work.
// Each Latin printable character has a full-width equivalent in the U+FF00 code plane,
// e.g. the full-width colon is \uFF1A.
// http://www.cisco.com/en/US/products/products_security_response09186a008083f82e.html
// says that it is possible to route malicious URLs through intervening layers to the
// browser by using the full-width equivalents of special characters.
.escapeAll(toFullWidth(":/?#[]@!$&'()*+,;="))
.build();
}
}
/** Like {@link NormalizeUri} but filters out dangerous protocols. */
public static final class FilterNormalizeUri extends CrossLanguageStringXform {
/** Implements the {@code |filterNormalizeUri} directive. */
public static final FilterNormalizeUri INSTANCE = new FilterNormalizeUri();
private FilterNormalizeUri() {
// Disallows any protocol that is not in a whitelist.
// The below passes if there is
// (1) Either a protocol in a whitelist (http, https, mailto). This could be expanded but
// talk to your friendly local ise-team@ first.
// (2) or no protocol. A protocol must be followed by a colon. The below allows that by
// allowing colons only after one of the characters [/?#].
// A colon after a hash (#) must be in the fragment.
// Otherwise, a colon after a (?) must be in a query.
// Otherwise, a colon after a single solidus (/) must be in a path.
// Otherwise, a colon after a double solidus (//) must be in the authority (before port).
//
// Finally, the pattern disallows &, used in HTML entity declarations before one of the
// characters in [/?#].
// This disallows HTML entities used in the protocol name, which should never happen,
// e.g. "http" for "http".
// It also disallows HTML entities in the first path part of a relative path,
// e.g. "foo<bar/baz". Our existing escaping functions should not produce that.
// More importantly, it disallows masking of a colon, e.g. "javascript:...".
//
// Also Rejects paths with the following properties:
// (3) paths containing /../
// (4) paths ending in /..
super(
Pattern.compile(
"^"
+
// Reject case (3) and (4)
"(?![^#?]*/(?:\\.|%2E){2}(?:[/?#]|\\z))"
+
// Accept cases (1) and (2)
"(?:(?:https?|mailto):|[^&:/?#]*(?:[/?#]|\\z))",
Pattern.CASE_INSENSITIVE),
null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return NormalizeUri.INSTANCE.defineEscapes();
}
@Override
public String getInnocuousOutput() {
return "about:invalid#" + INNOCUOUS_OUTPUT;
}
}
/**
* Like {@link FilterNormalizeUri}, but also accepts {@code data:} and {@code blob:} URIs, since
* image sources don't execute script in the same origin as the page (although image handling
* 0-days are available from time to time, but a templating language can't realistically try to
* protect against such a thing).
*
* <p>Only intended to be used with images; for videos and audio we expect some sort of further
* review since they can more easily be used for social engineering. Video and audio still accept
* http/https because remote video and audio can still be protected against via CSP, but data URIs
* don't have self-evident provenance.
*/
public static final class FilterNormalizeMediaUri extends CrossLanguageStringXform {
/** Implements the {@code |filterNormalizeMediaUri} directive. */
public static final FilterNormalizeMediaUri INSTANCE = new FilterNormalizeMediaUri();
private FilterNormalizeMediaUri() {
// For image URIs, we use a relatively permissive filter. We accept:
// - http and https URLs
// - data URLs of supported types
// We don't worry about sequences of "/../" here, because path traversal isn't a worry for
// images, and detecting /../ sequences would add unnecessary complexity here.
super(
Pattern.compile(
// Allow relative URIs.
"^[^&:/?#]*(?:[/?#]|\\z)"
// Allow http and https URIs.
+ "|^https?:"
// Allow image data URIs. Ignore the subtype because browsers ignore them anyways.
// In fact, most browsers happily accept text/html or a completely empty MIME, but
// it doesn't hurt to verify that it at least looks vaguely correct.
+ "|^data:image/[a-z0-9+]+"
+ ";base64,[a-z0-9+/]+=*\\z"
// Blob URIs -- while there's no saying what's in them, (a) they are created on
// the same origin, and (b) no worse than loading a random http/https link.
+ "|^blob:",
Pattern.CASE_INSENSITIVE),
null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return NormalizeUri.INSTANCE.defineEscapes();
}
@Override
public String getInnocuousOutput() {
// NOTE: about:invalid is registered in http://www.w3.org/TR/css3-values/#about-invalid :
// "The about:invalid URI references a non-existent document with a generic error condition.
// It can be used when a URI is necessary, but the default value shouldn't be resolveable as
// any type of document."
return "about:invalid#" + INNOCUOUS_OUTPUT;
}
}
/**
* Accepts only data URI's that contain an image.
*
* <p>Developers use this simultaneously to allow data URI's, but also to ensure that the image
* tag won't initiate any HTTP requests.
*
* <p>NOTE: We may consider deprecating this now that img/data URIs are allowed by default, since
* it's unlikely too many projects need a mechanism to double-check that images are only loaded
* from data URIs; anyone else that does can simply scan the URL and fail if it detects
* http/https.
*/
public static final class FilterImageDataUri extends CrossLanguageStringXform {
/** Implements the {@code |filterImageDataUri} directive. */
public static final FilterImageDataUri INSTANCE = new FilterImageDataUri();
private FilterImageDataUri() {
super(
Pattern.compile(
"^data:image/(?:bmp|gif|jpe?g|png|tiff|webp);base64,[a-z0-9+/]+=*\\z",
Pattern.CASE_INSENSITIVE),
null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
// No normalization or escaping necessary -- the filter is limited to a strict subset that
// doesn't involve html stop-chars.
return ImmutableList.<Escape>of();
}
@Override
public String getInnocuousOutput() {
// Return something that is both clearly an image, but clearly invalid. We don't want the
// browser to fetch anything. We also don't necessarily want a transparent gif, since it
// doesn't alert developers to an issue. And finally, by not starting with GIF89a, we ensure
// the browser doesn't attempt to actually decode it and crash.
return "data:image/gif;base64,zSoyz";
}
}
/**
* Accepts only tel URIs but does not verify complete correctness.
*
* <p>The RFC for the tel: URI https://tools.ietf.org/html/rfc3966
*/
public static final class FilterTelUri extends CrossLanguageStringXform {
/** Implements the {@code |filterTelUri} directive. */
public static final FilterTelUri INSTANCE = new FilterTelUri();
private FilterTelUri() {
super(
Pattern.compile("^tel:[0-9a-z;=\\-+._!~*' /():&$#?@,]+\\z", Pattern.CASE_INSENSITIVE),
null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return ImmutableList.<Escape>of();
}
@Override
public String getInnocuousOutput() {
// NOTE: about:invalid is registered in http://www.w3.org/TR/css3-values/#about-invalid :
// "The about:invalid URI references a non-existent document with a generic error condition.
// It can be used when a URI is necessary, but the default value shouldn't be resolveable as
// any type of document."
return "about:invalid#" + INNOCUOUS_OUTPUT;
}
}
/**
* Implements the {@code |escapeUri} directive which allows arbitrary content to be included in a
* URI regardless of the string delimiters of the the surrounding language.
*/
public static final class EscapeUri extends CrossLanguageStringXform {
/** Implements the {@code |escapeUri} directive. */
public static final EscapeUri INSTANCE = new EscapeUri();
private EscapeUri() {
super(null, "%");
}
@Override
protected ImmutableList<Escape> defineEscapes() {
// From Appendix A of RFC 3986
// unreserved := ALPHA / DIGIT / "-" / "." / "_" / "~"
String unreservedChars = "-.";
for (char c = '0'; c <= '9'; c++) {
unreservedChars += c;
}
for (char c = 'A'; c <= 'Z'; c++) {
unreservedChars += c;
}
unreservedChars += '_';
for (char c = 'a'; c <= 'z'; c++) {
unreservedChars += c;
}
unreservedChars += '~';
return new UriEscapeListBuilder()
.escapeAllInRangeExcept(0, 0x80, unreservedChars.toCharArray())
// All non-ASCII codepoints escaped per the constructor above.
.build();
}
@Override
public List<String> getLangFunctionNames(EscapingLanguage language) {
if (language == EscapingLanguage.JAVASCRIPT) {
return ImmutableList.<String>of("goog.string.urlEncode", "encodeURIComponent");
} else if (language == EscapingLanguage.PYTHON) {
return ImmutableList.<String>of("urllib.quote");
}
return super.getLangFunctionNames(language);
}
}
/**
* Implements the {@code |filterHtmlAttributes} directive which filters out identifiers that can't
* appear as part of an HTML tag or attribute name.
*/
public static final class FilterHtmlAttributes extends CrossLanguageStringXform {
/** Implements the {@code |filterHtmlAttributes} directive. */
public static final FilterHtmlAttributes INSTANCE = new FilterHtmlAttributes();
private FilterHtmlAttributes() {
super(
Pattern.compile(
"^"
// Disallow on* and src* attribute names.
+ "(?!on|src|"
// Disallow specific other attribute names.
+ "(?:style|action|archive|background|cite|classid|codebase|data|dsync|href"
+ "|longdesc|usemap)\\s*$)"
+ "(?:"
// Must match letters
+ "[a-z0-9_$:-]*"
// Match until the end.
+ ")\\z",
Pattern.CASE_INSENSITIVE),
null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return ImmutableList.<Escape>of();
}
}
/**
* Implements the {@code |filterHtmlElementName} directive which filters out identifiers that
* can't appear as part of an HTML tag or attribute name.
*/
public static final class FilterHtmlElementName extends CrossLanguageStringXform {
/** Implements the {@code |filterHtmlElementName} directive. */
public static final FilterHtmlElementName INSTANCE = new FilterHtmlElementName();
private FilterHtmlElementName() {
super(
Pattern.compile(
"^"
// Disallow special element names.
+ "(?!script|style|title|textarea|xmp|no)"
+ "[a-z0-9_$:-]*\\z",
Pattern.CASE_INSENSITIVE),
null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return ImmutableList.<Escape>of();
}
}
/**
* Implements the {@code |filterCspNonceValue} directive
*
* <p>This only allows alphanumeric, plus, slash, and equals. So importantly it shouldn't be used
* in any programming-languagey context, such as:
*
* <ul>
* <li>JavaScript outside a string
* <li>CSS outside a string
* <li>tag names, attribute names ("attributes" context)
* </ul>
*
* <p>It is allowed in:
*
* <ul>
* <li>HTML pcdata, rcdata, attribute values, even nospace
* <li>CSS and JS strings
* <li>HTML, JS, CSS comments
* </ul>
*
* <p>And in practice, it is only used in:
*
* <ul>
* <li>HTML attribute values
* </ul>
*
* <p>See also https://www.w3.org/TR/CSP2/#nonce_value
*/
public static final class FilterCspNonceValue extends CrossLanguageStringXform {
public static final FilterCspNonceValue INSTANCE = new FilterCspNonceValue();
private FilterCspNonceValue() {
super(Pattern.compile("^[a-zA-Z0-9+/]+=*$"), null);
}
@Override
protected ImmutableList<Escape> defineEscapes() {
return ImmutableList.<Escape>of();
}
@Override
public String getInnocuousOutput() {
return INNOCUOUS_OUTPUT;
}
}
/** An accessor for all string transforms defined above. */
public static Iterable<CrossLanguageStringXform> getAllEscapers() {
// This list is hard coded but is checked by unittests for the contextual auto-escaper.
return ImmutableList.of(
EscapeHtml.INSTANCE,
NormalizeHtml.INSTANCE,
EscapeHtmlNospace.INSTANCE,
NormalizeHtmlNospace.INSTANCE,
EscapeJsString.INSTANCE,
EscapeJsRegex.INSTANCE,
EscapeCssString.INSTANCE,
FilterCssValue.INSTANCE,
EscapeUri.INSTANCE,
NormalizeUri.INSTANCE,
FilterCspNonceValue.INSTANCE,
FilterNormalizeUri.INSTANCE,
FilterNormalizeMediaUri.INSTANCE,
FilterImageDataUri.INSTANCE,
FilterTelUri.INSTANCE,
FilterHtmlAttributes.INSTANCE,
FilterHtmlElementName.INSTANCE);
}
/**
* A string, used as the result of a filter when the filter pattern does not match the input, that
* is not a substring of any keyword or well-known identifier in HTML, JS, or CSS and that is a
* valid identifier part in all those languages, and which cannot terminate a string, comment, or
* other bracketed section.
*
* <p>This string is also longer than necessary so that developers can use grep when it starts
* showing up in their output.
*
* <p>If grep directed you here, then one of your Soy templates is using a filter directive that
* is receiving a potentially unsafe input. Run your app in debug mode and you should get the name
* of the directive and the input deemed unsafe.
*/
public static final String INNOCUOUS_OUTPUT = "zSoyz";
/**
* Loose matcher for HTML tags, DOCTYPEs, and HTML comments. This will reliably find HTML tags
* (though not CDATA tags and not XML tags whose name or namespace starts with a non-latin
* character), and will do a good job with DOCTYPES (though will have trouble with complex
* doctypes that define their own entities) and does a decent job with simple HTML comments.
*
* <p>This should be good enough since HTML sanitizers do not typically output comments, or CDATA,
* or RCDATA content.
*
* <p>The tag name, if any is in group 1.
*/
public static final Pattern HTML_TAG_CONTENT =
Pattern.compile(
// Matches a left angle bracket followed by either
// (1) a "!" which indicates a doctype or comment, or
// (2) an optional solidus (/, indicating an end tag) and an HTML tag name.
// followed by any number of quoted strings (found in tags and doctypes) or other content
// terminated by a right angle bracket.
"<(?:!|/?([a-zA-Z][a-zA-Z0-9:\\-]*))(?:[^>'\"]|\"[^\"]*\"|'[^']*')*>");
/**
* Convert an ASCII string to full-width. Full-width characters are in Unicode page U+FFxx and are
* used to allow ASCII characters to be embedded in written Chinese without breaking alignment --
* so a sinograph which occupies two columns can line up properly with a Latin letter or symbol
* which normally occupies only one column.
*
* <p>See <a
* href="http://en.wikipedia.org/wiki/Duplicate_characters_in_Unicode#CJK_fullwidth_forms">CJK
* fullwidth forms</a> and <a href="unicode.org/charts/PDF/UFF00.pdf">unicode.org</a>.
*/
private static String toFullWidth(String ascii) {
int numChars = ascii.length();
StringBuilder sb = new StringBuilder(ascii);
for (int i = 0; i < numChars; ++i) {
char ch = ascii.charAt(i);
if (ch < 0x80) {
sb.setCharAt(i, (char) (ch + 0xff00 - 0x20));
}
}
return sb.toString();
}
}