/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.template.soy.parsepasses.contextautoesc;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.template.soy.data.SanitizedContent.ContentKind;
import com.google.template.soy.soytree.EscapingMode;
import com.google.template.soy.soytree.HtmlContext;
import com.google.template.soy.soytree.PrintDirectiveNode;
import java.util.Arrays;
import java.util.EnumMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import javax.annotation.Nullable;
/**
* Encapsulates the context in which a Soy node appears. This helps us distinguish Soy nodes that
* can only be preceded by fully formed HTML tags and text chunks from ones that appear inside
* JavaScript, from ones that appear inside URIs, etc.
*
* <p>This is an immutable bit-packable struct that contains a number of enums. These enums have
* their own nullish values like {@link Context.ElementType#NONE} so should always be non-null.
*
* <p>The contextual autoescape rewriter propagates contexts so that it can infer an appropriate
* {@link EscapingMode escaping function} for each <code>{print ...}</code> command.
*
* <p>To make sure it can correctly identify a unique escape convention for all paths to a
* particular print command, it may clone a template for each context in which it is called, using
* the {@link Context#packedBits bitpacked} form of the context to generate a unique template name.
*
*/
public final class Context {
/** The state the text preceding the context point describes. */
public final HtmlContext state;
/**
* Describes the innermost element that the text preceding the context point is in. An element is
* considered entered once its name has been seen in the start tag and is considered closed once
* the name of its end tag is seen. E.g. the open point is marked with O below and C marks the
* close point.
*
* <pre>{@code
* <b id="boldly-going">Hello, World!</b >
*   ^                                  ^
*   O                                  C
* }</pre>
*
* Outside an element, or in PCDATA text, this will be the nullish value {@link ElementType#NONE}.
*/
public final ElementType elType;
/**
* Describes the attribute whose value the context point is in. Outside an attribute value, this
* will be the nullish value {@link AttributeType#NONE}.
*/
public final AttributeType attrType;
/**
* Describes the quoting convention for the attribute value that the context point is in. Outside
* an attribute value, this will be the nullish value {@link AttributeEndDelimiter#NONE}.
*/
public final AttributeEndDelimiter delimType;
/**
* Determines what we will do with a slash token {@code /}. This is irrelevant outside JavaScript
* contexts, but inside JavaScript, it helps us distinguish the contexts of <code>{$bar}</code> in
* <code>"foo".replace(/{$bar}/i)</code> and <code>x/{$bar}/i</code>.
*/
public final JsFollowingSlash slashType;
/** Determines how we encode interpolations in URI attributes and CSS {@code url(...)}. */
public final UriPart uriPart;
/** Determines the context in which this URI is being used. */
public final UriType uriType;
/** The count of {@code <template>} elements entered and not subsequently exited. */
public final int templateNestDepth;
/**
 * Assembles a context from each of its components. Use {@link Builder} to construct instances.
 *
 * @throws IllegalArgumentException if a URI part is given without a URI type.
 */
private Context(
    HtmlContext state,
    ElementType elType,
    AttributeType attrType,
    AttributeEndDelimiter delimType,
    JsFollowingSlash slashType,
    UriPart uriPart,
    UriType uriType,
    int templateNestDepth) {
  // NOTE: The constraint is one-way; once we see the src attribute we may set the UriType before
  // we start actually parsing the URI. So a URI type without a URI part is fine, but not the
  // reverse.
  Preconditions.checkArgument(
      uriPart == UriPart.NONE || uriType != UriType.NONE,
      "If in a URI, the type of URI must be specified. UriType = %s but UriPart = %s",
      uriType,
      uriPart);

  this.state = state;
  this.elType = elType;
  this.attrType = attrType;
  this.delimType = delimType;
  this.slashType = slashType;
  this.uriPart = uriPart;
  this.uriType = uriType;
  this.templateNestDepth = templateNestDepth;
}
/**
* A context in the given state outside any element, attribute, or Javascript content: every
* other component is set to its nullish {@code NONE} value and the template nesting depth is
* zero.
*/
private Context(HtmlContext state) {
this(
state,
ElementType.NONE,
AttributeType.NONE,
AttributeEndDelimiter.NONE,
JsFollowingSlash.NONE,
UriPart.NONE,
UriType.NONE,
0);
}
/**
* The normal context for HTML where a less-than sign opens a tag and an ampersand starts an HTML
* entity.
*/
public static final Context HTML_PCDATA = new Context(HtmlContext.HTML_PCDATA);
/**
 * Returns a context that differs only in the state. This instance is returned unchanged when it
 * is already in the requested state.
 */
public Context derive(HtmlContext state) {
  if (state == this.state) {
    return this;
  }
  return toBuilder().withState(state).build();
}

/**
 * Returns a context that differs only in the following slash. This instance is returned unchanged
 * when it already has the requested slash type.
 */
public Context derive(JsFollowingSlash slashType) {
  if (slashType == this.slashType) {
    return this;
  }
  return toBuilder().withSlashType(slashType).build();
}

/**
 * Returns a context that differs only in the uri part. This instance is returned unchanged when
 * it already has the requested URI part.
 */
public Context derive(UriPart uriPart) {
  if (uriPart == this.uriPart) {
    return this;
  }
  return toBuilder().withUriPart(uriPart).build();
}
/**
* Returns a new mutable {@link Builder}, constructed from this context, that allows deriving
* variant contexts.
*/
Builder toBuilder() {
return new Builder(this);
}
/**
 * The context after printing a correctly-escaped dynamic value in this context.
 *
 * <p>This makes the optimistic assumption that the escaped string is not empty. That assumption
 * can be wrong, but the default is to fail closed; for example, printing an empty string at
 * UriPart.START switches to MAYBE_VARIABLE_SCHEME, which is designed not to trust the printed
 * value anyway. Same in JS -- we might switch to DIV_OP when we should have stayed in REGEX, but
 * in the worst case, we'll just produce JavaScript that doesn't compile (which is safe).
 *
 * @throws IllegalStateException if the state is JS but the slash type is NONE (or unrecognized).
 */
public Context getContextAfterDynamicValue() {
  if (state == HtmlContext.JS) {
    switch (slashType) {
      case REGEX:
        // A value was just printed, so a following slash must be a division operator.
        return derive(JsFollowingSlash.DIV_OP);
      case DIV_OP:
      case UNKNOWN:
        return this;
      case NONE:
      default:
        throw new IllegalStateException(slashType.name());
    }
  }

  boolean beforeTagName =
      state == HtmlContext.HTML_BEFORE_OPEN_TAG_NAME
          || state == HtmlContext.HTML_BEFORE_CLOSE_TAG_NAME;
  if (beforeTagName) {
    // We assume ElementType.NORMAL, because filterHtmlElementName filters dangerous tag names.
    Builder afterTagName = toBuilder();
    afterTagName.withState(HtmlContext.HTML_TAG_NAME).withElType(ElementType.NORMAL);
    return afterTagName.build();
  }

  if (state == HtmlContext.HTML_TAG) {
    // To handle a substitution that starts an attribute name <tag {$attrName}=...>
    Builder inAttrName = toBuilder();
    inAttrName.withState(HtmlContext.HTML_ATTRIBUTE_NAME).withAttrType(AttributeType.PLAIN_TEXT);
    return inAttrName.build();
  }

  if (uriPart == UriPart.START) {
    // TODO(gboyer): When we start enforcing strict URI syntax, make it an error to call this if
    // we're already in MAYBE*_SCHEME, because it is possible in a non-strict contextual template
    // that someone would use noAutoescape to try and get around the requirement of no print
    // statements in MAYBE*_SCHEME.
    return derive(UriPart.MAYBE_VARIABLE_SCHEME);
  }

  return this;
}
/**
 * Returns a context that can be used to compute the escaping mode for a dynamic value.
 *
 * <p>Some epsilon transitions need to be delayed until we get into a branch. For example, we do
 * not transition into an unquoted attribute value context just because the raw text node that
 * contained the "=" did not contain a quote character, because the quote character may appear
 * inside branches as in
 *
 * <pre>{@code <a href={if ...}"..."{else}"..."{/if}>}</pre>
 *
 * which was derived from production code. But we need to force epsilon transitions to happen
 * consistently before a dynamic value is considered, as in
 *
 * <pre>{@code <a href={print $x}>}</pre>
 *
 * where we consider $x as happening in an unquoted attribute value context, not as occurring
 * before an attribute value.
 */
Context getContextBeforeDynamicValue() {
  if (state != HtmlContext.HTML_BEFORE_ATTRIBUTE_VALUE) {
    // No pending epsilon transition; this context is already the right one.
    return this;
  }
  // Commit to an unquoted attribute value.
  return computeContextAfterAttributeDelimiter(
      elType, attrType, AttributeEndDelimiter.SPACE_OR_TAG_END, uriType, templateNestDepth);
}
/**
 * Computes the context after an attribute delimiter is seen.
 *
 * @param elType The type of element whose tag the attribute appears in.
 * @param attrType The type of attribute whose value the delimiter starts.
 * @param delim The type of delimiter that will mark the end of the attribute value.
 * @param uriType The type of URI the attribute holds; must be non-NONE exactly when {@code
 *     attrType} is {@link AttributeType#URI}.
 * @param templateNestDepth The number of {@code <template>} elements on the open element stack.
 * @return A context suitable for the start of the attribute value.
 * @throws AssertionError if {@code attrType} is not a type that can have a value (e.g. NONE).
 * @throws IllegalArgumentException if {@code uriType} and {@code attrType} disagree about
 *     whether this is a URI attribute.
 */
static Context computeContextAfterAttributeDelimiter(
    ElementType elType,
    AttributeType attrType,
    AttributeEndDelimiter delim,
    UriType uriType,
    int templateNestDepth) {
  // Defaults for attribute kinds that embed neither JavaScript nor a URI.
  JsFollowingSlash followingSlash = JsFollowingSlash.NONE;
  UriPart startingUriPart = UriPart.NONE;
  final HtmlContext valueState;

  switch (attrType) {
    case URI:
      valueState = HtmlContext.URI;
      startingUriPart = UriPart.START;
      break;
    case STYLE:
      valueState = HtmlContext.CSS;
      break;
    case SCRIPT:
      valueState = HtmlContext.JS;
      // Start a JS block in a regex state since
      //   /foo/.test(str) && doSideEffect();
      // which starts with a regular expression literal is a valid and possibly useful program,
      // but there is no valid program which starts with a division operator.
      followingSlash = JsFollowingSlash.REGEX;
      break;
    case PLAIN_TEXT:
      valueState = HtmlContext.HTML_NORMAL_ATTR_VALUE;
      break;
    default:
      // NONE is not a valid AttributeType inside an attribute value.
      throw new AssertionError("Unexpected attribute type " + attrType);
  }

  boolean hasUriType = uriType != UriType.NONE;
  boolean isUriAttribute = attrType == AttributeType.URI;
  Preconditions.checkArgument(
      hasUriType == isUriAttribute, "uriType=%s but attrType=%s", uriType, attrType);

  return new Context(
      valueState,
      elType,
      attrType,
      delim,
      followingSlash,
      startingUriPart,
      uriType,
      templateNestDepth);
}
/**
* Returns the escaping modes appropriate for dynamic content inserted in this context.
*
* <p>The result lists, in the order they are added: an optional trusted-resource-URI filter, the
* primary escaping mode, and an optional extra mode that allows the primary mode's output to be
* embedded in the attribute quoting context in which it appears.
*
* @param printDirectives the print directives attached to the dynamic value; they are only
* inspected (by name) when {@link #uriType} is {@link UriType#TRUSTED_RESOURCE}.
* @return a non-empty immutable list of escaping modes.
* @throws SoyAutoescapeException if the state has no escaping mode (the state supplies the error
* message), or if the URI part is one for which no safe escaping convention can be chosen.
*/
public ImmutableList<EscapingMode> getEscapingModes(List<PrintDirectiveNode> printDirectives) {
EscapingMode escapingMode = state.getEscapingMode();
// Short circuit on the error case first.
if (escapingMode == null) {
throw SoyAutoescapeException.createWithoutMetaInfo(state.getErrorMessage());
}
// Any additional mode that allows the primary escaping mode's output language to be
// embedded in the specific quoting context in which it appears.
EscapingMode extraEscapingMode = null;
// Keep track of whether a URI is a TrustedResource. We want some resource URIs like sources to
// be safe and not in attacker control. Hence, a restriction that these resource URIs need to be
// compile time constants is being set. To make sure these are compile time constants these
// either need to be of type string or TrustedResourceUrl.
EscapingMode truMode = null;
if (uriType == UriType.TRUSTED_RESOURCE) {
truMode = EscapingMode.FILTER_TRUSTED_RESOURCE_URI;
for (PrintDirectiveNode directive : printDirectives) {
// If a print directive with the name "|blessStringAsTrustedResourceUrlForLegacy" exists
// we don't want to enforce presence of a trusted resource URL. This is mainly done so as
// not to break the legacy soy files.
if (directive.getName().equals("|blessStringAsTrustedResourceUrlForLegacy")) {
truMode = null;
break;
}
}
}
// Make sure we're using the right part for a URI context.
switch (uriPart) {
case QUERY:
escapingMode = EscapingMode.ESCAPE_URI;
break;
case START:
if (truMode == null) {
// We need to filter substitutions at the start of a URL since they can switch the
// protocol to a code loading protocol like javascript:. We don't want these filters to
// happen when the URL in question is TrustedResourceUrl as we are sure it is not in
// attacker control.
if (escapingMode != EscapingMode.NORMALIZE_URI) {
// Keep the state's own escaper as a second pass after the URI filter chosen below.
extraEscapingMode = escapingMode;
}
// Use a different escaping mode depending on what kind of URL is being used.
if (uriType == UriType.MEDIA) {
escapingMode = EscapingMode.FILTER_NORMALIZE_MEDIA_URI;
} else {
escapingMode = EscapingMode.FILTER_NORMALIZE_URI;
}
}
break;
case UNKNOWN:
case UNKNOWN_PRE_FRAGMENT:
// We can't choose an appropriate escaping convention if we're in a URI but don't know which
// part. E.g. in
// <a href="
// {if ...}
// ?foo=
// {else}
// /bar/
// {/if}
// {$baz}">
// Is {$baz} part of a query or part of a path?
// TODO(gboyer): In these unknown states, it might be interesting to indicate what the two
// contexts were.
throw SoyAutoescapeException.createWithoutMetaInfo(
"Cannot determine which part of the URL this dynamic value is in. Most likely, a"
+ " preceding conditional block began a ?query or #fragment, "
+ "but only on one branch.");
case MAYBE_VARIABLE_SCHEME:
// Is $y in the scheme, path, query, or fragment below?
// <a href="{$x}{$y}">
throw SoyAutoescapeException.createWithoutMetaInfo(
"Soy can't prove this URI concatenation has a safe scheme at compile time."
+ " Either combine adjacent print statements (e.g. {$x + $y} instead of {$x}{$y}),"
+ " or introduce disambiguating characters"
+ " (e.g. {$x}/{$y}, {$x}?y={$y}, {$x}&y={$y}, {$x}#{$y})");
case MAYBE_SCHEME:
// Could $x cause a bad scheme, e.g. if it's "script:deleteMyAccount()"?
// <a href="java{$x}">
throw SoyAutoescapeException.createWithoutMetaInfo(
"Soy can't prove this URI has a safe scheme at compile time. Either make sure one of"
+ " ':', '/', '?', or '#' comes before the dynamic value (e.g. foo/{$bar}), or"
+ " move the print statement to the start of the URI to enable runtime validation"
+ " (e.g. href=\"{'foo' + $bar}\" instead of href=\"foo{$bar}\").");
case DANGEROUS_SCHEME:
// After javascript: or other dangerous schemes.
throw SoyAutoescapeException.createWithoutMetaInfo(
"Soy can't properly escape for this URI scheme. For image sources, you can print full"
+ " data and blob URIs directly (e.g. src=\"{$someDataUri}\")."
+ " Otherwise, hardcode the full URI in the template or pass a complete"
+ " SanitizedContent or SafeUri object.");
default:
break;
}
// Check the quote embedding mode.
switch (delimType) {
case SPACE_OR_TAG_END:
// Also escape any spaces that could prematurely end the attribute value.
// E.g. when the value of $s is "was checked" in
// <input value={$s}>
// then we want to emit the space escaped (e.g. as a character reference such as &#32;)
// <input value=was&#32;checked>
// instead of
// <input value=was checked>
if (escapingMode == EscapingMode.ESCAPE_HTML_ATTRIBUTE
|| escapingMode == EscapingMode.NORMALIZE_URI) {
escapingMode = EscapingMode.ESCAPE_HTML_ATTRIBUTE_NOSPACE;
} else {
// NOTE(review): this overwrites any extra mode chosen in the UriPart.START case above.
extraEscapingMode = EscapingMode.ESCAPE_HTML_ATTRIBUTE_NOSPACE;
}
break;
case SINGLE_QUOTE:
case DOUBLE_QUOTE:
if (escapingMode == EscapingMode.NORMALIZE_URI) {
// URIs should still be HTML-escaped to escape ampersands, quotes, and other characters.
// Normalizing a URI (which mostly percent-encodes quotes) is unnecessary if it's going
// to be escaped as an HTML attribute, so as a performance optimization, we simply
// replace the escaper.
escapingMode = EscapingMode.ESCAPE_HTML_ATTRIBUTE;
} else if (!escapingMode.isHtmlEmbeddable) {
// Some modes, like JS and CSS value modes, might insert quotes to make
// a quoted string, so make sure to escape those as HTML.
// E.g. when the value of $s is " onmouseover=evil() foo=", in
// <a onclick='alert({$s})'>
// we want to produce (with the inserted quotes HTML-escaped, e.g. as &#39;)
// <a onclick='alert(&#39; onmouseover=evil() foo=&#39;)'>
// instead of
// <a onclick='alert(' onmouseover=evil() foo=')'>
extraEscapingMode = EscapingMode.ESCAPE_HTML_ATTRIBUTE;
}
break;
case NONE:
break;
}
// Return an immutable list of (truMode, escapingMode, extraEscapingMode), omitting the
// optional entries that are null.
ImmutableList.Builder<EscapingMode> escapingListBuilder = new ImmutableList.Builder<>();
if (truMode != null) {
escapingListBuilder.add(truMode);
}
escapingListBuilder.add(escapingMode);
if (extraEscapingMode != null) {
escapingListBuilder.add(extraEscapingMode);
}
return escapingListBuilder.build();
}
/**
 * Policy for how to handle escaping of a translatable message: the context its contents are
 * parsed in, plus the escaping applied to the message as a whole.
 */
static final class MsgEscapingStrategy {

  /**
   * The context in which to parse the message itself. This affects how print nodes are escaped.
   */
  final Context childContext;

  /**
   * The escaping directives for the entire message after all print nodes have been substituted.
   */
  final ImmutableList<EscapingMode> escapingModesForFullMessage;

  /** Pairs a parsing context for the message body with the escaping for the full message. */
  MsgEscapingStrategy(
      Context childContext, ImmutableList<EscapingMode> escapingModesForFullMessage) {
    this.escapingModesForFullMessage = escapingModesForFullMessage;
    this.childContext = childContext;
  }
}
/**
 * Determines the strategy to escape Soy msg tags.
 *
 * <p>Importantly, this determines the context that the message should be considered in, how the
 * print nodes will be escaped, and how the entire message will be escaped. We need different
 * strategies in different contexts because messages in general aren't trusted, but we also need
 * to be able to include markup interspersed in an HTML message; for example, an anchor that Soy
 * factored out of the message.
 *
 * <p>Note that it'd be very nice to be able to simply escape the strings that came out of the
 * translation database, and distribute the escaping entirely over the print nodes. However, the
 * translation machinery, especially in Javascript, doesn't offer a way to escape just the bits
 * that come from the translation database without also re-escaping the substitutions.
 *
 * @return relevant strategy, or absent in case there's no valid strategy and it is an error to
 *     have a message in this context
 */
Optional<MsgEscapingStrategy> getMsgEscapingStrategy() {
  switch (state) {
    case HTML_PCDATA:
      // In normal HTML PCDATA context, it makes sense to escape all of the print nodes, but not
      // escape the entire message. This allows Soy to support putting anchors and other small
      // bits of HTML in messages.
      return Optional.of(new MsgEscapingStrategy(this, ImmutableList.<EscapingMode>of()));

    case HTML_RCDATA:
    case HTML_NORMAL_ATTR_VALUE:
    case HTML_COMMENT:
      // The weirdest case is HTML attributes. Ideally, we'd like to treat these as a text string
      // and escape when done. However, many messages have HTML entities such as &raquo; in them.
      // A good way around this is to escape the print nodes in the message, but normalize
      // (escape except for ampersands) the final message.
      // Also, content inside <title>, <textarea>, and HTML comments have a similar requirement,
      // where any entities in the messages are probably intended to be preserved.
      return Optional.of(
          new MsgEscapingStrategy(this, ImmutableList.of(EscapingMode.NORMALIZE_HTML)));

    case URI:
      // NOTE: Only support the query portion of URIs.
      if (uriPart != UriPart.QUERY) {
        return Optional.<MsgEscapingStrategy>absent();
      }
      // fall through
    case CSS_DQ_STRING:
    case CSS_SQ_STRING:
    case JS_DQ_STRING:
    case JS_SQ_STRING:
    case TEXT:
      {
        // In other contexts like JS and CSS strings, it makes sense to treat the message's
        // placeholders as plain text, but escape the entire result of message evaluation.
        Context placeholderContext = new Context(HtmlContext.TEXT);
        ImmutableList<EscapingMode> wholeMessageModes =
            getEscapingModes(ImmutableList.<PrintDirectiveNode>of());
        return Optional.of(new MsgEscapingStrategy(placeholderContext, wholeMessageModes));
      }

    default:
      // Other contexts, primarily source code contexts, don't have a meaningful way to support
      // natural language text.
      return Optional.<MsgEscapingStrategy>absent();
  }
}
/** True if the given escaping mode could make sense in this context. */
public boolean isCompatibleWith(EscapingMode mode) {
  // TODO: Come up with a compatibility matrix.
  if (mode == EscapingMode.ESCAPE_JS_VALUE) {
    switch (state) {
      case JS_SQ_STRING:
      case JS_DQ_STRING:
      case CSS_SQ_STRING:
      case CSS_DQ_STRING:
        // Don't introduce quotes inside a string.
        return false;
      default:
        return true;
    }
  }

  if (mode == EscapingMode.TEXT) {
    // The TEXT directive may only be used in TEXT mode; in any other context, it would act as
    // autoescape-cancelling.
    return state == HtmlContext.TEXT;
  }

  if (delimType != AttributeEndDelimiter.SPACE_OR_TAG_END) {
    return true;
  }

  // Inside an unquoted attribute value we need ESCAPE_HTML_ATTRIBUTE_NOSPACE instead of any of
  // the space-preserving HTML escapers.
  return mode != EscapingMode.ESCAPE_HTML
      && mode != EscapingMode.ESCAPE_HTML_ATTRIBUTE
      && mode != EscapingMode.ESCAPE_HTML_RCDATA;
}
/**
 * Checks if two contexts are completely identical, i.e. every component matches.
 *
 * <p>Note it's better to compare either states, or use predicates like isValidEndContext.
 */
@Override
public boolean equals(Object o) {
  if (o instanceof Context) {
    Context other = (Context) o;
    boolean sameMarkupPosition =
        state == other.state
            && elType == other.elType
            && attrType == other.attrType
            && delimType == other.delimType;
    boolean sameEmbeddedLanguagePosition =
        slashType == other.slashType && uriPart == other.uriPart && uriType == other.uriType;
    return sameMarkupPosition
        && sameEmbeddedLanguagePosition
        && templateNestDepth == other.templateNestDepth;
  }
  return false;
}
/** Returns {@link #packedBits()}, which is computed from the same fields {@link #equals} compares. */
@Override
public int hashCode() {
return packedBits();
}
/**
 * An integer form that uniquely identifies this context. This form is not guaranteed to be stable
 * across versions, so do not use as a long-lived serialized form.
 *
 * <p>Fields are packed from the most significant end to the least significant end in this order:
 * templateNestDepth, uriType, uriPart, slashType, delimType, attrType, elType, state. Each enum
 * occupies the number of bits given by its {@code N_*_BITS} constant; templateNestDepth takes
 * whatever high bits remain.
 */
public int packedBits() {
  int packed = templateNestDepth;

  packed <<= N_URI_TYPE_BITS;
  packed |= uriType.ordinal();

  packed <<= N_URI_PART_BITS;
  packed |= uriPart.ordinal();

  packed <<= N_JS_SLASH_BITS;
  packed |= slashType.ordinal();

  packed <<= N_DELIM_BITS;
  packed |= delimType.ordinal();

  packed <<= N_ATTR_BITS;
  packed |= attrType.ordinal();

  packed <<= N_ELEMENT_BITS;
  packed |= elType.ordinal();

  packed <<= N_STATE_BITS;
  packed |= state.ordinal();

  return packed;
}
/** The number of bits needed to store a {@link HtmlContext} value. */
private static final int N_STATE_BITS = 5;

/** The number of bits needed to store a {@link ElementType} value. */
private static final int N_ELEMENT_BITS = 4;

/** The number of bits needed to store a {@link AttributeType} value. */
private static final int N_ATTR_BITS = 3;

/** The number of bits needed to store a {@link AttributeEndDelimiter} value. */
private static final int N_DELIM_BITS = 2;

/** The number of bits needed to store a {@link JsFollowingSlash} value. */
private static final int N_JS_SLASH_BITS = 2;

/** The number of bits needed to store a {@link UriPart} value. */
private static final int N_URI_PART_BITS = 4;

/** The number of bits needed to store a {@link UriType} value. */
private static final int N_URI_TYPE_BITS = 2;

// Class-load-time sanity checks on the bit widths above.
static {
  // We'd better have enough bits in an int.
  int totalEnumBits =
      N_STATE_BITS
          + N_ELEMENT_BITS
          + N_ATTR_BITS
          + N_DELIM_BITS
          + N_JS_SLASH_BITS
          + N_URI_PART_BITS
          + N_URI_TYPE_BITS;
  if (totalEnumBits > 32) {
    throw new AssertionError();
  }

  // And each enum's ordinals must fit in the bits allocated.
  boolean everyEnumFits =
      HtmlContext.values().length <= (1 << N_STATE_BITS)
          && ElementType.values().length <= (1 << N_ELEMENT_BITS)
          && AttributeType.values().length <= (1 << N_ATTR_BITS)
          && AttributeEndDelimiter.values().length <= (1 << N_DELIM_BITS)
          && JsFollowingSlash.values().length <= (1 << N_JS_SLASH_BITS)
          && UriPart.values().length <= (1 << N_URI_PART_BITS)
          && UriType.values().length <= (1 << N_URI_TYPE_BITS);
  if (!everyEnumFits) {
    throw new AssertionError();
  }
}
/**
 * Determines the correct URI part if two branches are joined.
 *
 * @param a the URI part at the end of one branch
 * @param b the URI part at the end of the other branch; must differ from {@code a}
 * @throws IllegalArgumentException if both parts are the same
 */
private static UriPart unionUriParts(UriPart a, UriPart b) {
  Preconditions.checkArgument(a != b);

  // Dangerous schemes (like javascript:) are poison -- if either side is dangerous, the whole
  // thing is.
  if (a == UriPart.DANGEROUS_SCHEME || b == UriPart.DANGEROUS_SCHEME) {
    return UriPart.DANGEROUS_SCHEME;
  }

  // UNKNOWN means one part is in the #fragment and one is not. This is the case if one is
  // FRAGMENT and the other is not, or if one of the branches was UNKNOWN to begin with.
  boolean eitherInFragment = a == UriPart.FRAGMENT || b == UriPart.FRAGMENT;
  boolean eitherUnknown = a == UriPart.UNKNOWN || b == UriPart.UNKNOWN;
  if (eitherInFragment || eitherUnknown) {
    return UriPart.UNKNOWN;
  }

  boolean eitherMaybeVariableScheme =
      a == UriPart.MAYBE_VARIABLE_SCHEME || b == UriPart.MAYBE_VARIABLE_SCHEME;
  boolean eitherUnknownPreFragment =
      a == UriPart.UNKNOWN_PRE_FRAGMENT || b == UriPart.UNKNOWN_PRE_FRAGMENT;
  if (eitherMaybeVariableScheme && !eitherUnknownPreFragment) {
    // This is the case you might see on a URL that starts with a print statement, and one
    // branch has a slash or ampersand but the other doesn't. Re-entering
    // MAYBE_VARIABLE_SCHEME allows us to pretend that the last branch was just part of the
    // leading print statement, which leaves us in a relatively-unknown state, but no more
    // unknown than had it just been completely opaque.
    //
    // Good Example 1: {$urlWithQuery}{if $a}&a={$a}{/if}{if $b}&b={$b}{/if}
    // In this example, the first "if" statement has two branches:
    // - "true": {$urlWithQuery}&a={$a} looks like a QUERY due to heuristics
    // - "false": {$urlWithQuery} only, which Soy doesn't know at compile-time to actually
    //   have a query, and it remains in MAYBE_VARIABLE_SCHEME.
    // Instead of yielding UNKNOWN, this yields MAYBE_VARIABLE_SCHEME, which the second
    // {if $b} can safely deal with.
    //
    // Good Example 2: {$base}{if $a}/a{/if}{if $b}/b{/if}
    // In this, one branch transitions definitely into an authority or path, but the other
    // might not. However, we can remain in MAYBE_VARIABLE_SCHEME safely.
    return UriPart.MAYBE_VARIABLE_SCHEME;
  }

  // The part is unknown, but we think it's before the fragment. In this case, it's clearly
  // ambiguous at compile-time, so it's not clear what to do. Examples:
  //
  // /foo/{if $cond}?a={/if}
  // {$base}{if $cond}?a={$a}{else}/b{/if}
  // {if $cond}{$base}{else}/a{if $cond2}?b=1{/if}{/if}
  //
  // Unlike MAYBE_VARIABLE_SCHEME, we don't need to try to gracefully recover here, because
  // the template author can easily disambiguate this.
  return UriPart.UNKNOWN_PRE_FRAGMENT;
}
/**
* A context which is consistent with both contexts. This should be used when multiple execution
* paths join, such as the path through the then-clause of an <code>{if}</code> command and the
* path through the else-clause.
*
* <p>Differences are reconciled one property at a time (slash type, then URI part, then state);
* the reconciled copies of {@code a} and {@code b} are then compared with {@link #equals}.
*
* @param a the context at the end of one execution path.
* @param b the context at the end of the other execution path.
* @return Optional.absent() when there is no such context consistent with both.
*/
static Optional<Context> union(Context a, Context b) {
// NOTE: Avoid the temptation to return early; instead, rely on the equals() check at the end
// to ensure all properties match. Checking equals() at the end ensures that when new
// properties are added, they get checked automatically.
// Try to reconcile each property one-by-one.
if (a.slashType != b.slashType) {
a = a.derive(JsFollowingSlash.UNKNOWN);
b = b.derive(JsFollowingSlash.UNKNOWN);
}
if (a.uriPart != b.uriPart) {
UriPart unionedUriPart = unionUriParts(a.uriPart, b.uriPart);
a = a.derive(unionedUriPart);
b = b.derive(unionedUriPart);
}
if (a.state != b.state) {
// Order by state so that we don't have to duplicate tests below.
if (a.state.compareTo(b.state) > 0) {
Context swap = a;
a = b;
b = swap;
}
// consider <div foo=bar{if $p} onclick=foo(){/if} x=y>
// if both branches need a space or tag end to complete, and their states aren't compatible
// switch to TAG_NAME to require a space
// NOTE(review): the a.state != b.state test below is already implied by the enclosing if,
// since the swap above does not make the two states equal.
if (a.delimType == AttributeEndDelimiter.SPACE_OR_TAG_END
&& b.delimType == AttributeEndDelimiter.SPACE_OR_TAG_END
&& a.state != b.state) {
// we need to switch to a state that requires a space
// TODO(lukes): given this usecase, HTML_TAG_NAME is poorly named, consider
// AFTER_TAG_OR_UNQUOTED_ATTR? maybe just HTML_TAG_NEEDS_SPACE
a = a.toBuilder().withState(HtmlContext.HTML_TAG_NAME).withoutAttrContext().build();
// The next block will clean up b.
}
// consider <input{if $foo} disabled{/if}> or <input{if $foo} disabled=true{/if}>
// if we start in a tag name and end in an attribute name or value, assume we are still in a
// tag name.
if (a.state == HtmlContext.HTML_TAG_NAME) {
if (b.state == HtmlContext.HTML_ATTRIBUTE_NAME
|| b.delimType == AttributeEndDelimiter.SPACE_OR_TAG_END) {
// clear attributes from a also, this is counterintuitive because tag names shouldn't have
// an attribute context at all, but prior reconciliation of slashType may have added one,
// so clear it.
a = a.toBuilder().withoutAttrContext().build();
b = b.toBuilder().withState(HtmlContext.HTML_TAG_NAME).withoutAttrContext().build();
}
}
// If we start in a tag name and end between attributes, then treat us as between attributes.
// This handles <b{if $bool} attrName="value"{/if}>.
if (a.state == HtmlContext.HTML_TAG_NAME && b.state == HtmlContext.HTML_TAG) {
// Note we only change the state; if the element type is different, we don't want it to
// join.
// TODO(gboyer): The withoutAttrContext() doesn't make any sense, since HTML_TAG_NAME can't
// have an attribute context.
a = a.toBuilder().withState(HtmlContext.HTML_TAG).withoutAttrContext().build();
}
if (a.state == HtmlContext.HTML_TAG) {
// If one branch is waiting for an attribute name, and the other is waiting for an equal
// sign before an attribute value OR the end of an unquoted attribute value, then commit to
// the view that the attribute name was a valueless attribute and transition to a state
// waiting for another attribute name or the end of a tag.
// Examples:
// - state == HTML_ATTRIBUTE_NAME: <input {if $x}disabled{/if}
// - delimType == SPACE_OR_TAG_END: <input {if $x}type=text{/if}
if (b.state == HtmlContext.HTML_ATTRIBUTE_NAME
|| b.delimType == AttributeEndDelimiter.SPACE_OR_TAG_END) {
// TODO(gboyer): do we need to require a space before any new attribute name after an
// unquoted attribute?
b = b.toBuilder().withState(HtmlContext.HTML_TAG).withoutAttrContext().build();
}
}
}
// Any property that could not be reconciled above makes the two contexts unequal here.
return a.equals(b) ? Optional.of(a) : Optional.<Context>absent();
}
/**
 * Folds the pairwise union over all of the given contexts, left to right, stopping as soon as
 * an intermediate union is absent.
 *
 * @param contexts the contexts to join; must contain at least one element, since the first
 *     element is fetched unconditionally.
 * @return the context consistent with every input, or absent if some pair cannot be reconciled.
 */
static Optional<Context> union(Iterable<Context> contexts) {
  Iterator<Context> remaining = contexts.iterator();
  Optional<Context> merged = Optional.of(remaining.next());
  while (remaining.hasNext()) {
    if (!merged.isPresent()) {
      break;
    }
    merged = union(merged.get(), remaining.next());
  }
  return merged;
}
/**
 * Renders this context as {@code (Context STATE ...)}, listing after the state only those
 * components that are not at their nullish value (and the template nesting depth only when it is
 * non-zero).
 */
@Override
public String toString() {
  StringBuilder out = new StringBuilder("(Context ");
  out.append(state.name());

  if (elType != ElementType.NONE) {
    out.append(' ');
    out.append(elType.name());
  }
  if (attrType != AttributeType.NONE) {
    out.append(' ');
    out.append(attrType.name());
  }
  if (delimType != AttributeEndDelimiter.NONE) {
    out.append(' ');
    out.append(delimType.name());
  }
  if (slashType != JsFollowingSlash.NONE) {
    out.append(' ');
    out.append(slashType.name());
  }
  if (uriPart != UriPart.NONE) {
    out.append(' ');
    out.append(uriPart.name());
  }
  if (uriType != UriType.NONE) {
    out.append(' ');
    out.append(uriType.name());
  }
  if (templateNestDepth != 0) {
    out.append(" templateNestDepth=");
    out.append(templateNestDepth);
  }

  out.append(')');
  return out.toString();
}
/**
 * Parses a condensed string version of a context, for use in tests.
 *
 * <p>The text is a space-separated list of tokens. The first token must name an {@link
 * HtmlContext}. The remaining tokens are optional and are tried, in this order, as an element
 * type, attribute type, attribute end delimiter, JS slash type, URI part, URI type, and finally
 * {@code templateNestDepth=N}. A token that does not match the component being tried is left for
 * the next component.
 *
 * @throws IllegalArgumentException if the first token is not a state name, or if any token is
 *     left over after every component has been tried.
 */
@VisibleForTesting
static Context parse(String text) {
  Queue<String> tokens = Lists.newLinkedList(Arrays.asList(text.split(" ")));
  Context.Builder parsed = HTML_PCDATA.toBuilder();

  // The state is mandatory and always comes first.
  parsed.withState(HtmlContext.valueOf(tokens.remove()));

  if (!tokens.isEmpty()) {
    try {
      ElementType elementType = ElementType.valueOf(tokens.element());
      parsed.withElType(elementType);
      tokens.remove();
    } catch (IllegalArgumentException notAnElementType) {
      // OK
    }
  }

  if (!tokens.isEmpty()) {
    try {
      AttributeType attributeType = AttributeType.valueOf(tokens.element());
      parsed.withAttrType(attributeType);
      tokens.remove();
    } catch (IllegalArgumentException notAnAttributeType) {
      // OK
    }
  }

  if (!tokens.isEmpty()) {
    try {
      AttributeEndDelimiter delimiter = AttributeEndDelimiter.valueOf(tokens.element());
      parsed.withDelimType(delimiter);
      tokens.remove();
    } catch (IllegalArgumentException notADelimiter) {
      // OK
    }
  }

  if (!tokens.isEmpty()) {
    try {
      JsFollowingSlash followingSlash = JsFollowingSlash.valueOf(tokens.element());
      parsed.withSlashType(followingSlash);
      tokens.remove();
    } catch (IllegalArgumentException notASlashType) {
      // OK
    }
  }

  if (!tokens.isEmpty()) {
    try {
      UriPart part = UriPart.valueOf(tokens.element());
      parsed.withUriPart(part);
      tokens.remove();
    } catch (IllegalArgumentException notAUriPart) {
      // OK
    }
  }

  if (!tokens.isEmpty()) {
    try {
      UriType type = UriType.valueOf(tokens.element());
      parsed.withUriType(type);
      tokens.remove();
    } catch (IllegalArgumentException notAUriType) {
      // OK
    }
  }

  if (!tokens.isEmpty()) {
    String depthPrefix = "templateNestDepth=";
    String candidate = tokens.element();
    if (candidate.startsWith(depthPrefix)) {
      try {
        int depth = Integer.parseInt(candidate.substring(depthPrefix.length()));
        parsed.withTemplateNestDepth(depth);
        tokens.remove();
      } catch (NumberFormatException notANumber) {
        // OK
      }
    }
  }

  // Anything still queued matched none of the components.
  if (!tokens.isEmpty()) {
    throw new IllegalArgumentException(
        "Unable to parse context \"" + text + "\". Unparsed portion: " + tokens);
  }

  return parsed.build();
}
/**
 * Computes the {@link Context} in which a block of the given {@link ContentKind} starts.
 *
 * <p>Contextually autoescaping a block of Soy code from the returned context is intended to yield
 * a value that honours the contract of {@link com.google.template.soy.data.SanitizedContent} for
 * that kind. The result is derived from {@code HTML_PCDATA} by applying {@link
 * Builder#withStartKind}.
 *
 * @param contentKind the kind of content the block is declared to produce
 * @return the start context for {@code contentKind}
 */
public static Context getStartContextForContentKind(ContentKind contentKind) {
  Context.Builder startBuilder = HTML_PCDATA.toBuilder();
  startBuilder.withStartKind(contentKind);
  return startBuilder.build();
}
/**
 * Tells whether this context may appear at the start of a block of the given content kind.
 *
 * <p>A context with a non-zero template nest depth is never a valid start context.
 *
 * @param contentKind the kind of the block being entered
 * @return {@code true} if the block may start in this context
 */
public boolean isValidStartContextForContentKind(ContentKind contentKind) {
  if (templateNestDepth != 0) {
    return false;
  }
  if (contentKind == ContentKind.ATTRIBUTES) {
    // Attribute blocks may begin right after an open tag, or at an attribute name regardless
    // of what kind of attribute (e.g. plain text) that name introduces.
    return state == HtmlContext.HTML_TAG || state == HtmlContext.HTML_ATTRIBUTE_NAME;
  }
  // Every other kind must match its canonical start context exactly.
  // NOTE: For URIs in particular we need to be picky that the context has no attribute type,
  // since we don't want to forget to escape ampersands.
  Context canonicalStart = getStartContextForContentKind(contentKind);
  return this.equals(canonicalStart);
}
/**
 * Relaxed variant of {@link #isValidStartContextForContentKind} used for calls from contextual
 * templates into strict ones.
 *
 * <p>The relaxation exists to allow templates to be migrated from contextual to strict piece by
 * piece.
 *
 * @param contentKind the kind of the strict callee
 * @return {@code true} if a call in this context is acceptable
 */
public boolean isValidStartContextForContentKindLoose(ContentKind contentKind) {
  final boolean allowed;
  switch (contentKind) {
    case URI:
      // Let contextual templates call URI templates directly from any URI state, even though
      // correct output would technically require HTML-escaping as well. Supported browsers
      // recover gracefully from under-escaped ampersands as long as no semicolon is nearby.
      // This special case is limited ONLY to transitional calls where the caller is contextual
      // and the callee is strict.
      allowed = (HtmlContext.URI == state);
      break;
    default:
      allowed = isValidStartContextForContentKind(contentKind);
      break;
  }
  return allowed;
}
private static final ImmutableMap<HtmlContext, ContentKind> STATE_TO_CONTENT_KIND;
static {
Map<HtmlContext, ContentKind> stateToContextKind = new EnumMap<>(HtmlContext.class);
stateToContextKind.put(HtmlContext.CSS, ContentKind.CSS);
stateToContextKind.put(HtmlContext.HTML_PCDATA, ContentKind.HTML);
stateToContextKind.put(HtmlContext.HTML_TAG, ContentKind.ATTRIBUTES);
stateToContextKind.put(HtmlContext.JS, ContentKind.JS);
stateToContextKind.put(HtmlContext.URI, ContentKind.URI);
stateToContextKind.put(HtmlContext.TEXT, ContentKind.TEXT);
STATE_TO_CONTENT_KIND = ImmutableMap.copyOf(stateToContextKind);
}
/**
 * Picks the content kind that best matches this context.
 *
 * <p>This is primarily for error messages, telling the user which content kind would mostly null
 * out the escaping. The candidate is looked up from the current state and is only returned when
 * this context passes the loose start-context check for it.
 *
 * @return the matching content kind, or {@link ContentKind#TEXT} when no useful match exists
 */
public ContentKind getMostAppropriateContentKind() {
  ContentKind candidate = STATE_TO_CONTENT_KIND.get(state);
  if (candidate == null || !isValidStartContextForContentKindLoose(candidate)) {
    return ContentKind.TEXT;
  }
  return candidate;
}
/**
 * Tells whether this context may appear at the end of a block of the given content kind.
 *
 * <p>A context with a non-zero template nest depth is never a valid end context.
 *
 * @param contentKind the kind of the block being closed
 * @return {@code true} if the block may end in this context
 * @throws IllegalArgumentException if {@code contentKind} has no associated end context
 */
public final boolean isValidEndContextForContentKind(ContentKind contentKind) {
  if (templateNestDepth != 0) {
    return false;
  }
  boolean noElementType = (elType == ElementType.NONE);
  switch (contentKind) {
    case TEXT:
      return HtmlContext.TEXT == state;
    case HTML:
      return noElementType && state == HtmlContext.HTML_PCDATA;
    case CSS:
      return noElementType && state == HtmlContext.CSS;
    case JS:
      // Only the state matters here; whether a following slash would start a regex or a
      // division is deliberately ignored.
      return noElementType && state == HtmlContext.JS;
    case URI:
      {
        // The URI must be non-empty, and its type must still be normal (the type assumed for
        // the URI content kind).
        boolean nonEmptyUri = (uriPart != UriPart.START);
        return nonEmptyUri && state == HtmlContext.URI && uriType == UriType.NORMAL;
      }
    case ATTRIBUTES:
      // Accept the HTML tag state or the attribute name state. HTML_TAG is needed for
      // constructs like "checked" that don't require an attribute value. Ending inside
      // HTML_NORMAL_ATTR_VALUE (e.g. foo={$x} without quotes) is explicitly not accepted, to
      // help catch attributes that aren't safely composable: foo={$x}checked would end up as
      // one long attribute value, whereas foo="{$x}"checked would be parsed as intended.
      return state == HtmlContext.HTML_TAG || state == HtmlContext.HTML_ATTRIBUTE_NAME;
    default:
      throw new IllegalArgumentException("Specified content kind has no associated end context.");
  }
}
/**
 * Produces a plausible human-readable explanation of why a block did not end in a valid context.
 *
 * <p>This context must be an invalid end context for {@code contentKind}; the explanation is
 * chosen from the current state and, failing that, from the template nest depth.
 *
 * @param contentKind the kind of the block whose end context is this context
 * @return a short phrase describing the likely cause of the mismatch
 * @throws IllegalArgumentException if this context is actually a valid end context for {@code
 *     contentKind}
 */
public final String getLikelyEndContextMismatchCause(ContentKind contentKind) {
  Preconditions.checkArgument(!isValidEndContextForContentKind(contentKind));
  if (ContentKind.ATTRIBUTES == contentKind) {
    // ATTRIBUTES has its own end-context rules, so it gets its own message.
    return "an unterminated attribute value, or ending with an unquoted attribute";
  }
  final String cause;
  switch (state) {
    case HTML_TAG_NAME:
    case HTML_TAG:
    case HTML_ATTRIBUTE_NAME:
    case HTML_NORMAL_ATTR_VALUE:
      cause = "an unterminated HTML tag or attribute";
      break;
    case CSS:
      cause = "an unclosed style block or attribute";
      break;
    case JS:
    case JS_LINE_COMMENT: // A line comment is terminated by end of input.
      cause = "an unclosed script block or attribute";
      break;
    case CSS_COMMENT:
    case HTML_COMMENT:
    case JS_BLOCK_COMMENT:
      cause = "an unterminated comment";
      break;
    case CSS_DQ_STRING:
    case CSS_SQ_STRING:
    case JS_DQ_STRING:
    case JS_SQ_STRING:
      cause = "an unterminated string literal";
      break;
    case URI:
    case CSS_URI:
    case CSS_DQ_URI:
    case CSS_SQ_URI:
      cause = "an unterminated or empty URI";
      break;
    case JS_REGEX:
      cause = "an unterminated regular expression";
      break;
    default:
      // No state-specific explanation; an open template element is the remaining suspect.
      cause =
          (templateNestDepth != 0)
              ? "an unterminated <template> element"
              : "unknown to compiler";
      break;
  }
  return cause;
}
/**
* A type of HTML element.
*
* <p>{@code Context.parse} resolves tokens with {@code ElementType.valueOf}, so the constant names
* are also the textual form accepted in test context strings.
*
* <p>NOTE(review): the class comment describes {@code Context} as bit-packable; if ordinals are
* packed anywhere, inserting or reordering constants would change that encoding. Confirm before
* changing the declaration order.
*/
public enum ElementType {
/** No element. */
NONE,
/** A script element whose content is raw JavaScript. */
SCRIPT,
/** A style element whose content is raw CSS. */
STYLE,
/** A textarea element whose content is encoded HTML but which cannot contain elements. */
TEXTAREA,
/** A title element whose content is encoded HTML but which cannot contain elements. */
TITLE,
/** An XMP element whose content is raw CDATA. */
XMP,
/**
* An image element, so that we can process the src attribute specially.
*
* <p>NOTE(review): the constant is named MEDIA but documented as an image element; confirm
* whether other media elements are mapped here as well.
*/
MEDIA,
/** An element whose content is normal mixed PCDATA and child elements. */
NORMAL,
;
}
/**
* Describes the content of an HTML attribute.
*
* <p>{@code Context.parse} resolves tokens with {@code AttributeType.valueOf}, and {@code
* Context.toString} prints the constant name, so the names are part of the textual context
* format used in tests. Several names ({@code NONE}, {@code SCRIPT}, {@code STYLE}) also exist
* in {@link ElementType}, which {@code parse} tries first.
*/
public enum AttributeType {
/** No attribute. */
NONE,
/** Mime-type text/javascript. */
SCRIPT,
/** Mime-type text/css. */
STYLE,
/** A URI or URI reference. */
URI,
/** Other content. Human readable or other non-structured plain text or keyword values. */
PLAIN_TEXT,
;
}
/** Describes the content that will end the current HTML attribute. */
public enum AttributeEndDelimiter {
  /** Not in an attribute; carries no delimiter text. */
  NONE,
  /** The attribute value is closed by {@code "}. */
  DOUBLE_QUOTE("\""),
  /** The attribute value is closed by {@code '}. */
  SINGLE_QUOTE("'"),
  /** The attribute value is closed by a space or a {@code >} symbol. */
  SPACE_OR_TAG_END(""),
  ;

  /**
   * The trailing text that belongs to the attribute but not to its value. In {@code href="foo"}
   * the closing double quote is such text, whereas a space-delimited attribute like {@code
   * width=32} has no non-empty suffix of this kind. {@code null} for {@link #NONE}.
   */
  public final @Nullable String text;

  /** Creates a delimiter without any suffix text. */
  AttributeEndDelimiter() {
    this(null);
  }

  /** Creates a delimiter whose attribute suffix is {@code text}. */
  AttributeEndDelimiter(String text) {
    this.text = text;
  }
}
/**
* Describes what a slash ({@code /}) means when parsing JavaScript source code. A slash that is
* not followed by another slash or an asterisk ({@code *}) can either start a regular expression
* literal or be a division operator. This determination is made based on the full grammar, but
* Waldemar defined a very close to accurate grammar for a JavaScript 1.9 draft based purely on a
* regular lexical grammar which is what we use in the autoescaper.
*
* <p>{@code Context.parse} resolves tokens with {@code JsFollowingSlash.valueOf}, and {@code
* Context.toString} prints the constant name for every value other than {@link #NONE}.
*
* @see JsUtil#isRegexPreceder
*/
public enum JsFollowingSlash {
/** Not in JavaScript. */
NONE,
/** A slash as the next token would start a regular expression literal. */
REGEX,
/** A slash as the next token would start a division operator. */
DIV_OP,
/**
* We do not know what a slash as the next token would start, so it is an error for the next
* token to be a slash.
*/
UNKNOWN,
;
}
/**
* Describes the part of a URI reference that the context point is in.
*
* <p>We need to distinguish these so that we can
*
* <ul>
* <li>normalize well-formed URIs that appear before the query,
* <li>encode raw values interpolated as query parameters or keys,
* <li>filter out values that specify a scheme like {@code javascript:}.
* </ul>
*
* <p>{@code Context.parse} resolves tokens with {@code UriPart.valueOf}, and {@code
* Context.toString} prints the constant name for every value other than {@link #NONE}.
*/
public enum UriPart {
/** Not in a URI. */
NONE,
/**
* At the absolute beginning of a URI.
*
* <p>At ^ in {@code ^http://host/path?k=v#frag} or {@code ^foo/bar?a=1}.
*/
START,
/**
* After a print statement in the beginning of a URI, where it's still possible to be in the
* scheme.
*
* <p>For example, after {@code href="{$x}}, it's hard to know what will happen: if $x is "java"
* (a perfectly valid relative URI on its own) and is followed by "script:alert(1)", the result
* would execute as JavaScript, but if $x is "java" followed by "/test.html", it's a relative
* URI.
*
* <p>This state is kept until we see anything that's hard-coded that makes it clear that we've
* left the scheme context; while remaining in this state, print statements and colons are
* forbidden, since we don't want what looks like a relative URI to set the scheme.
*/
MAYBE_VARIABLE_SCHEME,
/**
* Still possibly in the scheme, though it could also be a relative path, but no print
* statements have been seen yet.
*
* <p>For example, between carets in {@code h^ttp^://host/path} or {@code f^oo^/bar.html}.
*
* <p>This is similar to MAYBE_VARIABLE_SCHEME in that print statements are forbidden; however,
* colons are allowed and transition to AUTHORITY_OR_PATH.
*/
MAYBE_SCHEME,
/** In the scheme, authority, or path. Between ^s in {@code h^ttp://host/path^?k=v#frag}. */
AUTHORITY_OR_PATH,
/** In the query portion. Between ^s in {@code http://host/path?^k=v^#frag}. */
QUERY,
/** In the fragment. After ^ in {@code http://host/path?k=v#^frag}. */
FRAGMENT,
/** Not {@link #NONE} or {@link #FRAGMENT}, but unknown. Used to join different contexts. */
UNKNOWN_PRE_FRAGMENT,
/** Not {@link #NONE}, but unknown. Used to join different contexts. */
UNKNOWN,
/** A known-dangerous scheme where dynamic content is forbidden. */
DANGEROUS_SCHEME;
}
/**
* Describes the type or context of a URI that is currently being or about to be parsed.
*
* <p>This distinguishes between the types of URI safety concerns, which vary between images,
* scripts, and other types.
*
* <p>{@code Context.parse} resolves tokens with {@code UriType.valueOf}, and {@code
* Context.toString} prints the constant name for every value other than {@link #NONE}.
*/
public enum UriType {
/**
* Not in or about to be in a URI.
*
* <p>Note the URI type can be set even if we haven't entered the URI itself yet.
*/
NONE,
/**
* General URI context suitable for most URI types.
*
* <p>The biggest use-case here is for anchors, where we want to prevent JavaScript URLs that
* can cause XSS. However, this grabs other types of URIs such as stylesheets, prefetch, SEO
* metadata, and attributes that look like they're supposed to contain URIs but might just be
* harmless metadata because they end with "url".
*
* <p>It's expected that this will be split up over time to address the different safety levels
* of the different URI types.
*/
NORMAL,
/**
* Image URL type.
*
* <p>Here, we can relax some rules. For example, a data URI in an image is unlikely to do
* anything that loading an image from a 3rd party http/https site could not.
*
* <p>At present, note that Soy doesn't do anything to prevent referrer leakage. At some
* future point, we may want to provide configuration options to avoid 3rd party or
* http-in-the-clear image loading.
*
* <p>In the future, this might also encompass video and audio, if we can find ways to reduce
* the risk of social engineering.
*/
MEDIA,
/**
* A URI which loads resources. This is intended to be used in scripts, stylesheets, etc. which
* should not be in attacker control.
*/
TRUSTED_RESOURCE
}
/**
 * A mutable builder for {@link Context}s.
 *
 * <p>A builder starts out as a copy of an existing context; each {@code with...} method
 * overwrites one component and returns the builder itself so that calls can be chained.
 */
static final class Builder {
  private HtmlContext state;
  private ElementType elType;
  private AttributeType attrType;
  private AttributeEndDelimiter delimType;
  private JsFollowingSlash slashType;
  private UriPart uriPart;
  private UriType uriType;
  private int templateNestDepth;

  /** Seeds every component of the builder from {@code context}. */
  private Builder(Context context) {
    this.templateNestDepth = context.templateNestDepth;
    this.uriType = context.uriType;
    this.uriPart = context.uriPart;
    this.slashType = context.slashType;
    this.delimType = context.delimType;
    this.attrType = context.attrType;
    this.elType = context.elType;
    this.state = context.state;
  }

  /** Sets the state; {@code state} must not be null. */
  Builder withState(HtmlContext state) {
    Preconditions.checkNotNull(state);
    this.state = state;
    return this;
  }

  /** Sets the element type; {@code elType} must not be null. */
  Builder withElType(ElementType elType) {
    Preconditions.checkNotNull(elType);
    this.elType = elType;
    return this;
  }

  /** Sets the attribute type; {@code attrType} must not be null. */
  Builder withAttrType(AttributeType attrType) {
    Preconditions.checkNotNull(attrType);
    this.attrType = attrType;
    return this;
  }

  /** Sets the attribute end delimiter; {@code delimType} must not be null. */
  Builder withDelimType(AttributeEndDelimiter delimType) {
    Preconditions.checkNotNull(delimType);
    this.delimType = delimType;
    return this;
  }

  /** Sets the meaning of a following slash; {@code slashType} must not be null. */
  Builder withSlashType(JsFollowingSlash slashType) {
    Preconditions.checkNotNull(slashType);
    this.slashType = slashType;
    return this;
  }

  /** Sets the URI part; {@code uriPart} must not be null. */
  Builder withUriPart(UriPart uriPart) {
    Preconditions.checkNotNull(uriPart);
    this.uriPart = uriPart;
    return this;
  }

  /** Sets the URI type; {@code uriType} must not be null. */
  Builder withUriType(UriType uriType) {
    Preconditions.checkNotNull(uriType);
    this.uriType = uriType;
    return this;
  }

  /** Sets the template nest depth, which must not be negative. */
  Builder withTemplateNestDepth(int templateNestDepth) {
    Preconditions.checkArgument(templateNestDepth >= 0);
    this.templateNestDepth = templateNestDepth;
    return this;
  }

  /**
   * Resets attribute type, end delimiter, slash type, URI part and URI type to their {@code
   * NONE} values.
   */
  Builder withoutAttrContext() {
    withAttrType(Context.AttributeType.NONE);
    withDelimType(Context.AttributeEndDelimiter.NONE);
    withSlashType(Context.JsFollowingSlash.NONE);
    withUriPart(Context.UriPart.NONE);
    withUriType(Context.UriType.NONE);
    return this;
  }

  /**
   * Resets the builder to the context in which a block of the given {@link ContentKind} starts,
   * i.e. one from which contextual autoescaping of the block's Soy code is intended to yield a
   * value that adheres to the contract of {@link com.google.template.soy.data.SanitizedContent}
   * of this kind.
   *
   * <p>The attribute context is always cleared. The element type is cleared as well, except for
   * {@code ATTRIBUTES}, which keeps the current one. A kind without a case below leaves the
   * state as it was.
   */
  Builder withStartKind(ContentKind contentKind) {
    boolean resetElType = true;
    withoutAttrContext();
    switch (contentKind) {
      case HTML:
        withState(HtmlContext.HTML_PCDATA);
        break;
      case ATTRIBUTES:
        withState(HtmlContext.HTML_TAG);
        // An attributes block sits inside a tag, so the element type is preserved.
        resetElType = false;
        break;
      case CSS:
        withState(HtmlContext.CSS);
        break;
      case JS:
        withState(HtmlContext.JS);
        withSlashType(JsFollowingSlash.REGEX);
        break;
      case URI:
        withState(HtmlContext.URI);
        withUriPart(UriPart.START);
        // Assume a let block of kind="uri" is a "normal" URI.
        withUriType(UriType.NORMAL);
        break;
      case TEXT:
        withState(HtmlContext.TEXT);
        break;
      default:
        break;
    }
    if (resetElType) {
      withElType(ElementType.NONE);
    }
    return this;
  }

  /** Creates an immutable {@link Context} from the current component values. */
  Context build() {
    return new Context(
        state, elType, attrType, delimType, slashType, uriPart, uriType, templateNestDepth);
  }
}
}