/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.template.soy.parsepasses.contextautoesc;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.template.soy.data.SanitizedContent.ContentKind;
import com.google.template.soy.internal.base.UnescapeUtils;
import com.google.template.soy.parsepasses.contextautoesc.Context.UriPart;
import com.google.template.soy.parsepasses.contextautoesc.Context.UriType;
import com.google.template.soy.soytree.HtmlContext;
import com.google.template.soy.soytree.RawTextNode;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Propagates {@link Context}s across raw text chunks using a state-machine parser for HTML/CSS/JS.
*
* <p>Given some raw HTML text {@code "<b>Hello, World!</b>"} and the {@link Context#HTML_PCDATA
* HTML_PCDATA} starting context, this class will decompose the rawText into a number of tokens and
* compute follow on contexts for each.
*
* <table>
* <tr><td>{@code <}</td><td>{@link HtmlContext#HTML_TAG_NAME}</td></tr>
* <tr><td>{@code b}</td><td>{@link HtmlContext#HTML_TAG}</td></tr>
* <tr><td>{@code >}</td><td>{@link HtmlContext#HTML_PCDATA}</td></tr>
* <tr><td>{@code Hello, World!}</td><td>{@link HtmlContext#HTML_PCDATA}</td></tr>
* <tr><td>{@code </}</td><td>{@link HtmlContext#HTML_TAG_NAME}</td></tr>
* <tr><td>{@code b}</td><td>{@link HtmlContext#HTML_TAG}</td></tr>
* <tr><td>{@code >}</td><td>{@link HtmlContext#HTML_PCDATA}</td></tr>
* </table>
*
*/
final class RawTextContextUpdater {
/**
 * Splits a raw text node into slices at each context transition and computes the context that is
 * in effect once the whole node has been consumed.
 *
 * <p>Text outside an attribute value is tokenized as written. Text inside an attribute value is
 * HTML-unescaped up to the closing delimiter before it is tokenized, so that embedded content is
 * parsed in its decoded form.
 *
 * @param rawTextNode A chunk of HTML/CSS/JS.
 * @param context The context before rawText.
 * @return The input text node with context transitions marked.
 * @throws SoyAutoescapeException if no transition applies to some portion of the text, or if a
 *     transition reports an error.
 */
public static SlicedRawTextNode processRawText(RawTextNode rawTextNode, Context context)
    throws SoyAutoescapeException {
  SlicedRawTextNode result = new SlicedRawTextNode(rawTextNode, context);
  String rawText = rawTextNode.getRawText();
  int length = rawText.length();
  int offset = 0;
  while (offset < length) {
    String remaining = rawText.substring(offset);
    Context sliceContext = context;
    int sliceEnd;
    Context contextAfterSlice;

    // Index, relative to offset, at which the current attribute value stops: just before the
    // closing delimiter (a quote, or whitespace / ">" for unquoted values), or the end of the
    // remaining text when no delimiter is present. It is -1 when we are not inside an attribute
    // value at all, in which case nothing needs decoding.
    int attrValueEnd = findEndOfAttributeValue(remaining, context.delimType);
    if (attrValueEnd != -1) {
      // Inside an attribute value.
      //
      // HTML, CSS and JS all use quotes as delimiters, so when one language is embedded in
      // another the delimiters have to be decoded before the embedded language can be parsed.
      // For example, in
      //     <a onclick="alert(&quot;Hello {$world}&quot;)">
      // the decoded value of the event handler is
      //     alert("Hello {$world}")
      // and that decoded form is what processNextToken must see in order to pick the right
      // escaping convention. Taking the cross-product of the languages instead would avoid the
      // decoding, but at the cost of an explosion in states or in required lookahead.
      int remainingLen = remaining.length();
      // End of the attribute relative to offset, including the delimiter when one has to be
      // consumed; -1 when the value is not terminated within this text.
      int attrEnd =
          attrValueEnd < remainingLen ? attrValueEnd + context.delimType.text.length() : -1;

      // Worked example, for <a onclick='alert(&quot;{$msg}!&quot;)'> :
      // once "<a" and " onclick='" have put us into a single-quoted JS attribute,
      //   (1) the first &quot; is decoded to a double quote, which takes JS to JS_DQ_STRING;
      //   (2) the caller then sees {$msg} and knows it sits inside a JS string;
      //   (3) the next call arrives here with the remaining text
      //           !&quot;)'>
      //       where attrValueEnd is the index of the single quote and attrEnd is the index of
      //       the ">" that follows it.
      String decodedValue = UnescapeUtils.unescapeHtml(remaining.substring(0, attrValueEnd));
      // decodedValue is !") in the example above.

      // Tokenize the decoded value until nothing is left.
      RawTextContextUpdater tokenizer = new RawTextContextUpdater();
      Context valueContext = sliceContext;
      while (!decodedValue.isEmpty()) {
        tokenizer.processNextToken(decodedValue, valueContext);
        decodedValue = decodedValue.substring(tokenizer.numCharsConsumed);
        valueContext = tokenizer.next;
      }

      // TODO: Maybe check that context is legal to leave an attribute in. Throw if the attribute
      // ends inside a quoted string.
      if (attrEnd != -1) {
        // The attribute is closed; rawText.charAt(sliceEnd) is the ">" in the example above, and
        // we are back in the body of the tag.
        sliceEnd = offset + attrEnd;
        contextAfterSlice =
            context.toBuilder().withState(HtmlContext.HTML_TAG).withoutAttrContext().build();
      } else {
        // The whole tail belongs to an attribute value that is not terminated here.
        if (attrValueEnd != remainingLen) {
          throw new IllegalStateException();
        }
        sliceEnd = length;
        contextAfterSlice = valueContext;
      }
    } else {
      // Outside an attribute value: consume a single token from the text as written.
      RawTextContextUpdater tokenizer = new RawTextContextUpdater();
      tokenizer.processNextToken(remaining, context);
      sliceEnd = offset + tokenizer.numCharsConsumed;
      contextAfterSlice = tokenizer.next;
    }

    result.addSlice(offset, sliceEnd, sliceContext);
    context = contextAfterSlice;
    offset = sliceEnd;
  }
  result.setEndContext(context);
  return result;
}
/**
 * Locates the end of the attribute value that {@code rawText} starts in.
 *
 * @param rawText The text that follows the current position.
 * @param delim The delimiter that closes the current attribute value, or {@code NONE} when the
 *     current position is not inside an attribute value.
 * @return -1 if {@code delim} indicates we are not in an attribute; otherwise the index of the
 *     first closing delimiter (the quote, or the first whitespace or {@code >} for unquoted
 *     values), or {@code rawText.length()} if the value does not end within {@code rawText}.
 */
private static int findEndOfAttributeValue(String rawText, Context.AttributeEndDelimiter delim) {
  final int notFound = rawText.length();
  switch (delim) {
    case NONE:
      return -1;
    case SINGLE_QUOTE:
    case DOUBLE_QUOTE:
      {
        int quoteIndex = rawText.indexOf(delim.text.charAt(0));
        if (quoteIndex < 0) {
          return notFound;
        }
        return quoteIndex;
      }
    case SPACE_OR_TAG_END:
      {
        int i = 0;
        while (i < notFound) {
          char c = rawText.charAt(i);
          if (Character.isWhitespace(c) || c == '>') {
            return i;
          }
          i++;
        }
        return notFound;
      }
  }
  throw new AssertionError("Unrecognized delimiter " + delim);
}
/**
* The number of characters, counted from the start of the text passed to the most recent
* {@link #processNextToken} call, that the winning transition consumed (the end offset of its
* match).
*/
private int numCharsConsumed;
/** The context in effect after the token consumed by the most recent processNextToken call. */
private Context next;
/**
* Private: instances only serve as holders for the two outputs of {@link #processNextToken} and
* are created inside {@link #processRawText}.
*/
private RawTextContextUpdater() {
// NOP
}
/**
 * Consumes one token from the front of {@code text} and computes the context that follows it.
 * The results are stored in {@link #numCharsConsumed} and {@link #next}.
 *
 * <p>Among the transitions registered for {@code context.state}, the applicable one whose pattern
 * matches earliest in {@code text} wins; when several start at the same position, the one listed
 * first wins.
 *
 * @param text Non empty.
 * @param context The context in effect at the start of {@code text}.
 * @throws SoyAutoescapeException if no applicable transition matches, or the winning transition
 *     reports an error.
 * @throws IllegalStateException if the winning transition consumes nothing and leaves the state
 *     unchanged, which would never make progress.
 */
private void processNextToken(String text, Context context) throws SoyAutoescapeException {
  Transition winner = null;
  Matcher winningMatcher = null;
  int winningStart = Integer.MAX_VALUE;
  int winningEnd = -1;
  for (Transition candidate : TRANSITIONS.get(context.state)) {
    Matcher candidateMatcher = candidate.pattern.matcher(text);
    try {
      if (!candidateMatcher.find()) {
        continue;
      }
      int matchStart = candidateMatcher.start();
      if (matchStart >= winningStart) {
        // An earlier (or equally early but higher-priority) match already won.
        continue;
      }
      int matchEnd = candidateMatcher.end();
      if (!candidate.isApplicableTo(context, candidateMatcher)) {
        continue;
      }
      winner = candidate;
      winningMatcher = candidateMatcher;
      winningStart = matchStart;
      winningEnd = matchEnd;
    } catch (StackOverflowError soe) {
      // Re-throw annotated with the pattern that blew the stack.
      throw new RuntimeException("StackOverflow while matching: " + candidate.pattern, soe);
    }
  }

  if (winner == null) {
    throw SoyAutoescapeException.createWithoutMetaInfo(
        "Error determining next state when encountering \"" + text + "\" in " + context);
  }
  this.next = winner.computeNextContext(context, winningMatcher);
  this.numCharsConsumed = winningEnd;

  boolean consumedNothing = this.numCharsConsumed == 0;
  if (consumedNothing && this.next.state == context.state) {
    throw new IllegalStateException("Infinite loop at `" + text + "` / " + context);
  }
}
/**
 * A single grammar production: a pattern that recognizes a token in a chunk of HTML/CSS/JS input,
 * together with the rule for computing the context that holds after that token.
 */
private abstract static class Transition {
  /** Recognizes the token consumed by this transition. */
  final Pattern pattern;

  /** Compiles {@code regex} with {@link Pattern#DOTALL}, so that {@code .} also matches newlines. */
  Transition(String regex) {
    this(Pattern.compile(regex, Pattern.DOTALL));
  }

  Transition(Pattern pattern) {
    this.pattern = pattern;
  }

  /**
   * Computes the context that this production transitions to after rawText[0:matcher.end()].
   *
   * @param prior The context in effect before the token in {@code matcher}.
   * @param matcher The match of {@code this.pattern} against the token.
   * @return The context in effect after the token.
   * @throws SoyAutoescapeException if the token is not allowed in {@code prior}.
   */
  abstract Context computeNextContext(Context prior, Matcher matcher)
      throws SoyAutoescapeException;

  /**
   * Whether this transition can produce a context after the text in rawText[0:matcher.end()].
   * The default accepts every match. Overrides must leave the matcher untouched; in particular
   * they must not call {@code find()} again.
   *
   * @param prior The context in effect before the token in {@code matcher}.
   * @param matcher The match of {@code this.pattern} against the token.
   */
  boolean isApplicableTo(Context prior, Matcher matcher) {
    return true;
  }
}
/**
 * Creates a transition that, on matching {@code regex}, yields the prior context rebuilt with
 * {@code kind} as its start kind.
 */
private static Transition makeTransitionTo(String regex, final ContentKind kind) {
  Transition toKind =
      new Transition(regex) {
        @Override
        Context computeNextContext(Context prior, Matcher matcher) {
          Context.Builder builder = prior.toBuilder();
          return builder.withStartKind(kind).build();
        }
      };
  return toKind;
}
/**
 * Builds a case-insensitive regex that matches the start of a tag with exactly the given name.
 *
 * @param tagName The tag name, inserted into the regex as-is.
 * @param allowClose true to also match a close tag; the optional {@code "/"} of {@code </} is
 *     then captured in group 1.
 * @return Given {@code "script"}, a pattern that matches the prefix {@code "<script"} of {@code
 *     "<script>"} but does not match any prefix of {@code "<scriptsareawesome>"}.
 */
private static String regexForSpecialTagNamed(String tagName, boolean allowClose) {
  // Tag names are case-insensitive; "<" starts the tag.
  StringBuilder regex = new StringBuilder("(?i)<");
  if (allowClose) {
    regex.append("(/?)");
  }
  regex.append(tagName);
  // Lookahead so that a mere prefix of a longer tag name is not matched: a tag name ends at
  // whitespace, at ">" or "/", or at the end of input.
  // The "/" is deliberate: <script/>alert(1)</script> and <script/style>alert(2)</script> both
  // alert in Chrome.
  regex.append("(?=[\\s>/]|\\z)");
  return regex.toString();
}
/**
* Maps lower-case tag names to the element type that gets special treatment. Keys must be lower
* case because {@code TRANSITION_TO_TAG_NAME} lower-cases the tag name before the lookup; a name
* that is absent from this map is treated there as {@code Context.ElementType.NORMAL}.
*/
private static final ImmutableMap<String, Context.ElementType> SPECIAL_ELEMENT_TYPES =
ImmutableMap.<String, Context.ElementType>builder()
// We currently only treat <img> and SVG's <image> as a media type, since for <video> and
// <audio> there are concerns that attackers could introduce rich video or audio that
// facilitates social engineering. Upon further review, it's possible we may allow them.
.put("img", Context.ElementType.MEDIA)
.put("image", Context.ElementType.MEDIA)
.put("script", Context.ElementType.SCRIPT)
.put("style", Context.ElementType.STYLE)
.put("textarea", Context.ElementType.TEXTAREA)
.put("title", Context.ElementType.TITLE)
.put("xmp", Context.ElementType.XMP)
.build();
/**
 * Transition from just after a left angle bracket (and optional slash) to a tag name.
 *
 * <p>The pattern is anchored, so text such as {@code < script} does not match: the space breaks
 * the tag name.
 *
 * <p>Spec: http://www.w3.org/TR/html5/syntax.html#tag-name-state -- however, unlike the spec,
 * which appears to allow arbitrary Unicode chars after the first char, we only parse ASCII
 * identifier tag names.
 */
private static final Transition TRANSITION_TO_TAG_NAME =
    new Transition("(?i)^([a-z][a-z0-9:-]*)") {
      @Override
      Context computeNextContext(Context prior, Matcher matcher) {
        String tagName = matcher.group(1).toLowerCase(Locale.ENGLISH);
        Context.ElementType special = SPECIAL_ELEMENT_TYPES.get(tagName);
        Context.ElementType elementType =
            special != null ? special : Context.ElementType.NORMAL;

        // Close tags of context-changing elements (everything except normal and media
        // elements) are consumed by dedicated end-tag transitions. Seeing one here, e.g. a
        // stray </script>, suggests something fishy happened earlier, so it is an error.
        boolean inCloseTag = prior.state == HtmlContext.HTML_BEFORE_CLOSE_TAG_NAME;
        boolean changesContext =
            !(elementType == Context.ElementType.NORMAL
                || elementType == Context.ElementType.MEDIA);
        if (inCloseTag && changesContext) {
          throw SoyAutoescapeException.createWithoutMetaInfo(
              "Saw unmatched close tag for context-changing tag: " + tagName);
        }

        Context.Builder builder = prior.toBuilder();
        builder.withState(HtmlContext.HTML_TAG_NAME);
        builder.withoutAttrContext();
        builder.withElType(elementType);
        return builder.build();
      }
    };
/**
 * Transitions from a tag name into the tag body. The pattern is a zero-width lookahead, so
 * nothing is consumed; it applies when the next character is {@code /}, whitespace, or
 * {@code >}.
 */
private static final Transition TRANSITION_TO_TAG_BODY =
    new Transition("^(?=[/\\s>])") {
      @Override
      Context computeNextContext(Context prior, Matcher matcher) {
        // The element type must already have been determined when the tag name was seen.
        boolean elementTypeKnown = prior.elType != Context.ElementType.NONE;
        Preconditions.checkArgument(elementTypeKnown);

        Context.Builder builder = prior.toBuilder();
        builder.withState(HtmlContext.HTML_TAG);
        builder.withoutAttrContext();
        return builder.build();
      }
    };
/**
 * Creates the transition for HTML5 {@code <template>} and {@code </template>} tags, which
 * tracks how deeply templates are nested.
 *
 * <p>An open tag increments the nest depth and continues in {@code HTML_PCDATA}; a close tag
 * decrements it and continues in {@code HTML_TAG}. A close tag at depth zero is an error.
 */
private static Transition makeTemplateTagTransition() {
  return new Transition(regexForSpecialTagNamed("template", true)) {
    @Override
    Context computeNextContext(Context prior, Matcher matcher) {
      // Group 1 holds the "/" of a close tag, or is empty for an open tag.
      boolean closing = "/".equals(matcher.group(1));
      if (closing) {
        if (prior.templateNestDepth == 0) {
          throw SoyAutoescapeException.createWithoutMetaInfo(
              "Saw an html5 </template> without encountering <template>.");
        }
        return prior
            .toBuilder()
            .withTemplateNestDepth(prior.templateNestDepth + -1)
            .withoutAttrContext()
            .withState(HtmlContext.HTML_TAG)
            .withElType(Context.ElementType.NORMAL)
            .build();
      }
      return prior
          .toBuilder()
          .withTemplateNestDepth(prior.templateNestDepth + 1)
          .withoutAttrContext()
          .withState(HtmlContext.HTML_PCDATA)
          .withElType(Context.ElementType.NONE)
          .build();
    }
  };
}
/**
 * Creates a transition that, on matching {@code regex}, returns to the body of the open tag
 * ({@code HTML_TAG}) and drops any attribute context.
 */
private static Transition makeTransitionBackToTag(String regex) {
  return new Transition(regex) {
    @Override
    Context computeNextContext(Context prior, Matcher matcher) {
      Context.Builder builder = prior.toBuilder();
      builder.withState(HtmlContext.HTML_TAG);
      builder.withoutAttrContext();
      return builder.build();
    }
  };
}
/**
 * Creates a transition into {@code HTML_ATTRIBUTE_NAME} that classifies the attribute (script,
 * style, URI or plain text) from its lower-cased name and the enclosing element type.
 *
 * @param regex A regular expression whose group 1 is a prefix of an attribute name.
 */
private static Transition makeTransitionToAttrName(String regex) {
  return new Transition(regex) {
    @Override
    Context computeNextContext(Context prior, Matcher matcher) {
      String attrName = matcher.group(1).toLowerCase(Locale.ENGLISH);
      // Use the local name so that xlink:href and svg:style are treated as per HTML.
      String localName = attrName.substring(attrName.lastIndexOf(':') + 1);
      boolean isSrc = "src".equals(attrName);

      Context.AttributeType attrType;
      UriType uriType;
      if (localName.startsWith("on")) {
        attrType = Context.AttributeType.SCRIPT;
        uriType = UriType.NONE;
      } else if ("style".equals(localName)) {
        attrType = Context.AttributeType.STYLE;
        uriType = UriType.NONE;
      } else if (prior.elType == Context.ElementType.MEDIA
          && (isSrc || "xlink:href".equals(attrName))) {
        // TODO(gboyer): We should treat script srcs as trusted and impose additional
        // restrictions.
        attrType = Context.AttributeType.URI;
        uriType = UriType.MEDIA;
      } else if (prior.elType == Context.ElementType.SCRIPT && isSrc) {
        attrType = Context.AttributeType.URI;
        uriType = Context.UriType.TRUSTED_RESOURCE;
      } else if (URI_ATTR_NAMES.contains(localName)
          || CUSTOM_URI_ATTR_NAMING_CONVENTION.matcher(localName).find()
          || "xmlns".equals(attrName)
          || attrName.startsWith("xmlns:")) {
        attrType = Context.AttributeType.URI;
        uriType = UriType.NORMAL;
      } else {
        attrType = Context.AttributeType.PLAIN_TEXT;
        uriType = UriType.NONE;
      }

      Context.Builder builder = prior.toBuilder();
      builder.withState(HtmlContext.HTML_ATTRIBUTE_NAME);
      builder.withoutAttrContext();
      return builder.withAttrType(attrType).withUriType(uriType).build();
    }
  };
}
/**
 * Creates a transition into the value of the current attribute. The resulting context is
 * computed by {@code Context.computeContextAfterAttributeDelimiter} from the prior element type,
 * attribute type, URI type and template nest depth.
 *
 * @param regex Matches the start of the attribute value, including any opening quote.
 * @param delim The delimiter that will end the attribute value.
 */
private static Transition makeTransitionToAttrValue(
    String regex, final Context.AttributeEndDelimiter delim) {
  return new Transition(regex) {
    @Override
    Context computeNextContext(Context prior, Matcher matcher) {
      Context insideValue =
          Context.computeContextAfterAttributeDelimiter(
              prior.elType, prior.attrType, delim, prior.uriType, prior.templateNestDepth);
      return insideValue;
    }
  };
}
/**
* Lower case names of attributes whose value is a URI. This does not identify attributes like
* {@code <meta content>} which is conditionally a URI depending on the value of other attributes.
*
* <p>The set is consulted with the attribute's lower-cased local name, i.e. the part after the
* last colon.
*
* @see <a href="http://www.w3.org/TR/html4/index/attributes.html">HTML4 attrs with type %URI</a>
*/
private static final ImmutableSet<String> URI_ATTR_NAMES =
ImmutableSet.of(
"action",
"archive",
"base",
"background",
"cite",
"classid",
"codebase",
// TODO: content is only a URL sometimes depending on other parameters and existing
// templates use content with non-URL values. Fix those templates or otherwise flag
// interpolations into content.
// "content",
"data",
"dsync",
"formaction",
"href",
"icon",
"longdesc",
"manifest",
"poster",
"src",
"usemap",
// Custom attributes that are reliably URLs in existing code.
"entity");
/**
* Naming heuristic for custom URI-valued attributes, applied with {@code find()} to lower-case
* attribute local names. It matches when "uri" or "url" begins at a word boundary (the start of
* the name, or right after a non-word character such as {@code -}), or when the name ends with
* "uri" or "url" optionally followed by "s" (for example {@code imageurl} or {@code linkurls}).
*/
private static final Pattern CUSTOM_URI_ATTR_NAMING_CONVENTION =
Pattern.compile("\\bur[il]|ur[il]s?$");
/**
 * Creates a transition that, on matching {@code regex}, moves to {@code state} and clears the
 * URI part. The URI type is cleared as well, but only when the prior context was inside a URI.
 */
private static Transition makeTransitionToState(String regex, final HtmlContext state) {
  return new Transition(regex) {
    @Override
    Context computeNextContext(Context prior, Matcher matcher) {
      boolean leavingUri = prior.uriPart != UriPart.NONE;
      Context.Builder nextContext = prior.toBuilder().withState(state).withUriPart(UriPart.NONE);
      if (leavingUri) {
        // Reset the URI type only when a URI is actually being left. Before the URI starts the
        // type has to survive, for example across "src", the "=", and the opening quote (if
        // any).
        nextContext.withUriType(UriType.NONE);
      }
      return nextContext.build();
    }
  };
}
/**
 * Creates a transition to an error state: whenever {@code regex} wins, a {@link
 * SoyAutoescapeException} carrying {@code message} is thrown instead of a context being
 * returned.
 */
private static Transition makeTransitionToError(String regex, final String message) {
  Transition alwaysFails =
      new Transition(regex) {
        @Override
        Context computeNextContext(Context prior, Matcher matcher) {
          throw SoyAutoescapeException.createWithoutMetaInfo(message);
        }
      };
  return alwaysFails;
}
/**
 * Creates a transition that, on matching {@code regex}, enters the given JS string state with
 * the following-slash type and the URI part both cleared.
 */
private static Transition makeTransitionToJsString(String regex, final HtmlContext state) {
  return new Transition(regex) {
    @Override
    Context computeNextContext(Context prior, Matcher matcher) {
      Context.Builder builder = prior.toBuilder();
      builder.withState(state);
      builder.withSlashType(Context.JsFollowingSlash.NONE);
      return builder.withUriPart(UriPart.NONE).build();
    }
  };
}
/**
 * Creates a transition that consumes whatever {@code regex} matches and leaves the context
 * exactly as it was.
 */
private static Transition makeTransitionToSelf(String regex) {
  Transition unchanged =
      new Transition(regex) {
        @Override
        Context computeNextContext(Context prior, Matcher matcher) {
          // Same context instance in, same context instance out.
          return prior;
        }
      };
  return unchanged;
}
/**
* Consumes the entire content without change if nothing else matched. The pattern {@code \z}
* matches only the empty string at the very end of the text, so when this transition wins, the
* number of characters consumed equals the length of the text.
*/
private static final Transition TRANSITION_TO_SELF = makeTransitionToSelf("\\z");
// Matching at the end is lowest possible precedence: processNextToken prefers the transition
// whose match starts earliest, and no match can start later than the end of the text.
/**
* Computes the part of a URI we are in after seeing {@code matchChar}, given the part we were in
* before it.
*
* <p>The cases are ordered from the front of a URI to its back (scheme, authority/path, query,
* fragment) and deliberately fall through, so a character that marks a later part is recognized
* from any earlier part. A character that is irrelevant for the current part leaves the part
* unchanged.
*
* @param uriPart The part before {@code matchChar}. {@code UriPart.START} is not handled here;
*     {@code URI_PART_TRANSITION} maps it to {@code MAYBE_SCHEME} before calling this method.
* @param matchChar The URI character that was just seen.
* @return The part in effect after {@code matchChar}.
* @throws SoyAutoescapeException if a colon follows a scheme that may have come from a variable.
* @throws AssertionError if {@code uriPart} is not one of the parts handled below.
*/
private static UriPart getNextUriPart(UriPart uriPart, char matchChar) {
// This switch statement is designed to process a URI in order via a sequence of fall throughs.
switch (uriPart) {
case MAYBE_SCHEME:
case MAYBE_VARIABLE_SCHEME:
// From the RFC: https://tools.ietf.org/html/rfc3986#section-3.1
// scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
// At this point, our goal is to try to prove that we've safely left the scheme, and then
// transition to a more specific state.
if (matchChar == ':') {
// Ah, it looks like we might be able to conclude we've set the scheme, but...
if (uriPart == UriPart.MAYBE_VARIABLE_SCHEME) {
// At the start of a URL, and we already saw a print statement, and now we suddenly
// see a colon. While this could be relatively safe if it's a {$host}:{$port} pair,
// at compile-time, we can't be sure that "$host" isn't something like "javascript"
// and "$port" isn't "deleteMyAccount()".
throw SoyAutoescapeException.createWithoutMetaInfo(
"Soy can't safely process a URI that might start with a variable scheme. "
+ "For example, {$x}:{$y} could have an XSS if $x is 'javascript' and $y is "
+ "attacker-controlled. Either use a hard-coded scheme, or introduce "
+ "disambiguating characters (e.g. http://{$x}:{$y}, ./{$x}:{$y}, or "
+ "{$x}?foo=:{$y})");
} else {
// At the start of the URL, and we just saw some hard-coded characters and a colon,
// like http:. This is safe (assuming it's a good scheme), and now we're on our way to
// the authority. Note if javascript: was seen, we would have scanned it already and
// entered a separate state (unless the developer is malicious and tries to obscure it
// via a conditional).
return UriPart.AUTHORITY_OR_PATH;
}
}
if (matchChar == '/') {
// Upon seeing a slash, it's impossible to set a valid scheme anymore. Either we're in the
// path, or we're starting a protocol-relative URI. (For all we know, we *could* be
// in the query, e.g. {$base}/foo if $base has a question mark, but sadly we have to go
// by what we know statically. However, usually query param groups tend to contain
// ampersands and equal signs, which we check for later heuristically.)
return UriPart.AUTHORITY_OR_PATH;
}
if ((matchChar == '=' || matchChar == '&') && uriPart == UriPart.MAYBE_VARIABLE_SCHEME) {
// This case is really special, and is only seen in cases like href="{$x}&foo={$y}" or
// href="{$x}foo={$y}". While in this case we can never be sure that we're in the query
// part, we do know two things:
//
// 1) We can't possibly set a dangerous scheme, since no valid scheme contains = or &
// 2) Within QUERY, all print statements are encoded as a URI component, which limits
// the damage that can be done; it can't even break into another path segment.
// Therefore, it is secure to assume this.
//
// Note we can safely handle ampersand even in HTML contexts because attribute values
// are processed unescaped.
return UriPart.QUERY;
}
// fall through
case AUTHORITY_OR_PATH:
case UNKNOWN_PRE_FRAGMENT:
if (matchChar == '?') {
// Upon a ? we can be pretty sure we're in the query. While it's possible for something
// like {$base}?foo=bar to be in the fragment if $base contains a #, it's safe to assume
// we're in the query, because query params are escaped more strictly than the fragment.
return UriPart.QUERY;
}
// fall through
case QUERY:
case UNKNOWN:
if (matchChar == '#') {
// A # anywhere proves we're in the fragment, even if we're already in the fragment.
return UriPart.FRAGMENT;
}
// fall through
case FRAGMENT:
// No transitions for fragment.
return uriPart;
case DANGEROUS_SCHEME:
// Dangerous schemes remain dangerous.
return UriPart.DANGEROUS_SCHEME;
default:
throw new AssertionError("Unanticipated URI part: " + uriPart);
}
}
/**
 * Transition between the different parts of an http-like URL.
 *
 * <p>It fires on the first significant URI character (one of {@code : . / & ? = #}, captured in
 * group 1), or at the end of the raw text segment when no such character is present. In the
 * latter case only the {@code START} to {@code MAYBE_SCHEME} step is applied.
 */
private static final Transition URI_PART_TRANSITION =
    new Transition("([:./&?=#])|\\z") {
      @Override
      boolean isApplicableTo(Context prior, Matcher matcher) {
        // Applicable in every context.
        return true;
      }

      @Override
      Context computeNextContext(Context prior, Matcher matcher) {
        // Having seen anything at all, we are no longer at the very start of the URI.
        UriPart current =
            prior.uriPart == UriPart.START ? UriPart.MAYBE_SCHEME : prior.uriPart;
        String significantChar = matcher.group(1);
        if (significantChar == null) {
          // Matched the end of the text rather than a URI character.
          return prior.derive(current);
        }
        return prior.derive(getNextUriPart(current, significantChar.charAt(0)));
      }
    };
/**
 * Detects a dangerous scheme ({@code javascript:}, {@code data:}, {@code blob:} or {@code
 * filesystem:}, case-insensitively) at the very start of a URI and marks the URI as dangerous.
 */
private static final Transition URI_START_TRANSITION =
    new Transition("(?i)^(javascript|data|blob|filesystem):") {
      @Override
      boolean isApplicableTo(Context prior, Matcher matcher) {
        // A scheme can only appear at the start of the URI.
        return UriPart.START == prior.uriPart;
      }

      @Override
      Context computeNextContext(Context prior, Matcher matcher) {
        // TODO(gboyer): Ban all but whitelisted schemes.
        UriPart dangerous = UriPart.DANGEROUS_SCHEME;
        return prior.derive(dangerous);
      }
    };
/**
 * Creates a transition that matches the end tag of a special element such as {@code script},
 * case-insensitively, and returns to {@code HTML_TAG} as a normal element. It only applies when
 * the prior context is not inside an attribute.
 *
 * @param tagName The tag name, inserted into the regex as-is.
 */
private static Transition makeEndTagTransition(String tagName) {
  String endTagRegex = "(?i)</" + tagName + "\\b";
  // TODO: This transitions to an HTML_TAG state which can accept attributes.
  // So we allow nonsensical constructs like </br foo="bar">.
  // Add another HTML_END_TAG state that just accepts space and >.
  return new Transition(endTagRegex) {
    @Override
    boolean isApplicableTo(Context prior, Matcher matcher) {
      return Context.AttributeType.NONE == prior.attrType;
    }

    @Override
    Context computeNextContext(Context prior, Matcher matcher) {
      Context.Builder builder = prior.toBuilder();
      builder.withState(HtmlContext.HTML_TAG);
      builder.withElType(Context.ElementType.NORMAL);
      builder.withoutAttrContext();
      return builder.build();
    }
  };
}
/**
 * Creates a transition that matches the beginning of a CSS URI and enters the URI state that
 * corresponds to its quoting, at the start of a URI of the given type.
 *
 * @param regex Matches the start of the URI, with the opening quote, if any, in group 1.
 * @param uriType The type of URI being entered.
 */
private static Transition makeCssUriTransition(String regex, final UriType uriType) {
  return new Transition(regex) {
    @Override
    Context computeNextContext(Context prior, Matcher matcher) {
      String openingQuote = matcher.group(1);
      HtmlContext uriState =
          "\"".equals(openingQuote)
              ? HtmlContext.CSS_DQ_URI
              : "'".equals(openingQuote) ? HtmlContext.CSS_SQ_URI : HtmlContext.CSS_URI;

      Context.Builder builder = prior.toBuilder();
      builder.withState(uriState);
      builder.withUriType(uriType);
      return builder.withUriPart(UriPart.START).build();
    }
  };
}
/**
 * Creates a transition that matches a portion of JavaScript that can precede a division
 * operator. Afterwards the state is {@code JS} and a following slash is treated as a division
 * operator.
 */
private static Transition makeDivPreceder(String regex) {
  return new Transition(regex) {
    @Override
    Context computeNextContext(Context prior, Matcher matcher) {
      Context.Builder builder = prior.toBuilder();
      builder.withState(HtmlContext.JS);
      builder.withSlashType(Context.JsFollowingSlash.DIV_OP);
      return builder.build();
    }
  };
}
/**
* Characters that break a line in JavaScript source suitable for use in a regex charset:
* carriage return and line feed (written as regex escapes), followed by the literal characters
* U+2028 (line separator) and U+2029 (paragraph separator). The value carries no brackets of its
* own, so it is meant to be spliced inside a {@code [...]} character class.
*/
private static final String JS_LINEBREAKS = "\\r\\n\u2028\u2029";
/**
* For each state, a group of rules for consuming raw text and how that affects the document
* context. The rules each have an associated pattern, and the rule whose pattern matches earliest
* in the text wins.
*/
private static final ImmutableMap<HtmlContext, List<Transition>> TRANSITIONS =
ImmutableMap.<HtmlContext, List<Transition>>builder()
.put(
HtmlContext.HTML_PCDATA,
ImmutableList.of(
makeTransitionToState("<!--", HtmlContext.HTML_COMMENT),
makeTemplateTagTransition(),
makeTransitionToState("<", HtmlContext.HTML_BEFORE_OPEN_TAG_NAME),
makeTransitionToSelf("[^<]+")))
.put(
HtmlContext.HTML_BEFORE_OPEN_TAG_NAME,
ImmutableList.of(
TRANSITION_TO_TAG_NAME,
// Or, maybe it's a close-tag!
makeTransitionToState("^/", HtmlContext.HTML_BEFORE_CLOSE_TAG_NAME),
// This is for things like "I <3 Kittens" or "Styles < Scripts"
makeTransitionTo("", ContentKind.HTML)))
.put(
HtmlContext.HTML_BEFORE_CLOSE_TAG_NAME,
ImmutableList.of(
TRANSITION_TO_TAG_NAME, makeTransitionToError("", "Invalid end-tag name.")))
.put(
HtmlContext.HTML_TAG_NAME,
ImmutableList.of(
TRANSITION_TO_TAG_BODY,
// Anything else:
makeTransitionToError(
"\\z",
"Tag names should not be split up. For example, Soy can't easily understand "
+ "that <s{if 1}cript{/if}> is a script tag.")))
.put(
HtmlContext.HTML_TAG,
ImmutableList.of(
/**
* Regex for allowed attribute names. Intentionally more restrictive than spec:
* https://html.spec.whatwg.org/multipage/syntax.html#attribute-name-state Allows
* {@code data-foo} and other dashed attribute names, but intentionally disallows
* "--" as an attribute name so that a tag ending after a value-less attribute
* named "--" cannot be confused with an HTML comment end ("-->"). Also prevents
* unicode normalized characters. Regular expression is a case insensitive match
* of any number of whitespace characters followed by a capture group for an
* attribute name composed of an alphabetic character followed by any number of
* alpha, numeric, underscore color and dash, ending in alpha, numeric, question
* or dollar characters.
*/
makeTransitionToAttrName("(?i)^\\s*([a-z](?:[a-z0-9_:?$\\-]*[a-z0-9?$])?)"),
new Transition("^\\s*/?>") {
@Override
Context computeNextContext(Context prior, Matcher matcher) {
Context.Builder builder = prior.toBuilder();
builder.withoutAttrContext();
switch (prior.elType) {
case SCRIPT:
builder
.withState(HtmlContext.JS)
.withSlashType(Context.JsFollowingSlash.REGEX)
.withElType(Context.ElementType.NONE);
break;
case STYLE:
builder.withState(HtmlContext.CSS).withElType(Context.ElementType.NONE);
break;
case TEXTAREA:
case TITLE:
case XMP:
builder.withState(HtmlContext.HTML_RCDATA);
break;
// All normal or void tags fit here.
case NORMAL:
case MEDIA:
builder
.withState(HtmlContext.HTML_PCDATA)
.withElType(Context.ElementType.NONE);
break;
case NONE:
throw new IllegalStateException();
default:
throw new AssertionError("Unrecognized state " + prior.elType);
}
return builder.build();
}
},
makeTransitionToSelf("^\\s+\\z")))
.put(
HtmlContext.HTML_ATTRIBUTE_NAME,
ImmutableList.of(
makeTransitionToState("^\\s*=", HtmlContext.HTML_BEFORE_ATTRIBUTE_VALUE),
// For a value-less attribute, make an epsilon transition back to the tag body
// context to look for a tag end or another attribute name.
makeTransitionBackToTag("^")))
.put(
HtmlContext.HTML_BEFORE_ATTRIBUTE_VALUE,
ImmutableList.of(
makeTransitionToAttrValue("^\\s*\"", Context.AttributeEndDelimiter.DOUBLE_QUOTE),
makeTransitionToAttrValue("^\\s*\'", Context.AttributeEndDelimiter.SINGLE_QUOTE),
makeTransitionToAttrValue(
"^(?=[^\"\'\\s>])", // Matches any unquoted value part.
Context.AttributeEndDelimiter.SPACE_OR_TAG_END),
// Epsilon transition back if there is an empty value followed by an obvious
// attribute name or a tag end.
// The first branch handles the blank value in:
// <input value=>
// and the second handles the blank value in:
// <input value= name=foo>
makeTransitionBackToTag("^(?=>|\\s+[\\w-]+\\s*=)"),
makeTransitionToSelf("^\\s+")))
.put(
HtmlContext.HTML_COMMENT,
ImmutableList.of(makeTransitionTo("-->", ContentKind.HTML), TRANSITION_TO_SELF))
.put(HtmlContext.HTML_NORMAL_ATTR_VALUE, ImmutableList.of(TRANSITION_TO_SELF))
// The CSS transitions below are based on http://www.w3.org/TR/css3-syntax/#lexical
.put(
HtmlContext.CSS,
ImmutableList.of(
makeTransitionToState("/\\*", HtmlContext.CSS_COMMENT),
// TODO: Do we need to support non-standard but widely supported C++ style
// comments?
makeTransitionToState("\"", HtmlContext.CSS_DQ_STRING),
makeTransitionToState("'", HtmlContext.CSS_SQ_STRING),
// Although we don't contextually parse CSS, certain property names are only used
// in conjunction with images. This pretty basic regexp does a decent job on CSS
// that is not attempting to be malicious (for example, doesn't handle comments).
// Note that this can be fooled with {if 1}foo-{/if}background, but it's not worth
// really worrying about.
makeCssUriTransition(
"(?i)(?:[^a-z0-9-]|^)\\s*"
+ "(?:background|background-image|border-image|content"
+ "|cursor|list-style|list-style-image)"
+ "\\s*:\\s*url\\s*\\(\\s*(['\"]?)",
UriType.MEDIA),
// TODO(gboyer): We should treat @import, @font-face src, etc as trusted
// resources, once trusted URLs are implemented.
makeCssUriTransition("(?i)\\burl\\s*\\(\\s*(['\"]?)", UriType.NORMAL),
makeEndTagTransition("style"),
TRANSITION_TO_SELF))
.put(
HtmlContext.CSS_COMMENT,
ImmutableList.of(
makeTransitionToState("\\*/", HtmlContext.CSS),
makeEndTagTransition("style"),
TRANSITION_TO_SELF))
.put(
HtmlContext.CSS_DQ_STRING,
ImmutableList.of(
makeTransitionToState("\"", HtmlContext.CSS),
makeTransitionToSelf("\\\\(?:\r\n?|[\n\f\"])"), // Line continuation or escape.
makeTransitionToError("[\n\r\f]", "Newlines not permitted in string literals."),
makeEndTagTransition("style"), // TODO: Make this an error transition?
TRANSITION_TO_SELF))
.put(
HtmlContext.CSS_SQ_STRING,
ImmutableList.of(
makeTransitionToState("'", HtmlContext.CSS),
makeTransitionToSelf("\\\\(?:\r\n?|[\n\f'])"), // Line continuation or escape.
makeTransitionToError("[\n\r\f]", "Newlines not permitted in string literals."),
makeEndTagTransition("style"), // TODO: Make this an error transition?
TRANSITION_TO_SELF))
.put(
HtmlContext.CSS_URI,
ImmutableList.of(
makeTransitionToState("[\\)\\s]", HtmlContext.CSS),
URI_PART_TRANSITION,
URI_START_TRANSITION,
makeTransitionToError("[\"']", "Quotes not permitted in CSS URIs."),
makeEndTagTransition("style")))
.put(
HtmlContext.CSS_SQ_URI,
ImmutableList.of(
makeTransitionToState("'", HtmlContext.CSS),
URI_PART_TRANSITION,
URI_START_TRANSITION,
makeTransitionToSelf("\\\\(?:\r\n?|[\n\f'])"), // Line continuation or escape.
makeTransitionToError("[\n\r\f]", "Newlines not permitted in string literal."),
makeEndTagTransition("style")))
.put(
HtmlContext.CSS_DQ_URI,
ImmutableList.of(
makeTransitionToState("\"", HtmlContext.CSS),
URI_PART_TRANSITION,
URI_START_TRANSITION,
makeTransitionToSelf("\\\\(?:\r\n?|[\n\f\"])"), // Line continuation or escape.
makeTransitionToError("[\n\r\f]", "Newlines not permitted in string literal."),
makeEndTagTransition("style")))
.put(
HtmlContext.JS,
ImmutableList.of(
makeTransitionToState("/\\*", HtmlContext.JS_BLOCK_COMMENT),
makeTransitionToState("//", HtmlContext.JS_LINE_COMMENT),
makeTransitionToJsString("\"", HtmlContext.JS_DQ_STRING),
makeTransitionToJsString("'", HtmlContext.JS_SQ_STRING),
new Transition("/") {
@Override
Context computeNextContext(Context prior, Matcher matcher) {
  // How a "/" is interpreted depends on the slash type recorded in the prior context.
  final HtmlContext nextState;
  final Context.JsFollowingSlash nextSlashType;
  switch (prior.slashType) {
    case DIV_OP:
      // Stay in the JS state; the slash type for whatever follows becomes REGEX.
      nextState = HtmlContext.JS;
      nextSlashType = Context.JsFollowingSlash.REGEX;
      break;
    case REGEX:
      // Enter the JS_REGEX state; the slash type is reset to NONE.
      nextState = HtmlContext.JS_REGEX;
      nextSlashType = Context.JsFollowingSlash.NONE;
      break;
    default:
      // Any other slash type is ambiguous: report it, quoting the matcher's tail text.
      StringBuffer tailText = new StringBuffer();
      matcher.appendTail(tailText);
      throw SoyAutoescapeException.createWithoutMetaInfo(
          "Slash (/) cannot follow the preceding branches since it is unclear "
              + "whether the slash is a RegExp literal or division operator. "
              + "Please add parentheses in the branches leading to `"
              + tailText
              + "`");
  }
  return prior.toBuilder().withState(nextState).withSlashType(nextSlashType).build();
}
},
/*
 * Shuffle words, punctuation (besides /), and numbers off to an analyzer which
 * does a quick and dirty check, via JsUtil.isRegexPreceder, to update whether a
 * following slash would start a RegExp literal or be a division operator.
 */
new Transition("(?i)(?:[^</\"'\\s\\\\]+|<(?!/script))+") {
@Override
Context computeNextContext(Context prior, Matcher matcher) {
  // Classify the matched run of JS tokens: the result of JsUtil.isRegexPreceder decides
  // which slash type is recorded on the derived context.
  String jsTokens = matcher.group();
  Context.JsFollowingSlash followingSlash;
  if (JsUtil.isRegexPreceder(jsTokens)) {
    followingSlash = Context.JsFollowingSlash.REGEX;
  } else {
    followingSlash = Context.JsFollowingSlash.DIV_OP;
  }
  return prior.derive(followingSlash);
}
},
makeTransitionToSelf("\\s+"), // Space
makeEndTagTransition("script")))
.put(
HtmlContext.JS_BLOCK_COMMENT,
ImmutableList.of(
makeTransitionToState("\\*/", HtmlContext.JS),
makeEndTagTransition("script"),
TRANSITION_TO_SELF))
// Line continuations are not allowed in line comments.
.put(
HtmlContext.JS_LINE_COMMENT,
ImmutableList.of(
makeTransitionToState("[" + JS_LINEBREAKS + "]", HtmlContext.JS),
makeEndTagTransition("script"),
TRANSITION_TO_SELF))
.put(
HtmlContext.JS_DQ_STRING,
ImmutableList.of(
makeDivPreceder("\""),
makeEndTagTransition("script"),
makeTransitionToSelf(
"(?i)^(?:"
+ // Case-insensitively, from start of string
"[^\"\\\\"
+ JS_LINEBREAKS
+ "<]+"
+ // match any chars except newlines, quotes, \s;
"|\\\\(?:"
+ // or backslash followed by a
"\\r\\n?"
+ // line continuation
"|[^\\r<]"
+ // or an escape
"|<(?!/script)"
+ // or less-than that doesn't close the script.
")"
+ "|<(?!/script)"
+ ")+")))
.put(
HtmlContext.JS_SQ_STRING,
ImmutableList.of(
makeDivPreceder("'"),
makeEndTagTransition("script"),
makeTransitionToSelf(
"(?i)^(?:"
+ // Case-insensitively, from start of string
"[^'\\\\"
+ JS_LINEBREAKS
+ "<]+"
+ // match any chars except newlines, quotes, \s;
"|\\\\(?:"
+ // or a backslash followed by a
"\\r\\n?"
+ // line continuation
"|[^\\r<]"
+ // or an escape;
"|<(?!/script)"
+ // or less-than that doesn't close the script.
")"
+ "|<(?!/script)"
+ ")+")))
.put(
HtmlContext.JS_REGEX,
ImmutableList.of(
makeDivPreceder("/"),
makeEndTagTransition("script"),
makeTransitionToSelf(
"(?i)^(?:"
+
/**
* We have to handle [...] style character sets specially since in /[/]/,
* the second solidus doesn't end the regular expression.
*/
"[^\\[\\\\/<"
+ JS_LINEBREAKS
+ "]"
+ // A non-charset, non-escape token;
"|\\\\[^"
+ JS_LINEBREAKS
+ "]"
+ // an escape;
"|\\\\?<(?!/script)"
+ "|\\["
+ // or a character set containing
"(?:[^\\]\\\\<"
+ JS_LINEBREAKS
+ "]"
+ // a normal character,
"|\\\\(?:[^"
+ JS_LINEBREAKS
+ "]))*"
+ // or an escape;
"|\\\\?<(?!/script)"
+ // or an angle bracket possibly escaped.
"\\]"
+ ")+")))
.put(HtmlContext.URI, ImmutableList.of(URI_PART_TRANSITION, URI_START_TRANSITION))
.put(
HtmlContext.HTML_RCDATA,
ImmutableList.of(
new Transition("</(\\w+)\\b") {
@Override
boolean isApplicableTo(Context prior, Matcher matcher) {
  // Group 1 is the name captured from the close tag. It is upper-cased with a fixed
  // locale and compared against the name of the prior context's element type, so the
  // transition applies only when the two are equal.
  String closeTagName = matcher.group(1).toUpperCase(Locale.ENGLISH);
  String openElementName = prior.elType.name();
  return openElementName.equals(closeTagName);
}
/**
 * Computes the context that follows a close tag accepted by this transition's
 * {@code isApplicableTo}, i.e. one whose name equals the prior element type's name.
 *
 * @param prior the context before the close tag; all fields not overridden below are
 *     carried over through its builder
 * @param matcher the matcher positioned on the close tag (not consulted here)
 * @return a copy of {@code prior} in the {@code HTML_TAG} state with element type
 *     {@code NORMAL}
 */
@Override
Context computeNextContext(Context prior, Matcher matcher) {
return prior
.toBuilder()
// Continue in the HTML_TAG state after the close tag's name.
.withState(HtmlContext.HTML_TAG)
// The element type is reset to NORMAL.
.withElType(Context.ElementType.NORMAL)
// Presumably clears any attribute-related fields carried by the prior context
// (inferred from the builder method's name -- confirm against Context.Builder).
.withoutAttrContext()
.build();
}
},
TRANSITION_TO_SELF))
// Text context has no edges except to itself.
.put(HtmlContext.TEXT, ImmutableList.of(TRANSITION_TO_SELF))
.build();
// TODO: If we need to deal with untrusted templates, then we need to make sure that tokens like
// <!--, </script>, etc. are never split with empty strings.
// We could do this by walking all possible paths through each template (both branches for ifs,
// each case for switches, and the 0,1, and 2+ iteration case for loops).
// For each template, tokenize the original's rawText nodes using RawTextContextUpdater and then
// tokenize one single rawText node made by concatenating all rawText.
// If one contains a sensitive token, e.g. <!--/ and the other doesn't, then we have a potential
// splitting attack.
// In addition, we would need to disallow unquoted attributes and be paranoid about prints,
// especially in the TAG_NAME productions.
}