/*
* Copyright 2011 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.template.soy.shared.restricted;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Joiner;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.escape.Escaper;
import com.google.common.net.PercentEscaper;
import com.google.template.soy.data.Dir;
import com.google.template.soy.data.SanitizedContent;
import com.google.template.soy.data.SanitizedContent.ContentKind;
import com.google.template.soy.data.SoyValue;
import com.google.template.soy.data.UnsafeSanitizedContentOrdainer;
import com.google.template.soy.data.restricted.BooleanData;
import com.google.template.soy.data.restricted.NullData;
import com.google.template.soy.data.restricted.NumberData;
import com.google.template.soy.data.restricted.StringData;
import com.google.template.soy.shared.restricted.TagWhitelist.OptionalSafeTag;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* Java implementations of functions that escape, normalize, and filter untrusted strings to allow
* them to be safely embedded in particular contexts. These correspond to the {@code soy.$$escape*},
* {@code soy.$$normalize*}, and {@code soy.$$filter*} functions defined in "soyutils.js".
*
*/
public final class Sanitizers {
/** Receives messages about unsafe values that were filtered out. */
private static final Logger logger = Logger.getLogger(Sanitizers.class.getName());
private Sanitizers() {
// Not instantiable.
}
/** Converts the input to HTML by entity escaping. */
public static String escapeHtml(SoyValue value) {
if (isSanitizedContentOfKind(value, SanitizedContent.ContentKind.HTML)) {
return value.coerceToString();
}
return escapeHtml(value.coerceToString());
}
/** Converts plain text to HTML by entity escaping. */
public static String escapeHtml(String value) {
return EscapingConventions.EscapeHtml.INSTANCE.escape(value);
}
/**
* Normalizes the input HTML while preserving "safe" tags and the known directionality.
*
* @return the normalized input, in the form of {@link SanitizedContent} of {@link
* ContentKind#HTML}
*/
public static SanitizedContent cleanHtml(SoyValue value) {
return cleanHtml(value, ImmutableSet.<OptionalSafeTag>of());
}
/**
* Normalizes the input HTML while preserving "safe" tags and the known directionality.
*
* @param optionalSafeTags to add to the basic whitelist of formatting safe tags
* @return the normalized input, in the form of {@link SanitizedContent} of {@link
* ContentKind#HTML}
*/
public static SanitizedContent cleanHtml(
SoyValue value, Collection<? extends OptionalSafeTag> optionalSafeTags) {
Dir valueDir = null;
if (value instanceof SanitizedContent) {
SanitizedContent sanitizedContent = (SanitizedContent) value;
if (sanitizedContent.getContentKind() == SanitizedContent.ContentKind.HTML) {
return (SanitizedContent) value;
}
valueDir = sanitizedContent.getContentDirection();
}
return cleanHtml(value.coerceToString(), valueDir, optionalSafeTags);
}
/**
* Normalizes the input HTML while preserving "safe" tags. The content directionality is unknown.
*
* @return the normalized input, in the form of {@link SanitizedContent} of {@link
* ContentKind#HTML}
*/
public static SanitizedContent cleanHtml(String value) {
return cleanHtml(value, ImmutableSet.<OptionalSafeTag>of());
}
/**
* Normalizes the input HTML while preserving "safe" tags. The content directionality is unknown.
*
* @param optionalSafeTags to add to the basic whitelist of formatting safe tags
* @return the normalized input, in the form of {@link SanitizedContent} of {@link
* ContentKind#HTML}
*/
public static SanitizedContent cleanHtml(
String value, Collection<? extends OptionalSafeTag> optionalSafeTags) {
return cleanHtml(value, null, optionalSafeTags);
}
/**
* Normalizes the input HTML of a given directionality while preserving "safe" tags.
*
* @param optionalSafeTags to add to the basic whitelist of formatting safe tags
* @return the normalized input, in the form of {@link SanitizedContent} of {@link
* ContentKind#HTML}
*/
public static SanitizedContent cleanHtml(
String value, Dir contentDir, Collection<? extends OptionalSafeTag> optionalSafeTags) {
return UnsafeSanitizedContentOrdainer.ordainAsSafe(
stripHtmlTags(value, TagWhitelist.FORMATTING.withOptionalSafeTags(optionalSafeTags), true),
ContentKind.HTML,
contentDir);
}
/** Converts the input to HTML suitable for use inside {@code <textarea>} by entity escaping. */
public static String escapeHtmlRcdata(SoyValue value) {
if (isSanitizedContentOfKind(value, SanitizedContent.ContentKind.HTML)) {
// We can't allow tags in the output, because that would allow safe HTML containing
// "<textarea>" to prematurely close the textarea.
// Instead, we normalize which is semantics preserving in RCDATA.
return normalizeHtml(value.coerceToString());
}
return escapeHtml(value.coerceToString());
}
/** Normalizes HTML to HTML making sure quotes and other specials are entity encoded. */
public static String normalizeHtml(SoyValue value) {
return normalizeHtml(value.coerceToString());
}
/** Normalizes HTML to HTML making sure quotes and other specials are entity encoded. */
public static String normalizeHtml(String value) {
return EscapingConventions.NormalizeHtml.INSTANCE.escape(value);
}
/**
* Normalizes HTML to HTML making sure quotes, spaces and other specials are entity encoded so
* that the result can be safely embedded in a valueless attribute.
*/
public static String normalizeHtmlNospace(SoyValue value) {
return normalizeHtmlNospace(value.coerceToString());
}
/**
* Normalizes HTML to HTML making sure quotes, spaces and other specials are entity encoded so
* that the result can be safely embedded in a valueless attribute.
*/
public static String normalizeHtmlNospace(String value) {
return EscapingConventions.NormalizeHtmlNospace.INSTANCE.escape(value);
}
/**
* Converts the input to HTML by entity escaping, stripping tags in sanitized content so the
* result can safely be embedded in an HTML attribute value.
*/
public static String escapeHtmlAttribute(SoyValue value) {
if (isSanitizedContentOfKind(value, SanitizedContent.ContentKind.HTML)) {
// |escapeHtmlAttribute should only be used on attribute values that cannot have tags.
return stripHtmlTags(value.coerceToString(), null, true);
}
return escapeHtmlAttribute(value.coerceToString());
}
/**
* Converts plain text to HTML by entity escaping so the result can safely be embedded in an HTML
* attribute value.
*/
public static String escapeHtmlAttribute(String value) {
return EscapingConventions.EscapeHtml.INSTANCE.escape(value);
}
/**
* Converts plain text to HTML by entity escaping, stripping tags in sanitized content so the
* result can safely be embedded in an unquoted HTML attribute value.
*/
public static String escapeHtmlAttributeNospace(SoyValue value) {
if (isSanitizedContentOfKind(value, SanitizedContent.ContentKind.HTML)) {
// |escapeHtmlAttributeNospace should only be used on attribute values that cannot have tags.
return stripHtmlTags(value.coerceToString(), null, false);
}
return escapeHtmlAttributeNospace(value.coerceToString());
}
/**
* Converts plain text to HTML by entity escaping so the result can safely be embedded in an
* unquoted HTML attribute value.
*/
public static String escapeHtmlAttributeNospace(String value) {
return EscapingConventions.EscapeHtmlNospace.INSTANCE.escape(value);
}
/** Converts the input to the body of a JavaScript string by using {@code \n} style escapes. */
public static String escapeJsString(SoyValue value) {
return escapeJsString(value.coerceToString());
}
/** Converts plain text to the body of a JavaScript string by using {@code \n} style escapes. */
public static String escapeJsString(String value) {
return EscapingConventions.EscapeJsString.INSTANCE.escape(value);
}
/**
* Converts the input to a JavaScript expression. The resulting expression can be a boolean,
* number, string literal, or {@code null}.
*/
public static String escapeJsValue(SoyValue value) {
// We surround values with spaces so that they can't be interpolated into identifiers
// by accident. We could use parentheses but those might be interpreted as a function call.
if (NullData.INSTANCE == value) {
// The JS counterpart of this code in soyutils.js emits " null " for both null and the special
// JS value undefined.
return " null ";
} else if (value instanceof NumberData) {
// This will emit references to NaN and Infinity. Client code should not redefine those
// to store sensitive data.
return " " + value.numberValue() + " ";
} else if (value instanceof BooleanData) {
return " " + value.booleanValue() + " ";
} else if (isSanitizedContentOfKind(value, SanitizedContent.ContentKind.JS)) {
String jsCode = value.coerceToString();
// This value may not be embeddable if it contains the substring "</script".
// TODO(msamuel): Fixup. We need to be careful because mucking with '<' can
// break code like
// while (i</foo/.exec(str).length)
// and mucking with / can break
// return untrustedHTML.replace(/</g, '<');
return jsCode;
} else {
return escapeJsValue(value.coerceToString());
}
}
/** Converts plain text to a quoted javaScript string value. */
public static String escapeJsValue(String value) {
return value != null ? "'" + escapeJsString(value) + "'" : " null ";
}
/** Converts the input to the body of a JavaScript regular expression literal. */
public static String escapeJsRegex(SoyValue value) {
return escapeJsRegex(value.coerceToString());
}
/** Converts plain text to the body of a JavaScript regular expression literal. */
public static String escapeJsRegex(String value) {
return EscapingConventions.EscapeJsRegex.INSTANCE.escape(value);
}
/** Converts the input to the body of a CSS string literal. */
public static String escapeCssString(SoyValue value) {
return escapeCssString(value.coerceToString());
}
/** Converts plain text to the body of a CSS string literal. */
public static String escapeCssString(String value) {
return EscapingConventions.EscapeCssString.INSTANCE.escape(value);
}
/**
* Makes sure that the input is a valid CSS identifier part, CLASS or ID part, quantity, or CSS
* keyword part.
*/
public static String filterCssValue(SoyValue value) {
if (isSanitizedContentOfKind(value, SanitizedContent.ContentKind.CSS)) {
// We don't need to do this when the CSS is embedded in a
// style attribute since then the HTML escaper kicks in.
// TODO(msamuel): Maybe change the autoescaper to generate
// |filterCssValue:attrib
// for style attributes and thread the parameter here so that
// we can skip this check when its unnecessary.
return embedCssIntoHtml(value.coerceToString());
}
return NullData.INSTANCE == value ? "" : filterCssValue(value.coerceToString());
}
/**
* Makes sure that the input is a valid CSS identifier part, CLASS or ID part, quantity, or CSS
* keyword part.
*/
public static String filterCssValue(String value) {
if (EscapingConventions.FilterCssValue.INSTANCE.getValueFilter().matcher(value).find()) {
return value;
}
logger.log(Level.WARNING, "|filterCssValue received bad value {0}", value);
return EscapingConventions.FilterCssValue.INSTANCE.getInnocuousOutput();
}
/** Converts the input to a piece of a URI by percent encoding the value as UTF-8 bytes. */
public static String escapeUri(SoyValue value) {
return escapeUri(value.coerceToString());
}
/** Converts plain text to a piece of a URI by percent encoding the string as UTF-8 bytes. */
public static String escapeUri(String value) {
return uriEscaper().escape(value);
}
/**
* Converts a piece of URI content to a piece of URI content that can be safely embedded in an
* HTML attribute by percent encoding.
*/
public static String normalizeUri(SoyValue value) {
return normalizeUri(value.coerceToString());
}
/**
* Converts a piece of URI content to a piece of URI content that can be safely embedded in an
* HTML attribute by percent encoding.
*/
public static String normalizeUri(String value) {
return EscapingConventions.NormalizeUri.INSTANCE.escape(value);
}
/**
* Makes sure that the given input doesn't specify a dangerous protocol and also {@link
* #normalizeUri normalizes} it.
*/
public static String filterNormalizeUri(SoyValue value) {
if (isSanitizedContentOfKind(value, SanitizedContent.ContentKind.URI)
|| isSanitizedContentOfKind(value, SanitizedContent.ContentKind.TRUSTED_RESOURCE_URI)) {
return normalizeUri(value);
}
return filterNormalizeUri(value.coerceToString());
}
/**
* Makes sure that the given input doesn't specify a dangerous protocol and also {@link
* #normalizeUri normalizes} it.
*/
public static String filterNormalizeUri(String value) {
if (EscapingConventions.FilterNormalizeUri.INSTANCE.getValueFilter().matcher(value).find()) {
return EscapingConventions.FilterNormalizeUri.INSTANCE.escape(value);
}
logger.log(Level.WARNING, "|filterNormalizeUri received bad value {0}", value);
return EscapingConventions.FilterNormalizeUri.INSTANCE.getInnocuousOutput();
}
/**
* Checks that a URI is safe to be an image source.
*
* <p>Does not return SanitizedContent as there isn't an appropriate type for this.
*/
public static String filterNormalizeMediaUri(SoyValue value) {
if (isSanitizedContentOfKind(value, SanitizedContent.ContentKind.URI)
|| isSanitizedContentOfKind(value, SanitizedContent.ContentKind.TRUSTED_RESOURCE_URI)) {
return normalizeUri(value);
}
return filterNormalizeMediaUri(value.coerceToString());
}
/**
* Checks that a URI is safe to be an image source.
*
* <p>Does not return SanitizedContent as there isn't an appropriate type for this.
*/
public static String filterNormalizeMediaUri(String value) {
if (EscapingConventions.FilterNormalizeMediaUri.INSTANCE
.getValueFilter()
.matcher(value)
.find()) {
return EscapingConventions.FilterNormalizeMediaUri.INSTANCE.escape(value);
}
logger.log(Level.WARNING, "|filterNormalizeMediaUri received bad value {0}", value);
return EscapingConventions.FilterNormalizeMediaUri.INSTANCE.getInnocuousOutput();
}
/** Makes sure the given input is an instance of either trustedResourceUrl or trustedString. */
public static String filterTrustedResourceUri(SoyValue value) {
if (isSanitizedContentOfKind(value, SanitizedContent.ContentKind.TRUSTED_RESOURCE_URI)) {
return value.coerceToString();
}
logger.log(Level.WARNING, "|filterTrustedResourceUri received bad value {0}", value);
return "about:invalid#" + EscapingConventions.INNOCUOUS_OUTPUT;
}
/** For string inputs this function just returns the input string itself. */
public static String filterTrustedResourceUri(String value) {
return value;
}
/**
* For any resource string/variable which has |blessStringAsTrustedResuorceUrlForLegacy directive
* return the input value as is.
*/
public static SoyValue blessStringAsTrustedResourceUrlForLegacy(SoyValue value) {
return value;
}
/**
* For any resource string/variable which has |blessStringAsTrustedResuorceUrlForLegacy directive
* return the input value as is after converting it into SoyValue.
*/
public static SoyValue blessStringAsTrustedResourceUrlForLegacy(String value) {
return StringData.forValue(value);
}
/**
* Makes sure that the given input is a data URI corresponding to an image.
*
* <p>SanitizedContent kind does not apply -- the directive is also used to ensure no foreign
* resources are loaded.
*/
public static SanitizedContent filterImageDataUri(SoyValue value) {
return filterImageDataUri(value.coerceToString());
}
/** Makes sure that the given input is a data URI corresponding to an image. */
public static SanitizedContent filterImageDataUri(String value) {
if (EscapingConventions.FilterImageDataUri.INSTANCE.getValueFilter().matcher(value).find()) {
// NOTE: No need to escape.
return UnsafeSanitizedContentOrdainer.ordainAsSafe(value, ContentKind.URI);
}
logger.log(Level.WARNING, "|filterImageDataUri received bad value {0}", value);
return UnsafeSanitizedContentOrdainer.ordainAsSafe(
EscapingConventions.FilterImageDataUri.INSTANCE.getInnocuousOutput(),
SanitizedContent.ContentKind.URI);
}
/** Makes sure that the given input is a tel URI. */
public static SanitizedContent filterTelUri(SoyValue value) {
return filterTelUri(value.coerceToString());
}
/** Makes sure that the given input is a tel URI. */
public static SanitizedContent filterTelUri(String value) {
if (EscapingConventions.FilterTelUri.INSTANCE.getValueFilter().matcher(value).find()) {
// NOTE: No need to escape. Escaping for other contexts (e.g. HTML) happen after this.
return UnsafeSanitizedContentOrdainer.ordainAsSafe(value, ContentKind.URI);
}
logger.log(Level.WARNING, "|filterTelUri received bad value {0}", value);
return UnsafeSanitizedContentOrdainer.ordainAsSafe(
EscapingConventions.FilterTelUri.INSTANCE.getInnocuousOutput(),
SanitizedContent.ContentKind.URI);
}
/**
* Checks that the input is a valid HTML attribute name with normal keyword or textual content or
* known safe attribute content.
*/
public static String filterHtmlAttributes(SoyValue value) {
if (isSanitizedContentOfKind(value, SanitizedContent.ContentKind.ATTRIBUTES)) {
// We're guaranteed to be in a case where key=value pairs are expected. However, if it would
// cause issues to directly abut this with more attributes, add a space. For example:
// {$a}{$b} where $a is foo=bar and $b is boo=baz requires a space in between to be parsed
// correctly, but not in the case where $a is foo="bar".
// TODO: We should be able to get rid of this if the compiler can guarantee spaces between
// adjacent print statements in attribute context at compile time.
String content = value.coerceToString();
if (content.length() > 0) {
char lastChar = content.charAt(content.length() - 1);
if (lastChar != '"' && lastChar != '\'' && !Character.isWhitespace(lastChar)) {
content += ' ';
}
}
return content;
}
return filterHtmlAttributes(value.coerceToString());
}
/**
* Checks that the input is a valid HTML attribute name with normal keyword or textual content.
*/
public static String filterHtmlAttributes(String value) {
if (EscapingConventions.FilterHtmlAttributes.INSTANCE.getValueFilter().matcher(value).find()) {
return value;
}
logger.log(Level.WARNING, "|filterHtmlAttributes received bad value {0}", value);
return EscapingConventions.FilterHtmlAttributes.INSTANCE.getInnocuousOutput();
}
/** Checks that the input is part of the name of an innocuous element. */
public static String filterHtmlElementName(SoyValue value) {
return filterHtmlElementName(value.coerceToString());
}
/** Checks that the input is part of the name of an innocuous element. */
public static String filterHtmlElementName(String value) {
if (EscapingConventions.FilterHtmlElementName.INSTANCE.getValueFilter().matcher(value).find()) {
return value;
}
logger.log(Level.WARNING, "|filterHtmlElementName received bad value {0}", value);
return EscapingConventions.FilterHtmlElementName.INSTANCE.getInnocuousOutput();
}
/**
* Filters noAutoescape input from explicitly tainted content.
*
* <p>SanitizedContent.ContentKind.TEXT is used to explicitly mark input that is never meant to be
* used unescaped. Specifically, {let} and {param} blocks of kind "text" are explicitly forbidden
* from being noAutoescaped to avoid XSS regressions during application transition.
*/
public static SoyValue filterNoAutoescape(SoyValue value) {
// TODO: Consider also checking for things that are never valid, like null characters.
if (isSanitizedContentOfKind(value, SanitizedContent.ContentKind.TEXT)) {
logger.log(
Level.WARNING,
"|noAutoescape received value explicitly tagged as ContentKind.TEXT: {0}",
value);
return StringData.forValue(EscapingConventions.INNOCUOUS_OUTPUT);
}
return value;
}
/** Filters bad csp values. */
public static String filterCspNonceValue(SoyValue soyValue) {
String value = soyValue.coerceToString();
if (EscapingConventions.FilterCspNonceValue.INSTANCE.getValueFilter().matcher(value).find()) {
return value;
}
logger.log(Level.WARNING, "|filterCspNonceValue received bad value {0}", value);
return EscapingConventions.FilterCspNonceValue.INSTANCE.getInnocuousOutput();
}
/** True iff the given value is sanitized content of the given kind. */
private static boolean isSanitizedContentOfKind(
SoyValue value, SanitizedContent.ContentKind kind) {
return value instanceof SanitizedContent && kind == ((SanitizedContent) value).getContentKind();
}
/**
* Given a snippet of HTML, returns a snippet that has the same text content but only whitelisted
* tags.
*
* @param safeTags the tags that are allowed in the output. A {@code null} white-list is the same
* as the empty white-list. If {@code null} or empty, then the output can be embedded in an
* attribute value. If the output is to be embedded in an attribute, {@code safeTags} should
* be {@code null}.
* @param rawSpacesAllowed true if spaces are allowed in the output unescaped as is the case when
* the output is embedded in a regular text node, or in a quoted attribute.
*/
@VisibleForTesting
static String stripHtmlTags(String value, TagWhitelist safeTags, boolean rawSpacesAllowed) {
EscapingConventions.CrossLanguageStringXform normalizer =
rawSpacesAllowed
? EscapingConventions.NormalizeHtml.INSTANCE
: EscapingConventions.NormalizeHtmlNospace.INSTANCE;
Matcher matcher = EscapingConventions.HTML_TAG_CONTENT.matcher(value);
if (!matcher.find()) {
// Normalize so that the output can be embedded in an HTML attribute.
return normalizer.escape(value);
}
StringBuilder out = new StringBuilder(value.length() - matcher.end() + matcher.start());
Appendable normalizedOut = normalizer.escape(out);
// We do some very simple tag balancing by dropping any close tags for unopened tags and at the
// end emitting close tags for any still open tags.
// This is sufficient (in HTML) to prevent embedded content with safe tags from breaking layout
// when, for example, stripHtmlTags("</table>") is embedded in a page that uses tables for
// formatting.
List<String> openTags = null;
int openListTagCount = 0;
try {
int pos = 0; // Such that value[:pos] has been sanitized onto out.
do {
int start = matcher.start();
if (pos < start) {
normalizedOut.append(value, pos, start);
// More aggressively normalize ampersands at the end of a chunk so that
// "&<b>amp;</b>" -> "&" instead of "&".
if (value.charAt(start - 1) == '&') {
out.append("amp;");
}
}
if (safeTags != null) {
String tagName = matcher.group(1);
if (tagName != null) {
// Use locale so that <I> works when the default locale is Turkish
tagName = tagName.toLowerCase(Locale.ENGLISH);
if (safeTags.isSafeTag(tagName)) {
boolean isClose = value.charAt(start + 1) == '/';
if (isClose) {
if (openTags != null) {
int lastIdx = openTags.lastIndexOf(tagName);
if (lastIdx >= 0) {
// Close contained tags as well.
// If we didn't, then we would convert "<ul><li></ul>" to "<ul><li></ul></li>"
// which could lead to broken layout for embedding HTML that uses lists for
// formatting.
// This leads to observably different behavior for adoption-agency dependent
// tag combinations like "<b><i>Foo</b> Bar</b>" but fails safe.
// http://www.whatwg.org/specs/web-apps/current-work/multipage/the-end.html#misnested-tags:-b-i-/b-/i
List<String> tagsToClose = openTags.subList(lastIdx, openTags.size());
for (String tagToClose : tagsToClose) {
if (isListTag(tagToClose)) {
openListTagCount--;
}
}
closeTags(tagsToClose, out);
}
}
} else {
// Only allow whitelisted <li> through if it is nested in a parent <ol> or <ul>.
if (openListTagCount > 0 || !"li".equals(tagName)) {
if (isListTag(tagName)) {
openListTagCount++;
}
// Emit beginning of the opening tag and tag name on the un-normalized channel.
out.append('<').append(tagName);
// Most attributes are dropped, but the dir attribute is preserved if it exists.
// The attribute matching could be made more generic if more attributes need to be
// whitelisted in the future. There are also probably other utilities in common to
// do such parsing of HTML, but this seemed simple enough and keeps with the
// current spirit of this function of doing custom parsing.
Matcher attributeMatcher = HTML_ATTRIBUTE_PATTERN.matcher(matcher.group());
while (attributeMatcher.find()) {
String attributeName = attributeMatcher.group(1);
if (!Strings.isNullOrEmpty(attributeName)
&& attributeName.toLowerCase(Locale.ENGLISH).equals("dir")) {
String dir = attributeMatcher.group(2);
if (!Strings.isNullOrEmpty(dir)) {
// Strip quotes if the attribute value was quoted.
if (dir.charAt(0) == '\'' || dir.charAt(0) == '"') {
dir = dir.substring(1, dir.length() - 1);
}
dir = dir.toLowerCase(Locale.ENGLISH);
if ("ltr".equals(dir) || "rtl".equals(dir) || "auto".equals(dir)) {
out.append(" dir=\"").append(dir).append("\"");
}
}
break;
}
}
// Emit the end of the opening tag
out.append('>');
// Keep track of tags that need closing.
if (!HTML5_VOID_ELEMENTS.contains(tagName)) {
if (openTags == null) {
openTags = Lists.newArrayList();
}
openTags.add(tagName);
}
}
}
}
}
}
pos = matcher.end();
} while (matcher.find());
normalizedOut.append(value, pos, value.length());
// Emit close tags, so that safeTags("<table>") can't break the layout of embedding HTML that
// uses tables for layout.
if (openTags != null) {
closeTags(openTags, out);
}
} catch (IOException ex) {
// Writing to a StringBuilder should not throw.
throw new AssertionError(ex);
}
return out.toString();
}
private static void closeTags(List<String> openTags, StringBuilder out) {
for (int i = openTags.size(); --i >= 0; ) {
out.append("</").append(openTags.get(i)).append('>');
}
openTags.clear();
}
private static boolean isListTag(String tagName) {
return "ol".equals(tagName) || "ul".equals(tagName);
}
/** From http://www.w3.org/TR/html-markup/syntax.html#syntax-elements */
private static final ImmutableSet<String> HTML5_VOID_ELEMENTS =
ImmutableSet.of(
"area", "base", "br", "col", "command", "embed", "hr", "img", "input", "keygen", "link",
"meta", "param", "source", "track", "wbr");
/**
* Pattern for matching attribute name and value, where value is single-quoted or double-quoted.
*/
public static final Pattern HTML_ATTRIBUTE_PATTERN;
static {
String attributeName = "[a-zA-Z][a-zA-Z0-9:\\-]*";
String space = "[\t\n\r ]";
String doubleQuotedValue = "\"[^\"]*\"";
String singleQuotedValue = "'[^']*'";
String attributeValue = Joiner.on('|').join(doubleQuotedValue, singleQuotedValue);
HTML_ATTRIBUTE_PATTERN =
Pattern.compile(
String.format(
"(%s)%s*=%s*(%s)",
attributeName, // Group 1: Attribute name.
space,
space,
attributeValue // Group 2: Optionally-quoted attributed value.
));
}
/**
* Returns a {@link Escaper} instance that escapes Java characters so they can be safely included
* in URIs. For details on escaping URIs, see section 2.4 of <a
* href="http://www.ietf.org/rfc/rfc2396.txt">RFC 2396</a>.
*
* <p>When encoding a String, the following rules apply:
*
* <ul>
* <li>The alphanumeric characters "a" through "z", "A" through "Z" and "0" through "9" remain
* the same.
* <li>The special characters ".", "-", "*", and "_" remain the same.
* <li>If {@code plusForSpace} was specified, the space character " " is converted into a plus
* sign "+". Otherwise it is converted into "%20".
* <li>All other characters are converted into one or more bytes using UTF-8 encoding and each
* byte is then represented by the 3-character string "%XY", where "XY" is the two-digit,
* uppercase, hexadecimal representation of the byte value.
* </ul>
*
* <p><b>Note</b>: Unlike other escapers, URI escapers produce uppercase hexadecimal sequences.
* From <a href="http://www.ietf.org/rfc/rfc3986.txt">RFC 3986</a>:<br>
* <i>"URI producers and normalizers should use uppercase hexadecimal digits for all
* percent-encodings."</i>
*
* @see #uriEscaper()
*/
private static Escaper uriEscaper() {
return URI_ESCAPER_NO_PLUS;
}
/**
* A string of safe characters that mimics the behavior of {@link java.net.URLEncoder}.
*
* <p>TODO: Fix escapers to be compliant with RFC 3986
*/
private static final String SAFECHARS_URLENCODER = "-_.*";
private static final Escaper URI_ESCAPER_NO_PLUS =
new PercentEscaper(SAFECHARS_URLENCODER, false);
private static final Pattern HTML_RAW_CONTENT_HAZARD_RE =
Pattern.compile(Pattern.quote("</") + "|" + Pattern.quote("]]>"));
private static final ImmutableMap<String, String> HTML_RAW_CONTENT_HAZARD_REPLACEMENT =
ImmutableMap.of(
"</", "<\\/",
"]]>", "]]\\>");
/**
* Make sure that tag boundaries are not broken by Safe CSS when embedded in a {@code <style>}
* element.
*/
@VisibleForTesting
static String embedCssIntoHtml(String css) {
// `</style` can close a containing style element in HTML.
// `]]>` can similarly close a CDATA element in XHTML.
// Scan for "</" and "]]>" and escape enough to remove the token seen by
// the HTML parser.
// For well-formed CSS, these string might validly appear in a few contexts:
// 1. comments
// 2. string bodies
// 3. url(...) bodies.
// Appending \ should be semantics preserving in comments and string bodies.
// This may not be semantics preserving in url content.
// The substring "]>" can validly appear in a selector
// a[href]>b
// but the substring "]]>" cannot.
// This should not affect how a CSS parser recovers from syntax errors.
Matcher m = HTML_RAW_CONTENT_HAZARD_RE.matcher(css);
if (!m.find()) {
return css;
}
StringBuffer sb = new StringBuffer(css.length() + 16);
do {
m.appendReplacement(sb, "");
sb.append(HTML_RAW_CONTENT_HAZARD_REPLACEMENT.get(m.group()));
} while (m.find());
m.appendTail(sb);
return sb.toString();
}
}