/**
* Copyright (c) 2000-present Liferay, Inc. All rights reserved.
*
* This library is free software; you can redistribute it and/or modify it under
* the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 2.1 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
* FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
* details.
*/
package com.liferay.portal.util;
import com.liferay.portal.kernel.security.pacl.DoPrivileged;
import com.liferay.portal.kernel.util.CharPool;
import com.liferay.portal.kernel.util.Html;
import com.liferay.portal.kernel.util.StringBundler;
import com.liferay.portal.kernel.util.StringPool;
import com.liferay.portal.kernel.util.StringUtil;
import com.liferay.portal.kernel.util.URLCodec;
import com.liferay.portal.kernel.util.Validator;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import net.htmlparser.jericho.Renderer;
import net.htmlparser.jericho.Source;
import net.htmlparser.jericho.TextExtractor;
/**
* Provides the implementation of the HTML utility interface for escaping,
* rendering, replacing, and stripping HTML text. This class uses XSS
* recommendations from <a
* href="http://www.owasp.org/index.php/Cross_Site_Scripting#How_to_Protect_Yourself">http://www.owasp.org/index.php/Cross_Site_Scripting#How_to_Protect_Yourself</a>
* when escaping HTML text.
*
* @author Brian Wing Shun Chan
* @author Clarence Shen
* @author Harry Mark
* @author Samuel Kong
* @author Connor McKay
* @author Shuyang Zhou
*/
@DoPrivileged
public class HtmlImpl implements Html {
/**
 * Mode value for {@link #escape(String, int)} that selects escaping for HTML
 * attribute values.
 */
public static final int ESCAPE_MODE_ATTRIBUTE = 1;
/**
 * Mode value for {@link #escape(String, int)} that selects escaping for CSS
 * values.
 */
public static final int ESCAPE_MODE_CSS = 2;
/**
 * Mode value for {@link #escape(String, int)} that selects escaping for
 * JavaScript values.
 */
public static final int ESCAPE_MODE_JS = 3;
/**
 * Mode value for {@link #escape(String, int)} that selects plain HTML text
 * escaping, which is delegated to {@link #escape(String)}.
 */
public static final int ESCAPE_MODE_TEXT = 4;
/**
 * Mode value for {@link #escape(String, int)} that selects URL encoding.
 */
public static final int ESCAPE_MODE_URL = 5;
/**
 * Builds a string of HTML <code>data-*</code> attributes from the entries of
 * a map. Every entry is written as <code>data-<i>key</i>="<i>value</i>"</code>
 * followed by a single space, so a map containing
 * <code>{key1=value1;key2=value2}</code> is returned as
 * <code>data-key1="value1" data-key2="value2" </code>. The value is converted
 * with {@link String#valueOf(Object)} and passed through
 * {@link #escapeAttribute(String)}; the key is written as is.
 *
 * @param  data the map of values to convert to data-* attributes
 * @return a string with the data attributes, or an empty string if the map
 *         is <code>null</code> or empty
 */
@Override
public String buildData(Map<String, Object> data) {
	if (data == null) {
		return StringPool.BLANK;
	}

	if (data.isEmpty()) {
		return StringPool.BLANK;
	}

	// Five fragments are appended per entry

	StringBundler attributesSB = new StringBundler(5 * data.size());

	for (Map.Entry<String, Object> attribute : data.entrySet()) {
		String name = attribute.getKey();
		String value = String.valueOf(attribute.getValue());

		attributesSB.append("data-");
		attributesSB.append(name);
		attributesSB.append("=\"");
		attributesSB.append(escapeAttribute(value));
		attributesSB.append("\" ");
	}

	return attributesSB.toString();
}
/**
* Escapes the text so that it is safe to use in an HTML context.
*
* @param text the text to escape
* @return the escaped HTML text, or <code>null</code> if the text is
* <code>null</code>
*/
@Override
public String escape(String text) {
if (text == null) {
return null;
}
if (text.length() == 0) {
return StringPool.BLANK;
}
// Escape using XSS recommendations from
// http://www.owasp.org/index.php/Cross_Site_Scripting
// #How_to_Protect_Yourself
StringBundler sb = null;
int lastReplacementIndex = 0;
for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
String replacement = null;
if ((c < 256) && ((c >= 128) || _VALID_CHARS[c])) {
continue;
}
if (c == '<') {
replacement = "<";
}
else if (c == '>') {
replacement = ">";
}
else if (c == '&') {
replacement = "&";
}
else if (c == '"') {
replacement = """;
}
else if (c == '\'') {
replacement = "'";
}
else if (c == '\u00bb') {
replacement = "»";
}
else if (c == '\u2013') {
replacement = "–";
}
else if (c == '\u2014') {
replacement = "—";
}
else if (c == '\u2028') {
replacement = "
";
}
else if (!_isValidXmlCharacter(c) ||
_isUnicodeCompatibilityCharacter(c)) {
replacement = StringPool.SPACE;
}
else {
continue;
}
if (sb == null) {
sb = new StringBundler();
}
if (i > lastReplacementIndex) {
sb.append(text.substring(lastReplacementIndex, i));
}
sb.append(replacement);
lastReplacementIndex = i + 1;
}
if (sb == null) {
return text;
}
if (lastReplacementIndex < text.length()) {
sb.append(text.substring(lastReplacementIndex));
}
return sb.toString();
}
/**
* Escapes the input text as a hexadecimal value, based on the mode (type).
* The encoding types include: {@link #ESCAPE_MODE_ATTRIBUTE}, {@link
* #ESCAPE_MODE_CSS}, {@link #ESCAPE_MODE_JS}, {@link #ESCAPE_MODE_TEXT},
* and {@link #ESCAPE_MODE_URL}.
*
* <p>
* Note that <code>escape(text, ESCAPE_MODE_TEXT)</code> returns the same as
* <code>escape(text)</code>.
* </p>
*
* @param text the text to escape
* @param mode the encoding type
* @return the escaped hexadecimal value of the input text, based on the
* mode, or <code>null</code> if the text is <code>null</code>
*/
@Override
public String escape(String text, int mode) {
if (text == null) {
return null;
}
if (text.length() == 0) {
return StringPool.BLANK;
}
String prefix = StringPool.BLANK;
String postfix = StringPool.BLANK;
if (mode == ESCAPE_MODE_ATTRIBUTE) {
prefix = "";
postfix = StringPool.SEMICOLON;
}
else if (mode == ESCAPE_MODE_CSS) {
prefix = StringPool.BACK_SLASH;
}
else if (mode == ESCAPE_MODE_JS) {
prefix = "\\x";
}
else if (mode == ESCAPE_MODE_URL) {
return URLCodec.encodeURL(text, true);
}
else {
return escape(text);
}
StringBuilder sb = null;
char[] hexBuffer = new char[4];
int lastReplacementIndex = 0;
for (int i = 0; i < text.length(); i++) {
char c = text.charAt(i);
if (c < _VALID_CHARS.length) {
if (!_VALID_CHARS[c]) {
if (sb == null) {
sb = new StringBuilder(text.length() + 64);
}
if (i > lastReplacementIndex) {
sb.append(text, lastReplacementIndex, i);
}
sb.append(prefix);
_appendHexChars(sb, hexBuffer, c);
sb.append(postfix);
if ((mode == ESCAPE_MODE_CSS) &&
(i < (text.length() - 1))) {
char nextChar = text.charAt(i + 1);
if ((nextChar >= CharPool.NUMBER_0) &&
(nextChar <= CharPool.NUMBER_9)) {
sb.append(CharPool.SPACE);
}
}
lastReplacementIndex = i + 1;
}
}
else if ((mode == ESCAPE_MODE_ATTRIBUTE) &&
(!_isValidXmlCharacter(c) ||
_isUnicodeCompatibilityCharacter(c))) {
if (sb == null) {
sb = new StringBuilder(text.length() + 64);
}
if (i > lastReplacementIndex) {
sb.append(text, lastReplacementIndex, i);
}
sb.append(CharPool.SPACE);
lastReplacementIndex = i + 1;
}
else if ((mode == ESCAPE_MODE_JS) &&
((c == '\u2028') || (c == '\u2029'))) {
if (sb == null) {
sb = new StringBuilder(text.length() + 64);
}
if (i > lastReplacementIndex) {
sb.append(text, lastReplacementIndex, i);
}
sb.append("\\u");
_appendHexChars(sb, hexBuffer, c);
sb.append(postfix);
lastReplacementIndex = i + 1;
}
}
if (sb == null) {
return text;
}
if (lastReplacementIndex < text.length()) {
sb.append(text, lastReplacementIndex, text.length());
}
return sb.toString();
}
/**
 * Escapes the attribute value so that it is safe to use within a quoted
 * attribute. This is shorthand for calling {@link #escape(String, int)} with
 * {@link #ESCAPE_MODE_ATTRIBUTE}.
 *
 * @param  attribute the attribute to escape
 * @return the escaped attribute value, or <code>null</code> if the
 *         attribute value is <code>null</code>
 */
@Override
public String escapeAttribute(String attribute) {
return escape(attribute, ESCAPE_MODE_ATTRIBUTE);
}
/**
 * Escapes the CSS value so that it is safe to use in a CSS context. This is
 * shorthand for calling {@link #escape(String, int)} with
 * {@link #ESCAPE_MODE_CSS}.
 *
 * @param  css the CSS value to escape
 * @return the escaped CSS value, or <code>null</code> if the CSS value is
 *         <code>null</code>
 */
@Override
public String escapeCSS(String css) {
return escape(css, ESCAPE_MODE_CSS);
}
/**
 * Escapes the HREF attribute so that it is safe to use as an HREF
 * attribute.
 *
 * <p>
 * When the value begins with the scheme <code>data</code> or
 * <code>javascript</code> (compared case insensitively, with the first
 * colon directly after the scheme name), that first colon is replaced by
 * <code>%3a</code>. The result is then passed through
 * {@link #escapeAttribute(String)}.
 * </p>
 *
 * @param  href the HREF attribute to escape
 * @return the escaped HREF attribute, or <code>null</code> if the HREF
 *         attribute is <code>null</code>
 */
@Override
public String escapeHREF(String href) {
	if (href == null) {
		return null;
	}

	if (href.length() == 0) {
		return StringPool.BLANK;
	}

	int colonIndex = href.indexOf(StringPool.COLON);

	// Only prefixes as long as "data" (4) or "javascript" (10) can match

	if ((colonIndex == 4) || (colonIndex == 10)) {
		String scheme = StringUtil.toLowerCase(
			href.substring(0, colonIndex));

		if (scheme.equals("data") || scheme.equals("javascript")) {
			href = StringUtil.replaceFirst(href, CharPool.COLON, "%3a");
		}
	}

	return escapeAttribute(href);
}
/**
 * Escapes the JavaScript value so that it is safe to use in a JavaScript
 * context. This is shorthand for calling {@link #escape(String, int)} with
 * {@link #ESCAPE_MODE_JS}.
 *
 * @param  js the JavaScript value to escape
 * @return the escaped JavaScript value, or <code>null</code> if the
 *         JavaScript value is <code>null</code>
 */
@Override
public String escapeJS(String js) {
return escape(js, ESCAPE_MODE_JS);
}
/**
 * Neutralizes a link that uses the <code>javascript</code> scheme. When the
 * first colon of the link directly follows the word
 * <code>javascript</code> (compared case insensitively), that colon is
 * replaced by <code>%3a</code>; any other link is returned unchanged.
 *
 * @param  link the link to check
 * @return the link with a leading <code>javascript:</code> scheme
 *         neutralized, or an empty string if
 *         <code>Validator.isNull(link)</code> holds
 */
@Override
public String escapeJSLink(String link) {
	if (Validator.isNull(link)) {
		return StringPool.BLANK;
	}

	int colonIndex = link.indexOf(StringPool.COLON);

	if (colonIndex != 10) {
		return link;
	}

	String scheme = StringUtil.toLowerCase(link.substring(0, colonIndex));

	if (!scheme.equals("javascript")) {
		return link;
	}

	return StringUtil.replaceFirst(link, CharPool.COLON, "%3a");
}
/**
 * Escapes the URL value so that it is safe to use as a URL. This is
 * shorthand for calling {@link #escape(String, int)} with
 * {@link #ESCAPE_MODE_URL}.
 *
 * @param  url the URL value to escape
 * @return the escaped URL value, or <code>null</code> if the URL value is
 *         <code>null</code>
 */
@Override
public String escapeURL(String url) {
return escape(url, ESCAPE_MODE_URL);
}
/**
 * Replaces every character of the XPath text that is listed in the XPath
 * token table of this class with an underscore.
 *
 * @param  xPath the XPath text to escape
 * @return the text with every listed token character replaced by an
 *         underscore, or the input itself if
 *         <code>Validator.isNull(xPath)</code> holds
 */
@Override
public String escapeXPath(String xPath) {
	if (Validator.isNull(xPath)) {
		return xPath;
	}

	int length = xPath.length();

	StringBuilder escapedSB = new StringBuilder(length);

	for (int index = 0; index < length; index++) {
		char current = xPath.charAt(index);

		char output = current;

		for (char token : _XPATH_TOKENS) {
			if (token == current) {
				output = CharPool.UNDERLINE;

				break;
			}
		}

		escapedSB.append(output);
	}

	return escapedSB.toString();
}
/**
 * Wraps the value in quotation so that it can be used as an XPath string
 * literal.
 *
 * <ul>
 * <li>
 * A value without double quotes is wrapped in double quotes.
 * </li>
 * <li>
 * A value with double quotes but no apostrophes is wrapped in apostrophes.
 * </li>
 * <li>
 * A value containing both is split at its apostrophes and rebuilt as a
 * <code>concat(...)</code> expression, in which every piece is wrapped in
 * apostrophes and every apostrophe is written as the literal
 * <code>"'"</code>.
 * </li>
 * </ul>
 *
 * @param  xPathAttribute the attribute value, which must not be
 *         <code>null</code>
 * @return the value as an XPath string expression
 */
@Override
public String escapeXPathAttribute(String xPathAttribute) {
	boolean hasApostrophe = xPathAttribute.contains(StringPool.APOSTROPHE);
	boolean hasQuote = xPathAttribute.contains(StringPool.QUOTE);

	if (hasQuote && hasApostrophe) {

		// A negative limit keeps trailing empty pieces. Without it, a
		// value ending in an apostrophe would lose that apostrophe and
		// could collapse into a one argument concat call

		String[] parts = xPathAttribute.split(StringPool.APOSTROPHE, -1);

		return "concat('".concat(
			StringUtil.merge(parts, "', \"'\", '")).concat("')");
	}

	if (hasQuote) {
		return StringPool.APOSTROPHE.concat(xPathAttribute).concat(
			StringPool.APOSTROPHE);
	}

	return StringPool.QUOTE.concat(xPathAttribute).concat(StringPool.QUOTE);
}
/**
 * Extracts the raw text from the HTML input, compressing its whitespace and
 * removing all attributes, scripts, and styles. The extraction is done by
 * the Jericho <code>TextExtractor</code> of the parsed input.
 *
 * <p>
 * For example, raw text returned by this method can be stored in a search
 * index.
 * </p>
 *
 * @param  html the HTML text
 * @return the raw text from the HTML input, or <code>null</code> if the
 *         HTML input is <code>null</code>
 */
@Override
public String extractText(String html) {
	if (html != null) {
		Source parsedHtml = new Source(html);

		return parsedHtml.getTextExtractor().toString();
	}

	return null;
}
@Override
public String fromInputSafe(String text) {
return StringUtil.replace(
text, new String[] {"&", """}, new String[] {"&", "\""});
}
/**
 * Converts the text into an identifier in which every unsupported character
 * is replaced by its hexadecimal code between two underscores.
 *
 * <p>
 * Characters up to 127 are kept when <code>Validator.isChar</code> or
 * <code>Validator.isDigit</code> accepts them. Characters above 127 are
 * kept unless they are the figure space, narrow no-break space, or no-break
 * space. An underscore is replaced by two underscores without a code.
 * </p>
 *
 * @param  text the text to convert
 * @return the converted text, or the input itself if
 *         <code>Validator.isNull(text)</code> holds or nothing had to be
 *         replaced
 */
@Override
public String getAUICompatibleId(String text) {
	if (Validator.isNull(text)) {
		return text;
	}

	StringBundler idSB = null;

	int copiedUntil = 0;

	for (int index = 0; index < text.length(); index++) {
		char current = text.charAt(index);

		boolean keep;

		if (current <= 127) {
			keep = Validator.isChar(current) || Validator.isDigit(current);
		}
		else {
			keep =
				(current != CharPool.FIGURE_SPACE) &&
				(current != CharPool.NARROW_NO_BREAK_SPACE) &&
				(current != CharPool.NO_BREAK_SPACE);
		}

		if (keep) {
			continue;
		}

		// Create the buffer lazily and flush the kept run before this
		// character

		if (idSB == null) {
			idSB = new StringBundler();
		}

		if (copiedUntil < index) {
			idSB.append(text.substring(copiedUntil, index));
		}

		idSB.append(StringPool.UNDERLINE);

		if (current != CharPool.UNDERLINE) {
			idSB.append(StringUtil.toHexString(current));
		}

		idSB.append(StringPool.UNDERLINE);

		copiedUntil = index + 1;
	}

	if (idSB == null) {
		return text;
	}

	if (copiedUntil < text.length()) {
		idSB.append(text.substring(copiedUntil));
	}

	return idSB.toString();
}
/**
 * Renders the HTML content into text. This provides a human readable
 * version of the content that is modeled on the way Mozilla
 * Thunderbird&reg; and other email clients provide an automatic conversion
 * of HTML content to text in their alternative MIME encoding of emails. The
 * rendering is done by the Jericho <code>Renderer</code> of the parsed
 * input.
 *
 * <p>
 * Using the default settings, the output complies with the
 * <code>Text/Plain; Format=Flowed (DelSp=No)</code> protocol described in
 * <a href="http://tools.ietf.org/html/rfc3676">RFC-3676</a>.
 * </p>
 *
 * @param  html the HTML text
 * @return the rendered HTML text, or <code>null</code> if the HTML text is
 *         <code>null</code>
 */
@Override
public String render(String html) {
	if (html != null) {
		Source parsedHtml = new Source(html);

		return parsedHtml.getRenderer().toString();
	}

	return null;
}
/**
 * Replaces all Microsoft&reg; Word Unicode characters with plain HTML
 * entities or characters. The characters searched for are the registered
 * sign (U+00AE), the right single quotation mark (U+2019), and the left and
 * right double quotation marks (U+201C and U+201D).
 *
 * @param      text the text
 * @return     the converted text, or <code>null</code> if the text is
 *             <code>null</code>
 * @deprecated As of 7.0.0, with no direct replacement
 */
@Deprecated
@Override
public String replaceMsWordCharacters(String text) {
return StringUtil.replace(text, _MS_WORD_UNICODE, _MS_WORD_HTML);
}
/**
 * Replaces line breaks with the <code>&lt;br /&gt;</code> HTML tag. The
 * return and new line sequence (<code>StringPool.RETURN_NEW_LINE</code>) is
 * replaced first, so that it produces a single tag; every remaining new
 * line character is replaced afterwards.
 *
 * @param  html the text
 * @return the converted text, or <code>null</code> if the text is
 *         <code>null</code>
 */
@Override
public String replaceNewLine(String html) {
	if (html == null) {
		return null;
	}

	String withoutReturnNewLines = StringUtil.replace(
		html, StringPool.RETURN_NEW_LINE, "<br />");

	return StringUtil.replace(
		withoutReturnNewLines, CharPool.NEW_LINE, "<br />");
}
/**
 * Strips all content delimited by the tag out of the text.
 *
 * <p>
 * If the tag appears multiple times, all occurrences (including the tag)
 * are stripped. The tag may have attributes. In order for this method to
 * recognize the tag, it must consist of a separate opening and closing tag.
 * Self-closing tags remain in the result.
 * </p>
 *
 * <p>
 * The work is delegated to <code>StringUtil.stripBetween</code>, using
 * <code>&lt;tag</code> as the start marker and <code>&lt;/tag&gt;</code> as
 * the end marker.
 * </p>
 *
 * @param  text the text
 * @param  tag the tag used for delimiting, which should only be the tag's
 *         name (e.g. no &lt;)
 * @return the text, without the stripped tag and its contents, or
 *         <code>null</code> if the text is <code>null</code>
 */
@Override
public String stripBetween(String text, String tag) {
return StringUtil.stripBetween(text, "<" + tag, "</" + tag + ">");
}
/**
 * Strips all XML comments out of the text. The work is delegated to
 * <code>StringUtil.stripBetween</code>, using <code>&lt;!--</code> as the
 * start marker and <code>--&gt;</code> as the end marker.
 *
 * @param  text the text
 * @return the text, without the stripped XML comments, or <code>null</code>
 *         if the text is <code>null</code>
 */
@Override
public String stripComments(String text) {
return StringUtil.stripBetween(text, "<!--", "-->");
}
/**
 * Strips the markup out of the text. XML comments are removed first with
 * {@link #stripComments(String)}. After that, everything from a
 * <code>&lt;</code> up to the next <code>&gt;</code> is dropped. For
 * <code>script</code> and <code>style</code> elements, the scan also skips
 * ahead to the matching closing tag (see
 * {@link #stripTag(char[], String, int)}), so the enclosed content is
 * dropped as well.
 *
 * <p>
 * If a <code>&lt;</code> is not followed by any <code>&gt;</code>, the text
 * from that <code>&lt;</code> onwards is not copied to the result.
 * </p>
 *
 * @param  text the text
 * @return the text without markup, or <code>null</code> if the text is
 *         <code>null</code>
 */
@Override
public String stripHtml(String text) {
if (text == null) {
return null;
}
text = stripComments(text);
StringBuilder sb = new StringBuilder(text.length());
// x is the start of the next run of plain text to copy, y is the position
// of the next "<"
int x = 0;
int y = text.indexOf("<");
while (y != -1) {
// Copy the plain text that precedes the tag
sb.append(text.substring(x, y));
// Look for text enclosed by <abc></abc>
if (isTag(_TAG_SCRIPT, text, y + 1)) {
// Move y to the closing script tag, if stripTag finds one
y = stripTag(_TAG_SCRIPT, text, y);
}
else if (isTag(_TAG_STYLE, text, y + 1)) {
// Move y to the closing style tag, if stripTag finds one
y = stripTag(_TAG_STYLE, text, y);
}
// Skip to the end of the tag that starts at y
x = text.indexOf(">", y);
if (x == -1) {
// Unterminated tag: stop without copying the remainder
break;
}
x++;
if (x < y) {
// <b>Hello</b
break;
}
y = text.indexOf("<", x);
}
// Only when the loop ran out of tags is the trailing text copied
if (y == -1) {
sb.append(text.substring(x));
}
return sb.toString();
}
/**
* Encodes the text so that it's safe to use as an HTML input field value.
*
* <p>
* For example, the <code>&</code> character is replaced by
* <code>&</code>.
* </p>
*
* @param text the text
* @return the encoded text that is safe to use as an HTML input field
* value, or <code>null</code> if the text is <code>null</code>
*/
@Override
public String toInputSafe(String text) {
return StringUtil.replace(
text, new char[] {'&', '\"'}, new String[] {"&", """});
}
/**
 * Replaces character references in the text with the characters they stand
 * for. The text is passed to <code>StringUtil.replace</code> with
 * <code>&amp;</code> and <code>;</code> as delimiters and the unescape map
 * of this class as the lookup table; the map is keyed by the reference
 * names without delimiters (for example <code>amp</code>, <code>lt</code>,
 * and <code>#39</code>).
 *
 * @param  text the text to unescape
 * @return the result of the replacement
 */
@Override
public String unescape(String text) {
return StringUtil.replace(text, "&", ";", _unescapeMap);
}
/**
 * Restores CDATA section delimiters that were written with entity
 * references: <code>&amp;lt;![CDATA[</code> becomes
 * <code>&lt;![CDATA[</code> and <code>]]&amp;gt;</code> becomes
 * <code>]]&gt;</code>.
 *
 * @param  text the text containing escaped CDATA delimiters
 * @return the text with the CDATA delimiters restored, or
 *         <code>null</code> if the text is <code>null</code>
 */
@Override
public String unescapeCDATA(String text) {
	if (text == null) {
		return null;
	}

	if (text.length() == 0) {
		return StringPool.BLANK;
	}

	text = StringUtil.replace(text, "&lt;![CDATA[", "<![CDATA[");
	text = StringUtil.replace(text, "]]&gt;", "]]>");

	return text;
}
/**
 * Inserts <code>&lt;wbr/&gt;</code> tags into the text so that long runs of
 * characters get a break opportunity every <code>columns</code> counted
 * characters.
 *
 * <p>
 * The text is scanned for whitespace, <code>&lt;</code>,
 * <code>&amp;</code>, and the end of the input. The characters between two
 * such positions are counted. A space or new line resets the count. An
 * <code>&amp;</code> up to the next <code>;</code> is counted as a single
 * character, and a <code>&lt;</code> up to the next <code>&gt;</code> is
 * skipped without being counted.
 * </p>
 *
 * @param  text the text, which is not checked for <code>null</code> here
 * @param  columns the number of counted characters after which a
 *         <code>&lt;wbr/&gt;</code> tag is inserted
 * @return the text with the inserted tags
 */
@Override
public String wordBreak(String text, int columns) {
StringBundler sb = new StringBundler();
// length: counted characters of the current unbroken run
// lastWrite: index up to which the text has been copied to sb
// pos: index from which characters have not been counted yet
int length = 0;
int lastWrite = 0;
int pos = 0;
Matcher matcher = _pattern.matcher(text);
while (matcher.find()) {
// Ignore matches inside an entity or tag that was already skipped
if (matcher.start() < pos) {
continue;
}
// Emit a break for every full block of columns characters that lies
// before this match
while ((length + matcher.start() - pos) >= columns) {
pos += columns - length;
sb.append(text.substring(lastWrite, pos));
sb.append("<wbr/>");
length = 0;
lastWrite = pos;
}
length += matcher.start() - pos;
String group = matcher.group();
if (group.equals(StringPool.AMPERSAND)) {
// Count "&...;" as one character and continue after the ";"
int x = text.indexOf(StringPool.SEMICOLON, matcher.start());
if (x != -1) {
length++;
pos = x + 1;
}
continue;
}
if (group.equals(StringPool.LESS_THAN)) {
// Skip "<...>" without counting it
int x = text.indexOf(StringPool.GREATER_THAN, matcher.start());
if (x != -1) {
pos = x + 1;
}
continue;
}
if (group.equals(StringPool.SPACE) ||
group.equals(StringPool.NEW_LINE)) {
// A space or new line starts a new run
length = 0;
pos = matcher.start() + 1;
}
}
// Copy whatever follows the last inserted break
sb.append(text.substring(lastWrite));
return sb.toString();
}
/**
 * Returns <code>true</code> if the text contains the tag name at the
 * position, compared case insensitively against the lower case name, and
 * the character following the name is not a letter.
 *
 * @param  tag the lower case characters of the tag name
 * @param  text the text to inspect
 * @param  pos the index in the text at which the tag name should start
 * @return <code>true</code> if the tag name starts at the position and is
 *         not merely the prefix of a longer name; <code>false</code>
 *         otherwise, including when fewer than <code>tag.length + 1</code>
 *         characters remain from the position
 */
protected boolean isTag(char[] tag, String text, int pos) {

	// The name plus one following character must fit into the text

	if ((pos + tag.length + 1) > text.length()) {
		return false;
	}

	for (char expected : tag) {
		char actual = text.charAt(pos++);

		if (Character.toLowerCase(actual) != expected) {
			return false;
		}
	}

	// Check that char after tag is not a letter (i.e. another tag)

	char following = text.charAt(pos);

	return !Character.isLetter(following);
}
/**
 * Finds the closing tag that belongs to the opening tag at the position.
 *
 * @param  tag the lower case characters of the tag name
 * @param  text the text to inspect
 * @param  pos the index of the <code>&lt;</code> of the opening tag
 * @return the index of the <code>&lt;/</code> of the first closing tag with
 *         that name after the opening tag, or <code>pos</code> if the
 *         opening tag is unterminated, is self-closing, or has no closing
 *         tag
 */
protected int stripTag(char[] tag, String text, int pos) {

	// Find end of the tag

	int openingEnd = text.indexOf(">", pos + tag.length);

	if (openingEnd < 0) {
		return pos;
	}

	// Check if preceding character is / (i.e. is this instance of <abc/>)

	if (text.charAt(openingEnd - 1) == '/') {
		return pos;
	}

	// Search for the ending </abc> tag

	int closingStart = text.indexOf("</", openingEnd);

	while (closingStart >= 0) {
		if (isTag(tag, text, closingStart + 2)) {
			return closingStart;
		}

		// Skip past "</"

		closingStart = text.indexOf("</", closingStart + 2);
	}

	return pos;
}
/**
 * Appends the character code as lower case hexadecimal digits to the
 * builder. At least two digits are written; a single digit is preceded by a
 * zero.
 *
 * @param sb the builder to append to
 * @param buffer scratch space that is filled from its end; it must be long
 *        enough for all digits of the character
 * @param c the character whose code is written
 */
private static void _appendHexChars(
	StringBuilder sb, char[] buffer, char c) {

	int remaining = c;
	int start = buffer.length;

	// Fill the buffer backwards, least significant digit first

	do {
		start--;

		buffer[start] = _HEX_DIGITS[remaining & 15];

		remaining >>>= 4;
	}
	while (remaining != 0);

	int digits = buffer.length - start;

	if (digits == 1) {
		sb.append(CharPool.NUMBER_0);
	}

	sb.append(buffer, start, digits);
}
/**
 * Returns <code>true</code> if the character lies in one of the ranges
 * U+007F-U+0084, U+0086-U+009F, or U+FDD0-U+FDEF.
 */
private boolean _isUnicodeCompatibilityCharacter(char c) {
	boolean lowerControlRange = (c >= '\u007f') && (c <= '\u0084');
	boolean upperControlRange = (c >= '\u0086') && (c <= '\u009f');
	boolean fdd0Range = (c >= '\ufdd0') && (c <= '\ufdef');

	return lowerControlRange || upperControlRange || fdd0Range;
}
/**
 * Returns <code>true</code> if the character is a tab, new line, or return,
 * lies between the space character and U+D7FF, lies between U+E000 and
 * U+FFFD, or is a surrogate.
 */
private boolean _isValidXmlCharacter(char c) {
	if ((c == CharPool.TAB) || (c == CharPool.NEW_LINE) ||
		(c == CharPool.RETURN)) {

		return true;
	}

	if ((c >= CharPool.SPACE) && (c <= '\ud7ff')) {
		return true;
	}

	if ((c >= '\ue000') && (c <= '\ufffd')) {
		return true;
	}

	return Character.isSurrogate(c);
}
// Lower case hexadecimal digits, indexed by their value

private static final char[] _HEX_DIGITS = {
	'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd',
	'e', 'f'
};

// Replacements for _MS_WORD_UNICODE; both arrays are passed together to
// StringUtil.replace and have the same length

private static final String[] _MS_WORD_HTML = new String[] {
	"&reg;", StringPool.APOSTROPHE, StringPool.QUOTE, StringPool.QUOTE
};

private static final String[] _MS_WORD_UNICODE =
	new String[] {"\u00ae", "\u2019", "\u201c", "\u201d"};

// Lower case tag names whose content is removed by stripHtml

private static final char[] _TAG_SCRIPT = {'s', 'c', 'r', 'i', 'p', 't'};

private static final char[] _TAG_STYLE = {'s', 't', 'y', 'l', 'e'};

// Flags the characters below 256 that are never escaped; filled by the
// static initializer below

private static final boolean[] _VALID_CHARS = new boolean[256];

// See http://www.w3.org/TR/xpath20/#lexical-structure

private static final char[] _XPATH_TOKENS = {
	'(', ')', '[', ']', '.', '@', ',', ':', '/', '|', '+', '-', '=', '!',
	'<', '>', '*', '$', '"', '"', ' ', 9, 10, 13, 133, 8232
};

// Maps reference names (without "&" and ";") to the unescaped text

private static final Map<String, String> _unescapeMap = new HashMap<>();

static {
	_unescapeMap.put("#34", "\"");
	_unescapeMap.put("#35", "#");
	_unescapeMap.put("#37", "%");
	_unescapeMap.put("#39", "'");
	_unescapeMap.put("#40", "(");
	_unescapeMap.put("#41", ")");
	_unescapeMap.put("#43", "+");
	_unescapeMap.put("#44", ",");
	_unescapeMap.put("#45", "-");
	_unescapeMap.put("#59", ";");
	_unescapeMap.put("#61", "=");
	_unescapeMap.put("amp", "&");
	_unescapeMap.put("gt", ">");
	_unescapeMap.put("lt", "<");
	_unescapeMap.put("rsquo", "\u2019");

	// Letters, digits, hyphen, and underscore are left unescaped

	for (int i = 0; i < _VALID_CHARS.length; i++) {
		if (Character.isLetterOrDigit(i)) {
			_VALID_CHARS[i] = true;
		}
	}

	_VALID_CHARS['-'] = true;
	_VALID_CHARS['_'] = true;
}

// Matches whitespace, "<", "&", or the end of the input. A compiled Pattern
// is immutable, so one shared instance is compiled for the class instead of
// one per HtmlImpl instance

private static final Pattern _pattern = Pattern.compile("([\\s<&]|$)");
}