/*
* Copyright 2008 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.template.soy.base.internal;
import static java.nio.charset.StandardCharsets.UTF_8;

import com.google.common.base.CharMatcher;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import com.google.common.hash.Hashing;
import java.io.File;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
/**
 * Base utilities for Soy code.
 *
 * <p>Important: Do not use outside of Soy code (treat as superpackage-private).
 *
 * <p>All members are static; the class cannot be instantiated.
 */
public final class BaseUtils {

  private BaseUtils() {}

  /**
   * Used by {@code ensureDirsExistInPath()}. Keeps track of directory paths that are known to
   * exist. Access to this set is not synchronized.
   */
  private static final Set<String> KNOWN_EXISTING_DIRS = Sets.newHashSet();

  /** Regular expression for an identifier. */
  public static final String IDENT_RE = "[a-zA-Z_][a-zA-Z_0-9]*";

  /** Pattern for an identifier. */
  private static final Pattern IDENT_PATTERN = Pattern.compile(IDENT_RE);

  /** Pattern for an identifier with leading dot. */
  private static final Pattern IDENT_WITH_LEADING_DOT_PATTERN = Pattern.compile("[.]" + IDENT_RE);

  /** Regular expression for a dotted identifier. */
  public static final String DOTTED_IDENT_RE = IDENT_RE + "(?:[.]" + IDENT_RE + ")*";

  /** Pattern for a dotted identifier. */
  private static final Pattern DOTTED_IDENT_PATTERN = Pattern.compile(DOTTED_IDENT_RE);

  /** Pattern for a run of leading underscores or a run of trailing underscores. */
  private static final Pattern LEADING_OR_TRAILING_UNDERSCORE_PATTERN =
      Pattern.compile("^_+|_+\\Z");

  /**
   * Pattern for places to insert underscores to make an identifier name underscore-separated.
   *
   * <p>Compiled with {@link Pattern#COMMENTS}, so the spaces inside the expression are ignored.
   * Every alternative consists only of lookarounds, so each match is zero-width.
   */
  private static final Pattern WORD_BOUNDARY_IN_IDENT_PATTERN =
      Pattern.compile(
          // <letter>_<upper><lower>
          "(?<= [a-zA-Z])(?= [A-Z][a-z])"
              // <letter>_<digit>
              + "| (?<= [a-zA-Z])(?= [0-9])"
              // <digit>_<letter>
              + "| (?<= [0-9])(?= [a-zA-Z])",
          Pattern.COMMENTS);

  /**
   * Pattern for two or more consecutive underscores (the spaces in the expression are ignored
   * because of {@link Pattern#COMMENTS}).
   */
  private static final Pattern CONSECUTIVE_UNDERSCORES_PATTERN =
      Pattern.compile("_ _ _*", Pattern.COMMENTS);

  /** Hex digits for Soy strings (requires upper-case hex digits). */
  private static final char[] HEX_DIGITS = {
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
  };

  /** Matches whitespace, commas and colons; used to decide whether a token needs quoting. */
  private static final CharMatcher WHITESPACE_COMMA_OR_COLON =
      CharMatcher.whitespace().or(CharMatcher.is(',')).or(CharMatcher.is(':')).precomputed();

  /**
   * Ensures that the directories in the given path exist, creating them if necessary.
   *
   * <p>Note: If the path does not end with the separator char (slash in Linux), then the name at
   * the end is assumed to be the file name, so directories are only created down to its parent.
   *
   * <p>Directory paths that have been handled successfully are remembered so that later calls
   * for the same directory return without touching the file system. A failure to create the
   * directory is not reported to the caller, but the path is then not remembered, so the next
   * call attempts the creation again.
   *
   * @param path The path for which to ensure directories exist.
   * @throws AssertionError If the path is null or empty.
   */
  public static void ensureDirsExistInPath(String path) {
    if (path == null || path.isEmpty()) {
      throw new AssertionError("ensureDirsExistInPath called with null or empty path.");
    }

    String dirPath =
        (path.charAt(path.length() - 1) == File.separatorChar)
            ? path.substring(0, path.length() - 1)
            : new File(path).getParent();
    if (dirPath == null || KNOWN_EXISTING_DIRS.contains(dirPath)) {
      return; // nothing to create, or already known to exist
    }

    File dir = new File(dirPath);
    // mkdirs() returns false both on failure and when the directory already exists, so fall back
    // to isDirectory(). Only cache the path once the directory is really there; caching a failed
    // creation would make every later call skip it silently.
    if (dir.mkdirs() || dir.isDirectory()) {
      KNOWN_EXISTING_DIRS.add(dirPath);
    }
  }

  /**
   * Determines whether the given string is an identifier.
   *
   * <p>An identifier must start with a letter or underscore and must only contain letters, digits,
   * and underscores (i.e. it must match the regular expression {@code [A-Za-z_][A-Za-z_0-9]*}).
   *
   * @param s The string to check.
   * @return True if the given string is an identifier.
   */
  public static boolean isIdentifier(String s) {
    return IDENT_PATTERN.matcher(s).matches();
  }

  /**
   * Determines whether the given string is a dot followed by an identifier.
   *
   * @param s The string to check.
   * @return True if the given string is a dot followed by an identifier.
   */
  public static boolean isIdentifierWithLeadingDot(String s) {
    return IDENT_WITH_LEADING_DOT_PATTERN.matcher(s).matches();
  }

  /**
   * Determines whether the given string is a dotted identifier (e.g. {@code boo.foo0._goo}). A
   * dotted identifier is not required to have dots (i.e. a simple identifier qualifies as a dotted
   * identifier).
   *
   * @param s The string to check.
   * @return True if the given string is a dotted identifier (e.g. {@code boo.foo0._goo}).
   */
  public static boolean isDottedIdentifier(String s) {
    return DOTTED_IDENT_PATTERN.matcher(s).matches();
  }

  /**
   * Gets the part after the last dot in a dotted identifier. If there are no dots, returns the
   * whole input string.
   *
   * <p>Important: The input must be a dotted identifier. This is not checked.
   *
   * @param dottedIdent The dotted identifier.
   * @return The substring following the last dot, or the whole input if it contains no dot.
   */
  public static String extractPartAfterLastDot(String dottedIdent) {
    int lastDotIndex = dottedIdent.lastIndexOf('.');
    return (lastDotIndex == -1) ? dottedIdent : dottedIdent.substring(lastDotIndex + 1);
  }

  /**
   * Converts an identifier to upper-underscore format (e.g. {@code booFoo} becomes {@code
   * BOO_FOO}). The identifier must start with a letter or underscore and must only contain
   * letters, digits, and underscores (i.e. it must match the regular expression {@code
   * [A-Za-z_][A-Za-z_0-9]*}).
   *
   * <p>Leading and trailing underscores are removed, underscores are inserted at word boundaries
   * (letter/digit transitions and before an upper-case letter that starts a new word), runs of
   * underscores are collapsed to one, and the result is upper-cased.
   *
   * @param ident The identifier to convert.
   * @return The identifier in upper-underscore format.
   */
  public static String convertToUpperUnderscore(String ident) {
    ident = LEADING_OR_TRAILING_UNDERSCORE_PATTERN.matcher(ident).replaceAll("");
    ident = WORD_BOUNDARY_IN_IDENT_PATTERN.matcher(ident).replaceAll("_");
    ident = CONSECUTIVE_UNDERSCORES_PATTERN.matcher(ident).replaceAll("_");
    // Upper-case independently of the JVM's default locale; with e.g. a Turkish default locale,
    // the no-argument toUpperCase() maps 'i' to a dotted capital I, which is not an identifier
    // character.
    return ident.toUpperCase(Locale.ROOT);
  }

  /**
   * Builds a Soy string literal for this string value (including the surrounding single quotes).
   * Note that Soy string syntax is a subset of JS string syntax, so the result should also be a
   * valid JS string.
   *
   * <p>Adapted from StringUtil.javaScriptEscape().
   *
   * @param value The string value to escape.
   * @param shouldEscapeToAscii Whether to escape non-ASCII characters as Unicode hex escapes
   *     (backslash + 'u' + 4 hex digits).
   * @return A Soy string literal for this string value (including the surrounding single quotes).
   */
  public static String escapeToSoyString(String value, boolean shouldEscapeToAscii) {

    // StringUtil.javaScriptEscape() is meant to be compatible with JS string syntax, which is a
    // superset of the Soy expression string syntax, so we can't depend on it to properly escape a
    // Soy expression string literal. For example, they switched the default character escaping
    // to octal to save a few bytes, but octal escapes are not allowed in Soy syntax. I'm rewriting
    // the code here in a correct way for Soy.

    int len = value.length();
    // Leave 1/8 headroom for escapes plus room for the two quotes. Computed in long and clamped
    // so that a very long input cannot overflow into a negative capacity.
    int capacity = (int) Math.min((long) len * 9 / 8 + 2, Integer.MAX_VALUE - 8);
    StringBuilder out = new StringBuilder(capacity);
    out.append('\'');

    int codePoint;
    // Iterate by code point so that a supplementary character is handled as one unit.
    for (int i = 0; i < len; i += Character.charCount(codePoint)) {
      codePoint = value.codePointAt(i);

      switch (codePoint) {
        case '\n':
          out.append("\\n");
          break;
        case '\r':
          out.append("\\r");
          break;
        case '\t':
          out.append("\\t");
          break;
        case '\b':
          out.append("\\b");
          break;
        case '\f':
          out.append("\\f");
          break;
        case '\\':
          out.append("\\\\");
          break;
        case '\'':
          out.append("\\'");
          break;
        case '"':
          out.append('"');
          break; // note: don't escape double quotes in Soy strings
        default:
          // If shouldEscapeToAscii, then hex escape everything below 0x20 and everything from
          // 0x7F (DEL) upwards.
          if (shouldEscapeToAscii && (codePoint < 0x20 || codePoint >= 0x7F)) {
            appendHexEscape(out, codePoint);
          } else {
            out.appendCodePoint(codePoint);
          }
          break;
      }
    }

    out.append('\'');
    return out.toString();
  }

  /**
   * Appends the Unicode hex escape sequence for the given code point (backslash + 'u' + 4 hex
   * digits) to the given StringBuilder.
   *
   * <p>Note: May append 2 escape sequences (surrogate pair) in the case of a supplementary
   * character (outside the Unicode BMP).
   *
   * <p>Adapted from StringUtil.appendHexJavaScriptRepresentation().
   *
   * @param out The StringBuilder to append to.
   * @param codePoint The Unicode code point whose hex escape sequence to append.
   */
  public static void appendHexEscape(StringBuilder out, int codePoint) {
    if (Character.isSupplementaryCodePoint(codePoint)) {
      // Handle supplementary unicode values which are not representable in
      // javascript. We deal with these by escaping them as two 4B sequences
      // so that they will round-trip properly when sent from java to javascript
      // and back.
      char[] surrogates = Character.toChars(codePoint);
      appendHexEscape(out, surrogates[0]);
      appendHexEscape(out, surrogates[1]);
    } else {
      // Emit the low 16 bits as four upper-case hex digits, most significant nibble first.
      out.append("\\u")
          .append(HEX_DIGITS[(codePoint >>> 12) & 0xF])
          .append(HEX_DIGITS[(codePoint >>> 8) & 0xF])
          .append(HEX_DIGITS[(codePoint >>> 4) & 0xF])
          .append(HEX_DIGITS[codePoint & 0xF]);
    }
  }

  /**
   * Computes the SHA-1 hash value of the input string's UTF-8 representation and returns the first
   * numBits bits of the result as a hex value in string form.
   *
   * @param strToHash The string to compute SHA-1 of.
   * @param numBits The number of bits worth to return. Must be a positive number at most 160 and
   *     divisible by 8 (since we process the result 8 bits at a time).
   * @return The partial SHA-1 hash value as a hex string of {@code numBits / 4} characters.
   * @throws IllegalArgumentException If numBits violates the constraints above.
   */
  public static String computePartialSha1AsHexString(String strToHash, int numBits) {
    Preconditions.checkArgument(
        numBits > 0 && numBits <= 160 && numBits % 8 == 0,
        "numBits must be a positive multiple of 8 that is at most 160, got: %s",
        numBits);
    int numBytes = numBits / 8;
    // Each byte of the hash is rendered as two hex characters.
    return Hashing.sha1().hashString(strToHash, UTF_8).toString().substring(0, numBytes * 2);
  }

  /**
   * A helper method for formatting javacc ParseExceptions.
   *
   * <p>The result has the form {@code parse error at '<errorToken>'}, followed, when there is at
   * least one expected token, by {@code : expected } and the expected tokens separated by {@code
   * ", "}, with {@code "or "} placed in front of the last token when there is more than one.
   *
   * @param errorToken The piece of text that we were unable to parse.
   * @param expectedTokens The set of formatted tokens that we were expecting next.
   * @return The formatted error message.
   */
  public static String formatParseExceptionDetails(String errorToken, List<String> expectedTokens) {
    // quotes/normalize the expected tokens before rendering, just in case after normalization some
    // can be deduplicated.
    ImmutableSet.Builder<String> normalizedTokensBuilder = ImmutableSet.builder();
    for (String t : expectedTokens) {
      normalizedTokensBuilder.add(maybeQuoteForParseError(t));
    }
    List<String> normalizedTokens = normalizedTokensBuilder.build().asList();

    String details;
    int numExpectedTokens = normalizedTokens.size();
    if (numExpectedTokens != 0) {
      StringBuilder builder = new StringBuilder(": expected ");
      for (int i = 0; i < numExpectedTokens; i++) {
        builder.append(normalizedTokens.get(i));
        if (i != numExpectedTokens - 1) {
          builder.append(", ");
        }
        if (i == numExpectedTokens - 2) {
          // The next token is the last one.
          builder.append("or ");
        }
      }
      details = builder.toString();
    } else {
      details = "";
    }

    return String.format(
        "parse error at '%s'%s", escapeWhitespaceForErrorPrinting(errorToken), details);
  }

  /**
   * Normalizes one expected token for display in a parse error message.
   *
   * @param token The token as supplied by the parser.
   * @return The token, unquoted or single-quoted as described in the body, with CR, LF and tab
   *     rendered as visible escape sequences.
   */
  private static String maybeQuoteForParseError(String token) {
    // The literal matches are surrounded in double quotes; remove them.
    if (token.length() > 1 && token.charAt(0) == '"' && token.charAt(token.length() - 1) == '"') {
      token = token.substring(1, token.length() - 1);
    }
    // If the token contains a whitespace character, a comma or a colon anywhere, wrap it in
    // single quotes, because those characters could create ambiguity in the error messages.
    if (WHITESPACE_COMMA_OR_COLON.matchesAnyOf(token)) {
      token = "'" + token + "'";
    }
    return escapeWhitespaceForErrorPrinting(token);
  }

  /**
   * Replaces each carriage return, line feed and tab character with its two-character escape
   * sequence (a backslash followed by {@code r}, {@code n} or {@code t}).
   *
   * @param s The string to escape.
   * @return The string with those three characters made visible.
   */
  private static String escapeWhitespaceForErrorPrinting(String s) {
    // Literal replacement; no regular expression is needed here.
    return s.replace("\r", "\\r").replace("\n", "\\n").replace("\t", "\\t");
  }
}