////////////////////////////////////////////////////////////////////////////////
// checkstyle: Checks Java source code for adherence to a set of rules.
// Copyright (C) 2001-2017 the original author or authors.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
////////////////////////////////////////////////////////////////////////////////

package com.puppycrawl.tools.checkstyle.checks;

import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.puppycrawl.tools.checkstyle.api.AbstractCheck;
import com.puppycrawl.tools.checkstyle.api.DetailAST;
import com.puppycrawl.tools.checkstyle.api.TextBlock;
import com.puppycrawl.tools.checkstyle.api.TokenTypes;
import com.puppycrawl.tools.checkstyle.utils.CommonUtils;

/**
 * <p>
 * Restrict using <a href =
 * "http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
 * Unicode escapes</a> (such as <code>\u221e</code>).
 * It is possible to allow using escapes for
 * <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
 * non-printable(control) characters</a>.
 * Also, this check can be configured to allow using escapes
 * if a trailing comment is present. By the option it is possible to
 * allow using escapes if literal contains only them. By the option it
 * is possible to allow using escapes for non-printable (whitespace) characters.
 * </p>
 * <p>
 * An escape written with a repeated marker (for example
 * <code>&#92;uu03bc</code>) is treated exactly like its single-marker form.
 * </p>
 * <p>
 * Examples of using Unicode:</p>
 * <pre>
 * String unitAbbrev = "μs"; // Best: perfectly clear even without a comment.
 * String unitAbbrev = "\u03bcs"; // Poor: the reader has no idea what this is.
 * </pre>
 * <p>
 * An example of how to configure the check is:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"/&gt;
 * </pre>
 * <p>
 * An example of non-printable(control) characters.
 * </p>
 * <pre>
 * return '\ufeff' + content; // byte order mark
 * </pre>
 * <p>
 * An example of how to configure the check to allow using escapes
 * for non-printable(control) characters:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowEscapesForControlCharacters" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>
 * Example of using escapes with a trailing comment:
 * </p>
 * <pre>
 * String unitAbbrev = "\u03bcs"; // Greek letter mu, "s"
 * </pre>
 * <p>An example of how to configure the check to allow using escapes
 * if a trailing comment is present:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowByTailComment" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>Example of using escapes if literal contains only them:
 * </p>
 * <pre>
 * String unitAbbrev = "\u03bc\u03bc\u03bc";
 * </pre>
 * <p>An example of how to configure the check to allow escapes
 * if literal contains only them:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowIfAllCharactersEscaped" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 * <p>An example of how to configure the check to allow non-printable escapes:
 * </p>
 * <pre>
 * &lt;module name="AvoidEscapedUnicodeCharacters"&gt;
 *     &lt;property name="allowNonPrintableEscapes" value="true"/&gt;
 * &lt;/module&gt;
 * </pre>
 *
 * @author maxvetrenko
 *
 */
public class AvoidEscapedUnicodeCharactersCheck
    extends AbstractCheck {

    /**
     * A key is pointing to the warning message text in "messages.properties"
     * file.
     */
    public static final String MSG_KEY = "forbid.escaped.unicode.char";

    /** Regular expression for Unicode chars. */
    private static final Pattern UNICODE_REGEXP = Pattern.compile("\\\\u[a-fA-F0-9]{4}");

    /**
     * Regular expression for a Unicode escape marker: a backslash that is not
     * itself escaped (it is preceded by an even number of backslashes, captured
     * as group 1) followed by one or more {@code u} characters. The language
     * specification allows the {@code u} to be repeated, so the marker is
     * collapsed to a single {@code u} before the other patterns are applied.
     *
     * @see <a href="http://docs.oracle.com/javase/specs/jls/se8/html/jls-3.html#jls-3.3">
     *     JLS 3.3 Unicode Escapes</a>
     */
    private static final Pattern UNICODE_MARKER =
            Pattern.compile("(?<!\\\\)((?:\\\\\\\\)*)\\\\u+");

    /** Replacement for {@link #UNICODE_MARKER}: the kept prefix and a single-u marker. */
    private static final String SINGLE_UNICODE_MARKER = "$1\\\\u";

    /**
     * Regular expression Unicode control characters.
     *
     * @see <a href="https://en.wiktionary.org/wiki/Appendix:Control_characters">
     *     Appendix:Control characters</a>
     */
    private static final Pattern UNICODE_CONTROL = Pattern.compile("\\\\(u|U)"
            + "(00[0-1][0-9A-Fa-f]|00[8-9][0-9A-Fa-f]|00(a|A)(d|D)|034(f|F)|070(f|F)"
            + "|180(e|E)|200[b-fB-F]|202[a-eA-E]|206[0-4a-fA-F]"
            + "|[fF]{3}[9a-bA-B]|[fF][eE][fF]{2})");

    /**
     * Regular expression for all escaped chars. Every alternative consumes at
     * least one character, so only a non-empty sequence of escapes matches.
     */
    private static final Pattern ALL_ESCAPED_CHARS = Pattern.compile("^((\\\\u)[a-fA-F0-9]{4}"
            + "|\\\\b|\\\\t|\\\\n|\\\\f|\\\\r|\\\\|\"|\')+$");

    /** Regular expression for escaped backslash. */
    private static final Pattern ESCAPED_BACKSLASH = Pattern.compile("\\\\\\\\");

    /** Regular expression for non-printable unicode chars. */
    private static final Pattern NON_PRINTABLE_CHARS = Pattern.compile("\\\\u1680|\\\\u2028"
            + "|\\\\u2029|\\\\u205(f|F)|\\\\u3000|\\\\u2007|\\\\u2000|\\\\u200(a|A)"
            + "|\\\\u007(F|f)|\\\\u009(f|F)|\\\\u(f|F){4}|\\\\u007(F|f)|\\\\u00(a|A)(d|D)"
            + "|\\\\u0600|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)"
            + "|\\\\u2000|\\\\u2028|\\\\u205(f|F)|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069"
            + "|\\\\u206(a|A)|\\\\u(d|D)800|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9"
            + "|\\\\u(f|F){3}(a|A)|\\\\u0020|\\\\u00(a|A)0|\\\\u00(a|A)(d|D)|\\\\u0604"
            + "|\\\\u061(c|C)|\\\\u06(d|D){2}|\\\\u070(f|F)|\\\\u1680|\\\\u180(e|E)|\\\\u200(f|F)"
            + "|\\\\u202(f|F)|\\\\u2064|\\\\u2066|\\\\u2067|\\\\u2068|\\\\u2069|\\\\u206(f|F)"
            + "|\\\\u(f|F)8(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){3}9|\\\\u(f|F){3}(b|B)"
            + "|\\\\u05(d|D)0|\\\\u05(f|F)3|\\\\u0600|\\\\u0750|\\\\u0(e|E)00|\\\\u1(e|E)00"
            + "|\\\\u2100|\\\\u(f|F)(b|B)50|\\\\u(f|F)(e|E)70|\\\\u(F|f){2}61|\\\\u04(f|F)9"
            + "|\\\\u05(b|B)(e|E)|\\\\u05(e|E)(a|A)|\\\\u05(f|F)4|\\\\u06(f|F){2}"
            + "|\\\\u077(f|F)|\\\\u0(e|E)7(f|F)|\\\\u20(a|A)(f|F)|\\\\u213(a|A)|\\\\u0000"
            + "|\\\\u(f|F)(d|D)(f|F){2}|\\\\u(f|F)(e|E)(f|F){2}|\\\\u(f|F){2}(d|D)(c|C)"
            + "|\\\\u2002|\\\\u0085|\\\\u200(a|A)|\\\\u2005|\\\\u2000|\\\\u2029|\\\\u000(B|b)"
            + "|\\\\u2008|\\\\u2003|\\\\u205(f|F)|\\\\u1680|\\\\u0009|\\\\u0020|\\\\u2006"
            + "|\\\\u2001|\\\\u202(f|F)|\\\\u00(a|A)0|\\\\u000(c|C)|\\\\u2009|\\\\u2004|\\\\u2028"
            + "|\\\\u2028|\\\\u2007|\\\\u2004|\\\\u2028|\\\\u2007|\\\\u2025"
            + "|\\\\u(f|F){2}0(e|E)|\\\\u(f|F){2}61");

    /** Cpp style comments. */
    private Map<Integer, TextBlock> singlelineComments;
    /** C style comments. */
    private Map<Integer, List<TextBlock>> blockComments;

    /** Allow use escapes for non-printable(control) characters. */
    private boolean allowEscapesForControlCharacters;

    /** Allow use escapes if a trailing comment is present. */
    private boolean allowByTailComment;

    /** Allow if all characters in literal are escaped. */
    private boolean allowIfAllCharactersEscaped;

    /** Allow escapes matched by {@link #NON_PRINTABLE_CHARS}. */
    private boolean allowNonPrintableEscapes;

    /**
     * Set allowEscapesForControlCharacters.
     * @param allow user's value.
     */
    public final void setAllowEscapesForControlCharacters(boolean allow) {
        allowEscapesForControlCharacters = allow;
    }

    /**
     * Set allowByTailComment.
     * @param allow user's value.
     */
    public final void setAllowByTailComment(boolean allow) {
        allowByTailComment = allow;
    }

    /**
     * Set allowIfAllCharactersEscaped.
     * @param allow user's value.
     */
    public final void setAllowIfAllCharactersEscaped(boolean allow) {
        allowIfAllCharactersEscaped = allow;
    }

    /**
     * Set allowNonPrintableEscapes.
     * @param allow user's value.
     */
    public final void setAllowNonPrintableEscapes(boolean allow) {
        allowNonPrintableEscapes = allow;
    }

    @Override
    public int[] getDefaultTokens() {
        return getAcceptableTokens();
    }

    @Override
    public int[] getAcceptableTokens() {
        return new int[] {TokenTypes.STRING_LITERAL, TokenTypes.CHAR_LITERAL};
    }

    @Override
    public int[] getRequiredTokens() {
        return getAcceptableTokens();
    }

    @Override
    public void beginTree(DetailAST rootAST) {
        singlelineComments = getFileContents().getSingleLineComments();
        blockComments = getFileContents().getBlockComments();
    }

    @Override
    public void visitToken(DetailAST ast) {
        // Collapse repeated 'u' markers first so that every pattern below,
        // each written for a single 'u', sees a uniform form of the escape.
        final String literal = collapseUnicodeMarkers(ast.getText());

        if (hasUnicodeChar(literal) && !(allowByTailComment && hasTrailComment(ast)
                || isAllCharactersEscaped(literal)
                || allowEscapesForControlCharacters
                        && isOnlyUnicodeValidChars(literal, UNICODE_CONTROL)
                || allowNonPrintableEscapes
                        && isOnlyUnicodeValidChars(literal, NON_PRINTABLE_CHARS))) {
            log(ast.getLineNo(), MSG_KEY);
        }
    }

    /**
     * Rewrites every Unicode escape marker of the literal so that it carries
     * exactly one {@code u}. A literal whose markers already have a single
     * {@code u} is returned with the same content.
     * @param literal String literal as it appears in the source.
     * @return the literal with repeated {@code u} markers collapsed.
     */
    private static String collapseUnicodeMarkers(String literal) {
        return UNICODE_MARKER.matcher(literal).replaceAll(SINGLE_UNICODE_MARKER);
    }

    /**
     * Checks if literal has Unicode chars.
     * @param literal String literal.
     * @return true if literal has Unicode chars.
     */
    private static boolean hasUnicodeChar(String literal) {
        // An escaped backslash followed by 'u' is plain text, not an escape,
        // so escaped backslashes are removed before searching.
        final String literalWithoutEscapedBackslashes =
                ESCAPED_BACKSLASH.matcher(literal).replaceAll("");
        return UNICODE_REGEXP.matcher(literalWithoutEscapedBackslashes).find();
    }

    /**
     * Check if every Unicode escape of the literal is matched by the given
     * pattern, by comparing the number of matches of both patterns.
     * @param literal String literal.
     * @param pattern RegExp for valid characters.
     * @return true, if the literal has as many matches of {@code pattern}
     *     as it has Unicode escapes.
     */
    private static boolean isOnlyUnicodeValidChars(String literal, Pattern pattern) {
        final int unicodeMatchesCounter =
                countMatches(UNICODE_REGEXP, literal);
        final int unicodeValidMatchesCounter =
                countMatches(pattern, literal);
        return unicodeMatchesCounter - unicodeValidMatchesCounter == 0;
    }

    /**
     * Check if a trailing comment is present after ast token.
     * @param ast current token.
     * @return true if a trailing comment is present after ast token.
     */
    private boolean hasTrailComment(DetailAST ast) {
        boolean result = false;
        final int lineNo = ast.getLineNo();
        if (singlelineComments.containsKey(lineNo)) {
            result = true;
        }
        else {
            final List<TextBlock> commentList = blockComments.get(lineNo);
            if (commentList != null) {
                // Only the last block comment starting on the line can be trailing.
                final TextBlock comment = commentList.get(commentList.size() - 1);
                final String line = getLines()[lineNo - 1];
                result = isTrailingBlockComment(comment, line);
            }
        }
        return result;
    }

    /**
     * Whether the C style comment is trailing.
     * @param comment the comment to check.
     * @param line the line where the comment starts.
     * @return true if the comment spans more than one line or nothing but
     *     blanks follows it on its line.
     */
    private static boolean isTrailingBlockComment(TextBlock comment, String line) {
        return comment.getText().length != 1
            || CommonUtils.isBlank(line.substring(comment.getEndColNo() + 1));
    }

    /**
     * Count regexp matches into String literal.
     * @param pattern pattern.
     * @param target String literal.
     * @return count of regexp matches.
     */
    private static int countMatches(Pattern pattern, String target) {
        int matcherCounter = 0;
        final Matcher matcher = pattern.matcher(target);
        while (matcher.find()) {
            matcherCounter++;
        }
        return matcherCounter;
    }

    /**
     * Checks if all characters in String literal are escaped. Always false
     * unless the allowIfAllCharactersEscaped option is enabled.
     * @param literal current literal, including its surrounding quotes.
     * @return true if all characters in String literal are escaped.
     */
    private boolean isAllCharactersEscaped(String literal) {
        // The first and last characters are the quotes of the literal.
        return allowIfAllCharactersEscaped
                && ALL_ESCAPED_CHARS.matcher(literal.substring(1,
                        literal.length() - 1)).find();
    }
}