/*
 * Created by Vinta Chen on 2014/11/05.
 */
package org.b3log.symphony.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Paranoid text spacing for good readability: automatically inserts whitespace between CJK (Chinese, Japanese,
 * Korean) characters and half-width English letters, digits and symbols.
 *
 * <p>
 * The whitespace between English and Chinese characters is called "Pangu Spacing" by sinologists, since it separates
 * the confusion between full-width and half-width characters. Studies showed that those who dislike adding whitespace
 * between English and Chinese characters also have relationship problems. Almost 70 percent of them will get married
 * to the one they don't love; the rest can only leave their heritage to their cat. Indeed, love and writing need some
 * space in good time.
 * </p>
 *
 * <p><a href="https://hacpai.com/article/1472639605458">Why can't you just add a space?</a></p>
 *
 * @author Vinta Chen
 * @author <a href="http://88250.b3log.org">Liang Ding</a>
 * @version 1.0.0.0, Aug 31, 2016
 * @since 1.6.0
 */
public class Pangu {

    /**
     * Creates a {@code Pangu} object. The spacing logic itself is exposed through the static method
     * {@link #spacingText(String)}, so an instance is not required to use it.
     */
    public Pangu() {
    }

    /*
     * Some capturing group patterns for convenience.
     *
     * CJK: Chinese, Japanese, Korean
     * ANS: Alphabet, Number, Symbol
     */

    /** Capturing group matching exactly one CJK character (Hiragana, Katakana, Bopomofo or CJK ideographs). */
    private static final String CJK =
            "([\\p{InHiragana}\\p{InKatakana}\\p{InBopomofo}\\p{InCJKCompatibilityIdeographs}\\p{InCJKUnifiedIdeographs}])";

    /** Capturing group matching one straight double or single quote. */
    private static final String QUOTE = "([\"'])";

    /** Capturing group matching one opening or closing bracket, including angle brackets. */
    private static final String BRACKET = "([\\(\\){}\\[\\]<>])";

    /** A CJK character directly followed by a letter, digit or symbol. */
    private static final Pattern CJK_ANS = Pattern.compile(
            CJK + "([a-z0-9`~@\\$%\\^&\\*\\-_\\+=\\|\\\\/])",
            Pattern.CASE_INSENSITIVE
    );

    /** A letter, digit or symbol directly followed by a CJK character. */
    private static final Pattern ANS_CJK = Pattern.compile(
            "([a-z0-9`~!\\$%\\^&\\*\\-_\\+=\\|\\\\;:,\\./\\?])" + CJK,
            Pattern.CASE_INSENSITIVE
    );

    /** A CJK character directly followed by a quote. */
    private static final Pattern CJK_QUOTE = Pattern.compile(CJK + QUOTE);

    /** A quote directly followed by a CJK character. */
    private static final Pattern QUOTE_CJK = Pattern.compile(QUOTE + CJK);

    /** A quoted run, capturing the whitespace just inside both quotes so that it can be dropped. */
    private static final Pattern FIX_QUOTE = Pattern.compile("([\"'])(\\s*)(.+?)(\\s*)([\"'])");

    /** A bracketed run with a CJK character directly on both sides. */
    private static final Pattern CJK_BRACKET_CJK = Pattern.compile(
            CJK + "([\\({\\[]+(.*?)[\\)}\\]]+)" + CJK
    );

    /** A CJK character directly followed by a bracket. */
    private static final Pattern CJK_BRACKET = Pattern.compile(CJK + BRACKET);

    /** A bracket directly followed by a CJK character. */
    private static final Pattern BRACKET_CJK = Pattern.compile(BRACKET + CJK);

    /**
     * A bracketed run, capturing the whitespace just inside both brackets so that it can be dropped.
     *
     * <p>
     * The opening class contains only {@code (}, <code>{</code> and {@code [}. It must not contain {@code )}:
     * otherwise a closing parenthesis is taken for an opener and the whitespace following it is removed, e.g.
     * {@code "1) foo (bar)"} would be turned into {@code "1)foo (bar)"}.
     * </p>
     */
    private static final Pattern FIX_BRACKET = Pattern.compile("([\\({\\[]+)(\\s*)(.+?)(\\s*)([\\)}\\]]+)");

    /** A CJK character directly followed by a hash and at least one non-whitespace character. */
    private static final Pattern CJK_HASH = Pattern.compile(CJK + "(#(\\S+))");

    /** At least one non-whitespace character and a hash, directly followed by a CJK character. */
    private static final Pattern HASH_CJK = Pattern.compile("((\\S+)#)" + CJK);

    /**
     * Performs a paranoid text spacing on {@code text}.
     *
     * <p>
     * The rules are applied in a fixed order: quotes, brackets, hashes, and finally letters/digits/symbols. Each
     * rule works on the output of the previous one.
     * </p>
     *
     * @param text the string you want to process, must not be {@code null}.
     * @return a comfortable and readable version of {@code text} for paranoiac.
     * @throws NullPointerException if {@code text} is {@code null}
     */
    public static String spacingText(String text) {
        if (text == null) {
            throw new NullPointerException("text must not be null");
        }

        // CJK and quotes: pad both sides of every quote, then strip the padding that ended up inside a quote pair.
        text = CJK_QUOTE.matcher(text).replaceAll("$1 $2");
        text = QUOTE_CJK.matcher(text).replaceAll("$1 $2");
        text = FIX_QUOTE.matcher(text).replaceAll("$1$3$5");

        // CJK and brackets: first try the "CJK(...)CJK" form, which pads the bracket pair as a whole.
        final String beforeBrackets = text;
        text = CJK_BRACKET_CJK.matcher(text).replaceAll("$1 $2 $4");
        if (beforeBrackets.equals(text)) {
            // The single-sided rules only run when the two-sided rule changed nothing anywhere in the text.
            text = CJK_BRACKET.matcher(text).replaceAll("$1 $2");
            text = BRACKET_CJK.matcher(text).replaceAll("$1 $2");
        }
        // Strip the padding that ended up just inside a bracket pair.
        text = FIX_BRACKET.matcher(text).replaceAll("$1$3$5");

        // CJK and hash. In HASH_CJK group 2 is nested inside group 1, so the CJK character is group 3.
        text = CJK_HASH.matcher(text).replaceAll("$1 $2");
        text = HASH_CJK.matcher(text).replaceAll("$1 $3");

        // CJK and ANS
        text = CJK_ANS.matcher(text).replaceAll("$1 $2");
        text = ANS_CJK.matcher(text).replaceAll("$1 $2");

        return text;
    }
}