/************************************************************************** OmegaT - Computer Assisted Translation (CAT) tool with fuzzy matching, translation memory, keyword search, glossaries, and translation leveraging into updated projects. Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk 2007 Didier Briel and Tiago Saboga 2007 Zoltan Bartko - bartkozoltan@bartkozoltan.com 2008 Andrzej Sawula 2010-2013 Alex Buloichik 2015 Zoltan Bartko, Aaron Madlon-Kay 2016 Aaron Madlon-Kay Home page: http://www.omegat.org/ Support center: http://groups.yahoo.com/group/OmegaT/ This file is part of OmegaT. OmegaT is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. OmegaT is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see <http://www.gnu.org/licenses/>. **************************************************************************/ package org.omegat.util; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.charset.Charset; import java.text.MessageFormat; import java.text.Normalizer; import java.util.Arrays; import java.util.Locale; import javax.xml.bind.DatatypeConverter; /** * Utilities for string processing. * * @author Maxym Mykhalchuk * @author Didier Briel * @author Tiago Saboga * @author Zoltan Bartko * @author Andrzej Sawula * @author Alex Buloichik (alex73mail@gmail.com) * @author Aaron Madlon-Kay */ public final class StringUtil { private StringUtil() { } public static final char TRUNCATE_CHAR = '\u2026'; /** * Check if string is empty, i.e. null or length==0 */ public static boolean isEmpty(final String str) { return str == null || str.isEmpty(); } /** * Returns true if the input has at least one letter and * all letters are lower case. */ public static boolean isLowerCase(final String input) { if (input.isEmpty()) { return false; } boolean hasLetters = false; for (int i = 0, cp; i < input.length(); i += Character.charCount(cp)) { cp = input.codePointAt(i); if (Character.isLetter(cp)) { hasLetters = true; if (!Character.isLowerCase(cp)) { return false; } } } return hasLetters; } /** * Returns true if the input is upper case. */ public static boolean isUpperCase(final String input) { if (input.isEmpty()) { return false; } boolean hasLetters = false; for (int i = 0, cp; i < input.length(); i += Character.charCount(cp)) { cp = input.codePointAt(i); if (Character.isLetter(cp)) { hasLetters = true; if (!Character.isUpperCase(cp)) { return false; } } } return hasLetters; } /** * Returns true if the input has both upper case and lower case letters, but * is not title case. */ public static boolean isMixedCase(final String input) { if (input.isEmpty() || input.codePointCount(0, input.length()) < 2) { return false; } boolean hasUpper = false; boolean hasLower = false; for (int i = 0, cp; i < input.length(); i += Character.charCount(cp)) { cp = input.codePointAt(i); if (Character.isLetter(cp)) { // Don't count the first cp as upper to allow for title case if (Character.isUpperCase(cp) && i > 0) { hasUpper = true; } else if (Character.isLowerCase(cp)) { hasLower = true; } if (hasUpper && hasLower) { return true; } } } return false; } /** * Returns true if the input is title case, meaning the first character is UpperCase or * TitleCase* and the rest of the string (if present) is LowerCase. * <p> * *There are exotic characters that are neither UpperCase nor LowerCase, but are TitleCase: * e.g. LATIN CAPITAL LETTER L WITH SMALL LETTER J (U+01C8)<br> * These are handled correctly. */ public static boolean isTitleCase(final String input) { if (input.isEmpty()) { return false; } if (input.codePointCount(0, input.length()) > 1) { return isTitleCase(input.codePointAt(0)) && isLowerCase(input.substring(input.offsetByCodePoints(0, 1))); } else { return isTitleCase(input.codePointAt(0)); } } public static boolean isTitleCase(int codePoint) { // True if is actual title case, or if is upper case and has no separate title case variant. return Character.isTitleCase(codePoint) || (Character.isUpperCase(codePoint) && Character.toTitleCase(codePoint) == codePoint); } /** * Returns true if the input consists only of whitespace characters * (including non-breaking characters that are false according to * {@link Character#isWhitespace(int)}). */ public static boolean isWhiteSpace(final String input) { if (input.isEmpty()) { return false; } for (int i = 0, cp; i < input.length(); i += Character.charCount(cp)) { cp = input.codePointAt(i); if (!isWhiteSpace(cp)) { return false; } } return true; } /** * Returns true if the input is a whitespace character * (including non-breaking characters that are false according to * {@link Character#isWhitespace(int)}). */ public static boolean isWhiteSpace(int codePoint) { return Character.isWhitespace(codePoint) || codePoint == '\u00A0' || codePoint == '\u2007' || codePoint == '\u202F'; } public static boolean isCJK(String input) { if (input.isEmpty()) { return false; } for (int i = 0, cp; i < input.length(); i += Character.charCount(cp)) { cp = input.codePointAt(i); // Anything less than CJK Radicals Supplement is "not CJK". Everything else is. // TODO: Make this smarter? if (cp < '\u2E80') { return false; } } return true; } public static String capitalizeFirst(String text, Locale locale) { int remainder = text.offsetByCodePoints(0, 1); String firstCP = text.substring(0, remainder); return StringUtil.toTitleCase(firstCP, locale) + text.substring(remainder); } public static String matchCapitalization(String text, String matchTo, Locale locale) { if (StringUtil.isEmpty(matchTo)) { return text; } // If input matches term exactly, don't change anything if (text.startsWith(matchTo)) { return text; } // If matching to title case (or 1 upper char), capitalize first letter. // Don't turn into title case because the text may be e.g. a phrase // with intentional mixed casing. if (StringUtil.isTitleCase(matchTo)) { return capitalizeFirst(text, locale); } // If matching to lower, turn into lower. if (StringUtil.isLowerCase(matchTo)) { return text.toLowerCase(locale); } // If matching to upper (at least 2 chars; otherwise would have hit isTitleCase() // above), turn into upper. if (StringUtil.isUpperCase(matchTo)) { return text.toUpperCase(locale); } return text; } /** * Convert text to title case according to the supplied locale. */ public static String toTitleCase(String text, Locale locale) { if (text.isEmpty()) { return text; } int firstLetterIndex = 0; for (int cp; firstLetterIndex < text.length(); firstLetterIndex += Character.charCount(cp)) { cp = text.codePointAt(firstLetterIndex); if (Character.isLetter(cp)) { break; } } if (firstLetterIndex == text.length()) { return text; } int firstTitleCase = Character.toTitleCase(text.codePointAt(firstLetterIndex)); int remainderOffset = text.offsetByCodePoints(firstLetterIndex, 1); // If the first codepoint has an actual title case variant (rare), use that. // Otherwise convert first codepoint to upper case according to locale. String first = Character.isTitleCase(firstTitleCase) ? String.valueOf(Character.toChars(firstTitleCase)) : text.substring(0, remainderOffset).toUpperCase(locale); return first + text.substring(remainderOffset).toLowerCase(locale); } /** * Returns first not null object from list, or null if all values is null. */ @SafeVarargs public static <T> T nvl(T... values) { for (T val : values) { if (val != null) { return val; } } return null; } /** * Returns first non-zero object from list, or zero if all values is null. */ public static long nvlLong(long... values) { for (int i = 0; i < values.length; i++) { if (values[i] != 0) { return values[i]; } } return 0; } /** * Compare two values, which could be null. */ public static <T> boolean equalsWithNulls(T v1, T v2) { if (v1 == null && v2 == null) { return true; } else if (v1 != null && v2 != null) { return v1.equals(v2); } else { return false; } } /** * Compare two values, which could be null. */ public static <T extends Comparable<T>> int compareToWithNulls(T v1, T v2) { if (v1 == v2) { return 0; } else if (v1 == null) { return -1; } else if (v2 == null) { return 1; } else { return v1.compareTo(v2); } } /** * Extracts first N codepoints from string. */ public static String firstN(String str, int len) { if (str.codePointCount(0, str.length()) <= len) { return str; } else { return str.substring(0, str.offsetByCodePoints(0, len)); } } /** * Truncate the supplied text to a maximum of len codepoints. If truncated, * the result will be the first (len - 1) codepoints plus a trailing * ellipsis. * * @param text * The text to truncate * @param len * The desired length (in codepoints) of the result * @return The truncated string */ public static String truncate(String text, int len) { if (text.codePointCount(0, text.length()) <= len) { return text; } return firstN(text, len - 1) + TRUNCATE_CHAR; } /** * Returns first letter in lowercase. Usually used for create tag shortcuts. */ public static int getFirstLetterLowercase(String s) { if (s == null) { return 0; } for (int cp, i = 0; i < s.length(); i += Character.charCount(cp)) { cp = s.codePointAt(i); if (Character.isLetter(cp)) { return Character.toLowerCase(cp); } } return 0; } /** * Checks if text contains substring after specified position. */ public static boolean isSubstringAfter(String text, int pos, String substring) { if (pos + substring.length() > text.length()) { return false; } return substring.equals(text.substring(pos, pos + substring.length())); } /** * Checks if text contains substring before specified position. */ public static boolean isSubstringBefore(String text, int pos, String substring) { if (pos - substring.length() < 0) { return false; } return substring.equals(text.substring(pos - substring.length(), pos)); } public static String stripFromEnd(String string, String... toStrip) { if (string == null) { return null; } if (toStrip == null) { return string; } for (String s : toStrip) { if (string.endsWith(s)) { string = string.substring(0, string.length() - s.length()); } } return string; } /** * Apply Unicode NFC normalization to a string. */ public static String normalizeUnicode(CharSequence text) { return Normalizer.isNormalized(text, Normalizer.Form.NFC) ? text.toString() : Normalizer.normalize(text, Normalizer.Form.NFC); } /** * Replace invalid XML chars by spaces. * * @param str * input stream * @return result stream * @see <a href="http://www.w3.org/TR/2006/REC-xml-20060816/#charsets"> * Supported chars</a> */ public static String removeXMLInvalidChars(String str) { StringBuilder sb = new StringBuilder(str.length()); for (int c, i = 0; i < str.length(); i += Character.charCount(c)) { c = str.codePointAt(i); if (!isValidXMLChar(c)) { c = ' '; } sb.appendCodePoint(c); } return sb.toString(); } public static boolean isValidXMLChar(int codePoint) { if (codePoint < 0x20) { if (codePoint != 0x09 && codePoint != 0x0A && codePoint != 0x0D) { return false; } } else if (codePoint >= 0x20 && codePoint <= 0xD7FF) { } else if (codePoint >= 0xE000 && codePoint <= 0xFFFD) { } else if (codePoint >= 0x10000 && codePoint <= 0x10FFFF) { } else { return false; } return true; } /** * Converts a stream of plaintext into valid XML. Output stream must convert * stream to UTF-8 when saving to disk. */ public static String makeValidXML(String plaintext) { StringBuilder out = new StringBuilder(); String text = removeXMLInvalidChars(plaintext); for (int cp, i = 0; i < text.length(); i += Character.charCount(cp)) { cp = text.codePointAt(i); out.append(escapeXMLChars(cp)); } return out.toString(); } /** Compresses spaces in case of non-preformatting paragraph. */ public static String compressSpaces(String str) { int strlen = str.length(); StringBuilder res = new StringBuilder(strlen); boolean wasspace = true; for (int cp, i = 0; i < strlen; i += Character.charCount(cp)) { cp = str.codePointAt(i); if (Character.isWhitespace(cp)) { if (!wasspace) { wasspace = true; } } else { if (wasspace && res.length() > 0) { res.append(' '); } res.appendCodePoint(cp); wasspace = false; } } return res.toString(); } /** * Converts a single code point into valid XML. Output stream must convert stream * to UTF-8 when saving to disk. */ public static String escapeXMLChars(int cp) { switch (cp) { // case '\'': // return "'"; case '&': return "&"; case '>': return ">"; case '<': return "<"; case '"': return """; default: return String.valueOf(Character.toChars(cp)); } } /** * Converts XML entities to characters. */ public static String unescapeXMLEntities(String text) { if (text.contains(">")) { text = text.replaceAll(">", ">"); } if (text.contains("<")) { text = text.replaceAll("<", "<"); } if (text.contains(""")) { text = text.replaceAll(""", "\""); } // If makeValidXML converts ' to apos;, the following lines should be uncommented /* if (text.indexOf("'") >= 0) { text = text.replaceAll("'", "'"); }*/ if (text.contains("&")) { text = text.replaceAll("&", "&"); } return text; } /** * Compares two strings for equality. Handles nulls: if both strings are * nulls they are considered equal. */ public static boolean equal(String one, String two) { return (one == null && two == null) || (one != null && one.equals(two)); } /** * Formats UI strings. * * Note: This is only a first attempt at putting right what goes wrong in * MessageFormat. Currently it only duplicates single quotes, but it doesn't * even test if the string contains parameters (numbers in curly braces), * and it doesn't allow for string containg already escaped quotes. * * @param str * The string to format * @param arguments * Arguments to use in formatting the string * * @return The formatted string */ public static String format(String str, Object... arguments) { // MessageFormat.format expects single quotes to be escaped // by duplicating them, otherwise the string will not be formatted str = str.replaceAll("'", "''"); return MessageFormat.format(str, arguments); } /** * Normalize the * <a href="https://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms"> * width</a> of characters in the supplied text. Specifically: * <ul> * <li>ASCII characters will become halfwidth * <li>Katakana characters will become fullwidth * <li>Hangul will become fullwidth * <li>Letter-like symbols and squared Latin abbreviations will be * decomposed to ASCII * </ul> * This method was adapted from <a href= * "https://bitbucket.org/okapiframework/okapi/src/52143104fcfc7eda204d04dfbbc273189f3a7f0f/okapi/steps/fullwidthconversion/src/main/java/net/sf/okapi/steps/fullwidthconversion/FullWidthConversionStep.java"> * FullWidthConversionStep.java</a> in the Okapi Framework under GPLv2+. * * @param text * @return Normalized-width text */ // CHECKSTYLE:OFF public static String normalizeWidth(String text) { StringBuilder sb = new StringBuilder(text); int ch; for ( int i=0; i<sb.length(); i++ ) { ch = sb.charAt(i); // ASCII if (( ch >= 0xFF01 ) && ( ch <= 0xFF5E )) { sb.setCharAt(i, (char)(ch-0xFEE0)); continue; } if ( ch == 0x3000 ) { sb.setCharAt(i, ' '); } switch ( ch ) { // Katakana case 0xFF61: sb.setCharAt(i, (char)0x3002); break; case 0xFF62: sb.setCharAt(i, (char)0x300C); break; case 0xFF63: sb.setCharAt(i, (char)0x300D); break; case 0xFF64: sb.setCharAt(i, (char)0x3001); break; case 0xFF65: sb.setCharAt(i, (char)0x30FB); break; case 0xFF66: sb.setCharAt(i, (char)0x30F2); break; case 0xFF67: sb.setCharAt(i, (char)0x30A1); break; case 0xFF68: sb.setCharAt(i, (char)0x30A3); break; case 0xFF69: sb.setCharAt(i, (char)0x30A5); break; case 0xFF6A: sb.setCharAt(i, (char)0x30A7); break; case 0xFF6B: sb.setCharAt(i, (char)0x30A9); break; case 0xFF6C: sb.setCharAt(i, (char)0x30E3); break; case 0xFF6D: sb.setCharAt(i, (char)0x30E5); break; case 0xFF6E: sb.setCharAt(i, (char)0x30E7); break; case 0xFF6F: sb.setCharAt(i, (char)0x30C3); break; case 0xFF70: sb.setCharAt(i, (char)0x30FC); break; case 0xFF71: sb.setCharAt(i, (char)0x30A2); break; case 0xFF72: sb.setCharAt(i, (char)0x30A4); break; case 0xFF73: sb.setCharAt(i, (char)0x30A6); break; case 0xFF74: sb.setCharAt(i, (char)0x30A8); break; case 0xFF75: sb.setCharAt(i, (char)0x30AA); break; case 0xFF76: sb.setCharAt(i, (char)0x30AB); break; case 0xFF77: sb.setCharAt(i, (char)0x30AD); break; case 0xFF78: sb.setCharAt(i, (char)0x30AF); break; case 0xFF79: sb.setCharAt(i, (char)0x30B1); break; case 0xFF7A: sb.setCharAt(i, (char)0x30B3); break; case 0xFF7B: sb.setCharAt(i, (char)0x30B5); break; case 0xFF7C: sb.setCharAt(i, (char)0x30B7); break; case 0xFF7D: sb.setCharAt(i, (char)0x30B9); break; case 0xFF7E: sb.setCharAt(i, (char)0x30BB); break; case 0xFF7F: sb.setCharAt(i, (char)0x30BD); break; case 0xFF80: sb.setCharAt(i, (char)0x30BF); break; case 0xFF81: sb.setCharAt(i, (char)0x30C1); break; case 0xFF82: sb.setCharAt(i, (char)0x30C4); break; case 0xFF83: sb.setCharAt(i, (char)0x30C6); break; case 0xFF84: sb.setCharAt(i, (char)0x30C8); break; case 0xFF85: sb.setCharAt(i, (char)0x30CA); break; case 0xFF86: sb.setCharAt(i, (char)0x30CB); break; case 0xFF87: sb.setCharAt(i, (char)0x30CC); break; case 0xFF88: sb.setCharAt(i, (char)0x30CD); break; case 0xFF89: sb.setCharAt(i, (char)0x30CE); break; case 0xFF8A: sb.setCharAt(i, (char)0x30CF); break; case 0xFF8B: sb.setCharAt(i, (char)0x30D2); break; case 0xFF8C: sb.setCharAt(i, (char)0x30D5); break; case 0xFF8D: sb.setCharAt(i, (char)0x30D8); break; case 0xFF8E: sb.setCharAt(i, (char)0x30DB); break; case 0xFF8F: sb.setCharAt(i, (char)0x30DE); break; case 0xFF90: sb.setCharAt(i, (char)0x30DF); break; case 0xFF91: sb.setCharAt(i, (char)0x30E0); break; case 0xFF92: sb.setCharAt(i, (char)0x30E1); break; case 0xFF93: sb.setCharAt(i, (char)0x30E2); break; case 0xFF94: sb.setCharAt(i, (char)0x30E4); break; case 0xFF95: sb.setCharAt(i, (char)0x30E6); break; case 0xFF96: sb.setCharAt(i, (char)0x30E8); break; case 0xFF97: sb.setCharAt(i, (char)0x30E9); break; case 0xFF98: sb.setCharAt(i, (char)0x30EA); break; case 0xFF99: sb.setCharAt(i, (char)0x30EB); break; case 0xFF9A: sb.setCharAt(i, (char)0x30EC); break; case 0xFF9B: sb.setCharAt(i, (char)0x30ED); break; case 0xFF9C: sb.setCharAt(i, (char)0x30EF); break; case 0xFF9D: sb.setCharAt(i, (char)0x30F3); break; case 0xFF9E: sb.setCharAt(i, (char)0x3099); break; case 0xFF9F: sb.setCharAt(i, (char)0x309A); break; } // Hangul if (( ch > 0xFFA1 ) && ( ch <= 0xFFBE )) { sb.setCharAt(i, (char)(ch-0xCE70)); continue; } switch ( ch ) { // Hangul case 0xFFA0: sb.setCharAt(i, (char)0x3164); break; case 0xFFDA: sb.setCharAt(i, (char)0x3161); break; case 0xFFDB: sb.setCharAt(i, (char)0x3162); break; case 0xFFDC: sb.setCharAt(i, (char)0x3163); break; // Others case 0xFFE8: sb.setCharAt(i, (char)0x2502); break; case 0xFFE9: sb.setCharAt(i, (char)0x2190); break; case 0xFFEA: sb.setCharAt(i, (char)0x2191); break; case 0xFFEB: sb.setCharAt(i, (char)0x2192); break; case 0xFFEC: sb.setCharAt(i, (char)0x2193); break; case 0xFFED: sb.setCharAt(i, (char)0x25A0); break; case 0xFFEE: sb.setCharAt(i, (char)0x25CB); break; } // Process letter-like symbols switch ( ch ) { case 0x2100: sb.setCharAt(i, 'a'); sb.insert(i+1, "/c"); i+=2; break; case 0x2101: sb.setCharAt(i, 'a'); sb.insert(i+1, "/s"); i+=2; break; case 0x2105: sb.setCharAt(i, 'c'); sb.insert(i+1, "/o"); i+=2; break; case 0x2103: sb.setCharAt(i, (char)0x00B0); sb.insert(i+1, "C"); i++; break; case 0x2109: sb.setCharAt(i, (char)0x00B0); sb.insert(i+1, "F"); i++; break; case 0x2116: sb.setCharAt(i, 'N'); sb.insert(i+1, "o"); i++; break; case 0x212A: sb.setCharAt(i, 'K'); break; case 0x212B: sb.setCharAt(i, (char)0x00C5); break; } switch ( ch ) { // Squared Latin Abbreviations 1 case 0x3371: sb.setCharAt(i, 'h'); sb.insert(i+1, "Pa"); i+=2; break; case 0x3372: sb.setCharAt(i, 'd'); sb.insert(i+1, "a"); i++; break; case 0x3373: sb.setCharAt(i, 'A'); sb.insert(i+1, "U"); i++; break; case 0x3374: sb.setCharAt(i, 'b'); sb.insert(i+1, "ar"); i+=2; break; case 0x3375: sb.setCharAt(i, 'o'); sb.insert(i+1, "V"); i++; break; case 0x3376: sb.setCharAt(i, 'p'); sb.insert(i+1, "c"); i++; break; case 0x3377: sb.setCharAt(i, 'd'); sb.insert(i+1, "m"); i++; break; case 0x3378: sb.setCharAt(i, 'd'); sb.insert(i+1, "m\u00B2"); i+=2; break; case 0x3379: sb.setCharAt(i, 'd'); sb.insert(i+1, "m\u00B3"); i+=2; break; case 0x337A: sb.setCharAt(i, 'I'); sb.insert(i+1, "U"); i++; break; // Squared Latin Abbreviations 2 case 0x3380: sb.setCharAt(i, 'p'); sb.insert(i+1, "A"); i++; break; case 0x3381: sb.setCharAt(i, 'n'); sb.insert(i+1, "A"); i++; break; case 0x3382: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "A"); i++; break; case 0x3383: sb.setCharAt(i, 'm'); sb.insert(i+1, "A"); i++; break; case 0x3384: sb.setCharAt(i, 'k'); sb.insert(i+1, "A"); i++; break; case 0x3385: sb.setCharAt(i, 'K'); sb.insert(i+1, "B"); i++; break; case 0x3386: sb.setCharAt(i, 'M'); sb.insert(i+1, "B"); i++; break; case 0x3387: sb.setCharAt(i, 'G'); sb.insert(i+1, "B"); i++; break; case 0x3388: sb.setCharAt(i, 'c'); sb.insert(i+1, "al"); i+=2; break; case 0x3389: sb.setCharAt(i, 'k'); sb.insert(i+1, "cal"); i+=3; break; case 0x338A: sb.setCharAt(i, 'p'); sb.insert(i+1, "F"); i++; break; case 0x338B: sb.setCharAt(i, 'n'); sb.insert(i+1, "F"); i++; break; case 0x338C: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "F"); i++; break; case 0x338D: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "g"); i++; break; case 0x338E: sb.setCharAt(i, 'm'); sb.insert(i+1, "g"); i++; break; case 0x338F: sb.setCharAt(i, 'k'); sb.insert(i+1, "g"); i++; break; case 0x3390: sb.setCharAt(i, 'H'); sb.insert(i+1, "z"); i++; break; case 0x3391: sb.setCharAt(i, 'k'); sb.insert(i+1, "Hz"); i+=2; break; case 0x3392: sb.setCharAt(i, 'M'); sb.insert(i+1, "Hz"); i+=2; break; case 0x3393: sb.setCharAt(i, 'G'); sb.insert(i+1, "Hz"); i+=2; break; case 0x3394: sb.setCharAt(i, 'T'); sb.insert(i+1, "Hz"); i+=2; break; case 0x3395: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "\u2113"); i++; break; case 0x3396: sb.setCharAt(i, 'm'); sb.insert(i+1, "\u2113"); i++; break; case 0x3397: sb.setCharAt(i, 'd'); sb.insert(i+1, "\u2113"); i++; break; case 0x3398: sb.setCharAt(i, 'k'); sb.insert(i+1, "\u2113"); i++; break; case 0x3399: sb.setCharAt(i, 'f'); sb.insert(i+1, "m"); i++; break; case 0x339A: sb.setCharAt(i, 'n'); sb.insert(i+1, "m"); i++; break; case 0x339B: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "m"); i++; break; case 0x339C: sb.setCharAt(i, 'm'); sb.insert(i+1, "m"); i++; break; case 0x339D: sb.setCharAt(i, 'c'); sb.insert(i+1, "m"); i++; break; case 0x339E: sb.setCharAt(i, 'k'); sb.insert(i+1, "m"); i++; break; case 0x339F: sb.setCharAt(i, 'm'); sb.insert(i+1, "m\u00B2"); i+=2; break; case 0x33A0: sb.setCharAt(i, 'c'); sb.insert(i+1, "m\u00B2"); i+=2; break; case 0x33A1: sb.setCharAt(i, 'm'); sb.insert(i+1, "\u00B2"); i++; break; case 0x33A2: sb.setCharAt(i, 'k'); sb.insert(i+1, "m\u00B2"); i+=2; break; case 0x33A3: sb.setCharAt(i, 'm'); sb.insert(i+1, "m\u00B3"); i+=2; break; case 0x33A4: sb.setCharAt(i, 'c'); sb.insert(i+1, "m\u00B3"); i+=2; break; case 0x33A5: sb.setCharAt(i, 'm'); sb.insert(i+1, "\u00B3"); i++; break; case 0x33A6: sb.setCharAt(i, 'k'); sb.insert(i+1, "m\u00B3"); i+=2; break; case 0x33A7: sb.setCharAt(i, 'm'); sb.insert(i+1, "/s"); i+=2; break; case 0x33A8: sb.setCharAt(i, 'm'); sb.insert(i+1, "/s\u00B2"); i+=3; break; case 0x33A9: sb.setCharAt(i, 'P'); sb.insert(i+1, "a"); i++; break; case 0x33AA: sb.setCharAt(i, 'k'); sb.insert(i+1, "Pa"); i+=2; break; case 0x33AB: sb.setCharAt(i, 'M'); sb.insert(i+1, "Pa"); i+=2; break; case 0x33AC: sb.setCharAt(i, 'G'); sb.insert(i+1, "Pa"); i+=2; break; case 0x33AD: sb.setCharAt(i, 'r'); sb.insert(i+1, "ad"); i+=2; break; case 0x33AE: sb.setCharAt(i, 'r'); sb.insert(i+1, "ad/s"); i+=4; break; case 0x33AF: sb.setCharAt(i, 'r'); sb.insert(i+1, "ad/s\u00B2"); i+=5; break; case 0x33B0: sb.setCharAt(i, 'p'); sb.insert(i+1, "s"); i++; break; case 0x33B1: sb.setCharAt(i, 'n'); sb.insert(i+1, "s"); i++; break; case 0x33B2: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "s"); i++; break; case 0x33B3: sb.setCharAt(i, 'm'); sb.insert(i+1, "s"); i++; break; case 0x33B4: sb.setCharAt(i, 'p'); sb.insert(i+1, "V"); i++; break; case 0x33B5: sb.setCharAt(i, 'n'); sb.insert(i+1, "V"); i++; break; case 0x33B6: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "V"); i++; break; case 0x33B7: sb.setCharAt(i, 'm'); sb.insert(i+1, "V"); i++; break; case 0x33B8: sb.setCharAt(i, 'k'); sb.insert(i+1, "V"); i++; break; case 0x33B9: sb.setCharAt(i, 'M'); sb.insert(i+1, "V"); i++; break; case 0x33BA: sb.setCharAt(i, 'p'); sb.insert(i+1, "W"); i++; break; case 0x33BB: sb.setCharAt(i, 'n'); sb.insert(i+1, "W"); i++; break; case 0x33BC: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "W"); i++; break; case 0x33BD: sb.setCharAt(i, 'm'); sb.insert(i+1, "W"); i++; break; case 0x33BE: sb.setCharAt(i, 'k'); sb.insert(i+1, "W"); i++; break; case 0x33BF: sb.setCharAt(i, 'M'); sb.insert(i+1, "W"); i++; break; case 0x33C0: sb.setCharAt(i, 'k'); sb.insert(i+1, "\u03A9"); i++; break; case 0x33C1: sb.setCharAt(i, 'M'); sb.insert(i+1, "\u03A9"); i++; break; case 0x33C2: sb.setCharAt(i, 'a'); sb.insert(i+1, ".m."); i+=3; break; case 0x33C3: sb.setCharAt(i, 'B'); sb.insert(i+1, "q"); i++; break; case 0x33C4: sb.setCharAt(i, 'c'); sb.insert(i+1, "c"); i++; break; case 0x33C5: sb.setCharAt(i, 'c'); sb.insert(i+1, "d"); i++; break; case 0x33C6: sb.setCharAt(i, 'C'); sb.insert(i+1, "/kg"); i+=3; break; case 0x33C7: sb.setCharAt(i, 'C'); sb.insert(i+1, "o."); i+=2; break; case 0x33C8: sb.setCharAt(i, 'd'); sb.insert(i+1, "B"); i++; break; case 0x33C9: sb.setCharAt(i, 'G'); sb.insert(i+1, "y"); i++; break; case 0x33CA: sb.setCharAt(i, 'h'); sb.insert(i+1, "a"); i++; break; case 0x33CB: sb.setCharAt(i, 'H'); sb.insert(i+1, "P"); i++; break; case 0x33CC: sb.setCharAt(i, 'i'); sb.insert(i+1, "n"); i++; break; case 0x33CD: sb.setCharAt(i, 'K'); sb.insert(i+1, "K"); i++; break; case 0x33CE: sb.setCharAt(i, 'K'); sb.insert(i+1, "M"); i++; break; case 0x33CF: sb.setCharAt(i, 'K'); sb.insert(i+1, "t"); i++; break; case 0x33D0: sb.setCharAt(i, 'l'); sb.insert(i+1, "m"); i++; break; case 0x33D1: sb.setCharAt(i, 'l'); sb.insert(i+1, "n"); i++; break; case 0x33D2: sb.setCharAt(i, 'l'); sb.insert(i+1, "og"); i+=2; break; case 0x33D3: sb.setCharAt(i, 'l'); sb.insert(i+1, "x"); i++; break; case 0x33D4: sb.setCharAt(i, 'm'); sb.insert(i+1, "b"); i++; break; case 0x33D5: sb.setCharAt(i, 'm'); sb.insert(i+1, "il"); i+=2; break; case 0x33D6: sb.setCharAt(i, 'm'); sb.insert(i+1, "ol"); i+=2; break; case 0x33D7: sb.setCharAt(i, 'p'); sb.insert(i+1, "H"); i++; break; case 0x33D8: sb.setCharAt(i, 'p'); sb.insert(i+1, ".m."); i+=3; break; case 0x33D9: sb.setCharAt(i, 'P'); sb.insert(i+1, "PM"); i+=2; break; case 0x33DA: sb.setCharAt(i, 'P'); sb.insert(i+1, "R"); i++; break; case 0x33DB: sb.setCharAt(i, 's'); sb.insert(i+1, "r"); i++; break; case 0x33DC: sb.setCharAt(i, 'S'); sb.insert(i+1, "v"); i++; break; case 0x33DD: sb.setCharAt(i, 'W'); sb.insert(i+1, "b"); i++; break; case 0x33DE: sb.setCharAt(i, 'v'); sb.insert(i+1, "/m"); i+=2; break; case 0x33DF: sb.setCharAt(i, 'a'); sb.insert(i+1, "/m"); i+=2; break; // Squared Latin Abbreviations 3 case 0x33FF: sb.setCharAt(i, 'g'); sb.insert(i+1, "al"); i+=2; break; } } String result = sb.toString(); if (text.equals(result)) { // No characters were changed. Return the original text so that // composition of unrelated characters is not affected. return text; } return normalizeUnicode(result); } // CHECKSTYLE:ON /** * Strip whitespace from the end of a string. Uses * {@link Character#isWhitespace(int)}, so it does not strip the extra * non-breaking whitespace included in {@link #isWhiteSpace(int)}. * * @param text * @return text with trailing whitespace removed */ public static String rstrip(String text) { for (int cp, i = text.length(); i >= 0; i -= Character.charCount(cp)) { if (i == 0) { return ""; } cp = text.codePointBefore(i); if (!Character.isWhitespace(cp)) { return text.substring(0, i); } } return text; } /** * Convert a byte array into a Base64-encoded String. Convenience method for * {@link DatatypeConverter#printBase64Binary(byte[])} (available since Java * 1.6) because it's so well hidden. * * @param bytes * Data bytes * @return Base64-encoded String */ private static String encodeBase64(byte[] bytes) { return DatatypeConverter.printBase64Binary(bytes); } /** * Convert a string's <code>charset</code> bytes into a Base64-encoded String. * * @param string * a string * @param charset * the charset with which to obtain the bytes * @return Base64-encoded String */ public static String encodeBase64(String string, Charset charset) { return encodeBase64(string.getBytes(charset)); } /** * Convert a char array's <code>charset</code> bytes into a Base64-encoded String. * Useful for handling passwords. Intermediate buffers are cleared after use. * * @param chars * a char array * @param charset * the charset with which to obtain the bytes * @return Base64-encoded String */ public static String encodeBase64(char[] chars, Charset charset) { CharBuffer charBuf = CharBuffer.wrap(chars); ByteBuffer byteBuf = charset.encode(charBuf); String result = encodeBase64(byteBuf.array()); Arrays.fill(charBuf.array(), '\0'); Arrays.fill(byteBuf.array(), (byte) 0); return result; } /** * Convert a Base64-encoded String into an array of bytes. Convenience * method for {@link DatatypeConverter#parseBase64Binary(String)} (available * since Java 1.6) because it's so well hidden. * * @param b64data * Base64-encoded String * @return Data bytes */ private static byte[] decodeBase64(String b64data) { return DatatypeConverter.parseBase64Binary(b64data); } /** * Decode the Base64-encoded <code>charset</code> bytes back to a String. * * @param b64data * Base64-encoded String * @param charset * charset of decoded bytes * @return String */ public static String decodeBase64(String b64data, Charset charset) { return new String(decodeBase64(b64data), charset); } }