StringUtil.java example

Explorer
OmegaT-master
/**************************************************************************
 OmegaT - Computer Assisted Translation (CAT) tool
          with fuzzy matching, translation memory, keyword search,
          glossaries, and translation leveraging into updated projects.

 Copyright (C) 2000-2006 Keith Godfrey and Maxym Mykhalchuk
               2007 Didier Briel and Tiago Saboga
               2007 Zoltan Bartko - bartkozoltan@bartkozoltan.com
               2008 Andrzej Sawula
               2010-2013 Alex Buloichik
               2015 Zoltan Bartko, Aaron Madlon-Kay
               2016 Aaron Madlon-Kay
               Home page: http://www.omegat.org/
               Support center: http://groups.yahoo.com/group/OmegaT/

 This file is part of OmegaT.

 OmegaT is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 OmegaT is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program.  If not, see <http://www.gnu.org/licenses/>.
 **************************************************************************/
package org.omegat.util;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.text.MessageFormat;
import java.text.Normalizer;
import java.util.Arrays;
import java.util.Locale;

import javax.xml.bind.DatatypeConverter;

/**
 * Utilities for string processing.
 *
 * @author Maxym Mykhalchuk
 * @author Didier Briel
 * @author Tiago Saboga
 * @author Zoltan Bartko
 * @author Andrzej Sawula
 * @author Alex Buloichik (alex73mail@gmail.com)
 * @author Aaron Madlon-Kay
 */
public final class StringUtil {

    /** Private constructor: this final class only exposes static members and is not meant to be instantiated. */
    private StringUtil() {
    }

    /** Character appended by {@link #truncate(String, int)} to mark shortened text (U+2026, horizontal ellipsis). */
    public static final char TRUNCATE_CHAR = '\u2026';

    /**
     * Tells whether a string carries no characters at all.
     *
     * @param str
     *            the string to test; may be {@code null}
     * @return {@code true} if {@code str} is {@code null} or has zero length
     */
    public static boolean isEmpty(final String str) {
        if (str == null) {
            return true;
        }
        return str.length() == 0;
    }

    /**
     * Checks that every letter in the input is lower case. Non-letter code
     * points are ignored, but at least one letter must be present.
     *
     * @param input
     *            the text to inspect; must not be {@code null}
     * @return {@code true} if the input contains one or more letters and none
     *         of them fails {@link Character#isLowerCase(int)}
     */
    public static boolean isLowerCase(final String input) {
        final int end = input.length();
        boolean sawLetter = false;
        int pos = 0;
        while (pos < end) {
            final int codePoint = input.codePointAt(pos);
            pos += Character.charCount(codePoint);
            if (!Character.isLetter(codePoint)) {
                continue;
            }
            if (!Character.isLowerCase(codePoint)) {
                return false;
            }
            sawLetter = true;
        }
        // An empty or letter-free input never counts as lower case.
        return sawLetter;
    }

    /**
     * Checks that every letter in the input is upper case. Non-letter code
     * points are ignored, but at least one letter must be present.
     *
     * @param input
     *            the text to inspect; must not be {@code null}
     * @return {@code true} if the input contains one or more letters and none
     *         of them fails {@link Character#isUpperCase(int)}
     */
    public static boolean isUpperCase(final String input) {
        final int end = input.length();
        boolean sawLetter = false;
        int pos = 0;
        while (pos < end) {
            final int codePoint = input.codePointAt(pos);
            pos += Character.charCount(codePoint);
            if (!Character.isLetter(codePoint)) {
                continue;
            }
            if (!Character.isUpperCase(codePoint)) {
                return false;
            }
            sawLetter = true;
        }
        // An empty or letter-free input never counts as upper case.
        return sawLetter;
    }

    /**
     * Detects text that mixes upper-case and lower-case letters in a way that
     * is not plain title case. An upper-case letter at the very first position
     * is not counted, so that e.g. "Hello" is not reported as mixed.
     *
     * @param input
     *            the text to inspect; must not be {@code null}
     * @return {@code true} if the input has at least two code points, contains
     *         a lower-case letter, and contains an upper-case letter somewhere
     *         other than at index 0
     */
    public static boolean isMixedCase(final String input) {
        final int length = input.length();
        if (input.codePointCount(0, length) < 2) {
            return false;
        }
        boolean upperSeen = false;
        boolean lowerSeen = false;
        int offset = 0;
        while (offset < length) {
            final int codePoint = input.codePointAt(offset);
            final boolean atStart = offset == 0;
            offset += Character.charCount(codePoint);
            if (!Character.isLetter(codePoint)) {
                continue;
            }
            // A leading capital is ignored so that title case is not flagged.
            if (!atStart && Character.isUpperCase(codePoint)) {
                upperSeen = true;
            } else if (Character.isLowerCase(codePoint)) {
                lowerSeen = true;
            }
            if (upperSeen && lowerSeen) {
                return true;
            }
        }
        return false;
    }

    /**
     * Checks whether the input is in title case: its first code point
     * satisfies {@link #isTitleCase(int)} and whatever follows (if anything)
     * satisfies {@link #isLowerCase(String)}.
     * <p>
     * Code points that are genuinely title case rather than upper case, such
     * as LATIN CAPITAL LETTER L WITH SMALL LETTER J (U+01C8), are accepted in
     * the first position.
     *
     * @param input
     *            the text to inspect; must not be {@code null}
     * @return {@code true} if the input is title case; {@code false} for an
     *         empty input
     */
    public static boolean isTitleCase(final String input) {
        if (input.isEmpty()) {
            return false;
        }
        final int first = input.codePointAt(0);
        if (!isTitleCase(first)) {
            return false;
        }
        final int restStart = Character.charCount(first);
        // A single code point only has to pass the first-position check.
        return restStart == input.length() || isLowerCase(input.substring(restStart));
    }

    /**
     * Checks whether a single code point may start a title-case word.
     *
     * @param codePoint
     *            the code point to test
     * @return {@code true} if the code point is a real title-case character,
     *         or is upper case and maps to itself under
     *         {@link Character#toTitleCase(int)}
     */
    public static boolean isTitleCase(int codePoint) {
        if (Character.isTitleCase(codePoint)) {
            return true;
        }
        // An upper-case code point that is its own title-case mapping has no separate title-case variant.
        return Character.isUpperCase(codePoint) && codePoint == Character.toTitleCase(codePoint);
    }

    /**
     * Checks whether the input is made up exclusively of whitespace, as judged
     * per code point by {@link #isWhiteSpace(int)} (which also accepts the
     * non-breaking spaces rejected by {@link Character#isWhitespace(int)}).
     *
     * @param input
     *            the text to inspect; must not be {@code null}
     * @return {@code true} if the input is non-empty and every code point is
     *         whitespace
     */
    public static boolean isWhiteSpace(final String input) {
        final int length = input.length();
        int offset = 0;
        while (offset < length) {
            final int codePoint = input.codePointAt(offset);
            if (!isWhiteSpace(codePoint)) {
                return false;
            }
            offset += Character.charCount(codePoint);
        }
        // The empty string is not considered whitespace.
        return length > 0;
    }

    /**
     * Checks whether a code point is whitespace. On top of what
     * {@link Character#isWhitespace(int)} accepts, the non-breaking spaces
     * U+00A0, U+2007 and U+202F are treated as whitespace too.
     *
     * @param codePoint
     *            the code point to test
     * @return {@code true} if the code point is whitespace in this extended
     *         sense
     */
    public static boolean isWhiteSpace(int codePoint) {
        switch (codePoint) {
        case 0x00A0: // NO-BREAK SPACE
        case 0x2007: // FIGURE SPACE
        case 0x202F: // NARROW NO-BREAK SPACE
            return true;
        default:
            return Character.isWhitespace(codePoint);
        }
    }

    /**
     * Rough test for CJK-only text: every code point must be at or above
     * U+2E80 (the start of the CJK Radicals Supplement block).
     *
     * @param input
     *            the text to inspect; must not be {@code null}
     * @return {@code true} if the input is non-empty and contains no code
     *         point below U+2E80
     */
    public static boolean isCJK(String input) {
        final int length = input.length();
        int index = 0;
        while (index < length) {
            final int codePoint = input.codePointAt(index);
            // Anything below CJK Radicals Supplement is "not CJK"; everything else is.
            // TODO: Make this smarter?
            if (codePoint < 0x2E80) {
                return false;
            }
            index += Character.charCount(codePoint);
        }
        return length > 0;
    }

    /**
     * Converts the first code point of the text to title case and leaves the
     * rest of the text untouched.
     *
     * @param text
     *            the text to capitalize; must not be {@code null}
     * @param locale
     *            the locale whose casing rules are applied
     * @return the text with its first code point title-cased; an empty input
     *         is returned unchanged
     */
    public static String capitalizeFirst(String text, Locale locale) {
        if (text.isEmpty()) {
            // offsetByCodePoints(0, 1) would throw on an empty string; there is nothing to capitalize.
            return text;
        }
        int remainder = text.offsetByCodePoints(0, 1);
        String firstCP = text.substring(0, remainder);
        return StringUtil.toTitleCase(firstCP, locale)
                + text.substring(remainder);
    }

    /**
     * Adjusts the capitalization of {@code text} to imitate that of
     * {@code matchTo}.
     *
     * @param text
     *            the text to adjust
     * @param matchTo
     *            the text whose capitalization serves as the model; may be
     *            {@code null} or empty, in which case {@code text} is returned
     *            as is
     * @param locale
     *            the locale whose casing rules are applied
     * @return {@code text}, re-cased according to {@code matchTo}, or
     *         unchanged when no rule applies
     */
    public static String matchCapitalization(String text, String matchTo, Locale locale) {
        // Nothing to imitate, or the text already begins with the model verbatim.
        if (StringUtil.isEmpty(matchTo) || text.startsWith(matchTo)) {
            return text;
        }

        final String adjusted;
        if (StringUtil.isTitleCase(matchTo)) {
            // Title case (or a single upper-case char): only capitalize the
            // first letter. The rest is left alone because the text may be
            // e.g. a phrase with intentional mixed casing.
            adjusted = capitalizeFirst(text, locale);
        } else if (StringUtil.isLowerCase(matchTo)) {
            adjusted = text.toLowerCase(locale);
        } else if (StringUtil.isUpperCase(matchTo)) {
            // Reached only with two or more chars; a single upper-case char
            // was already caught by the title-case branch.
            adjusted = text.toUpperCase(locale);
        } else {
            adjusted = text;
        }
        return adjusted;
    }

    /**
     * Convert text to title case according to the supplied locale: the first
     * letter is title-cased (or upper-cased when it has no dedicated
     * title-case form) and everything after it is lower-cased.
     * <p>
     * Any non-letter characters that precede the first letter are kept (after
     * passing through {@link String#toUpperCase(Locale)}), regardless of
     * whether the first letter has a dedicated title-case form.
     *
     * @param text
     *            the text to convert; must not be {@code null}
     * @param locale
     *            the locale whose casing rules are applied
     * @return the title-cased text; the input itself if it is empty or
     *         contains no letters
     */
    public static String toTitleCase(String text, Locale locale) {
        if (text.isEmpty()) {
            return text;
        }
        int firstLetterIndex = 0;
        for (int cp; firstLetterIndex < text.length(); firstLetterIndex += Character.charCount(cp)) {
            cp = text.codePointAt(firstLetterIndex);
            if (Character.isLetter(cp)) {
                break;
            }
        }
        if (firstLetterIndex == text.length()) {
            // No letters at all: nothing to re-case.
            return text;
        }
        int firstTitleCase = Character.toTitleCase(text.codePointAt(firstLetterIndex));
        int remainderOffset = text.offsetByCodePoints(firstLetterIndex, 1);
        String first;
        if (Character.isTitleCase(firstTitleCase)) {
            // The first letter has an actual title case variant (rare): use it,
            // but keep whatever non-letter prefix came before the letter
            // instead of discarding it.
            first = text.substring(0, firstLetterIndex).toUpperCase(locale)
                    + String.valueOf(Character.toChars(firstTitleCase));
        } else {
            // Otherwise convert up to and including the first letter to upper
            // case according to locale.
            first = text.substring(0, remainderOffset).toUpperCase(locale);
        }
        return first + text.substring(remainderOffset).toLowerCase(locale);
    }

    /**
     * Picks the first non-null value from the arguments.
     *
     * @param values
     *            candidate values, examined in order
     * @return the first element that is not {@code null}, or {@code null} if
     *         there is none
     */
    @SafeVarargs
    public static <T> T nvl(T... values) {
        for (int i = 0; i < values.length; i++) {
            final T candidate = values[i];
            if (candidate != null) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Picks the first non-zero value from the arguments.
     *
     * @param values
     *            candidate values, examined in order
     * @return the first element that is not zero, or zero if there is none
     */
    public static long nvlLong(long... values) {
        for (final long candidate : values) {
            if (candidate != 0) {
                return candidate;
            }
        }
        return 0;
    }

    /**
     * Null-tolerant equality check.
     *
     * @param v1
     *            first value; may be {@code null}
     * @param v2
     *            second value; may be {@code null}
     * @return {@code true} if both values are {@code null}, or both are
     *         non-null and {@code v1.equals(v2)}; {@code false} if exactly one
     *         of them is {@code null}
     */
    public static <T> boolean equalsWithNulls(T v1, T v2) {
        if (v1 == null) {
            return v2 == null;
        }
        return v2 != null && v1.equals(v2);
    }

    /**
     * Null-tolerant comparison in which {@code null} sorts before any non-null
     * value.
     *
     * @param v1
     *            first value; may be {@code null}
     * @param v2
     *            second value; may be {@code null}
     * @return 0 if both arguments are the same reference (including both
     *         {@code null}), -1 if only {@code v1} is {@code null}, 1 if only
     *         {@code v2} is {@code null}, otherwise {@code v1.compareTo(v2)}
     */
    public static <T extends Comparable<T>> int compareToWithNulls(T v1, T v2) {
        if (v1 == v2) {
            return 0;
        }
        if (v1 == null) {
            return -1;
        }
        return v2 == null ? 1 : v1.compareTo(v2);
    }

    /**
     * Returns the leading {@code len} code points of a string.
     *
     * @param str
     *            the source string; must not be {@code null}
     * @param len
     *            the number of code points to keep
     * @return {@code str} itself if it has at most {@code len} code points,
     *         otherwise the prefix made of its first {@code len} code points
     */
    public static String firstN(String str, int len) {
        final boolean fits = str.codePointCount(0, str.length()) <= len;
        return fits ? str : str.substring(0, str.offsetByCodePoints(0, len));
    }

    /**
     * Shortens the supplied text so that it is at most {@code len} code points
     * long. When shortening is needed, the result consists of the first
     * {@code len - 1} code points followed by {@link #TRUNCATE_CHAR}.
     *
     * @param text
     *            The text to truncate
     * @param len
     *            The desired maximum length (in code points) of the result
     * @return The text itself if it already fits, otherwise the truncated
     *         string
     */
    public static String truncate(String text, int len) {
        final int codePoints = text.codePointCount(0, text.length());
        return codePoints > len ? firstN(text, len - 1) + TRUNCATE_CHAR : text;
    }

    /**
     * Finds the first letter of a string and returns its lower-case form.
     * Usually used to create tag shortcuts.
     *
     * @param s
     *            the string to scan; may be {@code null}
     * @return the lower-cased code point of the first letter, or 0 if
     *         {@code s} is {@code null} or contains no letter
     */
    public static int getFirstLetterLowercase(String s) {
        if (s == null) {
            return 0;
        }
        int offset = 0;
        while (offset < s.length()) {
            final int codePoint = s.codePointAt(offset);
            if (Character.isLetter(codePoint)) {
                return Character.toLowerCase(codePoint);
            }
            offset += Character.charCount(codePoint);
        }
        return 0;
    }

    /**
     * Checks if text contains substring starting exactly at the specified
     * position.
     *
     * @param text
     *            the text to look in; must not be {@code null}
     * @param pos
     *            index in {@code text} at which {@code substring} must begin
     * @param substring
     *            the expected content; must not be {@code null}
     * @return {@code true} if {@code text} holds {@code substring} at
     *         {@code pos}; {@code false} otherwise, including when
     *         {@code pos} is negative or the substring would run past the end
     *         of the text
     */
    public static boolean isSubstringAfter(String text, int pos, String substring) {
        // startsWith(prefix, offset) does the bounds checks itself (returning
        // false for out-of-range offsets) and avoids copying a substring.
        return text.startsWith(substring, pos);
    }

    /**
     * Checks if text contains substring ending exactly at the specified
     * position.
     *
     * @param text
     *            the text to look in; must not be {@code null}
     * @param pos
     *            index in {@code text} just past where {@code substring} must
     *            end
     * @param substring
     *            the expected content; must not be {@code null}
     * @return {@code true} if {@code text} holds {@code substring} immediately
     *         before {@code pos}; {@code false} otherwise, including when the
     *         substring would start before index 0 or {@code pos} lies beyond
     *         the end of the text
     */
    public static boolean isSubstringBefore(String text, int pos, String substring) {
        // startsWith(prefix, offset) returns false for a negative offset and
        // for any offset where the prefix would not fit, so no explicit range
        // checks (and no substring copy) are needed.
        return text.startsWith(substring, pos - substring.length());
    }

    /**
     * Removes trailing suffixes from a string. The suffixes are tried once
     * each, in the order given; every one that matches the end of the
     * current result is cut off before the next one is tried.
     *
     * @param string
     *            the string to process; may be {@code null}
     * @param toStrip
     *            suffixes to remove; may be {@code null}
     * @return the stripped string; {@code string} itself if it or
     *         {@code toStrip} is {@code null}
     */
    public static String stripFromEnd(String string, String... toStrip) {
        if (string == null || toStrip == null) {
            return string;
        }
        String result = string;
        for (int i = 0; i < toStrip.length; i++) {
            final String suffix = toStrip[i];
            if (result.endsWith(suffix)) {
                result = result.substring(0, result.length() - suffix.length());
            }
        }
        return result;
    }

    /**
     * Brings text into Unicode Normalization Form C (NFC).
     *
     * @param text
     *            the text to normalize
     * @return the NFC form of the text; when the text is already normalized
     *         this is simply {@code text.toString()}
     */
    public static String normalizeUnicode(CharSequence text) {
        final Normalizer.Form form = Normalizer.Form.NFC;
        if (Normalizer.isNormalized(text, form)) {
            return text.toString();
        }
        return Normalizer.normalize(text, form);
    }

    /**
     * Replaces every code point that is not allowed in XML with a space.
     * Validity is decided by {@link #isValidXMLChar(int)}.
     *
     * @param str
     *            input string
     * @return copy of the input in which each invalid code point has been
     *         replaced by a single space
     * @see <a href="http://www.w3.org/TR/2006/REC-xml-20060816/#charsets">
     *      Supported chars</a>
     */
    public static String removeXMLInvalidChars(String str) {
        final int length = str.length();
        final StringBuilder cleaned = new StringBuilder(length);
        int offset = 0;
        while (offset < length) {
            final int codePoint = str.codePointAt(offset);
            cleaned.appendCodePoint(isValidXMLChar(codePoint) ? codePoint : ' ');
            // Every code point rejected by isValidXMLChar fits in one char, so
            // stepping by the original code point's width is always correct.
            offset += Character.charCount(codePoint);
        }
        return cleaned.toString();
    }

    /**
     * Checks whether a code point lies in one of the ranges accepted here as
     * XML character data: tab, line feed, carriage return, U+0020..U+D7FF,
     * U+E000..U+FFFD and U+10000..U+10FFFF.
     *
     * @param codePoint
     *            the code point to test
     * @return {@code true} if the code point falls in one of those ranges
     */
    public static boolean isValidXMLChar(int codePoint) {
        if (codePoint < 0x20) {
            // Of the C0 controls only TAB, LF and CR are permitted.
            return codePoint == 0x09 || codePoint == 0x0A || codePoint == 0x0D;
        }
        return codePoint <= 0xD7FF
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }

    /**
     * Turns plain text into text that can be embedded in XML: invalid
     * characters are first replaced via {@link #removeXMLInvalidChars(String)}
     * and each remaining code point is then passed through
     * {@link #escapeXMLChars(int)}. Output stream must convert the result to
     * UTF-8 when saving to disk.
     *
     * @param plaintext
     *            the text to convert
     * @return the converted text
     */
    public static String makeValidXML(String plaintext) {
        final String cleaned = removeXMLInvalidChars(plaintext);
        final StringBuilder escaped = new StringBuilder();
        int offset = 0;
        while (offset < cleaned.length()) {
            final int codePoint = cleaned.codePointAt(offset);
            escaped.append(escapeXMLChars(codePoint));
            offset += Character.charCount(codePoint);
        }
        return escaped.toString();
    }

    /**
     * Compresses spaces in case of non-preformatting paragraph: leading and
     * trailing whitespace is dropped and every inner run of whitespace (as
     * defined by {@link Character#isWhitespace(int)}) becomes one space.
     *
     * @param str
     *            the text to compress; must not be {@code null}
     * @return the text with whitespace runs collapsed
     */
    public static String compressSpaces(String str) {
        final int length = str.length();
        final StringBuilder out = new StringBuilder(length);
        boolean afterWhitespace = true;
        int offset = 0;
        while (offset < length) {
            final int codePoint = str.codePointAt(offset);
            offset += Character.charCount(codePoint);
            if (Character.isWhitespace(codePoint)) {
                afterWhitespace = true;
                continue;
            }
            // Emit the separating space lazily, and never at the very start.
            if (afterWhitespace && out.length() > 0) {
                out.append(' ');
            }
            out.appendCodePoint(codePoint);
            afterWhitespace = false;
        }
        return out.toString();
    }

    /**
     * Converts a single code point into valid XML. Output stream must convert stream
     * to UTF-8 when saving to disk.
     */
    public static String escapeXMLChars(int cp) {
        switch (cp) {
        // case '\'':
        // return "'";
        case '&':
            return "&";
        case '>':
            return ">";
        case '<':
            return "<";
        case '"':
            return """;
        default:
            return String.valueOf(Character.toChars(cp));
        }
    }

    /**
     * Converts XML entities to characters.
     */
    public static String unescapeXMLEntities(String text) {

        if (text.contains(">")) {
            text = text.replaceAll(">", ">");
        }
        if (text.contains("<")) {
            text = text.replaceAll("<", "<");
        }
        if (text.contains(""")) {
            text = text.replaceAll(""", "\"");
        }
       // If makeValidXML converts ' to apos;, the following lines should be uncommented
        /* if (text.indexOf("'") >= 0) {
            text = text.replaceAll("'", "'");
        }*/
        if (text.contains("&")) {
            text = text.replaceAll("&", "&");
        }
        return text;
    }

    /**
     * Null-tolerant string equality.
     *
     * @param one
     *            first string; may be {@code null}
     * @param two
     *            second string; may be {@code null}
     * @return {@code true} if both strings are {@code null}, or if
     *         {@code one} is non-null and {@code one.equals(two)}
     */
    public static boolean equal(String one, String two) {
        if (one == null) {
            return two == null;
        }
        return one.equals(two);
    }

    /**
     * Formats UI strings with {@link MessageFormat}, after doubling every
     * single quote in the pattern.
     *
     * Note: This is only a first attempt at putting right what goes wrong in
     * MessageFormat. Currently it only duplicates single quotes; it does not
     * test whether the string contains parameters (numbers in curly braces),
     * and it does not allow for strings containing already escaped quotes.
     *
     * @param str
     *            The string to format
     * @param arguments
     *            Arguments to use in formatting the string
     *
     * @return The formatted string
     */
    public static String format(String str, Object... arguments) {
        // MessageFormat treats a lone single quote as the start of a quoted
        // section; doubling it makes it come out as a literal quote.
        final String pattern = str.replace("'", "''");
        return MessageFormat.format(pattern, arguments);
    }

    /**
     * Normalize the
     * <a href="https://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms">
     * width</a> of characters in the supplied text. Specifically:
     * <ul>
     * <li>ASCII characters will become halfwidth
     * <li>Katakana characters will become fullwidth
     * <li>Hangul will become fullwidth
     * <li>Letter-like symbols and squared Latin abbreviations will be
     * decomposed to ASCII
     * </ul>
     * This method was adapted from <a href=
     * "https://bitbucket.org/okapiframework/okapi/src/52143104fcfc7eda204d04dfbbc273189f3a7f0f/okapi/steps/fullwidthconversion/src/main/java/net/sf/okapi/steps/fullwidthconversion/FullWidthConversionStep.java">
     * FullWidthConversionStep.java</a> in the Okapi Framework under GPLv2+.
     *
     * @param text
     * @return Normalized-width text
     */
    // CHECKSTYLE:OFF
    public static String normalizeWidth(String text) {
        StringBuilder sb = new StringBuilder(text);

        int ch;
        for ( int i=0; i<sb.length(); i++ ) {
            ch = sb.charAt(i);
            // ASCII
            if (( ch >= 0xFF01 ) && ( ch <= 0xFF5E )) {
                sb.setCharAt(i, (char)(ch-0xFEE0));
                continue;
            }
            if ( ch == 0x3000 ) {
                sb.setCharAt(i, ' ');
            }
            switch ( ch ) {
            // Katakana
            case 0xFF61: sb.setCharAt(i, (char)0x3002); break;
            case 0xFF62: sb.setCharAt(i, (char)0x300C); break;
            case 0xFF63: sb.setCharAt(i, (char)0x300D); break;
            case 0xFF64: sb.setCharAt(i, (char)0x3001); break;
            case 0xFF65: sb.setCharAt(i, (char)0x30FB); break;
            case 0xFF66: sb.setCharAt(i, (char)0x30F2); break;
            case 0xFF67: sb.setCharAt(i, (char)0x30A1); break;
            case 0xFF68: sb.setCharAt(i, (char)0x30A3); break;
            case 0xFF69: sb.setCharAt(i, (char)0x30A5); break;
            case 0xFF6A: sb.setCharAt(i, (char)0x30A7); break;
            case 0xFF6B: sb.setCharAt(i, (char)0x30A9); break;
            case 0xFF6C: sb.setCharAt(i, (char)0x30E3); break;
            case 0xFF6D: sb.setCharAt(i, (char)0x30E5); break;
            case 0xFF6E: sb.setCharAt(i, (char)0x30E7); break;
            case 0xFF6F: sb.setCharAt(i, (char)0x30C3); break;
            case 0xFF70: sb.setCharAt(i, (char)0x30FC); break;
            case 0xFF71: sb.setCharAt(i, (char)0x30A2); break;
            case 0xFF72: sb.setCharAt(i, (char)0x30A4); break;
            case 0xFF73: sb.setCharAt(i, (char)0x30A6); break;
            case 0xFF74: sb.setCharAt(i, (char)0x30A8); break;
            case 0xFF75: sb.setCharAt(i, (char)0x30AA); break;
            case 0xFF76: sb.setCharAt(i, (char)0x30AB); break;
            case 0xFF77: sb.setCharAt(i, (char)0x30AD); break;
            case 0xFF78: sb.setCharAt(i, (char)0x30AF); break;
            case 0xFF79: sb.setCharAt(i, (char)0x30B1); break;
            case 0xFF7A: sb.setCharAt(i, (char)0x30B3); break;
            case 0xFF7B: sb.setCharAt(i, (char)0x30B5); break;
            case 0xFF7C: sb.setCharAt(i, (char)0x30B7); break;
            case 0xFF7D: sb.setCharAt(i, (char)0x30B9); break;
            case 0xFF7E: sb.setCharAt(i, (char)0x30BB); break;
            case 0xFF7F: sb.setCharAt(i, (char)0x30BD); break;
            case 0xFF80: sb.setCharAt(i, (char)0x30BF); break;
            case 0xFF81: sb.setCharAt(i, (char)0x30C1); break;
            case 0xFF82: sb.setCharAt(i, (char)0x30C4); break;
            case 0xFF83: sb.setCharAt(i, (char)0x30C6); break;
            case 0xFF84: sb.setCharAt(i, (char)0x30C8); break;
            case 0xFF85: sb.setCharAt(i, (char)0x30CA); break;
            case 0xFF86: sb.setCharAt(i, (char)0x30CB); break;
            case 0xFF87: sb.setCharAt(i, (char)0x30CC); break;
            case 0xFF88: sb.setCharAt(i, (char)0x30CD); break;
            case 0xFF89: sb.setCharAt(i, (char)0x30CE); break;
            case 0xFF8A: sb.setCharAt(i, (char)0x30CF); break;
            case 0xFF8B: sb.setCharAt(i, (char)0x30D2); break;
            case 0xFF8C: sb.setCharAt(i, (char)0x30D5); break;
            case 0xFF8D: sb.setCharAt(i, (char)0x30D8); break;
            case 0xFF8E: sb.setCharAt(i, (char)0x30DB); break;
            case 0xFF8F: sb.setCharAt(i, (char)0x30DE); break;
            case 0xFF90: sb.setCharAt(i, (char)0x30DF); break;
            case 0xFF91: sb.setCharAt(i, (char)0x30E0); break;
            case 0xFF92: sb.setCharAt(i, (char)0x30E1); break;
            case 0xFF93: sb.setCharAt(i, (char)0x30E2); break;
            case 0xFF94: sb.setCharAt(i, (char)0x30E4); break;
            case 0xFF95: sb.setCharAt(i, (char)0x30E6); break;
            case 0xFF96: sb.setCharAt(i, (char)0x30E8); break;
            case 0xFF97: sb.setCharAt(i, (char)0x30E9); break;
            case 0xFF98: sb.setCharAt(i, (char)0x30EA); break;
            case 0xFF99: sb.setCharAt(i, (char)0x30EB); break;
            case 0xFF9A: sb.setCharAt(i, (char)0x30EC); break;
            case 0xFF9B: sb.setCharAt(i, (char)0x30ED); break;
            case 0xFF9C: sb.setCharAt(i, (char)0x30EF); break;
            case 0xFF9D: sb.setCharAt(i, (char)0x30F3); break;
            case 0xFF9E: sb.setCharAt(i, (char)0x3099); break;
            case 0xFF9F: sb.setCharAt(i, (char)0x309A); break;
            }
            // Hangul
            if (( ch > 0xFFA1 ) && ( ch <= 0xFFBE )) {
                sb.setCharAt(i, (char)(ch-0xCE70));
                continue;
            }
            switch ( ch ) {
            // Hangul
            case 0xFFA0: sb.setCharAt(i, (char)0x3164); break;
            case 0xFFDA: sb.setCharAt(i, (char)0x3161); break;
            case 0xFFDB: sb.setCharAt(i, (char)0x3162); break;
            case 0xFFDC: sb.setCharAt(i, (char)0x3163); break;
            // Others
            case 0xFFE8: sb.setCharAt(i, (char)0x2502); break;
            case 0xFFE9: sb.setCharAt(i, (char)0x2190); break;
            case 0xFFEA: sb.setCharAt(i, (char)0x2191); break;
            case 0xFFEB: sb.setCharAt(i, (char)0x2192); break;
            case 0xFFEC: sb.setCharAt(i, (char)0x2193); break;
            case 0xFFED: sb.setCharAt(i, (char)0x25A0); break;
            case 0xFFEE: sb.setCharAt(i, (char)0x25CB); break;
            }

            // Process letter-like symbols
            switch ( ch ) {
            case 0x2100: sb.setCharAt(i, 'a'); sb.insert(i+1, "/c"); i+=2; break;
            case 0x2101: sb.setCharAt(i, 'a'); sb.insert(i+1, "/s"); i+=2; break;
            case 0x2105: sb.setCharAt(i, 'c'); sb.insert(i+1, "/o"); i+=2; break;
            case 0x2103: sb.setCharAt(i, (char)0x00B0); sb.insert(i+1, "C"); i++; break;
            case 0x2109: sb.setCharAt(i, (char)0x00B0); sb.insert(i+1, "F"); i++; break;
            case 0x2116: sb.setCharAt(i, 'N'); sb.insert(i+1, "o"); i++; break;
            case 0x212A: sb.setCharAt(i, 'K'); break;
            case 0x212B: sb.setCharAt(i, (char)0x00C5); break;
            }

            switch ( ch ) {
            // Squared Latin Abbreviations 1
            case 0x3371: sb.setCharAt(i, 'h'); sb.insert(i+1, "Pa"); i+=2; break;
            case 0x3372: sb.setCharAt(i, 'd'); sb.insert(i+1, "a"); i++; break;
            case 0x3373: sb.setCharAt(i, 'A'); sb.insert(i+1, "U"); i++; break;
            case 0x3374: sb.setCharAt(i, 'b'); sb.insert(i+1, "ar"); i+=2; break;
            case 0x3375: sb.setCharAt(i, 'o'); sb.insert(i+1, "V"); i++; break;
            case 0x3376: sb.setCharAt(i, 'p'); sb.insert(i+1, "c"); i++; break;
            case 0x3377: sb.setCharAt(i, 'd'); sb.insert(i+1, "m"); i++; break;
            case 0x3378: sb.setCharAt(i, 'd'); sb.insert(i+1, "m\u00B2"); i+=2; break;
            case 0x3379: sb.setCharAt(i, 'd'); sb.insert(i+1, "m\u00B3"); i+=2; break;
            case 0x337A: sb.setCharAt(i, 'I'); sb.insert(i+1, "U"); i++; break;
            // Squared Latin Abbreviations 2
            case 0x3380: sb.setCharAt(i, 'p'); sb.insert(i+1, "A"); i++; break;
            case 0x3381: sb.setCharAt(i, 'n'); sb.insert(i+1, "A"); i++; break;
            case 0x3382: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "A"); i++; break;
            case 0x3383: sb.setCharAt(i, 'm'); sb.insert(i+1, "A"); i++; break;
            case 0x3384: sb.setCharAt(i, 'k'); sb.insert(i+1, "A"); i++; break;
            case 0x3385: sb.setCharAt(i, 'K'); sb.insert(i+1, "B"); i++; break;
            case 0x3386: sb.setCharAt(i, 'M'); sb.insert(i+1, "B"); i++; break;
            case 0x3387: sb.setCharAt(i, 'G'); sb.insert(i+1, "B"); i++; break;
            case 0x3388: sb.setCharAt(i, 'c'); sb.insert(i+1, "al"); i+=2; break;
            case 0x3389: sb.setCharAt(i, 'k'); sb.insert(i+1, "cal"); i+=3; break;
            case 0x338A: sb.setCharAt(i, 'p'); sb.insert(i+1, "F"); i++; break;
            case 0x338B: sb.setCharAt(i, 'n'); sb.insert(i+1, "F"); i++; break;
            case 0x338C: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "F"); i++; break;
            case 0x338D: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "g"); i++; break;
            case 0x338E: sb.setCharAt(i, 'm'); sb.insert(i+1, "g"); i++; break;
            case 0x338F: sb.setCharAt(i, 'k'); sb.insert(i+1, "g"); i++; break;
            case 0x3390: sb.setCharAt(i, 'H'); sb.insert(i+1, "z"); i++; break;
            case 0x3391: sb.setCharAt(i, 'k'); sb.insert(i+1, "Hz"); i+=2; break;
            case 0x3392: sb.setCharAt(i, 'M'); sb.insert(i+1, "Hz"); i+=2; break;
            case 0x3393: sb.setCharAt(i, 'G'); sb.insert(i+1, "Hz"); i+=2; break;
            case 0x3394: sb.setCharAt(i, 'T'); sb.insert(i+1, "Hz"); i+=2; break;
            case 0x3395: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "\u2113"); i++; break;
            case 0x3396: sb.setCharAt(i, 'm'); sb.insert(i+1, "\u2113"); i++; break;
            case 0x3397: sb.setCharAt(i, 'd'); sb.insert(i+1, "\u2113"); i++; break;
            case 0x3398: sb.setCharAt(i, 'k'); sb.insert(i+1, "\u2113"); i++; break;
            case 0x3399: sb.setCharAt(i, 'f'); sb.insert(i+1, "m"); i++; break;
            case 0x339A: sb.setCharAt(i, 'n'); sb.insert(i+1, "m"); i++; break;
            case 0x339B: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "m"); i++; break;
            case 0x339C: sb.setCharAt(i, 'm'); sb.insert(i+1, "m"); i++; break;
            case 0x339D: sb.setCharAt(i, 'c'); sb.insert(i+1, "m"); i++; break;
            case 0x339E: sb.setCharAt(i, 'k'); sb.insert(i+1, "m"); i++; break;
            case 0x339F: sb.setCharAt(i, 'm'); sb.insert(i+1, "m\u00B2"); i+=2; break;
            case 0x33A0: sb.setCharAt(i, 'c'); sb.insert(i+1, "m\u00B2"); i+=2; break;
            case 0x33A1: sb.setCharAt(i, 'm'); sb.insert(i+1, "\u00B2"); i++; break;
            case 0x33A2: sb.setCharAt(i, 'k'); sb.insert(i+1, "m\u00B2"); i+=2; break;
            case 0x33A3: sb.setCharAt(i, 'm'); sb.insert(i+1, "m\u00B3"); i+=2; break;
            case 0x33A4: sb.setCharAt(i, 'c'); sb.insert(i+1, "m\u00B3"); i+=2; break;
            case 0x33A5: sb.setCharAt(i, 'm'); sb.insert(i+1, "\u00B3"); i++; break;
            case 0x33A6: sb.setCharAt(i, 'k'); sb.insert(i+1, "m\u00B3"); i+=2; break;
            case 0x33A7: sb.setCharAt(i, 'm'); sb.insert(i+1, "/s"); i+=2; break;
            case 0x33A8: sb.setCharAt(i, 'm'); sb.insert(i+1, "/s\u00B2"); i+=3; break;
            case 0x33A9: sb.setCharAt(i, 'P'); sb.insert(i+1, "a"); i++; break;
            case 0x33AA: sb.setCharAt(i, 'k'); sb.insert(i+1, "Pa"); i+=2; break;
            case 0x33AB: sb.setCharAt(i, 'M'); sb.insert(i+1, "Pa"); i+=2; break;
            case 0x33AC: sb.setCharAt(i, 'G'); sb.insert(i+1, "Pa"); i+=2; break;
            case 0x33AD: sb.setCharAt(i, 'r'); sb.insert(i+1, "ad"); i+=2; break;
            case 0x33AE: sb.setCharAt(i, 'r'); sb.insert(i+1, "ad/s"); i+=4; break;
            case 0x33AF: sb.setCharAt(i, 'r'); sb.insert(i+1, "ad/s\u00B2"); i+=5; break;
            case 0x33B0: sb.setCharAt(i, 'p'); sb.insert(i+1, "s"); i++; break;
            case 0x33B1: sb.setCharAt(i, 'n'); sb.insert(i+1, "s"); i++; break;
            case 0x33B2: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "s"); i++; break;
            case 0x33B3: sb.setCharAt(i, 'm'); sb.insert(i+1, "s"); i++; break;
            case 0x33B4: sb.setCharAt(i, 'p'); sb.insert(i+1, "V"); i++; break;
            case 0x33B5: sb.setCharAt(i, 'n'); sb.insert(i+1, "V"); i++; break;
            case 0x33B6: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "V"); i++; break;
            case 0x33B7: sb.setCharAt(i, 'm'); sb.insert(i+1, "V"); i++; break;
            case 0x33B8: sb.setCharAt(i, 'k'); sb.insert(i+1, "V"); i++; break;
            case 0x33B9: sb.setCharAt(i, 'M'); sb.insert(i+1, "V"); i++; break;
            case 0x33BA: sb.setCharAt(i, 'p'); sb.insert(i+1, "W"); i++; break;
            case 0x33BB: sb.setCharAt(i, 'n'); sb.insert(i+1, "W"); i++; break;
            case 0x33BC: sb.setCharAt(i, (char)0x03BC); sb.insert(i+1, "W"); i++; break;
            case 0x33BD: sb.setCharAt(i, 'm'); sb.insert(i+1, "W"); i++; break;
            case 0x33BE: sb.setCharAt(i, 'k'); sb.insert(i+1, "W"); i++; break;
            case 0x33BF: sb.setCharAt(i, 'M'); sb.insert(i+1, "W"); i++; break;
            case 0x33C0: sb.setCharAt(i, 'k'); sb.insert(i+1, "\u03A9"); i++; break;
            case 0x33C1: sb.setCharAt(i, 'M'); sb.insert(i+1, "\u03A9"); i++; break;
            case 0x33C2: sb.setCharAt(i, 'a'); sb.insert(i+1, ".m."); i+=3; break;
            case 0x33C3: sb.setCharAt(i, 'B'); sb.insert(i+1, "q"); i++; break;
            case 0x33C4: sb.setCharAt(i, 'c'); sb.insert(i+1, "c"); i++; break;
            case 0x33C5: sb.setCharAt(i, 'c'); sb.insert(i+1, "d"); i++; break;
            case 0x33C6: sb.setCharAt(i, 'C'); sb.insert(i+1, "/kg"); i+=3; break;
            case 0x33C7: sb.setCharAt(i, 'C'); sb.insert(i+1, "o."); i+=2; break;
            case 0x33C8: sb.setCharAt(i, 'd'); sb.insert(i+1, "B"); i++; break;
            case 0x33C9: sb.setCharAt(i, 'G'); sb.insert(i+1, "y"); i++; break;
            case 0x33CA: sb.setCharAt(i, 'h'); sb.insert(i+1, "a"); i++; break;
            case 0x33CB: sb.setCharAt(i, 'H'); sb.insert(i+1, "P"); i++; break;
            case 0x33CC: sb.setCharAt(i, 'i'); sb.insert(i+1, "n"); i++; break;
            case 0x33CD: sb.setCharAt(i, 'K'); sb.insert(i+1, "K"); i++; break;
            case 0x33CE: sb.setCharAt(i, 'K'); sb.insert(i+1, "M"); i++; break;
            case 0x33CF: sb.setCharAt(i, 'K'); sb.insert(i+1, "t"); i++; break;
            case 0x33D0: sb.setCharAt(i, 'l'); sb.insert(i+1, "m"); i++; break;
            case 0x33D1: sb.setCharAt(i, 'l'); sb.insert(i+1, "n"); i++; break;
            case 0x33D2: sb.setCharAt(i, 'l'); sb.insert(i+1, "og"); i+=2; break;
            case 0x33D3: sb.setCharAt(i, 'l'); sb.insert(i+1, "x"); i++; break;
            case 0x33D4: sb.setCharAt(i, 'm'); sb.insert(i+1, "b"); i++; break;
            case 0x33D5: sb.setCharAt(i, 'm'); sb.insert(i+1, "il"); i+=2; break;
            case 0x33D6: sb.setCharAt(i, 'm'); sb.insert(i+1, "ol"); i+=2; break;
            case 0x33D7: sb.setCharAt(i, 'p'); sb.insert(i+1, "H"); i++; break;
            case 0x33D8: sb.setCharAt(i, 'p'); sb.insert(i+1, ".m."); i+=3; break;
            case 0x33D9: sb.setCharAt(i, 'P'); sb.insert(i+1, "PM"); i+=2; break;
            case 0x33DA: sb.setCharAt(i, 'P'); sb.insert(i+1, "R"); i++; break;
            case 0x33DB: sb.setCharAt(i, 's'); sb.insert(i+1, "r"); i++; break;
            case 0x33DC: sb.setCharAt(i, 'S'); sb.insert(i+1, "v"); i++; break;
            case 0x33DD: sb.setCharAt(i, 'W'); sb.insert(i+1, "b"); i++; break;
            case 0x33DE: sb.setCharAt(i, 'v'); sb.insert(i+1, "/m"); i+=2; break;
            case 0x33DF: sb.setCharAt(i, 'a'); sb.insert(i+1, "/m"); i+=2; break;
            // Squared Latin Abbreviations 3
            case 0x33FF: sb.setCharAt(i, 'g'); sb.insert(i+1, "al"); i+=2; break;
            }
        }

        String result = sb.toString();

        if (text.equals(result)) {
            // No characters were changed. Return the original text so that
            // composition of unrelated characters is not affected.
            return text;
        }

        return normalizeUnicode(result);
    }
    // CHECKSTYLE:ON

    /**
     * Remove trailing whitespace from a string.
     * <p>
     * The string is scanned backwards one code point at a time, so
     * supplementary characters (surrogate pairs) are handled as single units.
     * Whitespace is decided by {@link Character#isWhitespace(int)}; the
     * additional non-breaking whitespace recognized by
     * {@link #isWhiteSpace(int)} is therefore <em>not</em> stripped.
     *
     * @param text
     *            the string to strip; must not be <code>null</code>
     * @return <code>text</code> without its trailing whitespace, or the empty
     *         string if <code>text</code> is empty or consists only of
     *         whitespace
     */
    public static String rstrip(String text) {
        // Exclusive end index of the part of the text that will be kept.
        int end = text.length();
        while (end > 0) {
            int codePoint = text.codePointBefore(end);
            if (!Character.isWhitespace(codePoint)) {
                // Found the last non-whitespace code point.
                break;
            }
            // Step back over one or two chars depending on the code point.
            end -= Character.charCount(codePoint);
        }
        if (end == 0) {
            return "";
        }
        return text.substring(0, end);
    }

    /**
     * Encode raw bytes as a Base64 String. This is a thin wrapper around
     * {@link DatatypeConverter#printBase64Binary(byte[])} (available since
     * Java 1.6), which is otherwise easy to overlook.
     *
     * @param bytes
     *            the data to encode
     * @return the Base64 representation of <code>bytes</code>
     */
    private static String encodeBase64(byte[] bytes) {
        final String encoded = DatatypeConverter.printBase64Binary(bytes);
        return encoded;
    }

    /**
     * Encode a string as Base64. The string is first converted to bytes using
     * the given <code>charset</code>, and those bytes are then Base64-encoded.
     *
     * @param string
     *            the string to encode
     * @param charset
     *            the charset used to turn the string into bytes
     * @return the Base64 representation of the string's bytes
     */
    public static String encodeBase64(String string, Charset charset) {
        final byte[] rawBytes = string.getBytes(charset);
        return encodeBase64(rawBytes);
    }

    /**
     * Convert a char array's <code>charset</code> bytes into a Base64-encoded
     * String. Useful for handling passwords.
     * <p>
     * Only the bytes actually produced by the encoder are Base64-encoded; the
     * buffer returned by {@link Charset#encode(CharBuffer)} may be backed by
     * an array larger than the encoded data, and that slack must not leak
     * into the result.
     * <p>
     * <strong>Side effect:</strong> the supplied <code>chars</code> array is
     * overwritten with <code>'\0'</code> before this method returns, as are
     * the intermediate byte buffers. The returned String itself is not (and
     * cannot be) cleared.
     *
     * @param chars
     *            a char array; its content is zeroed on return
     * @param charset
     *            the charset with which to obtain the bytes
     * @return Base64-encoded String
     */
    public static String encodeBase64(char[] chars, Charset charset) {
        CharBuffer charBuf = CharBuffer.wrap(chars);
        ByteBuffer byteBuf = charset.encode(charBuf);

        // Copy out exactly the encoded region (position..limit). Using
        // byteBuf.array() directly would also encode any unused capacity
        // at the end of the backing array as spurious zero bytes.
        byte[] encoded = new byte[byteBuf.remaining()];
        byteBuf.get(encoded);

        String result = encodeBase64(encoded);

        // Wipe every buffer that held the sensitive data. charBuf wraps
        // chars, so this clears the caller's array as before.
        Arrays.fill(chars, '\0');
        Arrays.fill(encoded, (byte) 0);
        if (byteBuf.hasArray()) {
            Arrays.fill(byteBuf.array(), (byte) 0);
        }
        return result;
    }

    /**
     * Decode a Base64 String into raw bytes. This is a thin wrapper around
     * {@link DatatypeConverter#parseBase64Binary(String)} (available since
     * Java 1.6), which is otherwise easy to overlook.
     *
     * @param b64data
     *            the Base64 text to decode
     * @return the decoded bytes
     */
    private static byte[] decodeBase64(String b64data) {
        final byte[] decoded = DatatypeConverter.parseBase64Binary(b64data);
        return decoded;
    }

    /**
     * Decode Base64 text back into a String. The Base64 data is first decoded
     * to bytes, which are then interpreted using the given
     * <code>charset</code>.
     *
     * @param b64data
     *            the Base64 text to decode
     * @param charset
     *            the charset in which the decoded bytes are expressed
     * @return the decoded String
     */
    public static String decodeBase64(String b64data, Charset charset) {
        final byte[] rawBytes = decodeBase64(b64data);
        return new String(rawBytes, charset);
    }
}