/* Copyright (c) 2008 Google Inc. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.google.gdata.util.common.base; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.OutputStream; import java.io.StringWriter; import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import java.util.StringTokenizer; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Some common string manipulation utilities. */ public class StringUtil { public static final String EMPTY_STRING = ""; // \u3000 is the double-byte space character in UTF-8 // \u00A0 is the non-breaking space character ( ) // \u2007 is the figure space character ( ) // \u202F is the narrow non-breaking space character ( ) public static final String WHITE_SPACES = " \r\n\t\u3000\u00A0\u2007\u202F"; public static final String LINE_BREAKS = "\r\n"; private static final Pattern htmlTagPattern = Pattern.compile("</?[a-zA-Z][^>]*>"); private static final Pattern characterReferencePattern = Pattern.compile("&#?[a-zA-Z0-9]{1,8};"); private static final Pattern dbSpecPattern = Pattern.compile("(.*)\\{(\\d+),(\\d+)\\}(.*)"); // This class should not be instantiated, hence the private constructor private StringUtil() {} /** Split "str" by run of delimiters and return. */ public static String[] split(String str, String delims) { return split(str, delims, false); } /** * Split "str" into tokens by delimiters and optionally remove white spaces * from the splitted tokens. * * @param trimTokens if true, then trim the tokens */ public static String[] split(String str, String delims, boolean trimTokens) { StringTokenizer tokenizer = new StringTokenizer(str, delims); int n = tokenizer.countTokens(); String[] list = new String[n]; for (int i = 0; i < n; i++) { if (trimTokens) { list[i] = tokenizer.nextToken().trim(); } else { list[i] = tokenizer.nextToken(); } } return list; } /** * Short hand for <code>split(str, delims, true)</code> */ public static String[] splitAndTrim(String str, String delims) { return split(str, delims, true); } /** Parse comma-separated list of ints and return as array. */ public static int[] splitInts(String str) throws IllegalArgumentException { StringTokenizer tokenizer = new StringTokenizer(str, ","); int n = tokenizer.countTokens(); int[] list = new int[n]; for (int i = 0; i < n; i++) { String token = tokenizer.nextToken(); list[i] = Integer.parseInt(token); } return list; } /** Parse comma-separated list of longs and return as array. */ public static long[] splitLongs(String str) throws IllegalArgumentException { StringTokenizer tokenizer = new StringTokenizer(str, ","); int n = tokenizer.countTokens(); long[] list = new long[n]; for (int i = 0; i < n; i++) { String token = tokenizer.nextToken(); list[i] = Long.parseLong(token); } return list; } /** * Concatenates the given int[] array into one String, inserting a delimiter * between each pair of elements. */ public static String joinInts(int[] tokens, String delimiter) { if (tokens == null) return ""; StringBuilder result = new StringBuilder(); for (int i = 0; i < tokens.length; i++) { if (i > 0 && delimiter != null) { result.append(delimiter); } result.append(String.valueOf(tokens[i])); } return result.toString(); } /** * Concatenates the given long[] array into one String, inserting a delimiter * between each pair of elements. */ public static String joinLongs(long[] tokens, String delimiter) { if (tokens == null) return ""; StringBuilder result = new StringBuilder(); for (int i = 0; i < tokens.length; i++) { if (i > 0 && delimiter != null) { result.append(delimiter); } result.append(String.valueOf(tokens[i])); } return result.toString(); } /** * Concatenates the String representations of the elements of a * String[] array into one String, and inserts a delimiter between * each pair of elements. * <p> * This includes the String[] case, because if s is a String, then * s.toString() returns s. * * @deprecated Please use * But note that {@code Join} does not consider null elements to be * equivalent to the empty string, as this method does. */ @Deprecated public static String join(Object[] tokens, String delimiter) { if (tokens == null || tokens.length == 0) return ""; StringBuilder result = new StringBuilder(); for (int i = 0; i < tokens.length; i++) { if (i > 0 && delimiter != null) result.append(delimiter); if (tokens[i] != null) result.append(tokens[i].toString()); } return result.toString(); } /** * Same as {@link #join(Object[],String)}, but takes a {@link Collection} * instead. * * @deprecated Please use * But note that {@code Join} does not consider null elements to be * equivalent to the empty string, as this method does. */ @Deprecated public static String join(Collection tokens, String delimiter) { return join(tokens.toArray(), delimiter); } /** This replaces the occurances of 'what' in 'str' with 'with' * @param str - the string o process * @param what - to replace * @param with - replace with this * @return String str whete 'what' was repalced with 'with' * * @deprecated Please use {@link String#replace(CharSequence, CharSequence)}. */ @Deprecated public static String replace(String str, String what, String with) { // Have to check this argument, for compatibility with the old impl. // For the record, String.replace() is capable of handling an empty target // string... but it does something kind of weird in that case. assert(what.length() > 0); return str.replace(what, with); } /** * Reformats the given string to a fixed width by inserting * carriage returns and trimming unnecessary whitespace. * * @param str the string to format * @param width the fixed width (in characters) */ public static String fixedWidth(String str, int width) { String[] lines = split(str, "\n"); return fixedWidth(lines, width); } /** * Reformats the given array of lines to a fixed width by inserting * carriage returns and trimming unnecessary whitespace. * * @param lines - array of lines to format * @param width - the fixed width (in characters) */ public static String fixedWidth(String[] lines, int width) { StringBuilder formatStr = new StringBuilder(); for (int i = 0; i < lines.length; i++) { int curWidth = 0; if (i != 0) { formatStr.append("\n"); } // a small optimization if (lines[i].length() <= width) { formatStr.append(lines[i]); continue; } String[] words = splitAndTrim(lines[i], WHITE_SPACES); for (int j = 0; j < words.length; j++) { if (curWidth == 0 || (curWidth + words[j].length()) < width) { // add a space if we're not at the beginning of a line if (curWidth != 0) { formatStr.append(" "); curWidth += 1; } curWidth += words[j].length(); formatStr.append(words[j]); } else { formatStr.append("\n"); curWidth = words[j].length(); formatStr.append(words[j]); } } } return formatStr.toString(); } /** * Inserts spaces every splitLen characters so that the string will wrap. * * @param lineLen the length of the substrings to separate with spaces. * @param original the original String * * @return original String with spaces inserted every lineLen characters. * */ public static String insertBreakingWhitespace(int lineLen, String original) { if (original == null || lineLen <= 0) throw new IllegalArgumentException(); int length = original.length(); if (length <= lineLen) // we can avoid the overhead of instantiating a StringBuilder return original; int currPos = 0; StringBuilder retval = new StringBuilder(); while (length - currPos > lineLen) { retval.append(original.substring(currPos, currPos + lineLen)); currPos += lineLen; retval.append(" "); } retval.append(original.substring(currPos, length)); return retval.toString(); } /** * Indents the given String per line. * @param iString The string to indent. * @param iIndentDepth The depth of the indentation. * @return The indented string. */ public static String indent(String iString, int iIndentDepth) { StringBuilder spacer = new StringBuilder(); spacer.append("\n"); for (int i = 0; i < iIndentDepth; i ++) { spacer.append(" "); } return replace(iString, "\n", spacer.toString()); } /** * This is a both way strip * * @param str the string to strip * @param left strip from left * @param right strip from right * @param what character(s) to strip * @return the stripped string * @deprecated ensure the string is not null and use * <ul> * <li> {@code CharMatcher.anyOf(what).trimFrom(str)} * if {@code left == true} and {@code right == true} * <li> {@code CharMatcher.anyOf(what).trimLeadingFrom(str)} * if {@code left == true} and {@code right == false} * <li> {@code CharMatcher.anyOf(what).trimTrailingFrom(str)} * if {@code left == false} and {@code right == true} * </ul> */ @Deprecated public static String megastrip(String str, boolean left, boolean right, String what) { if (str == null) { return null; } int limitLeft = 0; int limitRight = str.length() - 1; while (left && limitLeft <= limitRight && what.indexOf(str.charAt(limitLeft)) >= 0) { limitLeft ++; } while (right && limitRight>=limitLeft && what.indexOf(str.charAt(limitRight)) >= 0) { limitRight --; } return str.substring(limitLeft, limitRight + 1); } /** lstrip - strips spaces from left * @param str what to strip * @return String the striped string * @deprecated ensure the string is not null and use {@code * CharMatcher.LEGACY_WHITESPACE.trimLeadingFrom(str)}; also consider whether you * really want the legacy whitespace definition, or something more * standard like {@link CharMatcher#WHITESPACE}. */ @Deprecated public static String lstrip(String str) { return (str == null) ? null : CharMatcher.LEGACY_WHITESPACE.trimLeadingFrom(str); } /** rstrip - strips spaces from right * @param str what to strip * @return String the striped string * @deprecated ensure the string is not null and use {@code * CharMatcher.LEGACY_WHITESPACE.trimTrailingFrom(str)}; also consider whether you * really want the legacy whitespace definition, or something more * standard like {@link CharMatcher#WHITESPACE}. */ @Deprecated public static String rstrip(String str) { return (str == null) ? null : CharMatcher.LEGACY_WHITESPACE.trimTrailingFrom(str); } /** strip - strips both ways * @param str what to strip * @return String the striped string */ public static String strip(String str) { return megastrip(str, true, true, WHITE_SPACES); } /** Strip white spaces from both end, and collapse white spaces * in the middle. * @param str what to strip * @return String the striped and collapsed string */ public static String stripAndCollapse(String str) { return collapseWhitespace(strip(str)); } /** * Give me a string and a potential prefix, and I return the string * following the prefix if the prefix matches, else null. * Analogous to the c++ functions strprefix and var_strprefix. */ public static String stripPrefix(String str, String prefix) { return str.startsWith(prefix) ? str.substring(prefix.length()) : null; } /** * Case insensitive version of stripPrefix. * Analogous to the c++ functions strcaseprefix and var_strcaseprefix. */ public static String stripPrefixIgnoreCase(String str, String prefix) { if (str.length() >= prefix.length() && str.substring(0, prefix.length()).equalsIgnoreCase(prefix)) { return str.substring(prefix.length()); } return null; } /** * Strips all non-digit characters from a string. * * The resulting string will only contain characters for which isDigit() * returns true. * * @param str the string to strip * @return a string consisting of digits only, or an empty string */ public static String stripNonDigits(String str) { StringBuffer result = new StringBuffer(str.length()); for (char candidate : str.toCharArray()) { if (Character.isDigit(candidate)) { result.append(candidate); } } return result.toString(); } /** * Counts the number of (not necessarily distinct) characters in the * string that also happen to be in 'chars' */ public static int numSharedChars(final String str, final String chars) { if (str == null || chars == null) { return 0; } int total = 0, pos = -1; while ((pos = indexOfChars(str, chars, pos + 1)) != -1) { total++; } return total; } /** * Like String.indexOf() except that it will look for any of the * characters in 'chars' (similar to C's strpbrk) */ public static int indexOfChars(String str, String chars, int fromIndex) { final int len = str.length(); for (int pos = fromIndex; pos < len; pos++) { if (chars.indexOf(str.charAt(pos)) >= 0) { return pos; } } return -1; } /** * Like String.indexOf() except that it will look for any of the * characters in 'chars' (similar to C's strpbrk) */ public static int indexOfChars(String str, String chars) { return indexOfChars(str, chars, 0); } /** * Finds the last index in str of a character not in the characters * in 'chars' (similar to ANSI string.find_last_not_of). * * Returns -1 if no such character can be found. */ public static int lastIndexNotOf(String str, String chars, int fromIndex) { fromIndex = Math.min(fromIndex, str.length() - 1); for (int pos = fromIndex; pos >= 0; pos--) { if (chars.indexOf(str.charAt(pos)) < 0) { return pos; } } return -1; } /** * Like String.replace() except that it accepts any number of old chars. * Replaces any occurrances of 'oldchars' in 'str' with 'newchar'. * Example: replaceChars("Hello, world!", "H,!", ' ') returns " ello world " */ public static String replaceChars(String str, String oldchars, char newchar) { int pos = indexOfChars(str, oldchars); if (pos == -1) { return str; } StringBuilder buf = new StringBuilder(str); do { buf.setCharAt(pos, newchar); pos = indexOfChars(str, oldchars, pos + 1); } while (pos != -1); return buf.toString(); } /** * Remove any occurrances of 'oldchars' in 'str'. * Example: removeChars("Hello, world!", ",!") returns "Hello world" */ public static String removeChars(String str, String oldchars) { int pos = indexOfChars(str, oldchars); if (pos == -1) { return str; } StringBuilder buf = new StringBuilder(); int start = 0; do { buf.append(str.substring(start, pos)); start = pos + 1; pos = indexOfChars(str, oldchars, start); } while (pos != -1); if (start < str.length()) { buf.append(str.substring(start)); } return buf.toString(); } /** * Removes all characters from 'str' that are not in 'retainChars'. * Example: retainAllChars("Hello, world!", "lo") returns "llool" */ public static String retainAllChars(String str, String retainChars) { int pos = indexOfChars(str, retainChars); if (pos == -1) { return ""; } StringBuilder buf = new StringBuilder(); do { buf.append(str.charAt(pos)); pos = indexOfChars(str, retainChars, pos + 1); } while (pos != -1); return buf.toString(); } /** * Replaces microsoft "smart quotes" (curly " and ') with their * ascii counterparts. */ public static String replaceSmartQuotes(String str) { // See http://www.microsoft.com/typography/unicode/1252.htm str = replaceChars(str, "\u0091\u0092\u2018\u2019", '\''); str = replaceChars(str, "\u0093\u0094\u201c\u201d", '"'); return str; } /** * Convert a string of hex digits to a byte array, with the first * byte in the array being the MSB. The string passed in should be * just the raw digits (upper or lower case), with no leading * or trailing characters (like '0x' or 'h'). * An odd number of characters is supported. * If the string is empty, an empty array will be returned. * * This is significantly faster than using * new BigInteger(str, 16).toByteArray(); * especially with larger strings. Here are the results of some * microbenchmarks done on a P4 2.8GHz 2GB RAM running * linux 2.4.22-gg11 and JDK 1.5 with an optimized build: * * String length hexToBytes (usec) BigInteger * ----------------------------------------------------- * 16 0.570 1.43 * 256 8.21 44.4 * 1024 32.8 526 * 16384 546 121000 */ public static byte[] hexToBytes(String str) { byte[] bytes = new byte[(str.length() + 1) / 2]; if (str.length() == 0) { return bytes; } bytes[0] = 0; int nibbleIdx = (str.length() % 2); for (int i = 0; i < str.length(); i++) { char c = str.charAt(i); if (!isHex(c)) { throw new IllegalArgumentException("string contains non-hex chars"); } if ((nibbleIdx % 2) == 0) { bytes[nibbleIdx >> 1] = (byte) (hexValue(c) << 4); } else { bytes[nibbleIdx >> 1] += (byte) hexValue(c); } nibbleIdx++; } return bytes; } /** * Converts any instances of "\r" or "\r\n" style EOLs into "\n" (Line Feed). */ public static String convertEOLToLF(String input) { StringBuilder res = new StringBuilder(input.length()); char[] s = input.toCharArray(); int from = 0; final int end = s.length; for (int i = 0; i < end; i++) { if (s[i] == '\r') { res.append(s, from, i - from); res.append('\n'); if (i + 1 < end && s[i + 1] == '\n') { i++; } from = i + 1; } } if (from == 0) { // no \r! return input; } res.append(s, from, end - from); return res.toString(); } /** @deprecated Please inline this method. */ @Deprecated public static String convertEOLToCRLF(String input) { return input.replaceAll("(\r\n|\r|\n)", "\r\n"); } /** * Returns a string consisting of "s", plus enough copies of "pad_ch" on the * left hand side to make the length of "s" equal to or greater than len (if * "s" is already longer than "len", then "s" is returned). */ public static String padLeft(String s, int len, char pad_ch) { if (s.length() >= len) { return s; } else { StringBuilder sb = new StringBuilder(); int n = len - s.length(); for (int i = 0; i < n; i++) { sb.append(pad_ch); } sb.append(s); return sb.toString(); } } /** * Returns a string consisting of "s", plus enough copies of "pad_ch" on the * right hand side to make the length of "s" equal to or greater than len (if * "s" is already longer than "len", then "s" is returned). */ public static String padRight(String s, int len, char pad_ch) { if (s.length() >= len) { return s; } else { StringBuilder sb = new StringBuilder(); int n = len - s.length(); sb.append(s); for (int i = 0; i < n; i++) { sb.append(pad_ch); } return sb.toString(); } } /** * Returns a string consisting of "s", with each of the first "len" characters * replaced by "mask_ch" character. */ public static String maskLeft(String s, int len, char mask_ch) { if (len <= 0) { return s; } len = Math.min(len, s.length()); StringBuilder sb = new StringBuilder(); for (int i = 0; i < len; i++) { sb.append(mask_ch); } sb.append(s.substring(len)); return sb.toString(); } /** * Returns a string consisting of "s", with each of the last "len" characters * replaces by "mask_ch" character. */ public static String maskRight(String s, int len, char mask_ch) { if (len <= 0) { return s; } len = Math.min(len, s.length()); StringBuilder sb = new StringBuilder(); sb.append(s.substring(0, s.length() - len)); for (int i = 0; i < len; i++) { sb.append(mask_ch); } return sb.toString(); } private static boolean isOctal(char c) { return (c >= '0') && (c <= '7'); } private static boolean isHex(char c) { return ((c >= '0') && (c <= '9')) || ((c >= 'a') && (c <= 'f')) || ((c >= 'A') && (c <= 'F')); } private static int hexValue(char c) { if ((c >= '0') && (c <= '9')) { return (c - '0'); } else if ((c >= 'a') && (c <= 'f')) { return (c - 'a') + 10; } else { return (c - 'A') + 10; } } /** * Unescape any C escape sequences (\n, \r, \\, \ooo, etc) and return the * resulting string. */ public static String unescapeCString(String s) { if (s.indexOf('\\') < 0) { // Fast path: nothing to unescape return s; } StringBuilder sb = new StringBuilder(); int len = s.length(); for (int i = 0; i < len;) { char c = s.charAt(i++); if (c == '\\' && (i < len)) { c = s.charAt(i++); switch (c) { case 'a': c = '\007'; break; case 'b': c = '\b'; break; case 'f': c = '\f'; break; case 'n': c = '\n'; break; case 'r': c = '\r'; break; case 't': c = '\t'; break; case 'v': c = '\013'; break; case '\\': c = '\\'; break; case '?': c = '?'; break; case '\'': c = '\''; break; case '"': c = '\"'; break; default: { if ((c == 'x') && (i < len) && isHex(s.charAt(i))) { // "\xXX" int v = hexValue(s.charAt(i++)); if ((i < len) && isHex(s.charAt(i))) { v = v*16 + hexValue(s.charAt(i++)); } c = (char)v; } else if (isOctal(c)) { // "\OOO" int v = (c - '0'); if ((i < len) && isOctal(s.charAt(i))) { v = v*8 + (s.charAt(i++) - '0'); } if ((i < len) && isOctal(s.charAt(i))) { v = v*8 + (s.charAt(i++) - '0'); } c = (char)v; } else { // Propagate unknown escape sequences. sb.append('\\'); } break; } } } sb.append(c); } return sb.toString(); } /** * Unescape any MySQL escape sequences. * See MySQL language reference Chapter 6 at * <a href="http://www.mysql.com/doc/">http://www.mysql.com/doc/</a>. * This function will <strong>not</strong> work for other SQL-like * dialects. * @param s string to unescape, with the surrounding quotes. * @return unescaped string, without the surrounding quotes. * @exception IllegalArgumentException if s is not a valid MySQL string. */ public static String unescapeMySQLString(String s) throws IllegalArgumentException { // note: the same buffer is used for both reading and writing // it works because the writer can never outrun the reader char chars[] = s.toCharArray(); // the string must be quoted 'like this' or "like this" if (chars.length < 2 || chars[0] != chars[chars.length-1] || (chars[0] != '\'' && chars[0] != '"')) { throw new IllegalArgumentException("not a valid MySQL string: " + s); } // parse the string and decode the backslash sequences; in addition, // quotes can be escaped 'like this: ''', "like this: """, or 'like this: "' int j = 1; // write position in the string (never exceeds read position) int f = 0; // state: 0 (normal), 1 (backslash), 2 (quote) for (int i = 1; i < chars.length - 1; i++) { if (f == 0) { // previous character was normal if (chars[i] == '\\') { f = 1; // backslash } else if (chars[i] == chars[0]) { f = 2; // quoting character } else { chars[j++] = chars[i]; } } else if (f == 1) { // previous character was a backslash switch (chars[i]) { case '0': chars[j++] = '\0'; break; case '\'': chars[j++] = '\''; break; case '"': chars[j++] = '"'; break; case 'b': chars[j++] = '\b'; break; case 'n': chars[j++] = '\n'; break; case 'r': chars[j++] = '\r'; break; case 't': chars[j++] = '\t'; break; case 'z': chars[j++] = '\032'; break; case '\\': chars[j++] = '\\'; break; default: // if the character is not special, backslash disappears chars[j++] = chars[i]; break; } f = 0; } else { // previous character was a quote // quoting characters must be doubled inside a string if (chars[i] != chars[0]) { throw new IllegalArgumentException("not a valid MySQL string: " + s); } chars[j++] = chars[0]; f = 0; } } // string contents cannot end with a special character if (f != 0) { throw new IllegalArgumentException("not a valid MySQL string: " + s); } // done return new String(chars, 1, j - 1); } static Map<String, Character> escapeStrings; static { // HTML character entity references as defined in HTML 4 // see http://www.w3.org/TR/REC-html40/sgml/entities.html escapeStrings = new HashMap<String, Character>(252); escapeStrings.put(" ", new Character('\u00A0')); escapeStrings.put("¡", new Character('\u00A1')); escapeStrings.put("¢", new Character('\u00A2')); escapeStrings.put("£", new Character('\u00A3')); escapeStrings.put("¤", new Character('\u00A4')); escapeStrings.put("¥", new Character('\u00A5')); escapeStrings.put("¦", new Character('\u00A6')); escapeStrings.put("§", new Character('\u00A7')); escapeStrings.put("¨", new Character('\u00A8')); escapeStrings.put("©", new Character('\u00A9')); escapeStrings.put("ª", new Character('\u00AA')); escapeStrings.put("«", new Character('\u00AB')); escapeStrings.put("¬", new Character('\u00AC')); escapeStrings.put("­", new Character('\u00AD')); escapeStrings.put("®", new Character('\u00AE')); escapeStrings.put("¯", new Character('\u00AF')); escapeStrings.put("°", new Character('\u00B0')); escapeStrings.put("±", new Character('\u00B1')); escapeStrings.put("²", new Character('\u00B2')); escapeStrings.put("³", new Character('\u00B3')); escapeStrings.put("´", new Character('\u00B4')); escapeStrings.put("µ", new Character('\u00B5')); escapeStrings.put("¶", new Character('\u00B6')); escapeStrings.put("·", new Character('\u00B7')); escapeStrings.put("¸", new Character('\u00B8')); escapeStrings.put("¹", new Character('\u00B9')); escapeStrings.put("º", new Character('\u00BA')); escapeStrings.put("»", new Character('\u00BB')); escapeStrings.put("¼", new Character('\u00BC')); escapeStrings.put("½", new Character('\u00BD')); escapeStrings.put("¾", new Character('\u00BE')); escapeStrings.put("¿", new Character('\u00BF')); escapeStrings.put("À", new Character('\u00C0')); escapeStrings.put("Á", new Character('\u00C1')); escapeStrings.put("Â", new Character('\u00C2')); escapeStrings.put("Ã", new Character('\u00C3')); escapeStrings.put("Ä", new Character('\u00C4')); escapeStrings.put("Å", new Character('\u00C5')); escapeStrings.put("Æ", new Character('\u00C6')); escapeStrings.put("Ç", new Character('\u00C7')); escapeStrings.put("È", new Character('\u00C8')); escapeStrings.put("É", new Character('\u00C9')); escapeStrings.put("Ê", new Character('\u00CA')); escapeStrings.put("Ë", new Character('\u00CB')); escapeStrings.put("Ì", new Character('\u00CC')); escapeStrings.put("Í", new Character('\u00CD')); escapeStrings.put("Î", new Character('\u00CE')); escapeStrings.put("Ï", new Character('\u00CF')); escapeStrings.put("Ð", new Character('\u00D0')); escapeStrings.put("Ñ", new Character('\u00D1')); escapeStrings.put("Ò", new Character('\u00D2')); escapeStrings.put("Ó", new Character('\u00D3')); escapeStrings.put("Ô", new Character('\u00D4')); escapeStrings.put("Õ", new Character('\u00D5')); escapeStrings.put("Ö", new Character('\u00D6')); escapeStrings.put("×", new Character('\u00D7')); escapeStrings.put("Ø", new Character('\u00D8')); escapeStrings.put("Ù", new Character('\u00D9')); escapeStrings.put("Ú", new Character('\u00DA')); escapeStrings.put("Û", new Character('\u00DB')); escapeStrings.put("Ü", new Character('\u00DC')); escapeStrings.put("Ý", new Character('\u00DD')); escapeStrings.put("Þ", new Character('\u00DE')); escapeStrings.put("ß", new Character('\u00DF')); escapeStrings.put("à", new Character('\u00E0')); escapeStrings.put("á", new Character('\u00E1')); escapeStrings.put("â", new Character('\u00E2')); escapeStrings.put("ã", new Character('\u00E3')); escapeStrings.put("ä", new Character('\u00E4')); escapeStrings.put("å", new Character('\u00E5')); escapeStrings.put("æ", new Character('\u00E6')); escapeStrings.put("ç", new Character('\u00E7')); escapeStrings.put("è", new Character('\u00E8')); escapeStrings.put("é", new Character('\u00E9')); escapeStrings.put("ê", new Character('\u00EA')); escapeStrings.put("ë", new Character('\u00EB')); escapeStrings.put("ì", new Character('\u00EC')); escapeStrings.put("í", new Character('\u00ED')); escapeStrings.put("î", new Character('\u00EE')); escapeStrings.put("ï", new Character('\u00EF')); escapeStrings.put("ð", new Character('\u00F0')); escapeStrings.put("ñ", new Character('\u00F1')); escapeStrings.put("ò", new Character('\u00F2')); escapeStrings.put("ó", new Character('\u00F3')); escapeStrings.put("ô", new Character('\u00F4')); escapeStrings.put("õ", new Character('\u00F5')); escapeStrings.put("ö", new Character('\u00F6')); escapeStrings.put("÷", new Character('\u00F7')); escapeStrings.put("ø", new Character('\u00F8')); escapeStrings.put("ù", new Character('\u00F9')); escapeStrings.put("ú", new Character('\u00FA')); escapeStrings.put("û", new Character('\u00FB')); escapeStrings.put("ü", new Character('\u00FC')); escapeStrings.put("ý", new Character('\u00FD')); escapeStrings.put("þ", new Character('\u00FE')); escapeStrings.put("ÿ", new Character('\u00FF')); escapeStrings.put("ƒ", new Character('\u0192')); escapeStrings.put("Α", new Character('\u0391')); escapeStrings.put("Β", new Character('\u0392')); escapeStrings.put("Γ", new Character('\u0393')); escapeStrings.put("Δ", new Character('\u0394')); escapeStrings.put("Ε", new Character('\u0395')); escapeStrings.put("Ζ", new Character('\u0396')); escapeStrings.put("Η", new Character('\u0397')); escapeStrings.put("Θ", new Character('\u0398')); escapeStrings.put("Ι", new Character('\u0399')); escapeStrings.put("Κ", new Character('\u039A')); escapeStrings.put("Λ", new Character('\u039B')); escapeStrings.put("Μ", new Character('\u039C')); escapeStrings.put("Ν", new Character('\u039D')); escapeStrings.put("Ξ", new Character('\u039E')); escapeStrings.put("Ο", new Character('\u039F')); escapeStrings.put("Π", new Character('\u03A0')); escapeStrings.put("Ρ", new Character('\u03A1')); escapeStrings.put("Σ", new Character('\u03A3')); escapeStrings.put("Τ", new Character('\u03A4')); escapeStrings.put("Υ", new Character('\u03A5')); escapeStrings.put("Φ", new Character('\u03A6')); escapeStrings.put("Χ", new Character('\u03A7')); escapeStrings.put("Ψ", new Character('\u03A8')); escapeStrings.put("Ω", new Character('\u03A9')); escapeStrings.put("α", new Character('\u03B1')); escapeStrings.put("β", new Character('\u03B2')); escapeStrings.put("γ", new Character('\u03B3')); escapeStrings.put("δ", new Character('\u03B4')); escapeStrings.put("ε", new Character('\u03B5')); escapeStrings.put("ζ", new Character('\u03B6')); escapeStrings.put("η", new Character('\u03B7')); escapeStrings.put("θ", new Character('\u03B8')); escapeStrings.put("ι", new Character('\u03B9')); escapeStrings.put("κ", new Character('\u03BA')); escapeStrings.put("λ", new Character('\u03BB')); escapeStrings.put("μ", new Character('\u03BC')); escapeStrings.put("ν", new Character('\u03BD')); escapeStrings.put("ξ", new Character('\u03BE')); escapeStrings.put("ο", new Character('\u03BF')); escapeStrings.put("π", new Character('\u03C0')); escapeStrings.put("ρ", new Character('\u03C1')); escapeStrings.put("ς", new Character('\u03C2')); escapeStrings.put("σ", new Character('\u03C3')); escapeStrings.put("τ", new Character('\u03C4')); escapeStrings.put("υ", new Character('\u03C5')); escapeStrings.put("φ", new Character('\u03C6')); escapeStrings.put("χ", new Character('\u03C7')); escapeStrings.put("ψ", new Character('\u03C8')); escapeStrings.put("ω", new Character('\u03C9')); escapeStrings.put("ϑ", new Character('\u03D1')); escapeStrings.put("ϒ", new Character('\u03D2')); escapeStrings.put("ϖ", new Character('\u03D6')); escapeStrings.put("•", new Character('\u2022')); escapeStrings.put("…", new Character('\u2026')); escapeStrings.put("′", new Character('\u2032')); escapeStrings.put("″", new Character('\u2033')); escapeStrings.put("‾", new Character('\u203E')); escapeStrings.put("⁄", new Character('\u2044')); escapeStrings.put("℘", new Character('\u2118')); escapeStrings.put("ℑ", new Character('\u2111')); escapeStrings.put("ℜ", new Character('\u211C')); escapeStrings.put("™", new Character('\u2122')); escapeStrings.put("ℵ", new Character('\u2135')); escapeStrings.put("←", new Character('\u2190')); escapeStrings.put("↑", new Character('\u2191')); escapeStrings.put("→", new Character('\u2192')); escapeStrings.put("↓", new Character('\u2193')); escapeStrings.put("↔", new Character('\u2194')); escapeStrings.put("↵", new Character('\u21B5')); escapeStrings.put("⇐", new Character('\u21D0')); escapeStrings.put("⇑", new Character('\u21D1')); escapeStrings.put("⇒", new Character('\u21D2')); escapeStrings.put("⇓", new Character('\u21D3')); escapeStrings.put("⇔", new Character('\u21D4')); escapeStrings.put("∀", new Character('\u2200')); escapeStrings.put("∂", new Character('\u2202')); escapeStrings.put("∃", new Character('\u2203')); escapeStrings.put("∅", new Character('\u2205')); escapeStrings.put("∇", new Character('\u2207')); escapeStrings.put("∈", new Character('\u2208')); escapeStrings.put("∉", new Character('\u2209')); escapeStrings.put("∋", new Character('\u220B')); escapeStrings.put("∏", new Character('\u220F')); escapeStrings.put("∑", new Character('\u2211')); escapeStrings.put("−", new Character('\u2212')); escapeStrings.put("∗", new Character('\u2217')); escapeStrings.put("√", new Character('\u221A')); escapeStrings.put("∝", new Character('\u221D')); escapeStrings.put("∞", new Character('\u221E')); escapeStrings.put("∠", new Character('\u2220')); escapeStrings.put("∧", new Character('\u2227')); escapeStrings.put("∨", new Character('\u2228')); escapeStrings.put("∩", new Character('\u2229')); escapeStrings.put("∪", new Character('\u222A')); escapeStrings.put("∫", new Character('\u222B')); escapeStrings.put("∴", new Character('\u2234')); escapeStrings.put("∼", new Character('\u223C')); escapeStrings.put("≅", new Character('\u2245')); escapeStrings.put("≈", new Character('\u2248')); escapeStrings.put("≠", new Character('\u2260')); escapeStrings.put("≡", new Character('\u2261')); escapeStrings.put("≤", new Character('\u2264')); escapeStrings.put("≥", new Character('\u2265')); escapeStrings.put("⊂", new Character('\u2282')); escapeStrings.put("⊃", new Character('\u2283')); escapeStrings.put("⊄", new Character('\u2284')); escapeStrings.put("⊆", new Character('\u2286')); escapeStrings.put("⊇", new Character('\u2287')); escapeStrings.put("⊕", new Character('\u2295')); escapeStrings.put("⊗", new Character('\u2297')); escapeStrings.put("⊥", new Character('\u22A5')); escapeStrings.put("⋅", new Character('\u22C5')); escapeStrings.put("⌈", new Character('\u2308')); escapeStrings.put("⌉", new Character('\u2309')); escapeStrings.put("⌊", new Character('\u230A')); escapeStrings.put("⌋", new Character('\u230B')); escapeStrings.put("⟨", new Character('\u2329')); escapeStrings.put("⟩", new Character('\u232A')); escapeStrings.put("◊", new Character('\u25CA')); escapeStrings.put("♠", new Character('\u2660')); escapeStrings.put("♣", new Character('\u2663')); escapeStrings.put("♥", new Character('\u2665')); escapeStrings.put("♦", new Character('\u2666')); escapeStrings.put(""", new Character('\u0022')); escapeStrings.put("&", new Character('\u0026')); escapeStrings.put("<", new Character('\u003C')); escapeStrings.put(">", new Character('\u003E')); escapeStrings.put("Œ", new Character('\u0152')); escapeStrings.put("œ", new Character('\u0153')); escapeStrings.put("Š", new Character('\u0160')); escapeStrings.put("š", new Character('\u0161')); escapeStrings.put("Ÿ", new Character('\u0178')); escapeStrings.put("ˆ", new Character('\u02C6')); escapeStrings.put("˜", new Character('\u02DC')); escapeStrings.put(" ", new Character('\u2002')); escapeStrings.put(" ", new Character('\u2003')); escapeStrings.put(" ", new Character('\u2009')); escapeStrings.put("‌", new Character('\u200C')); escapeStrings.put("‍", new Character('\u200D')); escapeStrings.put("‎", new Character('\u200E')); escapeStrings.put("‏", new Character('\u200F')); escapeStrings.put("–", new Character('\u2013')); escapeStrings.put("—", new Character('\u2014')); escapeStrings.put("‘", new Character('\u2018')); escapeStrings.put("’", new Character('\u2019')); escapeStrings.put("‚", new Character('\u201A')); escapeStrings.put("“", new Character('\u201C')); escapeStrings.put("”", new Character('\u201D')); escapeStrings.put("„", new Character('\u201E')); escapeStrings.put("†", new Character('\u2020')); escapeStrings.put("‡", new Character('\u2021')); escapeStrings.put("‰", new Character('\u2030')); escapeStrings.put("‹", new Character('\u2039')); escapeStrings.put("›", new Character('\u203A')); escapeStrings.put("€", new Character('\u20AC')); } /** * Replace all the occurences of HTML escape strings with the * respective characters. * * @param s a <code>String</code> value * @return a <code>String</code> value */ public static final String unescapeHTML(String s) { char[] chars = s.toCharArray(); char[] escaped = new char[chars.length]; // Note: escaped[pos] = end of the escaped char array. int pos = 0; for (int i = 0; i < chars.length;) { if (chars[i] != '&') { escaped[pos++] = chars[i++]; continue; } // Allow e.g. { int j = i + 1; if (j < chars.length && chars[j] == '#') j++; // Scan until we find a char that is not letter or digit. for (; j < chars.length; j++) { if (!Character.isLetterOrDigit(chars[j])) break; } boolean replaced = false; if (j < chars.length && chars[j] == ';') { if (s.charAt(i + 1) == '#') { // Check for &#D; and pattern try { long charcode = 0; char ch = s.charAt(i + 2); if (ch == 'x' || ch == 'X') { charcode = Long.parseLong(new String(chars, i + 3, j - i - 3), 16); } else if (Character.isDigit(ch)) { charcode = Long.parseLong(new String(chars, i + 2, j - i - 2)); } if (charcode > 0 && charcode < 65536) { escaped[pos++] = (char) charcode; replaced = true; } } catch (NumberFormatException ex) { // Failed, not replaced. } } else { String key = new String(chars, i, j - i + 1); Character repl = escapeStrings.get(key); if (repl != null) { escaped[pos++] = repl.charValue(); replaced = true; } } j++; // Skip over ';' } if (!replaced) { // Not a recognized escape sequence, leave as-is System.arraycopy(chars, i, escaped, pos, j - i); pos += j - i; } i = j; } return new String(escaped, 0, pos); } /** * Given a <code>String</code>, returns an equivalent <code>String</code> with * all HTML tags stripped. Note that HTML entities, such as "&amp;" will * still be preserved. */ public static String stripHtmlTags(String string) { if ((string == null) || "".equals(string)) { return string; } return htmlTagPattern.matcher(string).replaceAll(""); } /** * We escape some characters in s to be able to make the string executable * from a python string */ public static String pythonEscape(String s) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); switch (c) { case '\n': sb.append("\\n"); break; case '\r': sb.append("\\r"); break; case '\t': sb.append("\\t"); break; case '\\': sb.append("\\\\"); break; case '\"': sb.append("\\\""); break; case '\'': sb.append("\\\'"); break; default: sb.append(c); } } return sb.toString(); } /** * We escape some characters in s to be able to insert strings into JavaScript * code. Also, make sure that we don't write out --> or </scrip, which may * close a script tag. */ public static String javaScriptEscape(String s) { return javaScriptEscapeHelper(s, false); } /** * We escape some characters in s to be able to insert strings into JavaScript * code. Also, make sure that we don't write out --> or </scrip, which may * close a script tag. Turns all non-ascii characters into ASCII javascript * escape sequences (eg \udddd) */ public static String javaScriptEscapeToAscii(String s) { return javaScriptEscapeHelper(s, true); } private static final String[] UNSAFE_TAGS = { "script", "style", "object", "applet", "!--" }; /** * Helper for javaScriptEscape and javaScriptEscapeToAscii */ private static String javaScriptEscapeHelper(String s, boolean escapeToAscii) { /* * IMPORTANT: If you change the semantics of this method (by escaping * extra characters, for example), please make similar changes to * com.google.javascript.util.Escape.toJsString */ StringBuilder sb = new StringBuilder(s.length() * 9 / 8); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); switch (c) { case '\n': sb.append("\\n"); break; case '\r': sb.append("\\r"); break; case '\t': sb.append("\\t"); break; case '\\': sb.append("\\\\"); break; case '\"': sb.append("\\\""); break; case '\'': sb.append("\\\'"); break; // escape '=' so that javascript won't be executed within tags case '=': appendHexJavaScriptRepresentation(sb, c); break; case '<': // for text that could potentially be interpreted as an case '/': // unsafe opening or closing tag, escape the char to hex boolean isUnsafe = false; for (String tag : UNSAFE_TAGS) { if (s.regionMatches(true, i + 1, tag, 0, tag.length())) { isUnsafe = true; break; } } if (isUnsafe) { appendHexJavaScriptRepresentation(sb, c); } else { sb.append(c); } break; case '>' : if (sb.length() > 0 && sb.charAt(sb.length() - 1) == '-') { sb.append('\\'); } sb.append(c); break; // Note: Mozilla browser treats the line/paragraph separator // as string terminators, so we need to escape them. case '\u2028': sb.append("\\u2028"); break; case '\u2029': sb.append("\\u2029"); break; default: if (c >= 128 && escapeToAscii) { appendHexJavaScriptRepresentation(sb, c); } else { sb.append(c); } } } return sb.toString(); } /** * Returns a javascript representation of the character in a hex escaped * format. Although this is a rather specific method, it is made public * because it is also used by the JSCompiler. * * * @param sb The buffer to which the hex representation should be appended. * @param c The character to be appended. */ public static void appendHexJavaScriptRepresentation(StringBuilder sb, char c) { sb.append("\\u"); String val = Integer.toHexString(c); for (int j = val.length(); j < 4; j++) { sb.append('0'); } sb.append(val); } /** * Undo escaping as performed in javaScriptEscape(.) * Throws an IllegalArgumentException if the string contains * bad escaping. */ public static String javaScriptUnescape(String s) { StringBuilder sb = new StringBuilder(s.length()); for (int i = 0; i < s.length(); ) { char c = s.charAt(i); if (c == '\\') { i = javaScriptUnescapeHelper(s, i + 1, sb); } else { sb.append(c); i++; } } return sb.toString(); } /** * Looks for an escape code starting at index i of s, * and appends it to sb. * @return the index of the first character in s * after the escape code. * @throws IllegalArgumentException if the escape code * is invalid */ private static int javaScriptUnescapeHelper(String s, int i, StringBuilder sb) { if (i >= s.length()) { throw new IllegalArgumentException( "End-of-string after escape character in [" + s + "]"); } char c = s.charAt(i++); switch (c) { case 'n': sb.append('\n'); break; case 'r': sb.append('\r'); break; case 't': sb.append('\t'); break; case '\\': case '\"': case '\'': case '>': sb.append(c); break; case 'u': String hexCode; try { hexCode = s.substring(i, i + 4); } catch (IndexOutOfBoundsException ioobe) { throw new IllegalArgumentException( "Invalid unicode sequence [" + s.substring(i) + "] at index " + i + " in [" + s + "]"); } int unicodeValue; try { unicodeValue = Integer.parseInt(hexCode, 16); } catch (NumberFormatException nfe) { throw new IllegalArgumentException( "Invalid unicode sequence [" + hexCode + "] at index " + i + " in [" + s + "]"); } sb.append((char)unicodeValue); i += 4; break; default: throw new IllegalArgumentException( "Unknown escape code [" + c + "] at index " + i + " in [" + s + "]"); } return i; } /** * Escape a string for use inside as XML element content. This escapes * less-than and ampersand, only. */ public static String xmlContentEscape(String s) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); switch (c) { case '&': sb.append("&"); break; case '<': sb.append("<"); break; case '\000': case '\001': case '\002': case '\003': case '\004': case '\005': case '\006': case '\007': case '\010': case '\013': case '\014': case '\016': case '\017': case '\020': case '\021': case '\022': case '\023': case '\024': case '\025': case '\026': case '\027': case '\030': case '\031': case '\032': case '\033': case '\034': case '\035': case '\036': case '\037': // do nothing, these are disallowed characters break; default: sb.append(c); } } return sb.toString(); } /** * Escape a string for use inside as XML single-quoted attributes. This * escapes less-than, single-quote, ampersand, and (not strictly necessary) * newlines. */ public static String xmlSingleQuotedEscape(String s) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); switch (c) { case '\'': sb.append("""); break; case '&': sb.append("&"); break; case '<': sb.append("<"); break; case '\n': sb.append(" "); break; case '\000': case '\001': case '\002': case '\003': case '\004': case '\005': case '\006': case '\007': case '\010': case '\013': case '\014': case '\016': case '\017': case '\020': case '\021': case '\022': case '\023': case '\024': case '\025': case '\026': case '\027': case '\030': case '\031': case '\032': case '\033': case '\034': case '\035': case '\036': case '\037': // do nothing, these are disallowed characters break; default: sb.append(c); } } return sb.toString(); } // C0 control characters except \t, \n, and \r and 0xFFFE and 0xFFFF private static final CharMatcher CONTROL_MATCHER = CharMatcher.anyOf( "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007" + "\u0008\u000B\u000C\u000E\u000F" + "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017" + "\u0018\u0019\u001A\u001B\u001C\u001D\u001E\u001F" + "\uFFFE\uFFFF"); /** * Escape a string that is meant to be embedded in a CDATA section. * The returned string is guaranteed to be valid CDATA content. * The syntax of CDATA sections is the following: * <blockquote> * <code><[!CDATA[...]]></code> * </blockquote> * The only invalid character sequence in a CDATA tag is "]]>". * If this sequence is present in the input string, we replace * it by closing the current CDATA field, then write ']]&gt;', * then reopen a new CDATA section. */ public static String xmlCDataEscape(String s) { // Make sure there are no illegal control characters. s = CONTROL_MATCHER.removeFrom(s); // Return the original reference if the string doesn't have a match. int found = s.indexOf("]]>"); if (found == -1) { return s; } // For each occurrence of "]]>", append a string that adds "]]>" after // the end of the CDATA which has just been closed, then opens a new CDATA. StringBuilder sb = new StringBuilder(); int prev = 0; do { sb.append(s.substring(prev, found + 3)); sb.append("]]><![CDATA["); prev = found + 3; } while ((found = s.indexOf("]]>", prev)) != -1); sb.append(s.substring(prev)); return sb.toString(); } /** * We escape some characters in s to be able to insert strings into Java code */ public static String javaEscape(String s) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); switch (c) { case '\n': sb.append("\\n"); break; case '\r': sb.append("\\r"); break; case '\t': sb.append("\\t"); break; case '\\': sb.append("\\\\"); break; case '\"': sb.append("\\\""); break; case '&': sb.append("&"); break; case '<': sb.append("<"); break; case '>': sb.append(">"); break; case '\'': sb.append("\\\'"); break; default: sb.append(c); } } return sb.toString(); } /** * Escape a string so that it can be safely placed as value of an * attribute. This is essentially similar to the {@link * javaEscape(java.lang.String)} except that it escapes double quote * to the HTML literal &quot;. This is to prevent the double * quote from being interpreted as the character closing the * attribute. */ public static String javaEscapeWithinAttribute(String s) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); switch (c) { case '\n': sb.append("\\n"); break; case '\r': sb.append("\\r"); break; case '\t': sb.append("\\t"); break; case '\\': sb.append("\\\\"); break; case '\"': sb.append("""); break; case '&': sb.append("&"); break; case '<': sb.append("<"); break; case '>': sb.append(">"); break; case '\'': sb.append("\\\'"); break; default: sb.append(c); } } return sb.toString(); } /** * Returns a form of "s" appropriate for including in an XML document, after * escaping certain special characters (e.g. '&' => '&', etc.) */ public static String xmlEscape(String s) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { char ch = s.charAt(i); switch (ch) { case '"': sb.append("""); break; case '&': sb.append("&"); break; case '\'': sb.append("'"); break; case '<': sb.append("<"); break; case '>': sb.append(">"); break; case '\n': sb.append(" "); break; case '\r': sb.append(" "); break; case '\t': sb.append(" "); break; case '\0': // \0 is not a valid XML char - skip it break; default: sb.append(ch); break; } } return sb.toString(); } /** * Escapes special characters (& < > ") from a string so it can safely be * included in an HTML document. (same as <code>xmlEscape</code> except that * <code>htmlEscape</code> does not escape the apostrophe character). */ public static String htmlEscape(String s) { // This method gets called A LOT so it has to be excruciatingly efficient. // Older versions were responsible for several percent of all objects // created on the heap, and 10% of total execution time. // In particular, if the String has no characters that need escaping, this // method should return its argument. StringBuilder sb = null; String replacement; int start = 0; // the earliest input position we haven't copied yet. for (int i = 0; i < s.length(); i++) { switch (s.charAt(i)) { case '"': replacement = """; break; case '&': replacement = "&"; break; case '<': replacement = "<"; break; case '>': replacement = ">"; break; default: replacement = null; } if (replacement != null) { if (sb == null) { // This is the first time we have found a replacement. Allocate the // StringBuilder now. // This initial size for the StringBuilder below will be exact if // this initial replacement is the only one. If not, sb will expand. sb = new StringBuilder(s.length() + replacement.length() - 1); } if (i > start) { // we have to copy some of the earlier string. sb.append(s.substring(start, i)); } sb.append(replacement); start = i+1; } } // now possibly also copy what's leftover in the input string. if (start > 0) { sb.append(s.substring(start)); } if (sb != null) { return sb.toString(); } return s; } /** * Escapes the special characters from a string so it can be used as part of * a regex pattern. This method is for use on gnu.regexp style regular * expressions. */ public static String regexEscape(String s) { StringBuilder sb = new StringBuilder(); for(int i = 0; i < s.length(); i++) { char c = s.charAt(i); // Test if c is an escapable character if ("()|*+?.{}[]$^\\".indexOf(c) != -1) { sb.append('\\'); sb.append(c); } else { sb.append(c); } } return sb.toString(); } /** * Escapes the special characters from a string so it can be used as part of * a regex pattern. This method is for use on regexes in the flavor of the * java.util.regex package. This method should be removed when we move to * the java version 1.5 (Tiger) release, since that release gives us * a literal regex flag as well as a quote method to produce literal regexes. */ public static String javaUtilRegexEscape(String s) { // Use the quoting mechanism in Pattern unless it is interfered with // by the contents of the string. if (s.indexOf("\\E") == -1) { return "\\Q" + s + "\\E"; } // Very rare case: must escape each character. StringBuilder sb = new StringBuilder(); for (int i = 0; i < s.length(); i++) { sb.append('\\'); sb.append(s.charAt(i)); } return sb.toString(); } /** * Escapes the '\' and '$' characters, which comprise the subset of regex * characters that has special meaning in methods such as: * * <pre>java.util.regex.Matcher.appendReplacement(sb, replacement);</pre> * <pre>java.lang.String.replaceAll(str, replacement);</pre> * * Note that this method is offered in java version 1.5 as the method * * <pre>java.util.regex.Matcher.quoteReplacement(String);</pre> */ public static String regexReplacementEscape(String s) { StringBuilder sb = null; for (int i = 0, n = s.length(); i < n; i++) { char c = s.charAt(i); switch (c) { case '\\': case '$': if (sb == null) { // This is the first replacement necessary. Initialize the // string buffer to contain the all of the previously checked // characters in 's' sb = new StringBuilder(s.substring(0, i)); } sb.append('\\'); default: if (sb != null) { sb.append(c); } break; } } return (sb == null) ? s : sb.toString(); } /** * The old interface to cropBetween - using a single char limit */ public static String cropBetween(String in, char limit) { return cropBetween(in, String.valueOf(new char[]{limit})); } /** * This removes characters between maching charLimit chars. For example * cropBetween("ab^cd^ef^gh^hi", '^') will return "abefhi" It will consider * squences of 2 charLimit as one charLimit in the output * * @param in - the string to process * @param limit - the limit of the string(s) to remove * * @return String - the cropped string */ public static String cropBetween(String in, String limit) { StringBuilder out = new StringBuilder(); int lastPos = 0; int lenLimit = limit.length(); boolean modeAdd = true; int pos = -1; while ((pos = in.indexOf(limit, lastPos)) >= 0) { if (modeAdd) { out.append(in.substring(lastPos, pos)); } modeAdd = !modeAdd; lastPos = pos + lenLimit; } // add the remainings if (modeAdd) { out.append(in.substring(lastPos)); } return out.toString(); } /** * This converts a String to a list of strings by extracting the substrings * between delimiter * * @param in - what to process * @param delimiter - the delimiting string * @param doStrip - to strip the substrings before adding to the list * * @return LinkedList */ public static LinkedList<String> string2List(String in, String delimiter, boolean doStrip) { if (in == null) { return null; } LinkedList<String> out = new LinkedList<String>(); string2Collection(in, delimiter, doStrip, out); return out; } /** * This converts a String to a Set of strings by extracting the substrings * between delimiter * * @param in - what to process * @param delimiter - the delimiting string * @param doStrip - to strip the substrings before adding to the list * * @return Set */ public static Set string2Set(String in, String delimiter, boolean doStrip) { if (in == null) { return null; } HashSet<String> out = new HashSet<String>(); string2Collection(in, delimiter, doStrip, out); return out; } /** * Converts a delimited string to a collection of strings. Substrings between * delimiters are extracted from the string and added to a collection that is * provided by the caller. * * @param in The delimited input string to process * @param delimiter The string delimiting entries in the input string. * @param doString Whether to strip the substrings before adding to the * collection * @param collection The collection to which the strings will be added. If * <code>null</code>, a new <code>List</code> will be created. * * @return The collection to which the substrings were added. This is * syntactic sugar to allow call chaining. */ public static Collection<String> string2Collection(String in, String delimiter, boolean doStrip, Collection<String> collection) { if (in == null) { return null; } if (collection == null) { collection = new ArrayList<String>(); } if (delimiter == null || delimiter.length() == 0) { collection.add(in); return collection; } int fromIndex = 0; int pos; while ((pos = in.indexOf(delimiter, fromIndex)) >= 0) { String interim = in.substring(fromIndex, pos); if (doStrip) { interim = strip(interim); } if (!doStrip || interim.length() > 0) { collection.add(interim); } fromIndex = pos + delimiter.length(); } String interim = in.substring(fromIndex); if (doStrip) { interim = strip(interim); } if (!doStrip || interim.length() > 0) { collection.add(interim); } return collection; } /** * Lots of people called list2String when in fact it was implemented as * Collection2String. I added Collection2String as a new function and am * leaving the list2String function signature here so it can continue to be * * @deprecated Please use * But note that {@code Join} does not consider null elements to be * equivalent to the empty string, as this method does. */ @Deprecated public static String list2String( Collection<?> in, String separator) { return Collection2String(in, separator); } /** * This concatenates the elements of a collection in a string * * @param in - the collection that has to be conatenated * @param separator - a string to sepparate the elements from the list * * @return String * * @deprecated Please use * But note that {@code Join} does not consider null elements to be * equivalent to the empty string, as this method does. */ @Deprecated public static String Collection2String( Collection<?> in, String separator) { if (in == null) { return null; } return Iterator2String(in.iterator(), separator); } /** * @deprecated Please use * But note that {@code Join} does not consider null elements to be * equivalent to the empty string, as this method does. */ @Deprecated public static String Iterator2String( Iterator<?> it, String separator) { if (it == null) { return null; } StringBuilder out = new StringBuilder(); while (it.hasNext()) { if (out.length() > 0) { out.append(separator); } out.append(it.next().toString()); } return out.toString(); } /** * This converts a string to a Map. It will first split the string into * entries using delimEntry. Then each entry is split into a key and a value * using delimKey. By default we strip the keys. Use doStripEntry to strip * also the entries * * @param in - the string to be processed * @param delimEntry - delimiter for the entries * @param delimKey - delimiter between keys and values * @param doStripEntry - strip entries before inserting in the map * * @return HashMap */ public static HashMap<String, String> string2Map(String in, String delimEntry, String delimKey, boolean doStripEntry) { if (in == null) { return null; } HashMap<String, String> out = new HashMap<String, String>(); if (isEmpty(delimEntry) || isEmpty(delimKey)) { out.put(strip(in), ""); return out; } Iterator<String> it = string2List(in, delimEntry, false).iterator(); int len = delimKey.length(); while (it.hasNext()) { String entry = it.next(); int pos = entry.indexOf(delimKey); if (pos > 0) { String value = entry.substring(pos + len); if (doStripEntry) { value = strip(value); } out.put(strip(entry.substring(0, pos)), value); } else { out.put(strip(entry), ""); } } return out; } /** * This function concatenates the elements of a Map in a string with form * "<key1><sepKey><value1><sepEntry>...<keyN><sepKey><valueN>" * * @param in - the map to be converted * @param sepKey - the separator to put between key and value * @param sepEntry - the separator to put between map entries * * @return String */ public static <K, V> String map2String(Map<K,V> in, String sepKey, String sepEntry) { if (in == null) { return null; } StringBuilder out = new StringBuilder(); Iterator<Entry<K, V>> it = in.entrySet().iterator(); while (it.hasNext()) { if (out.length() > 0) { out.append(sepEntry); } Entry<K, V> entry = it.next(); out.append(entry.getKey() + sepKey + entry.getValue()); } return out.toString(); } /** * Given a map, creates and returns a new map in which all keys are the * lower-cased version of each key. * * @param map A map containing String keys to be lowercased * @throws IllegalArgumentException if the map contains duplicate string keys * after lower casing */ public static <V> Map lowercaseKeys(Map<String, V> map) { Map<String, V> result = new HashMap<String, V>(map.size()); for (Iterator<String> it = map.keySet().iterator(); it.hasNext(); ) { String key = it.next(); if (result.containsKey(key.toLowerCase())) { throw new IllegalArgumentException( "Duplicate string key in map when lower casing"); } result.put(key.toLowerCase(), map.get(key)); } return result; } /** * Replaces any string of adjacent whitespace characters with the whitespace * character " ". * * @param str the string you want to munge * @return String with no more excessive whitespace! * * @see collapse */ public static String collapseWhitespace(String str) { return collapse(str, WHITE_SPACES, " "); } /** * Replaces any string of matched characters with the supplied string.<p> * * This is a more general version of collapseWhitespace. * * <pre> * E.g. collapse("hello world", " ", "::") * will return the following string: "hello::world" * </pre> * * @param str the string you want to munge * @param chars all of the characters to be considered for munge * @param replacement the replacement string * @return String munged and replaced string. */ public static String collapse(String str, String chars, String replacement) { if (str == null) { return null; } StringBuilder newStr = new StringBuilder(); boolean prevCharMatched = false; char c; for (int i = 0; i < str.length(); i++) { c = str.charAt(i); if (chars.indexOf(c) != -1) { // this character is matched if (prevCharMatched) { // apparently a string of matched chars, so don't append anything // to the string continue; } prevCharMatched = true; newStr.append(replacement); } else { prevCharMatched = false; newStr.append(c); } } return newStr.toString(); } /** * Read a String of up to maxLength bytes from an InputStream * * @param is input stream * @param maxLength max number of bytes to read from "is". If this is -1, we * read everything. * * @return String up to maxLength bytes, read from "is" */ public static String stream2String(InputStream is, int maxLength) throws IOException { byte[] buffer = new byte[4096]; StringWriter sw = new StringWriter(); int totalRead = 0; int read = 0; do { sw.write(new String(buffer, 0, read)); totalRead += read; read = is.read(buffer, 0, buffer.length); } while (((-1 == maxLength) || (totalRead < maxLength)) && (read != -1)); return sw.toString(); } /** * Parse a list of substrings separated by a given delimiter. The delimiter * can also appear in substrings (just double them): * * parseDelimitedString("this|is", '|') returns ["this","is"] * parseDelimitedString("this||is", '|') returns ["this|is"] * * @param list String containing delimited substrings * @param delimiter Delimiter (anything except ' ' is allowed) * * @return String[] A String array of parsed substrings */ public static String[] parseDelimitedList(String list, char delimiter) { String delim = "" + delimiter; // Append a sentinel of delimiter + space // (see comments below for more info) StringTokenizer st = new StringTokenizer(list + delim + " ", delim, true); ArrayList<String> v = new ArrayList<String>(); String lastToken = ""; String word = ""; // We keep a sliding window of 2 tokens // // delimiter : delimiter -> append delimiter to current word // and clear most recent token // (so delim : delim : delim will not // be treated as two escaped delims.) // // tok : delimiter -> append tok to current word // // delimiter : tok -> add current word to list, and clear it. // (We append a sentinel that conforms to this // pattern to make sure we've pushed every parsed token) while (st.hasMoreTokens()) { String tok = st.nextToken(); if (lastToken != null) { if (tok.equals(delim)) { word = word + lastToken; if (lastToken.equals(delim)) tok = null; } else { if (!word.equals("")) v.add(word); word = ""; } } lastToken = tok; } return v.toArray(new String[0]); } /** * Helper function for null and empty string testing. * * @return true iff s == null or s.equals(""); */ public static boolean isEmpty(String s) { return makeSafe(s).length() == 0; } /** * Helper function for null, empty, and whitespace string testing. * * @return true if s == null or s.equals("") or s contains only whitespace * characters. */ public static boolean isEmptyOrWhitespace(String s) { s = makeSafe(s); for (int i = 0, n = s.length(); i < n; i++) { if (!Character.isWhitespace(s.charAt(i))) { return false; } } return true; } /** * Helper function for making null strings safe for comparisons, etc. * * @return (s == null) ? "" : s; */ public static String makeSafe(String s) { return (s == null) ? "" : s; } /** * Helper function for making empty strings into a null. * * @return null if s is zero length. otherwise, returns s. */ public static String toNullIfEmpty(String s) { return (StringUtil.isEmpty(s)) ? null : s; } /** * Helper function for turning empty or whitespace strings into a null. * * @return null if s is zero length or if s contains only whitespace * characters. otherwise, returns s. */ public static String toNullIfEmptyOrWhitespace(String s) { return (StringUtil.isEmptyOrWhitespace(s)) ? null : s; } /** * Serializes a map * * @param map A map of String keys to arrays of String values * @param keyValueDelim Delimiter between keys and values * @param entryDelim Delimiter between entries * * @return String A string containing a serialized representation of the * contents of the map. * * e.g. arrayMap2String({"foo":["bar","bar2"],"foo1":["bar1"]}, "=", "&") * returns "foo=bar&foo=bar2&foo1=bar1" */ public static String arrayMap2String(Map<String, String[]> map, String keyValueDelim, String entryDelim) { Set<Entry<String, String[]>> entrySet = map.entrySet(); Iterator<Entry<String, String[]>> itor = entrySet.iterator(); StringWriter sw = new StringWriter(); while (itor.hasNext()) { Entry<String, String[]> entry = itor.next(); String key = entry.getKey(); String[] values = entry.getValue(); for (int i = 0; i < values.length; i++) { sw.write(entry.getKey() + keyValueDelim + values[i]); if (i < values.length - 1) { sw.write(entryDelim); } } if (itor.hasNext()) { sw.write(entryDelim); } } return sw.toString(); } /** * Compares two strings, guarding against nulls If both Strings are null we * return true */ public static boolean equals(String s1, String s2) { if (s1 == s2) { return true; // Either both the same String, or both null } if (s1 != null) { if (s2 != null) { return s1.equals(s2); } } return false; } /** * Splits s with delimiters in delimiter and returns the last token */ public static String lastToken(String s, String delimiter) { String[] parts = split(s, delimiter); return (parts.length == 0) ? "" : parts[parts.length -1]; } /** * Determines if a string contains only ascii characters */ public static boolean allAscii(String s) { int len = s.length(); for (int i = 0; i < len; ++i) { if ((s.charAt(i) & 0xff80) != 0) { return false; } } return true; } /** * Determines if a string contains what looks like an html character * reference. Useful for deciding whether unescaping is necessary. */ public static boolean containsCharRef(String s) { return characterReferencePattern.matcher(s).find(); } /** * Determines if a string is a Hebrew word. A string is considered to be * a Hebrew word if {@link #isHebrew(int)} is true for any of its characters. */ public static boolean isHebrew(String s) { int len = s.length(); for (int i = 0; i < len; ++i) { if (isHebrew(s.codePointAt(i))) { return true; } } return false; } /** * Determines if a character is a Hebrew character. */ public static boolean isHebrew(int codePoint) { return Character.UnicodeBlock.HEBREW.equals( Character.UnicodeBlock.of(codePoint)); } /** * Determines if a string is a CJK word. A string is considered to be CJK * if {@link #isCjk(char)} is true for any of its characters. */ public static boolean isCjk(String s) { int len = s.length(); for (int i = 0; i < len; ++i) { if (isCjk(s.codePointAt(i))) { return true; } } return false; } /** * Unicode code blocks containing CJK characters. */ private static final Set<Character.UnicodeBlock> CJK_BLOCKS; static { Set<Character.UnicodeBlock> set = new HashSet<Character.UnicodeBlock>(); set.add(Character.UnicodeBlock.HANGUL_JAMO); set.add(Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT); set.add(Character.UnicodeBlock.KANGXI_RADICALS); set.add(Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION); set.add(Character.UnicodeBlock.HIRAGANA); set.add(Character.UnicodeBlock.KATAKANA); set.add(Character.UnicodeBlock.BOPOMOFO); set.add(Character.UnicodeBlock.HANGUL_COMPATIBILITY_JAMO); set.add(Character.UnicodeBlock.KANBUN); set.add(Character.UnicodeBlock.BOPOMOFO_EXTENDED); set.add(Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS); set.add(Character.UnicodeBlock.ENCLOSED_CJK_LETTERS_AND_MONTHS); set.add(Character.UnicodeBlock.CJK_COMPATIBILITY); set.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A); set.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS); set.add(Character.UnicodeBlock.HANGUL_SYLLABLES); set.add(Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS); set.add(Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS); set.add(Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS); set.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B); set.add(Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT); CJK_BLOCKS = Collections.unmodifiableSet(set); } /** * Determines if a character is a CJK ideograph or a character typically * used only in CJK text. * * Note: This function cannot handle supplementary characters. To handle all * Unicode characters, including supplementary characters, use the function * {@link #isCjk(int)}. */ public static boolean isCjk(char ch) { return isCjk((int) ch); } /** * Determines if a character is a CJK ideograph or a character typically * used only in CJK text. */ public static boolean isCjk(int codePoint) { // Time-saving early exit for all Latin-1 characters. if ((codePoint & 0xFFFFFF00) == 0) { return false; } return CJK_BLOCKS.contains(Character.UnicodeBlock.of(codePoint)); } /** * Replaces each non-ascii character in s with its Unicode escape sequence * \\uxxxx where xxxx is a hex number. Existing escape sequences won't be * affected. */ public static String unicodeEscape(String s) { if (allAscii(s)) { return s; } StringBuilder sb = new StringBuilder(s.length()); int len = s.length(); for (int i = 0; i < len; ++i) { char ch = s.charAt(i); if (ch <= 127) { sb.append(ch); } else { sb.append("\\u"); String hexString = Integer.toHexString(ch); // Pad with zeros if necessary int numZerosToPad = 4- hexString.length(); for (int j = 0; j < numZerosToPad; ++j) { sb.append('0'); } sb.append(hexString); } } return sb.toString(); } /** * Returns the approximate display width of the string, measured in units of * ascii characters. * * @see displayWidth(char) */ public static int displayWidth(String s) { int width = 0; int len = s.length(); for (int i = 0; i < len; ++i) { width += displayWidth(s.charAt(i)); } return width; } /** * Returns the approximate display width of the character, measured * in units of ascii characters. * * This method should err on the side of caution. By default, characters * are assumed to have width 2; this covers CJK ideographs, various * symbols and miscellaneous weird scripts. Given below are some Unicode * ranges for which it seems safe to assume that no character is * substantially wider than an ascii character: * - Latin, extended Latin, even more extended Latin. * - Greek, extended Greek, Cyrillic. * - Some symbols (including currency symbols) and punctuation. * - Half-width Katakana and Hangul. * - Hebrew * - Thai * Characters in these ranges are given a width of 1. * * IMPORTANT: this function has an analog in strutil.cc named * UnicodeCharWidth, which needs to be updated if you change the * implementation here. */ public static int displayWidth(char ch) { if (ch <= '\u04f9' || ch == '\u05be' || (ch >= '\u05d0' && ch <= '\u05ea') || ch == '\u05F3' || ch == '\u05f4' || (ch >= '\u0e00' && ch <= '\u0e7f') || (ch >= '\u1e00' && ch <= '\u20af') || (ch >= '\u2100' && ch <= '\u213a') || (ch >= '\uff61' && ch <= '\uffdc')) { return 1; } return 2; } /** * @return a string representation of the given native array. */ public static String toString(float[] iArray) { if (iArray == null) { return "NULL"; } StringBuilder buffer = new StringBuilder(); buffer.append("["); for (int i = 0; i < iArray.length; i ++) { buffer.append(iArray[i]); if (i != (iArray.length - 1)) { buffer.append(", "); } } buffer.append("]"); return buffer.toString(); } /** * @return a string representation of the given native array. */ public static String toString(long[] iArray) { if (iArray == null) { return "NULL"; } StringBuilder buffer = new StringBuilder(); buffer.append("["); for (int i = 0; i < iArray.length; i ++) { buffer.append(iArray[i]); if (i != (iArray.length - 1)) { buffer.append(", "); } } buffer.append("]"); return buffer.toString(); } /** * @return a string representation of the given native array */ public static String toString(int[] iArray) { if (iArray == null) { return "NULL"; } StringBuilder buffer = new StringBuilder(); buffer.append("["); for (int i = 0; i < iArray.length; i ++) { buffer.append(iArray[i]); if (i != (iArray.length - 1)) { buffer.append(", "); } } buffer.append("]"); return buffer.toString(); } /** * @return a string representation of the given array. */ public static String toString(String[] iArray) { if (iArray == null) return "NULL"; StringBuilder buffer = new StringBuilder(); buffer.append("["); for (int i = 0; i < iArray.length; i ++) { buffer.append("'").append(iArray[i]).append("'"); if (i != iArray.length-1) { buffer.append(", "); } } buffer.append("]"); return buffer.toString(); } /** * Returns the string, in single quotes, or "NULL". Intended only for * logging. * * @param s - the string * @return the string, in single quotes, or the string "null" if it's null. */ public static String toString(String s) { if (s == null) { return "NULL"; } else { return new StringBuilder(s.length() + 2).append("'").append(s) .append("'").toString(); } } /** * @return a string representation of the given native array */ public static String toString(int[][] iArray) { if (iArray == null) { return "NULL"; } StringBuilder buffer = new StringBuilder(); buffer.append("["); for (int i = 0; i < iArray.length; i ++) { buffer.append("["); for (int j = 0; j < iArray[i].length; j ++) { buffer.append(iArray[i][j]); if (j != (iArray[i].length - 1)) { buffer.append(", "); } } buffer.append("]"); if (i != iArray.length-1) { buffer.append(" "); } } buffer.append("]"); return buffer.toString(); } /** * @return a string representation of the given native array. */ public static String toString(long[][] iArray) { if (iArray == null) return "NULL"; StringBuilder buffer = new StringBuilder(); buffer.append("["); for (int i = 0; i < iArray.length; i ++) { buffer.append("["); for (int j = 0; j < iArray[i].length; j ++) { buffer.append(iArray[i][j]); if (j != (iArray[i].length - 1)) { buffer.append(", "); } } buffer.append("]"); if (i != iArray.length-1) { buffer.append(" "); } } buffer.append("]"); return buffer.toString(); } /** * @return a String representation of the given object array. * The strings are obtained by calling toString() on the * underlying objects. */ public static String toString(Object[] obj) { if (obj == null) return "NULL"; StringBuilder tmp = new StringBuilder(); tmp.append("["); for(int i = 0; i < obj.length; i ++) { tmp.append(obj[i].toString()); if (i != obj.length-1) { tmp.append(","); } } tmp.append("]"); return tmp.toString(); } /** * Replacement for deprecated StringBufferInputStream(). Instead of: * InputStream is = new StringBuilderInputStream(str); * do: * InputStream is = StringUtil.toUTF8InputStream(str); */ public static InputStream toUTF8InputStream(String str) { InputStream is = null; try { is = new ByteArrayInputStream(str.getBytes("UTF-8")); } catch (UnsupportedEncodingException e) { // UTF-8 should always be supported throw new AssertionError(); } return is; } /** * Copy all data from in to out in 4096 byte chunks. */ public static void copyStreams (InputStream in, OutputStream out) throws IOException { if (in == null || out == null) { throw new IllegalArgumentException(); } final byte[] buffer = new byte[4096]; int len; while (-1 != (len = in.read (buffer, 0, buffer.length))) { out.write (buffer, 0, len); } } /** * Convert a byte array to a String using Latin-1 (aka ISO-8859-1) encoding. * * Note: something is probably wrong if you're using this method. Either * you're dealing with legacy code that doesn't support i18n or you're * using a third-party library that only deals with Latin-1. New code * should (almost) always uses UTF-8 encoding. * * @return the decoded String or null if ba is null */ public static String bytesToLatin1(final byte[] ba) { // ISO-8859-1 should always be supported return bytesToEncoding(ba, "ISO-8859-1"); } private static char[] hexChars = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; /** * Convert a byte array to a hex-encoding string: "a33bff00..." */ public static String bytesToHexString(final byte[] bytes) { return bytesToHexString(bytes, null); } /** * Convert a byte array to a hex-encoding string with the specified * delimiter: "a3<delimiter>3b<delimiter>ff..." */ public static String bytesToHexString(final byte[] bytes, Character delimiter) { StringBuffer hex = new StringBuffer(bytes.length * (delimiter == null ? 2 : 3)); int nibble1, nibble2; for (int i = 0; i < bytes.length; i++) { nibble1 = (bytes[i] >>> 4) & 0xf; nibble2 = bytes[i] & 0xf; if (i > 0 && delimiter != null) hex.append(delimiter.charValue()); hex.append(hexChars[nibble1]); hex.append(hexChars[nibble2]); } return hex.toString(); } /** * Convert a String to a byte array using Latin-1 (aka ISO-8859-1) encoding. * If any character in the String is not Latin-1 (meaning it's high 8 bits * are not all zero), then the returned byte array will contain garbage. * Therefore, only use this if you know all your characters are within * Latin-1. * * Note: something is probably wrong if you're using this method. Either * you're dealing with legacy code that doesn't support i18n or you're * using a third-party library that only deals with Latin-1. New code * should (almost) always uses UTF-8 encoding. * * @return the encoded byte array or null if str is null */ public static byte[] latin1ToBytes(final String str) { // ISO-8859-1 should always be supported return encodingToBytes(str, "ISO-8859-1"); } /** * Convert a byte array to a String using UTF-8 encoding. * * @return the decoded String or null if ba is null */ public static String bytesToUtf8(final byte[] ba) { // UTF-8 should always be supported return bytesToEncoding(ba, "UTF8"); } /** * Convert a String to a byte array using UTF-8 encoding. * * @return the encoded byte array or null if str is null */ public static byte[] utf8ToBytes(final String str) { // UTF-8 should always be supported return encodingToBytes(str, "UTF8"); } /** * Convert a byte array to a String using the specified encoding. * @param encoding the encoding to use * @return the decoded String or null if ba is null */ private static String bytesToEncoding(final byte[] ba, final String encoding) { if (ba == null) { return null; } try { return new String(ba, encoding); } catch (final UnsupportedEncodingException e) { throw new Error(encoding + " not supported! Original exception: " + e); } } /** * Convert a String to a byte array using the specified encoding. * @param encoding the encoding to use * @return the encoded byte array or null if str is null */ public static byte[] encodingToBytes( final String str, final String encoding) { if (str == null) { return null; } try { return str.getBytes(encoding); } catch (final UnsupportedEncodingException e) { throw new Error(encoding + " not supported! Original exception: " + e); } } /** * Convert an array of bytes into a List of Strings using UTF-8. A line is * considered to be terminated by any one of a line feed ('\n'), a carriage * return ('\r'), or a carriage return followed immediately by a linefeed.<p/> * * Can be used to parse the output of * * @param bytes the array to convert * @return A new mutable list containing the Strings in the input array. The * list will be empty if bytes is empty or if it is null. */ public static List<String> bytesToStringList(byte[] bytes) { List<String> lines = new ArrayList<String>(); if (bytes == null) { return lines; } BufferedReader r = null; try { r = new BufferedReader( new InputStreamReader( new ByteArrayInputStream(bytes), "UTF-8")); } catch (UnsupportedEncodingException e) { // If UTF-8 is not supported we are in big trouble. throw new RuntimeException(e); } try { try { for (String line = r.readLine(); line != null; line = r.readLine()) { lines.add(line); } } finally { r.close(); } } catch (IOException e) { // I can't think of a reason we'd get here. throw new RuntimeException(e); } return lines; } /** * Safely convert the string to uppercase. * @return upper case representation of the String; or null if * the input string is null. */ public static String toUpperCase(String src) { if (src == null) { return null; } else { return src.toUpperCase(); } } /** * @param dbSpecComponent a single component of a DBDescriptor spec * (e.g. the host or database component). The expected format of the string is: * <br> * <center>(prefix){(digits),(digits)}(suffix) * </br> * @return a shard expansion of the given String. * Note that unless the pattern is matched exactly, no expansion is * performed and the original string is returned unaltered. * For example, 'db{0,1}.adz' is expanded into 'db0.adz, db1.adz'. * Note that this method is added to StringUtil instead of * DBDescriptor to better encapsulate the choice of regexp implementation. * @throws IllegalArgumentException if the string does not parse. */ public static String expandShardNames(String dbSpecComponent) throws IllegalArgumentException, IllegalStateException { Matcher matcher = dbSpecPattern.matcher(dbSpecComponent); if (matcher.find()) { try { String prefix = dbSpecComponent.substring( matcher.start(1), matcher.end(1)); int minShard = Integer.parseInt( dbSpecComponent.substring( matcher.start(2), matcher.end(2))); int maxShard = Integer.parseInt( dbSpecComponent.substring( matcher.start(3), matcher.end(3))); String suffix = dbSpecComponent.substring( matcher.start(4), matcher.end(4)); //Log2.logEvent(prefix + " " + minShard + " " + maxShard + " " + suffix); if (minShard > maxShard) { throw new IllegalArgumentException( "Maximum shard must be greater than or equal to " + "the minimum shard"); } StringBuilder tmp = new StringBuilder(); for(int shard = minShard; shard <= maxShard; shard ++) { tmp.append(prefix).append(shard).append(suffix); if (shard != maxShard) { tmp.append(","); } } return tmp.toString(); } catch (NumberFormatException nfex) { throw new IllegalArgumentException( "Malformed DB specification component: " + dbSpecComponent); } } else { return dbSpecComponent; } } /** * Returns sourceString concatenated together 'factor' times. * * @param sourceString The string to repeat * @param factor The number of times to repeat it. */ public static String repeat(String sourceString, int factor) { if (factor < 1) { return ""; } if (factor == 1) { return sourceString; } StringBuilder sb = new StringBuilder(factor * sourceString.length()); while (factor > 0) { sb.append(sourceString); factor--; } return sb.toString(); } /** * Returns a string that is equivalent to the specified string with its * first character converted to uppercase as by {@link String#toUpperCase}. * The returned string will have the same value as the specified string if * its first character is non-alphabetic, if its first character is already * uppercase, or if the specified string is of length 0. * * <p>For example: * <pre> * capitalize("foo bar").equals("Foo bar"); * capitalize("2b or not 2b").equals("2b or not 2b") * capitalize("Foo bar").equals("Foo bar"); * capitalize("").equals(""); * </pre> * * @param s the string whose first character is to be uppercased * @return a string equivalent to <tt>s</tt> with its first character * converted to uppercase * @throws NullPointerException if <tt>s</tt> is null */ public static String capitalize(String s) { if (s.length() == 0) return s; char first = s.charAt(0); char capitalized = Character.toUpperCase(first); return (first == capitalized) ? s : capitalized + s.substring(1); } }