/* * Copyright 2001-2006 Geert Bevin <gbevin[remove] at uwyn dot com> * Distributed under the terms of either: * - the common development and distribution license (CDDL), v1.0; or * - the GNU Lesser General Public License, v2.1 or later * $Id: StringUtils.java 3108 2006-03-13 18:03:00Z gbevin $ */ package com.uwyn.jhighlight.tools; import com.uwyn.jhighlight.pcj.map.CharKeyOpenHashMap; import java.util.ArrayList; import java.util.Iterator; import java.util.regex.Pattern; /** * General purpose class containing common <code>String</code> manipulation * methods. * * @author Geert Bevin (gbevin[remove] at uwyn dot com) * @version $Revision: 3108 $ * @since 1.0 */ public abstract class StringUtils { private static final CharKeyOpenHashMap mHtmlEncodeMap = new CharKeyOpenHashMap(); static { // Html encoding mapping according to the HTML 4.0 spec // http://www.w3.org/TR/REC-html40/sgml/entities.html // Special characters for HTML mHtmlEncodeMap.put('\u0026', "&"); mHtmlEncodeMap.put('\u003C', "<"); mHtmlEncodeMap.put('\u003E', ">"); mHtmlEncodeMap.put('\u0022', """); mHtmlEncodeMap.put('\u0152', "Œ"); mHtmlEncodeMap.put('\u0153', "œ"); mHtmlEncodeMap.put('\u0160', "Š"); mHtmlEncodeMap.put('\u0161', "š"); mHtmlEncodeMap.put('\u0178', "Ÿ"); mHtmlEncodeMap.put('\u02C6', "ˆ"); mHtmlEncodeMap.put('\u02DC', "˜"); mHtmlEncodeMap.put('\u2002', " "); mHtmlEncodeMap.put('\u2003', " "); mHtmlEncodeMap.put('\u2009', " "); mHtmlEncodeMap.put('\u200C', "‌"); mHtmlEncodeMap.put('\u200D', "‍"); mHtmlEncodeMap.put('\u200E', "‎"); mHtmlEncodeMap.put('\u200F', "‏"); mHtmlEncodeMap.put('\u2013', "–"); mHtmlEncodeMap.put('\u2014', "—"); mHtmlEncodeMap.put('\u2018', "‘"); mHtmlEncodeMap.put('\u2019', "’"); mHtmlEncodeMap.put('\u201A', "‚"); mHtmlEncodeMap.put('\u201C', "“"); mHtmlEncodeMap.put('\u201D', "”"); mHtmlEncodeMap.put('\u201E', "„"); mHtmlEncodeMap.put('\u2020', "†"); mHtmlEncodeMap.put('\u2021', "‡"); mHtmlEncodeMap.put('\u2030', "‰"); mHtmlEncodeMap.put('\u2039', "‹"); mHtmlEncodeMap.put('\u203A', "›"); mHtmlEncodeMap.put('\u20AC', "€"); // Character entity references for ISO 8859-1 characters mHtmlEncodeMap.put('\u00A0', " "); mHtmlEncodeMap.put('\u00A1', "¡"); mHtmlEncodeMap.put('\u00A2', "¢"); mHtmlEncodeMap.put('\u00A3', "£"); mHtmlEncodeMap.put('\u00A4', "¤"); mHtmlEncodeMap.put('\u00A5', "¥"); mHtmlEncodeMap.put('\u00A6', "¦"); mHtmlEncodeMap.put('\u00A7', "§"); mHtmlEncodeMap.put('\u00A8', "¨"); mHtmlEncodeMap.put('\u00A9', "©"); mHtmlEncodeMap.put('\u00AA', "ª"); mHtmlEncodeMap.put('\u00AB', "«"); mHtmlEncodeMap.put('\u00AC', "¬"); mHtmlEncodeMap.put('\u00AD', "­"); mHtmlEncodeMap.put('\u00AE', "®"); mHtmlEncodeMap.put('\u00AF', "¯"); mHtmlEncodeMap.put('\u00B0', "°"); mHtmlEncodeMap.put('\u00B1', "±"); mHtmlEncodeMap.put('\u00B2', "²"); mHtmlEncodeMap.put('\u00B3', "³"); mHtmlEncodeMap.put('\u00B4', "´"); mHtmlEncodeMap.put('\u00B5', "µ"); mHtmlEncodeMap.put('\u00B6', "¶"); mHtmlEncodeMap.put('\u00B7', "·"); mHtmlEncodeMap.put('\u00B8', "¸"); mHtmlEncodeMap.put('\u00B9', "¹"); mHtmlEncodeMap.put('\u00BA', "º"); mHtmlEncodeMap.put('\u00BB', "»"); mHtmlEncodeMap.put('\u00BC', "¼"); mHtmlEncodeMap.put('\u00BD', "½"); mHtmlEncodeMap.put('\u00BE', "¾"); mHtmlEncodeMap.put('\u00BF', "¿"); mHtmlEncodeMap.put('\u00C0', "À"); mHtmlEncodeMap.put('\u00C1', "Á"); mHtmlEncodeMap.put('\u00C2', "Â"); mHtmlEncodeMap.put('\u00C3', "Ã"); mHtmlEncodeMap.put('\u00C4', "Ä"); mHtmlEncodeMap.put('\u00C5', "Å"); mHtmlEncodeMap.put('\u00C6', "Æ"); mHtmlEncodeMap.put('\u00C7', "Ç"); mHtmlEncodeMap.put('\u00C8', "È"); mHtmlEncodeMap.put('\u00C9', "É"); mHtmlEncodeMap.put('\u00CA', "Ê"); mHtmlEncodeMap.put('\u00CB', "Ë"); mHtmlEncodeMap.put('\u00CC', "Ì"); mHtmlEncodeMap.put('\u00CD', "Í"); mHtmlEncodeMap.put('\u00CE', "Î"); mHtmlEncodeMap.put('\u00CF', "Ï"); mHtmlEncodeMap.put('\u00D0', "Ð"); mHtmlEncodeMap.put('\u00D1', "Ñ"); mHtmlEncodeMap.put('\u00D2', "Ò"); mHtmlEncodeMap.put('\u00D3', "Ó"); mHtmlEncodeMap.put('\u00D4', "Ô"); mHtmlEncodeMap.put('\u00D5', "Õ"); mHtmlEncodeMap.put('\u00D6', "Ö"); mHtmlEncodeMap.put('\u00D7', "×"); mHtmlEncodeMap.put('\u00D8', "Ø"); mHtmlEncodeMap.put('\u00D9', "Ù"); mHtmlEncodeMap.put('\u00DA', "Ú"); mHtmlEncodeMap.put('\u00DB', "Û"); mHtmlEncodeMap.put('\u00DC', "Ü"); mHtmlEncodeMap.put('\u00DD', "Ý"); mHtmlEncodeMap.put('\u00DE', "Þ"); mHtmlEncodeMap.put('\u00DF', "ß"); mHtmlEncodeMap.put('\u00E0', "à"); mHtmlEncodeMap.put('\u00E1', "á"); mHtmlEncodeMap.put('\u00E2', "â"); mHtmlEncodeMap.put('\u00E3', "ã"); mHtmlEncodeMap.put('\u00E4', "ä"); mHtmlEncodeMap.put('\u00E5', "å"); mHtmlEncodeMap.put('\u00E6', "æ"); mHtmlEncodeMap.put('\u00E7', "ç"); mHtmlEncodeMap.put('\u00E8', "è"); mHtmlEncodeMap.put('\u00E9', "é"); mHtmlEncodeMap.put('\u00EA', "ê"); mHtmlEncodeMap.put('\u00EB', "ë"); mHtmlEncodeMap.put('\u00EC', "ì"); mHtmlEncodeMap.put('\u00ED', "í"); mHtmlEncodeMap.put('\u00EE', "î"); mHtmlEncodeMap.put('\u00EF', "ï"); mHtmlEncodeMap.put('\u00F0', "ð"); mHtmlEncodeMap.put('\u00F1', "ñ"); mHtmlEncodeMap.put('\u00F2', "ò"); mHtmlEncodeMap.put('\u00F3', "ó"); mHtmlEncodeMap.put('\u00F4', "ô"); mHtmlEncodeMap.put('\u00F5', "õ"); mHtmlEncodeMap.put('\u00F6', "ö"); mHtmlEncodeMap.put('\u00F7', "÷"); mHtmlEncodeMap.put('\u00F8', "ø"); mHtmlEncodeMap.put('\u00F9', "ù"); mHtmlEncodeMap.put('\u00FA', "ú"); mHtmlEncodeMap.put('\u00FB', "û"); mHtmlEncodeMap.put('\u00FC', "ü"); mHtmlEncodeMap.put('\u00FD', "ý"); mHtmlEncodeMap.put('\u00FE', "þ"); mHtmlEncodeMap.put('\u00FF', "ÿ"); // Mathematical, Greek and Symbolic characters for HTML mHtmlEncodeMap.put('\u0192', "ƒ"); mHtmlEncodeMap.put('\u0391', "Α"); mHtmlEncodeMap.put('\u0392', "Β"); mHtmlEncodeMap.put('\u0393', "Γ"); mHtmlEncodeMap.put('\u0394', "Δ"); mHtmlEncodeMap.put('\u0395', "Ε"); mHtmlEncodeMap.put('\u0396', "Ζ"); mHtmlEncodeMap.put('\u0397', "Η"); mHtmlEncodeMap.put('\u0398', "Θ"); mHtmlEncodeMap.put('\u0399', "Ι"); mHtmlEncodeMap.put('\u039A', "Κ"); mHtmlEncodeMap.put('\u039B', "Λ"); mHtmlEncodeMap.put('\u039C', "Μ"); mHtmlEncodeMap.put('\u039D', "Ν"); mHtmlEncodeMap.put('\u039E', "Ξ"); mHtmlEncodeMap.put('\u039F', "Ο"); mHtmlEncodeMap.put('\u03A0', "Π"); mHtmlEncodeMap.put('\u03A1', "Ρ"); mHtmlEncodeMap.put('\u03A3', "Σ"); mHtmlEncodeMap.put('\u03A4', "Τ"); mHtmlEncodeMap.put('\u03A5', "Υ"); mHtmlEncodeMap.put('\u03A6', "Φ"); mHtmlEncodeMap.put('\u03A7', "Χ"); mHtmlEncodeMap.put('\u03A8', "Ψ"); mHtmlEncodeMap.put('\u03A9', "Ω"); mHtmlEncodeMap.put('\u03B1', "α"); mHtmlEncodeMap.put('\u03B2', "β"); mHtmlEncodeMap.put('\u03B3', "γ"); mHtmlEncodeMap.put('\u03B4', "δ"); mHtmlEncodeMap.put('\u03B5', "ε"); mHtmlEncodeMap.put('\u03B6', "ζ"); mHtmlEncodeMap.put('\u03B7', "η"); mHtmlEncodeMap.put('\u03B8', "θ"); mHtmlEncodeMap.put('\u03B9', "ι"); mHtmlEncodeMap.put('\u03BA', "κ"); mHtmlEncodeMap.put('\u03BB', "λ"); mHtmlEncodeMap.put('\u03BC', "μ"); mHtmlEncodeMap.put('\u03BD', "ν"); mHtmlEncodeMap.put('\u03BE', "ξ"); mHtmlEncodeMap.put('\u03BF', "ο"); mHtmlEncodeMap.put('\u03C0', "π"); mHtmlEncodeMap.put('\u03C1', "ρ"); mHtmlEncodeMap.put('\u03C2', "ς"); mHtmlEncodeMap.put('\u03C3', "σ"); mHtmlEncodeMap.put('\u03C4', "τ"); mHtmlEncodeMap.put('\u03C5', "υ"); mHtmlEncodeMap.put('\u03C6', "φ"); mHtmlEncodeMap.put('\u03C7', "χ"); mHtmlEncodeMap.put('\u03C8', "ψ"); mHtmlEncodeMap.put('\u03C9', "ω"); mHtmlEncodeMap.put('\u03D1', "ϑ"); mHtmlEncodeMap.put('\u03D2', "ϒ"); mHtmlEncodeMap.put('\u03D6', "ϖ"); mHtmlEncodeMap.put('\u2022', "•"); mHtmlEncodeMap.put('\u2026', "…"); mHtmlEncodeMap.put('\u2032', "′"); mHtmlEncodeMap.put('\u2033', "″"); mHtmlEncodeMap.put('\u203E', "‾"); mHtmlEncodeMap.put('\u2044', "⁄"); mHtmlEncodeMap.put('\u2118', "℘"); mHtmlEncodeMap.put('\u2111', "ℑ"); mHtmlEncodeMap.put('\u211C', "ℜ"); mHtmlEncodeMap.put('\u2122', "™"); mHtmlEncodeMap.put('\u2135', "ℵ"); mHtmlEncodeMap.put('\u2190', "←"); mHtmlEncodeMap.put('\u2191', "↑"); mHtmlEncodeMap.put('\u2192', "→"); mHtmlEncodeMap.put('\u2193', "↓"); mHtmlEncodeMap.put('\u2194', "↔"); mHtmlEncodeMap.put('\u21B5', "↵"); mHtmlEncodeMap.put('\u21D0', "⇐"); mHtmlEncodeMap.put('\u21D1', "⇑"); mHtmlEncodeMap.put('\u21D2', "⇒"); mHtmlEncodeMap.put('\u21D3', "⇓"); mHtmlEncodeMap.put('\u21D4', "⇔"); mHtmlEncodeMap.put('\u2200', "∀"); mHtmlEncodeMap.put('\u2202', "∂"); mHtmlEncodeMap.put('\u2203', "∃"); mHtmlEncodeMap.put('\u2205', "∅"); mHtmlEncodeMap.put('\u2207', "∇"); mHtmlEncodeMap.put('\u2208', "∈"); mHtmlEncodeMap.put('\u2209', "∉"); mHtmlEncodeMap.put('\u220B', "∋"); mHtmlEncodeMap.put('\u220F', "∏"); mHtmlEncodeMap.put('\u2211', "∑"); mHtmlEncodeMap.put('\u2212', "−"); mHtmlEncodeMap.put('\u2217', "∗"); mHtmlEncodeMap.put('\u221A', "√"); mHtmlEncodeMap.put('\u221D', "∝"); mHtmlEncodeMap.put('\u221E', "∞"); mHtmlEncodeMap.put('\u2220', "∠"); mHtmlEncodeMap.put('\u2227', "∧"); mHtmlEncodeMap.put('\u2228', "∨"); mHtmlEncodeMap.put('\u2229', "∩"); mHtmlEncodeMap.put('\u222A', "∪"); mHtmlEncodeMap.put('\u222B', "∫"); mHtmlEncodeMap.put('\u2234', "∴"); mHtmlEncodeMap.put('\u223C', "∼"); mHtmlEncodeMap.put('\u2245', "≅"); mHtmlEncodeMap.put('\u2248', "≈"); mHtmlEncodeMap.put('\u2260', "≠"); mHtmlEncodeMap.put('\u2261', "≡"); mHtmlEncodeMap.put('\u2264', "≤"); mHtmlEncodeMap.put('\u2265', "≥"); mHtmlEncodeMap.put('\u2282', "⊂"); mHtmlEncodeMap.put('\u2283', "⊃"); mHtmlEncodeMap.put('\u2284', "⊄"); mHtmlEncodeMap.put('\u2286', "⊆"); mHtmlEncodeMap.put('\u2287', "⊇"); mHtmlEncodeMap.put('\u2295', "⊕"); mHtmlEncodeMap.put('\u2297', "⊗"); mHtmlEncodeMap.put('\u22A5', "⊥"); mHtmlEncodeMap.put('\u22C5', "⋅"); mHtmlEncodeMap.put('\u2308', "⌈"); mHtmlEncodeMap.put('\u2309', "⌉"); mHtmlEncodeMap.put('\u230A', "⌊"); mHtmlEncodeMap.put('\u230B', "⌋"); mHtmlEncodeMap.put('\u2329', "⟨"); mHtmlEncodeMap.put('\u232A', "⟩"); mHtmlEncodeMap.put('\u25CA', "◊"); mHtmlEncodeMap.put('\u2660', "♠"); mHtmlEncodeMap.put('\u2663', "♣"); mHtmlEncodeMap.put('\u2665', "♥"); mHtmlEncodeMap.put('\u2666', "♦"); } private StringUtils() { } /** * Transforms a provided <code>String</code> object into a new string, * containing only valid Html characters. * * @param source The string that has to be transformed into a valid Html * string. * * @return The encoded <code>String</code> object. * * @since 1.0 */ public static String encodeHtml(String source) { return encode(source, mHtmlEncodeMap); } /** * Transforms a provided <code>String</code> object into a new string, * using the mapping that are provided through the supplied encoding table. * * @param source The string that has to be transformed into a valid string, * using the mappings that are provided through the supplied encoding table. * @param encodingTables A <code>Map</code> object containing the mappings to * transform characters into valid entities. The keys of this map should be * <code>Character</code> objects and the values <code>String</code> * objects. * * @return The encoded <code>String</code> object. * * @since 1.0 */ private static String encode(String source, CharKeyOpenHashMap encodingTable) { if (null == source) { return null; } if (null == encodingTable) { return source; } StringBuffer encoded_string = null; char[] string_to_encode_array = source.toCharArray(); int last_match = -1; int difference = 0; for (int i = 0; i < string_to_encode_array.length; i++) { char char_to_encode = string_to_encode_array[i]; if (encodingTable.containsKey(char_to_encode)) { if (null == encoded_string) { encoded_string = new StringBuffer(source.length()); } difference = i - (last_match + 1); if (difference > 0) { encoded_string.append(string_to_encode_array, last_match + 1, difference); } encoded_string.append(encodingTable.get(char_to_encode)); last_match = i; } } if (null == encoded_string) { return source; } else { difference = string_to_encode_array.length - (last_match + 1); if (difference > 0) { encoded_string.append(string_to_encode_array, last_match + 1, difference); } return encoded_string.toString(); } } /** * Checks if the name filters through an including and an excluding * regular expression. * * @param name The <code>String</code> that will be filtered. * @param included The regular expressions that needs to succeed * @param excluded The regular expressions that needs to fail * * @return <code>true</code> if the name filtered through correctly; or * <p> * <code>false</code> otherwise. * * @since 1.0 */ public static boolean filter(String name, Pattern included, Pattern excluded) { Pattern[] included_array = null; if (included != null) { included_array = new Pattern[] {included}; } Pattern[] excluded_array = null; if (excluded != null) { excluded_array = new Pattern[] {excluded}; } return filter(name, included_array, excluded_array); } /** * Checks if the name filters through a series of including and excluding * regular expressions. * * @param name The <code>String</code> that will be filtered. * @param included An array of regular expressions that need to succeed * @param excluded An array of regular expressions that need to fail * * @return <code>true</code> if the name filtered through correctly; or * <p> * <code>false</code> otherwise. * * @since 1.0 */ public static boolean filter(String name, Pattern[] included, Pattern[] excluded) { if (null == name) { return false; } boolean accepted = false; // retain only the includes if (null == included) { accepted = true; } else { Pattern pattern; for (int i = 0; i < included.length; i++) { pattern = included[i]; if (pattern != null && pattern.matcher(name).matches()) { accepted = true; break; } } } // remove the excludes if (accepted && excluded != null) { Pattern pattern; for (int i = 0; i < excluded.length; i++) { pattern = excluded[i]; if (pattern != null && pattern.matcher(name).matches()) { accepted = false; break; } } } return accepted; } /** * Splits a string into different parts, using a seperator string to detect * the seperation boundaries in a case-sensitive manner. The seperator will * not be included in the list of parts. * * @param source The string that will be split into parts. * @param seperator The seperator string that will be used to determine the * parts. * * @return An <code>ArrayList</code> containing the parts as * <code>String</code> objects. * * @since 1.0 */ public static ArrayList split(String source, String seperator) { return split(source, seperator, true); } /** * Splits a string into different parts, using a seperator string to detect * the seperation boundaries. The seperator will not be included in the list * of parts. * * @param source The string that will be split into parts. * @param seperator The seperator string that will be used to determine the * parts. * @param matchCase A <code>boolean</code> indicating if the match is going * to be performed in a case-sensitive manner or not. * * @return An <code>ArrayList</code> containing the parts as * <code>String</code> objects. * * @since 1.0 */ public static ArrayList split(String source, String seperator, boolean matchCase) { ArrayList substrings = new ArrayList(); if (null == source) { return substrings; } if (null == seperator) { substrings.add(source); return substrings; } int current_index = 0; int delimiter_index = 0; String element = null; String source_lookup_reference = null; if (!matchCase) { source_lookup_reference = source.toLowerCase(); seperator = seperator.toLowerCase(); } else { source_lookup_reference = source; } while (current_index <= source_lookup_reference.length()) { delimiter_index = source_lookup_reference.indexOf(seperator, current_index); if (-1 == delimiter_index) { element = new String(source.substring(current_index, source.length())); substrings.add(element); current_index = source.length() + 1; } else { element = new String(source.substring(current_index, delimiter_index)); substrings.add(element); current_index = delimiter_index + seperator.length(); } } return substrings; } /** * Searches for a string within a specified string in a case-sensitive * manner and replaces every match with another string. * * @param source The string in which the matching parts will be replaced. * @param stringToReplace The string that will be searched for. * @param replacementString The string that will replace each matching part. * * @return A new <code>String</code> object containing the replacement * result. * * @since 1.0 */ public static String replace(String source, String stringToReplace, String replacementString) { return replace(source, stringToReplace, replacementString, true); } /** * Searches for a string within a specified string and replaces every match * with another string. * * @param source The string in which the matching parts will be replaced. * @param stringToReplace The string that will be searched for. * @param replacementString The string that will replace each matching part. * @param matchCase A <code>boolean</code> indicating if the match is going * to be performed in a case-sensitive manner or not. * * @return A new <code>String</code> object containing the replacement * result. * * @since 1.0 */ public static String replace(String source, String stringToReplace, String replacementString, boolean matchCase) { if (null == source) { return null; } if (null == stringToReplace) { return source; } if (null == replacementString) { return source; } Iterator string_parts = split(source, stringToReplace, matchCase).iterator(); StringBuffer new_string = new StringBuffer(); synchronized (new_string) // speed increase by thread lock pre-allocation { while (string_parts.hasNext()) { String string_part = (String)string_parts.next(); new_string.append(string_part); if (string_parts.hasNext()) { new_string.append(replacementString); } } return new_string.toString(); } } /** * Creates a new string that contains the provided string a number of times. * * @param source The string that will be repeated. * @param count The number of times that the string will be repeated. * @return A new <code>String</code> object containing the repeated * concatenation result. * * @since 1.0 */ public static String repeat(String source, int count) { if (null == source) { return null; } StringBuffer new_string = new StringBuffer(); synchronized (new_string) // speed increase by thread lock pre-allocation { while (count > 0) { new_string.append(source); count --; } return new_string.toString(); } } /** * Converts all tabs on a line to spaces according to the provided tab * width. * * @param line The line whose tabs have to be converted. * @param tabWidth The tab width. * @return A new <code>String</code> object containing the line with the * replaced tabs. * @since 1.0 */ public static String convertTabsToSpaces(String line, int tabWidth) { StringBuffer result = new StringBuffer(); synchronized (result) // speed increase by thread lock pre-allocation { int tab_index = -1; int last_tab_index = 0; int added_chars = 0; int tab_size; while ((tab_index = line.indexOf("\t", last_tab_index)) != -1) { tab_size = tabWidth - ((tab_index + added_chars) % tabWidth); if (0 == tab_size) { tab_size = tabWidth; } added_chars += tab_size - 1; result.append(line.substring(last_tab_index, tab_index)); result.append(StringUtils.repeat(" ", tab_size)); last_tab_index = tab_index + 1; } if (0 == last_tab_index) { return line; } else { result.append(line.substring(last_tab_index)); } } return result.toString(); } }