/*
* Copyright 2001-2006 Geert Bevin <gbevin[remove] at uwyn dot com>
* Distributed under the terms of either:
* - the common development and distribution license (CDDL), v1.0; or
* - the GNU Lesser General Public License, v2.1 or later
* $Id: StringUtils.java 3108 2006-03-13 18:03:00Z gbevin $
*/
package com.uwyn.jhighlight.tools;
import com.uwyn.jhighlight.pcj.map.CharKeyOpenHashMap;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.Pattern;
/**
* General purpose class containing common <code>String</code> manipulation
* methods.
*
* @author Geert Bevin (gbevin[remove] at uwyn dot com)
* @version $Revision: 3108 $
* @since 1.0
*/
public abstract class StringUtils
{
private static final CharKeyOpenHashMap mHtmlEncodeMap = new CharKeyOpenHashMap();
static
{
// Html encoding mapping according to the HTML 4.0 spec
// http://www.w3.org/TR/REC-html40/sgml/entities.html
// Special characters for HTML
mHtmlEncodeMap.put('\u0026', "&");
mHtmlEncodeMap.put('\u003C', "<");
mHtmlEncodeMap.put('\u003E', ">");
mHtmlEncodeMap.put('\u0022', """);
mHtmlEncodeMap.put('\u0152', "Œ");
mHtmlEncodeMap.put('\u0153', "œ");
mHtmlEncodeMap.put('\u0160', "Š");
mHtmlEncodeMap.put('\u0161', "š");
mHtmlEncodeMap.put('\u0178', "Ÿ");
mHtmlEncodeMap.put('\u02C6', "ˆ");
mHtmlEncodeMap.put('\u02DC', "˜");
mHtmlEncodeMap.put('\u2002', " ");
mHtmlEncodeMap.put('\u2003', " ");
mHtmlEncodeMap.put('\u2009', " ");
mHtmlEncodeMap.put('\u200C', "");
mHtmlEncodeMap.put('\u200D', "");
mHtmlEncodeMap.put('\u200E', "");
mHtmlEncodeMap.put('\u200F', "");
mHtmlEncodeMap.put('\u2013', "–");
mHtmlEncodeMap.put('\u2014', "—");
mHtmlEncodeMap.put('\u2018', "‘");
mHtmlEncodeMap.put('\u2019', "’");
mHtmlEncodeMap.put('\u201A', "‚");
mHtmlEncodeMap.put('\u201C', "“");
mHtmlEncodeMap.put('\u201D', "”");
mHtmlEncodeMap.put('\u201E', "„");
mHtmlEncodeMap.put('\u2020', "†");
mHtmlEncodeMap.put('\u2021', "‡");
mHtmlEncodeMap.put('\u2030', "‰");
mHtmlEncodeMap.put('\u2039', "‹");
mHtmlEncodeMap.put('\u203A', "›");
mHtmlEncodeMap.put('\u20AC', "€");
// Character entity references for ISO 8859-1 characters
mHtmlEncodeMap.put('\u00A0', " ");
mHtmlEncodeMap.put('\u00A1', "¡");
mHtmlEncodeMap.put('\u00A2', "¢");
mHtmlEncodeMap.put('\u00A3', "£");
mHtmlEncodeMap.put('\u00A4', "¤");
mHtmlEncodeMap.put('\u00A5', "¥");
mHtmlEncodeMap.put('\u00A6', "¦");
mHtmlEncodeMap.put('\u00A7', "§");
mHtmlEncodeMap.put('\u00A8', "¨");
mHtmlEncodeMap.put('\u00A9', "©");
mHtmlEncodeMap.put('\u00AA', "ª");
mHtmlEncodeMap.put('\u00AB', "«");
mHtmlEncodeMap.put('\u00AC', "¬");
mHtmlEncodeMap.put('\u00AD', "");
mHtmlEncodeMap.put('\u00AE', "®");
mHtmlEncodeMap.put('\u00AF', "¯");
mHtmlEncodeMap.put('\u00B0', "°");
mHtmlEncodeMap.put('\u00B1', "±");
mHtmlEncodeMap.put('\u00B2', "²");
mHtmlEncodeMap.put('\u00B3', "³");
mHtmlEncodeMap.put('\u00B4', "´");
mHtmlEncodeMap.put('\u00B5', "µ");
mHtmlEncodeMap.put('\u00B6', "¶");
mHtmlEncodeMap.put('\u00B7', "·");
mHtmlEncodeMap.put('\u00B8', "¸");
mHtmlEncodeMap.put('\u00B9', "¹");
mHtmlEncodeMap.put('\u00BA', "º");
mHtmlEncodeMap.put('\u00BB', "»");
mHtmlEncodeMap.put('\u00BC', "¼");
mHtmlEncodeMap.put('\u00BD', "½");
mHtmlEncodeMap.put('\u00BE', "¾");
mHtmlEncodeMap.put('\u00BF', "¿");
mHtmlEncodeMap.put('\u00C0', "À");
mHtmlEncodeMap.put('\u00C1', "Á");
mHtmlEncodeMap.put('\u00C2', "Â");
mHtmlEncodeMap.put('\u00C3', "Ã");
mHtmlEncodeMap.put('\u00C4', "Ä");
mHtmlEncodeMap.put('\u00C5', "Å");
mHtmlEncodeMap.put('\u00C6', "Æ");
mHtmlEncodeMap.put('\u00C7', "Ç");
mHtmlEncodeMap.put('\u00C8', "È");
mHtmlEncodeMap.put('\u00C9', "É");
mHtmlEncodeMap.put('\u00CA', "Ê");
mHtmlEncodeMap.put('\u00CB', "Ë");
mHtmlEncodeMap.put('\u00CC', "Ì");
mHtmlEncodeMap.put('\u00CD', "Í");
mHtmlEncodeMap.put('\u00CE', "Î");
mHtmlEncodeMap.put('\u00CF', "Ï");
mHtmlEncodeMap.put('\u00D0', "Ð");
mHtmlEncodeMap.put('\u00D1', "Ñ");
mHtmlEncodeMap.put('\u00D2', "Ò");
mHtmlEncodeMap.put('\u00D3', "Ó");
mHtmlEncodeMap.put('\u00D4', "Ô");
mHtmlEncodeMap.put('\u00D5', "Õ");
mHtmlEncodeMap.put('\u00D6', "Ö");
mHtmlEncodeMap.put('\u00D7', "×");
mHtmlEncodeMap.put('\u00D8', "Ø");
mHtmlEncodeMap.put('\u00D9', "Ù");
mHtmlEncodeMap.put('\u00DA', "Ú");
mHtmlEncodeMap.put('\u00DB', "Û");
mHtmlEncodeMap.put('\u00DC', "Ü");
mHtmlEncodeMap.put('\u00DD', "Ý");
mHtmlEncodeMap.put('\u00DE', "Þ");
mHtmlEncodeMap.put('\u00DF', "ß");
mHtmlEncodeMap.put('\u00E0', "à");
mHtmlEncodeMap.put('\u00E1', "á");
mHtmlEncodeMap.put('\u00E2', "â");
mHtmlEncodeMap.put('\u00E3', "ã");
mHtmlEncodeMap.put('\u00E4', "ä");
mHtmlEncodeMap.put('\u00E5', "å");
mHtmlEncodeMap.put('\u00E6', "æ");
mHtmlEncodeMap.put('\u00E7', "ç");
mHtmlEncodeMap.put('\u00E8', "è");
mHtmlEncodeMap.put('\u00E9', "é");
mHtmlEncodeMap.put('\u00EA', "ê");
mHtmlEncodeMap.put('\u00EB', "ë");
mHtmlEncodeMap.put('\u00EC', "ì");
mHtmlEncodeMap.put('\u00ED', "í");
mHtmlEncodeMap.put('\u00EE', "î");
mHtmlEncodeMap.put('\u00EF', "ï");
mHtmlEncodeMap.put('\u00F0', "ð");
mHtmlEncodeMap.put('\u00F1', "ñ");
mHtmlEncodeMap.put('\u00F2', "ò");
mHtmlEncodeMap.put('\u00F3', "ó");
mHtmlEncodeMap.put('\u00F4', "ô");
mHtmlEncodeMap.put('\u00F5', "õ");
mHtmlEncodeMap.put('\u00F6', "ö");
mHtmlEncodeMap.put('\u00F7', "÷");
mHtmlEncodeMap.put('\u00F8', "ø");
mHtmlEncodeMap.put('\u00F9', "ù");
mHtmlEncodeMap.put('\u00FA', "ú");
mHtmlEncodeMap.put('\u00FB', "û");
mHtmlEncodeMap.put('\u00FC', "ü");
mHtmlEncodeMap.put('\u00FD', "ý");
mHtmlEncodeMap.put('\u00FE', "þ");
mHtmlEncodeMap.put('\u00FF', "ÿ");
// Mathematical, Greek and Symbolic characters for HTML
mHtmlEncodeMap.put('\u0192', "ƒ");
mHtmlEncodeMap.put('\u0391', "Α");
mHtmlEncodeMap.put('\u0392', "Β");
mHtmlEncodeMap.put('\u0393', "Γ");
mHtmlEncodeMap.put('\u0394', "Δ");
mHtmlEncodeMap.put('\u0395', "Ε");
mHtmlEncodeMap.put('\u0396', "Ζ");
mHtmlEncodeMap.put('\u0397', "Η");
mHtmlEncodeMap.put('\u0398', "Θ");
mHtmlEncodeMap.put('\u0399', "Ι");
mHtmlEncodeMap.put('\u039A', "Κ");
mHtmlEncodeMap.put('\u039B', "Λ");
mHtmlEncodeMap.put('\u039C', "Μ");
mHtmlEncodeMap.put('\u039D', "Ν");
mHtmlEncodeMap.put('\u039E', "Ξ");
mHtmlEncodeMap.put('\u039F', "Ο");
mHtmlEncodeMap.put('\u03A0', "Π");
mHtmlEncodeMap.put('\u03A1', "Ρ");
mHtmlEncodeMap.put('\u03A3', "Σ");
mHtmlEncodeMap.put('\u03A4', "Τ");
mHtmlEncodeMap.put('\u03A5', "Υ");
mHtmlEncodeMap.put('\u03A6', "Φ");
mHtmlEncodeMap.put('\u03A7', "Χ");
mHtmlEncodeMap.put('\u03A8', "Ψ");
mHtmlEncodeMap.put('\u03A9', "Ω");
mHtmlEncodeMap.put('\u03B1', "α");
mHtmlEncodeMap.put('\u03B2', "β");
mHtmlEncodeMap.put('\u03B3', "γ");
mHtmlEncodeMap.put('\u03B4', "δ");
mHtmlEncodeMap.put('\u03B5', "ε");
mHtmlEncodeMap.put('\u03B6', "ζ");
mHtmlEncodeMap.put('\u03B7', "η");
mHtmlEncodeMap.put('\u03B8', "θ");
mHtmlEncodeMap.put('\u03B9', "ι");
mHtmlEncodeMap.put('\u03BA', "κ");
mHtmlEncodeMap.put('\u03BB', "λ");
mHtmlEncodeMap.put('\u03BC', "μ");
mHtmlEncodeMap.put('\u03BD', "ν");
mHtmlEncodeMap.put('\u03BE', "ξ");
mHtmlEncodeMap.put('\u03BF', "ο");
mHtmlEncodeMap.put('\u03C0', "π");
mHtmlEncodeMap.put('\u03C1', "ρ");
mHtmlEncodeMap.put('\u03C2', "ς");
mHtmlEncodeMap.put('\u03C3', "σ");
mHtmlEncodeMap.put('\u03C4', "τ");
mHtmlEncodeMap.put('\u03C5', "υ");
mHtmlEncodeMap.put('\u03C6', "φ");
mHtmlEncodeMap.put('\u03C7', "χ");
mHtmlEncodeMap.put('\u03C8', "ψ");
mHtmlEncodeMap.put('\u03C9', "ω");
mHtmlEncodeMap.put('\u03D1', "ϑ");
mHtmlEncodeMap.put('\u03D2', "ϒ");
mHtmlEncodeMap.put('\u03D6', "ϖ");
mHtmlEncodeMap.put('\u2022', "•");
mHtmlEncodeMap.put('\u2026', "…");
mHtmlEncodeMap.put('\u2032', "′");
mHtmlEncodeMap.put('\u2033', "″");
mHtmlEncodeMap.put('\u203E', "‾");
mHtmlEncodeMap.put('\u2044', "⁄");
mHtmlEncodeMap.put('\u2118', "℘");
mHtmlEncodeMap.put('\u2111', "ℑ");
mHtmlEncodeMap.put('\u211C', "ℜ");
mHtmlEncodeMap.put('\u2122', "™");
mHtmlEncodeMap.put('\u2135', "ℵ");
mHtmlEncodeMap.put('\u2190', "←");
mHtmlEncodeMap.put('\u2191', "↑");
mHtmlEncodeMap.put('\u2192', "→");
mHtmlEncodeMap.put('\u2193', "↓");
mHtmlEncodeMap.put('\u2194', "↔");
mHtmlEncodeMap.put('\u21B5', "↵");
mHtmlEncodeMap.put('\u21D0', "⇐");
mHtmlEncodeMap.put('\u21D1', "⇑");
mHtmlEncodeMap.put('\u21D2', "⇒");
mHtmlEncodeMap.put('\u21D3', "⇓");
mHtmlEncodeMap.put('\u21D4', "⇔");
mHtmlEncodeMap.put('\u2200', "∀");
mHtmlEncodeMap.put('\u2202', "∂");
mHtmlEncodeMap.put('\u2203', "∃");
mHtmlEncodeMap.put('\u2205', "∅");
mHtmlEncodeMap.put('\u2207', "∇");
mHtmlEncodeMap.put('\u2208', "∈");
mHtmlEncodeMap.put('\u2209', "∉");
mHtmlEncodeMap.put('\u220B', "∋");
mHtmlEncodeMap.put('\u220F', "∏");
mHtmlEncodeMap.put('\u2211', "∑");
mHtmlEncodeMap.put('\u2212', "−");
mHtmlEncodeMap.put('\u2217', "∗");
mHtmlEncodeMap.put('\u221A', "√");
mHtmlEncodeMap.put('\u221D', "∝");
mHtmlEncodeMap.put('\u221E', "∞");
mHtmlEncodeMap.put('\u2220', "∠");
mHtmlEncodeMap.put('\u2227', "∧");
mHtmlEncodeMap.put('\u2228', "∨");
mHtmlEncodeMap.put('\u2229', "∩");
mHtmlEncodeMap.put('\u222A', "∪");
mHtmlEncodeMap.put('\u222B', "∫");
mHtmlEncodeMap.put('\u2234', "∴");
mHtmlEncodeMap.put('\u223C', "∼");
mHtmlEncodeMap.put('\u2245', "≅");
mHtmlEncodeMap.put('\u2248', "≈");
mHtmlEncodeMap.put('\u2260', "≠");
mHtmlEncodeMap.put('\u2261', "≡");
mHtmlEncodeMap.put('\u2264', "≤");
mHtmlEncodeMap.put('\u2265', "≥");
mHtmlEncodeMap.put('\u2282', "⊂");
mHtmlEncodeMap.put('\u2283', "⊃");
mHtmlEncodeMap.put('\u2284', "⊄");
mHtmlEncodeMap.put('\u2286', "⊆");
mHtmlEncodeMap.put('\u2287', "⊇");
mHtmlEncodeMap.put('\u2295', "⊕");
mHtmlEncodeMap.put('\u2297', "⊗");
mHtmlEncodeMap.put('\u22A5', "⊥");
mHtmlEncodeMap.put('\u22C5', "⋅");
mHtmlEncodeMap.put('\u2308', "⌈");
mHtmlEncodeMap.put('\u2309', "⌉");
mHtmlEncodeMap.put('\u230A', "⌊");
mHtmlEncodeMap.put('\u230B', "⌋");
mHtmlEncodeMap.put('\u2329', "〈");
mHtmlEncodeMap.put('\u232A', "〉");
mHtmlEncodeMap.put('\u25CA', "◊");
mHtmlEncodeMap.put('\u2660', "♠");
mHtmlEncodeMap.put('\u2663', "♣");
mHtmlEncodeMap.put('\u2665', "♥");
mHtmlEncodeMap.put('\u2666', "♦");
}
/**
 * Private no-op constructor: the class only exposes static helpers and is
 * not meant to be instantiated.
 */
private StringUtils()
{
}
/**
 * Encodes a string for inclusion in an HTML document by substituting every
 * character that has an entry in the class' HTML encoding table with the
 * replacement text stored for it.
 *
 * @param source The string to encode; may be <code>null</code>.
 *
 * @return The encoded string; <code>null</code> when <code>source</code>
 * is <code>null</code>.
 *
 * @since 1.0
 */
public static String encodeHtml(String source)
{
    final String encoded = encode(source, mHtmlEncodeMap);
    return encoded;
}
/**
 * Replaces every character of <code>source</code> that is a key of the
 * supplied encoding table with the value the table holds for it. All other
 * characters are copied through unchanged.
 *
 * @param source The string to encode; may be <code>null</code>.
 * @param encodingTable The table that maps a <code>char</code> to the
 * replacement appended in its place; may be <code>null</code>, in which
 * case nothing is replaced.
 *
 * @return <code>null</code> when <code>source</code> is <code>null</code>;
 * <code>source</code> itself when the table is <code>null</code> or no
 * character needed replacing; otherwise a new encoded string.
 *
 * @since 1.0
 */
private static String encode(String source, CharKeyOpenHashMap encodingTable)
{
    // a null source yields null, a null table leaves the source untouched
    if (source == null || encodingTable == null)
    {
        return source;
    }

    final char[] chars = source.toCharArray();
    // only allocated once the first replaceable character is found
    StringBuffer buffer = null;
    // index of the first character that has not been copied to the buffer yet
    int pending_start = 0;

    for (int pos = 0; pos < chars.length; pos++)
    {
        final char current = chars[pos];
        if (!encodingTable.containsKey(current))
        {
            continue;
        }

        if (buffer == null)
        {
            buffer = new StringBuffer(source.length());
        }
        // flush the run of unchanged characters that precedes this one
        if (pos > pending_start)
        {
            buffer.append(chars, pending_start, pos - pending_start);
        }
        buffer.append(encodingTable.get(current));
        pending_start = pos + 1;
    }

    if (buffer == null)
    {
        return source;
    }

    // copy whatever follows the last replaced character
    if (pending_start < chars.length)
    {
        buffer.append(chars, pending_start, chars.length - pending_start);
    }
    return buffer.toString();
}
/**
 * Checks a name against a single including and a single excluding regular
 * expression by delegating to the array based
 * <code>filter(String, Pattern[], Pattern[])</code>.
 *
 * @param name The <code>String</code> that will be filtered.
 * @param included The regular expression that has to match; when
 * <code>null</code>, no including pattern is passed on.
 * @param excluded The regular expression that must not match; when
 * <code>null</code>, no excluding pattern is passed on.
 *
 * @return <code>true</code> if the name passes the filter; or
 * <code>false</code> otherwise.
 *
 * @since 1.0
 */
public static boolean filter(String name, Pattern included, Pattern excluded)
{
    // a missing pattern is forwarded as a null array, not as an array holding null
    final Pattern[] include_patterns = (null == included) ? null : new Pattern[] {included};
    final Pattern[] exclude_patterns = (null == excluded) ? null : new Pattern[] {excluded};

    return filter(name, include_patterns, exclude_patterns);
}
/**
 * Checks a name against a series of including and excluding regular
 * expressions. The name is accepted when it fully matches at least one of
 * the including patterns (or when no including array is given) and fully
 * matches none of the excluding patterns.
 *
 * @param name The <code>String</code> that will be filtered.
 * @param included The regular expressions of which one has to match, or
 * <code>null</code> to accept every name at this stage. <code>null</code>
 * elements are skipped.
 * @param excluded The regular expressions that must not match, or
 * <code>null</code> to exclude nothing. <code>null</code> elements are
 * skipped.
 *
 * @return <code>true</code> if the name passes the filter; or
 * <code>false</code> otherwise, which includes a <code>null</code> name.
 *
 * @since 1.0
 */
public static boolean filter(String name, Pattern[] included, Pattern[] excluded)
{
    if (name == null)
    {
        return false;
    }

    // an include list was given but none of its patterns matches
    if (included != null && !matchesAny(name, included))
    {
        return false;
    }

    // accepted so far, the excludes get the final say
    return excluded == null || !matchesAny(name, excluded);
}

/**
 * Tells whether the name fully matches at least one non-null pattern.
 */
private static boolean matchesAny(String name, Pattern[] patterns)
{
    for (int i = 0; i < patterns.length; i++)
    {
        final Pattern candidate = patterns[i];
        if (candidate == null)
        {
            continue;
        }
        if (candidate.matcher(name).matches())
        {
            return true;
        }
    }
    return false;
}
/**
 * Splits a string into parts around each occurrence of a separator string,
 * matching the separator in a case-sensitive manner. The separator itself
 * is not included in the parts.
 *
 * @param source The string that will be split into parts.
 * @param seperator The separator string that marks the part boundaries.
 *
 * @return An <code>ArrayList</code> containing the parts as
 * <code>String</code> objects.
 *
 * @since 1.0
 */
public static ArrayList split(String source, String seperator)
{
    final boolean match_case = true;
    return split(source, seperator, match_case);
}
/**
 * Splits a string into parts around each occurrence of a separator string.
 * The separator itself is not included in the parts. Adjacent, leading and
 * trailing separators produce empty parts.
 *
 * @param source The string that will be split into parts.
 * @param seperator The separator string that marks the part boundaries.
 * When it is <code>null</code> or empty, nothing is split and the result
 * holds <code>source</code> as its only part.
 * @param matchCase <code>true</code> to match the separator in a
 * case-sensitive manner; <code>false</code> to ignore case.
 *
 * @return An <code>ArrayList</code> containing the parts as
 * <code>String</code> objects; empty when <code>source</code> is
 * <code>null</code>.
 *
 * @since 1.0
 */
public static ArrayList split(String source, String seperator, boolean matchCase)
{
    ArrayList substrings = new ArrayList();

    if (null == source)
    {
        return substrings;
    }

    // An empty separator matches at every index without consuming anything,
    // which would never let the scan advance; treat it like no separator.
    if (null == seperator || 0 == seperator.length())
    {
        substrings.add(source);
        return substrings;
    }

    final int separator_length = seperator.length();
    int part_start = 0;
    int delimiter_index;

    while ((delimiter_index = indexOfSeparator(source, seperator, part_start, !matchCase)) != -1)
    {
        substrings.add(source.substring(part_start, delimiter_index));
        part_start = delimiter_index + separator_length;
    }

    // the remainder after the last separator, possibly empty
    substrings.add(source.substring(part_start));

    return substrings;
}

/**
 * Finds the first occurrence of the separator at or after the given index,
 * optionally ignoring case. The search runs on the original string so that
 * the returned index is always valid for it; searching a lower-cased copy
 * is not safe because case conversion can change a string's length.
 */
private static int indexOfSeparator(String source, String seperator, int fromIndex, boolean ignoreCase)
{
    if (!ignoreCase)
    {
        return source.indexOf(seperator, fromIndex);
    }

    final int separator_length = seperator.length();
    final int last_start = source.length() - separator_length;
    for (int i = fromIndex; i <= last_start; i++)
    {
        if (source.regionMatches(true, i, seperator, 0, separator_length))
        {
            return i;
        }
    }
    return -1;
}
/**
 * Replaces every occurrence of a search string within a source string with
 * a replacement string, matching in a case-sensitive manner.
 *
 * @param source The string in which the matching parts will be replaced.
 * @param stringToReplace The string that will be searched for.
 * @param replacementString The string that will replace each matching part.
 *
 * @return A <code>String</code> object containing the replacement result.
 *
 * @since 1.0
 */
public static String replace(String source, String stringToReplace, String replacementString)
{
    final boolean match_case = true;
    return replace(source, stringToReplace, replacementString, match_case);
}
/**
 * Replaces every occurrence of a search string within a source string with
 * a replacement string. Occurrences are located from left to right and do
 * not overlap.
 *
 * @param source The string in which the matching parts will be replaced.
 * @param stringToReplace The string that will be searched for. When it is
 * <code>null</code> or empty, <code>source</code> is returned unchanged.
 * @param replacementString The string that will replace each matching
 * part. When it is <code>null</code>, <code>source</code> is returned
 * unchanged.
 * @param matchCase <code>true</code> to search in a case-sensitive manner;
 * <code>false</code> to ignore case.
 *
 * @return A <code>String</code> object containing the replacement result;
 * <code>null</code> when <code>source</code> is <code>null</code>.
 *
 * @since 1.0
 */
public static String replace(String source, String stringToReplace, String replacementString, boolean matchCase)
{
    if (null == source)
    {
        return null;
    }

    if (null == stringToReplace || null == replacementString)
    {
        return source;
    }

    final int match_length = stringToReplace.length();
    // an empty search string matches everywhere and can never be consumed
    if (0 == match_length)
    {
        return source;
    }

    final boolean ignore_case = !matchCase;
    // last index at which a complete match still fits
    final int last_start = source.length() - match_length;

    StringBuffer new_string = new StringBuffer(source.length());
    int copy_from = 0;
    int index = 0;
    while (index <= last_start)
    {
        // matching is done on the original string so that indices stay
        // valid even when case conversion would change the string length
        if (source.regionMatches(ignore_case, index, stringToReplace, 0, match_length))
        {
            new_string.append(source.substring(copy_from, index));
            new_string.append(replacementString);
            index += match_length;
            copy_from = index;
        }
        else
        {
            index++;
        }
    }
    new_string.append(source.substring(copy_from));

    return new_string.toString();
}
/**
 * Builds a string that consists of the provided string concatenated with
 * itself a number of times.
 *
 * @param source The string that will be repeated.
 * @param count The number of repetitions; zero or a negative value gives
 * an empty string.
 * @return A new <code>String</code> object containing the repeated
 * concatenation result; <code>null</code> when <code>source</code> is
 * <code>null</code>.
 *
 * @since 1.0
 */
public static String repeat(String source, int count)
{
    if (source == null)
    {
        return null;
    }

    final StringBuffer repeated = new StringBuffer();
    for (int remaining = count; remaining > 0; remaining--)
    {
        repeated.append(source);
    }
    return repeated.toString();
}
/**
 * Converts all tabs on a line to spaces according to the provided tab
 * width. Each tab is expanded to as many spaces as are needed to reach the
 * next column that is a multiple of the tab width, taking the expansion of
 * earlier tabs on the same line into account.
 *
 * @param line The line whose tabs have to be converted; may be
 * <code>null</code>.
 * @param tabWidth The tab width; has to be positive. A width of zero
 * causes an <code>ArithmeticException</code> as soon as the line contains
 * a tab.
 * @return <code>null</code> when <code>line</code> is <code>null</code>;
 * <code>line</code> itself when it contains no tab; otherwise a new
 * <code>String</code> object containing the line with the replaced tabs.
 * @since 1.0
 */
public static String convertTabsToSpaces(String line, int tabWidth)
{
    if (null == line)
    {
        return null;
    }

    int tab_index = line.indexOf('\t');
    if (-1 == tab_index)
    {
        return line;
    }

    StringBuffer result = new StringBuffer();
    int segment_start = 0;
    // how many characters longer the output is than the input consumed so far
    int added_chars = 0;
    do
    {
        // tab_index + added_chars is the column of this tab in the output;
        // the remainder is below tabWidth, so tab_size is between 1 and tabWidth
        final int tab_size = tabWidth - ((tab_index + added_chars) % tabWidth);
        // the tab itself accounted for one character already
        added_chars += tab_size - 1;

        result.append(line.substring(segment_start, tab_index));
        for (int i = 0; i < tab_size; i++)
        {
            result.append(' ');
        }

        segment_start = tab_index + 1;
        tab_index = line.indexOf('\t', segment_start);
    }
    while (tab_index != -1);

    result.append(line.substring(segment_start));

    return result.toString();
}
}