package nl.helixsoft.util;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Collection of static, stateless string helpers: joining, CSV-style splitting,
 * null-safe parsing and comparison, escaping and simple HTML clean-up.
 * <p>
 * The class cannot be instantiated.
 */
public abstract class HStringUtils
{
    /** Shared zero-length array, returned by {@link #safeSplit(String, Object)} for null or empty input. */
    public static final String[] EMPTY_STRING_ARRAY = new String[] {};

    /** Scientific species name: capitalised genus, a single space, lower-case species. */
    private static final Pattern SCIENTIFIC_NAME = Pattern.compile ("^([A-Z])[a-z]+ ([a-z]+)");

    /** Any letter in the upper- or lower-case Greek ranges handled by {@link #greekToEnglish(String)}. */
    private static final Pattern GREEK_LETTER = Pattern.compile("[\u0391-\u03A9\u03B1-\u03C9]");

    /** A named, decimal or hexadecimal character reference; group 1 is the part between '&' and ';'. */
    private static final Pattern ENTITY = Pattern.compile("&(\\w+|#x?[0-9a-fA-F]+);");

    /** Hexadecimal reference body, e.g. "#x41"; group 1 holds the hex digits. */
    private static final Pattern HEX_ENTITY = Pattern.compile ("#x([0-9a-fA-F]+)");

    /** Decimal reference body, e.g. "#65". */
    private static final Pattern DECIMAL_ENTITY = Pattern.compile ("#\\d+");

    /** One or more consecutive characters that are not considered safe in a file name. */
    private static final Pattern UNSAFE_FILENAME_CHARS = Pattern.compile("[^a-zA-Z0-9\\-_()\\]\\[}{. ]+");

    // parser states used by quotedSplit
    private static final int STATE_BOUNDARY = 0;
    private static final int STATE_CONTENT = 1;
    private static final int STATE_QUOTED = 2;
    private static final int STATE_QUOTE_AFTER_QUOTE = 3;

    /**
     * Null-safe string comparator using the natural (code unit) ordering of {@link String#compareTo(String)}.
     * A null value sorts before any non-null value; two nulls are equal.
     */
    public static class StringComparator implements Comparator<String>
    {
        public int compare(String s1, String s2)
        {
            if (s1 == null)
            {
                return (s2 == null) ? 0 : -1;
            }
            if (s2 == null)
            {
                return 1;
            }
            return s1.compareTo(s2);
        }
    }

    private HStringUtils() {} // instantiation forbidden

    /**
     * Split a line on commas, honouring double-quoted fields, with strict validation.
     *
     * @see #quotedSplit(String, char, char, boolean)
     */
    public static List<String> quotedCommaSplit(String input)
    {
        return HStringUtils.quotedSplit(input, '"', ',');
    }

    /**
     * Permissive version of quotedCommaSplit that prints warnings instead of throwing exceptions
     * in certain cases that don't adhere to the spec but are recoverable.
     */
    public static List<String> permissiveQuotedCommaSplit(String input)
    {
        return HStringUtils.quotedSplit(input, '"', ',', false);
    }

    /**
     * Concat a prefix and suffix to each element in a list, and join with a separator.
     * For example, turn the list a b c into {a};{b};{c}
     *
     * Equivalent to the following groovy code:
     *
     * <pre>data.collect{ prefix + it + suffix }.join(sep)</pre>
     *
     * @param sep separator between list elements
     * @param data list of strings
     * @param prefix prefix concatenated before each element in the list
     * @param suffix suffix concatenated after each element in the list
     * @return the joined string; empty if data is empty
     */
    public static String concatAndJoin (String sep, List<String> data, String prefix, String suffix)
    {
        StringBuilder builder = new StringBuilder();
        boolean first = true;
        for (String str : data)
        {
            if (!first)
            {
                builder.append (sep);
            }
            first = false;
            builder.append (prefix).append (str).append (suffix);
        }
        return builder.toString();
    }

    /**
     * Parse a double without throwing.
     *
     * @return the parsed value, or null if val is null or not a valid number
     */
    public static Double safeParseDouble(String val)
    {
        if (val == null) return null;
        try
        {
            return Double.valueOf(val);
        }
        catch (NumberFormatException e)
        {
            return null;
        }
    }

    /**
     * Parse an int without throwing.
     *
     * @return the parsed value, or null if val is null or not a valid integer
     */
    public static Integer safeParseInt(String val)
    {
        if (val == null) return null;
        try
        {
            return Integer.valueOf(val);
        }
        catch (NumberFormatException e)
        {
            return null;
        }
    }

    /**
     * Parse a long without throwing.
     *
     * @return the parsed value, or null if val is null or not a valid long
     */
    public static Long safeParseLong(String val)
    {
        if (val == null) return null;
        try
        {
            return Long.valueOf(val);
        }
        catch (NumberFormatException e)
        {
            return null;
        }
    }

    /**
     * Escape the characters that are significant in HTML text content:
     * the ampersand, less-than and greater-than signs are replaced by
     * {@code &amp;}, {@code &lt;} and {@code &gt;} respectively.
     * Quotes are not escaped, so the result is not suitable for attribute values.
     *
     * @param s text to escape, must not be null
     */
    public static String escapeHtml(String s)
    {
        //TODO: replace with apache codec?
        StringBuilder escaped = new StringBuilder(s.length() + 16);
        for (int i = 0; i < s.length(); ++i)
        {
            char c = s.charAt(i);
            switch (c)
            {
            case '&': escaped.append ("&amp;"); break;
            case '<': escaped.append ("&lt;"); break;
            case '>': escaped.append ("&gt;"); break;
            default: escaped.append (c); break;
            }
        }
        return escaped.toString();
    }

    /**
     * Join collection into a single string, with a separator between.
     * Elements are converted with their string representation; null elements become "null".
     */
    public static String join (String sep, Collection<?> values)
    {
        StringBuilder builder = new StringBuilder();
        join (builder, sep, values);
        return builder.toString();
    }

    /**
     * Join collection into and append to a StringBuilder, with a separator between.
     * Useful if you want to join strings and append to an existing StringBuilder.
     * @param builder StringBuilder you want to append to. This variable will be modified.
     * @param sep Separator between strings
     * @param values collection of strings to join.
     */
    public static void join (StringBuilder builder, String sep, Collection<?> values)
    {
        boolean first = true;
        for (Object o : values)
        {
            if (!first)
            {
                builder.append (sep);
            }
            first = false;
            builder.append (o);
        }
    }

    /**
     * Join a multi value list and append to a StringBuilder, with a separator between.
     * @param builder StringBuilder you want to append to. This variable will be modified.
     * @param sep Separator between values
     * @param values values to join.
     */
    public static <T> void join (StringBuilder builder, String sep, T... values)
    {
        boolean first = true;
        for (Object o : values)
        {
            if (!first)
            {
                builder.append (sep);
            }
            first = false;
            builder.append (o);
        }
    }

    /**
     * Join a multi value list into a single string, with a separator between.
     */
    public static <T> String join (String sep, T... values)
    {
        StringBuilder builder = new StringBuilder();
        join (builder, sep, values);
        return builder.toString();
    }

    /**
     * If input string starts with a quote character (") AND ends with a separate, second quote character, both are removed.
     * Otherwise the input is returned unchanged. In particular a string consisting of a single
     * quote character is returned unchanged.
     * Null-Safe: returns null if input is null.
     */
    public static String removeOptionalQuotes(String in)
    {
        if (in == null) return null;
        // length check: a lone quote is both prefix and suffix but there is nothing to strip
        if (in.length() >= 2 && in.startsWith("\"") && in.endsWith("\""))
        {
            return in.substring (1, in.length() - 1);
        }
        return in;
    }

    /**
     * @return true if s is null or the empty string
     */
    public static boolean emptyOrNull (String s)
    {
        return s == null || s.length() == 0;
    }

    /**
     * Make the first character in the String uppercase.
     * Leave the remaining characters unchanged.
     * Null-Safe: If the input value is null, this returns null;
     * the empty string is returned unchanged.
     */
    public static String initialUpper (String input)
    {
        if (input == null || input.length() == 0) return input;
        return input.substring(0, 1).toUpperCase() + input.substring (1);
    }

    /**
     * Applies escaping as described in MySQL documentation here: https://dev.mysql.com/doc/refman/5.0/en/string-literals.html
     * Note that this applies escapes for a literal enclosed by single quotes, not double quotes!
     *
     * @param literal raw value, must not be null
     * @return the value with single quote, backslash, NUL, backspace, newline, carriage return,
     *         tab and Ctrl-Z (ASCII 26) replaced by their backslash escapes
     */
    public static String escapeMysqlLiteral(String literal)
    {
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < literal.length(); ++i)
        {
            char c = literal.charAt(i);
            switch (c)
            {
            case '\'':
            case '\\':
                result.append ('\\').append (c);
                break;
            case 0: result.append ("\\0"); break;
            case '\b': result.append ("\\b"); break;
            case '\n': result.append ("\\n"); break;
            case '\r': result.append ("\\r"); break;
            case '\t': result.append ("\\t"); break;
            case 26: result.append ("\\Z"); break;
            default: result.append (c); break;
            }
        }
        return result.toString();
    }

    /**
     * Shorten a scientific species name.
     * Takes the first capital letter of the first word, plus the second word.
     * Homo sapiens -> Hsapiens
     *
     * @throws IllegalArgumentException if species is not exactly a capitalised word,
     *         one space and a lower-case word
     */
    public static String scientificShort(String species)
    {
        Matcher mat = SCIENTIFIC_NAME.matcher(species);
        if (!mat.matches()) throw new IllegalArgumentException (species + " is not a valid scientific name.");
        return mat.group(1) + mat.group(2);
    }

    /**
     * Remove spaces, make each word start with uppercase.
     * CamelCase is a method for removing spaces from a phrase while maintaining legibility.
     *
     * For example "Small molecule" -> "SmallMolecule"
     * "Show me your ID!" -> "ShowMeYourID!"
     * "Two  spaces" -> "TwoSpaces"
     * " surrounded " -> "Surrounded"
     * Null-Safe: returns null if input is null;
     * an empty or blank input yields the empty string.
     */
    public static String toCamelCase(String value)
    {
        if (value == null) return null;
        StringBuilder result = new StringBuilder();
        for (String word : value.trim().split (" +"))
        {
            // splitting an empty string yields one empty word; there is nothing to capitalise
            if (word.length() == 0) continue;
            result.append (word.substring (0, 1).toUpperCase());
            result.append (word.substring (1));
        }
        return result.toString();
    }

    /**
     * Null-Safe version of String.toUpperCase;
     */
    public static String toUpperCase (String input)
    {
        return (input == null) ? null : input.toUpperCase();
    }

    /**
     * Null-Safe version of String.toLowerCase;
     */
    public static String toLowerCase (String input)
    {
        return (input == null) ? null : input.toLowerCase();
    }

    /**
     * Intended to better deal with newline characters in CSV files. NOT YET IMPLEMENTED:
     * this method always throws.
     *
     * The idea is that a line is parsed in multiple invocations like this:
     *
     * <pre>{@code
     * List<String> row = new ArrayList<String>();
     * boolean continued = false;
     * do
     * {
     *     String line = readLine();
     *     if (line == null) break; // reached end-of-file.
     *     continued = quotedSplit (line, '"', ',', continued, row);
     * } while (continued)
     * }</pre>
     *
     * @throws UnsupportedOperationException always
     */
    public static boolean quotedSplit (String input, char quoteChar, char separatorChar, boolean continued, List<String> previouslist)
    {
        //TODO: implement
        throw new UnsupportedOperationException("Not yet implemented");
    }

    /**
     * Strictly validating variant of {@link #quotedSplit(String, char, char, boolean)}.
     */
    public static List<String> quotedSplit(String input, char quoteChar, char separatorChar)
    {
        return quotedSplit(input, quoteChar, separatorChar, true);
    }

    /**
     * Efficiently parses strings like:
     *
     * a, b, c <br>
     *
     * a,"b", c <br>
     *
     * a,"b,b", c <br>
     *
     * a,"b""b", c <br>
     *
     * Also handles newline characters between quotes, assuming a multi-line string is passed as argument.
     * Spaces before a field and between a closing quote and the next separator are skipped.
     *
     * If input is null, will return an empty list.
     * Note that a separator at the very end of the input does not produce a trailing empty field.
     *
     * @param input the line to split, may be null
     * @param quoteChar character that encloses quoted fields; doubled inside a quoted field to represent itself
     * @param separatorChar character that separates fields; must differ from quoteChar
     * @param strictValidation if you pass true, may throw an exception if the line doesn't adhere to CSV spec. If false, merely print a warning to STDERR.
     * @return the list of fields, never null
     * @throws IllegalArgumentException on a missing closing quote, on an illegal character after a closing quote,
     *         or (strict mode only) on a quote in the middle of an unquoted field
     * @see "https://en.wikipedia.org/wiki/Comma-separated_values"
     */
    public static List<String> quotedSplit(String input, char quoteChar, char separatorChar, boolean strictValidation)
    {
        assert quoteChar != separatorChar;
        List<String> result = new ArrayList<String>();
        if (input == null) return result;

        int state = STATE_BOUNDARY;
        int start = 0;       // index where the pending, not yet copied, part of the field starts
        int quotePos = 0;    // index of the most recent quote that may turn out to be the closing quote
        StringBuilder current = null; // accumulates the pieces of a quoted field

        for (int pos = 0; pos < input.length(); ++pos)
        {
            char c = input.charAt(pos);
            switch (state)
            {
            case STATE_BOUNDARY:
                if (c == quoteChar)
                {
                    current = new StringBuilder();
                    start = pos + 1;
                    state = STATE_QUOTED;
                }
                else if (c == ' ')
                {
                    // ignore opening whitespace
                }
                else if (c == separatorChar)
                {
                    result.add ("");
                }
                else
                {
                    start = pos;
                    state = STATE_CONTENT;
                }
                break;
            case STATE_CONTENT:
                if (c == separatorChar)
                {
                    result.add (input.substring (start, pos));
                    state = STATE_BOUNDARY;
                }
                else if (c == quoteChar)
                {
                    if (strictValidation)
                    {
                        throw new IllegalArgumentException("Found quote in middle of field: " + input);
                    }
                    System.err.println ("WARNING: Found quote in middle of field: " + input + " which is against CSV spec");
                }
                break;
            case STATE_QUOTED:
                if (c == quoteChar)
                {
                    // either the closing quote or the first half of an escaped quote
                    quotePos = pos;
                    state = STATE_QUOTE_AFTER_QUOTE;
                }
                break;
            case STATE_QUOTE_AFTER_QUOTE:
                if (c == quoteChar)
                {
                    if (pos != quotePos + 1)
                    {
                        // whitespace was skipped, so the previous quote really closed the field
                        throw new IllegalArgumentException("Illegal character after closing quote: " + input);
                    }
                    // double quote: copy up to the first quote, keep the second one as content
                    current.append (input.substring (start, quotePos));
                    start = pos;
                    state = STATE_QUOTED;
                }
                else if (c == separatorChar)
                {
                    // the field ends at the closing quote, not at the skipped whitespace behind it
                    current.append (input.substring (start, quotePos));
                    result.add (current.toString());
                    state = STATE_BOUNDARY;
                }
                else if (c == ' ')
                {
                    // skip whitespace
                }
                else
                {
                    throw new IllegalArgumentException("Illegal character after closing quote: " + input);
                }
                break;
            }
        }

        // close up
        if (state == STATE_QUOTED)
        {
            throw new IllegalArgumentException("Missing closing quote: " + input);
        }
        else if (state == STATE_CONTENT)
        {
            result.add (input.substring (start));
        }
        else if (state == STATE_QUOTE_AFTER_QUOTE)
        {
            current.append (input.substring (start, quotePos));
            result.add (current.toString());
        }
        return result;
    }

    /**
     * URL-encode a string (application/x-www-form-urlencoded) using UTF-8.
     *
     * @throws RuntimeException if the platform does not know UTF-8, which should never happen
     */
    public static String urlEncode(String name)
    {
        //TODO: replace with apache codec?
        try
        {
            return URLEncoder.encode(name, "UTF-8");
        }
        catch (UnsupportedEncodingException e)
        {
            // keep the cause so that the stack trace shows what really went wrong
            throw new RuntimeException("Programming error: wrong encoding", e);
        }
    }

    /**
     * Strip html tags from a String.
     * For example, abc&lt;b&gt;def will be turned into abcdef.
     * Http entities such as {@code &amp;} are left unchanged, for doing that as well, see stripHtml.
     */
    public static String stripTags(String s)
    {
        return s.replaceAll("<[^>]+>", "");
    }

    /**
     * Create a string by repeating a given base a number of times.
     * A count of zero or less yields the empty string.
     */
    public static String rep (String base, int count)
    {
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < count; ++i)
        {
            builder.append (base);
        }
        return builder.toString();
    }

    /** Maps entity names (without the leading ampersand and trailing semicolon) to their replacement text. */
    private static final Map<String, String> httpEntities;
    //TODO: add more. see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
    static {
        httpEntities = new HashMap<String, String>();
        httpEntities.put ("larr", "\u2190");
        httpEntities.put ("harr", "\u2194");
        httpEntities.put ("rarr", "\u2192");
        httpEntities.put ("amp", "&");
        httpEntities.put ("lt", "<");
        httpEntities.put ("gt", ">");
        httpEntities.put ("quot", "\"");
        httpEntities.put ("apos", "'");
        httpEntities.put ("nbsp", "\u00A0");
        httpEntities.put ("alpha", "\u03B1");
        httpEntities.put ("Alpha", "\u0391");
        httpEntities.put ("beta", "\u03B2");
        httpEntities.put ("Beta", "\u0392");
        httpEntities.put ("gamma", "\u03B3");
        httpEntities.put ("Gamma", "\u0393");
        httpEntities.put ("delta", "\u03B4");
        httpEntities.put ("Delta", "\u0394");
        httpEntities.put ("epsilon", "\u03B5");
        httpEntities.put ("Epsilon", "\u0395");
        httpEntities.put ("zeta", "\u03B6");
        httpEntities.put ("eta", "\u03B7");
        httpEntities.put ("theta", "\u03B8");
        httpEntities.put ("iota", "\u03B9");
        httpEntities.put ("kappa", "\u03BA");
        httpEntities.put ("lambda", "\u03BB");
        httpEntities.put ("mu", "\u03BC");
        httpEntities.put ("nu", "\u03BD");
        httpEntities.put ("xi", "\u03BE");
        httpEntities.put ("omicron", "\u03BF");
        httpEntities.put ("pi", "\u03C0");
        httpEntities.put ("rho", "\u03C1");
        // gap
        httpEntities.put ("sigma", "\u03C3");
        httpEntities.put ("tau", "\u03C4");
        httpEntities.put ("upsilon", "\u03C5");
        httpEntities.put ("phi", "\u03C6");
        httpEntities.put ("chi", "\u03C7");
        httpEntities.put ("psi", "\u03C8");
        httpEntities.put ("Omega", "\u03A9");
        httpEntities.put ("omega", "\u03C9");
    }

    /** Maps Greek letters to their English names, used by {@link #greekToEnglish(String)}. */
    private static final Map<Character, String> greek;
    //TODO: add more. see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references
    static {
        greek = new HashMap<Character, String>();
        greek.put ('\u03B1', "alpha");
        greek.put ('\u0391', "Alpha");
        greek.put ('\u03B2', "beta");
        greek.put ('\u0392', "Beta");
        greek.put ('\u03B3', "gamma");
        greek.put ('\u0393', "Gamma");
        greek.put ('\u03B4', "delta");
        greek.put ('\u0394', "Delta");
        greek.put ('\u03B5', "epsilon");
        greek.put ('\u0395', "Epsilon");
        greek.put ('\u03B6', "zeta");
        greek.put ('\u03B7', "eta");
        greek.put ('\u03B8', "theta");
        greek.put ('\u03B9', "iota");
        greek.put ('\u03BA', "kappa");
        greek.put ('\u03BB', "lambda");
        greek.put ('\u03BC', "mu");
        greek.put ('\u03BD', "nu");
        greek.put ('\u03BE', "xi");
        greek.put ('\u03BF', "omicron");
        greek.put ('\u03C0', "pi");
        greek.put ('\u03C1', "rho");
        // gap
        greek.put ('\u03C3', "sigma");
        greek.put ('\u03C4', "tau");
        greek.put ('\u03C5', "upsilon");
        greek.put ('\u03C6', "phi");
        greek.put ('\u03C7', "chi");
        greek.put ('\u03C8', "psi");
        greek.put ('\u03A9', "Omega");
        greek.put ('\u03C9', "omega");
    }

    /**
     * Replace greek letters with english text, for example the lower-case letter
     * beta (U+03B2) becomes "beta" and the upper-case letter (U+0392) becomes "Beta".
     * Greek letters that have no entry in the lookup table are left unchanged.
     *
     * @param input text to convert, must not be null
     */
    public static String greekToEnglish(String input)
    {
        if (!GREEK_LETTER.matcher(input).find())
            return input; // no replacement needed.

        StringBuilder result = new StringBuilder();
        for (int i = 0; i < input.length(); ++i)
        {
            char c = input.charAt(i);
            String name = greek.get(c);
            if (name == null)
                result.append (c);
            else
                result.append (name);
        }
        return result.toString();
    }

    /**
     * Replace character references by the characters they stand for.
     * Handles decimal references ({@code &#65;}), hexadecimal references ({@code &#x41;})
     * and the named entities known to this class.
     * References that cannot be resolved (unknown name, number out of range) are left
     * unchanged and a message is printed to STDERR.
     *
     * @param s text to decode, must not be null
     */
    public static String decodeEntities (String s)
    {
        StringBuffer buf = new StringBuffer();
        Matcher m = ENTITY.matcher(s);
        while (m.find())
        {
            String entity = m.group(1);
            String decoded = (entity.charAt(0) == '#')
                ? decodeNumericEntity (entity)
                : httpEntities.get(entity);
            if (decoded == null)
            {
                // put original back
                decoded = m.group(0);
                System.err.println ("Failed to look up " + entity);
            }
            // quoteReplacement: a decoded '$' or '\' must not be interpreted as a group reference
            m.appendReplacement(buf, Matcher.quoteReplacement(decoded));
        }
        m.appendTail(buf);
        return buf.toString();
    }

    /**
     * Decode the body of a numeric character reference such as "#65" or "#x41".
     *
     * @return the character(s) for the referenced code point, or null if the body is
     *         not a well-formed number or does not denote a valid Unicode code point.
     */
    private static String decodeNumericEntity (String entity)
    {
        int codePoint;
        try
        {
            Matcher hex = HEX_ENTITY.matcher(entity);
            if (hex.matches())
            {
                codePoint = Integer.parseInt(hex.group(1), 16);
            }
            else if (DECIMAL_ENTITY.matcher(entity).matches())
            {
                codePoint = Integer.parseInt(entity.substring(1));
            }
            else
            {
                return null; // e.g. "#ab": hex digits without the x marker
            }
        }
        catch (NumberFormatException e)
        {
            return null; // too many digits to fit in an int
        }
        if (!Character.isValidCodePoint(codePoint)) return null;
        // toChars yields a surrogate pair for code points beyond the basic multilingual plane
        return new String(Character.toChars(codePoint));
    }

    /**
     * Strips html tags and http entities.
     * Http entities such as {@code &amp;} are replaced with the character they represent.
     * Runs of spaces are collapsed to a single space.
     */
    public static String stripHtml(String string)
    {
        //TODO: replace with apache codec?
        // strip tags
        String s = string.replaceAll("<[^>]+>", "");
        // reduce double whitespace that may have been left as a result from previous step.
        s = s.replaceAll(" +", " ");
        // replace http entities
        return decodeEntities (s);
    }

    /**
     * Make sure a string stays within a certain length, by cutting a bit from the middle
     * and putting the separator in its place.
     *
     * @param result the string to shorten, must not be null
     * @param maxLength maximum length of the returned string
     * @param separator text inserted where characters were removed, must not be null
     * @return the input if it already fits; otherwise head + separator + tail, no longer than maxLength.
     *         If the separator itself does not leave room for any content, the input is simply truncated.
     */
    public static String abbrev(String result, int maxLength, String separator)
    {
        if (result.length() <= maxLength) return result;

        int mid = maxLength / 2;
        int head = mid - 1;
        int tail = mid;
        if (head < 0 || head + separator.length() + tail > maxLength)
        {
            // the default split does not fit: divide the space left after the separator
            int keep = maxLength - separator.length();
            if (keep <= 0)
            {
                return result.substring(0, Math.max(0, maxLength));
            }
            head = keep / 2;
            tail = keep - head;
        }
        return result.substring(0, head) + separator + result.substring(result.length() - tail);
    }

    /**
     * Compare two strings, without throwing nullpointerexception, ignoring case.
     * Two nulls are equal; null is not equal to any non-null string.
     */
    public static boolean safeEqualsIgnoreCase(String a, String b)
    {
        // call on a, which is known to be non-null; equalsIgnoreCase accepts a null argument
        return (a == null) ? (b == null) : a.equalsIgnoreCase(b);
    }

    /**
     * Safely convert an object to a String;
     * If the input is null, the result is null as well.
     */
    public static String safeToString (Object o)
    {
        return (o == null) ? null : o.toString();
    }

    /**
     * Make a string suitable for use as filename,
     * by replacing each run of unsafe characters with a single dash.
     * Safe characters are ASCII letters and digits, space and the characters - _ ( ) [ ] { } .
     */
    public static String makeFileName (String s)
    {
        return UNSAFE_FILENAME_CHARS.matcher(s).replaceAll("-");
    }

    /**
     * @return true if s consists only of characters that {@link #makeFileName(String)} would leave unchanged
     */
    public static boolean isFileNameSafe (String s)
    {
        // find() rather than matches(".*x.*"): the dot does not match line terminators
        return !UNSAFE_FILENAME_CHARS.matcher(s).find();
    }

    /**
     * Check if the string contains only allowed characters.
     * @param haystack the string to check
     * @param allowedCharacters every character that may occur in haystack
     * @return null if ok, or an error message naming the first illegal character and its position otherwise.
     */
    public static String checkForIllegalCharacter (String haystack, String allowedCharacters)
    {
        Set<Character> allowedSet = new HashSet<Character>();
        for (char c : allowedCharacters.toCharArray())
        {
            allowedSet.add(c);
        }

        for (int pos = 0; pos < haystack.length(); ++pos)
        {
            char hay = haystack.charAt(pos);
            if (!allowedSet.contains(hay))
            {
                return "contains illegal character '" + hay + "' at position " + pos;
            }
        }
        return null; // no problem found
    }

    /** safely split a string,
     * Unlike String.split(), this works on null strings.
     * Returns a zero-length array if the input is null or its string form is empty.
     * @param regex the delimiting regular expression
     * @param o object whose string form is split, may be null */
    public static String[] safeSplit(String regex, Object o)
    {
        if (o == null) return EMPTY_STRING_ARRAY;
        String s = o.toString();
        if ("".equals(s)) return EMPTY_STRING_ARRAY;
        return s.split(regex);
    }
}