/*******************************************************************************
* Copyright 2016 Observational Health Data Sciences and Informatics
*
* This file is part of WhiteRabbit
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package org.ohdsi.utilities;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.text.DateFormat;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.GregorianCalendar;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.TimeZone;
import java.util.zip.DataFormatException;
/**
 * Static helpers for joining, escaping, splitting and classifying strings, for converting between
 * day numbers / millisecond values and date strings, and for hashing and edit distance.
 *
 * <p>
 * All "days" values are whole days since 1970-01-01 and all millisecond values handled by the date
 * helpers are interpreted on a UTC calendar, so results do not depend on the default time zone.
 *
 * <p>
 * The date helpers share static {@link Calendar} instances and are therefore not thread-safe.
 */
public class StringUtilities {

    // NOTE(review): these constants are public and non-final; they are kept that way so existing
    // callers keep compiling, but nothing should ever reassign them.

    /** Milliseconds in one second. */
    public static long SECOND = 1000;
    /** Milliseconds in one minute. */
    public static long MINUTE = 60 * SECOND;
    /** Milliseconds in one hour. */
    public static long HOUR = 60 * MINUTE;
    /** Milliseconds in one day. */
    public static long DAY = 24 * HOUR;
    /** Milliseconds in seven days. */
    public static long WEEK = 7 * DAY;
    /** Milliseconds in a 365-day year (leap days are ignored). */
    public static long YEAR = 365 * DAY;
    /** Milliseconds in one hundred 365-day years. */
    public static long CENTURY = 100 * YEAR;
    /** Milliseconds in one thousand 365-day years. */
    public static long MILLENIUM = 1000 * YEAR;
    /** Day number used to represent a missing date (the empty string in database format). */
    public static long MISSING_DATE = -999999;

    /** Calendar in the default time zone; only used to read fields of a {@link Date}. */
    private static final Calendar LOCAL_CALENDAR = new GregorianCalendar();
    /** Calendar fixed to UTC; used for every day-number / millisecond conversion. */
    private static final Calendar UTC_CALENDAR = new GregorianCalendar(TimeZone.getTimeZone("UTC"));

    private static final String[] ROMAN_NUMERALS = { "I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X" };

    private static final String[] GREEK_LETTERS = { "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
            "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi",
            "psi", "omega" };

    private static final char[] HEX_DIGITS = "0123456789ABCDEF".toCharArray();

    /**
     * Sorts the elements by their natural order and joins their string forms.
     *
     * @param s
     *            the elements to join; the collection itself is not modified
     * @param delimiter
     *            the text placed between consecutive elements
     * @return the sorted elements separated by the delimiter
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public static String joinSorted(Collection<? extends Comparable> s, String delimiter) {
        List list = new ArrayList(s);
        Collections.sort(list);
        return join(list, delimiter);
    }

    /**
     * Joins the string forms of the elements, in iteration order.
     *
     * @param s
     *            the elements to join; a null element is rendered as "null"
     * @param delimiter
     *            the text placed between consecutive elements
     * @return the joined string, empty when the collection is empty
     */
    public static String join(Collection<?> s, String delimiter) {
        StringBuilder joined = new StringBuilder();
        Iterator<?> iter = s.iterator();
        if (iter.hasNext()) {
            joined.append(iter.next());
        }
        while (iter.hasNext()) {
            joined.append(delimiter);
            joined.append(iter.next());
        }
        return joined.toString();
    }

    /**
     * Joins the string forms of the array elements, in index order.
     *
     * @param objects
     *            the elements to join; a null element is rendered as "null"
     * @param delimiter
     *            the text placed between consecutive elements
     * @return the joined string, empty when the array is empty
     */
    public static String join(Object[] objects, String delimiter) {
        StringBuilder joined = new StringBuilder();
        for (int i = 0; i < objects.length; i++) {
            if (i > 0) {
                joined.append(delimiter);
            }
            joined.append(objects[i]);
        }
        return joined.toString();
    }

    /**
     * Parses the two characters starting at {@code index} as a hexadecimal number.
     *
     * @param value
     *            the string containing the hex digits
     * @param index
     *            position of the first of the two digits
     * @return the parsed value
     * @throws NumberFormatException
     *             if the two characters are not a hexadecimal number
     */
    public static int twoHexDigitsToInt(String value, int index) {
        return Integer.parseInt(value.substring(index, index + 2), 16);
    }

    /**
     * @param string
     *            the text to test
     * @return true if {@link Integer#parseInt(String)} accepts the string
     */
    public static boolean isInteger(String string) {
        try {
            Integer.parseInt(string);
            return true;
        } catch (NumberFormatException e) {
            return false;
        }
    }

    /**
     * @param string
     *            the text to test
     * @return true if {@link Long#parseLong(String)} accepts the string
     */
    public static boolean isLong(String string) {
        try {
            Long.parseLong(string);
            return true;
        } catch (NumberFormatException e) {
            return false;
        }
    }

    /**
     * @param string
     *            the text to test
     * @return true if {@link Double#parseDouble(String)} accepts the string; false for null
     */
    public static boolean isNumber(String string) {
        if (string == null) {
            return false;
        }
        try {
            Double.parseDouble(string);
            return true;
        } catch (NumberFormatException e) {
            return false;
        }
    }

    /**
     * @param string
     *            the text to test
     * @return true if the string is exactly one of the upper-case Roman numerals I through X
     */
    public static boolean isRomanNumeral(String string) {
        for (String numeral : ROMAN_NUMERALS) {
            if (numeral.equals(string)) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param string
     *            the text to test
     * @return true if the string, ignoring case, is the English name of a Greek letter (alpha .. omega)
     */
    public static boolean isGreekLetter(String string) {
        if (string == null) {
            return false;
        }
        // Locale.ROOT keeps the comparison independent of the default locale (e.g. Turkish dotless i).
        String lowerCased = string.toLowerCase(Locale.ROOT);
        for (String letter : GREEK_LETTERS) {
            if (letter.equals(lowerCased)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Adds PSF file specific escape characters to a string: a backslash is inserted before every
     * double quote, question mark, semicolon, backslash and pipe character.
     *
     * @param string
     *            the text to escape
     * @return the escaped text
     */
    public static String escape(String string) {
        StringBuilder escaped = new StringBuilder();
        for (int i = 0; i < string.length(); i++) {
            char current = string.charAt(i);
            switch (current) {
            case '"':
            case '?':
            case ';':
            case '\\':
            case '|':
                escaped.append('\\');
                break;
            default:
                break;
            }
            escaped.append(current);
        }
        return escaped.toString();
    }

    /**
     * Removes escape characters from a string. A string that starts and ends with a double quote has
     * just those two quotes removed; otherwise every unescaped backslash is dropped and the character
     * following it is kept literally.
     *
     * @param string
     *            the text to unescape
     * @return the unescaped text
     */
    public static String unescape(String string) {
        int length = string.length();
        // At least two characters are needed for an opening and a closing quote; a lone '"' is plain text.
        if (length >= 2 && string.charAt(0) == '"' && string.charAt(length - 1) == '"') {
            return string.substring(1, length - 1);
        }
        StringBuilder unescaped = new StringBuilder();
        boolean escaped = false;
        for (int i = 0; i < length; i++) {
            char current = string.charAt(i);
            if (escaped) {
                unescaped.append(current);
                escaped = false;
            } else if (current == '\\') {
                escaped = true;
            } else {
                unescaped.append(current);
            }
        }
        return unescaped.toString();
    }

    /**
     * Splits like a plain split on a single character, but does not split on delimiters that are
     * preceded by an unescaped backslash or that lie between double quotes. Quotes and backslashes
     * are left in the returned parts.
     *
     * @param string
     *            the text to split
     * @param delimiter
     *            the character to split on
     * @return the parts, always at least one element (a single empty string for empty input)
     */
    public static List<String> safeSplit(String string, char delimiter) {
        List<String> parts = new ArrayList<String>();
        boolean insideQuotes = false;
        boolean escaped = false;
        int partStart = 0;
        for (int i = 0; i < string.length(); i++) {
            char current = string.charAt(i);
            if (current == '"' && !escaped) {
                insideQuotes = !insideQuotes;
            }
            if (!insideQuotes && !escaped && current == delimiter) {
                parts.add(string.substring(partStart, i));
                partStart = i + 1;
            }
            // A backslash escapes the next character unless it was itself escaped.
            escaped = (current == '\\') && !escaped;
        }
        parts.add(string.substring(partStart));
        return parts;
    }

    /**
     * @param string
     *            the text to inspect
     * @return true if the string contains at least one ASCII digit
     */
    public static boolean containsNumber(String string) {
        for (int i = 0; i < string.length(); i++) {
            if (isAsciiDigit(string.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param string
     *            the text to inspect
     * @return the number of ASCII digits in the string
     */
    public static int countNumbers(String string) {
        int total = 0;
        for (int i = 0; i < string.length(); i++) {
            if (isAsciiDigit(string.charAt(i))) {
                total++;
            }
        }
        return total;
    }

    /**
     * @param string
     *            the text to inspect
     * @return true if the string contains at least one letter as defined by {@link Character#isLetter(char)}
     */
    public static boolean containsLetter(String string) {
        for (int i = 0; i < string.length(); i++) {
            if (Character.isLetter(string.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param string
     *            the text to inspect
     * @return the number of letters as defined by {@link Character#isLetter(char)}
     */
    public static int countLetters(String string) {
        int total = 0;
        for (int i = 0; i < string.length(); i++) {
            if (Character.isLetter(string.charAt(i))) {
                total++;
            }
        }
        return total;
    }

    /**
     * @param string
     *            the text to inspect
     * @return true if the string contains '{' or '}'
     */
    public static boolean containsCurlyBracket(String string) {
        for (int i = 0; i < string.length(); i++) {
            if (isCurlyBracket(string.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param string
     *            the text to inspect
     * @return true if the string contains '(' or ')'
     */
    public static boolean containsParenthesis(String string) {
        for (int i = 0; i < string.length(); i++) {
            if (isParenthesis(string.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param string
     *            the text to inspect
     * @return true if the string contains '[' or ']'
     */
    public static boolean containsBracket(String string) {
        for (int i = 0; i < string.length(); i++) {
            if (isBracket(string.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param string
     *            the text to inspect
     * @return true if the string contains '&lt;' or '&gt;'
     */
    public static boolean containsArrow(String string) {
        for (int i = 0; i < string.length(); i++) {
            if (isArrow(string.charAt(i))) {
                return true;
            }
        }
        return false;
    }

    /**
     * @param ch
     *            the character to test
     * @return true if the character is '(' or ')'
     */
    public static boolean isParenthesis(char ch) {
        return ch == '(' || ch == ')';
    }

    /**
     * @param ch
     *            the character to test
     * @return true if the character is '[' or ']'
     */
    public static boolean isBracket(char ch) {
        return ch == '[' || ch == ']';
    }

    /**
     * @param ch
     *            the character to test
     * @return true if the character is '&lt;' or '&gt;'
     */
    public static boolean isArrow(char ch) {
        return ch == '<' || ch == '>';
    }

    /**
     * @param ch
     *            the character to test
     * @return true if the character is '{' or '}'
     */
    public static boolean isCurlyBracket(char ch) {
        return ch == '{' || ch == '}';
    }

    /**
     * Converts a string to a list of words. Words are maximal runs of letters and digits; an
     * apostrophe is kept inside a word when it is part of a trailing possessive ("John's").
     *
     * @param string
     *            the text to split into words
     * @return the words in order of appearance, without empty entries
     */
    public static List<String> mapToWords(String string) {
        List<String> words = new ArrayList<String>();
        int length = string.length();
        int wordStart = 0;
        for (int i = 0; i < length; i++) {
            if (!Character.isLetterOrDigit(string.charAt(i)) && !isPossessiveApostrophe(string, i)) {
                if (wordStart != i) {
                    words.add(string.substring(wordStart, i));
                }
                wordStart = i + 1;
            }
        }
        if (wordStart != length) {
            words.add(string.substring(wordStart));
        }
        return words;
    }

    /**
     * @return the current date and time, formatted with the default locale's medium date/time style
     */
    public static String now() {
        DateFormat format = DateFormat.getDateTimeInstance(DateFormat.MEDIUM, DateFormat.MEDIUM);
        return format.format(new Date());
    }

    /**
     * Prints the message to standard output, prefixed with {@link #now()} and a tab.
     *
     * @param message
     *            the text to print
     */
    public static void outputWithTime(String message) {
        System.out.println(now() + "\t" + message);
    }

    /**
     * Checks whether the word looks like an abbreviation: it has at least one ASCII upper-case
     * letter and more ASCII upper-case than ASCII lower-case letters. Other characters are ignored.
     *
     * @param word
     *            the word to test
     * @return true if the word looks like an abbreviation
     */
    public static boolean isAbbr(String word) {
        int upper = countsCharactersInUpperCase(word);
        int lower = countsCharactersInLowerCase(word);
        return upper > 0 && lower < upper;
    }

    /**
     * Lower-cases the string unless it contains an ASCII upper-case letter after the first
     * character, in which case the original string is returned unchanged.
     *
     * @param string
     *            the text to convert
     * @return the lower-cased string, or the original string
     */
    public static String firstLetterToLowerCase(String string) {
        for (int i = 1; i < string.length(); i++) {
            if (isAsciiUpperCase(string.charAt(i))) {
                return string;
            }
        }
        return string.toLowerCase();
    }

    /**
     * @param string
     *            the text to inspect
     * @return the number of ASCII upper-case letters (A-Z)
     */
    public static int countsCharactersInUpperCase(String string) {
        int total = 0;
        for (int i = 0; i < string.length(); i++) {
            if (isAsciiUpperCase(string.charAt(i))) {
                total++;
            }
        }
        return total;
    }

    /**
     * @param string
     *            the text to inspect
     * @return the number of ASCII lower-case letters (a-z)
     */
    public static int countsCharactersInLowerCase(String string) {
        int total = 0;
        for (int i = 0; i < string.length(); i++) {
            char ch = string.charAt(i);
            if (ch >= 'a' && ch <= 'z') {
                total++;
            }
        }
        return total;
    }

    /**
     * Converts a double to a formatted string using a {@link DecimalFormat} pattern. Examples of
     * valid patterns are "###,###.###", "###.##", "000000.000" and "$###,###.###"; '#' is an
     * optional digit and '0' a forced digit.
     *
     * @param pattern
     *            the {@link DecimalFormat} pattern
     * @param number
     *            the value to format
     * @return the formatted number
     */
    public static String formatNumber(String pattern, double number) {
        return new DecimalFormat(pattern).format(number);
    }

    /**
     * @param string
     *            the word to test
     * @return true if the string ends in a letter followed by a lower-case 's'
     */
    public static boolean isPlural(String string) {
        int length = string.length();
        return length > 1 && string.charAt(length - 1) == 's' && Character.isLetter(string.charAt(length - 2));
    }

    /**
     * Returns the text between the first occurrence of {@code pre} and the first occurrence of
     * {@code post} after it.
     *
     * @param source
     *            the text to search
     * @param pre
     *            the marker preceding the wanted text
     * @param post
     *            the marker following the wanted text
     * @return the text in between, or the empty string if either marker is not found
     */
    public static String findBetween(String source, String pre, String post) {
        int start = source.indexOf(pre);
        if (start == -1) {
            return "";
        }
        int contentStart = start + pre.length();
        int end = source.indexOf(post, contentStart);
        return end == -1 ? "" : source.substring(contentStart, end);
    }

    /**
     * Returns every piece of text enclosed by {@code pre} and {@code post}, scanning left to right.
     * After a match the search for the next {@code pre} resumes at the position of the matched
     * {@code post}, so a marker can close one match and open the next.
     *
     * @param source
     *            the text to search
     * @param pre
     *            the marker preceding each wanted piece
     * @param post
     *            the marker following each wanted piece
     * @return the enclosed pieces in order of appearance
     */
    public static List<String> multiFindBetween(String source, String pre, String post) {
        List<String> found = new ArrayList<String>();
        int from = 0;
        while (from <= source.length()) {
            int start = source.indexOf(pre, from);
            if (start == -1) {
                break;
            }
            int contentStart = start + pre.length();
            int end = source.indexOf(post, contentStart);
            if (end == -1) {
                break;
            }
            found.add(source.substring(contentStart, end));
            // An empty 'pre' would match at 'end' again and never advance; step past the closing marker instead.
            from = pre.length() == 0 ? end + Math.max(1, post.length()) : end;
        }
        return found;
    }

    /**
     * Returns true if every parenthesis in the string is matched.
     *
     * @param string
     *            the text to check
     * @return true if no ')' appears before its '(' and all '(' are closed
     */
    public static boolean parenthesisMatch(String string) {
        int depth = 0;
        for (int i = 0; i < string.length(); i++) {
            char ch = string.charAt(i);
            if (ch == '(') {
                depth++;
            } else if (ch == ')') {
                depth--;
                if (depth < 0) {
                    return false;
                }
            }
        }
        return depth == 0;
    }

    /**
     * @param s
     *            the text to inspect
     * @param ch
     *            the character to count
     * @return the number of occurrences of the character in the string
     */
    public static int count(String s, char ch) {
        int occurrences = 0;
        for (int i = 0; i < s.length(); i++) {
            if (s.charAt(i) == ch) {
                occurrences++;
            }
        }
        return occurrences;
    }

    /**
     * Removes parentheses and what is within them from the string. For example: 'cold (disease)'
     * becomes 'cold '. A closing parenthesis without a matching opening one is removed and otherwise
     * ignored.
     *
     * @param string
     *            the text to clean
     * @return the text outside of any parentheses
     */
    public static String removeParenthesisAndContent(String string) {
        StringBuilder outside = new StringBuilder();
        int depth = 0;
        for (int i = 0; i < string.length(); i++) {
            char ch = string.charAt(i);
            if (ch == '(') {
                depth++;
            } else if (ch == ')') {
                // Never go below zero: a stray ')' must not hide the text that follows it.
                if (depth > 0) {
                    depth--;
                }
            } else if (depth == 0) {
                outside.append(ch);
            }
        }
        return outside.toString();
    }

    /**
     * @param days
     *            days since 1970-01-01
     * @return the date as yyyyMMdd
     */
    public static String daysToSortableDateString(long days) {
        Calendar date = utcCalendarAtDays(days);
        StringBuilder sb = new StringBuilder();
        sb.append(date.get(Calendar.YEAR));
        appendTwoDigits(sb, date.get(Calendar.MONTH) + 1);
        appendTwoDigits(sb, date.get(Calendar.DATE));
        return sb.toString();
    }

    /**
     * @param days
     *            days since 1970-01-01, or {@link #MISSING_DATE}
     * @return the date as yyyy-MM-dd, or the empty string for {@link #MISSING_DATE}
     */
    public static String daysToDatabaseDateString(long days) {
        if (days == MISSING_DATE) {
            return "";
        }
        Calendar date = utcCalendarAtDays(days);
        StringBuilder sb = new StringBuilder();
        sb.append(date.get(Calendar.YEAR));
        sb.append("-");
        appendTwoDigits(sb, date.get(Calendar.MONTH) + 1);
        sb.append("-");
        appendTwoDigits(sb, date.get(Calendar.DATE));
        return sb.toString();
    }

    /**
     * @param days
     *            days since 1970-01-01
     * @return the calendar year of that day
     */
    public static String daysToCalendarYear(long days) {
        return Integer.toString(utcCalendarAtDays(days).get(Calendar.YEAR));
    }

    /**
     * @param date
     *            the moment in time
     * @return the calendar year of the date in the default time zone
     */
    public static String dateToCalendarYear(Date date) {
        LOCAL_CALENDAR.setTime(date);
        return Integer.toString(LOCAL_CALENDAR.get(Calendar.YEAR));
    }

    /**
     * @param days
     *            days since 1970-01-01
     * @return the month of that day, 1 (January) to 12 (December)
     */
    public static String daysToCalendarMonth(long days) {
        return Integer.toString(utcCalendarAtDays(days).get(Calendar.MONTH) + 1);
    }

    /**
     * @param days
     *            days since 1970-01-01
     * @return the day of the month of that day, starting at 1
     */
    public static String daysToCalendarDayOfMonth(long days) {
        // Calendar.DATE is already 1-based (unlike Calendar.MONTH), so no correction is needed.
        return Integer.toString(utcCalendarAtDays(days).get(Calendar.DATE));
    }

    /**
     * @param days
     *            days since 1970-01-01
     * @return the quarter of the year of that day, 1 to 4
     */
    public static String daysToCalendarQuarterYear(long days) {
        return Integer.toString(1 + (utcCalendarAtDays(days).get(Calendar.MONTH) / 3));
    }

    /**
     * @param ms
     *            milliseconds since 1970-01-01 00:00:00
     * @return the moment as yyyyMMddHHmmss with a 24-hour clock
     */
    public static String millisecondsToSortableTimeString(long ms) {
        UTC_CALENDAR.setTimeInMillis(ms);
        StringBuilder sb = new StringBuilder();
        sb.append(UTC_CALENDAR.get(Calendar.YEAR));
        appendTwoDigits(sb, UTC_CALENDAR.get(Calendar.MONTH) + 1);
        appendTwoDigits(sb, UTC_CALENDAR.get(Calendar.DATE));
        // HOUR_OF_DAY (0-23) rather than HOUR (0-11) so afternoon times stay sortable.
        appendTwoDigits(sb, UTC_CALENDAR.get(Calendar.HOUR_OF_DAY));
        appendTwoDigits(sb, UTC_CALENDAR.get(Calendar.MINUTE));
        appendTwoDigits(sb, UTC_CALENDAR.get(Calendar.SECOND));
        return sb.toString();
    }

    /**
     * Parses a sortable time string and returns the day it falls on.
     *
     * @param string
     *            yyyyMMdd, optionally followed by HH, HHmm or HHmmss
     * @return days since 1970-01-01 (negative for earlier dates)
     * @throws DataFormatException
     *             if the string cannot be parsed
     */
    public static long sortableTimeStringToDays(String string) throws DataFormatException {
        return floorDivide(sortableTimeStringToMS(string), DAY);
    }

    /**
     * Parses the first ten characters of a database date string.
     *
     * @param string
     *            yyyy-MM-dd (any single character is accepted as separator), or the empty string
     * @return days since 1970-01-01, or {@link #MISSING_DATE} for the empty string
     * @throws NumberFormatException
     *             if a date field is not a number
     * @throws StringIndexOutOfBoundsException
     *             if the string is shorter than ten characters
     */
    public static long databaseTimeStringToDays(String string) {
        if (string.equals("")) {
            return MISSING_DATE;
        }
        int year = Integer.parseInt(string.substring(0, 4));
        int month = Integer.parseInt(string.substring(5, 7)) - 1;
        int day = Integer.parseInt(string.substring(8, 10));
        return floorDivide(utcMidnightMillis(year, month, day), DAY);
    }

    /**
     * Parses a sortable time string.
     *
     * @param string
     *            yyyyMMdd, optionally followed by HH, HHmm or HHmmss
     * @return milliseconds since 1970-01-01 00:00:00
     * @throws DataFormatException
     *             if the string cannot be parsed
     */
    public static long sortableTimeStringToMS(String string) throws DataFormatException {
        try {
            int year = Integer.parseInt(string.substring(0, 4));
            int month = Integer.parseInt(string.substring(4, 6)) - 1;
            int day = Integer.parseInt(string.substring(6, 8));
            long time = utcMidnightMillis(year, month, day);
            if (string.length() > 8) {
                time += Integer.parseInt(string.substring(8, 10)) * HOUR;
                if (string.length() > 10) {
                    time += Integer.parseInt(string.substring(10, 12)) * MINUTE;
                    if (string.length() > 12) {
                        time += Integer.parseInt(string.substring(12, 14)) * SECOND;
                    }
                }
            }
            return time;
        } catch (RuntimeException e) {
            DataFormatException failure = new DataFormatException("Error parsing date: \"" + string + "\"");
            failure.initCause(e);
            throw failure;
        }
    }

    /**
     * Replaces a fixed set of accented Latin-1 characters by their unaccented ASCII counterpart.
     * Characters outside that set are left untouched.
     *
     * @param string
     *            the text to convert
     * @return the text with the known accented characters replaced
     */
    public static String replaceInternationalChars(String string) {
        char[] chars = string.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            switch (chars[i]) {
            case 216:
                chars[i] = 'O';
                break;
            case 242:
            case 243:
            case 244:
            case 245:
            case 246:
            case 248:
                chars[i] = 'o';
                break;
            case 237:
            case 238:
            case 239:
                chars[i] = 'i';
                break;
            case 232:
            case 233:
            case 234:
            case 235:
                chars[i] = 'e';
                break;
            case 231:
                chars[i] = 'c';
                break;
            case 224:
            case 225:
            case 226:
            case 227:
            case 228:
            case 229:
                chars[i] = 'a';
                break;
            case 250:
            case 252:
                chars[i] = 'u';
                break;
            case 253:
                chars[i] = 'y';
                break;
            case 241:
                chars[i] = 'n';
                break;
            default:
                break;
            }
        }
        return new String(chars);
    }

    /**
     * @param value
     *            the string to look for
     * @param list
     *            the strings to search
     * @return the index of the first element equal to the value when both are lower-cased, or -1
     */
    public static int caseInsensitiveIndexOf(String value, List<String> list) {
        String wanted = value.toLowerCase();
        for (int i = 0; i < list.size(); i++) {
            if (list.get(i).toLowerCase().equals(wanted)) {
                return i;
            }
        }
        return -1;
    }

    /**
     * Computes the Levenshtein edit distance: the minimum number of single-character insertions,
     * deletions and substitutions needed to turn one string into the other.
     *
     * @param s
     *            the first string
     * @param t
     *            the second string
     * @return the edit distance
     */
    public static int levenshteinDistance(String s, String t) {
        int n = s.length();
        int m = t.length();
        if (n == 0) {
            return m;
        }
        if (m == 0) {
            return n;
        }
        // distance[i][j] is the distance between the first i characters of s and the first j of t.
        int[][] distance = new int[n + 1][m + 1];
        for (int i = 0; i <= n; i++) {
            distance[i][0] = i;
        }
        for (int j = 0; j <= m; j++) {
            distance[0][j] = j;
        }
        for (int i = 1; i <= n; i++) {
            char sChar = s.charAt(i - 1);
            for (int j = 1; j <= m; j++) {
                int cost = (sChar == t.charAt(j - 1)) ? 0 : 1;
                int deletion = distance[i - 1][j] + 1;
                int insertion = distance[i][j - 1] + 1;
                int substitution = distance[i - 1][j - 1] + cost;
                distance[i][j] = Math.min(deletion, Math.min(insertion, substitution));
            }
        }
        return distance[n][m];
    }

    /**
     * Get the upper-case hex string of the 16-byte MD5 hash for an input string.
     *
     * @param str
     *            the text to hash
     * @return 32 hex characters, or null if the MD5 algorithm is not available
     */
    public static String getMD5Digest(String str) {
        return hexDigest("MD5", str);
    }

    /**
     * Get the upper-case hex string of the SHA-256 hash for an input string.
     *
     * @param str
     *            the text to hash
     * @return 64 hex characters, or null if the SHA-256 algorithm is not available
     */
    public static String getSHA256Digest(String str) {
        return hexDigest("SHA-256", str);
    }

    /**
     * Breaks the text into lines. The text is trimmed; when it is at least {@code lineLength} long,
     * it is broken at the last space, tab or hyphen found at or before index {@code lineLength} and
     * the remainder is wrapped recursively. A line break already present within the first
     * {@code lineLength} characters is emitted as two line breaks. If no break position is found the
     * remaining text is returned unbroken.
     *
     * @param text
     *            the text to wrap
     * @param lineLength
     *            the target line length
     * @return the wrapped text
     */
    public static String wordWrap(String text, int lineLength) {
        text = text.trim();
        if (text.length() < lineLength) {
            return text;
        }
        if (text.substring(0, lineLength).contains("\n")) {
            int newline = text.indexOf("\n");
            return text.substring(0, newline).trim() + "\n\n" + wordWrap(text.substring(newline + 1), lineLength);
        }
        int lastSpace = text.lastIndexOf(" ", lineLength);
        int lastTab = text.lastIndexOf("\t", lineLength);
        int lastHyphen = text.lastIndexOf("-", lineLength);
        int place = Math.max(Math.max(lastSpace, lastTab), lastHyphen);
        if (place > 0) {
            return text.substring(0, place).trim() + "\n" + wordWrap(text.substring(place), lineLength);
        }
        return text;
    }

    /**
     * Checks whether the string looks like a date. Accepted are ten-character strings of the form
     * yyyy-MM-dd or yyyy/MM/dd with a year from 1700 to 2200, and eight-character strings of the form
     * MM-dd-yy or MM/dd/yy. Both separators must be the same character, the month must be 1-12 and
     * the day 1-31; the day is not checked against the length of the month.
     *
     * @param string
     *            the text to test
     * @return true if the string matches one of the accepted date forms; false for null
     */
    public static boolean isDate(String string) {
        if (string == null) {
            return false;
        }
        try {
            if (string.length() == 10) {
                if (!hasMatchingDateSeparators(string, 4, 7)) {
                    return false;
                }
                int year = parseDigits(string, 0, 4);
                if (year < 1700 || year > 2200) {
                    return false;
                }
                return isMonth(parseDigits(string, 5, 7)) && isDayOfMonth(parseDigits(string, 8, 10));
            }
            if (string.length() == 8) {
                if (!hasMatchingDateSeparators(string, 2, 5)) {
                    return false;
                }
                parseDigits(string, 6, 8); // the two-digit year only has to be numeric
                return isMonth(parseDigits(string, 0, 2)) && isDayOfMonth(parseDigits(string, 3, 5));
            }
        } catch (NumberFormatException e) {
            return false;
        }
        return false;
    }

    /** Returns true for the ASCII digits 0-9. */
    private static boolean isAsciiDigit(char ch) {
        return ch >= '0' && ch <= '9';
    }

    /** Returns true for the ASCII upper-case letters A-Z. */
    private static boolean isAsciiUpperCase(char ch) {
        return ch >= 'A' && ch <= 'Z';
    }

    /** Returns true if the character at {@code i} is the apostrophe of a word-final "'s" that follows a letter. */
    private static boolean isPossessiveApostrophe(String string, int i) {
        if (string.charAt(i) != '\'' || i == 0 || !Character.isLetter(string.charAt(i - 1))) {
            return false;
        }
        int last = string.length() - 1;
        if (i >= last || string.charAt(i + 1) != 's') {
            return false;
        }
        return i + 1 == last || !Character.isLetterOrDigit(string.charAt(i + 2));
    }

    /** Points the shared UTC calendar at midnight of the given day number and returns it. */
    private static Calendar utcCalendarAtDays(long days) {
        UTC_CALENDAR.setTimeInMillis(days * DAY);
        return UTC_CALENDAR;
    }

    /** Returns the milliseconds since the epoch of midnight UTC on the given date (month is 0-based). */
    private static long utcMidnightMillis(int year, int month, int day) {
        // clear() resets the time-of-day fields that set(year, month, day) would otherwise leave behind.
        UTC_CALENDAR.clear();
        UTC_CALENDAR.set(year, month, day);
        return UTC_CALENDAR.getTimeInMillis();
    }

    /** Divides rounding towards negative infinity, so moments before 1970 map to the correct (negative) day. */
    private static long floorDivide(long dividend, long divisor) {
        long quotient = dividend / divisor;
        if (dividend % divisor != 0 && ((dividend < 0) != (divisor < 0))) {
            quotient--;
        }
        return quotient;
    }

    /** Appends the value with a leading zero when it is a single digit, independent of the default locale. */
    private static void appendTwoDigits(StringBuilder sb, int value) {
        if (value >= 0 && value < 10) {
            sb.append('0');
        }
        sb.append(value);
    }

    /** Hashes the string with the named algorithm and returns upper-case hex, or null if the algorithm is missing. */
    private static String hexDigest(String algorithm, String str) {
        try {
            // NOTE(review): getBytes() uses the platform default charset, so hashes of non-ASCII text can
            // differ between machines. Kept as-is so previously produced hashes stay reproducible.
            byte[] input = str.getBytes();
            MessageDigest digest = MessageDigest.getInstance(algorithm);
            return toUpperHex(digest.digest(input));
        } catch (NoSuchAlgorithmException e) {
            System.err.println("Exception caught: " + e);
            e.printStackTrace();
            return null;
        }
    }

    /** Renders every byte as two upper-case hexadecimal digits. */
    private static String toUpperHex(byte[] bytes) {
        char[] hex = new char[bytes.length * 2];
        for (int i = 0; i < bytes.length; i++) {
            int value = bytes[i] & 0xff;
            hex[2 * i] = HEX_DIGITS[value >>> 4];
            hex[2 * i + 1] = HEX_DIGITS[value & 0x0f];
        }
        return new String(hex);
    }

    /** Returns true if both positions hold the same separator and that separator is '-' or '/'. */
    private static boolean hasMatchingDateSeparators(String string, int first, int second) {
        char separator = string.charAt(first);
        return (separator == '-' || separator == '/') && separator == string.charAt(second);
    }

    /** Parses the substring as an unsigned number; throws NumberFormatException if it holds anything but digits. */
    private static int parseDigits(String string, int begin, int end) {
        for (int i = begin; i < end; i++) {
            if (!Character.isDigit(string.charAt(i))) {
                throw new NumberFormatException("Not a digit at index " + i + ": \"" + string + "\"");
            }
        }
        return Integer.parseInt(string.substring(begin, end));
    }

    /** Returns true for 1-12. */
    private static boolean isMonth(int month) {
        return month >= 1 && month <= 12;
    }

    /** Returns true for 1-31. */
    private static boolean isDayOfMonth(int day) {
        return day >= 1 && day <= 31;
    }
}