package water.util; import com.google.common.base.Charsets; import water.parser.BufferedString; import java.io.PrintWriter; import java.io.StringWriter; import java.nio.charset.Charset; import java.util.*; /** * String manipulation utilities. */ public class StringUtils { private static Map<Character, Integer> hexCode = CollectionUtils.createMap( toCharacterArray("0123456789abcdefABCDEF"), new Integer[]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 10, 11, 12, 13, 14, 15} ); /** * Print exception stack trace into a string. * * @param t an exception * @return string containing pretty printed exception */ public static String toString(Throwable t) { StringWriter sw = new StringWriter(); PrintWriter pw = new PrintWriter(sw); t.printStackTrace(pw); return sw.toString(); } /** * Convenience function to test whether a string is empty. * @param s String to test * @return True if the string is either null or empty, false otherwise */ public static boolean isNullOrEmpty(String s) { return s == null || s.isEmpty(); } public static boolean isNullOrEmpty(BufferedString s) { return s == null || s.length() == 0; } /** * Expand ~ to user.home * @param path that can (but doesn't have to) contain a tilde (~) * @return expanded path */ public static String expandPath(String path) { return path.replaceFirst("^~", System.getProperty("user.home")); } public static String cleanString(String s) { //Tokenization/string cleaning for all datasets except for SST. // Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py String string = s; string = string.replaceAll("[^A-Za-z0-9(),!?\\'\\`]", " "); string = string.replaceAll("'s", " 's"); string = string.replaceAll("'ve", " 've"); string = string.replaceAll("n't", " n't"); string = string.replaceAll("'re", " 're"); string = string.replaceAll("'d", " 'd"); string = string.replaceAll("'ll", " 'll"); string = string.replaceAll(",", " , "); string = string.replaceAll("!", " ! "); string = string.replaceAll("\\(", " ( "); string = string.replaceAll("\\)", " ) "); string = string.replaceAll("\\?", " ? "); string = string.replaceAll("\\s{2,}", " "); return string.trim().toLowerCase(); } public static String[] tokenize(String text) { // System.out.println(cleanString(text)); return cleanString(text).split(" "); } public static int[] tokensToArray(String[] tokens, int padToLength, Map<String, Integer> dict) { assert(dict!=null); int len = tokens.length; int pad = padToLength - len; int[] data = new int[padToLength]; int ix = 0; for (String t : tokens) { Integer val = dict.get(t); int index; if (val == null) { index = dict.size(); dict.put(t, index); } else { index = val; } data[ix] = index; ix += 1; } for (int i = 0; i < pad; i++) { int index = dict.get(PADDING_SYMBOL); data[ix] = index; ix += 1; } return data; } public static String PADDING_SYMBOL = "</s>"; public static ArrayList<int[]> texts2array(List<String> texts) { int maxlen = 0; int index = 0; Map<String, Integer> dict = new HashMap<>(); dict.put(PADDING_SYMBOL, index); index += 1; for (String text : texts) { String[] tokens = tokenize(text); for (String token : tokens) { if (!dict.containsKey(token)) { dict.put(token, index); index += 1; } } int len = tokens.length; if (len > maxlen) maxlen = len; } // System.out.println(dict); // System.out.println("maxlen " + maxlen); // System.out.println("dict size " + dict.size()); ArrayList<int[]> array = new ArrayList<>(); for (String text: texts) { int[] data = tokensToArray(tokenize(text), maxlen, dict); // System.out.println(text); // System.out.println(Arrays.toString(data)); array.add(data); } return array; } /** * Join the array with the given delimiter, and return it as a string. * * @param delimiter string to be used as a separator between array elements * @param arr the array to join * @return a single string containing all elements in `arr` joined together */ public static String join(String delimiter, String[] arr) { return join(delimiter, Arrays.asList(arr)); } /** * Join the array with the given delimiter, and return it as a string. * * @param delimiter string to be used as a separator between array elements * @param strings the strings to join * @return a single string containing all elements in `strings` joined together */ public static String join(String delimiter, Iterable<String> strings) { StringBuilder sb = new StringBuilder(); for (String item : strings) { if (sb.length() > 0) sb.append(delimiter); sb.append(item); } return sb.toString(); } /** * Convert a string into the set of its characters. * * @param src Source string * @return Set of characters within the source string */ public static HashSet<Character> toCharacterSet(String src) { int n = src.length(); HashSet<Character> res = new HashSet<>(n); for (int i = 0; i < n; i++) res.add(src.charAt(i)); return res; } public static Character[] toCharacterArray(String src) { return ArrayUtils.box(src.toCharArray()); } public static int unhex(String str) { int res = 0; for (char c : str.toCharArray()) { if (!hexCode.containsKey(c)) throw new NumberFormatException("Not a hexademical character " + c); res = (res << 4) + hexCode.get(c); } return res; } public static byte[] bytesOf(CharSequence str) { return str.toString().getBytes(Charsets.UTF_8); } public static byte[] toBytes(Object value) { return bytesOf(String.valueOf(value)); } public static String toString(byte[] bytes, int from, int length) { return new String(bytes, from, length, Charsets.UTF_8); } }