package sizzle.functions; import java.util.ArrayList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * String Manipulation * * @author anthonyu * */ public class SizzleStringIntrinsics { /** * Returns a copy of the given {@link String} with all characters lowered. * * @param s * A {@link String} that wants lowercasing * * @return A copy of <i>s</i> with all characters converted to lower case, * as defined by Unicode. * */ @FunctionSpec(name = "lowercase", returnType = "string", formalParameters = { "string" }) public static String lowerCase(final String s) { return s.toLowerCase(); } /** * Returns a copy of the given {@link String} with all characters uppered. * * @param s * A {@link String} that wants uppercasing * * @return A copy of <i>s</i> with all characters converted to upper case, * as defined by Unicode. * */ @FunctionSpec(name = "uppercase", returnType = "string", formalParameters = { "string" }) public static String upperCase(final String s) { return s.toUpperCase(); } /** * Search for the first occurrence of the literal string p within s and * return the integer index of its first character, or -1 if it does not * occur. * * @param p * A {@link String} containing the needle * * @param s * A {@link String} containing the haystack * * @return A long representing the first occurrence of the literal string * <em>p</em> within <em>s</em> and return the integer index of its * first character, or -1 if it does not occur */ @FunctionSpec(name = "strfind", returnType = "int", formalParameters = { "string", "string" }) public static long indexOf(final String p, final String s) { return s.indexOf(p, 0); } /** * Search for the last occurrence of the literal string p within s and * return the integer index of its first character, or -1 if it does not * occur. * * @param p * A {@link String} containing the needle * * @param s * A {@link String} containing the haystack * * @return A long representing the last occurrence of the literal string * <em>p</em> within <em>s</em> and return the integer index of its * first character, or -1 if it does not occur */ @FunctionSpec(name = "strrfind", returnType = "int", formalParameters = { "string", "string" }) public static long lastIndexOf(final String p, final String s) { return s.lastIndexOf(p, 0); } /** * Search for the first occurrence of the literal bytes p within b and * return the integer index of its first byte, or -1 if it does not occur. * * @param p * An array of byte containing the needle * * @param s * A array of byte containing the haystack * * @return A long representing the first occurrence of the literal bytes * <em>p</em> within <em>s</em> and return the integer index of its * first byte, or -1 if it does not occur */ @FunctionSpec(name = "bytesfind", returnType = "string", formalParameters = { "string", "string" }) public static long indexOf(final byte[] p, final byte[] s) { for (int i = 0; i < s.length; i++) for (int j = 0; j < p.length; j++) if (s[i] != p[j]) break; else if (j == p.length - 1) return i; return -1; } /** * Search for the last occurrence of the literal bytes p within b and return * the integer index of its first byte, or -1 if it does not occur. * * @param p * An array of byte containing the needle * * @param s * A array of byte containing the haystack * * @return A long representing the last occurrence of the literal bytes * <em>p</em> within <em>s</em> and return the integer index of its * first byte, or -1 if it does not occur */ @FunctionSpec(name = "bytesrfind", returnType = "string", formalParameters = { "string", "string" }) public static long lastIndexOf(final byte[] p, final byte[] s) { for (int i = s.length - p.length; i >= 0; i--) for (int j = 0; j < p.length; j++) if (s[i] != p[j]) break; else if (j == p.length - 1) return i; return -1; } /** * Return a copy of string <em>str</em>, with non-overlapping instances of * <em>lit</em> replaced by <em>rep</em>. If <em>replace_all</em> is false, * only the first found instance is replaced. * * @param str * A {@link String} containing the source string * * @param lit * A {@link String} containing the substring to be replaced * * @param rep * A {@link String} containing the replacement string * * @param replaceAll * A boolean representing whether to replace every instance of * <em>lit</em> with <em>rep</em> * * @return A copy of {@link String} <em>str</em>, with non-overlapping * instances of <em>lit</em> replaced by <em>rep</em> */ @FunctionSpec(name = "strreplace", returnType = "string", formalParameters = { "string", "string", "string", "bool" }) public static String stringReplace(final String str, final String lit, final String rep, final boolean replaceAll) { if (replaceAll) return str.replace(lit, rep); else return str.replaceFirst(Pattern.quote(lit), rep); } /** * Search for a match of the regular expression <em>r</em> within <em>s</em> * , and return a boolean value indicating whether a match was found. (The * regular expression syntax is that of PCRE. <http://www.pcre.org/>) * * @param r * A {@link String} containing a regular expression * * @param s * A {@link String} containing the text to be searched * * @return A boolean representing whether the regular expression <em>r</em> * was found within <em>s</em> */ @FunctionSpec(name = "match", returnType = "bool", formalParameters = { "string", "string" }) public static boolean match(final String r, final String s) { final Matcher m = Pattern.compile(r).matcher(s); return m.find(); } /** * Search for a match of the regular expression <em>r</em> within <em>s</em> * , and return an array consisting of character positions within <em>s</em> * defined by the match. Positions 0 and 1 of the array report the location * of the match of the entire expression, subsequent pairs report the * location of matches of successive parenthesized subexpressions. * * @param r * A {@link String} containing a regular expression * * @param s * A {@link String} containing the text to be searched * * @return An array of long consisting of character positions within * <em>s</em> defined by the match */ @FunctionSpec(name = "matchposns", returnType = "array of int", formalParameters = { "string", "string" }) public static long[] matchPositions(final String r, final String s) { final Matcher m = Pattern.compile(r).matcher(s); if (!m.find()) return new long[0]; final int n = m.groupCount(); final long[] matches = new long[(n + 1) * 2]; for (int i = 0; i <= n; i++) { matches[i * 2] = m.start(i); matches[i * 2 + 1] = m.end(i); } return matches; } /** * Search for a match of the regular expression <em>r</em> within <em>s</em> * , and return . The 0th string is the entire match; following elements of * the array hold matches of successive parenthesized subexpressions. This * function is equivalent to using matchposns to find successive locations * of matches and created array slices of <em>s</em> with the indices * returned. * * * @param r * A {@link String} containing a regular expression * * @param s * A {@link String} containing the text to be searched * * @return an array of {@link String} consisting of matched substrings of * <em>s</em> */ @FunctionSpec(name = "matchstrs", returnType = "array of string", formalParameters = { "string", "string" }) public static String[] matchStrings(final String r, final String s) { final Matcher m = Pattern.compile(r).matcher(s); if (!m.find()) return new String[0]; final int n = m.groupCount(); final String[] matches = new String[(n + 1)]; for (int i = 0; i <= n; i++) matches[i] = m.group(i); return matches; } private static List<String> splitCsv(final String s) { final List<String> split = new ArrayList<String>(); boolean inQuote = false; StringBuilder sb = new StringBuilder(); for (final char c : s.trim().toCharArray()) switch (c) { case ',': if (!inQuote) { split.add(sb.toString()); sb = new StringBuilder(); } else { sb.append(c); } break; case '"': if (!inQuote) { inQuote = true; } else { inQuote = false; } break; default: sb.append(c); } split.add(sb.toString()); return split; } /** * The function splitcsvline takes a line of UTF-8 bytes and splits it at * commas, ignoring leading and trailing white space and using '"' for * quoting. It returns the array of fields produced. * * @param string * The {@link String} to be split * * @return An array of byte[] containing the splits * */ @FunctionSpec(name = "splitcsvline", returnType = "array of bytes", formalParameters = { "bytes" }) public static byte[][] splitCsvLine(final byte[] csv) { final List<String> split = SizzleStringIntrinsics.splitCsv(new String(csv)); final byte[][] bytes = new byte[split.size()][]; for (int i = 0; i < split.size(); i++) bytes[i] = split.get(i).getBytes(); return bytes; } /** * The function splitcsv takes an array of UTF-8 bytes containing lines of * text, such as that produced by the load() builtin. It splits each line * using the same method as splitcsvline, and then selects the fields * indicated by the second argument (numbered starting at 1). The return * value is a flat array of the collected fields. * * @param csv An arry of byte containing the input data * @param fields An array of long specified the fields to be returned * * @return An array of byte[] containing the collected fields */ @FunctionSpec(name = "splitcsv", returnType = "array of bytes", formalParameters = { "bytes", "array of int" }) public static byte[][] splitCsv(final byte[] csv, final long[] fields) { final List<List<String>> strings = new ArrayList<List<String>>(); for (final String line : new String(csv).split("\n")) { final List<String> values = SizzleStringIntrinsics.splitCsv(line); final List<String> b = new ArrayList<String>(); for (final long field : fields) b.add(values.get((int) field - 1)); strings.add(b); } final byte[][] output = new byte[strings.size() * fields.length][]; for (int i = 0; i < strings.size(); i++) for (int j = 0; j < fields.length; j++) output[i * 2 + j] = strings.get(i).get(j).getBytes(); return output; } /** * Return a string containing the arguments formatted according to the * format string fmt. The syntax of the format string is essentially that of * ANSI C with the following differences: * * <ul> * <li>%b prints a boolean, "true" or "false". * <li>%c prints a (u)int as a Unicode character in UTF-8. * <li>%k like %c with single quotes and backslash escapes for special * characters. * <li>%s prints a Sawzall string as UTF-8. * <li>%q like %s with double quotes and backslash escapes for special * characters. * <li>%p prints a fingerprint, in the format 0x%.16x. * <li>%t prints a time, in the format of the Unix function ctime without a * newline. * <li>%T prints a Sawzall type of the argument; %#T expands user-defined * types. * <li>%d / %i / %o / %u / %x / %X apply to a Sawzall (u)int and have no 'l' * or 'h' modifiers. * <li>%e / %f / %g / %E / %G apply to a Sawzall float and have no 'l' or * 'h' modifiers. * </ul> * format verbs 'n' and '*' are not supported. * * @param format * A * @param args * * @return A string containing the arguments formatted according to the * format string <em>fmt</em> */ @FunctionSpec(name = "format", returnType = "string", formalParameters = { "string", "string..." }) public static String format(final String format, final Object... args) { // TODO: support the Sawzall differences listed in the javadoc above return String.format(format, args); } }