package sizzle.functions;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* String Manipulation
*
* @author anthonyu
*
*/
public class SizzleStringIntrinsics {
/**
* Returns a copy of the given {@link String} with all characters lowered.
*
* @param s
* A {@link String} that wants lowercasing
*
* @return A copy of <i>s</i> with all characters converted to lower case,
* as defined by Unicode.
*
*/
@FunctionSpec(name = "lowercase", returnType = "string", formalParameters = { "string" })
public static String lowerCase(final String s) {
return s.toLowerCase();
}
/**
* Returns a copy of the given {@link String} with all characters uppered.
*
* @param s
* A {@link String} that wants uppercasing
*
* @return A copy of <i>s</i> with all characters converted to upper case,
* as defined by Unicode.
*
*/
@FunctionSpec(name = "uppercase", returnType = "string", formalParameters = { "string" })
public static String upperCase(final String s) {
return s.toUpperCase();
}
/**
* Search for the first occurrence of the literal string p within s and
* return the integer index of its first character, or -1 if it does not
* occur.
*
* @param p
* A {@link String} containing the needle
*
* @param s
* A {@link String} containing the haystack
*
* @return A long representing the first occurrence of the literal string
* <em>p</em> within <em>s</em> and return the integer index of its
* first character, or -1 if it does not occur
*/
@FunctionSpec(name = "strfind", returnType = "int", formalParameters = { "string", "string" })
public static long indexOf(final String p, final String s) {
return s.indexOf(p, 0);
}
/**
* Search for the last occurrence of the literal string p within s and
* return the integer index of its first character, or -1 if it does not
* occur.
*
* @param p
* A {@link String} containing the needle
*
* @param s
* A {@link String} containing the haystack
*
* @return A long representing the last occurrence of the literal string
* <em>p</em> within <em>s</em> and return the integer index of its
* first character, or -1 if it does not occur
*/
@FunctionSpec(name = "strrfind", returnType = "int", formalParameters = { "string", "string" })
public static long lastIndexOf(final String p, final String s) {
return s.lastIndexOf(p, 0);
}
/**
* Search for the first occurrence of the literal bytes p within b and
* return the integer index of its first byte, or -1 if it does not occur.
*
* @param p
* An array of byte containing the needle
*
* @param s
* A array of byte containing the haystack
*
* @return A long representing the first occurrence of the literal bytes
* <em>p</em> within <em>s</em> and return the integer index of its
* first byte, or -1 if it does not occur
*/
@FunctionSpec(name = "bytesfind", returnType = "string", formalParameters = { "string", "string" })
public static long indexOf(final byte[] p, final byte[] s) {
for (int i = 0; i < s.length; i++)
for (int j = 0; j < p.length; j++)
if (s[i] != p[j])
break;
else if (j == p.length - 1)
return i;
return -1;
}
/**
* Search for the last occurrence of the literal bytes p within b and return
* the integer index of its first byte, or -1 if it does not occur.
*
* @param p
* An array of byte containing the needle
*
* @param s
* A array of byte containing the haystack
*
* @return A long representing the last occurrence of the literal bytes
* <em>p</em> within <em>s</em> and return the integer index of its
* first byte, or -1 if it does not occur
*/
@FunctionSpec(name = "bytesrfind", returnType = "string", formalParameters = { "string", "string" })
public static long lastIndexOf(final byte[] p, final byte[] s) {
for (int i = s.length - p.length; i >= 0; i--)
for (int j = 0; j < p.length; j++)
if (s[i] != p[j])
break;
else if (j == p.length - 1)
return i;
return -1;
}
/**
* Return a copy of string <em>str</em>, with non-overlapping instances of
* <em>lit</em> replaced by <em>rep</em>. If <em>replace_all</em> is false,
* only the first found instance is replaced.
*
* @param str
* A {@link String} containing the source string
*
* @param lit
* A {@link String} containing the substring to be replaced
*
* @param rep
* A {@link String} containing the replacement string
*
* @param replaceAll
* A boolean representing whether to replace every instance of
* <em>lit</em> with <em>rep</em>
*
* @return A copy of {@link String} <em>str</em>, with non-overlapping
* instances of <em>lit</em> replaced by <em>rep</em>
*/
@FunctionSpec(name = "strreplace", returnType = "string", formalParameters = { "string", "string", "string", "bool" })
public static String stringReplace(final String str, final String lit, final String rep, final boolean replaceAll) {
if (replaceAll)
return str.replace(lit, rep);
else
return str.replaceFirst(Pattern.quote(lit), rep);
}
/**
* Search for a match of the regular expression <em>r</em> within <em>s</em>
* , and return a boolean value indicating whether a match was found. (The
* regular expression syntax is that of PCRE. <http://www.pcre.org/>)
*
* @param r
* A {@link String} containing a regular expression
*
* @param s
* A {@link String} containing the text to be searched
*
* @return A boolean representing whether the regular expression <em>r</em>
* was found within <em>s</em>
*/
@FunctionSpec(name = "match", returnType = "bool", formalParameters = { "string", "string" })
public static boolean match(final String r, final String s) {
final Matcher m = Pattern.compile(r).matcher(s);
return m.find();
}
/**
* Search for a match of the regular expression <em>r</em> within <em>s</em>
* , and return an array consisting of character positions within <em>s</em>
* defined by the match. Positions 0 and 1 of the array report the location
* of the match of the entire expression, subsequent pairs report the
* location of matches of successive parenthesized subexpressions.
*
* @param r
* A {@link String} containing a regular expression
*
* @param s
* A {@link String} containing the text to be searched
*
* @return An array of long consisting of character positions within
* <em>s</em> defined by the match
*/
@FunctionSpec(name = "matchposns", returnType = "array of int", formalParameters = { "string", "string" })
public static long[] matchPositions(final String r, final String s) {
final Matcher m = Pattern.compile(r).matcher(s);
if (!m.find())
return new long[0];
final int n = m.groupCount();
final long[] matches = new long[(n + 1) * 2];
for (int i = 0; i <= n; i++) {
matches[i * 2] = m.start(i);
matches[i * 2 + 1] = m.end(i);
}
return matches;
}
/**
* Search for a match of the regular expression <em>r</em> within <em>s</em>
* , and return . The 0th string is the entire match; following elements of
* the array hold matches of successive parenthesized subexpressions. This
* function is equivalent to using matchposns to find successive locations
* of matches and created array slices of <em>s</em> with the indices
* returned.
*
*
* @param r
* A {@link String} containing a regular expression
*
* @param s
* A {@link String} containing the text to be searched
*
* @return an array of {@link String} consisting of matched substrings of
* <em>s</em>
*/
@FunctionSpec(name = "matchstrs", returnType = "array of string", formalParameters = { "string", "string" })
public static String[] matchStrings(final String r, final String s) {
final Matcher m = Pattern.compile(r).matcher(s);
if (!m.find())
return new String[0];
final int n = m.groupCount();
final String[] matches = new String[(n + 1)];
for (int i = 0; i <= n; i++)
matches[i] = m.group(i);
return matches;
}
private static List<String> splitCsv(final String s) {
final List<String> split = new ArrayList<String>();
boolean inQuote = false;
StringBuilder sb = new StringBuilder();
for (final char c : s.trim().toCharArray())
switch (c) {
case ',':
if (!inQuote) {
split.add(sb.toString());
sb = new StringBuilder();
} else {
sb.append(c);
}
break;
case '"':
if (!inQuote) {
inQuote = true;
} else {
inQuote = false;
}
break;
default:
sb.append(c);
}
split.add(sb.toString());
return split;
}
/**
* The function splitcsvline takes a line of UTF-8 bytes and splits it at
* commas, ignoring leading and trailing white space and using '"' for
* quoting. It returns the array of fields produced.
*
* @param string
* The {@link String} to be split
*
* @return An array of byte[] containing the splits
*
*/
@FunctionSpec(name = "splitcsvline", returnType = "array of bytes", formalParameters = { "bytes" })
public static byte[][] splitCsvLine(final byte[] csv) {
final List<String> split = SizzleStringIntrinsics.splitCsv(new String(csv));
final byte[][] bytes = new byte[split.size()][];
for (int i = 0; i < split.size(); i++)
bytes[i] = split.get(i).getBytes();
return bytes;
}
/**
* The function splitcsv takes an array of UTF-8 bytes containing lines of
* text, such as that produced by the load() builtin. It splits each line
* using the same method as splitcsvline, and then selects the fields
* indicated by the second argument (numbered starting at 1). The return
* value is a flat array of the collected fields.
*
* @param csv An arry of byte containing the input data
* @param fields An array of long specified the fields to be returned
*
* @return An array of byte[] containing the collected fields
*/
@FunctionSpec(name = "splitcsv", returnType = "array of bytes", formalParameters = { "bytes", "array of int" })
public static byte[][] splitCsv(final byte[] csv, final long[] fields) {
final List<List<String>> strings = new ArrayList<List<String>>();
for (final String line : new String(csv).split("\n")) {
final List<String> values = SizzleStringIntrinsics.splitCsv(line);
final List<String> b = new ArrayList<String>();
for (final long field : fields)
b.add(values.get((int) field - 1));
strings.add(b);
}
final byte[][] output = new byte[strings.size() * fields.length][];
for (int i = 0; i < strings.size(); i++)
for (int j = 0; j < fields.length; j++)
output[i * 2 + j] = strings.get(i).get(j).getBytes();
return output;
}
/**
* Return a string containing the arguments formatted according to the
* format string fmt. The syntax of the format string is essentially that of
* ANSI C with the following differences:
*
* <ul>
* <li>%b prints a boolean, "true" or "false".
* <li>%c prints a (u)int as a Unicode character in UTF-8.
* <li>%k like %c with single quotes and backslash escapes for special
* characters.
* <li>%s prints a Sawzall string as UTF-8.
* <li>%q like %s with double quotes and backslash escapes for special
* characters.
* <li>%p prints a fingerprint, in the format 0x%.16x.
* <li>%t prints a time, in the format of the Unix function ctime without a
* newline.
* <li>%T prints a Sawzall type of the argument; %#T expands user-defined
* types.
* <li>%d / %i / %o / %u / %x / %X apply to a Sawzall (u)int and have no 'l'
* or 'h' modifiers.
* <li>%e / %f / %g / %E / %G apply to a Sawzall float and have no 'l' or
* 'h' modifiers.
* </ul>
* format verbs 'n' and '*' are not supported.
*
* @param format
* A
* @param args
*
* @return A string containing the arguments formatted according to the
* format string <em>fmt</em>
*/
@FunctionSpec(name = "format", returnType = "string", formalParameters = { "string", "string..." })
public static String format(final String format, final Object... args) {
// TODO: support the Sawzall differences listed in the javadoc above
return String.format(format, args);
}
}