package com.facebook.hive.udf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.util.LinkedList;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Extract substrings of a string which satisfy a regular expression. The
 * first argument is the haystack in which to search while the second is a
 * regular expression which follows Java's regular expression rules. Note that
 * like REGEXP_EXTRACT, any backslashes in the regular expression string must
 * be escaped to account for Hive's backslash escaping (e.g., '\\w' instead of
 * '\w').
 *
 * The expression is compiled with {@link Pattern#MULTILINE}, so '^' and '$'
 * match at the start and end of every line of the haystack, not only at the
 * start and end of the whole string.
 *
 * An optional third argument specifies a group index. If specified, the
 * function will only return the portion of the match in the nth group (the
 * expression enclosed by the nth opening parenthesis reading from the left).
 * Note that the special group value of 0 returns the entire matched string.
 *
 * The return value is an ARRAY of STRINGs of the extracted strings. If any
 * argument is NULL then NULL is returned.
 *
 * Also note that the subsequences found will be non-overlapping; that is, any
 * subsequence must start after the end of the previous subsequence.
 */
@Description(name = "regexp_extract_all",
             value = "_FUNC_(haystack, pattern, [index]) - Find all the instances of pattern in haystack.")
public class UDFRegexpExtractAll extends UDF {
  // Single-entry cache: the regex string most recently compiled successfully
  // and its Pattern, so repeated calls with the same regex skip recompilation.
  private String lastRegex = null;
  private Pattern p = null;

  /**
   * Finds every non-overlapping match of {@code regex} in {@code s} and
   * collects the text captured by group {@code extractIndex} of each match.
   *
   * @param s            the haystack to search
   * @param regex        a Java regular expression (compiled with MULTILINE)
   * @param extractIndex capturing group to extract; 0 means the whole match
   * @return the extracted strings in match order (empty when nothing
   *         matches), or {@code null} if any argument is {@code null}. An
   *         element is {@code null} when the requested group did not
   *         participate in that particular match.
   * @throws java.util.regex.PatternSyntaxException if {@code regex} is not a
   *         valid expression
   * @throws IndexOutOfBoundsException if at least one match is found and the
   *         pattern has no group with index {@code extractIndex}
   */
  public LinkedList<String> evaluate(String s, String regex,
                                     Integer extractIndex) {
    if (s == null || regex == null || extractIndex == null) {
      return null;
    }

    if (!regex.equals(lastRegex)) {
      // Compile BEFORE updating the cache. If compilation throws, both cache
      // fields keep their previous (mutually consistent) values; otherwise a
      // later call with the same invalid regex would skip compilation and
      // run against a stale or null Pattern.
      Pattern compiled = Pattern.compile(regex, Pattern.MULTILINE);
      p = compiled;
      lastRegex = regex;
    }

    // Unbox once instead of on every loop iteration.
    int groupIndex = extractIndex;

    LinkedList<String> result = new LinkedList<String>();
    Matcher m = p.matcher(s);
    while (m.find()) {
      // Matcher.group(int) reads the current match directly; no per-match
      // MatchResult snapshot is needed.
      result.add(m.group(groupIndex));
    }
    return result;
  }

  /**
   * Two-argument form: extracts the entire matched string (group 0) for
   * every match of {@code regex} in {@code s}.
   *
   * @param s     the haystack to search
   * @param regex a Java regular expression (compiled with MULTILINE)
   * @return the matched strings in match order, or {@code null} if either
   *         argument is {@code null}
   */
  public LinkedList<String> evaluate(String s, String regex) {
    return this.evaluate(s, regex, 0);
  }
}