/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.facebook.presto.operator.scalar;
import com.facebook.presto.spi.PrestoException;
import com.facebook.presto.spi.block.Block;
import com.facebook.presto.spi.block.BlockBuilder;
import com.facebook.presto.spi.block.BlockBuilderStatus;
import com.facebook.presto.spi.function.Description;
import com.facebook.presto.spi.function.LiteralParameters;
import com.facebook.presto.spi.function.OperatorType;
import com.facebook.presto.spi.function.ScalarFunction;
import com.facebook.presto.spi.function.ScalarOperator;
import com.facebook.presto.spi.function.SqlNullable;
import com.facebook.presto.spi.function.SqlType;
import com.facebook.presto.spi.type.StandardTypes;
import com.facebook.presto.type.CodePointsType;
import com.facebook.presto.type.Constraint;
import com.facebook.presto.type.LiteralParameter;
import com.google.common.primitives.Ints;
import io.airlift.slice.InvalidCodePointException;
import io.airlift.slice.InvalidUtf8Exception;
import io.airlift.slice.Slice;
import io.airlift.slice.SliceUtf8;
import io.airlift.slice.Slices;
import java.text.Normalizer;
import java.util.HashMap;
import java.util.Map;
import java.util.OptionalInt;
import static com.facebook.presto.spi.StandardErrorCode.INVALID_FUNCTION_ARGUMENT;
import static com.facebook.presto.spi.type.Chars.padSpaces;
import static com.facebook.presto.spi.type.VarcharType.VARCHAR;
import static com.facebook.presto.util.Failures.checkCondition;
import static io.airlift.slice.SliceUtf8.countCodePoints;
import static io.airlift.slice.SliceUtf8.getCodePointAt;
import static io.airlift.slice.SliceUtf8.lengthOfCodePoint;
import static io.airlift.slice.SliceUtf8.lengthOfCodePointSafe;
import static io.airlift.slice.SliceUtf8.offsetOfCodePoint;
import static io.airlift.slice.SliceUtf8.toLowerCase;
import static io.airlift.slice.SliceUtf8.toUpperCase;
import static io.airlift.slice.SliceUtf8.tryGetCodePointAt;
import static io.airlift.slice.Slices.utf8Slice;
import static java.lang.Character.MAX_CODE_POINT;
import static java.lang.Character.SURROGATE;
import static java.lang.Math.toIntExact;
import static java.lang.String.format;
/**
* Current implementation is based on code points from Unicode and does ignore grapheme cluster boundaries.
* Therefore only some methods work correctly with grapheme cluster boundaries.
*/
public final class StringFunctions
{
private StringFunctions() {}
@Description("convert Unicode code point to a string")
@ScalarFunction
@SqlType("varchar(1)")
public static Slice chr(@SqlType(StandardTypes.BIGINT) long codepoint)
{
try {
return SliceUtf8.codePointToUtf8(Ints.saturatedCast(codepoint));
}
catch (InvalidCodePointException e) {
throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Not a valid Unicode code point: " + codepoint, e);
}
}
@Description("returns Unicode code point of a single character string")
@ScalarFunction("codepoint")
@SqlType(StandardTypes.INTEGER)
public static long codepoint(@SqlType("varchar(1)") Slice slice)
{
checkCondition(countCodePoints(slice) == 1, INVALID_FUNCTION_ARGUMENT, "Input string must be a single character string");
return getCodePointAt(slice, 0);
}
@Description("count of code points of the given string")
@ScalarFunction
@LiteralParameters("x")
@SqlType(StandardTypes.BIGINT)
public static long length(@SqlType("varchar(x)") Slice slice)
{
return countCodePoints(slice);
}
@Description("count of code points of the given string")
@ScalarFunction("length")
@LiteralParameters("x")
@SqlType(StandardTypes.BIGINT)
public static long charLength(@LiteralParameter("x") long x, @SqlType("char(x)") Slice slice)
{
return x;
}
@Description("greedily removes occurrences of a pattern in a string")
@ScalarFunction
@LiteralParameters({"x", "y"})
@SqlType("varchar(x)")
public static Slice replace(@SqlType("varchar(x)") Slice str, @SqlType("varchar(y)") Slice search)
{
return replace(str, search, Slices.EMPTY_SLICE);
}
@Description("greedily replaces occurrences of a pattern with a string")
@ScalarFunction
@LiteralParameters({"x", "y", "z", "u"})
@Constraint(variable = "u", expression = "min(2147483647, x + z * (x + 1))")
@SqlType("varchar(u)")
public static Slice replace(@SqlType("varchar(x)") Slice str, @SqlType("varchar(y)") Slice search, @SqlType("varchar(z)") Slice replace)
{
// Empty search?
if (search.length() == 0) {
// With empty `search` we insert `replace` in front of every character and and the end
Slice buffer = Slices.allocate((countCodePoints(str) + 1) * replace.length() + str.length());
// Always start with replace
buffer.setBytes(0, replace);
int indexBuffer = replace.length();
// After every code point insert `replace`
int index = 0;
while (index < str.length()) {
int codePointLength = lengthOfCodePointSafe(str, index);
// Append current code point
buffer.setBytes(indexBuffer, str, index, codePointLength);
indexBuffer += codePointLength;
// Append `replace`
buffer.setBytes(indexBuffer, replace);
indexBuffer += replace.length();
// Advance pointer to current code point
index += codePointLength;
}
return buffer;
}
// Allocate a reasonable buffer
Slice buffer = Slices.allocate(str.length());
int index = 0;
int indexBuffer = 0;
while (index < str.length()) {
int matchIndex = str.indexOf(search, index);
// Found a match?
if (matchIndex < 0) {
// No match found so copy the rest of string
int bytesToCopy = str.length() - index;
buffer = Slices.ensureSize(buffer, indexBuffer + bytesToCopy);
buffer.setBytes(indexBuffer, str, index, bytesToCopy);
indexBuffer += bytesToCopy;
break;
}
int bytesToCopy = matchIndex - index;
buffer = Slices.ensureSize(buffer, indexBuffer + bytesToCopy + replace.length());
// Non empty match?
if (bytesToCopy > 0) {
buffer.setBytes(indexBuffer, str, index, bytesToCopy);
indexBuffer += bytesToCopy;
}
// Non empty replace?
if (replace.length() > 0) {
buffer.setBytes(indexBuffer, replace);
indexBuffer += replace.length();
}
// Continue searching after match
index = matchIndex + search.length();
}
return buffer.slice(0, indexBuffer);
}
@Description("reverse all code points in a given string")
@ScalarFunction
@LiteralParameters("x")
@SqlType("varchar(x)")
public static Slice reverse(@SqlType("varchar(x)") Slice slice)
{
return SliceUtf8.reverse(slice);
}
@Description("returns index of first occurrence of a substring (or 0 if not found)")
@ScalarFunction("strpos")
@LiteralParameters({"x", "y"})
@SqlType(StandardTypes.BIGINT)
public static long stringPosition(@SqlType("varchar(x)") Slice string, @SqlType("varchar(y)") Slice substring)
{
if (substring.length() == 0) {
return 1;
}
int index = string.indexOf(substring);
if (index < 0) {
return 0;
}
return countCodePoints(string, 0, index) + 1;
}
@Description("suffix starting at given index")
@ScalarFunction
@LiteralParameters("x")
@SqlType("varchar(x)")
public static Slice substr(@SqlType("varchar(x)") Slice utf8, @SqlType(StandardTypes.BIGINT) long start)
{
if ((start == 0) || utf8.length() == 0) {
return Slices.EMPTY_SLICE;
}
int startCodePoint = Ints.saturatedCast(start);
if (startCodePoint > 0) {
int indexStart = offsetOfCodePoint(utf8, startCodePoint - 1);
if (indexStart < 0) {
// before beginning of string
return Slices.EMPTY_SLICE;
}
int indexEnd = utf8.length();
return utf8.slice(indexStart, indexEnd - indexStart);
}
// negative start is relative to end of string
int codePoints = countCodePoints(utf8);
startCodePoint += codePoints;
// before beginning of string
if (startCodePoint < 0) {
return Slices.EMPTY_SLICE;
}
int indexStart = offsetOfCodePoint(utf8, startCodePoint);
int indexEnd = utf8.length();
return utf8.slice(indexStart, indexEnd - indexStart);
}
@Description("suffix starting at given index")
@ScalarFunction("substr")
@LiteralParameters("x")
@SqlType("char(x)")
public static Slice charSubstr(@SqlType("char(x)") Slice utf8, @SqlType(StandardTypes.BIGINT) long start)
{
return substr(utf8, start);
}
@Description("substring of given length starting at an index")
@ScalarFunction
@LiteralParameters("x")
@SqlType("varchar(x)")
public static Slice substr(@SqlType("varchar(x)") Slice utf8, @SqlType(StandardTypes.BIGINT) long start, @SqlType(StandardTypes.BIGINT) long length)
{
if (start == 0 || (length <= 0) || (utf8.length() == 0)) {
return Slices.EMPTY_SLICE;
}
int startCodePoint = Ints.saturatedCast(start);
int lengthCodePoints = Ints.saturatedCast(length);
if (startCodePoint > 0) {
int indexStart = offsetOfCodePoint(utf8, startCodePoint - 1);
if (indexStart < 0) {
// before beginning of string
return Slices.EMPTY_SLICE;
}
int indexEnd = offsetOfCodePoint(utf8, indexStart, lengthCodePoints);
if (indexEnd < 0) {
// after end of string
indexEnd = utf8.length();
}
return utf8.slice(indexStart, indexEnd - indexStart);
}
// negative start is relative to end of string
int codePoints = countCodePoints(utf8);
startCodePoint += codePoints;
// before beginning of string
if (startCodePoint < 0) {
return Slices.EMPTY_SLICE;
}
int indexStart = offsetOfCodePoint(utf8, startCodePoint);
int indexEnd;
if (startCodePoint + lengthCodePoints < codePoints) {
indexEnd = offsetOfCodePoint(utf8, indexStart, lengthCodePoints);
}
else {
indexEnd = utf8.length();
}
return utf8.slice(indexStart, indexEnd - indexStart);
}
@Description("substring of given length starting at an index")
@ScalarFunction("substr")
@LiteralParameters("x")
@SqlType("char(x)")
public static Slice charSubstr(@SqlType("char(x)") Slice utf8, @SqlType(StandardTypes.BIGINT) long start, @SqlType(StandardTypes.BIGINT) long length)
{
return substr(utf8, start, length);
}
@ScalarFunction
@LiteralParameters({"x", "y"})
@SqlType("array(varchar(x))")
public static Block split(@SqlType("varchar(x)") Slice string, @SqlType("varchar(y)") Slice delimiter)
{
return split(string, delimiter, string.length() + 1);
}
@ScalarFunction
@LiteralParameters({"x", "y"})
@SqlType("array(varchar(x))")
public static Block split(@SqlType("varchar(x)") Slice string, @SqlType("varchar(y)") Slice delimiter, @SqlType(StandardTypes.BIGINT) long limit)
{
checkCondition(limit > 0, INVALID_FUNCTION_ARGUMENT, "Limit must be positive");
checkCondition(limit <= Integer.MAX_VALUE, INVALID_FUNCTION_ARGUMENT, "Limit is too large");
checkCondition(delimiter.length() > 0, INVALID_FUNCTION_ARGUMENT, "The delimiter may not be the empty string");
BlockBuilder parts = VARCHAR.createBlockBuilder(new BlockBuilderStatus(), 1, string.length());
// If limit is one, the last and only element is the complete string
if (limit == 1) {
VARCHAR.writeSlice(parts, string);
return parts.build();
}
int index = 0;
while (index < string.length()) {
int splitIndex = string.indexOf(delimiter, index);
// Found split?
if (splitIndex < 0) {
break;
}
// Add the part from current index to found split
VARCHAR.writeSlice(parts, string, index, splitIndex - index);
// Continue searching after delimiter
index = splitIndex + delimiter.length();
// Reached limit-1 parts so we can stop
if (parts.getPositionCount() == limit - 1) {
break;
}
}
// Rest of string
VARCHAR.writeSlice(parts, string, index, string.length() - index);
return parts.build();
}
@SqlNullable
@Description("splits a string by a delimiter and returns the specified field (counting from one)")
@ScalarFunction
@LiteralParameters({"x", "y"})
@SqlType("varchar(x)")
public static Slice splitPart(@SqlType("varchar(x)") Slice string, @SqlType("varchar(y)") Slice delimiter, @SqlType(StandardTypes.BIGINT) long index)
{
checkCondition(index > 0, INVALID_FUNCTION_ARGUMENT, "Index must be greater than zero");
// Empty delimiter? Then every character will be a split
if (delimiter.length() == 0) {
int startCodePoint = toIntExact(index);
int indexStart = offsetOfCodePoint(string, startCodePoint - 1);
if (indexStart < 0) {
// index too big
return null;
}
int length = lengthOfCodePoint(string, indexStart);
if (indexStart + length > string.length()) {
throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid UTF-8 encoding");
}
return string.slice(indexStart, length);
}
int matchCount = 0;
int previousIndex = 0;
while (previousIndex < string.length()) {
int matchIndex = string.indexOf(delimiter, previousIndex);
// No match
if (matchIndex < 0) {
break;
}
// Reached the requested part?
if (++matchCount == index) {
return string.slice(previousIndex, matchIndex - previousIndex);
}
// Continue searching after the delimiter
previousIndex = matchIndex + delimiter.length();
}
if (matchCount == index - 1) {
// returns last section of the split
return string.slice(previousIndex, string.length() - previousIndex);
}
// index is too big, null is returned
return null;
}
@Description("creates a map using entryDelimiter and keyValueDelimiter")
@ScalarFunction
@SqlType("map<varchar,varchar>")
public static Block splitToMap(@SqlType(StandardTypes.VARCHAR) Slice string, @SqlType(StandardTypes.VARCHAR) Slice entryDelimiter, @SqlType(StandardTypes.VARCHAR) Slice keyValueDelimiter)
{
checkCondition(entryDelimiter.length() > 0, INVALID_FUNCTION_ARGUMENT, "entryDelimiter is empty");
checkCondition(keyValueDelimiter.length() > 0, INVALID_FUNCTION_ARGUMENT, "keyValueDelimiter is empty");
checkCondition(!entryDelimiter.equals(keyValueDelimiter), INVALID_FUNCTION_ARGUMENT, "entryDelimiter and keyValueDelimiter must not be the same");
Map<Slice, Slice> map = new HashMap<>();
int entryStart = 0;
while (entryStart < string.length()) {
// Extract key-value pair based on current index
// then add the pair if it can be split by keyValueDelimiter
Slice keyValuePair;
int entryEnd = string.indexOf(entryDelimiter, entryStart);
if (entryEnd >= 0) {
keyValuePair = string.slice(entryStart, entryEnd - entryStart);
}
else {
// The rest of the string is the last possible pair.
keyValuePair = string.slice(entryStart, string.length() - entryStart);
}
int keyEnd = keyValuePair.indexOf(keyValueDelimiter);
if (keyEnd >= 0) {
int valueStart = keyEnd + keyValueDelimiter.length();
Slice key = keyValuePair.slice(0, keyEnd);
Slice value = keyValuePair.slice(valueStart, keyValuePair.length() - valueStart);
if (value.indexOf(keyValueDelimiter) >= 0) {
throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Key-value delimiter must appear exactly once in each entry. Bad input: '" + keyValuePair.toStringUtf8() + "'");
}
if (map.containsKey(key)) {
throw new PrestoException(INVALID_FUNCTION_ARGUMENT, format("Duplicate keys (%s) are not allowed", key.toStringUtf8()));
}
map.put(key, value);
}
else {
throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Key-value delimiter must appear exactly once in each entry. Bad input: '" + keyValuePair.toStringUtf8() + "'");
}
if (entryEnd < 0) {
// No more pairs to add
break;
}
// Next possible pair is placed next to the current entryDelimiter
entryStart = entryEnd + entryDelimiter.length();
}
BlockBuilder builder = VARCHAR.createBlockBuilder(new BlockBuilderStatus(), map.size());
for (Map.Entry<Slice, Slice> entry : map.entrySet()) {
VARCHAR.writeSlice(builder, entry.getKey());
VARCHAR.writeSlice(builder, entry.getValue());
}
return builder.build();
}
@Description("removes whitespace from the beginning of a string")
@ScalarFunction("ltrim")
@LiteralParameters("x")
@SqlType("varchar(x)")
public static Slice leftTrim(@SqlType("varchar(x)") Slice slice)
{
return SliceUtf8.leftTrim(slice);
}
@Description("removes whitespace from the beginning of a string")
@ScalarFunction("ltrim")
@LiteralParameters("x")
@SqlType("char(x)")
public static Slice charLeftTrim(@SqlType("char(x)") Slice slice)
{
return SliceUtf8.leftTrim(slice);
}
@Description("removes whitespace from the end of a string")
@ScalarFunction("rtrim")
@LiteralParameters("x")
@SqlType("varchar(x)")
public static Slice rightTrim(@SqlType("varchar(x)") Slice slice)
{
return SliceUtf8.rightTrim(slice);
}
@Description("removes whitespace from the end of a string")
@ScalarFunction("rtrim")
@LiteralParameters("x")
@SqlType("char(x)")
public static Slice charRightTrim(@SqlType("char(x)") Slice slice)
{
return rightTrim(slice);
}
@Description("removes whitespace from the beginning and end of a string")
@ScalarFunction
@LiteralParameters("x")
@SqlType("varchar(x)")
public static Slice trim(@SqlType("varchar(x)") Slice slice)
{
return SliceUtf8.trim(slice);
}
@Description("removes whitespace from the beginning and end of a string")
@ScalarFunction("trim")
@LiteralParameters("x")
@SqlType("char(x)")
public static Slice charTrim(@SqlType("char(x)") Slice slice)
{
return trim(slice);
}
@Description("remove the longest string containing only given characters from the beginning of a string")
@ScalarFunction("ltrim")
@LiteralParameters("x")
@SqlType("varchar(x)")
public static Slice leftTrim(@SqlType("varchar(x)") Slice slice, @SqlType(CodePointsType.NAME) int[] codePointsToTrim)
{
return SliceUtf8.leftTrim(slice, codePointsToTrim);
}
@Description("remove the longest string containing only given characters from the beginning of a string")
@ScalarFunction("ltrim")
@LiteralParameters("x")
@SqlType("char(x)")
public static Slice charLeftTrim(@SqlType("char(x)") Slice slice, @SqlType(CodePointsType.NAME) int[] codePointsToTrim)
{
return leftTrim(slice, codePointsToTrim);
}
@Description("remove the longest string containing only given characters from the end of a string")
@ScalarFunction("rtrim")
@LiteralParameters("x")
@SqlType("varchar(x)")
public static Slice rightTrim(@SqlType("varchar(x)") Slice slice, @SqlType(CodePointsType.NAME) int[] codePointsToTrim)
{
return SliceUtf8.rightTrim(slice, codePointsToTrim);
}
@Description("remove the longest string containing only given characters from the end of a string")
@ScalarFunction("rtrim")
@LiteralParameters("x")
@SqlType("char(x)")
public static Slice charRightTrim(@SqlType("char(x)") Slice slice, @SqlType(CodePointsType.NAME) int[] codePointsToTrim)
{
return rightTrim(slice, codePointsToTrim);
}
@Description("remove the longest string containing only given characters from the beginning and end of a string")
@ScalarFunction("trim")
@LiteralParameters("x")
@SqlType("varchar(x)")
public static Slice trim(@SqlType("varchar(x)") Slice slice, @SqlType(CodePointsType.NAME) int[] codePointsToTrim)
{
return SliceUtf8.trim(slice, codePointsToTrim);
}
@Description("remove the longest string containing only given characters from the beginning and end of a string")
@ScalarFunction("trim")
@LiteralParameters("x")
@SqlType("char(x)")
public static Slice charTrim(@SqlType("char(x)") Slice slice, @SqlType(CodePointsType.NAME) int[] codePointsToTrim)
{
return trim(slice, codePointsToTrim);
}
@ScalarOperator(OperatorType.CAST)
@LiteralParameters("x")
@SqlType(CodePointsType.NAME)
public static int[] castVarcharToCodePoints(@SqlType("varchar(x)") Slice slice)
{
return castToCodePoints(slice);
}
@ScalarOperator(OperatorType.CAST)
@SqlType(CodePointsType.NAME)
@LiteralParameters("x")
public static int[] castCharToCodePoints(@LiteralParameter("x") Long charLength, @SqlType("char(x)") Slice slice)
{
return castToCodePoints(padSpaces(slice, charLength.intValue()));
}
private static int[] castToCodePoints(Slice slice)
{
int[] codePoints = new int[safeCountCodePoints(slice)];
int position = 0;
for (int index = 0; index < codePoints.length; index++) {
codePoints[index] = getCodePointAt(slice, position);
position += lengthOfCodePoint(slice, position);
}
return codePoints;
}
private static int safeCountCodePoints(Slice slice)
{
int codePoints = 0;
for (int position = 0; position < slice.length(); ) {
int codePoint = tryGetCodePointAt(slice, position);
if (codePoint < 0) {
throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid UTF-8 encoding in characters: " + slice.toStringUtf8());
}
position += lengthOfCodePoint(codePoint);
codePoints++;
}
return codePoints;
}
@Description("converts the string to lower case")
@ScalarFunction
@LiteralParameters("x")
@SqlType("varchar(x)")
public static Slice lower(@SqlType("varchar(x)") Slice slice)
{
return toLowerCase(slice);
}
@Description("converts the string to lower case")
@ScalarFunction("lower")
@LiteralParameters("x")
@SqlType("char(x)")
public static Slice charLower(@SqlType("char(x)") Slice slice)
{
return lower(slice);
}
@Description("converts the string to upper case")
@ScalarFunction
@LiteralParameters("x")
@SqlType("varchar(x)")
public static Slice upper(@SqlType("varchar(x)") Slice slice)
{
return toUpperCase(slice);
}
@Description("converts the string to upper case")
@ScalarFunction("upper")
@LiteralParameters("x")
@SqlType("char(x)")
public static Slice charUpper(@SqlType("char(x)") Slice slice)
{
return upper(slice);
}
private static Slice pad(Slice text, long targetLength, Slice padString, int paddingOffset)
{
checkCondition(
0 <= targetLength && targetLength <= Integer.MAX_VALUE,
INVALID_FUNCTION_ARGUMENT,
"Target length must be in the range [0.." + Integer.MAX_VALUE + "]"
);
checkCondition(padString.length() > 0, INVALID_FUNCTION_ARGUMENT, "Padding string must not be empty");
int textLength = countCodePoints(text);
int resultLength = (int) targetLength;
// if our target length is the same as our string then return our string
if (textLength == resultLength) {
return text;
}
// if our string is bigger than requested then truncate
if (textLength > resultLength) {
return SliceUtf8.substring(text, 0, resultLength);
}
// number of bytes in each code point
int padStringLength = countCodePoints(padString);
int[] padStringCounts = new int[padStringLength];
for (int i = 0; i < padStringLength; ++i) {
padStringCounts[i] = lengthOfCodePointSafe(padString, offsetOfCodePoint(padString, i));
}
// preallocate the result
int bufferSize = text.length();
for (int i = 0; i < resultLength - textLength; ++i) {
bufferSize += padStringCounts[i % padStringLength];
}
Slice buffer = Slices.allocate(bufferSize);
// fill in the existing string
int countBytes = bufferSize - text.length();
int startPointOfExistingText = (paddingOffset + countBytes) % bufferSize;
buffer.setBytes(startPointOfExistingText, text);
// assign the pad string while there's enough space for it
int byteIndex = paddingOffset;
for (int i = 0; i < countBytes / padString.length(); ++i) {
buffer.setBytes(byteIndex, padString);
byteIndex += padString.length();
}
// handle the tail: at most we assign padStringLength - 1 code points
buffer.setBytes(byteIndex, padString.getBytes(0, paddingOffset + countBytes - byteIndex));
return buffer;
}
@Description("pads a string on the left")
@ScalarFunction("lpad")
@LiteralParameters({"x", "y"})
@SqlType(StandardTypes.VARCHAR)
public static Slice leftPad(@SqlType("varchar(x)") Slice text, @SqlType(StandardTypes.BIGINT) long targetLength, @SqlType("varchar(y)") Slice padString)
{
return pad(text, targetLength, padString, 0);
}
@Description("pads a string on the right")
@ScalarFunction("rpad")
@LiteralParameters({"x", "y"})
@SqlType(StandardTypes.VARCHAR)
public static Slice rightPad(@SqlType("varchar(x)") Slice text, @SqlType(StandardTypes.BIGINT) long targetLength, @SqlType("varchar(y)") Slice padString)
{
return pad(text, targetLength, padString, text.length());
}
@Description("computes Levenshtein distance between two strings")
@ScalarFunction
@LiteralParameters({"x", "y"})
@SqlType(StandardTypes.BIGINT)
public static long levenshteinDistance(@SqlType("varchar(x)") Slice left, @SqlType("varchar(y)") Slice right)
{
int[] leftCodePoints = castToCodePoints(left);
int[] rightCodePoints = castToCodePoints(right);
if (leftCodePoints.length < rightCodePoints.length) {
int[] tempCodePoints = leftCodePoints;
leftCodePoints = rightCodePoints;
rightCodePoints = tempCodePoints;
}
if (rightCodePoints.length == 0) {
return leftCodePoints.length;
}
checkCondition(
(leftCodePoints.length * (rightCodePoints.length - 1)) <= 1_000_000,
INVALID_FUNCTION_ARGUMENT,
"The combined inputs for Levenshtein distance are too large");
int[] distances = new int[rightCodePoints.length];
for (int i = 0; i < rightCodePoints.length; i++) {
distances[i] = i + 1;
}
for (int i = 0; i < leftCodePoints.length; i++) {
int leftUpDistance = distances[0];
if (leftCodePoints[i] == rightCodePoints[0]) {
distances[0] = i;
}
else {
distances[0] = Math.min(i, distances[0]) + 1;
}
for (int j = 1; j < rightCodePoints.length; j++) {
int leftUpDistanceNext = distances[j];
if (leftCodePoints[i] == rightCodePoints[j]) {
distances[j] = leftUpDistance;
}
else {
distances[j] = Math.min(distances[j - 1], Math.min(leftUpDistance, distances[j])) + 1;
}
leftUpDistance = leftUpDistanceNext;
}
}
return distances[rightCodePoints.length - 1];
}
@Description("transforms the string to normalized form")
@ScalarFunction
@LiteralParameters({"x", "y"})
@SqlType(StandardTypes.VARCHAR)
public static Slice normalize(@SqlType("varchar(x)") Slice slice, @SqlType("varchar(y)") Slice form)
{
Normalizer.Form targetForm;
try {
targetForm = Normalizer.Form.valueOf(form.toStringUtf8());
}
catch (IllegalArgumentException e) {
throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Normalization form must be one of [NFD, NFC, NFKD, NFKC]");
}
return utf8Slice(Normalizer.normalize(slice.toStringUtf8(), targetForm));
}
@Description("decodes the UTF-8 encoded string")
@ScalarFunction
@SqlType(StandardTypes.VARCHAR)
public static Slice fromUtf8(@SqlType(StandardTypes.VARBINARY) Slice slice)
{
return SliceUtf8.fixInvalidUtf8(slice);
}
@Description("decodes the UTF-8 encoded string")
@ScalarFunction
@LiteralParameters("x")
@SqlType(StandardTypes.VARCHAR)
public static Slice fromUtf8(@SqlType(StandardTypes.VARBINARY) Slice slice, @SqlType("varchar(x)") Slice replacementCharacter)
{
int count = countCodePoints(replacementCharacter);
if (count > 1) {
throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Replacement character string must empty or a single character");
}
OptionalInt replacementCodePoint;
if (count == 1) {
try {
replacementCodePoint = OptionalInt.of(getCodePointAt(replacementCharacter, 0));
}
catch (InvalidUtf8Exception e) {
throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid replacement character");
}
}
else {
replacementCodePoint = OptionalInt.empty();
}
return SliceUtf8.fixInvalidUtf8(slice, replacementCodePoint);
}
@Description("decodes the UTF-8 encoded string")
@ScalarFunction
@SqlType(StandardTypes.VARCHAR)
public static Slice fromUtf8(@SqlType(StandardTypes.VARBINARY) Slice slice, @SqlType(StandardTypes.BIGINT) long replacementCodePoint)
{
if (replacementCodePoint > MAX_CODE_POINT || Character.getType((int) replacementCodePoint) == SURROGATE) {
throw new PrestoException(INVALID_FUNCTION_ARGUMENT, "Invalid replacement character");
}
return SliceUtf8.fixInvalidUtf8(slice, OptionalInt.of((int) replacementCodePoint));
}
@Description("encodes the string to UTF-8")
@ScalarFunction
@LiteralParameters("x")
@SqlType(StandardTypes.VARBINARY)
public static Slice toUtf8(@SqlType("varchar(x)") Slice slice)
{
return slice;
}
}