package com.facebook.hive.udf;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
/**
* Calculates Levenshtein distance between 2 strings.
*/
@Description(name = "udflevenshtein",
value = "_FUNC_(string, string) - calculates Levenshtein distance between 2 strings.")
public class UDFLevenshtein extends UDF {
public Integer evaluate(String s1, String s2) {
if (s1 == null || s1 == null) {
return null;
}
return computeLevenshteinDistance(s1.toCharArray(),
s2.toCharArray());
}
private static int minimum(int a, int b, int c) {
if (a<=b && a<=c)
return a;
if (b<=a && b<=c)
return b;
return c;
}
private static int computeLevenshteinDistance(char[] str1, char[] str2) {
int[][] distance = new int[str1.length+1][];
for(int i=0; i<=str1.length; i++) {
distance[i] = new int[str2.length+1];
distance[i][0] = i;
}
for(int j=0; j<str2.length+1; j++) {
distance[0][j]=j;
}
for(int i=1; i<=str1.length; i++) {
for(int j=1;j<=str2.length; j++) {
distance[i][j]= minimum(distance[i-1][j]+1,
distance[i][j-1]+1,
distance[i-1][j-1]+((str1[i-1]==str2[j-1])?0:1));
}
}
return distance[str1.length][str2.length];
}
}