package com.facebook.hive.udf;
import com.facebook.hive.udf.lib.SetOps;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import java.util.List;
/**
 * Hive UDF exposing the Jaccard similarity of two sets: the size of their
 * intersection divided by the size of their union, |A&B| / |AvB|.
 *
 * <p>Each set is supplied as an array of strings (e.g. the output of COLLECT).
 * Two overloads are available: a plain one that takes the two sets, and a
 * sample-corrected one that additionally takes two integer sizes. All
 * computation is delegated to {@link SetOps}; this class only adapts the
 * calls to Hive's {@code evaluate} convention.
 */
@Description(name = "udfjaccard",
    value = "_FUNC_(array<string> set1, array<string> set2) ... or alternately \n" +
        "_FUNC_(array<string> set1sample, array<string> set2sample, int set1fullsize, int set2fullsize)\n",
    extended = "the sample-corrected version performs what we hope is an unbiased estimate. talk to @boconnor for gory details")
public class UDFJaccard extends UDF {

    /**
     * Computes the Jaccard similarity of two sets by delegating to
     * {@code SetOps.jaccard}.
     *
     * <p>NOTE(review): handling of null or empty inputs is decided entirely by
     * {@code SetOps.jaccard}, which is not visible from this file.
     *
     * @param set1 the first set, as a list of strings
     * @param set2 the second set, as a list of strings
     * @return whatever {@code SetOps.jaccard} yields for the two lists
     */
    public Double evaluate(final List<String> set1, final List<String> set2) {
        final Double similarity = SetOps.jaccard(set1, set2);
        return similarity;
    }

    /**
     * Computes the sample-corrected Jaccard similarity by delegating to
     * {@code SetOps.sampleCorrectedJaccard}.
     *
     * <p>NOTE(review): handling of null inputs and of out-of-range sizes is
     * decided entirely by {@code SetOps.sampleCorrectedJaccard}, which is not
     * visible from this file.
     *
     * @param set1 the sample drawn from the first set
     * @param set2 the sample drawn from the second set
     * @param fullSize1 presumably the size of the full set that {@code set1}
     *     was sampled from (the usage string calls it "set1fullsize")
     * @param fullSize2 presumably the size of the full set that {@code set2}
     *     was sampled from (the usage string calls it "set2fullsize")
     * @return whatever {@code SetOps.sampleCorrectedJaccard} yields for the
     *     given samples and sizes
     */
    public Double evaluate(final List<String> set1, final List<String> set2,
                           final int fullSize1, final int fullSize2) {
        final Double correctedSimilarity =
            SetOps.sampleCorrectedJaccard(set1, set2, fullSize1, fullSize2);
        return correctedSimilarity;
    }
}