package com.facebook.hive.udf;

import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;

import java.util.ArrayList;
import java.util.HashMap;

/**
 * ArrayCountOverlap counts how many items in one array are also in another
 * array. Duplicates are matched one-for-one, so the result is the size of the
 * multiset intersection of the two arrays (and is therefore the same whichever
 * array is passed first).
 *
 * Note that since NULL means "unknown," an unknown thing in one array does
 * not count as "found" in a second array just because arr2 has some other
 * unknown thing in it. NULL elements are simply skipped.
 *
 * If either array is empty, overlap is 0. If either array itself is NULL,
 * the overlap is unknown and NULL is returned.
 */
@Description(name = "udfarraycountoverlap",
             value = "_FUNC_(array1, array2) - Counts how many items in array1 are also in array2")
public class UDFArrayCountOverlap extends UDF {

  /** Largest initial capacity ever requested from HashMap (its documented maximum table size). */
  private static final int MAX_MAP_CAPACITY = 1 << 30;

  /**
   * Counts the overlapping items of the two arrays.
   *
   * @param arr1 first array; may be null or contain null elements
   * @param arr2 second array; may be null or contain null elements
   * @return the number of non-null items matched one-for-one between the two
   *         arrays, or null when either array is null
   */
  public Integer evaluate(ArrayList<String> arr1, ArrayList<String> arr2) {
    if (arr1 == null || arr2 == null) {
      return null;
    }
    // The count is symmetric, so always build the lookup map from the smaller
    // array to keep the map (and its memory footprint) as small as possible.
    if (arr1.size() > arr2.size()) {
      return evaluate2(arr2, arr1);
    }
    return evaluate2(arr1, arr2);
  }

  /**
   * Counts the overlap by indexing {@code arr1} in a hash map and probing it
   * with every item of {@code arr2}. Kept public for backward compatibility.
   *
   * @param arr1 array that is loaded into the map (ideally the smaller one)
   * @param arr2 array whose items are looked up in the map
   * @return the number of non-null items matched one-for-one between the two
   *         arrays, or null when either array is null
   */
  public Integer evaluate2(ArrayList<String> arr1, ArrayList<String> arr2) {
    if (arr1 == null || arr2 == null) {
      return null;
    }

    // Size the map for the number of keys it can actually hold (default load
    // factor 0.75) so it never rehashes, without reserving far more buckets
    // than there are items. The arithmetic is done in long to avoid overflow.
    int capacity = (int) Math.min((long) arr1.size() * 4 / 3 + 1, MAX_MAP_CAPACITY);
    HashMap<String, Integer> remaining = new HashMap<String, Integer>(capacity);

    // Record how many times each non-null item occurs in arr1.
    for (String key : arr1) {
      if (key != null) {
        Integer seen = remaining.get(key);
        remaining.put(key, seen == null ? 1 : seen + 1);
      }
    }

    // Each item of arr2 consumes one outstanding occurrence from arr1, so a
    // duplicate is only counted as often as it appears in both arrays.
    int result = 0;
    for (String key : arr2) {
      if (key == null) {
        continue;
      }
      Integer left = remaining.get(key);
      if (left != null && left > 0) {
        result++;
        remaining.put(key, left - 1);
      }
    }
    return result;
  }
}