package brickhouse.udf.sketch;
/**
* Copyright 2012 Klout, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**/
import brickhouse.analytics.uniques.SketchSet;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ConstantObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
/**
 * UDF to combine two or more sketch sets, to estimate size of set union.
 * <p/>
 * Sketch sets can be either the set of original strings or the
 * MD5 hashes. If {@code array<string>} is passed in, it is assumed to be
 * the original sketch_set values; if {@code array<bigint>} is used, then
 * it is assumed to be the KMin hash values created with sketch_values.
 * <p/>
 * All array arguments must share the same element type. An optional trailing
 * constant {@code int} argument overrides the size of the resulting sketch;
 * when it is absent {@code SketchSetUDAF.DEFAULT_SKETCH_SET_SIZE} is used.
 */
@Description(name = "combine_sketch",
        value = "_FUNC_(x) - Combine two sketch sets. "
)
public class CombineSketchUDF extends GenericUDF {
    /** Message used for every argument-shape violation detected in {@link #initialize}. */
    private static final String USAGE =
            "combine_sketch takes at least two arguments; a set of array<string> or a set of array<bigint>";

    /** One inspector per array argument (the optional trailing size argument is not included). */
    private ListObjectInspector listInspectors[];
    /** Element type shared by all array arguments; either STRING or LONG after initialize. */
    private PrimitiveCategory elemCategory;
    /** Size passed to the {@link SketchSet} constructor for each evaluated row. */
    private int sketchSetSize = SketchSetUDAF.DEFAULT_SKETCH_SET_SIZE;

    /**
     * Adds every element of every array argument to a fresh {@link SketchSet}
     * and returns the combined sketch.
     * <p/>
     * Null arrays and null elements are skipped rather than passed on to the sketch.
     *
     * @param arg0 the deferred row arguments, in the order seen by {@link #initialize}
     * @return {@code ss.getMinHashItems()} for string sketches,
     *         {@code ss.getMinHashes()} for bigint sketches
     * @throws HiveException if an argument cannot be evaluated, or the element
     *                       category is neither STRING nor LONG
     */
    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {
        SketchSet ss = new SketchSet(sketchSetSize);
        // Iterate over the list inspectors, not arg0: a trailing size constant
        // is part of arg0 but is not a list and has no inspector here.
        for (int i = 0; i < listInspectors.length; ++i) {
            Object listObj = arg0[i].get();
            if (listObj == null) {
                continue;
            }
            ListObjectInspector listInspector = listInspectors[i];
            // The element inspector is fixed per argument, so look it up once per list.
            ObjectInspector elemInspector = listInspector.getListElementObjectInspector();
            int listLen = listInspector.getListLength(listObj);
            for (int j = 0; j < listLen; ++j) {
                Object uninspObj = listInspector.getListElement(listObj, j);
                if (uninspObj == null) {
                    continue;
                }
                switch (elemCategory) {
                    case STRING:
                        String item = ((StringObjectInspector) elemInspector).getPrimitiveJavaObject(uninspObj);
                        if (item != null) {
                            ss.addItem(item);
                        }
                        break;
                    case LONG:
                        long itemHash = ((LongObjectInspector) elemInspector).get(uninspObj);
                        ss.addHash(itemHash);
                        break;
                    default:
                        // Rejected by initialize; reported after the loop.
                        break;
                }
            }
        }
        switch (elemCategory) {
            case STRING:
                return ss.getMinHashItems();
            case LONG:
                return ss.getMinHashes();
            default:
                /// will never happen
                throw new HiveException("Unexpected Element Category " + elemCategory);
        }
    }

    @Override
    public String getDisplayString(String[] arg0) {
        return "combine_sketch";
    }

    /**
     * Validates the argument types and records the inspectors needed by {@link #evaluate}.
     * <p/>
     * Accepted shapes are {@code (array<T>, array<T>, ...)} and
     * {@code (array<T>, ..., <constant int>)} where {@code T} is string or bigint.
     *
     * @param arg0 inspectors for the call-site arguments
     * @return a standard list inspector over Java objects of the shared element type
     * @throws UDFArgumentException if fewer than two arguments are given, an array
     *                              argument is not a list of string/bigint, the element
     *                              types differ, or the size argument is not a positive
     *                              constant integer
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] arg0)
            throws UDFArgumentException {
        if (arg0.length < 2) {
            throw new UDFArgumentException(USAGE);
        }

        // An INT in the last position is the optional sketch size, not a sketch.
        int numLists = arg0.length;
        ObjectInspector lastInspector = arg0[arg0.length - 1];
        if (lastInspector.getCategory() == Category.PRIMITIVE
                && ((PrimitiveObjectInspector) lastInspector).getPrimitiveCategory() == PrimitiveCategory.INT) {
            if (!(lastInspector instanceof ConstantObjectInspector)) {
                throw new UDFArgumentException("Sketch set size must be a constant integer");
            }
            Object writableSize = ((ConstantObjectInspector) lastInspector).getWritableConstantValue();
            Object javaSize = ((PrimitiveObjectInspector) lastInspector).getPrimitiveJavaObject(writableSize);
            // A NULL constant yields null here and is rejected together with non-positive sizes.
            if (!(javaSize instanceof Number) || ((Number) javaSize).intValue() <= 0) {
                throw new UDFArgumentException("Sketch set size must be a positive integer");
            }
            this.sketchSetSize = ((Number) javaSize).intValue();
            numLists = arg0.length - 1;
        }

        this.listInspectors = new ListObjectInspector[numLists];
        for (int i = 0; i < numLists; ++i) {
            if (arg0[i].getCategory() != Category.LIST) {
                throw new UDFArgumentException(USAGE);
            }
            ListObjectInspector listInspector = (ListObjectInspector) arg0[i];
            ObjectInspector elemInspector = listInspector.getListElementObjectInspector();
            // Check the category before casting so nested types fail with a usage error.
            if (elemInspector.getCategory() != Category.PRIMITIVE) {
                throw new UDFArgumentException(USAGE);
            }
            PrimitiveCategory category = ((PrimitiveObjectInspector) elemInspector).getPrimitiveCategory();
            if (i == 0) {
                // The first array fixes the element type for the whole call.
                if (category != PrimitiveCategory.STRING && category != PrimitiveCategory.LONG) {
                    throw new UDFArgumentException(USAGE);
                }
                this.elemCategory = category;
            } else if (category != this.elemCategory) {
                // Every later array must match the element type of the first one.
                throw new UDFArgumentException(USAGE);
            }
            this.listInspectors[i] = listInspector;
        }

        return ObjectInspectorFactory.getStandardListObjectInspector(
                PrimitiveObjectInspectorFactory.getPrimitiveJavaObjectInspector(elemCategory));
    }
}