package brickhouse.udf.sketch; /** * Copyright 2012 Klout, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * **/ import brickhouse.analytics.uniques.SketchSet; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; import org.apache.log4j.Logger; /** * Interpret a list of strings as a sketch_set * and return an estimated reach number */ @Description(name = "estimated_reach", value = "_FUNC_(x) - Estimate reach from a sketch set of Strings. " ) public class EstimatedReachUDF extends GenericUDF { private static final Logger LOG = Logger.getLogger(EstimatedReachUDF.class); private ListObjectInspector listInspector; private PrimitiveObjectInspector elemInspector; private PrimitiveCategory elemCategory; private IntObjectInspector lengthInspector; @Override public Object evaluate(DeferredObject[] arg0) throws HiveException { Object listObj = arg0[0].get(); int maxItems = SketchSet.DEFAULT_MAX_ITEMS; if (arg0.length > 1) { maxItems = lengthInspector.get(arg0[1].get()); } int listLen = listInspector.getListLength(listObj); if (listLen < maxItems) { return (long) listLen; } if (listLen > maxItems) { LOG.warn("estimated_reach: List length " + listLen + " is greater than sketch set Max items " + maxItems); } Object uninspMax = listInspector.getListElement(listObj, maxItems - 1); switch (this.elemCategory) { case STRING: StringObjectInspector strInspector = (StringObjectInspector) elemInspector; String lastItem = strInspector.getPrimitiveJavaObject(uninspMax); double reach = SketchSet.EstimatedReach(lastItem, maxItems); if (reach > listLen) return (long) (reach); else return (long) listLen; case LONG: LongObjectInspector longInspector = (LongObjectInspector) elemInspector; long lastHash = longInspector.get(uninspMax); double reachHash = SketchSet.EstimatedReach(lastHash, maxItems); if (reachHash > listLen) return (long) (reachHash); else return (long) listLen; default: /// should not happen throw new HiveException("Unexpected category type"); } } @Override public String getDisplayString(String[] arg0) { StringBuilder sb = new StringBuilder("estimated_reach( "); for (int i = 0; i < arg0.length - 1; ++i) { sb.append(arg0[i]); sb.append(" , "); } sb.append(arg0[arg0.length - 1]); sb.append(" )"); return sb.toString(); } @Override public ObjectInspector initialize(ObjectInspector[] arg0) throws UDFArgumentException { if (arg0.length != 1 && arg0.length != 2) { throw new UDFArgumentException("estimated_reach takes an array of strings or an array of hashes, and an optional sketch size"); } if (arg0[0].getCategory() != Category.LIST) { throw new UDFArgumentException("estimated_reach takes an array of strings or an array of hashes, and an optional sketch size"); } this.listInspector = (ListObjectInspector) arg0[0]; if (listInspector.getListElementObjectInspector().getCategory() != Category.PRIMITIVE) { throw new UDFArgumentException("estimated_reach takes an array of strings or an array of hashes, and an optional sketch size"); } this.elemInspector = (PrimitiveObjectInspector) listInspector.getListElementObjectInspector(); LOG.info(" Element category is " + this.elemInspector.getCategory()); this.elemCategory = this.elemInspector.getPrimitiveCategory(); if (this.elemCategory != PrimitiveCategory.STRING && this.elemCategory != PrimitiveCategory.LONG) { throw new UDFArgumentException("estimated_reach takes an array of strings or an array of hashes, and an optional sketch size"); } if (arg0.length > 1) { if (!(arg0[1] instanceof IntObjectInspector)) { throw new UDFArgumentException("estimated_reach takes an array of strings or an array of hashes, and an optional sketch size"); } this.lengthInspector = (IntObjectInspector) arg0[1]; } return PrimitiveObjectInspectorFactory.javaLongObjectInspector; } }