package brickhouse.udf.sketch; /** * Copyright 2012 Klout, Inc * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * **/ import brickhouse.analytics.uniques.SketchSet; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentTypeException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.udf.generic.AbstractGenericUDAFResolver; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator; import org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector.Category; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.StandardListObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; import org.apache.log4j.Logger; import org.joda.time.DateTime; import org.joda.time.Days; import org.joda.time.format.DateTimeFormat; import org.joda.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.List; /** * XXX Snarfed from multiday counter ... * TODO write one UDF which can be configured to sketch or count * TODO Generalize to represent other periods besides Days * TODO * XXX Probably needs Const object inspectors * <p/> * Count and count uniques for several day periods * ( i.e produce 1, 7 and 30 counts for various events) * <p>Input is a YYYYMMDD representation of the date counts are being generated, * a date representation of the date associated with the events, * a bigint of the event count for that day period, * an array of uniques for that count (or a sketch set for those uniques), * and an array of ints representing the dates being counted over ( ie. [1,7,30] ). * </p> * <p/> * <p>Output is a array of structs containing the num of days counted, the sum of events * over that date */ @Description(name = "multiday_sketch", value = "_FUNC_(x) - Returns a count of events over several different periods," ) public class MultiDaySketcherUDAF extends AbstractGenericUDAFResolver { private static final Logger LOG = Logger.getLogger(MultiDaySketcherUDAF.class); private static final String SKETCH_FLAG_PROP = "klout.warehouse.multiday_sketch"; public MultiDaySketcherUDAF() { } /** * Parameters are event date, event count, event uniques, asof date, period array , */ @Override public GenericUDAFEvaluator getEvaluator(TypeInfo[] parameters) throws SemanticException { for (int i = 0; i < parameters.length; ++i) { LOG.info("Type " + i + " == " + parameters[i].getTypeName() + " category " + parameters[i].getCategory().name()); } if (parameters.length != 5 && parameters.length != 6) { throw new UDFArgumentTypeException(parameters.length - 1, "multiday_sketch takes date, count, array, date, array "); } if (parameters[0].getCategory() != Category.PRIMITIVE) { throw new UDFArgumentTypeException(parameters.length - 1, "multiday_sketch takes date, count, array, date, array "); } MultiDayAggUDAFEvaluator mdEval = new MultiDayAggUDAFEvaluator(); return mdEval; } public static class MultiDayAggUDAFEvaluator extends GenericUDAFEvaluator { private static DateTimeFormatter yyyymmdd = DateTimeFormat.forPattern("yyyyMMdd"); private Integer[] daysArr; private DateTime asofDate; // For PARTIAL2 and FINAL: ObjectInspectors for partial aggregations (list // of objs) private StandardListObjectInspector internalMergeOI; // For PARTIAL1 and COMPLETE: ObjectInspectors for original data private StringObjectInspector asofInspector; private StringObjectInspector dtInspector; private LongObjectInspector longInspector; private ListObjectInspector uniqInspector; private ListObjectInspector daysArrInspector; static class MultiDaySketchBuffer implements AggregationBuffer { long counts[]; SketchSet[] sketches; } public ObjectInspector init(Mode m, ObjectInspector[] parameters) throws HiveException { super.init(m, parameters); LOG.info(" MODE = " + m.name() + " Num parameters = " + parameters.length); for (int i = 0; i < parameters.length; ++i) { LOG.info(" Parameter [ " + i + " ] == " + parameters[i]); } if (m.equals(Mode.PARTIAL1) || m.equals(Mode.COMPLETE)) { Object firstParam = parameters[0]; if (firstParam instanceof StringObjectInspector) { dtInspector = (StringObjectInspector) parameters[0]; longInspector = (LongObjectInspector) parameters[1]; uniqInspector = (ListObjectInspector) parameters[2]; asofInspector = (StringObjectInspector) parameters[3]; daysArrInspector = (ListObjectInspector) parameters[4]; } //// return a list of list of strings ... //// First string will the the count, rest are the uniques ... ListObjectInspector strListInspector = ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector); ListObjectInspector listInspector = ObjectInspectorFactory.getStandardListObjectInspector(strListInspector); return listInspector; ///} else if( m.equals( Mode.FINAL) || m.equals( Mode.PARTIAL2)) { } else { this.internalMergeOI = (StandardListObjectInspector) parameters[0]; List<String> fieldNames = new ArrayList<String>(); List<ObjectInspector> fieldInspectors = new ArrayList<ObjectInspector>(); fieldNames.add("num_days"); fieldInspectors.add(PrimitiveObjectInspectorFactory.javaIntObjectInspector); fieldNames.add("cnt"); fieldInspectors.add(PrimitiveObjectInspectorFactory.javaLongObjectInspector); fieldNames.add("sketch_sets"); fieldInspectors.add(ObjectInspectorFactory.getStandardListObjectInspector(PrimitiveObjectInspectorFactory.javaStringObjectInspector)); ObjectInspector structType = ObjectInspectorFactory.getStandardStructObjectInspector(fieldNames, fieldInspectors); ObjectInspector retType = ObjectInspectorFactory.getStandardListObjectInspector(structType); return retType; } } private void addMultiDay(MultiDaySketchBuffer mdCounter, DateTime dt, Long cnt, List<Object> uniqs) { for (int i = 0; i < daysArr.length; ++i) { int daysBetween = Days.daysBetween(dt, asofDate).getDays(); ///LOG.info( " DT = "+ dt + " asofDate = " + asofDate + " daysBetween = " + daysBetween); if (daysBetween < (Integer) daysArr[i]) { mdCounter.counts[i] += cnt; ///LOG.info( "Days between = " + daysBetween + " for idx "+ i + " with val " + daysArr[i] + " cnt = " + mdCounter.counts[i] ); for (Object unObj : uniqs) { String uniqStr = ((StringObjectInspector) uniqInspector.getListElementObjectInspector()).getPrimitiveJavaObject(unObj); ///LOG.info( " Adding Unique str " + uniqStr); mdCounter.sketches[i].addItem(uniqStr); } } } } private void setDaysArr(Object obj) { List inspected = this.daysArrInspector.getList(obj); daysArr = new Integer[inspected.size()]; int idx = 0; for (Object elem : inspected) { daysArr[idx++] = (Integer) ((IntObjectInspector) daysArrInspector.getListElementObjectInspector()).getPrimitiveJavaObject(elem); } } private void setAsofDate(Object obj) { String str = asofInspector.getPrimitiveJavaObject(obj); asofDate = getDateTime(str); } private DateTime getDateTime(String str) { DateTime dt = yyyymmdd.parseDateTime(str); return dt; } private long getLong(Object obj) { return longInspector.get(obj); } private List getList(Object obj) { return this.uniqInspector.getList(obj); } @Override public AggregationBuffer getNewAggregationBuffer() throws HiveException { AggregationBuffer buff = new MultiDaySketchBuffer(); reset(buff); return buff; } @Override public void iterate(AggregationBuffer agg, Object[] parameters) throws HiveException { if (daysArr == null) { setDaysArr(parameters[4]); reset(agg); } if (asofDate == null) { setAsofDate(parameters[3]); } MultiDaySketchBuffer myagg = (MultiDaySketchBuffer) agg; DateTime dt = getDateTime(dtInspector.getPrimitiveJavaObject(parameters[0])); long cnt = getLong(parameters[1]); List<Object> uniqList = getList(parameters[2]); addMultiDay(myagg, dt, cnt, uniqList); } @Override public void merge(AggregationBuffer agg, Object partial) throws HiveException { ////LOG.info(" MERGE IS CALLED partial is " + partial + " AGG is " + agg); List partialResultList = internalMergeOI.getList(partial); if (daysArr == null) { daysArr = new Integer[partialResultList.size()]; } MultiDaySketchBuffer myagg = (MultiDaySketchBuffer) agg; if (myagg.counts == null) { reset(myagg); } ListObjectInspector subListInspector = (ListObjectInspector) internalMergeOI.getListElementObjectInspector(); StringObjectInspector strInspector = (StringObjectInspector) subListInspector.getListElementObjectInspector(); int idx = 0; for (Object strListObj : partialResultList) { List strList = subListInspector.getList(strListObj); String numDaysStr = strInspector.getPrimitiveJavaObject(strList.get(0)); daysArr[idx] = Integer.decode(numDaysStr); ///LOG.info(" numDays = " + numDaysStr); String cntStr = strInspector.getPrimitiveJavaObject(strList.get(1)); ///LOG.info(" Count Strr = " + cntStr); Long cnt = Long.decode(cntStr); myagg.counts[idx] += cnt; for (int j = 2; j < strList.size(); ++j) { String uniqStr = strInspector.getPrimitiveJavaObject(strList.get(j)); myagg.sketches[idx].addItem(uniqStr); } idx++; } } @Override public void reset(AggregationBuffer buff) throws HiveException { MultiDaySketchBuffer countBuff = (MultiDaySketchBuffer) buff; if (daysArr != null) { countBuff.counts = new long[daysArr.length]; countBuff.sketches = new SketchSet[daysArr.length]; for (int i = 0; i < countBuff.sketches.length; ++i) countBuff.sketches[i] = new SketchSet(); } } @Override public Object terminate(AggregationBuffer agg) throws HiveException { ////LOG.info( "Terminate " + agg); MultiDaySketchBuffer myagg = (MultiDaySketchBuffer) agg; List<List> ret = new ArrayList<List>(); for (int i = 0; i < daysArr.length; ++i) { ArrayList structArr = new ArrayList(); structArr.add(daysArr[i]); /// num_days structArr.add(myagg.counts[i]); List<String> sketchList = myagg.sketches[i].getMinHashItems(); structArr.add(sketchList); ret.add(structArr); } return ret; } @Override public Object terminatePartial(AggregationBuffer agg) throws HiveException { ///LOG.info( "Terminate partial " + agg); MultiDaySketchBuffer myagg = (MultiDaySketchBuffer) agg; List<List> ret = new ArrayList<List>(); for (int i = 0; i < daysArr.length; ++i) { ArrayList strList = new ArrayList(); strList.add(Integer.toString(daysArr[i])); strList.add(Long.toString(myagg.counts[i])); List<String> itemList = myagg.sketches[i].getMinHashItems(); for (String minHashItem : itemList) { strList.add(minHashItem); //// XXX TODO for sketch sets, pass the hash as well ... } ret.add(strList); } return ret; } } }