package org.notmysock.hive; import hyperloglog.HyperLogLog; import hyperloglog.HyperLogLogUtils; import hyperloglog.HyperLogLog.HyperLogLogBuilder; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; import org.notmysock.hive.UDAFHyperLogLog.HyperLogLogBuffer; public class UDFHyperLogLogUnion extends UDF { public BytesWritable evaluate(BytesWritable... args) throws HiveException { HyperLogLog hll = null; for (BytesWritable bw : args) { if (bw != null) { HyperLogLog hll2 = deserialize(bw); if(hll == null) { hll = hll2; } else { hll.merge(hll2); } } } return serialize(hll); } private final BytesWritable serialize(HyperLogLog hll) throws HiveException { if(hll == null) { return null; } ByteArrayOutputStream output = new ByteArrayOutputStream(); output.reset(); try { HyperLogLogUtils.serializeHLL(output, hll); } catch(IOException ioe) { throw new HiveException(ioe); } return new BytesWritable(output.toByteArray()); } private final HyperLogLog deserialize(BytesWritable bw) throws HiveException { try { ByteArrayInputStream input = new ByteArrayInputStream(bw.getBytes(), 0, bw.getLength()); return HyperLogLogUtils.deserializeHLL(input); } catch (IOException ioe) { throw new HiveException(ioe); } } }