package com.livingsocial.hive.udf; import java.nio.charset.Charset; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.Collection; import java.util.HashMap; import java.util.Map; import org.apache.commons.codec.binary.Hex; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDF; import org.apache.hadoop.io.Text; @Description( name = "ls_hash", value = "_FUNC_(some_id, [some_salt, ['debug']]) - Generate a consistent hash based 'random' number between 0 and 1 mixing in the id and salt.", extended = "Generates a consistent 'random' number based on the passed in id and salt. \n" + " Normally this would be used to create a percentage based sample of a group with a query like:\n" + " select * from some_table_to_sample where _FUNC_(id, 'my_salt') < 0.10; -- extract a 10% random sample" ) public final class Hash extends UDF { private static final Text EMPTY = new Text(""); private static final int CHARS_TO_USE = 14; private static final double MAX_SIZE = Math.pow(2, CHARS_TO_USE*4); private Map<String,Object> mapOut = new HashMap<String,Object>(); private Charset charset = Charset.forName("UTF8"); public Double evaluate(final Text id) { return evaluate(id, EMPTY); } public Double evaluate(Text id, Text salt) { Double tmp = (Double) hashIt(id, salt).get("output"); return tmp; } public Text evaluate(Text id, Text salt, Text debug) { Map<String,Object> map = hashIt(id, salt); StringBuilder builder = new StringBuilder(); for(Map.Entry<String,Object> entry: (Collection<Map.Entry<String,Object>>)map.entrySet()) { builder.append(entry.getKey() + "=" + entry.getValue() + ","); } return new Text(builder.toString()); } private Map<String,Object> hashIt(Text id, Text salt) { mapOut.clear(); if (id == null) { return mapOut; } if(salt == null) { salt = EMPTY; } String toHash = id.toString() + salt; try { MessageDigest md = MessageDigest.getInstance("SHA1"); md.update(toHash.getBytes(charset)); byte[] hash = md.digest(); String hexHash = Hex.encodeHexString(hash); String finalHash = hexHash.substring(0, CHARS_TO_USE); long hashNum = Long.parseLong(finalHash, 16); double value = hashNum / MAX_SIZE; // Slightly hacky way to expose the internals of this mapOut.put("hexHash", hexHash); mapOut.put("finalHash", finalHash); mapOut.put("hashNum", hashNum); mapOut.put("toHash", toHash); mapOut.put("output", value); return mapOut; } catch (NoSuchAlgorithmException nsae) { throw new IllegalArgumentException("SHA1 is not setup"); } } }