package com.livingsocial.hive.udf; import javax.script.Invocable; import org.apache.hadoop.hive.ql.exec.Description; import org.apache.hadoop.hive.ql.exec.UDFArgumentException; import org.apache.hadoop.hive.ql.metadata.HiveException; import org.apache.hadoop.hive.ql.parse.SemanticException; import org.apache.hadoop.hive.ql.udf.UDFType; import org.apache.hadoop.hive.ql.udf.generic.GenericUDF; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils; import com.livingsocial.hive.utils.ScriptingHelper; @UDFType(deterministic = false, stateful = true) @Description(name = "scriptedUDF", value = "_FUNC_(script_to_run, language, return_type, script_arg1, script_arg_2, ....) " + "- Returns the specified return_type (hive style types) from the evaluate function of the script.", extended = "Function descriptions in the script:\n" + " evaluate receives all the extra script_arguments passed in the _FUNC_ call and returns an object adhering to the defined return_type \n" + "\nLanguage is the javax.script engine name. Additional languages can be added by adding the jar implementing the scripting engine ('add jar groovy-all.jar;' or similar)\n" + "Return_type is a hive style data definition ('string', 'bigint', 'array<map<string,string>>', ...) \n\n" + "Example:\n > -- Gather complex data combining groups and individual rows without joins \n" + " select person_id, purchase_data['time'], purchase_data['diff'], \n" + " purchase_data['product'], purchase_data['purchase_count'] as pc,\n" + " purchase_data['blah']\n" + " from (\n" + " select person_id, scriptedUDF('\n" + " require \"json\"\n" + " def evaluate(data)\n" + " # This gathers all the data about purchases by person in one place so complex infromation can be gathered while avoiding complex joins \n" + " # Note: In order for this to work all the data passed into _FUNC_ for a row needs to fit into memory \n" + " tmp = [] # convert things over to a ruby array\n" + " tmp.concat(data)\n" + " tmp.sort_by! { |a| a.get(\"time\") } # for the time differences\n" + " last=0\n" + " tmp.map{ |row| \n" + " # Compute the time difference between purchases and add the total purchase count per person\n" + " t = row[\"time\"] \n" + " \n" + " # The parts that would be much more difficult to generate with SQL \n" + " row[\"diff\"] = t - last\n" + " row[\"purchase_count\"] = tmp.length\n" + " row[\"first_purchase\"] = tmp[0][\"time\"]\n" + " row[\"last_purchase\"] = tmp[-1][\"time\"]\n" + " \n" + " # This shows that built-in libraries are available\n" + " row[\"blah\"] = JSON.generate({\"id\" => row[\"id\"]})\n" + " last = t\n" + " row\n" + " }\n" + " end', 'ruby', 'array<map<string,string>>', \n" + " -- gather all the data about purchases by people so it can all be passed into the evaluate function \n" + " bh_collect(map( -- Note, bh_collect is from Klouts Brickhouse and allows collecting any type, see https://github.com/klout/brickhouse/ \n" + " 'time', unix_liberal_timestamp(purchase_time), \n" + " 'product', product_id)) ) as all_data \n" + " from purchases\n" + " group by person_id\n" + " ) foo \n" + " -- explode the data back out so it is available in flattened form \n" + " lateral view explode(all_data) bar as purchase_data \n" + "\n" + "\nAlternate syntax:\n> SELECT _FUNC_('/my_scripts/reusable.rb', 'ruby', 'map<string,int>', val1, val2) FROM src_table; \n" + " this will load the script from the location in HDFS and will invoke the evaluate function. This function needs to return a map of strings keys and int values. ") public class ScriptedUDF extends GenericUDF { private ScriptingHelper.InitializationContainer initData; private Invocable engine; @Override public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { try { initData = ScriptingHelper.initialize(arguments); } catch (SemanticException e) { throw new UDFArgumentException(e); } return initData.returnOIResolver.get(); } @Override public Object evaluate(DeferredObject[] arguments) throws HiveException { if (engine == null) engine = ScriptingHelper.initializeEngine(initData.language, initData.script); Object[] args = new Object[arguments.length - initData.argOffset]; for (int i = 0; i < args.length; i++) { args[i] = ObjectInspectorUtils.copyToStandardJavaObject(arguments[i+initData.argOffset].get(), initData.argumentOIs[i]); } Object out; try { out = engine.invokeFunction("evaluate", args); } catch (Exception e) { throw new HiveException("Error invoking the evaluate function", e); } return initData.returnOIResolver.convertIfNecessary(out, initData.outputOi); } @Override public String getDisplayString(String[] children) { StringBuilder sb = new StringBuilder(); sb.append("scriptedUDF("); for (int i = 0; i < children.length; i++) { if ( i != 0 ) sb.append(", "); sb.append(children); } sb.append(")"); return sb.toString(); } }