package brickhouse.hbase;

import brickhouse.udf.json.InspectorHandle;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.CacheLoader;
import com.google.common.util.concurrent.UncheckedExecutionException;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDFArgumentException;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.ql.udf.generic.GenericUDF;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
import org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector;
import org.apache.log4j.Logger;
import org.codehaus.jackson.JsonNode;
import org.codehaus.jackson.map.ObjectMapper;

import java.util.Map;
import java.util.NoSuchElementException;
import java.util.concurrent.Callable;

/**
 * Load data from HBase, and cache it locally in memory for faster access.
 * <p/>
 * Similar to using a distributed map, except the values are stored in HBase,
 * and can be sharded across multiple nodes, so one can process elements which
 * wouldn't fit into memory on a single node.
 * <p/>
 * This may be useful in situations where you would potentially have a Cartesian
 * product (Bayesian topic assignment, similarity clustering), and would
 * want to avoid an extra join.
 * <p/>
 * One can cache strings, or arbitrary Hive structures, by storing values as
 * JSON strings, and using a template object similar to the one used in
 * the from_json UDF. An example would be storing a map<string,double> as
 * a bag-of-words, or an array<string> to store a sketch-set.
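 * <p/>
 * An illustrative invocation (the table name, column names, config-map keys,
 * and template literal below are hypothetical examples, not fixed by this UDF;
 * the exact config keys are defined by HTableFactory):
 * <pre>
 *   SELECT hbase_cached_get(
 *            map( 'table_name', 'user_features',
 *                 'family',     'f',
 *                 'qualifier',  'q',
 *                 'hbase.zookeeper.quorum', 'zk1,zk2,zk3' ),
 *            user_id,
 *            map( 'word', 0.0 ) )   -- template: interpret JSON as map<string,double>
 *   FROM users;
 * </pre>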
 */
@Description(name = "hbase_cached_get",
        value = "_FUNC_(configMap,key,template) - Returns a cached object, given an HBase config, a key, and a template object used to interpret JSON"
)
public class CachedGetUDF extends GenericUDF {
    private static final Logger LOG = Logger.getLogger(CachedGetUDF.class);
    // Per-task, in-memory cache of values already fetched from HBase.
    private Cache<String, Object> cache;
    private Map<String, String> configMap;
    private StringObjectInspector strInspector;
    private InspectorHandle jsonInspectorHandle;

    @Override
    public Object evaluate(DeferredObject[] arg0) throws HiveException {
        return getValue(strInspector.getPrimitiveJavaObject(arg0[1].get()));
    }

    // Fetches and parses a single value from HBase. Used both as the cache's
    // loader and from the explicit Callable in getValue(), so that cache
    // misses can be counted.
    private final CacheLoader<String, Object> valueLoader = new CacheLoader<String, Object>() {
        @Override
        public Object load(String key) throws Exception {
            String jsonString = loadString(key);
            if ((++numLoaded % 1000) == 0) {
                LOG.info(" loaded " + numLoaded + " records; Key = " + key + " json =" + jsonString);
            }
            if (jsonInspectorHandle != null) {
                // A template was supplied; parse the value as JSON.
                ObjectMapper jacksonParser = new ObjectMapper();
                JsonNode jsonNode = jacksonParser.readTree(jsonString);
                return jsonInspectorHandle.parseJson(jsonNode);
            } else {
                return jsonString;
            }
        }

        public String loadString(String key) throws Exception {
            Get keyGet = new Get(key.getBytes());
            HTable htable = HTableFactory.getHTable(configMap);
            Result res = htable.get(keyGet);
            KeyValue kv = res.getColumnLatest(configMap.get(HTableFactory.FAMILY_TAG).getBytes(),
                    configMap.get(HTableFactory.QUALIFIER_TAG).getBytes());
            if (kv == null) {
                throw new NoSuchElementException("No value found for " + key);
            }
            byte[] bytes = kv.getValue();
            String jsonStr = new String(bytes);
            return jsonStr;
        }
    };

    // Counters for periodic progress logging.
    private int numLoaded = 0;
    private int numCalls = 0;
    private int numMisses = 0;
    private int numErrors = 0;

    public Object getValue(final String key) {
        try {
            ++numCalls;
            Object l = cache.get(key, new Callable<Object>() {
                @Override
                public Object call() throws Exception {
                    ++numMisses;
                    return valueLoader.load(key);
                }
            });
            if (((numCalls - numMisses) % 1000) == 0) {
                LOG.info("Retrieved " + (numCalls - numMisses) + " features key = " + key + " Num misses =" + numMisses);
            }
            return l;
        } catch (UncheckedExecutionException e) {
            // A missing row surfaces here, wrapping the NoSuchElementException
            // thrown by loadString().
            LOG.error("Error while retrieving value for key " + key, e);
            if ((++numErrors % 1000) == 0) {
                LOG.info("Num Errors = " + numErrors + "; Missed " + numMisses + " features key = " + key + " Num hits = " + (numCalls - numMisses));
            }
            return null;
        } catch (Exception unexpected) {
            LOG.error("Error while retrieving value for key " + key, unexpected);
            if ((++numErrors % 1000) == 0) {
                LOG.info("Num Errors = " + numErrors + "; Missed " + numMisses + " features key = " + key + " Num hits = " + (numCalls - numMisses));
            }
            return null;
        }
    }

    @Override
    public String getDisplayString(String[] arg0) {
        return "hbase_cached_get(" + arg0[0] + " , " + arg0[1] + ")";
    }
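    /*
     * Note: initialize() below builds an unbounded cache. If executor memory
     * is a concern, a bounded cache is a straightforward change; a minimal
     * sketch using standard Guava CacheBuilder options (the size and expiry
     * values are arbitrary examples, and expireAfterAccess would also require
     * importing java.util.concurrent.TimeUnit):
     *
     *   this.cache = CacheBuilder.newBuilder()
     *           .maximumSize(100000)
     *           .expireAfterAccess(1, TimeUnit.HOURS)
     *           .build(valueLoader);
     */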
    /**
     * The user should pass in a constant map of HBase parameters,
     * the string key to look up, and optionally a template object
     * whose type is used to interpret stored JSON values.
     */
    @Override
    public ObjectInspector initialize(ObjectInspector[] parameters) throws UDFArgumentException {
        this.configMap = HTableFactory.getConfigFromConstMapInspector(parameters[0]);
        this.strInspector = (StringObjectInspector) parameters[1];
        this.cache = CacheBuilder.newBuilder().build(valueLoader);

        // If a third parameter is passed in, use it as a template to parse
        // the stored value as JSON; otherwise return the raw string.
        if (parameters.length > 2) {
            jsonInspectorHandle = InspectorHandle.InspectorHandleFactory.GenerateInspectorHandle(parameters[2]);
            return jsonInspectorHandle.getReturnType();
        } else {
            return PrimitiveObjectInspectorFactory.javaStringObjectInspector;
        }
    }
}