package com.facebook.hive.udf.lib;

import org.json.JSONException;
import org.json.JSONObject;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

/**
 * Tallies integer counts per key and tracks the running sum of all amounts added.
 *
 * <p>Inspired by Stanford NLP's Counter classes. (@boconnor)
 *
 * <p>Counts are kept in a {@link HashMap}; {@link #counts} and {@link #totalCount} are public
 * fields, so callers that write to them directly are responsible for keeping the two in step.
 *
 * @param <E> type of the keys being counted
 */
public class Counter<E> {

    /** Per-key counts. */
    public Map<E,Integer> counts;

    /** Sum of every amount passed to {@link #increment(Object, int)} (including via addAll). */
    public int totalCount;

    /** Creates an empty counter with a total of zero. */
    public Counter() {
        counts = new HashMap<E,Integer>();
        totalCount = 0;
    }

    /**
     * Adds {@code amount} to the count stored for {@code obj} and to {@link #totalCount}.
     * A key that has no count yet starts from {@code amount}.
     *
     * @param obj    key whose count is adjusted
     * @param amount value to add; the code places no restriction on its sign
     */
    public void increment(E obj, int amount) {
        // One lookup instead of containsKey + get; a missing (or null) entry counts as absent.
        Integer current = counts.get(obj);
        counts.put(obj, current == null ? amount : current + amount);
        totalCount += amount;
    }

    /**
     * Adds one to the count stored for {@code obj}.
     *
     * @param obj key whose count is incremented
     */
    public void increment(E obj) {
        increment(obj, 1);
    }

    /**
     * Returns the count stored for {@code obj}.
     *
     * @param obj key to look up
     * @return the stored count, or 0 when the key has no count
     */
    public int getCount(E obj) {
        Integer count = counts.get(obj);
        return count == null ? 0 : count;
    }

    /**
     * @return the number of distinct keys held
     */
    public int size() {
        return counts.size();
    }

    /**
     * Adds every count of {@code counter} into this counter, key by key.
     *
     * @param counter source of the counts to merge in
     */
    public void addAll(Counter<E> counter) {
        for (E key : counter.keySet()) {
            increment(key, counter.getCount(key));
        }
    }

    /**
     * @return the key set of the underlying {@link #counts} map
     */
    public Set<E> keySet() {
        return counts.keySet();
    }

    /**
     * Serializes this counter as JSON holding probabilities rather than raw counts, decorated
     * with the total: {@code {"probs":{key: count/totalCount, ...},"totalCount":N}}.
     *
     * <p>Keys are converted with {@code toString()}, so they must be non-null; if keys aren't
     * Strings, distinct keys whose string forms coincide would overwrite each other.
     *
     * <p>When {@link #totalCount} is 0 every probability is written as {@code 0.0}: the
     * division would otherwise produce NaN or an infinity, which {@code JSONObject.put}
     * refuses with a {@link JSONException}.
     *
     * @return the JSON text
     * @throws JSONException if the JSON library rejects a key or value
     */
    public String toNormalizedJSON() throws JSONException {
        JSONObject j = new JSONObject();
        j.put("totalCount", totalCount);
        JSONObject probs = new JSONObject();
        j.put("probs", probs);
        for (E key : keySet()) {
            // Guard the zero-total case so no non-finite number reaches the JSON encoder.
            double prob = (totalCount == 0) ? 0.0 : getCount(key) * 1.0 / totalCount;
            probs.put(key.toString(), prob);
        }
        return j.toString();
    }

    /**
     * Counts the (trimmed) lines read from standard input, then prints the raw counts followed
     * by the normalized JSON. Output should be something like:
     *
     * <pre>
     * % echo a a a a a a b | tr ' ' '\n' | java -cp fb_udf.jar:$HIVE_HOME/lib/json.jar Counter
     * {"b":1,"a":6}
     * {"probs":{"b":0.14285714285714285,"a":0.8571428571428571},"totalCount":7}
     * </pre>
     *
     * @param args unused
     * @throws Exception if reading standard input or building the JSON fails
     */
    public static void main(String args[]) throws Exception {
        InputStreamReader converter = new InputStreamReader(System.in);
        BufferedReader in = new BufferedReader(converter);
        Counter<String> c = new Counter<String>();
        String line;
        while ((line = in.readLine()) != null) {
            c.increment(line.trim());
        }
        System.out.println(new JSONObject(c.counts).toString());
        System.out.println(c.toNormalizedJSON());
    }
}