/* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package com.addthis.hydra.data.query.op; import java.util.Arrays; import java.util.HashMap; import java.util.Map; import java.util.TreeMap; import com.addthis.bundle.core.Bundle; import com.addthis.bundle.table.DataTable; import com.addthis.bundle.table.DataTableFactory; import com.addthis.bundle.util.BundleColumnBinder; import com.addthis.bundle.util.ValueUtil; import com.addthis.bundle.value.ValueFactory; import com.addthis.hydra.data.query.AbstractTableOp; import io.netty.channel.ChannelProgressivePromise; /** * <p>This query operation <span class="hydra-summary">calculates disorder values</span>. * <p/> * <p>Table-level operation that involves a primary key, a secondary key, and * optionally a frequency column. For each primary key, computes how "disorderly" the * set of secondary keys is. If the frequency column is omitted then each row is * assumed to have a weight of 1. * <p/> * <pre>a b 1 * a c 2 * a d 1 * b x 2 * b y 3 * b d 1 * <p/> * disorder=0:1:2 * <p/> * a 0.451544993496 0.625 * b 0.439247291136 0.611111111111</pre> * * @user-reference * @hydra-name disorder */ public class OpDisorder extends AbstractTableOp { public static final Long ONE = Long.valueOf(1); private int primary; private int secondary; private int frequency; public OpDisorder(DataTableFactory tableFactory, String args, ChannelProgressivePromise queryPromise) { super(tableFactory, queryPromise); String[] split = args.split(":"); if (split.length < 2 || split.length > 3) { throw new IllegalArgumentException("expected disorder=p:s[:f], got " + args); } primary = Integer.parseInt(split[0]); secondary = Integer.parseInt(split[1]); if (split.length == 3) { frequency = Integer.parseInt(split[2]); } else { frequency = -1; } } @Override public DataTable tableOp(DataTable input) { int max = Math.max(primary, Math.max(secondary, frequency)); Map<String, Map<String, Long>> data = new TreeMap<>(); BundleColumnBinder binder = getSourceColumnBinder(input); for (Bundle row : input) { if (row.getCount() < max) { continue; } String p = binder.getColumn(row, primary).toString(); if (p == null) { p = ""; } String s = binder.getColumn(row, secondary).toString(); if (s == null) { s = ""; } Long f = frequency < 0 ? ONE : ValueUtil.asNumberOrParse(binder.getColumn(row, frequency)).asLong().getLong(); if (f == null || f.longValue() <= 0) { continue; } bump(data, p, s, f); } DataTable output = createTable(data.size()); for (String key : data.keySet()) { Bundle row = output.createBundle(); binder.appendColumn(row, ValueFactory.create(key)); for (double d : computeDisorder(data.get(key))) { binder.appendColumn(row, ValueFactory.create(d)); } } return output; } // data[p][s] += f protected static void bump(Map<String, Map<String, Long>> data, String p, String s, long f) { Map<String, Long> m = data.get(p); if (m == null) { data.put(p, m = new HashMap<>()); } if (m.containsKey(s)) { m.put(s, Long.valueOf(f + m.get(s).longValue())); } else { m.put(s, Long.valueOf(f)); } } public static double[] computeDisorder(Map<String, Long> data) { double sum = 0.0; double ent = 0.0; double gin = 0.0; for (String k : data.keySet()) { sum += data.get(k).doubleValue(); } for (String k : data.keySet()) { double prk = data.get(k).doubleValue() / sum; ent += (prk * Math.log10(prk)); for (String k2 : data.keySet()) { if (k != k2) { double prk2 = data.get(k2).doubleValue() / sum; gin += prk * prk2; } } } return new double[]{-1.0 * ent, gin}; } public static void main(String[] args) throws Exception { Map<String, Long> data = new HashMap<>(); data.put("a", 1L); data.put("b", 2L); data.put("c", 2L); System.err.println(data); System.err.println(Arrays.toString(computeDisorder(data))); System.err.println(); data = new HashMap<>(); data.put("a", 1L); data.put("b", 2L); data.put("c", 3L); System.err.println(data); System.err.println(Arrays.toString(computeDisorder(data))); System.err.println(); } }