/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.addthis.hydra.data.tree.prop;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import com.addthis.basis.util.LessStrings;
import com.addthis.bundle.core.Bundle;
import com.addthis.bundle.core.BundleField;
import com.addthis.bundle.util.ValueUtil;
import com.addthis.bundle.value.ValueArray;
import com.addthis.bundle.value.ValueFactory;
import com.addthis.bundle.value.ValueMap;
import com.addthis.bundle.value.ValueMapEntry;
import com.addthis.bundle.value.ValueObject;
import com.addthis.codec.annotations.FieldConfig;
import com.addthis.codec.codables.SuperCodable;
import com.addthis.hydra.data.tree.concurrent.ConcurrentTreeNode;
import com.addthis.hydra.data.tree.DataTreeNode;
import com.addthis.hydra.data.tree.DataTreeNodeUpdater;
import com.addthis.hydra.data.tree.TreeDataParameters;
import com.addthis.hydra.data.tree.TreeNodeData;
import com.addthis.hydra.store.util.SeenFilterBasic;

/**
 * Tree data attachment that keeps a list of {@link Sieve} layers, each layer being a
 * stack of {@link SeenFilterBasic} filters. Only the most recently added layer receives
 * updates; older layers are retained for queries.
 */
public final class DataKeySieve2 extends TreeNodeData<DataKeySieve2.Config> implements SuperCodable {

    /** Default saturation threshold, read once from the "datakeysieve2.saturation" system property. */
    private static final int targetSaturation =
            Integer.parseInt(System.getProperty("datakeysieve2.saturation", "20"));

    /**
     * This data attachment <span class="hydra-summary">keeps a sieve of encountered values</span>.
     * <p/>
     * <p>The sieve of encountered values is represented as a stack of bloom filters.
     * If any of the bloom filters in the current stack exceeds the target saturation value then
     * a new stack of bloom filters is created. The data attachment stores a list of
     * stacks of bloom filters because we cannot transfer elements from
     * the old stack of bloom filters to the new stack of bloom filters.</p>
     * <p/>
     * <p>Each stack of bloom filters is referred to as a single layer of bloom filters.
     * The number of bloom filters within each layer is equal to the number of bloom filters
     * specified in the {@link #tiers} field.</p>
     * <p/>
     * <p>Job Configuration Example:</p>
     * <pre>
     * {const:"0", data.sieve.key-sieve2 {
     *     key:"TERMS"
     *     saturation:25
     *     tiers: [
     *         {bits : 4000000, bitsper : 4, hash : 4}
     *         {bits : 1000000, bitsper : 4, hash : 4}
     *         {bits : 250000, bitsper : 4, hash : 4}
     *         {bits : 125000, bitsper : 4, hash : 4}
     *     ]
     * }}
     * </pre>
     *
     * <p><b>Query Path Directives</b>
     *
     * <p>"$" operations support the following commands in the format
     * $+{attachment}={command} :
     * <table>
     * <tr>
     * <td>layers</td>
     * <td>number of layers in the data attachment</td>
     * </tr>
     * <tr>
     * <td>satmax</td>
     * <td>maximum saturation across all layers</td>
     * </tr>
     * <tr>
     * <td>ram</td>
     * <td>number of bytes used by the data attachment</td>
     * </tr>
     * </table>
     *
     * <p>"%" operations support the following commands in the format /+%{attachment}={command}.
     *
     * <table>
     * <tr>
     * <td>"name1,name2,name3"</td>
     * <td>create virtual nodes using the keys specified in the command</td>
     * </tr>
     * </table>
     *
     * <p>Using "%" without any arguments returns one node per bloom filter
     * or N * M virtual nodes when there are N layers and
     * M bloom filters inside each layer.</p>
     *
     * @user-reference
     */
    public static final class Config extends TreeDataParameters<DataKeySieve2> {

        /**
         * Bundle field name from which to draw values.
         * This field is required.
         */
        @FieldConfig(codable = true, required = true)
        private String key;

        /**
         * Stack of initial bloom filters.
         * The filters are specified from the lowest level
         * to the highest level.
         * This field is required.
         */
        @FieldConfig(codable = true, required = true)
        private SeenFilterBasic<String>[] tiers;

        /**
         * Default is either System property "datakeysieve2.saturation" or 20.
         */
        @FieldConfig(codable = true)
        private int saturation = targetSaturation;

        /**
         * Creates a fresh attachment that uses {@link #tiers} as its template and
         * already contains one (empty) layer.
         */
        @Override
        public DataKeySieve2 newInstance() {
            DataKeySieve2 top = new DataKeySieve2();
            top.layers = new ArrayList<>();
            top.saturation = saturation;
            top.template = tiers;
            top.addLayer();
            return top;
        }
    }

    /** All layers created so far, oldest first; the last element is {@link #current}. */
    @FieldConfig(codable = true, required = true)
    private ArrayList<Sieve> layers;

    /** Saturation threshold copied from the configuration when the attachment is created. */
    @FieldConfig(codable = true)
    private int saturation;

    /** Filters used as prototypes when a new layer is created; not encoded. */
    private SeenFilterBasic<String>[] template;

    /** Lazily resolved accessor for the configured key field. */
    private BundleField keyAccess;

    /** The layer that currently receives updates. */
    private Sieve current;

    /**
     * Appends a new layer and makes it {@link #current}.
     *
     * <p>When a previous layer exists, each new filter is sized from the filter at the same
     * position in that layer: 1.5x its bits when it reached the saturation threshold,
     * otherwise its bits scaled by max(0.75 * threshold, its saturation) / threshold.
     * A computed size of 32 or less falls back to the template filter's size.
     */
    private void addLayer() {
        int[] newbits = new int[template.length];
        if (current != null) {
            for (int idx = 0; idx < current.tiers.length; idx++) {
                SeenFilterBasic<String> filter = current.tiers[idx];
                if (filter.getSaturation() >= saturation) {
                    // grow filters that filled up
                    newbits[idx] = (int) (filter.getBits() * 1.5);
                } else {
                    // shrink under-used filters, but never below 75% of their size
                    double minsat = Math.max(saturation * 0.75d, (double) filter.getSaturation());
                    newbits[idx] = (int) ((minsat / saturation) * filter.getBits());
                }
            }
        }
        // generic array creation is not possible; the array only ever holds SeenFilterBasic<String>
        @SuppressWarnings("unchecked")
        SeenFilterBasic<String>[] tmp = new SeenFilterBasic[template.length];
        for (int i = 0; i < template.length; i++) {
            int bits = newbits[i] > 32 ? newbits[i] : template[i].getBits();
            tmp[i] = template[i].newInstance(bits);
        }
        current = new Sieve(tmp);
        layers.add(current);
    }

    /**
     * Feeds the configured key field of the current bundle through the sieve.
     *
     * @return true if at least one value was recorded in the current layer
     */
    @Override
    public boolean updateChildData(DataTreeNodeUpdater state, DataTreeNode childNode, Config conf) {
        Bundle bundle = state.getBundle();
        if (keyAccess == null) {
            keyAccess = bundle.getFormat().getField(conf.key);
        }
        return updateCounter(bundle, bundle.getValue(keyAccess));
    }

    /**
     * Records {@code value} in the current layer. Arrays are processed element by element
     * and maps key by key. Whenever a scalar is recorded, the key field of the bundle is
     * set to null and the saturation check (which may add a new layer) is run.
     *
     * @param bundle bundle whose key field is cleared when a value is recorded
     * @param value  value to record; null is ignored
     * @return true if any value was recorded
     * @throws IllegalStateException for value types that are not handled
     */
    private boolean updateCounter(Bundle bundle, ValueObject value) {
        boolean mod = false;
        if (value == null) {
            return false;
        }
        switch (value.getObjectType()) {
            case INT:
            case FLOAT:
            case STRING:
            case BYTES:
            case CUSTOM:
                String val = ValueUtil.asNativeString(value);
                if (val != null && current.updateSeen(val)) {
                    bundle.setValue(keyAccess, null);
                    if (current.isSaturated(saturation)) {
                        addLayer();
                    }
                    mod = true;
                }
                break;
            case ARRAY:
                ValueArray arr = value.asArray();
                for (ValueObject o : arr) {
                    // "|=" never short-circuits, so every element is processed
                    mod |= updateCounter(bundle, o);
                }
                break;
            case MAP:
                ValueMap map = value.asMap();
                for (ValueMapEntry o : map) {
                    // "|=" never short-circuits, so every key is processed
                    mod |= updateCounter(bundle, ValueFactory.create(o.getKey()));
                }
                break;
            default:
                throw new IllegalStateException("Unhandled object type " + value.getObjectType());
        }
        return mod;
    }

    /**
     * Answers the "$" query directives "layers", "satmax" and "ram".
     *
     * @return the requested statistic, or null for a null or unknown key
     */
    @Override
    public ValueObject getValue(String key) {
        if (key != null) {
            if (key.equals("layers")) {
                return ValueFactory.create(layers.size());
            }
            if (key.equals("satmax")) {
                int sat = 0;
                for (Sieve sieve : layers) {
                    sat = Math.max(sat, sieve.getSaturationMax());
                }
                return ValueFactory.create(sat);
            }
            if (key.equals("ram")) {
                int sum = 0;
                for (Sieve sieve : layers) {
                    sum += sieve.getByteSize();
                }
                return ValueFactory.create(sum);
            }
        }
        return null;
    }

    /**
     * return types of synthetic nodes returned
     */
    @Override
    public List<String> getNodeTypes() {
        return Arrays.asList("layers", "satmax", "ram");
    }

    /**
     * Builds the virtual nodes for "%" queries.
     *
     * <p>With a null or empty key, one node is returned per filter of every layer, named
     * "{layer index}-{tier index}-{layer update count}", carrying the filter's saturation
     * as hits and its bit count as nodes.
     *
     * <p>Otherwise the key is split on "," and each name is looked up in every layer.
     *
     * @return the virtual nodes, or null when a keyed lookup matched nothing
     */
    @Override
    public List<DataTreeNode> getNodes(DataTreeNode parent, String key) {
        if (key == null || key.length() == 0) {
            ArrayList<DataTreeNode> list = new ArrayList<>(layers.size() * template.length);
            for (int i = 0; i < layers.size(); i++) {
                Sieve s = layers.get(i);
                for (int j = 0; j < template.length; j++) {
                    list.add(new MyTreeNode(i + "-" + j + "-" + s.updates,
                            s.tiers[j].getSaturation(), s.tiers[j].getBits()));
                }
            }
            return list;
        }
        String[] keys = LessStrings.splitArray(key, ",");
        ArrayList<DataTreeNode> list = new ArrayList<>(keys.length);
        synchronized (this) {
            for (String k : keys) {
                int count = 0;
                long[] hash = current.tiers[0].getHashSet(k);
                boolean lookDeep = false;
                for (Sieve s : layers) {
                    int seen = s.getSeenLevel(hash);
                    // NOTE(review): lookDeep is overwritten on every iteration, so only the
                    // LAST layer decides whether the real tree node is consulted, and layers
                    // where the key was seen at every tier (seen < 0) add nothing to count.
                    // Behavior is preserved as-is; confirm whether this is intended.
                    lookDeep = seen < 0;
                    if (seen > 0) {
                        count += seen;
                    }
                }
                if (count > 0 || lookDeep) {
                    DataTreeNode n = lookDeep ? parent.getNode(k) : null;
                    if (n != null) {
                        list.add(new MyTreeNode(k, count + n.getCounter()));
                    } else {
                        list.add(new MyTreeNode(k, count));
                    }
                }
            }
        }
        return !list.isEmpty() ? list : null;
    }

    /**
     * phantom node created for reporting
     */
    private static class MyTreeNode extends ConcurrentTreeNode {

        MyTreeNode(String name, long hits) {
            this.name = name;
            this.hits = hits;
        }

        MyTreeNode(String name, long hits, int nodes) {
            this.name = name;
            this.hits = hits;
            this.nodes = nodes;
        }
    }

    /**
     * for stacking
     */
    public static final class Sieve {

        /** Filters of this layer, consulted in array order. */
        @FieldConfig(codable = true, required = true)
        private SeenFilterBasic<String>[] tiers;

        /** Number of times {@link #isSaturated(int)} has been called on this layer. */
        @FieldConfig(codable = true, required = true)
        private int updates;

        public Sieve() {
        }

        public Sieve(SeenFilterBasic<String>[] tiers) {
            this.tiers = tiers;
        }

        /**
         * Increments the update counter on every call and, on calls where the counter's
         * previous value was a multiple of 100, reports whether the maximum tier saturation
         * exceeds {@code saturation}. All other calls return false without checking.
         */
        public boolean isSaturated(int saturation) {
            return updates++ % 100 == 0 && getSaturationMax() > saturation;
        }

        /**
         * @return total size of all tiers in bytes, assuming
         *         {@link SeenFilterBasic#getBits()} reports the filter size in bits
         */
        public int getByteSize() {
            int sum = 0;
            for (SeenFilterBasic<String> bloom : tiers) {
                // 8 bits per byte; dividing by 32 would count 32-bit words instead of bytes
                sum += bloom.getBits() / 8;
            }
            return sum;
        }

        /**
         * @return the highest saturation reported by any tier, or 0 when there are no tiers
         */
        public int getSaturationMax() {
            int sat = 0;
            for (SeenFilterBasic<String> bloom : tiers) {
                sat = Math.max(sat, bloom.getSaturation());
            }
            return sat;
        }

        /**
         * Marks {@code k} in the first tier that does not already contain it.
         *
         * @return true if handled in level filters
         */
        public boolean updateSeen(String k) {
            // the hash set computed by the first tier is reused for all tiers
            long[] hash = tiers[0].getHashSet(k);
            for (SeenFilterBasic<String> bloom : tiers) {
                if (!bloom.checkHashSet(hash)) {
                    bloom.setHashSet(hash);
                    return true;
                }
            }
            return false;
        }

        /**
         * 0 = not seen in first level (by extension any) n = was seen last at
         * level -n = seen in every level
         */
        public int getSeenLevel(long[] hash) {
            int count = 0;
            for (SeenFilterBasic<String> bloom : tiers) {
                if (!bloom.checkHashSet(hash)) {
                    // number of consecutive tiers, starting at the first, that contain the hash
                    return count;
                }
                count++;
            }
            return -count;
        }
    }

    /**
     * Restores the transient state after decoding: the last layer becomes the current
     * layer and its filters become the template for future layers.
     */
    @Override
    public void postDecode() {
        current = layers.get(layers.size() - 1);
        template = current.tiers;
    }

    /** Nothing to prepare before encoding. */
    @Override
    public void preEncode() {
    }
}