/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.data.tree.prop;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import com.addthis.basis.util.LessStrings;
import com.addthis.bundle.core.Bundle;
import com.addthis.bundle.util.AutoField;
import com.addthis.bundle.value.ValueArray;
import com.addthis.bundle.value.ValueFactory;
import com.addthis.bundle.value.ValueObject;
import com.addthis.bundle.value.ValueTranslationException;
import com.addthis.codec.annotations.FieldConfig;
import com.addthis.codec.codables.SuperCodable;
import com.addthis.hydra.data.tree.DataTreeNode;
import com.addthis.hydra.data.tree.DataTreeNodeUpdater;
import com.addthis.hydra.data.tree.TreeDataParameters;
import com.addthis.hydra.data.tree.TreeNodeData;
import com.clearspring.analytics.stream.frequency.CountMinSketch;
import com.google.common.annotations.VisibleForTesting;
@Deprecated
public class DataCMSUpperBound extends TreeNodeData<DataCMSUpperBound.Config> implements SuperCodable {
/**
* <p>This data attachment is a <span class="hydra-summary">deprecated</span>.
* <p/>
* <p>This data attachment is deprecated. Use {@link com.addthis.hydra.data.filter.closeablebundle.CloseableBundleCMSLimit}.</p>
* <p>The {@link #limit limit} specifies the maximum number of occurrences that any value for
* a key can be observed. Once the limit is reached then this value will be removed
* from the bundle when it is encountered.</p>
* <p/>
* <p>The error is computed as a proportion of (T) the sum of the counts inserted into the data
* structure. The error rate is T * e (2.71828) / width. For example if I expect to observe
* a total count of 1,000,000 and my width is 100,000 bits then the error estimate is 10e (27.1828).
* The confidence of the estimation falling within the error is 1 - e ^ -depth. The default value
* of depth is 10. This has a confidence of 99.995%. The total numbers of bits allocated is
* width * depth. You are responsible for selecting values of width and depth that
* use reasonable amounts of memory and while having acceptable error bounds and confidence limits.</p>
* <p/>
* <p>The {@link #key key} field is required and specifies the bundle field name from
* which keys will be inserted into the sketch. The {@link #count count} field is optional
* and specifies the bundle field name from which non-negative integer values will be used
* as counts for the associated keys. If the count field is missing then each key instance
* is assumed to have a count of 1.</p>
* <p>Job Configuration Example:</p>
* <pre>
* {const:"service", data.idcount.limit-upper {key:"ID", width:100000, limit:5}}
* {const:"pixelator", data.idcount.limit-upper {key:"ID", count:"ID_COUNT", width:10000, limit:5}}
* </pre>
* <p/>
* <p><b>Query Path Directives</b>
* <p/>
* <pre>"$" operations support the following commands in the format $+{attachment}={command}:
* <p/>
* total : total of all the values inserted into the sketch.
* val(x): literal value estimation associated with key x</pre>
* <p/>
*
* <p>If no command is specified or an invalid command is specified then the estimator returns
* the total size. </p>
*
* <p>%{attachment}={a "~" separated list of keys} : generates a virtual node for each key.
* The number of hits for each virtual node is equal to the count estimate in the sketch.
* Keys with an estimate of 0 will not appear in the output.</p>
* <p/>
* <p>Query Path Examples:</p>
* <pre>
* /service$+pubcount=foo
* /service/+%pubcount~foo,bar,bax
* </pre>
*
* @user-reference
*/
public static final class Config extends TreeDataParameters<DataCMSUpperBound> {
/**
* Bundle field name from which to insert keys into the sketch.
* This field is required.
*/
@FieldConfig(codable = true, required = true)
private AutoField key;
/**
* Upper bound for values in the sketch. Subsequent occurrences
* of the key will be deleted from the bundle.
* This field is required.
*/
@FieldConfig(codable = true, required = true)
private long limit;
/**
* Optionally specify the depth of the sketch.
* If 'confidence' is specified then ignore this value. Default is 10.
*/
@FieldConfig(codable = true)
private int depth = 10;
/**
* Confidence that the error tolerance is satisfied.
* If 'confidence' is specified then ignore 'depth' parameter.
* Expressed as a fraction.
*/
private double confidence;
/**
* Width of the sketch in bits.
* Either 'width' or 'percentage' are required.
*/
@FieldConfig(codable = true)
private int width;
/**
* Maximum error tolerated as percentage of cardinality.
* Either 'width' or 'percentage' are required.
*/
@FieldConfig(codable = true)
private double percentage;
/**
* Optional bundle field name for the non-negative integer values
* that are to be associated with each key. If not specified then
* each key instance is assumed to have a count of 1.
*/
@FieldConfig(codable = true)
private AutoField count;
/**
* If true then use the limit as an upper bound. Otherwise
* use as a lower bound.
*/
@FieldConfig(codable = true)
private boolean upper = true;
@Override
public DataCMSUpperBound newInstance() {
DataCMSUpperBound db = new DataCMSUpperBound();
if ((width == 0) && (percentage == 0.0)) {
throw new IllegalArgumentException("Either 'width' or " +
"'percentage' must be specified.");
} else if ((width > 0) && (percentage > 0.0)) {
throw new IllegalArgumentException("Either 'width' or " +
"'percentage' must be specified.");
} else if (confidence < 0.0 || confidence >= 1.0) {
throw new IllegalArgumentException("'confidence' must be between 0 and 1");
}
int calcWidth = width;
int calcDepth = depth;
if (calcWidth == 0) {
calcWidth = (int) Math.ceil(Math.E / percentage);
}
if (confidence > 0.0) {
calcDepth = (int) Math.ceil(-Math.log(1.0 - confidence));
}
db.sketch = new CountMinSketch(calcDepth, calcWidth, 0);
return db;
}
}
@FieldConfig(codable = true)
private byte[] raw;
private CountMinSketch sketch;
public DataCMSUpperBound(){}
public DataCMSUpperBound(int depth, int width) {
this.sketch = new CountMinSketch(depth, width, 0);
}
@Override
public ValueObject getValue(String key) {
if (key == null || key.equals("total")) {
return ValueFactory.create(sketch.size());
} else if (key.startsWith("val(") && key.endsWith(")")) {
String input = key.substring(4, key.length() - 1);
long count = sketch.estimateCount(input);
return ValueFactory.create(count);
} else {
throw new IllegalArgumentException("Unexpected key argument " + key);
}
}
@Override
public List<DataTreeNode> getNodes(DataTreeNode parent, String key) {
if (key == null) {
throw new IllegalArgumentException("No key arguments entered");
}
String[] keys = LessStrings.splitArray(key, "~");
List<DataTreeNode> list = new ArrayList<>(keys.length);
for (String k : keys) {
long count = sketch.estimateCount(k);
list.add(new VirtualTreeNode(k, count));
}
return list;
}
/* updates the CountMinSketch
* if the key is absent, return false
* if the limit is reached then delete the field and return true
* if count field is not specified, always increment by one
* if count field is specified and not present or invalid, do not update and return false
* otherwise increment key by the count field's value
*/
@Override
public boolean updateChildData(DataTreeNodeUpdater state, DataTreeNode childNode, Config conf) {
return updateChildData(state.getBundle(), conf);
}
@VisibleForTesting
boolean updateChildData(Bundle bundle, Config conf) {
ValueObject valueObject = conf.key.getValue(bundle);
if (valueObject == null) {
return false;
}
if (valueObject.getObjectType() == ValueObject.TYPE.ARRAY) {
ValueArray array = valueObject.asArray();
Iterator<ValueObject> iterator = array.iterator();
boolean updated = false;
while (iterator.hasNext()) {
ValueObject next = iterator.next();
updated = updated | updateString(next.asString().asNative(), iterator, bundle, conf);
}
return updated;
} else {
return updateString(valueObject.asString().asNative(), null, bundle, conf);
}
}
private boolean updateString(String input, Iterator<ValueObject> iterator, Bundle bundle, Config conf) {
long current = sketch.estimateCount(input);
if (conf.upper) {
if (current >= conf.limit) {
removeElement(iterator, bundle, conf);
return false;
} else {
return updateCount(input, bundle, conf);
}
} else {
if (current < conf.limit) {
removeElement(iterator, bundle, conf);
return updateCount(input, bundle, conf);
} else {
return false;
}
}
}
private void removeElement(Iterator<ValueObject> iterator, Bundle bundle, Config conf) {
if (iterator != null) {
iterator.remove();
} else {
conf.key.setValue(bundle, null);
}
}
private boolean updateCount(String input, Bundle bundle, Config conf) {
long myCount = 1;
if (conf.count != null) {
ValueObject v = conf.count.getValue(bundle);
if (v != null) {
try {
myCount = v.asLong().getLong();
} catch (ValueTranslationException ignored) {
return false;
}
} else {
return false;
}
}
sketch.add(input, myCount);
return true;
}
@Override
public void postDecode() {
sketch = CountMinSketch.deserialize(raw);
raw = null;
}
@Override
public void preEncode() {
raw = CountMinSketch.serialize(sketch);
}
public void add(String val, long count) {
sketch.add(val, count);
}
long estimateCount(String item) {
return sketch.estimateCount(item);
}
}