/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.addthis.hydra.data.filter.closeablebundle;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Optional;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import com.addthis.bundle.core.Bundle;
import com.addthis.bundle.util.AutoField;
import com.addthis.bundle.value.ValueArray;
import com.addthis.bundle.value.ValueObject;
import com.clearspring.analytics.stream.frequency.CountMinSketch;
import com.google.common.io.ByteStreams;
import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
* This closeable bundle filter <span class="hydra-summary">applies a limit using the count-min sketch data structure</span>.
* If the input is a scalar value then this filter returns true if it accepts the input. If
* it rejects the input then that field is removed from the bundle and the filter returns false.
* If the input is an array then this filter removes values that do not need the limit criteria
* and always return true.
*
* @user-reference
*/
public class CloseableBundleCMSLimit implements CloseableBundleFilter {
public enum Bound {
LOWER, UPPER;
}
private static final String KEY_SEPARATOR = "&";
@Nonnull
public final AutoField[] keyFields;
@Nonnull
public final AutoField valueField;
@Nullable
public final AutoField countField;
public final String dataDir;
public final int cacheSize;
public final boolean rejectNull;
public final int limit;
/**
* The value to return if the filter removes the item from the bundle.
* Only used if the input is a scalar. If the input is an array then
* always return true. Default value of parameter is false.
*/
public final boolean failReturn;
/**
* Optionally specify the depth of the sketch.
* If 'confidence' is specified then ignore this value.
*/
public final int depth;
/**
* Confidence that the error tolerance is satisfied.
* If 'confidence' is specified then ignore 'depth' parameter.
* Expressed as a fraction.
*/
public final double confidence;
/**
* Width of the sketch in bits.
* Either 'width' or 'percentage' are required.
*/
public final int width;
/**
* Maximum error tolerated as percentage of cardinality.
* Either 'width' or 'percentage' are required.
*/
public final double percentage;
@Nonnull
public final Bound bound;
private final CMSLimitHashMap sketches;
private final int calcWidth;
private final int calcDepth;
@JsonCreator
public CloseableBundleCMSLimit(@JsonProperty(value = "keyFields", required = true) AutoField[] keyFields,
@JsonProperty(value = "valueField", required = true) AutoField valueField,
@JsonProperty("countField") AutoField countField,
@JsonProperty(value = "dataDir", required = true) String dataDir,
@JsonProperty(value = "cacheSize", required = true) int cacheSize,
@JsonProperty("rejectNull") boolean rejectNull,
@JsonProperty("failReturn") boolean failReturn,
@JsonProperty("width") int width,
@JsonProperty("depth") int depth,
@JsonProperty(value = "limit", required = true) int limit,
@JsonProperty("confidence") double confidence,
@JsonProperty("percentage") double percentage,
@JsonProperty(value = "bound", required = true) Bound bound) {
if ((width == 0) && (percentage == 0.0)) {
throw new IllegalArgumentException("Either 'width' or " +
"'percentage' must be specified.");
} else if ((width > 0) && (percentage > 0.0)) {
throw new IllegalArgumentException("Either 'width' or " +
"'percentage' must be specified.");
} else if (confidence < 0.0 || confidence >= 1.0) {
throw new IllegalArgumentException("'confidence' must be between 0 and 1");
}
this.keyFields = keyFields;
this.valueField = valueField;
this.countField = countField;
this.dataDir = dataDir;
this.cacheSize = cacheSize;
this.rejectNull = rejectNull;
this.failReturn = failReturn;
this.width = width;
this.depth = depth;
this.limit = limit;
this.confidence = confidence;
this.percentage = percentage;
this.bound = bound;
this.sketches = new CMSLimitHashMap();
int cWidth = width;
int cDepth = depth;
if (cWidth == 0) {
cWidth = (int) Math.ceil(Math.E / percentage);
}
if (confidence > 0.0) {
cDepth = (int) Math.ceil(-Math.log(1.0 - confidence));
}
calcWidth = cWidth;
calcDepth = cDepth;
}
@Override public synchronized void close() {
try {
for (Map.Entry<String, CountMinSketch> entry : sketches.entrySet()) {
writeSketch(entry.getKey(), entry.getValue());
}
} catch (IOException ex) {
throw new UncheckedIOException(ex);
}
}
@Override public boolean filter(Bundle row) {
StringBuilder sb = new StringBuilder();
for (AutoField keyField : keyFields) {
Optional<String> optional = keyField.getString(row);
if (optional.isPresent()) {
if (sb.length() > 0) {
sb.append(KEY_SEPARATOR);
}
sb.append(optional.get());
} else if (rejectNull) {
return failReturn;
}
}
return updateSketch(row, sb.toString(), valueField.getValue(row));
}
private synchronized boolean updateSketch(Bundle row, String key, ValueObject valueObject) {
CountMinSketch sketch = sketches.get(key);
if (valueObject == null) {
return failReturn;
}
if (valueObject.getObjectType() == ValueObject.TYPE.ARRAY) {
ValueArray array = valueObject.asArray();
Iterator<ValueObject> iterator = array.iterator();
while (iterator.hasNext()) {
ValueObject next = iterator.next();
updateString(next.asString().asNative(), sketch, iterator, row);
}
return true;
} else {
return updateString(valueObject.asString().asNative(), sketch, null, row);
}
}
private boolean updateString(String input, CountMinSketch sketch, Iterator<ValueObject> iterator, Bundle bundle) {
long current = sketch.estimateCount(input);
switch (bound) {
case UPPER:
if (current > limit) {
removeElement(iterator, bundle);
return failReturn;
} else {
updateCount(input, sketch, bundle);
}
break;
case LOWER:
if (current < limit) {
removeElement(iterator, bundle);
updateCount(input, sketch, bundle);
return failReturn;
}
break;
}
return true;
}
private void removeElement(Iterator<ValueObject> iterator, Bundle bundle) {
if (iterator != null) {
iterator.remove();
} else {
valueField.removeValue(bundle);
}
}
private void updateCount(String input, CountMinSketch sketch, Bundle bundle) {
long myCount = 1;
if (countField != null) {
myCount = countField.getLong(bundle).orElse(0);
}
if (myCount > 0) {
sketch.add(input, myCount);
}
}
private void writeSketch(String key, CountMinSketch sketch) throws IOException {
byte[] data = CountMinSketch.serialize(sketch);
ByteArrayOutputStream byteStream =
new ByteArrayOutputStream(data.length);
GZIPOutputStream zipStream = new GZIPOutputStream(byteStream);
try {
zipStream.write(data);
} finally {
zipStream.close();
byteStream.close();
}
Path parent = Paths.get(dataDir);
Path path = Paths.get(dataDir, key + ".gz");
Files.createDirectories(parent);
Files.write(path, byteStream.toByteArray());
}
private class CMSLimitHashMap extends LinkedHashMap<String, CountMinSketch> {
CMSLimitHashMap() {
super(cacheSize, 0.75f, true);
}
@Override
public CountMinSketch get(Object key) {
try {
CountMinSketch sketch = super.get(key);
if (sketch == null) {
ByteArrayInputStream byteStream = null;
GZIPInputStream zipStream = null;
try {
Path path = Paths.get(dataDir, key + ".gz");
if (Files.exists(path)) {
byte[] data = Files.readAllBytes(path);
byteStream = new ByteArrayInputStream(data);
zipStream = new GZIPInputStream(byteStream);
sketch = CountMinSketch.deserialize(ByteStreams.toByteArray(zipStream));
} else {
sketch = new CountMinSketch(calcDepth, calcWidth, 0);
}
} finally {
if (zipStream != null) {
zipStream.close();
}
if (byteStream != null) {
byteStream.close();
}
}
put(key.toString(), sketch);
}
return sketch;
} catch (IOException ex) {
throw new UncheckedIOException(ex);
}
}
protected boolean removeEldestEntry(Map.Entry<String, CountMinSketch> eldest) {
try {
if (size() > cacheSize) {
String key = eldest.getKey();
CountMinSketch value = eldest.getValue();
writeSketch(key, value);
return true;
} else {
return false;
}
} catch (IOException ex) {
throw new UncheckedIOException(ex);
}
}
}
public static class CloseableBundleCMSLimitBuilder {
private AutoField[] keyFields;
private AutoField valueField;
private AutoField countField;
private String dataDir;
private int cacheSize;
private boolean rejectNull;
private int limit;
private boolean failReturn;
private int depth;
private double confidence;
private int width;
private double percentage;
private Bound bound;
public CloseableBundleCMSLimitBuilder() {}
public CloseableBundleCMSLimitBuilder(CloseableBundleCMSLimit source) {
this.keyFields = source.keyFields;
this.valueField = source.valueField;
this.countField = source.countField;
this.dataDir = source.dataDir;
this.cacheSize = source.cacheSize;
this.rejectNull = source.rejectNull;
this.limit = source.limit;
this.failReturn = source.failReturn;
this.depth = source.depth;
this.confidence = source.confidence;
this.width = source.width;
this.percentage = source.percentage;
this.bound = source.bound;
}
public CloseableBundleCMSLimitBuilder setKeyFields(AutoField[] keyFields) {
this.keyFields = keyFields;
return this;
}
public CloseableBundleCMSLimitBuilder setValueField(AutoField valueField) {
this.valueField = valueField;
return this;
}
public CloseableBundleCMSLimitBuilder setCountField(AutoField countField) {
this.countField = countField;
return this;
}
public CloseableBundleCMSLimitBuilder setDataDir(String dataDir) {
this.dataDir = dataDir;
return this;
}
public CloseableBundleCMSLimitBuilder setCacheSize(int cacheSize) {
this.cacheSize = cacheSize;
return this;
}
public CloseableBundleCMSLimitBuilder setRejectNull(boolean rejectNull) {
this.rejectNull = rejectNull;
return this;
}
public CloseableBundleCMSLimitBuilder setLimit(int limit) {
this.limit = limit;
return this;
}
public CloseableBundleCMSLimitBuilder setFailReturn(boolean failReturn) {
this.failReturn = failReturn;
return this;
}
public CloseableBundleCMSLimitBuilder setDepth(int depth) {
this.depth = depth;
return this;
}
public CloseableBundleCMSLimitBuilder setConfidence(double confidence) {
this.confidence = confidence;
return this;
}
public CloseableBundleCMSLimitBuilder setWidth(int width) {
this.width = width;
return this;
}
public CloseableBundleCMSLimitBuilder setPercentage(double percentage) {
this.percentage = percentage;
return this;
}
public CloseableBundleCMSLimitBuilder setBound(Bound bound) {
this.bound = bound;
return this;
}
public CloseableBundleCMSLimit build() {
CloseableBundleCMSLimit closeableBundleCMSLimit =
new CloseableBundleCMSLimit(keyFields, valueField, countField, dataDir,
cacheSize, rejectNull, failReturn, width,
depth, limit, confidence, percentage, bound);
return closeableBundleCMSLimit;
}
}
}