/*
* Copyright © 2015 Cask Data, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License"); you may not
* use this file except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations under
* the License.
*/
package co.cask.cdap.dq.functions;
import co.cask.cdap.api.common.Bytes;
import co.cask.cdap.dq.DataQualityWritable;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import java.lang.reflect.Type;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
/**
 * Aggregation function that creates a histogram with equal-width buckets
 * for numeric values - no categorical data.
 *
 * <p>Values are collected through {@link #add(DataQualityWritable)}. Each call to
 * {@link #aggregate()} rebuilds {@link #histogram} from scratch out of all values
 * collected so far and returns it serialized as JSON.
 *
 * <p>Every bucket is identified by a {@code (lowerBound, upperBound)} entry. A bucket
 * contains the values {@code v} with {@code lowerBound <= v < upperBound}; the last
 * bucket additionally contains values equal to (or above) its upper bound, so the
 * maximum is always counted.
 */
public class HistogramWithBucketing implements BasicAggregationFunction<Map<Map.Entry<Double, Double>, Integer>> {
  private static final Gson GSON = new Gson();
  // Gson writes the bucket keys of the histogram using their string form ("lower=upper"),
  // so the serialized histogram is read back as a string-keyed map and the keys are parsed by hand.
  private static final Type TOKEN_TYPE_MAP_STRING_INTEGER =
    new TypeToken<Map<String, Integer>>() { }.getType();
  private static final String AUTOMATIC_STRATEGY = "automatic";
  private static final int DEFAULT_MAX_BUCKET_SIZE = 10;

  private final ArrayList<Double> values = new ArrayList<>();
  // Start from the infinities so that the first value added always becomes both min and max.
  // (Double.MIN_VALUE is the smallest positive double and must not be used as "lowest value".)
  private double max = Double.NEGATIVE_INFINITY;
  private double min = Double.POSITIVE_INFINITY;

  /**
   * Result of the most recent {@link #aggregate()} call: bucket bounds mapped to the
   * number of values in that bucket. Empty until {@link #aggregate()} has been called.
   */
  public Map<Map.Entry<Double, Double>, Long> histogram = new HashMap<>();

  /**
   * Records one value. The string form of {@code value.get()} is parsed as a double.
   * NaN and infinite values are ignored because they cannot be placed in a finite-width bucket.
   *
   * @param value the value to record
   * @throws NumberFormatException if the value cannot be parsed as a double
   */
  @Override
  public void add(DataQualityWritable value) {
    double newValue = Double.parseDouble(value.get().toString());
    if (Double.isNaN(newValue) || Double.isInfinite(newValue)) {
      return;
    }
    max = Math.max(max, newValue);
    min = Math.min(min, newValue);
    values.add(newValue);
  }

  /**
   * Builds the histogram over all values added so far, using automatically sized buckets,
   * stores it in {@link #histogram} and returns it as JSON bytes.
   *
   * @return the JSON-serialized histogram; an empty JSON object if no values were added
   */
  @Override
  public byte[] aggregate() {
    Bucketing bucketing = new Bucketing(AUTOMATIC_STRATEGY, null);
    histogram = bucketing.buildHistogram();
    String aggregationJSON = GSON.toJson(histogram);
    return Bytes.toBytes(aggregationJSON);
  }

  /**
   * Reads back a histogram produced by {@link #aggregate()}.
   *
   * @param serializedValue JSON bytes as returned by {@link #aggregate()}
   * @return bucket bounds mapped to counts, or {@code null} if the JSON decodes to null
   * @throws IllegalArgumentException if a bucket key is not of the form {@code lower=upper}
   */
  @Override
  public Map<Map.Entry<Double, Double>, Integer> deserialize(byte[] serializedValue) {
    String valueJSON = Bytes.toString(serializedValue);
    Map<String, Integer> serializedBuckets = GSON.fromJson(valueJSON, TOKEN_TYPE_MAP_STRING_INTEGER);
    if (serializedBuckets == null) {
      return null;
    }
    Map<Map.Entry<Double, Double>, Integer> result = new HashMap<>();
    for (Map.Entry<String, Integer> serializedBucket : serializedBuckets.entrySet()) {
      String bounds = serializedBucket.getKey();
      // A double's string form never contains '=', so the first '=' separates the two bounds.
      int separator = bounds.indexOf('=');
      if (separator < 0) {
        throw new IllegalArgumentException("Malformed histogram bucket key: " + bounds);
      }
      Map.Entry<Double, Double> bucket = new AbstractMap.SimpleEntry<>(
        Double.valueOf(bounds.substring(0, separator)),
        Double.valueOf(bounds.substring(separator + 1)));
      result.put(bucket, serializedBucket.getValue());
    }
    return result;
  }

  /**
   * Chooses a bucket width according to a strategy and distributes the collected
   * values of the enclosing instance over the resulting buckets.
   */
  private class Bucketing {
    private final String bucketingStrategy;
    private final int maxBucketSize;

    /**
     * @param bucketingStrategy {@code "automatic"} to derive the width from the data;
     *                          anything else uses {@code maxBucketSize} as the width
     * @param maxBucketSize width used by the non-automatic strategy; {@code null} means 10
     */
    private Bucketing(String bucketingStrategy, Integer maxBucketSize) {
      this.maxBucketSize = maxBucketSize == null ? DEFAULT_MAX_BUCKET_SIZE : maxBucketSize;
      this.bucketingStrategy = bucketingStrategy;
    }

    /**
     * Creates the buckets and counts the collected values into them.
     *
     * @return a new map from bucket bounds to counts; empty if there are no values or
     *         the configured manual width is not positive
     */
    private Map<Map.Entry<Double, Double>, Long> buildHistogram() {
      Map<Map.Entry<Double, Double>, Long> counts = new HashMap<>();
      if (values.isEmpty()) {
        return counts;
      }
      // Sorted order is needed for the quartile lookup and for the single-pass counting below.
      Collections.sort(values);

      long width = AUTOMATIC_STRATEGY.equals(bucketingStrategy) ? automaticBucketWidth() : maxBucketSize;
      if (width <= 0) {
        return counts;
      }

      ArrayList<Map.Entry<Double, Double>> buckets = generateBuckets(width);
      for (Map.Entry<Double, Double> bucket : buckets) {
        counts.put(bucket, 0L);
      }

      // Values and buckets are both ascending, so the bucket index only ever moves forward.
      int lastIndex = buckets.size() - 1;
      int index = 0;
      for (double value : values) {
        while (index < lastIndex && value >= buckets.get(index).getValue()) {
          index++;
        }
        Map.Entry<Double, Double> bucket = buckets.get(index);
        counts.put(bucket, counts.get(bucket) + 1);
      }
      return counts;
    }

    /**
     * Computes the bucket width using the Freedman-Diaconis rule,
     * which says: bin size = 2 * IQR(x) * n^(-1/3).
     * The result is rounded up to a whole number and is at least 1.
     * Expects {@code values} to be sorted and non-empty.
     */
    private long automaticBucketWidth() {
      int valueCount = values.size();
      int quartile = valueCount / 4;
      double interquartileRange = values.get(quartile * 3) - values.get(quartile);
      // The exponent must be computed in floating point: the integer expression -1 / 3 is 0.
      long width = (long) Math.ceil(2 * interquartileRange * Math.pow(valueCount, -1.0 / 3.0));
      return width == 0L ? 1L : width;
    }

    /**
     * Generates consecutive buckets of the given width, starting at {@code min}, until
     * {@code max} is covered. At least one bucket is always produced, so a data set whose
     * values are all equal still gets counted.
     *
     * @param width the span of each bucket; must be positive
     * @return the buckets in ascending order
     */
    private ArrayList<Map.Entry<Double, Double>> generateBuckets(long width) {
      ArrayList<Map.Entry<Double, Double>> buckets = new ArrayList<>();
      long bucketNumber = 0;
      double upperBound;
      do {
        // Bounds are derived from the bucket number instead of being accumulated, so the
        // upper bound of one bucket is exactly the lower bound of the next and the loop
        // keeps advancing even when a single width is below the precision of min.
        double lowerBound = min + bucketNumber * (double) width;
        bucketNumber++;
        upperBound = min + bucketNumber * (double) width;
        Map.Entry<Double, Double> bounds = new AbstractMap.SimpleEntry<>(lowerBound, upperBound);
        buckets.add(bounds);
      } while (upperBound < max);
      return buckets;
    }
  }
}