/*
* Copyright (c) 2013-2017 Cinchapi Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cinchapi.concourse.server.plugin.data;
import java.util.AbstractMap;
import java.util.AbstractSet;
import java.util.Collections;
import java.util.Iterator;
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import javax.annotation.Nullable;
import javax.annotation.concurrent.NotThreadSafe;
import org.apache.commons.math3.stat.StatUtils;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import com.cinchapi.concourse.Link;
import com.cinchapi.concourse.thrift.TObject;
import com.cinchapi.concourse.thrift.Type;
import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.zaxxer.sparsebits.SparseBitSet;
/**
* <p>
* An implementation of a {@code Map} that characterizes entries on the fly as
* they are added or removed. This is used to characterize user data as it is
* being entered, so that the visualization engine can query this map to
* immediately view data characterization in constant time.
* </p>
* <p>
* Apart from {@link #put(Object, Set)}, {@link #remove(Object)},
* {@link #putAll(Map)}, and {@link #clear()}, all methods are delegated to an
* internal map. The four aforementioned methods are overridden in terms of
* functionality to characterize the entries in the map before performing the
* original intended function.
* </p>
* <p>
* {@link TrackingMultimap} is parametrized by type-parameters K and V, but the
* underlying internal map is in the form {@code Map<K, Set<V>>}. This is to
* comply with the format of data, which is either a Map from Strings (keys) to
* Objects (values), or Objects (values) to Longs (records).
* </p>
*
* @author Jeff Nelson
*/
// TODO talk about what is tracked for keys and what is tracked for values
@NotThreadSafe
public abstract class TrackingMultimap<K, V> extends AbstractMap<K, Set<V>> {
/**
* Return the correct {@link DataType} for the {@code obj}.
*
* @param obj the object to categorize
* @return the correct {@link DataType}
*/
private static DataType getDataType(Object object) {
Class<?> clazz = object.getClass();
if(clazz == Link.class || isTObjectType(object, Type.LINK)) {
return DataType.LINK;
}
else if(isTObjectType(object, Type.DOUBLE, Type.FLOAT, Type.INTEGER,
Type.LONG) || Number.class.isAssignableFrom(clazz)
|| OTHER_NUMBER_CLASSES.contains(clazz)) {
return DataType.NUMBER;
}
else if(isTObjectType(object, Type.STRING, Type.TAG)
|| clazz == String.class) {
return DataType.STRING;
}
else if(isTObjectType(object, Type.BOOLEAN) || clazz == Boolean.class
|| clazz == boolean.class) {
return DataType.BOOLEAN;
}
else {
return DataType.UNKNOWN;
}
}
/**
* Return {@code true} if {@code obj} is an instance of {@link TObject} and
* falls under any of the specified {@code types}.
*
* @param obj the object to check
* @param types the types for which to check
* @return {@code true} if the ttype of the {@code obj} is any of the
* specified {@code types}
*/
private static boolean isTObjectType(Object obj, Type... types) {
if(obj instanceof TObject) {
for (Type type : types) {
if(type == ((TObject) obj).getType()) {
return true;
}
}
return false;
}
else {
return false;
}
}
/**
* Return a new {@link HashSet} that contains all of the {@code elements},
* if the input is not {@code null}. Otherwise, return {@code null}.
*
* @param elements the elements to include in the returned set
* @return a set that includes all of the elements or {@code null}
*/
@Nullable
private static <V> Set<V> newHashSetNullSafe(
Iterable<? extends V> elements) {
return elements != null ? Sets.newHashSet(elements) : null;
}
/**
* A collection of classes that don't extend {@link Number} should be
* considered {@link DataType#NUMBER numerical}.
*/
private static Set<Class<?>> OTHER_NUMBER_CLASSES = Sets
.newIdentityHashSet();
static {
OTHER_NUMBER_CLASSES.add(int.class);
OTHER_NUMBER_CLASSES.add(long.class);
OTHER_NUMBER_CLASSES.add(float.class);
OTHER_NUMBER_CLASSES.add(double.class);
OTHER_NUMBER_CLASSES.add(short.class);
OTHER_NUMBER_CLASSES.add(byte.class);
}
/**
* An internal map where the data is actually stored.
*/
private Map<K, Set<V>> data;
/**
* A mapping from each of the {@link DataType data types} to the number of
* stored keys that are characterized as such.
*/
private final Map<DataType, AtomicInteger> keyTypes;
/**
* The total number of values (including duplicates) added across all the
* keys.
*/
private final AtomicLong totalValueCount;
/**
* The total number of unique values (e.g. excluding duplicates) that are
* stored across all the keys.
*/
private final AtomicLong uniqueValueCount;
/**
* An approximate cache of values stored across all the keys.
* <p>
* Whenever a value is added to the map, the bit for its
* {@link Object#hashCode() hash code} is flipped to indicate that the value
* is stored. However, hash codes are not guaranteed to be unique among
* objects, so its necessary to look through all the values and test the
* equality for a potential match to determine if an object is actually
* contained or not.
* </p>
*/
private final SparseBitSet valueCache;
/**
* Construct a new instance.
*
* @param delegate an {@link Map#isEmpty() empty} map
*/
protected TrackingMultimap(Map<K, Set<V>> delegate) {
Preconditions.checkState(delegate.isEmpty());
this.data = delegate;
this.keyTypes = Maps.newIdentityHashMap();
for (DataType type : DataType.values()) {
this.keyTypes.put(type, new AtomicInteger(0));
}
this.totalValueCount = new AtomicLong(0);
this.uniqueValueCount = new AtomicLong(0);
this.valueCache = new SparseBitSet();
}
/**
* Returns whether the {@link TrackingMultimap} contains values of the
* specified {@link DataType}.
*
* @param type the {@link DataType} being queried
* @return {@code true} if the {@code Map} contains this {@link DataType},
* false otherwise
*/
public boolean containsDataType(DataType type) {
return percentKeyDataType(type) > 0;
}
/**
* Remove the association between {@code key} and {@code value} from the
* map.
*
* @param key the key
* @param value the value
* @return {@code true} if the association previously existed and is removed
*/
public boolean delete(K key, V value) {
Set<V> values = data.get(key);
if(values != null && values.remove(value)) {
if(values.isEmpty()) {
data.remove(values);
}
return true;
}
else {
return false;
}
}
/**
* The {@code distinctiveness} is a measure of how the number of unique keys
* in the map relative to the total number of values, expressed as a number
* between 0 and 1.
* <p>
* The {@link #keySet()} reveals the total number of unique keys; however,
* this method takes that value and divides it by the total number of values
* across all of the keys to get a mathematical measure of how much
* duplication exists among the data in the map duplication
* </p>
* <p>
* A distinctiveness of 1 means that the keys are completely distinct (e.g.
* no key maps to more than 1 value). Lower measures of distinctiveness mean
* that they are less distinct (e.g. on average, each key maps to more
* values as the distinctiveness gets closer to 0).
* </p>
*
* @return the distinctiveness of the data, on a scale from 0 to 1
*/
public double distinctiveness() {
double tvc = totalValueCount.get();
return (tvc == 0) ? 0 : (double) data.size() / tvc;
}
@Override
public Set<Entry<K, Set<V>>> entrySet() {
return data.entrySet();
}
@SuppressWarnings("unchecked")
@Override
public boolean equals(Object obj) {
if(obj instanceof TrackingMultimap) {
return data.equals(((TrackingMultimap<K, V>) obj).data);
}
else {
return false;
}
}
@Override
public Set<V> get(Object key) {
return data.get(key);
}
@Override
public int hashCode() {
return data.hashCode();
}
/**
* Return {@code true} if this map associates {@code value} with at least
* one key.
* <p>
* This method is different from {@link #containsValue(Object)} because it
* checks for values <strong>within</strong> the Sets that are mapped from
* keys. Use the aforementioned if you need to check for the existence of an
* entire Set as opposed to an individual value.
* </p>
*
* @param value the value to checks
* @return {@code true} if the value is contained, {@code false} otherwise
*/
public boolean hasValue(V value) {
int hashCode = Math.abs(value.hashCode());
if(valueCache.get(hashCode)) {
for (Set<V> values : data.values()) {
if(values.contains(value)) {
return true;
}
}
return false;
}
else {
return false;
}
}
/**
* Add a new association between {@code key} and {@code value} to the map if
* it doesn't already exist.
*
* @param key the key
* @param value the value
* @return {@code true} if the association didn't previously exist and is
* not added
*/
public boolean insert(K key, V value) {
Set<V> values = data.get(key);
if(values == null) {
values = new ValueSetWrapper(key);
data.put(key, values);
}
if(values.add(value)) {
return true;
}
else {
return false;
}
}
/**
* Merge all the {@code values} into the set of values that is mapped from
* {@code key}.
*
* @param key the key
* @param values the values to merge
* @return all the values mapped from {@code key} after the merge
*/
public Set<V> merge(K key, Set<V> values) {
for (V value : values) {
insert(key, value);
}
return get(key);
}
/**
* Return the percent (between 0 and 1) of keys that are an instance of the
* specified {@link DataType type}.
*
* @param type the {@link DataType} of interest
* @return the percent of keys of the {@code type}
*/
public double percentKeyDataType(DataType type) {
return ((double) keyTypes.get(type).get()) / totalValueCount.get();
}
/**
* Determines the proportion of occurrence of a particular key. This is
* merely the frequency of that key divided by the total number of key
* frequencies.
*
* @param element the key for which the proportion is being sought
* @return the proportion of the key
*/
public double proportion(K element) {
double frequency = data.get(element).size();
return frequency / totalValueCount.get();
}
/**
* <p>
* <strong>NOTE:</strong> This implementation will replace all the existing
* values mapped from {@code key} with those specified in the {@code value}.
* If you want "merge-like" functionality call the {@link #merge(Set)}
* method.
* </p>
* {@inheritDoc}
*/
@Override
public Set<V> put(K key, Set<V> value) {
Set<V> stored = newHashSetNullSafe(data.get(key));
if(stored == null) {
data.put(key, new ValueSetWrapper(key));
}
for (V element : MoreObjects.firstNonNull(stored,
Collections.<V> emptySet())) {
delete(key, element);
}
for (V element : value) {
insert(key, element);
}
return stored;
}
@SuppressWarnings("unchecked")
@Override
public Set<V> remove(Object key) {
Set<V> stored = newHashSetNullSafe(data.get(key));
if(stored != null) {
for (V element : stored) {
delete((K) key, element); // type cast is valid because the
// presence of elements over which to
// iterate ensures that #put(K key, V
// value) was called, which performs
// type checking
}
}
Set<V> values = data.get(key);
if(values != null && values.isEmpty()) {
data.remove(key);
}
return stored;
}
/**
* Return a relative measure of the statistical dispersion in this data.
* <p>
* There are several ways to measure statistical dispersion, so callers
* should not rely on a specific underlying implementation because it may
* change over time. This method simply offers a value that allows for
* comparison of dispersion across data sets.
* </p>
* <p>
* A larger dispersion value means that the data is more spread out whereas
* a smaller dispersion value indicates the opposite.
* </p>
*
* @return the dispersion value for this data
*/
public double spread() {
// Get the quartile coefficient of dispersion, which is a cross
// dataset mechanism for comparing the relative dispersion of data.
double[] frequencies = new double[size()];
AtomicInteger index = new AtomicInteger(0);
data.values().forEach(
records -> frequencies[index.getAndIncrement()] = records
.size());
DescriptiveStatistics stats = new DescriptiveStatistics(frequencies);
double p1 = stats.getPercentile(25);
double p3 = stats.getPercentile(75);
double coefficientOfDispersion = (p3 - p1) / (p3 + p1);
// Grab the coefficient of variance
double coefficientOfVariance = stats.getStandardDeviation()
/ stats.getMean();
// Calculate the average absolute deviation from the mean
double[] deviations = new double[frequencies.length];
for (int i = 0; i < deviations.length; ++i) {
deviations[i] = Math.abs(frequencies[i] - stats.getMean());
}
double averageAbsoluteDeviation = StatUtils.mean(deviations)
/ stats.getMean();
// Apply a weighting to the various components
return (0.50 * coefficientOfDispersion) + (0.40 * coefficientOfVariance)
+ (0.10 * averageAbsoluteDeviation);
}
@Override
public String toString() {
return data.toString();
}
/**
* Calculates the uniqueness of the data by summing the squares of the
* proportions of each key within the {@link #keySet() key set},
* determining the square root of the sum, and subtracting it from 1. This
* always results in a number between 0 and 1.
* <p>
* For datasets with a large number of distinct values appearing in
* relatively similar frequency, this function returns a relatively high
* number, since there are many unique values. Mathematically, each
* contributes a small amount to the proportion, so the square root term is
* small, returning a large end result.
* </p>
* <p>
* Conversely, for datasets with a few dominating values, this function
* returns a fairly low number. This is because the higher proportions from
* the dominating values contribute more heavily towards the sum of squares.
* The square root is therefore higher, and when subtracted from 1, returns
* a lower number.
* </p>
*
* @return the uniqueness of the data, on a scale from 0 to 1
*/
public double uniqueness() {
double sumOfSquares = 0;
for (K key : this.keySet()) {
sumOfSquares += Math.pow(proportion(key), 2);
}
return 1 - Math.sqrt(sumOfSquares);
}
/**
* Determines how many unique values exist within the {@link Map} and
* returns the appropriate {@link VariableType}.
*
* The three possible return types are:
* <ol>
* <li><strong>DICHOTOMOUS</strong>: if there are 1 or 2 unique values</li>
* <li><strong>NOMINAL</strong>: if the number of unique values is greater
* than 2 and less than or equal to 12</li>
* <li><strong>INTERVAL</strong>: if there are more than 12 unique
* values</li>
* </ol>
*
* @return
*/
public VariableType variableType() {
// NOTE: The boundary between nominal and interval is arbitrary, and may
// require tweaking since it is a heuristic model.
if(data.keySet().size() <= 2) {
return VariableType.DICHOTOMOUS;
}
else if(data.keySet().size() <= 12) {
return VariableType.NOMINAL;
}
else {
return VariableType.INTERVAL;
}
}
/**
* Return a new {@link Set} (of the appropriate type) to use for storing the
* values that are mapped from a key.
*
* @return a new {@link Set}
*/
protected abstract Set<V> createValueSet();
/**
* A broad classification of objects that describes the nature of the data.
*
* @author Jeff Nelson
*/
public static enum DataType {
BOOLEAN, LINK, NUMBER, STRING, UNKNOWN;
}
/**
* A classification of objects that describes how data is categorized
*/
public static enum VariableType {
DICHOTOMOUS, INTERVAL, NOMINAL;
}
/**
* An internal wrapper around a Set returned from the
* {@link #createValueSet()} method.
* <p>
* The wrapper is responsible for tracking stats for the individual set and
* updating the appropriate variables of the outer class. This ensures that
* the caller can interact with individual value sets without breaking
* tracking semantics.
* </p>
*
* @author Jeff Nelson
*/
private class ValueSetWrapper extends AbstractSet<V> {
/**
* The key from which this {@link Set} is mapped in the outer
* TrackingMultimap.
*/
private K key;
/**
* The wrapped set that actually stores the data.
*/
private final Set<V> values = createValueSet();
/**
* Construct a new instance.
*
* @param key
*/
ValueSetWrapper(K key) {
this.key = key;
}
@Override
public boolean add(V element) {
boolean contained = hasValue(element);
if(values.add(element)) {
totalValueCount.incrementAndGet();
DataType keyType = getDataType(key);
keyTypes.get(keyType).incrementAndGet();
if(!contained) {
// The value was not previously contained, so we must update
// the number of unique values stored across all the keys.
uniqueValueCount.incrementAndGet();
valueCache.set(Math.abs(element.hashCode()));
}
return true;
}
else {
return false;
}
}
@SuppressWarnings("unchecked")
@Override
public boolean equals(Object obj) {
if(obj instanceof TrackingMultimap.ValueSetWrapper) {
return values.equals(((ValueSetWrapper) obj).values);
}
else if(obj instanceof Set) {
return Objects.equals(values, obj);
}
else {
return false;
}
}
@Override
public int hashCode() {
return values.hashCode();
}
@Override
public Iterator<V> iterator() {
return new Iterator<V>() {
/**
* The delegate iterator that controls state.
*/
private final Iterator<V> delegate = values.iterator();
/**
* The last value returned from the {@link #next()} method.
*/
private V next = null;
@Override
public boolean hasNext() {
return delegate.hasNext();
}
@Override
public V next() {
next = delegate.next();
return next;
}
@Override
public void remove() {
ValueSetWrapper.this.remove(next);
next = null;
}
};
}
@SuppressWarnings("unchecked")
@Override
public boolean remove(Object element) {
if(values.remove(element)) {
totalValueCount.decrementAndGet();
DataType keyType = getDataType(key);
keyTypes.get(keyType).decrementAndGet();
boolean contained = hasValue((V) element);
if(!contained) {
// Since the value is no longer "contained" we are free to
// decrement the number of unique values stored across all
// the keys
uniqueValueCount.decrementAndGet();
}
return true;
}
else {
return false;
}
}
@Override
public int size() {
return values.size();
}
@Override
public String toString() {
return values.toString();
}
}
}