/*
* ARX: Powerful Data Anonymization
* Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deidentifier.arx.framework.check.distribution;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.deidentifier.arx.DataType;
import org.deidentifier.arx.DataType.DataTypeWithRatioScale;
import cern.colt.GenericSorting;
import cern.colt.Swapper;
import cern.colt.function.IntComparator;
import cern.colt.list.DoubleArrayList;
/**
* This abstract class represents a function that aggregates values from a frequency distribution
*
* @author Florian Kohlmayer
* @author Fabian Prasser
*/
public abstract class DistributionAggregateFunction implements Serializable {
/**
 * Aggregate function that represents a distribution by the arithmetic mean of its values.
 * The data type handed to {@link #initialize(String[], DataType, int[][])} is cast to
 * {@link DataTypeWithRatioScale}, so it has to be a ratio-scale type.
 *
 * @author Florian Kohlmayer
 * @author Fabian Prasser
 */
public static class DistributionAggregateFunctionArithmeticMean extends DistributionAggregateFunction {

    /** SVUID. */
    private static final long serialVersionUID = 8379579591466576517L;

    /** Reusable accumulator; transient, created in {@link #initialize(String[], DataType, int[][])}. */
    private transient DescriptiveStatistics stats;

    /** Lower bound used to normalize the error; derived from the dictionary if not set */
    private Double minimum = null;

    /** Upper bound used to normalize the error; derived from the dictionary if not set */
    private Double maximum = null;

    /**
     * Creates a new, uninitialized instance.
     *
     * @param ignoreMissingData whether missing values are skipped instead of being counted as 0
     */
    public DistributionAggregateFunctionArithmeticMean(boolean ignoreMissingData) {
        super(ignoreMissingData, true);
    }

    /**
     * Constructor used by {@link #clone()}; carries over already known bounds.
     *
     * @param ignoreMissingData whether missing values are skipped
     * @param minimum known lower bound, may be null
     * @param maximum known upper bound, may be null
     */
    private DistributionAggregateFunctionArithmeticMean(boolean ignoreMissingData,
                                                        Double minimum,
                                                        Double maximum) {
        super(ignoreMissingData, true);
        this.maximum = maximum;
        this.minimum = minimum;
    }

    /**
     * Formats the arithmetic mean of all values in the distribution.
     */
    @Override
    public <T> String aggregate(Distribution distribution) {
        stats.clear();
        @SuppressWarnings("unchecked")
        final DataType<T> formatter = (DataType<T>) this.type;
        @SuppressWarnings("unchecked")
        final DataTypeWithRatioScale<T> ratioType = (DataTypeWithRatioScale<T>) this.type;

        // Collect the raw values (no offset) and convert the mean back into the type's domain
        addAll(stats, distribution, ratioType, 0d);
        final T mean = ratioType.fromDouble(stats.getMean());
        return formatter.format(mean);
    }

    /**
     * Creates a copy with the same configuration; the copy is initialized
     * if this instance has been initialized.
     */
    @Override
    public DistributionAggregateFunctionArithmeticMean clone() {
        final DistributionAggregateFunctionArithmeticMean copy =
                new DistributionAggregateFunctionArithmeticMean(this.ignoreMissingData,
                                                                this.minimum,
                                                                this.maximum);
        if (dictionary == null) {
            return copy;
        }
        copy.initialize(dictionary, type, hierarchy);
        return copy;
    }

    /**
     * Returns the normalized mean squared error of the values relative to their arithmetic mean.
     */
    @Override
    public <T> double getError(Distribution distribution) {
        stats.clear();
        @SuppressWarnings("unchecked")
        final DataTypeWithRatioScale<T> ratioType = (DataTypeWithRatioScale<T>) this.type;
        addAll(stats, distribution, ratioType, 0d);
        return getNMSE(minimum, maximum, stats.getValues(), stats.getMean());
    }

    /**
     * Stores the context, creates the accumulator and derives the bounds
     * from the dictionary unless both are already known.
     */
    @Override
    public void initialize(String[] dictionary, DataType<?> type, int[][] hierarchy) {
        super.initialize(dictionary, type, hierarchy);
        this.stats = new DescriptiveStatistics();
        if (minimum != null && maximum != null) {
            return;
        }
        final double[] bounds = getMinMax(dictionary, (DataTypeWithRatioScale<?>) type);
        this.minimum = bounds[0];
        this.maximum = bounds[1];
    }
}
/**
 * Aggregate function that represents a distribution by the lowest generalization
 * (according to the hierarchy) on which all of its values agree.
 *
 * @author Fabian Prasser
 * @author Florian Kohlmayer
 *
 */
public static class DistributionAggregateFunctionGeneralization extends DistributionAggregateFunction {

    /** SVUID. */
    private static final long serialVersionUID = 5010485066464965464L;

    /**
     * Creates a new instance
     *
     * @param ignoreMissingData forwarded to the base class
     */
    public DistributionAggregateFunctionGeneralization(boolean ignoreMissingData) {
        super(ignoreMissingData, false);
    }

    /**
     * Walks over all values of the distribution and raises the generalization level until
     * every value maps to the same generalized value. Returns {@link DataType#ANY_VALUE}
     * if the hierarchy is exhausted before that happens.
     */
    @Override
    public <T> String aggregate(Distribution distribution) {

        // cursor[0]: value read last, cursor[1]: offset to read next
        final int[] buckets = distribution.getBuckets();
        final int[] cursor = { -1, 0 };
        read(buckets, cursor);

        int latest = cursor[0];
        int level = 0;
        int generalized = hierarchy[latest][0];

        while (read(buckets, cursor)) {
            final int[] earlierPath = hierarchy[latest];
            latest = cursor[0];
            final int[] latestPath = hierarchy[latest];

            // Climb until the new value agrees with the current generalization
            while (latestPath[level] != generalized) {
                level++;
                if (level == earlierPath.length) {
                    return DataType.ANY_VALUE;
                }
                generalized = earlierPath[level];
            }
        }
        return dictionary[generalized];
    }

    /**
     * Creates a copy with the same configuration; the copy is initialized
     * if this instance has been initialized.
     */
    @Override
    public DistributionAggregateFunctionGeneralization clone() {
        final DistributionAggregateFunctionGeneralization copy =
                new DistributionAggregateFunctionGeneralization(this.ignoreMissingData);
        if (dictionary == null) {
            return copy;
        }
        copy.initialize(dictionary, type, hierarchy);
        return copy;
    }

    /**
     * Returns the generalization level required to unify all values, divided by the
     * index of the highest level of the first hierarchy row.
     */
    @Override
    public <T> double getError(Distribution distribution) {

        // cursor[0]: value read last, cursor[1]: offset to read next
        final int[] buckets = distribution.getBuckets();
        final int[] cursor = { -1, 0 };
        read(buckets, cursor);

        int latest = cursor[0];
        int level = 0;
        int generalized = hierarchy[latest][0];

        scan: while (read(buckets, cursor)) {
            final int[] earlierPath = hierarchy[latest];
            latest = cursor[0];
            final int[] latestPath = hierarchy[latest];

            while (latestPath[level] != generalized) {
                level++;
                // The top-most level has been reached, no need to look any further
                if (level == earlierPath.length - 1) {
                    break scan;
                }
                generalized = earlierPath[level];
            }
        }

        // Normalize the level
        return (double) level / (double) (hierarchy[0].length - 1);
    }

    /**
     * Advances to the next used bucket (entries with value -1 are skipped).
     *
     * @param buckets array of (value, frequency) pairs
     * @param state state[0] receives the value read, state[1] holds the next offset
     * @return True, if data was read
     */
    private boolean read(int[] buckets, int[] state) {
        int offset = state[1];
        while (offset < buckets.length && buckets[offset] == -1) {
            offset += 2;
        }
        if (offset < buckets.length) {
            state[0] = buckets[offset];
            state[1] = offset + 2;
            return true;
        }
        state[1] = offset;
        return false;
    }
}
/**
 * Aggregate function that represents a distribution by the geometric mean of its values.
 * All values are shifted by +1 before the mean is computed and the result is shifted back by -1.
 * The data type handed to {@link #initialize(String[], DataType, int[][])} is cast to
 * {@link DataTypeWithRatioScale}, so it has to be a ratio-scale type.
 *
 * @author Florian Kohlmayer
 * @author Fabian Prasser
 */
public static class DistributionAggregateFunctionGeometricMean extends DistributionAggregateFunction {

    /** SVUID. */
    private static final long serialVersionUID = -3835477735362966307L;

    /** Reusable accumulator; transient, created in {@link #initialize(String[], DataType, int[][])}. */
    private transient DescriptiveStatistics stats;

    /** Lower bound used to normalize the error; derived from the dictionary if not set */
    private Double minimum = null;

    /** Upper bound used to normalize the error; derived from the dictionary if not set */
    private Double maximum = null;

    /**
     * Creates a new, uninitialized instance.
     *
     * @param ignoreMissingData whether missing values are skipped instead of being counted as 0
     */
    public DistributionAggregateFunctionGeometricMean(boolean ignoreMissingData) {
        super(ignoreMissingData, true);
    }

    /**
     * Constructor used by {@link #clone()}; carries over already known bounds.
     *
     * @param ignoreMissingData whether missing values are skipped
     * @param minimum known lower bound, may be null
     * @param maximum known upper bound, may be null
     */
    private DistributionAggregateFunctionGeometricMean(boolean ignoreMissingData,
                                                       Double minimum,
                                                       Double maximum) {
        super(ignoreMissingData, true);
        this.maximum = maximum;
        this.minimum = minimum;
    }

    /**
     * Formats the (shifted) geometric mean of all values in the distribution.
     */
    @Override
    public <T> String aggregate(Distribution distribution) {
        stats.clear();
        @SuppressWarnings("unchecked")
        final DataType<T> formatter = (DataType<T>) this.type;
        @SuppressWarnings("unchecked")
        final DataTypeWithRatioScale<T> ratioType = (DataTypeWithRatioScale<T>) this.type;

        // Shift by +1 when collecting, undo the shift on the result
        addAll(stats, distribution, ratioType, 1d);
        final T mean = ratioType.fromDouble(stats.getGeometricMean() - 1d);
        return formatter.format(mean);
    }

    /**
     * Creates a copy with the same configuration; the copy is initialized
     * if this instance has been initialized.
     */
    @Override
    public DistributionAggregateFunctionGeometricMean clone() {
        final DistributionAggregateFunctionGeometricMean copy =
                new DistributionAggregateFunctionGeometricMean(this.ignoreMissingData,
                                                               this.minimum,
                                                               this.maximum);
        if (dictionary == null) {
            return copy;
        }
        copy.initialize(dictionary, type, hierarchy);
        return copy;
    }

    /**
     * Returns the normalized mean squared error of the collected (shifted) values
     * relative to the geometric mean after undoing the shift.
     */
    @Override
    public <T> double getError(Distribution distribution) {
        stats.clear();
        @SuppressWarnings("unchecked")
        final DataTypeWithRatioScale<T> ratioType = (DataTypeWithRatioScale<T>) this.type;
        addAll(stats, distribution, ratioType, 1d);
        return getNMSE(minimum, maximum, stats.getValues(), stats.getGeometricMean() - 1d);
    }

    /**
     * Stores the context, creates the accumulator and derives the bounds
     * from the dictionary unless both are already known.
     */
    @Override
    public void initialize(String[] dictionary, DataType<?> type, int[][] hierarchy) {
        super.initialize(dictionary, type, hierarchy);
        this.stats = new DescriptiveStatistics();
        if (minimum != null && maximum != null) {
            return;
        }
        final double[] bounds = getMinMax(dictionary, (DataTypeWithRatioScale<?>) type);
        this.minimum = bounds[0];
        this.maximum = bounds[1];
    }
}
/**
 * Aggregate function that represents a distribution by the interval
 * "[min, max]" spanned by its values.
 *
 * @author Fabian Prasser
 * @author Florian Kohlmayer
 *
 */
public static class DistributionAggregateFunctionInterval extends DistributionAggregateFunction {

    /** SVUID. */
    private static final long serialVersionUID = 2349775566497080868L;

    /**
     * Creates a new instance.
     *
     * @param ignoreMissingData forwarded to the base class
     */
    public DistributionAggregateFunctionInterval(boolean ignoreMissingData) {
        super(ignoreMissingData, false);
    }

    /**
     * Determines the smallest and largest value contained in the distribution and formats
     * them as an interval. Returns {@link DataType#NULL_VALUE} if no bounds could be determined.
     */
    @Override
    public <T> String aggregate(Distribution distribution) {

        @SuppressWarnings("unchecked")
        final DataType<T> typed = (DataType<T>) this.type;
        T lowest = null;
        T highest = null;

        // Buckets are (value, frequency) pairs; a value of -1 marks an unused slot
        final int[] buckets = distribution.getBuckets();
        for (int slot = 0; slot < buckets.length; slot += 2) {
            final int code = buckets[slot];
            if (code == -1) {
                continue;
            }
            final T candidate = typed.parse(dictionary[code]);
            if (lowest == null || typed.compare(candidate, lowest) < 0) {
                lowest = candidate;
            }
            if (highest == null || typed.compare(candidate, highest) > 0) {
                highest = candidate;
            }
        }

        if (lowest == null || highest == null) {
            return DataType.NULL_VALUE;
        }
        return "[" + typed.format(lowest) + ", " + typed.format(highest) + "]";
    }

    /**
     * Creates a copy with the same configuration; the copy is initialized
     * if this instance has been initialized.
     */
    @Override
    public DistributionAggregateFunctionInterval clone() {
        final DistributionAggregateFunctionInterval copy =
                new DistributionAggregateFunctionInterval(this.ignoreMissingData);
        if (dictionary == null) {
            return copy;
        }
        copy.initialize(dictionary, type, hierarchy);
        return copy;
    }

    /**
     * The error of an interval is the share of dictionary values it covers.
     */
    @Override
    public <T> double getError(Distribution distribution) {
        return getInformationLoss(distribution);
    }
}
/**
 * Aggregate function that represents a distribution by its median.
 * <p>
 * For an even number of observations the two central values are averaged if the data type
 * is a {@link DataTypeWithRatioScale}; otherwise the median is only defined if both central
 * values are equal, and {@link DataType#NULL_VALUE} is returned if they differ.
 *
 * @author Fabian Prasser
 * @author Florian Kohlmayer
 *
 */
public static class DistributionAggregateFunctionMedian extends DistributionAggregateFunction {

    /** SVUID. */
    private static final long serialVersionUID = 4877214760061314248L;

    /** Lower bound used to normalize the error; derived from the dictionary if not set */
    private Double minimum = null;

    /** Upper bound used to normalize the error; derived from the dictionary if not set */
    private Double maximum = null;

    /**
     * Instantiates.
     *
     * @param ignoreMissingData whether missing values are skipped when computing the error
     */
    public DistributionAggregateFunctionMedian(boolean ignoreMissingData) {
        super(ignoreMissingData, true);
    }

    /**
     * Clone constructor
     *
     * @param ignoreMissingData whether missing values are skipped when computing the error
     * @param minimum known lower bound, may be null
     * @param maximum known upper bound, may be null
     */
    private DistributionAggregateFunctionMedian(boolean ignoreMissingData,
                                                Double minimum,
                                                Double maximum) {
        this(ignoreMissingData);
        this.minimum = minimum;
        this.maximum = maximum;
    }

    /**
     * Computes the median of the distribution.
     *
     * @param distribution the distribution to aggregate
     * @return the formatted median, or {@link DataType#NULL_VALUE} if the distribution contains
     *         no values or if the median is not defined (even number of observations with
     *         differing central values on a type without ratio scale)
     */
    @Override
    public <T> String aggregate(Distribution distribution) {

        @SuppressWarnings("unchecked")
        final DataType<T> type = (DataType<T>)this.type;

        // Parallel lists: distinct values and their frequencies
        final List<T> values = new ArrayList<T>();
        final List<Integer> frequencies = new ArrayList<Integer>();

        // Collect; buckets are (value, frequency) pairs, a value of -1 marks an unused slot
        int[] buckets = distribution.getBuckets();
        for (int i = 0; i < buckets.length; i += 2) {
            int value = buckets[i];
            if (value != -1) {
                int frequency = buckets[i + 1];
                values.add(type.parse(dictionary[value]));
                frequencies.add(frequency);
            }
        }

        // An empty distribution has no median; without this guard the lookups below
        // would fail with an IndexOutOfBoundsException
        if (values.isEmpty()) {
            return DataType.NULL_VALUE;
        }

        // Sort values and keep the frequencies aligned
        GenericSorting.mergeSort(0, values.size(), new IntComparator() {
            @Override
            public int compare(int arg0, int arg1) {
                return type.compare(values.get(arg0), values.get(arg1));
            }
        }, new Swapper() {
            @Override
            public void swap(int arg0, int arg1) {
                T temp = values.get(arg0);
                values.set(arg0, values.get(arg1));
                values.set(arg1, temp);
                Integer temp2 = frequencies.get(arg0);
                frequencies.set(arg0, frequencies.get(arg1));
                frequencies.set(arg1, temp2);
            }
        });

        // Accumulate: afterwards frequencies[i] is the index of the last observation
        // (in sorted order) that carries values[i]
        int total = 0;
        for (int i = 0; i < frequencies.size(); i++) {
            total += frequencies.get(i);
            frequencies.set(i, total - 1);
        }

        // Odd number of observations: the central one is the median
        if (total % 2 == 1) {
            return type.format(getValueAt(values, frequencies, total / 2));
        }

        // Even number of observations on a ratio scale: average the two central values
        if (type instanceof DataTypeWithRatioScale) {
            @SuppressWarnings("unchecked")
            DataTypeWithRatioScale<T> rType = (DataTypeWithRatioScale<T>) type;
            double median1 = rType.toDouble(getValueAt(values, frequencies, total / 2 - 1));
            double median2 = rType.toDouble(getValueAt(values, frequencies, total / 2));
            return rType.format(rType.fromDouble((median1 + median2) / 2d));
        }

        // Even number of observations without ratio scale: only defined if both agree.
        // Null-safe comparison; a null lower value with a non-null upper value no longer throws.
        T median1 = getValueAt(values, frequencies, total / 2 - 1);
        T median2 = getValueAt(values, frequencies, total / 2);
        boolean same = median1 == null ? median2 == null : median1.equals(median2);
        return same ? type.format(median1) : DataType.NULL_VALUE;
    }

    /**
     * Creates a copy with the same configuration; the copy is initialized
     * if this instance has been initialized.
     */
    @Override
    public DistributionAggregateFunctionMedian clone() {
        DistributionAggregateFunctionMedian result = new DistributionAggregateFunctionMedian(this.ignoreMissingData,
                                                                                             this.minimum,
                                                                                             this.maximum);
        if (dictionary != null) {
            result.initialize(dictionary, type, hierarchy);
        }
        return result;
    }

    /**
     * Returns the normalized mean squared error of the values relative to the median.
     *
     * @param distribution the distribution
     * @return 0 if the data type has no ratio scale, 1 if no median could be determined,
     *         the normalized mean squared error otherwise
     */
    @Override
    public <T> double getError(Distribution distribution) {

        if (!(type instanceof DataTypeWithRatioScale)) {
            return 0d;
        }

        @SuppressWarnings("unchecked")
        DataTypeWithRatioScale<T> rType = (DataTypeWithRatioScale<T>) this.type;

        // Collect all values; missing values are either skipped or counted as 0
        DoubleArrayList list = new DoubleArrayList();
        Iterator<Double> it = DistributionIterator.createIteratorDouble(distribution, dictionary, rType);
        while (it.hasNext()) {
            Double value = it.next();
            value = value == null ? (ignoreMissingData ? null : 0d) : value;
            if (value != null) {
                list.add(value);
            }
        }

        // Determine and check the median; compare by content, not by reference
        String median = aggregate(distribution);
        if (DataType.NULL_VALUE.equals(median)) {
            return 1d;
        }

        // Compute error
        return getNMSE(minimum, maximum, Arrays.copyOf(list.elements(), list.size()),
                       rType.toDouble(rType.parse(median)));
    }

    /**
     * Stores the context and, for ratio-scale types, derives the bounds from the
     * dictionary unless both are already known.
     */
    @Override
    public void initialize(String[] dictionary, DataType<?> type, int[][] hierarchy) {
        super.initialize(dictionary, type, hierarchy);
        if (type instanceof DataTypeWithRatioScale) {
            if (minimum == null || maximum == null) {
                double[] values = getMinMax(dictionary, (DataTypeWithRatioScale<?>)type);
                this.minimum = values[0];
                this.maximum = values[1];
            }
        }
    }

    /**
     * Returns the value of the observation at the given position of the sorted sequence.
     *
     * @param values distinct values in sorted order
     * @param frequencies for each value, the index of the last observation carrying it
     * @param index position of the observation
     * @return the value at the given position
     */
    private <T> T getValueAt(List<T> values, List<Integer> frequencies, int index) {
        int pointer = 0;
        // Skip all values whose observations end before the requested position
        while (frequencies.get(pointer) < index) {
            pointer++;
        }
        return values.get(pointer);
    }
}
/**
 * Aggregate function that represents a distribution by its most frequent value.
 *
 * @author Fabian Prasser
 * @author Florian Kohlmayer
 *
 */
public static class DistributionAggregateFunctionMode extends DistributionAggregateFunction {

    /** SVUID. */
    private static final long serialVersionUID = -3424849372778696640L;

    /** Lower bound used to normalize the error */
    private double minimum = 0d;

    /** Upper bound used to normalize the error */
    private double maximum = 0d;

    /**
     * Creates a new, uninitialized instance.
     *
     * @param ignoreMissingData whether missing values are skipped when computing the error
     */
    public DistributionAggregateFunctionMode(boolean ignoreMissingData) {
        super(ignoreMissingData, true);
    }

    /**
     * Constructor used by {@link #clone()}.
     *
     * @param ignoreMissingData whether missing values are skipped when computing the error
     * @param minimum lower bound
     * @param maximum upper bound
     */
    private DistributionAggregateFunctionMode(boolean ignoreMissingData,
                                              double minimum,
                                              double maximum) {
        super(ignoreMissingData, true);
        this.maximum = maximum;
        this.minimum = minimum;
    }

    /**
     * Returns the dictionary entry of the most frequent value, or
     * {@link DataType#NULL_VALUE} if the distribution contains no value.
     */
    @Override
    public <T> String aggregate(Distribution distribution) {
        final int winner = getMode(distribution);
        if (winner == -1) {
            return DataType.NULL_VALUE;
        }
        return dictionary[winner];
    }

    /**
     * Creates a copy with the same configuration; the copy is initialized
     * if this instance has been initialized.
     */
    @Override
    public DistributionAggregateFunctionMode clone() {
        final DistributionAggregateFunctionMode copy =
                new DistributionAggregateFunctionMode(this.ignoreMissingData,
                                                      this.minimum,
                                                      this.maximum);
        if (dictionary == null) {
            return copy;
        }
        copy.initialize(dictionary, type, hierarchy);
        return copy;
    }

    /**
     * Returns 0 if the data type has no ratio scale, 1 if there is no mode, and the
     * normalized mean squared error of the values relative to the mode otherwise.
     */
    @Override
    public <T> double getError(Distribution distribution) {

        if (!(type instanceof DataTypeWithRatioScale)) {
            return 0d;
        }

        @SuppressWarnings("unchecked")
        final DataTypeWithRatioScale<T> ratioType = (DataTypeWithRatioScale<T>) this.type;

        // Collect all values; missing values are either skipped or counted as 0
        final DoubleArrayList observed = new DoubleArrayList();
        final Iterator<Double> iterator = DistributionIterator.createIteratorDouble(distribution, dictionary, ratioType);
        while (iterator.hasNext()) {
            Double next = iterator.next();
            if (next == null) {
                if (ignoreMissingData) {
                    continue;
                }
                next = 0d;
            }
            observed.add(next);
        }

        // Without a mode there is nothing to compare against
        final int winner = getMode(distribution);
        if (winner == -1) {
            return 1d;
        }

        // Error relative to the mode
        return getNMSE(minimum, maximum, Arrays.copyOf(observed.elements(), observed.size()),
                       ratioType.toDouble(ratioType.parse(dictionary[winner])));
    }

    /**
     * Stores the context and, for ratio-scale types, derives the bounds from the dictionary.
     */
    @Override
    public void initialize(String[] dictionary, DataType<?> type, int[][] hierarchy) {
        super.initialize(dictionary, type, hierarchy);
        if (!(type instanceof DataTypeWithRatioScale)) {
            return;
        }
        final double[] bounds = getMinMax(dictionary, (DataTypeWithRatioScale<?>) type);
        this.minimum = bounds[0];
        this.maximum = bounds[1];
    }

    /**
     * Returns the dictionary index of the most frequent element, -1 if there is no such element.
     * If several elements share the highest frequency, the first one encountered wins.
     *
     * @param distribution the distribution
     * @return index of the mode or -1
     */
    private int getMode(Distribution distribution) {
        final int[] buckets = distribution.getBuckets();
        int winner = -1;
        int highest = -1;
        for (int slot = 0; slot < buckets.length; slot += 2) {
            final int code = buckets[slot];
            final int count = buckets[slot + 1];
            if (code == -1 || count <= highest) {
                continue;
            }
            highest = count;
            winner = code;
        }
        return winner;
    }
}
/** SVUID. */
private static final long serialVersionUID = 331877806010996154L;
/** Whether or not null values should be ignored; when false, addAll() counts them as 0 */
protected boolean ignoreMissingData;
/** Stores whether this is a type-preserving function; fixed at construction time */
private final boolean typePreserving;
/** Dictionary; transient, set via initialize() */
protected transient String[] dictionary;
/** Data type of the dictionary entries; transient, set via initialize() */
protected transient DataType<?> type;
/** Hierarchy, indexed as hierarchy[value][level]; transient, set via initialize() */
protected transient int[][] hierarchy;
/**
 * Creates a new function. The dictionary, data type and hierarchy are not
 * set here but later via {@link #initialize(String[], DataType, int[][])}.
 *
 * @param ignoreMissingData whether or not null values should be ignored
 * @param typePreserving whether this is a type-preserving function
 */
public DistributionAggregateFunction(boolean ignoreMissingData,
                                     boolean typePreserving) {
    this.typePreserving = typePreserving;
    this.ignoreMissingData = ignoreMissingData;
}
/**
* This function returns an aggregate value for the given distribution.
* The implementations in this file rely on the dictionary, data type and hierarchy
* that were passed to {@link #initialize(String[], DataType, int[][])}.
*
* @param distribution the frequency distribution to aggregate
* @return the aggregate value as a string
*/
public abstract <T> String aggregate(Distribution distribution);
/**
* Clones this function. The implementations in this file create a new instance with the
* same configuration and initialize it if this instance has a dictionary.
*
* @return the copy
*/
public abstract DistributionAggregateFunction clone();
/**
* Returns the normalized error induced by aggregation. In most cases this will be the mean squared error
* normalized into [0,1]. In case of generalization, it will return the normalized generalization level
* (also called generalization intensity). In case of intervals, it will return the normalized number
* of aggregated values.
*
* @param distribution the frequency distribution that is being aggregated
* @return the normalized error
*/
public abstract <T> double getError(Distribution distribution);
/**
 * Returns the number of distinct values contained in the distribution, divided by the
 * size of the dictionary. For a non-empty distribution this lies in [1/#distinct-values, 1].
 *
 * @param distribution the distribution
 * @return the normalized number of aggregated values
 */
public <T> double getInformationLoss(Distribution distribution) {
    final int[] buckets = distribution.getBuckets();
    int used = 0;
    // Buckets are (value, frequency) pairs; a value of -1 marks an unused slot
    for (int slot = 0; slot < buckets.length; slot += 2) {
        if (buckets[slot] != -1) {
            used++;
        }
    }
    return (double) used / (double) dictionary.length;
}
/**
 * Provides the context this function operates on. Subclasses extend this
 * method to set up additional state.
 *
 * @param dictionary the dictionary of the attribute
 * @param type the data type of the attribute
 * @param hierarchy the generalization hierarchy of the attribute
 */
public void initialize(String[] dictionary, DataType<?> type, int[][] hierarchy) {
    this.hierarchy = hierarchy;
    this.type = type;
    this.dictionary = dictionary;
}
/**
 * Tells whether this function is type-preserving, as specified at construction time.
 *
 * @return true, if this is a type-preserving function
 */
public boolean isTypePreserving() {
    return typePreserving;
}
/**
 * Feeds every value of the distribution into the given statistics object.
 * Missing values are skipped if missing data is ignored, and counted as 0 otherwise.
 *
 * @param statistics receives the values
 * @param distribution source of the values
 * @param type data type used to convert dictionary entries to doubles
 * @param offset will be added to each value before it is stored
 */
protected <T> void addAll(DescriptiveStatistics statistics,
                          Distribution distribution,
                          DataTypeWithRatioScale<T> type,
                          double offset) {
    final Iterator<Double> iterator = DistributionIterator.createIteratorDouble(distribution, dictionary, type);
    while (iterator.hasNext()) {
        Double next = iterator.next();
        if (next == null) {
            if (ignoreMissingData) {
                continue;
            }
            next = 0d;
        }
        statistics.addValue(next + offset);
    }
}
/**
 * Determines the smallest and the largest value of the dictionary.
 * Entries that parse to null are skipped if missing data is ignored.
 * A bound that cannot be converted to a double is reported as 0.
 *
 * @param dictionary the dictionary to scan
 * @param type data type used for parsing, comparing and converting
 * @return array of the form {minimum, maximum}
 */
protected <T> double[] getMinMax(String[] dictionary, DataTypeWithRatioScale<T> type) {
    T smallest = null;
    T largest = null;
    for (int index = 0; index < dictionary.length; index++) {
        final T candidate = type.parse(dictionary[index]);
        if (ignoreMissingData && candidate == null) {
            continue;
        }
        if (smallest == null || type.compare(smallest, candidate) > 0) {
            smallest = candidate;
        }
        if (largest == null || type.compare(largest, candidate) < 0) {
            largest = candidate;
        }
    }

    // Convert both bounds first, then fall back to 0 where no value is available
    final Double lower = type.toDouble(smallest);
    final Double upper = type.toDouble(largest);
    return new double[] { lower == null ? 0d : lower, upper == null ? 0d : upper };
}
/**
 * Calculates the mean square error after normalizing everything into [0,1].
 * <p>
 * Degenerate inputs are handled explicitly so that the result is always a finite number
 * for finite inputs:
 * <ul>
 * <li>no values: there is nothing that could deviate, the error is 0</li>
 * <li>min equals max: the range cannot be normalized; the error is 0 if all values
 *     equal the aggregate and 1 otherwise</li>
 * </ul>
 *
 * @param min lower bound of the domain
 * @param max upper bound of the domain
 * @param values the original values
 * @param aggregate the value that replaces the original values
 * @return the normalized mean squared error
 */
protected double getNMSE(double min, double max, double[] values, double aggregate) {

    // Nothing to compare: avoid 0/0
    if (values.length == 0) {
        return 0d;
    }

    // Empty range: avoid division by zero in the normalization factor
    double range = max - min;
    if (range == 0d) {
        for (int i = 0; i < values.length; i++) {
            if (values[i] != aggregate) {
                return 1d;
            }
        }
        return 0d;
    }

    // Prepare
    double normalizationFactor = 1d / range;
    double normalizedAggregate = (aggregate - min) * normalizationFactor;

    // Sum of squared, normalized deviations
    double nmse = 0d;
    for (int i = 0; i < values.length; i++) {
        double normalizedValue = (values[i] - min) * normalizationFactor;
        double diff = normalizedValue - normalizedAggregate;
        nmse += diff * diff;
    }

    // Normalize and return
    return nmse / (double)values.length;
}
}