/*
* ARX: Powerful Data Anonymization
* Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deidentifier.arx.aggregates;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.math3.stat.descriptive.DescriptiveStatistics;
import org.apache.commons.math3.stat.descriptive.moment.GeometricMean;
import org.deidentifier.arx.ARXLogisticRegressionConfiguration;
import org.deidentifier.arx.DataHandleInternal;
import org.deidentifier.arx.DataHandleInternal.InterruptHandler;
import org.deidentifier.arx.DataScale;
import org.deidentifier.arx.DataType;
import org.deidentifier.arx.DataType.ARXString;
import org.deidentifier.arx.DataType.DataTypeWithRatioScale;
import org.deidentifier.arx.aggregates.StatisticsContingencyTable.Entry;
import org.deidentifier.arx.aggregates.StatisticsSummary.StatisticsSummaryOrdinal;
import org.deidentifier.arx.common.Groupify;
import org.deidentifier.arx.common.Groupify.Group;
import org.deidentifier.arx.common.TupleWrapper;
import org.deidentifier.arx.common.WrappedBoolean;
import org.deidentifier.arx.common.WrappedInteger;
import org.deidentifier.arx.exceptions.ComputationInterruptedException;
import cern.colt.GenericSorting;
import cern.colt.Swapper;
import cern.colt.function.IntComparator;
/**
* A class offering basic descriptive statistics about data handles.
*
* @author Fabian Prasser
*/
public class StatisticsBuilder {
/** The handle. */
private DataHandleInternal handle;
/** The stop flag. */
private volatile WrappedBoolean interrupt = new WrappedBoolean(false);
/** Model */
private final WrappedInteger progress = new WrappedInteger();
/**
* Creates a new instance.
*
* @param handle
*/
public StatisticsBuilder(DataHandleInternal handle) {
    // Keep a reference to the handle; all statistics are computed on demand from it
    this.handle = handle;
}
/**
* Creates a new set of statistics for the given classification task
* @param clazz - The class attribute
* @param config - The configuration
* @throws ParseException
*/
public StatisticsClassification getClassificationPerformance(String clazz, ARXLogisticRegressionConfiguration config) throws ParseException {
    // No explicit features given: delegate with an empty feature array
    String[] noFeatures = new String[0];
    return getClassificationPerformance(noFeatures, clazz, config);
}
/**
* Creates a new set of statistics for the given classification task
* @param features - The feature attributes
* @param clazz - The class attributes
* @param config - The configuration
* @throws ParseException
*/
public StatisticsClassification getClassificationPerformance(String[] features,
                                                             String clazz,
                                                             ARXLogisticRegressionConfiguration config) throws ParseException {
    // Clear the interruption flag and the progress counter before starting the task
    this.interrupt.value = false;
    this.progress.value = 0;
    // The classification object performs the actual computation and reports progress
    // through the shared counter
    return new StatisticsClassification(handle.getAssociatedInput(), handle, features, clazz, config, interrupt, progress);
}
/**
* Returns a contingency table for the given columns.
*
* @param column1 The first column
* @param orderFromDefinition1 Indicates whether the order that should be assumed for string data items
* can (and should) be derived from the hierarchy provided in the data
* definition (if any)
* @param column2 The second column
* @param orderFromDefinition2 Indicates whether the order that should be assumed for string data items
* can (and should) be derived from the hierarchy provided in the data
* definition (if any)
* @return
*/
public StatisticsContingencyTable getContingencyTable(int column1, boolean orderFromDefinition1,
                                                      int column2, boolean orderFromDefinition2) {
    // Resolve the hierarchies (null when no order can be derived) and delegate
    String[][] hierarchy1 = getHierarchy(column1, orderFromDefinition1);
    String[][] hierarchy2 = getHierarchy(column2, orderFromDefinition2);
    return getContingencyTable(column1, hierarchy1, column2, hierarchy2);
}
/**
* Returns a contingency table for the given columns. This method assumes that the
* order of string data items will be derived from the hierarchies provided
* in the data definition (if any)
*
* @param column1 The first column
* @param column2 The second column
* @return
*/
public StatisticsContingencyTable getContingencyTable(int column1, int column2) {
    // Default: derive the order of string items from the hierarchies in the definition
    final boolean orderFromDefinition = true;
    return getContingencyTable(column1, orderFromDefinition, column2, orderFromDefinition);
}
/**
* Returns a contingency table for the given columns.
*
* @param column1 The first column
* @param size1 The maximal size in this dimension
* @param orderFromDefinition1 Indicates whether the order that should be assumed for string data items
* can (and should) be derived from the hierarchy provided in the data
* definition (if any)
* @param column2 The second column
* @param size2 The maximal size in this dimension
* @param orderFromDefinition2 Indicates whether the order that should be assumed for string data items
* can (and should) be derived from the hierarchy provided in the data
* definition (if any)
* @return
*/
public StatisticsContingencyTable getContingencyTable(int column1,
                                                      int size1,
                                                      boolean orderFromDefinition1,
                                                      int column2,
                                                      int size2,
                                                      boolean orderFromDefinition2) {
    // Resolve the hierarchies (null when no order can be derived) and delegate
    String[][] hierarchy1 = getHierarchy(column1, orderFromDefinition1);
    String[][] hierarchy2 = getHierarchy(column2, orderFromDefinition2);
    return getContingencyTable(column1, size1, hierarchy1, column2, size2, hierarchy2);
}
/**
* Returns a contingency table for the given columns. This method assumes that the
* order of string data items can (and should) be derived from the hierarchies provided
* in the data definition (if any)
*
* @param column1 The first column
* @param size1 The maximal size in this dimension
* @param column2 The second column
* @param size2 The maximal size in this dimension
* @return
*/
public StatisticsContingencyTable getContingencyTable(int column1,
                                                      int size1,
                                                      int column2,
                                                      int size2) {
    // Default: derive the order of string items from the hierarchies in the definition
    final boolean orderFromDefinition = true;
    return getContingencyTable(column1, size1, orderFromDefinition,
                               column2, size2, orderFromDefinition);
}
/**
* Returns a contingency table for the given columns. The order for string data items is derived
* from the provided hierarchies
*
* @param column1 The first column
* @param size1 The maximal size in this dimension
* @param hierarchy1 The hierarchy for the first column, may be null
* @param column2 The second column
* @param size2 The maximal size in this dimension
* @param hierarchy2 The hierarchy for the second column, may be null
* @return
*/
public StatisticsContingencyTable getContingencyTable(int column1,
                                                      int size1,
                                                      String[][] hierarchy1,
                                                      int column2,
                                                      int size2,
                                                      String[][] hierarchy2) {
    // Reset stop flag
    interrupt.value = false;
    // Check
    if (size1 <= 0 || size2 <= 0) {
        throw new IllegalArgumentException("Size must be > 0");
    }
    // Obtain default table (full resolution, one cell per distinct value pair)
    StatisticsContingencyTable table = getContingencyTable(column1,
                                                           hierarchy1,
                                                           column2,
                                                           hierarchy2);
    // Check if suitable: no downscaling needed when both dimensions already fit
    if (table.values1.length <= size1 &&
        table.values2.length <= size2) {
        return table;
    }
    // Init
    String[] values1;
    String[] values2;
    double factor1;
    double factor2;
    // Compute factors and values: factor < 1 maps full-resolution indices
    // onto the smaller target scale; a factor of 1 keeps the axis unchanged
    if (table.values1.length > size1) {
        factor1 = (double) size1 / (double) table.values1.length;
        values1 = getScaledValues(table.values1, size1);
    } else {
        factor1 = 1;
        values1 = table.values1;
    }
    if (table.values2.length > size2) {
        factor2 = (double) size2 / (double) table.values2.length;
        values2 = getScaledValues(table.values2, size2);
    } else {
        factor2 = 1;
        values2 = table.values2;
    }
    // Create entry set: re-bucket the original cells, summing the frequencies
    // of all cells that collapse into the same downscaled cell
    final Map<Entry, Double> entries = new HashMap<Entry, Double>();
    Iterator<Entry> iter = table.iterator;
    double max = 0d;
    while (iter.hasNext()) {
        checkInterrupt();
        Entry old = iter.next();
        int index1 = (int) Math.round((double) old.value1 * factor1);
        int index2 = (int) Math.round((double) old.value2 * factor2);
        // Clamp rounded indices into the valid target range
        index1 = index1 < size1 ? index1 : size1 - 1;
        index2 = index2 < size2 ? index2 : size2 - 1;
        Entry entry = new Entry(index1, index2);
        Double previous = entries.get(entry);
        double value = previous != null ? previous + old.frequency : old.frequency;
        max = Math.max(value, max);
        entries.put(entry, value);
    }
    // Create iterator that lazily attaches the aggregated frequency to each entry
    final Iterator<Entry> internal = entries.keySet().iterator();
    final Iterator<Entry> iterator = new Iterator<Entry>() {
        private Map<Entry, Double> _entries = entries;
        private Iterator<Entry> _internal = internal;
        @Override
        public boolean hasNext() {
            if (_internal == null) return false;
            boolean result = _internal.hasNext();
            // Try to release resources as early as possible
            if (!result) {
                _internal = null;
                _entries = null;
            }
            return result;
        }
        @Override
        public Entry next() {
            if (_internal == null) return null;
            Entry e = _internal.next();
            e.frequency = _entries.get(e);
            return e;
        }
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    };
    // Return the downscaled table; count is unchanged by rescaling
    return new StatisticsContingencyTable(values1, values2, table.count, max, iterator);
}
/**
* Returns a contingency table for the given columns. The order for string data items is derived
* from the provided hierarchies
*
* @param column1 The first column
* @param hierarchy1 The hierarchy for the first column, may be null
* @param column2 The second column
* @param hierarchy2 The hierarchy for the second column, may be null
* @return
*/
public StatisticsContingencyTable getContingencyTable(int column1,
                                                      String[][] hierarchy1,
                                                      int column2,
                                                      String[][] hierarchy2) {
    // Reset stop flag
    interrupt.value = false;
    // Init: the ordered distinct values define the two axes of the table
    String[] values1 = getDistinctValuesOrdered(column1, hierarchy1);
    String[] values2 = getDistinctValuesOrdered(column2, hierarchy2);
    // Create maps of indexes (value -> axis position)
    Map<String, Integer> indexes1 = new HashMap<String, Integer>();
    for (int i = 0; i < values1.length; i++) {
        checkInterrupt();
        indexes1.put(values1[i], i);
    }
    Map<String, Integer> indexes2 = new HashMap<String, Integer>();
    for (int i = 0; i < values2.length; i++) {
        checkInterrupt();
        indexes2.put(values2[i], i);
    }
    // Create entry set: count co-occurrences of value pairs per row.
    // NOTE(review): with zero rows, max stays Integer.MIN_VALUE and the maximal
    // frequency below becomes MIN_VALUE/0.0 -- confirm whether empty handles occur here
    int max = Integer.MIN_VALUE;
    final Map<Entry, Integer> entries = new HashMap<Entry, Integer>();
    for (int row = 0; row < handle.getNumRows(); row++) {
        checkInterrupt();
        int index1 = indexes1.get(handle.getValue(row, column1));
        int index2 = indexes2.get(handle.getValue(row, column2));
        Entry entry = new Entry(index1, index2);
        Integer previous = entries.get(entry);
        int value = previous != null ? previous + 1 : 1;
        max = Math.max(max, value);
        entries.put(entry, value);
    }
    // Create iterator that converts absolute counts to relative frequencies on the fly
    final int count = handle.getNumRows();
    final Iterator<Entry> internal = entries.keySet().iterator();
    final Iterator<Entry> iterator = new Iterator<Entry>() {
        private Map<Entry, Integer> _entries = entries;
        private Iterator<Entry> _internal = internal;
        @Override
        public boolean hasNext() {
            if (_internal == null) return false;
            boolean result = _internal.hasNext();
            // Try to release resources as early as possible
            if (!result) {
                _internal = null;
                _entries = null;
            }
            return result;
        }
        @Override
        public Entry next() {
            if (_internal == null) return null;
            Entry e = _internal.next();
            e.frequency = (double) _entries.get(e) / (double) count;
            return e;
        }
        @Override
        public void remove() {
            throw new UnsupportedOperationException();
        }
    };
    // Return result with the maximal relative frequency
    return new StatisticsContingencyTable(values1, values2, count, (double) max / (double) count, iterator);
}
/**
* Returns the distinct set of data items from the given column.
*
* @param column The column
* @return
*/
public String[] getDistinctValues(int column) {
    // Forward this builder's interrupt check into the handle's scan so that
    // long-running scans can be aborted
    InterruptHandler stopCheck = new InterruptHandler() {
        @Override
        public void checkInterrupt() {
            StatisticsBuilder.this.checkInterrupt();
        }
    };
    return this.handle.getDistinctValues(column, stopCheck);
}
/**
* Returns an ordered list of the distinct set of data items from the given column. This method assumes
* that the order of string data items can (and should) be derived from the hierarchy provided in the
* data definition (if any)
*
* @param column The column
* @return
*/
public String[] getDistinctValuesOrdered(int column) {
    // Default: derive the order of string items from the hierarchy in the definition
    final boolean orderFromDefinition = true;
    return getDistinctValuesOrdered(column, orderFromDefinition);
}
/**
* Returns an ordered list of the distinct set of data items from the given column.
*
* @param column The column
* @param orderFromDefinition Indicates whether the order that should be assumed for string data
* items can (and should) be derived from the hierarchy provided in the
* data definition (if any)
* @return
*/
public String[] getDistinctValuesOrdered(int column, boolean orderFromDefinition) {
    // Resolve the hierarchy (null when no order can be derived) and delegate
    String[][] hierarchy = getHierarchy(column, orderFromDefinition);
    return getDistinctValuesOrdered(column, hierarchy);
}
/**
* Returns an ordered list of the distinct set of data items from the given column. This method assumes
* that the order of string data items can (and should) be derived from the provided hierarchy
*
* @param column The column
* @param hierarchy The hierarchy, may be null
* @return
*/
public String[] getDistinctValuesOrdered(int column, String[][] hierarchy) {
    // Reset stop flag
    interrupt.value = false;
    // Obtain list and data type
    final String[] list = getDistinctValues(column);
    final String attribute = handle.getAttributeName(column);
    final DataType<?> datatype = handle.getDataType(attribute);
    final int level = handle.getGeneralization(attribute);
    // Sort by data type when no hierarchy applies or the attribute is not generalized
    if (hierarchy == null || level == 0) {
        sort(list, datatype);
    // Sort by hierarchy and data type
    } else {
        // Build order directly from the hierarchy
        final Map<String, Integer> order = new HashMap<String, Integer>();
        int max = 0; // The order to use for the suppression string
        // Create base order: collect the level-0 values covered by the hierarchy
        Set<String> baseSet = new HashSet<String>();
        DataType<?> baseType = handle.getBaseDataType(attribute);
        for (int i = 0; i < hierarchy.length; i++) {
            String element = hierarchy[i][0];
            checkInterrupt();
            // Make sure that only elements from the hierarchy
            // are added that are included in the data
            // TODO: Calling isValid is only a work-around
            if (baseType.isValid(element)) baseSet.add(element);
        }
        String[] baseArray = baseSet.toArray(new String[baseSet.size()]);
        // Reuse the base data type fetched above instead of querying the handle again
        sort(baseArray, baseType);
        Map<String, Integer> baseOrder = new HashMap<String, Integer>();
        for (int i = 0; i < baseArray.length; i++) {
            checkInterrupt();
            baseOrder.put(baseArray[i], i);
        }
        // Handle optimized handles: consider all levels instead of only the current one
        int lower = handle.isOptimized() ? 0 : level;
        int upper = handle.isOptimized() ? hierarchy[0].length : level + 1;
        // Build higher level order from base order: a generalized value inherits the
        // position of the first base value that maps to it
        for (int i = 0; i < hierarchy.length; i++) {
            checkInterrupt();
            for (int j = lower; j < upper; j++) {
                if (!order.containsKey(hierarchy[i][j])) {
                    Integer position = baseOrder.get(hierarchy[i][0]);
                    if (position != null) {
                        order.put(hierarchy[i][j], position);
                        max = Math.max(position, max) + 1;
                    }
                }
            }
        }
        // Add suppression string, which always sorts last
        order.put(DataType.ANY_VALUE, max);
        // Sort
        sort(list, order);
    }
    // Done
    return list;
}
/**
* Returns statistics about the equivalence classes.
*
* @return
*/
public StatisticsEquivalenceClasses getEquivalenceClassStatistics() {
    // Reset stop flag
    interrupt.value = false;
    // Prepare: collect the column indices of all quasi-identifying attributes
    Set<String> attributes = handle.getDefinition().getQuasiIdentifyingAttributes();
    final int[] indices = new int[attributes.size()];
    int index = 0;
    for (int column = 0; column < handle.getNumColumns(); column++) {
        if (attributes.contains(handle.getAttributeName(column))) {
            indices[index++] = column;
        }
    }
    // Calculate equivalence classes: group rows by their quasi-identifier tuple
    int capacity = handle.getNumRows() / 10;
    capacity = capacity > 10 ? capacity : 10;
    Groupify<TupleWrapper> map = new Groupify<TupleWrapper>(capacity);
    int numRows = handle.getNumRows();
    for (int row = 0; row < numRows; row++) {
        TupleWrapper tuple = new TupleWrapper(handle, indices, row, false);
        map.add(tuple);
        checkInterrupt();
    }
    // Now compute the following values
    double averageEquivalenceClassSize = 0d;
    double averageEquivalenceClassSizeIncludingOutliers = 0d;
    int maximalEquivalenceClassSize = Integer.MIN_VALUE;
    int maximalEquivalenceClassSizeIncludingOutliers = Integer.MIN_VALUE;
    int minimalEquivalenceClassSize = Integer.MAX_VALUE;
    int minimalEquivalenceClassSizeIncludingOutliers = Integer.MAX_VALUE;
    int numberOfEquivalenceClasses = 0;
    int numberOfEquivalenceClassesIncludingOutliers = map.size();
    int numberOfTuples = 0;
    int numberOfOutlyingTuples = 0;
    // Let's do it: single pass over all groups
    boolean containsOutliers = false;
    Group<TupleWrapper> element = map.first();
    while (element != null) {
        checkInterrupt();
        maximalEquivalenceClassSizeIncludingOutliers = Math.max(element.getCount(), maximalEquivalenceClassSizeIncludingOutliers);
        minimalEquivalenceClassSizeIncludingOutliers = Math.min(element.getCount(), minimalEquivalenceClassSizeIncludingOutliers);
        averageEquivalenceClassSizeIncludingOutliers += element.getCount();
        numberOfTuples += element.getCount();
        if (!element.getElement().isOutlier()) {
            maximalEquivalenceClassSize = Math.max(element.getCount(), maximalEquivalenceClassSize);
            minimalEquivalenceClassSize = Math.min(element.getCount(), minimalEquivalenceClassSize);
            averageEquivalenceClassSize += element.getCount();
        } else {
            containsOutliers = true;
            // All suppressed records will collapse into a single group, so we can use the "=" assignment operator here
            numberOfOutlyingTuples = element.getCount();
        }
        element = element.next();
    }
    // The outlier group (at most one, see above) does not count as an equivalence class
    numberOfEquivalenceClasses = numberOfEquivalenceClassesIncludingOutliers;
    if (containsOutliers) {
        numberOfEquivalenceClasses -= 1;
    }
    averageEquivalenceClassSize /= (double)numberOfEquivalenceClasses;
    averageEquivalenceClassSizeIncludingOutliers /= (double)numberOfEquivalenceClassesIncludingOutliers;
    // Fix corner cases (all records suppressed: the division above yielded NaN).
    // NOTE(review): the "IncludingOutliers" counterparts are not reset here; with an
    // empty handle they keep their Integer.MIN/MAX_VALUE sentinels -- confirm intended
    if (numberOfEquivalenceClasses == 0) {
        averageEquivalenceClassSize = 0;
        maximalEquivalenceClassSize = 0;
        minimalEquivalenceClassSize = 0;
    }
    // And return
    return new StatisticsEquivalenceClasses(averageEquivalenceClassSize,
                                            averageEquivalenceClassSizeIncludingOutliers,
                                            maximalEquivalenceClassSize,
                                            maximalEquivalenceClassSizeIncludingOutliers,
                                            minimalEquivalenceClassSize,
                                            minimalEquivalenceClassSizeIncludingOutliers,
                                            numberOfEquivalenceClasses,
                                            numberOfEquivalenceClassesIncludingOutliers,
                                            numberOfTuples,
                                            numberOfOutlyingTuples);
}
/**
* Returns a frequency distribution for the values in the given column. This method assumes that the
* order of string data items can (and should) be derived from the hierarchy provided in the data
* definition (if any)
*
* @param column The column
* @return
*/
public StatisticsFrequencyDistribution getFrequencyDistribution(int column) {
    // Default: derive the order of string items from the hierarchy in the definition
    final boolean orderFromDefinition = true;
    return getFrequencyDistribution(column, orderFromDefinition);
}
/**
* Returns a frequency distribution for the values in the given column.
*
* @param column The column
* @param orderFromDefinition Indicates whether the order that should be assumed for string data items
* can (and should) be derived from the hierarchy provided in the data
* definition (if any)
* @return
*/
public StatisticsFrequencyDistribution getFrequencyDistribution(int column, boolean orderFromDefinition) {
    // Resolve the hierarchy (null when no order can be derived) and delegate
    String[][] hierarchy = getHierarchy(column, orderFromDefinition);
    return getFrequencyDistribution(column, hierarchy);
}
/**
* Returns a frequency distribution for the values in the given column. The order for string data items
* is derived from the provided hierarchy
*
* @param column The column
* @param hierarchy The hierarchy, may be null
* @return
*/
public StatisticsFrequencyDistribution getFrequencyDistribution(int column, String[][] hierarchy) {
    // Reset stop flag
    interrupt.value = false;
    // Init: the ordered distinct values define the distribution's domain
    String[] values = getDistinctValuesOrdered(column, hierarchy);
    double[] frequencies = new double[values.length];
    // Create map of indexes (value -> position in the ordered domain)
    Map<String, Integer> indexes = new HashMap<String, Integer>();
    for (int i = 0; i < values.length; i++) {
        checkInterrupt();
        indexes.put(values[i], i);
    }
    // Count frequencies; hoist the loop-invariant row count out of the loop condition
    final int count = handle.getNumRows();
    for (int row = 0; row < count; row++) {
        checkInterrupt();
        String value = handle.getValue(row, column);
        frequencies[indexes.get(value)]++;
    }
    // Divide by count to obtain relative frequencies
    for (int i = 0; i < frequencies.length; i++) {
        checkInterrupt();
        frequencies[i] /= (double) count;
    }
    // Return
    return new StatisticsFrequencyDistribution(values, frequencies, count);
}
/**
*
* Returns an interruptible instance of this object.
*
* @return
*/
public StatisticsBuilderInterruptible getInterruptibleInstance() {
    // The wrapper operates on the same handle and exposes interruptible variants
    return new StatisticsBuilderInterruptible(handle);
}
/**
* Returns summary statistics for all attributes.
*
* @param listwiseDeletion A flag enabling list-wise deletion
* @return
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
public <T> Map<String, StatisticsSummary<?>> getSummaryStatistics(boolean listwiseDeletion) {
    // Reset stop flag
    interrupt.value = false;
    // Per-attribute accumulators: numeric moments, ordinal summaries, scales, geometric means
    Map<String, DescriptiveStatistics> statistics = new HashMap<String, DescriptiveStatistics>();
    Map<String, StatisticsSummaryOrdinal> ordinal = new HashMap<String, StatisticsSummaryOrdinal>();
    Map<String, DataScale> scales = new HashMap<String, DataScale>();
    Map<String, GeometricMean> geomean = new HashMap<String, GeometricMean>();
    // Detect scales
    for (int col = 0; col < handle.getNumColumns(); col++) {
        // Meta
        String attribute = handle.getAttributeName(col);
        DataType<?> type = handle.getDataType(attribute);
        // Scale
        DataScale scale = type.getDescription().getScale();
        // Try to replace nominal scale with ordinal scale based on base data type
        if (scale == DataScale.NOMINAL && handle.getGeneralization(attribute) != 0) {
            if (!(handle.getBaseDataType(attribute) instanceof ARXString) &&
                getHierarchy(col, true) != null) {
                scale = DataScale.ORDINAL;
            }
        }
        // Store
        scales.put(attribute, scale);
        statistics.put(attribute, new DescriptiveStatistics());
        geomean.put(attribute, new GeometricMean());
        ordinal.put(attribute, getSummaryStatisticsOrdinal(handle.getGeneralization(attribute),
                                                          handle.getDataType(attribute),
                                                          handle.getBaseDataType(attribute),
                                                          getHierarchy(col, true)));
    }
    // Compute summary statistics
    for (int row = 0; row < handle.getNumRows(); row++) {
        // Check, if we should include this row
        boolean include = true;
        if (listwiseDeletion) {
            // Skip the whole row if it is an outlier or any cell is null
            for (int col = 0; col < handle.getNumColumns(); col++) {
                if (handle.isOutlier(row) || DataType.isNull(handle.getValue(row, col))) {
                    include = false;
                    break;
                }
            }
        }
        // Check
        checkInterrupt();
        // If yes, add
        if (include) {
            // For each column
            for (int col = 0; col < handle.getNumColumns(); col++) {
                // Meta
                String value = handle.getValue(row, col);
                String attribute = handle.getAttributeName(col);
                DataType<?> type = handle.getDataType(attribute);
                // Analyze: suppressed ("*") and null values are ignored per-cell
                if (!DataType.isAny(value) && !DataType.isNull(value)) {
                    ordinal.get(attribute).addValue(value);
                    if (type instanceof DataTypeWithRatioScale) {
                        double doubleValue = ((DataTypeWithRatioScale) type).toDouble(type.parse(value));
                        statistics.get(attribute).addValue(doubleValue);
                        // Shifted by +1 so that zero values do not zero out the geometric mean
                        geomean.get(attribute).increment(doubleValue + 1d);
                    }
                }
            }
        }
    }
    // Convert accumulators into one StatisticsSummary per attribute
    Map<String, StatisticsSummary<?>> result = new HashMap<String, StatisticsSummary<?>>();
    for (int col = 0; col < handle.getNumColumns(); col++) {
        // Check
        checkInterrupt();
        // Depending on scale
        String attribute = handle.getAttributeName(col);
        DataScale scale = scales.get(attribute);
        DataType<T> type = (DataType<T>) handle.getDataType(attribute);
        ordinal.get(attribute).analyze();
        if (scale == DataScale.NOMINAL) {
            // Nominal: only counts and mode are meaningful
            StatisticsSummaryOrdinal stats = ordinal.get(attribute);
            result.put(attribute, new StatisticsSummary<T>(DataScale.NOMINAL,
                                                           stats.getNumberOfMeasures(),
                                                           stats.getDistinctNumberOfValues(),
                                                           stats.getMode(),
                                                           type.parse(stats.getMode())));
        } else if (scale == DataScale.ORDINAL) {
            // Ordinal: additionally median, min and max
            StatisticsSummaryOrdinal stats = ordinal.get(attribute);
            result.put(attribute, new StatisticsSummary<T>(DataScale.ORDINAL,
                                                           stats.getNumberOfMeasures(),
                                                           stats.getDistinctNumberOfValues(),
                                                           stats.getMode(),
                                                           type.parse(stats.getMode()),
                                                           stats.getMedian(),
                                                           type.parse(stats.getMedian()),
                                                           stats.getMin(),
                                                           type.parse(stats.getMin()),
                                                           stats.getMax(),
                                                           type.parse(stats.getMax())));
        } else if (scale == DataScale.INTERVAL) {
            // Interval: additionally mean, variance, std.-dev., range, kurtosis
            StatisticsSummaryOrdinal stats = ordinal.get(attribute);
            DescriptiveStatistics stats2 = statistics.get(attribute);
            // Date-typed attributes are rendered as time periods
            boolean isPeriod = type.getDescription().getWrappedClass() == Date.class;
            // TODO: Something is wrong with commons math's kurtosis
            double kurtosis = stats2.getKurtosis();
            kurtosis = kurtosis < 0d ? Double.NaN : kurtosis;
            double range = stats2.getMax() - stats2.getMin();
            double stddev = Math.sqrt(stats2.getVariance());
            result.put(attribute, new StatisticsSummary<T>(DataScale.INTERVAL,
                                                           stats.getNumberOfMeasures(),
                                                           stats.getDistinctNumberOfValues(),
                                                           stats.getMode(),
                                                           type.parse(stats.getMode()),
                                                           stats.getMedian(),
                                                           type.parse(stats.getMedian()),
                                                           stats.getMin(),
                                                           type.parse(stats.getMin()),
                                                           stats.getMax(),
                                                           type.parse(stats.getMax()),
                                                           toString(type, stats2.getMean(), false, false),
                                                           toValue(type, stats2.getMean()),
                                                           stats2.getMean(),
                                                           toString(type, stats2.getVariance(), isPeriod, true),
                                                           toValue(type, stats2.getVariance()),
                                                           stats2.getVariance(),
                                                           toString(type, stats2.getPopulationVariance(), isPeriod, true),
                                                           toValue(type, stats2.getPopulationVariance()),
                                                           stats2.getPopulationVariance(),
                                                           toString(type, stddev, isPeriod, false),
                                                           toValue(type, stddev),
                                                           stddev,
                                                           toString(type, range, isPeriod, false),
                                                           toValue(type, range),
                                                           stats2.getMax() - stats2.getMin(),
                                                           toString(type, kurtosis, isPeriod, false),
                                                           toValue(type, kurtosis),
                                                           kurtosis));
        } else if (scale == DataScale.RATIO) {
            // Ratio: additionally the geometric mean
            StatisticsSummaryOrdinal stats = ordinal.get(attribute);
            DescriptiveStatistics stats2 = statistics.get(attribute);
            GeometricMean geo = geomean.get(attribute);
            // TODO: Something is wrong with commons math's kurtosis
            double kurtosis = stats2.getKurtosis();
            kurtosis = kurtosis < 0d ? Double.NaN : kurtosis;
            double range = stats2.getMax() - stats2.getMin();
            double stddev = Math.sqrt(stats2.getVariance());
            result.put(attribute, new StatisticsSummary<T>(DataScale.RATIO,
                                                           stats.getNumberOfMeasures(),
                                                           stats.getDistinctNumberOfValues(),
                                                           stats.getMode(),
                                                           type.parse(stats.getMode()),
                                                           stats.getMedian(),
                                                           type.parse(stats.getMedian()),
                                                           stats.getMin(),
                                                           type.parse(stats.getMin()),
                                                           stats.getMax(),
                                                           type.parse(stats.getMax()),
                                                           toString(type, stats2.getMean(), false, false),
                                                           toValue(type, stats2.getMean()),
                                                           stats2.getMean(),
                                                           toString(type, stats2.getVariance(), false, false),
                                                           toValue(type, stats2.getVariance()),
                                                           stats2.getVariance(),
                                                           toString(type, stats2.getPopulationVariance(), false, false),
                                                           toValue(type, stats2.getPopulationVariance()),
                                                           stats2.getPopulationVariance(),
                                                           toString(type, stddev, false, false),
                                                           toValue(type, stddev),
                                                           stddev,
                                                           toString(type, range, false, false),
                                                           toValue(type, range),
                                                           range,
                                                           toString(type, kurtosis, false, false),
                                                           toValue(type, kurtosis),
                                                           kurtosis,
                                                           // NOTE(review): the string/typed values use the shift-corrected
                                                           // geometric mean (geo - 1) while the raw double comes from
                                                           // stats2.getGeometricMean() (unshifted) -- confirm intended
                                                           toString(type, geo.getResult() - 1d, false, false),
                                                           toValue(type, geo.getResult() - 1d),
                                                           stats2.getGeometricMean()));
        }
    }
    return result;
}
/**
* Checks whether an interruption happened.
*/
private void checkInterrupt() {
    // Fast path: nothing to do while the shared stop flag is down
    if (!this.interrupt.value) {
        return;
    }
    throw new ComputationInterruptedException("Interrupted");
}
/**
* Returns the appropriate hierarchy, if any.
*
* @param column
* @param orderFromDefinition
* @return
*/
private String[][] getHierarchy(int column, boolean orderFromDefinition) {
    // A hierarchy-derived order only applies to string attributes, and only
    // when the caller requested it
    final String attribute = handle.getAttributeName(column);
    final DataType<?> datatype = handle.getDataType(attribute);
    if (!orderFromDefinition || !(datatype instanceof ARXString)) {
        return null;
    }
    // May be null if the definition provides no hierarchy for this attribute
    return handle.getDefinition().getHierarchy(attribute);
}
/**
* Scales the given string array.
*
* @param values
* @param length The resulting length
* @return
*/
private String[] getScaledValues(String[] values, int length) {
    // Init: a set-aggregate renders each bucket as a label of its member values
    AggregateFunction<String> function = AggregateFunction.forType(DataType.STRING).createSetFunction();
    double factor = (double) length / (double) values.length;
    String[] result = new String[length];
    // Aggregate
    int previous = 0;
    List<String> toAggregate = new ArrayList<String>();
    for (int i = 0; i < values.length; i++) {
        checkInterrupt();
        // Target bucket for source index i, clamped to the last bucket
        int index = (int) Math.round((double) i * factor);
        index = index < length ? index : length - 1;
        if (index != previous) {
            // Bucket boundary crossed: flush the collected values into one label
            result[previous] = function.aggregate(toAggregate.toArray(new String[toAggregate.size()]));
            toAggregate.clear();
            previous = index;
        }
        toAggregate.add(values[i]);
    }
    // Flush the remaining values into the final bucket
    result[length - 1] = function.aggregate(toAggregate.toArray(new String[toAggregate.size()]));
    return result;
}
/**
* Returns a summary statistics object for the given attribute
* @param generalization
* @param dataType
* @param baseDataType
* @param hierarchy
* @return
*/
private <U, V> StatisticsSummaryOrdinal getSummaryStatisticsOrdinal(final int generalization,
                                                                    final DataType<U> dataType,
                                                                    final DataType<V> baseDataType,
                                                                    final String[][] hierarchy) {
    // TODO: It would be cleaner to return an ARXOrderedString for generalized variables
    // TODO: that have a suitable data type directly from the DataHandle
    // Default: compare with the attribute's own data type when no hierarchy-based
    // comparison is possible or necessary
    if (generalization == 0 || !(dataType instanceof ARXString)) {
        return new StatisticsSummaryOrdinal(dataType);
    } else if (baseDataType instanceof ARXString) {
        return new StatisticsSummaryOrdinal(dataType);
    } else if (hierarchy == null) {
        return new StatisticsSummaryOrdinal(dataType);
    } else {
        // Map each generalized value to a representative base value: the first
        // level-0 value of a hierarchy row wins for its row's generalized value
        final Map<String, String> map = new HashMap<String, String>();
        for (int i = 0; i < hierarchy.length; i++) {
            map.put(hierarchy[i][generalization], hierarchy[i][0]);
        }
        // Compare generalized values through their representative base values
        return new StatisticsSummaryOrdinal(new Comparator<String>() {
            public int compare(String o1, String o2) {
                // Unparsable or unmapped values fall back to null; the final
                // comparison then reports equality via the catch below
                V _o1 = null;
                try {
                    _o1 = baseDataType.parse(map.get(o1));
                } catch (Exception e) {
                    // Nothing to do
                }
                V _o2 = null;
                try {
                    _o2 = baseDataType.parse(map.get(o2));
                } catch (Exception e) {
                    // Nothing to do
                }
                try {
                    return baseDataType.compare(_o1, _o2);
                } catch (Exception e) {
                    return 0;
                }
            }
        });
    }
}
/**
* Orders the given array by data type.
*
* @param array
* @param type
*/
private void sort(final String[] array, final DataType<?> type) {
    GenericSorting.mergeSort(0, array.length, new IntComparator() {
        @Override
        public int compare(int arg0, int arg1) {
            checkInterrupt();
            try {
                String s1 = array[arg0];
                String s2 = array[arg1];
                // The suppression string always sorts last.
                // NOTE(review): relies on reference equality with DataType.ANY_VALUE,
                // which assumes the suppression string instance is shared -- confirm
                return (s1 == DataType.ANY_VALUE && s2 == DataType.ANY_VALUE) ? 0
                        : (s1 == DataType.ANY_VALUE ? +1
                        : (s2 == DataType.ANY_VALUE ? -1
                        : type.compare(s1, s2)));
            } catch (
            IllegalArgumentException
            | ParseException e) {
                throw new RuntimeException("Some values seem to not conform to the data type", e);
            }
        }
    }, new Swapper() {
        @Override
        public void swap(int arg0, int arg1) {
            String temp = array[arg0];
            array[arg0] = array[arg1];
            array[arg1] = temp;
        }
    });
}
/**
* Orders the given array by the given sort order.
*
* @param array
* @param order
*/
private void sort(final String[] array, final Map<String, Integer> order) {
    GenericSorting.mergeSort(0, array.length, new IntComparator() {
        @Override
        public int compare(int arg0, int arg1) {
            checkInterrupt();
            // Look up each value's rank in the precomputed order
            Integer order1 = order.get(array[arg0]);
            Integer order2 = order.get(array[arg1]);
            if (order1 == null || order2 == null) {
                // A missing rank means the hierarchy does not cover this value
                String message = "The hierarchy seems to not cover all data values";
                message += order1 == null ? " (unknown = "+array[arg0]+")" : "";
                message += order2 == null ? " (unknown = "+array[arg1]+")" : "";
                throw new RuntimeException(message);
            } else {
                return order1.compareTo(order2);
            }
        }
    }, new Swapper() {
        @Override
        public void swap(int arg0, int arg1) {
            String temp = array[arg0];
            array[arg0] = array[arg1];
            array[arg1] = temp;
        }
    });
}
/**
* Used for building summary statistics
* @param type
* @param value
* @param isPeriod Defines whether the parameter is a time period
* @param isSquare Defines whether the period is a squared period
* @return
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
private String toString(DataType<?> type, double value, boolean isPeriod, boolean isSquare) {
    // Handle corner cases
    if (Double.isNaN(value)) {
        return "Not available";
    } else if (Double.isInfinite(value)) {
        if (value < 0) {
            return "-Infinity";
        } else {
            return "+Infinity";
        }
    }
    // Handle periods: decompose a millisecond duration into weeks/days/h/m/s/ms
    if (isPeriod) {
        // Init
        long SECONDS = 1000;
        long MINUTES = 60 * SECONDS;
        long HOURS = 60 * MINUTES;
        long DAYS = 24 * HOURS;
        long WEEKS = 7 * DAYS;
        // Square: variances of periods are in squared milliseconds, so the
        // unit conversion factors must be squared as well
        if (isSquare) {
            SECONDS *= SECONDS;
            MINUTES *= MINUTES;
            HOURS *= HOURS;
            DAYS *= DAYS;
            WEEKS *= WEEKS;
        }
        // Compute: repeated division/remainder from the largest unit downwards.
        // NOTE(review): the int casts may overflow for very large (squared)
        // periods -- confirm the expected value range
        final int weeks = (int) (value / WEEKS);
        value = value % WEEKS;
        final int days = (int) (value / DAYS);
        value = value % DAYS;
        final int hours = (int) (value / HOURS);
        value = value % HOURS;
        final int minutes = (int) (value / MINUTES);
        value = value % MINUTES;
        final int seconds = (int) (value / SECONDS);
        value = value % SECONDS;
        final int milliseconds = (int) (value);
        // Convert: omit zero-valued units, always print milliseconds
        StringBuilder builder = new StringBuilder();
        if (weeks != 0) builder.append(weeks).append(isSquare ? "w^2, " : "w, ");
        if (days != 0) builder.append(days).append(isSquare ? "d^2, " : "d, ");
        if (hours != 0) builder.append(hours).append(isSquare ? "h^2, " : "h, ");
        if (minutes != 0) builder.append(minutes).append(isSquare ? "m^2, " : "m, ");
        if (seconds != 0) builder.append(seconds).append(isSquare ? "s^2, " : "s, ");
        builder.append(milliseconds).append(isSquare ? "ms^2" : "ms");
        // Return
        return builder.toString();
    }
    // Handle data types: let ratio-scale types format the value themselves
    if (type instanceof DataTypeWithRatioScale) {
        DataTypeWithRatioScale rType = (DataTypeWithRatioScale) type;
        return rType.format(rType.fromDouble(value));
    } else {
        return String.valueOf(value);
    }
}
/**
* Used for building summary statistics
* @param type
* @param value
* @return
*/
@SuppressWarnings("unchecked")
private <T> T toValue(DataType<T> type, double value) {
    // NaN and infinite values have no typed representation
    if (Double.isNaN(value) || Double.isInfinite(value)) {
        return null;
    }
    // Convert according to the type's wrapped class; fall back to a string
    Class<?> wrapped = type.getDescription().getWrappedClass();
    if (wrapped == Long.class) {
        return (T) Long.valueOf((long) value);
    }
    if (wrapped == Double.class) {
        return (T) Double.valueOf(value);
    }
    if (wrapped == Date.class) {
        return (T) new Date((long) value);
    }
    return (T) String.valueOf(value);
}
/**
* Stops all computations. May lead to exceptions being thrown. Use with care.
*/
void interrupt() {
    // Raising the shared flag makes running computations throw ComputationInterruptedException
    this.interrupt.value = true;
}
/**
* Returns progress data, if available
*
* @return
*/
int getProgress() {
    // Only classification tasks receive the shared progress counter (see
    // getClassificationPerformance); other computations leave it untouched
    return this.progress.value;
}
}