/* * ARX: Powerful Data Anonymization * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.deidentifier.arx.aggregates; import java.text.ParseException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.List; import java.util.Set; import org.deidentifier.arx.DataScale; import org.deidentifier.arx.DataType; import org.deidentifier.arx.DataType.DataTypeWithRatioScale; /** * A base class for summary statistics * @author Fabian Prasser * */ public class StatisticsSummary<T> { /** * Summary statistics for variables with ordinal scale * @author Fabian Prasser * */ static final class StatisticsSummaryOrdinal { /** Var */ private final Comparator<String> comparator; /** Var */ private final List<String> values = new ArrayList<String>(); /** Var */ private String mode; /** Var */ private int distinctNumberOfValues; /** Var */ private String median; /** Var */ private String min; /** Var */ private String max; /** Var */ private int numberOfMeasures; /** Var */ private DataType<?> type; /** * Constructor * @param comparator */ StatisticsSummaryOrdinal(final Comparator<String> comparator) { this.comparator = comparator; this.type = null; } /** * Constructor * @param type */ StatisticsSummaryOrdinal(final DataType<?> type) { this.type = type; this.comparator = new Comparator<String>() { @Override public int compare(String o1, String o2) { try { return type.compare(o1, o2); } catch (NumberFormatException | ParseException e) { throw new RuntimeException(e); } } }; } /** * Adds a value * @param value */ public void addValue(String value) { this.values.add(value); } /** * Clears the data */ public void clear() { this.values.clear(); } /** * Returns a summary * @return */ public String getMax() { return max; } /** * Returns a summary * @return */ public String getMedian() { return median; } /** * Returns a summary * @return */ public String getMin() { return min; } /** * Returns a summary * @return */ public String getMode() { return mode; } /** * Returns the number of distinct values * @return */ public int getDistinctNumberOfValues() { return distinctNumberOfValues; } /** * Returns the number of measurements * @return */ public int getNumberOfMeasures() { return numberOfMeasures; } /** * Returns the index of the next element that does not equal the element at the given index * @param index * @param values * @return */ private int moveWhileEqual(int index, List<String> values) { String element = values.get(index); // We can do == because of dictionary compression while (index < values.size() && values.get(index) == element) { index++; } return index; } /** * Analyzes the data */ <T> void analyze() { Collections.sort(values, comparator); if (values.size() == 0) { min = DataType.NULL_VALUE; max = DataType.NULL_VALUE; mode = DataType.NULL_VALUE; median = DataType.NULL_VALUE; distinctNumberOfValues = 0; numberOfMeasures = 0; } else { // Determine simple things min = values.get(0); max = values.get(values.size() - 1); if (values.size() % 2 == 1) { median = values.get(values.size() / 2); } else if (type != null && type instanceof DataTypeWithRatioScale<?>) { @SuppressWarnings("unchecked") DataType<T> dType = (DataType<T>)type; @SuppressWarnings("unchecked") DataTypeWithRatioScale<T> rType = (DataTypeWithRatioScale<T>)dType; double median1 = rType.toDouble(dType.parse(values.get(values.size() / 2 - 1))); double median2 = rType.toDouble(dType.parse(values.get(values.size() / 2))); median = dType.format(rType.fromDouble((median1 + median2) / 2d)); } else { String median1 = values.get(values.size() / 2 - 1); String median2 = values.get(values.size() / 2); if (median1 == median2) { median = median1; } else { median = DataType.NULL_VALUE; } } numberOfMeasures = values.size(); // determine distinct number of measures Set<String> distinct = new HashSet<String>(); distinct.addAll(values); distinctNumberOfValues = distinct.size(); distinct.clear(); // Determine mode int count = 0; int index = 0; mode = values.get(0); while (index < values.size()) { int nIndex = moveWhileEqual(index, values); int nCount = nIndex - index; if (nCount > count) { mode = values.get(index); count = nCount; } index = nIndex; } } // Clear values.clear(); } } /** The associated scale of measure */ private final DataScale scale; /** The number of measures */ private final int numberOfMeasures; /** The distinct number of measures */ private final int distinctNumberOfValues; /* ******************************************************************** * ARXString, ARXOrderedString, ARXDate, ARXInteger, ARXDecimal **********************************************************************/ /** Mode */ private final String mode; /** Mode */ private final T modeT; /* ******************************************************************** * ARXOrderedString, ARXDate, ARXInteger, ARXDecimal **********************************************************************/ /** Median, may be null */ private final String median; /** Median, may be null */ private final T medianT; /** Min, may be null */ private final String min; /** Min, may be null */ private final T minT; /** Max, may be null */ private final String max; /** Max, may be null */ private final T maxT; /* ******************************************************************** * ARXDate, ARXInteger, ARXDecimal **********************************************************************/ /** Arithmetic mean, may be null */ private final String arithmeticMean; /** Arithmetic mean, may be null */ private final T arithmeticMeanT; /** Arithmetic mean, may be null */ private final double arithmeticMeanD; /** Sample variance, may be null */ private final String sampleVariance; /** Sample variance, may be null */ private final T sampleVarianceT; /** Sample variance, may be null */ private final double sampleVarianceD; /** Population variance, may be null */ private final String populationVariance; /** Population variance, may be null */ private final T populationVarianceT; /** Population variance, may be null */ private final double populationVarianceD; /** Std.dev, may be null */ private final String stdDev; /** Std.dev, may be null */ private final T stdDevT; /** Std.dev, may be null */ private final double stdDevD; /** Range, may be null */ private final String range; /** Range, may be null */ private final T rangeT; /** Range, may be null */ private final double rangeD; /** Kurtosis, may be null */ private final String kurtosis; /** Kurtosis, may be null */ private final T kurtosisT; /** Kurtosis, may be null */ private final double kurtosisD; /* ******************************************************************** * ARXInteger, ARXDecimal ********************************************************************* */ /** Geometric mean, may be null */ private final String geometricMean; /** Geometric mean, may be null */ private final T geometricMeanT; /** Geometric mean, may be null */ private final double geometricMeanD; /** * Constructor for ARXString * @param scale * @param numberOfMeasures * @param mode * @param modeT */ StatisticsSummary(DataScale scale, int numberOfMeasures, int distinctNumberOfValues, String mode, T modeT) { this(scale, numberOfMeasures, distinctNumberOfValues, mode, modeT, null, null, null, null, null, null, null, null, Double.NaN, null, null, Double.NaN, null, null, Double.NaN, null, null, Double.NaN, null, null, Double.NaN, null, null, Double.NaN, null, null, Double.NaN); } /** * Constructor for ARXOrderedString * @param scale * @param numberOfMeasures * @param mode * @param modeT * @param median * @param medianT * @param min * @param minT * @param max * @param maxT */ StatisticsSummary(DataScale scale, int numberOfMeasures, int distinctNumberOfValues, String mode, T modeT, String median, T medianT, String min, T minT, String max, T maxT) { this(scale, numberOfMeasures, distinctNumberOfValues, mode, modeT, median, medianT, min, minT, max, maxT, null, null, Double.NaN, null, null, Double.NaN, null, null, Double.NaN, null, null, Double.NaN, null, null, Double.NaN, null, null, Double.NaN, null, null, Double.NaN); } /** * Constructor for ARXDate * @param scale * @param numberOfMeasures * @param mode * @param modeT * @param median * @param medianT * @param min * @param minT * @param max * @param maxT * @param arithmeticMean * @param arithmeticMeanT * @param arithmeticMeanD * @param sampleVariance * @param sampleVarianceT * @param sampleVarianceD * @param populationVariance * @param populationVarianceT * @param populationVarianceD * @param stdDev * @param stdDevT * @param stdDevD * @param range * @param rangeT * @param rangeD * @param kurtosis * @param kurtosisT * @param kurtosisD */ StatisticsSummary(DataScale scale, int numberOfMeasures, int distinctNumberOfValues, String mode, T modeT, String median, T medianT, String min, T minT, String max, T maxT, String arithmeticMean, T arithmeticMeanT, double arithmeticMeanD, String sampleVariance, T sampleVarianceT, double sampleVarianceD, String populationVariance, T populationVarianceT, double populationVarianceD, String stdDev, T stdDevT, double stdDevD, String range, T rangeT, double rangeD, String kurtosis, T kurtosisT, double kurtosisD) { this(scale, numberOfMeasures, distinctNumberOfValues, mode, modeT, median, medianT, min, minT, max, maxT, arithmeticMean, arithmeticMeanT, arithmeticMeanD, sampleVariance, sampleVarianceT, sampleVarianceD, populationVariance, populationVarianceT, populationVarianceD, stdDev, stdDevT, stdDevD, range, rangeT, rangeD, kurtosis, kurtosisT, kurtosisD, null, null, Double.NaN); } /** * Constructor for ARXInteger and ARXDecimal * @param scale * @param numberOfMeasures * @param mode * @param modeT * @param median * @param medianT * @param min * @param minT * @param max * @param maxT * @param arithmeticMean * @param arithmeticMeanT * @param arithmeticMeanD * @param sampleVariance * @param sampleVarianceT * @param sampleVarianceD * @param populationVariance * @param populationVarianceT * @param populationVarianceD * @param stdDev * @param stdDevT * @param stdDevD * @param range * @param rangeT * @param rangeD * @param kurtosis * @param kurtosisT * @param kurtosisD * @param geometricMean * @param geometricMeanT * @param geometricMeanD */ StatisticsSummary(DataScale scale, int numberOfMeasures, int distinctNumberOfValues, String mode, T modeT, String median, T medianT, String min, T minT, String max, T maxT, String arithmeticMean, T arithmeticMeanT, double arithmeticMeanD, String sampleVariance, T sampleVarianceT, double sampleVarianceD, String populationVariance, T populationVarianceT, double populationVarianceD, String stdDev, T stdDevT, double stdDevD, String range, T rangeT, double rangeD, String kurtosis, T kurtosisT, double kurtosisD, String geometricMean, T geometricMeanT, double geometricMeanD) { this.numberOfMeasures = numberOfMeasures; this.scale = scale; this.mode = mode; this.modeT = modeT; this.distinctNumberOfValues = distinctNumberOfValues; this.median = median; this.medianT = medianT; this.min = min; this.minT = minT; this.max = max; this.maxT = maxT; this.arithmeticMean = arithmeticMean; this.arithmeticMeanT = arithmeticMeanT; this.arithmeticMeanD = arithmeticMeanD; this.sampleVariance = sampleVariance; this.sampleVarianceT = sampleVarianceT; this.sampleVarianceD = sampleVarianceD; this.populationVariance = populationVariance; this.populationVarianceT = populationVarianceT; this.populationVarianceD = populationVarianceD; this.range = range; this.rangeT = rangeT; this.rangeD = rangeD; this.kurtosis = kurtosis; this.kurtosisT = kurtosisT; this.kurtosisD = kurtosisD; this.geometricMean = geometricMean; this.geometricMeanT = geometricMeanT; this.geometricMeanD = geometricMeanD; this.stdDev = stdDev; this.stdDevT = stdDevT; this.stdDevD = stdDevD; } /** * Returns the mean * @return */ public double getArithmeticMeanAsDouble() { return arithmeticMeanD; } /** * Returns the mean * @return */ public String getArithmeticMeanAsString() { return arithmeticMean; } /** * Returns the mean * @return */ public T getArithmeticMeanAsValue() { return arithmeticMeanT; } /** * Returns the geometric mean * @return */ public double getGeometricMeanAsDouble() { return geometricMeanD; } /** * Returns the geometric mean * @return */ public String getGeometricMeanAsString() { return geometricMean; } /** * Returns the geometric mean * @return */ public T getGeometricMeanAsValue() { return geometricMeanT; } /** * Returns the kurtosis * @return */ public double getKurtosisAsDouble() { return kurtosisD; } /** * Returns the kurtosis * @return */ public String getKurtosisAsString() { return kurtosis; } /** * Returns the kurtosis * @return */ public T getKurtosisAsValue() { return kurtosisT; } /** * Returns the max * @return */ public String getMaxAsString() { return max; } /** * Returns the max * @return */ public T getMaxAsValue() { return maxT; } /** * Returns the median * @return */ public String getMedianAsString() { return median; } /** * Returns the median * @return */ public T getMedianAsValue() { return medianT; } /** * Returns the min * @return */ public String getMinAsString() { return min; } /** * Returns the min * @return */ public T getMinAsValue() { return minT; } /** * Returns the mode * @return */ public String getModeAsString() { return mode; } /** * Returns the mode * @return */ public T getModeAsValue() { return modeT; } /** * Returns the number of distinct values * @return */ public int getNumberOfDistinctValuesAsInt() { return distinctNumberOfValues; } /** * Returns the number of distinct values * @return */ public String getNumberOfDistinctValuesAsString() { return String.valueOf(distinctNumberOfValues); } /** * Returns the number of measures * @return */ public int getNumberOfMeasuresAsString() { return numberOfMeasures; } /** * Returns the population variance * @return */ public double getPopulationVarianceAsDouble() { return populationVarianceD; } /** * Returns the population variance * @return */ public String getPopulationVarianceAsString() { return populationVariance; } /** * Returns the population variance * @return */ public T getPopulationVarianceAsValue() { return populationVarianceT; } /** * Returns the range * @return */ public double getRangeAsDouble() { return rangeD; } /** * Returns the range * @return */ public String getRangeAsString() { return range; } /** * Returns the range * @return */ public T getRangeAsValue() { return rangeT; } /** * Returns the sample variance * @return */ public double getSampleVarianceAsDouble() { return sampleVarianceD; } /** * Returns the sample variance * @return */ public String getSampleVarianceAsString() { return sampleVariance; } /** * Returns the sample variance * @return */ public T getSampleVarianceAsValue() { return sampleVarianceT; } /** * Returns the scale of measure * @return */ public DataScale getScale() { return scale; } /** * Returns the standard deviation * @return */ public double getStdDevAsDouble() { return stdDevD; } /** * Returns the standard deviation * @return */ public String getStdDevAsString() { return stdDev; } /** * Returns the standard deviation * @return */ public T getStdDevAsValue() { return stdDevT; } /** * Returns whether the following measure is available: mean * @return */ public boolean isArithmeticMeanAvailable() { return null != arithmeticMean; } /** * Returns whether the following measure is available: geometric mean * @return */ public boolean isGeometricMeanAvailable() { return null != geometricMean; } /** * Returns whether the following measure is available: kurtosis * @return */ public boolean isKurtosisAvailable() { return null != kurtosis; } /** * Returns whether the following measure is available: max * @return */ public boolean isMaxAvailable() { return null != max; } /** * Returns whether the following measure is available: median * @return */ public boolean isMedianAvailable() { return null != median; } /** * Returns whether the following measure is available: min * @return */ public boolean isMinAvailable() { return null != min; } /** * Returns whether the following measure is available: mode * @return */ public boolean isModeAvailable() { return null != mode; } /** * Returns whether the following measure is available: population variance * @return */ public boolean isPopulationVarianceAvailable() { return null != populationVariance; } /** * Returns whether the following measure is available: range * @return */ public boolean isRangeAvailable() { return null != range; } /** * Returns whether the following measure is available: sample variance * @return */ public boolean isSampleVarianceAvailable() { return null != sampleVariance; } /** * Returns whether the following measure is available: std. dev * @return */ public boolean isStdDevAvailable() { return null != stdDev; } @Override public String toString() { return "StatisticsSummary [\n" + " - scale=" + scale + "\n" + " - numberOfMeasures=" + numberOfMeasures + "\n" + " - distinctNumberOfValues=" + distinctNumberOfValues + "\n" + (isModeAvailable() ? " - mode=" + mode + "\n" : "") + (isMedianAvailable() ? " - median=" + median + "\n" : "") + (isMinAvailable() ? " - min=" + min + "\n" : "") + (isMaxAvailable() ? " - max=" + max + "\n" : "") + (isArithmeticMeanAvailable() ? " - arithmeticMean=" + arithmeticMean + "\n" : "") + (isSampleVarianceAvailable() ? " - sampleVariance=" + sampleVariance + "\n" : "") + (isPopulationVarianceAvailable() ? " - populationVariance=" + populationVariance + "\n" : "") + (isRangeAvailable() ? " - range=" + range + "\n" : "") + (isKurtosisAvailable() ? " - kurtosis=" + kurtosis + "\n" : "") + (isGeometricMeanAvailable() ? " - geometricMean=" + geometricMean + "\n" : "") + "]"; } }