/*
 * ARX: Powerful Data Anonymization
 * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.deidentifier.arx.metric.v2;

import java.util.Arrays;

import org.deidentifier.arx.ARXConfiguration;
import org.deidentifier.arx.DataDefinition;
import org.deidentifier.arx.RowSet;
import org.deidentifier.arx.certificate.elements.ElementData;
import org.deidentifier.arx.framework.check.groupify.HashGroupify;
import org.deidentifier.arx.framework.check.groupify.HashGroupifyEntry;
import org.deidentifier.arx.framework.data.Data;
import org.deidentifier.arx.framework.data.DataManager;
import org.deidentifier.arx.framework.data.GeneralizationHierarchy;
import org.deidentifier.arx.framework.lattice.Transformation;
import org.deidentifier.arx.metric.Metric.AggregateFunction;
import org.deidentifier.arx.metric.MetricConfiguration;

/**
 * This class provides an efficient implementation of the non-uniform entropy
 * metric. It avoids a cell-by-cell computation by utilizing a three-dimensional
 * array that maps identifiers to their frequency for all quasi-identifiers and
 * generalization levels. It further reduces the overhead induced by subsequent
 * calls by caching the results for previous columns and generalization levels.
 * TODO: Add reference
 *
 * @author Fabian Prasser
 * @author Florian Kohlmayer
 */
public class MetricMDNUEntropyPrecomputed extends AbstractMetricMultiDimensional {

    /** SVUID. */
    private static final long   serialVersionUID = 8053878428909814308L;

    /** Sentinel for values that are not yet available in the cache. */
    private static final double NOT_AVAILABLE    = Double.POSITIVE_INFINITY;

    /** Log 2. */
    private static final double LOG2             = Math.log(2);

    /**
     * Computes the logarithm to base 2.
     *
     * @param num the argument
     * @return the binary logarithm of num
     */
    static final double log2(final double num) {
        return Math.log(num) / LOG2;
    }

    /** Cardinalities. */
    private Cardinalities cardinalities;

    /** Column -> Level -> Value. */
    private double[][]    cache;

    /** Column -> Id -> Level -> Output. */
    private int[][][]     hierarchies;

    /** Number of rows. */
    private double        rows;

    /**
     * Creates a new, precomputed instance.
     *
     * @param monotonicWithGeneralization
     * @param monotonicWithSuppression
     * @param independent
     * @param gsFactor
     * @param function
     */
    public MetricMDNUEntropyPrecomputed(boolean monotonicWithGeneralization,
                                        boolean monotonicWithSuppression,
                                        boolean independent,
                                        double gsFactor,
                                        AggregateFunction function) {
        super(monotonicWithGeneralization, monotonicWithSuppression, independent, gsFactor, function);
    }

    /**
     * Creates a new instance.
     */
    protected MetricMDNUEntropyPrecomputed() {
        super(true, true, true, 0.5d, AggregateFunction.SUM);
    }

    /**
     * Creates a new instance.
     *
     * @param gsFactor
     * @param function
     */
    protected MetricMDNUEntropyPrecomputed(double gsFactor, AggregateFunction function) {
        super(true, true, true, gsFactor, function);
    }
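    /*
     * Note on the precomputation (illustrative sketch, not part of the API):
     * for a column c, hierarchies[c][in][level] yields the generalized output
     * of the input value "in" at the given level, and the cardinalities object
     * counts how often each (value, level) pair occurs in the data. Both
     * frequencies needed for an entropy term can thus be looked up directly:
     *
     *   int[][] cardinality = cardinalities.getCardinalities()[c];
     *   int     out = hierarchies[c][in][level];  // generalized output value
     *   double  a   = cardinality[in][0];         // frequency of the input value
     *   double  b   = cardinality[out][level];    // frequency of its generalization
     *   double  t   = a * log2(a / b);            // contribution to the entropy
     *
     * No pass over the individual cells of the data is required.
     */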
    /**
     * Returns the configuration of this metric.
     *
     * @return
     */
    public MetricConfiguration getConfiguration() {
        return new MetricConfiguration(true,                                       // monotonic
                                       super.getGeneralizationSuppressionFactor(), // gs-factor
                                       true,                                       // precomputed
                                       1.0d,                                       // precomputation threshold
                                       this.getAggregateFunction());               // aggregate function
    }

    @Override
    public boolean isGSFactorSupported() {
        return true;
    }

    @Override
    public boolean isPrecomputed() {
        return true;
    }

    @Override
    public ElementData render(ARXConfiguration config) {
        ElementData result = new ElementData("Non-uniform entropy");
        result.addProperty("Aggregate function", super.getAggregateFunction().toString());
        result.addProperty("Monotonic", this.isMonotonic(config.getMaxOutliers()));
        result.addProperty("Generalization factor", this.getGeneralizationFactor());
        result.addProperty("Suppression factor", this.getSuppressionFactor());
        return result;
    }

    @Override
    public String toString() {
        return "Non-uniform entropy";
    }

    @Override
    protected ILMultiDimensionalWithBound getInformationLossInternal(final Transformation node,
                                                                     final HashGroupify g) {

        double[] result = getInformationLossInternalRaw(node, g);

        // Switch sign bit and round
        for (int column = 0; column < hierarchies.length; column++) {
            result[column] = round(result[column] == 0.0d ? result[column] : -result[column]);
        }

        // Return
        return new ILMultiDimensionalWithBound(super.createInformationLoss(result),
                                               super.createInformationLoss(result));
    }

    @Override
    protected ILMultiDimensionalWithBound getInformationLossInternal(Transformation node,
                                                                     HashGroupifyEntry entry) {
        double[] result = new double[getDimensions()];
        Arrays.fill(result, entry.count);
        return new ILMultiDimensionalWithBound(super.createInformationLoss(result));
    }
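    /*
     * Worked example for the method below (hypothetical data): assume a
     * single column with the values {A, A, B, C}, all of which generalize
     * to "*" at level 1. For A: a = 2, b = 4, contributing 2 * log2(2/4) = -2.
     * B and C each contribute 1 * log2(1/4) = -2, so the raw value is -6.
     * getInformationLossInternal() then switches the sign, yielding a
     * non-uniform entropy of 6 for this transformation.
     */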
    /**
     * Computes the raw, i.e. not yet negated or rounded, information loss.
     *
     * @param node
     * @param g
     * @return the raw entropy value per column
     */
    protected double[] getInformationLossInternalRaw(final Transformation node,
                                                     final HashGroupify g) {

        // Prepare
        int[][][] cardinalities = this.cardinalities.getCardinalities();
        double[] result = new double[hierarchies.length];
        double gFactor = super.getGeneralizationFactor();

        // For each column
        for (int column = 0; column < hierarchies.length; column++) {

            // Check for cached value
            final int transformation = node.getGeneralization()[column];
            double value = cache[column][transformation];
            if (value == NOT_AVAILABLE) {
                value = 0d;
                final int[][] cardinality = cardinalities[column];
                final int[][] hierarchy = hierarchies[column];
                for (int in = 0; in < hierarchy.length; in++) {
                    final int out = hierarchy[in][transformation];
                    final double a = cardinality[in][0];
                    final double b = cardinality[out][transformation];
                    if (a != 0d) {
                        value += a * log2(a / b);
                    }
                }
                cache[column][transformation] = value;
            }
            result[column] = value * gFactor;
        }

        return result;
    }

    @Override
    protected AbstractILMultiDimensional getLowerBoundInternal(Transformation node) {
        return this.getInformationLossInternal(node, (HashGroupify) null).getLowerBound();
    }

    @Override
    protected AbstractILMultiDimensional getLowerBoundInternal(Transformation node,
                                                               HashGroupify groupify) {
        return this.getLowerBoundInternal(node);
    }

    /**
     * Returns the upper bound of the entropy value per column.
     *
     * @return the per-column upper bounds
     */
    protected double[] getUpperBounds() {

        // Prepare
        int[][][] cardinalities = this.cardinalities.getCardinalities();
        double[] result = new double[hierarchies.length];
        double gFactor = super.getGeneralizationFactor();

        // For each column
        for (int column = 0; column < hierarchies.length; column++) {

            // Compute entropy
            double value = 0d;
            final int[][] cardinality = cardinalities[column];
            final int[][] hierarchy = hierarchies[column];
            for (int in = 0; in < hierarchy.length; in++) {
                final double a = cardinality[in][0];
                if (a != 0d) {
                    value += a * log2(a / rows);
                }
            }
            result[column] = value * gFactor;
        }

        // Switch sign bit and round
        for (int column = 0; column < hierarchies.length; column++) {
            result[column] = round(result[column] == 0.0d ? result[column] : -result[column]);
        }

        return result;
    }

    /**
     * For backwards compatibility.
     *
     * @param cache
     * @param cardinalities
     * @param hierarchies
     */
    protected void initialize(double[][] cache, int[][][] cardinalities, int[][][] hierarchies) {

        // Initialize data structures
        this.cache = cache;
        this.hierarchies = hierarchies;
        this.cardinalities = new Cardinalities(cardinalities);

        // Initialize weights
        super.initialize(hierarchies.length);

        // Compute a reasonable minimum
        double[] min = new double[hierarchies.length];
        Arrays.fill(min, 0d);

        // It is difficult to compute a reasonable maximum in this case
        double[] max = new double[hierarchies.length];
        Arrays.fill(max, Double.MAX_VALUE / hierarchies.length);

        super.setMax(max);
        super.setMin(min);
    }

    @Override
    protected void initializeInternal(final DataManager manager,
                                      final DataDefinition definition,
                                      final Data input,
                                      final GeneralizationHierarchy[] hierarchies,
                                      final ARXConfiguration config) {

        super.initializeInternal(manager, definition, input, hierarchies, config);

        // Obtain subset
        RowSet subset = super.getSubset(config);

        // Cardinalities
        this.cardinalities = new Cardinalities(input, subset, hierarchies);
        this.rows = input.getDataLength();
        double gFactor = super.getGeneralizationFactor();
        double sFactor = super.getSuppressionFactor();

        // Create a cache for the results
        this.cache = new double[hierarchies.length][];
        for (int i = 0; i < cache.length; i++) {
            cache[i] = new double[hierarchies[i].getArray()[0].length];
            Arrays.fill(cache[i], NOT_AVAILABLE);
        }

        // Create references to the hierarchies
        final int[][] data = input.getArray();
        this.hierarchies = new int[data[0].length][][];
        for (int i = 0; i < hierarchies.length; i++) {
            this.hierarchies[i] = hierarchies[i].getArray();
        }

        // Compute a reasonable min & max
        double[] min = new double[hierarchies.length];
        Arrays.fill(min, 0d);

        double[] max = new double[hierarchies.length];
        for (int i = 0; i < max.length; i++) {
            max[i] = (input.getDataLength() * log2(input.getDataLength())) * Math.max(gFactor, sFactor);
        }

        super.setMax(max);
        super.setMin(min);
    }
}
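/*
 * Usage sketch (hypothetical; within ARX, metric instances are usually
 * obtained via the factory methods of org.deidentifier.arx.metric.Metric
 * rather than constructed directly):
 *
 *   MetricMDNUEntropyPrecomputed metric =
 *       new MetricMDNUEntropyPrecomputed(true,                   // monotonic with generalization
 *                                        true,                   // monotonic with suppression
 *                                        true,                   // independent
 *                                        0.5d,                   // gs-factor
 *                                        AggregateFunction.SUM); // aggregate function
 *
 * A gs-factor of 0.5 matches the default constructor above; per-column
 * entropies are combined with the given aggregate function.
 */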