/*
 * ARX: Powerful Data Anonymization
 * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.deidentifier.arx.metric;

import java.util.Arrays;

import org.deidentifier.arx.ARXConfiguration;
import org.deidentifier.arx.DataDefinition;
import org.deidentifier.arx.RowSet;
import org.deidentifier.arx.certificate.elements.ElementData;
import org.deidentifier.arx.framework.check.groupify.HashGroupify;
import org.deidentifier.arx.framework.check.groupify.HashGroupifyEntry;
import org.deidentifier.arx.framework.data.Data;
import org.deidentifier.arx.framework.data.DataManager;
import org.deidentifier.arx.framework.data.Dictionary;
import org.deidentifier.arx.framework.data.GeneralizationHierarchy;
import org.deidentifier.arx.framework.lattice.Transformation;

/**
 * This class provides an efficient implementation of the non-uniform entropy
 * metric. It avoids a cell-by-cell process by utilizing a three-dimensional
 * array that maps identifiers to their frequency for all quasi-identifiers and
 * generalization levels. It further reduces the overhead induced by subsequent
 * calls by caching the results for previous columns and generalization levels.
 *
 * @author Fabian Prasser
 * @author Florian Kohlmayer
 */
public class MetricEntropy extends MetricDefault {

    /** Value unknown. */
    private static final double NA               = Double.POSITIVE_INFINITY;

    /** SVUID. */
    private static final long   serialVersionUID = -8618697919821588987L;

    /** Log 2. */
    static final double         log2             = Math.log(2);

    /**
     * Computes log 2.
     *
     * @param num
     * @return
     */
    static final double log2(final double num) {
        return Math.log(num) / log2;
    }

    /** Column -> Level -> Value. */
    private double[][] cache;

    /** Column -> Id -> Level -> Count. */
    private int[][][]  cardinalities;

    /** Column -> Id -> Level -> Output. */
    private int[][][]  hierarchies;
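    // Layout note (a hedged reading of the structures above, with hypothetical
    // numbers): hierarchies[column][id][level] yields the dictionary id of the
    // value to which input value "id" is generalized at the given level, and
    // cardinalities[column][id][level] counts how often that (possibly
    // generalized) value occurs among the considered rows. For example, if the
    // value with id 3 ("34") occurs twice at level 0 and is mapped to id 7
    // ("30-39") at level 1, then hierarchies[c][3][1] == 7 and those two
    // occurrences are included in cardinalities[c][7][1]. Client code would
    // typically select this model through the public factory, presumably
    // Metric.createEntropyMetric(), rather than instantiating this class.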
    /**
     * Creates a new instance.
     */
    protected MetricEntropy() {
        super(true, true, true);
    }

    /**
     * Creates a new instance.
     *
     * @param monotonicWithGeneralization
     * @param monotonicWithSuppression
     * @param independent
     */
    protected MetricEntropy(final boolean monotonicWithGeneralization, final boolean monotonicWithSuppression, final boolean independent) {
        super(monotonicWithGeneralization, monotonicWithSuppression, independent);
    }

    @Override
    public ElementData render(ARXConfiguration config) {
        ElementData result = new ElementData("Non-uniform entropy");
        result.addProperty("Monotonic", this.isMonotonic(config.getMaxOutliers()));
        return result;
    }

    @Override
    public String toString() {
        return "Monotonic Non-Uniform Entropy";
    }

    /**
     * @return the cache
     */
    protected double[][] getCache() {
        return cache;
    }

    /**
     * @return the cardinalities
     */
    protected int[][][] getCardinalities() {
        return cardinalities;
    }

    /**
     * @return the hierarchies
     */
    protected int[][][] getHierarchies() {
        return hierarchies;
    }
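    // Computation note (a sketch of the metric as implemented below, with a
    // hypothetical example): per column, the cached value accumulates
    // a * log2(a / b) over all input values, where a is the frequency of the
    // input value at level 0 and b the frequency of its generalization at the
    // current level. The sum is negated at the end, so more generalization
    // yields a higher (worse) information loss. E.g., if "34" occurs a = 2
    // times and its generalization "30-39" covers b = 10 records, the
    // contribution is 2 * log2(2 / 10) ~= -4.64, i.e. about 4.64 after
    // negation.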
    @Override
    protected InformationLossWithBound<InformationLossDefault> getInformationLossInternal(final Transformation node, final HashGroupify g) {

        if (node.getLowerBound() != null) {
            return new InformationLossWithBound<InformationLossDefault>((InformationLossDefault) node.getLowerBound(),
                                                                        (InformationLossDefault) node.getLowerBound());
        }

        // Init
        double result = 0;

        // For each column
        for (int column = 0; column < hierarchies.length; column++) {

            // Check for cached value
            final int state = node.getGeneralization()[column];
            double value = cache[column][state];
            if (value == NA) {
                value = 0d;
                final int[][] cardinality = cardinalities[column];
                final int[][] hierarchy = hierarchies[column];
                for (int in = 0; in < hierarchy.length; in++) {
                    final int out = hierarchy[in][state];
                    final double a = cardinality[in][0];
                    final double b = cardinality[out][state];
                    if (a != 0d) {
                        value += a * log2(a / b);
                    }
                }
                cache[column][state] = value;
            }
            result += value;
        }
        result = round(result == 0.0d ? result : -result);
        return new InformationLossDefaultWithBound(result, result);
    }

    @Override
    protected InformationLossWithBound<InformationLossDefault> getInformationLossInternal(Transformation node, HashGroupifyEntry entry) {
        return new InformationLossDefaultWithBound(entry.count, entry.count);
    }

    @Override
    protected InformationLossDefault getLowerBoundInternal(Transformation node) {
        return getInformationLossInternal(node, (HashGroupify) null).getLowerBound();
    }

    @Override
    protected InformationLossDefault getLowerBoundInternal(Transformation node, HashGroupify groupify) {
        return getLowerBoundInternal(node);
    }

    @Override
    protected void initializeInternal(final DataManager manager,
                                      final DataDefinition definition,
                                      final Data input,
                                      final GeneralizationHierarchy[] ahierarchies,
                                      final ARXConfiguration config) {

        // Obtain dictionary
        final Dictionary dictionary = input.getDictionary();

        // Obtain research subset
        RowSet rSubset = super.getSubset(config);

        // Create reference to the hierarchies
        final int[][] data = input.getArray();
        hierarchies = new int[data[0].length][][];
        for (int i = 0; i < ahierarchies.length; i++) {
            hierarchies[i] = ahierarchies[i].getArray(); // Column -> Id -> Level -> Output
        }

        // Initialize counts
        cardinalities = new int[data[0].length][][];
        for (int i = 0; i < cardinalities.length; i++) {
            cardinalities[i] = new int[dictionary.getMapping()[i].length][ahierarchies[i].getArray()[0].length]; // Column -> Id -> Level -> Count
        }

        // Count the frequencies of the input values at level 0,
        // only using the rows contained in the research subset
        for (int i = 0; i < data.length; i++) {
            if (rSubset == null || rSubset.contains(i)) {
                final int[] row = data[i];
                for (int column = 0; column < row.length; column++) {
                    cardinalities[column][row[column]][0]++;
                }
            }
        }

        // Create counts for other levels: the frequency of a generalized value
        // at a given level is the sum of the level-0 frequencies of all input
        // values that are mapped to it
        for (int column = 0; column < hierarchies.length; column++) {
            final int[][] hierarchy = hierarchies[column];
            for (int in = 0; in < hierarchy.length; in++) {
                final int cardinality = cardinalities[column][in][0];
                for (int level = 1; level < hierarchy[in].length; level++) {
                    final int out = hierarchy[in][level];
                    cardinalities[column][out][level] += cardinality;
                }
            }
        }

        // Create a cache for the results
        cache = new double[hierarchies.length][];
        for (int i = 0; i < cache.length; i++) {
            cache[i] = new double[ahierarchies[i].getArray()[0].length];
            Arrays.fill(cache[i], NA);
        }
    }
}