/* * ARX: Powerful Data Anonymization * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.deidentifier.arx.criteria; import org.apache.commons.math3.distribution.PoissonDistribution; import org.deidentifier.arx.ARXConfiguration; import org.deidentifier.arx.ARXPopulationModel; import org.deidentifier.arx.DataSubset; import org.deidentifier.arx.certificate.elements.ElementData; import org.deidentifier.arx.framework.check.groupify.HashGroupifyEntry; import org.deidentifier.arx.framework.data.DataManager; import org.deidentifier.arx.framework.lattice.Transformation; /** * This class implements the k-map privacy model as proposed by Latanya Sweeney.<br> * <br> * As an alternative to explicitly providing data about the underlying population, cell sizes can be can be estimated with * the D3 (Poisson) and D4 (zero-truncated Poisson) estimators proposed in:<br> * K. El Emam and F. Dankar, "Protecting privacy using k-anonymity" JAMIA, vol. 15, no. 5, pp. 627-637, 2008.<br> * <br> * The estimator D3 was first published in:<br> * J. Pannekoek, "Statistical methods for some simple disclosure limitation rules," Statistica Neerlandica, vol. 53, no. 1, pp. 55-67, 1999. * * @author Fabian Prasser * @author Florian Kohlmayer */ public class KMap extends ImplicitPrivacyCriterion { /** * Estimators for cell sizes in the population. * * @author Florian Kohlmayer * @author Fabian Prasser */ public enum CellSizeEstimator { /** Poisson distribution */ POISSON("Poisson"), /** Truncate-at-zero Poisson distribution */ ZERO_TRUNCATED_POISSON("Zero-truncated Poisson"); /** Label */ private String label; /** Creates a new instance */ CellSizeEstimator(String label) { this.label = label; } @Override public String toString() { return this.label; } } /** SVUID */ private static final long serialVersionUID = -6966985761538810077L; /** K */ private final int k; /** A compressed representation of the research subset. */ private DataSubset subset; /** The parameter k'. */ private int derivedK = -1; /** The significance level */ private final double significanceLevel; /** The population model */ private final ARXPopulationModel populationModel; /** The selected estimator */ private final CellSizeEstimator estimator; /** The actual type I error. */ private double type1Error; /** * Creates a new instance of the k-map criterion as proposed by Latanya Sweeney * @param k * @param subset Research subset */ public KMap(int k, DataSubset subset) { this(k, 0d, null, null, subset); } /** * Creates a new instance of the criterion using thr Poisson estimator proposed by Pannekoek. */ public KMap(int k, double significanceLevel, ARXPopulationModel populationModel) { this(k, significanceLevel, populationModel, CellSizeEstimator.POISSON, null); } /** * Creates a new instance of the criterion using the Poisson estimator proposed by Pannekoek or by El Emam. */ public KMap(int k, double significanceLevel, ARXPopulationModel populationModel, CellSizeEstimator estimator) { this(k, significanceLevel, populationModel, estimator, null); } /** * Internal constructor. * @param k * @param significanceLevel * @param populationModel * @param estimator */ private KMap(int k, double significanceLevel, ARXPopulationModel populationModel, CellSizeEstimator estimator, DataSubset subset) { super(true, true); this.k = k; this.populationModel = populationModel; this.subset = subset; if ((significanceLevel < 0) || (significanceLevel > 1d)) { throw new IllegalArgumentException("Significance level has to be between 0 and 1."); } this.significanceLevel = significanceLevel; this.estimator = estimator; if ((estimator == null) && (this.subset == null)) { throw new IllegalArgumentException("If no estimator is defined a subset has to be provided."); } } @Override public KMap clone() { return new KMap(getK(), getSignificanceLevel(), ((getPopulationModel() == null) ? null : getPopulationModel().clone()), getEstimator(), ((getDataSubset() == null) ? null : getDataSubset().clone())); } @Override public PrivacyCriterion clone(DataSubset subset) { if (!isLocalRecodingSupported()) { throw new UnsupportedOperationException("Local recoding is not supported by this model"); } // We replace estimated k-map with an according instance of k-anonymity. // This avoids the re-calculation of k' return new KAnonymity(this.getDerivedK()); } @Override public DataSubset getDataSubset() { return this.subset; } /** * Returns the derived parameter k. * * @return */ public int getDerivedK() { return this.derivedK; } /** * Returns the specified estimator. * @return */ public CellSizeEstimator getEstimator() { return this.estimator; } /** * Returns k. * * @return */ public int getK() { return this.k; } @Override public int getMinimalClassSize() { if (!isAccurate()) { return this.derivedK; } else { return 0; } } @Override public ARXPopulationModel getPopulationModel() { return this.populationModel; } @Override public int getRequirements() { if (this.estimator == null) { // Requires two counters return ARXConfiguration.REQUIREMENT_COUNTER | ARXConfiguration.REQUIREMENT_SECONDARY_COUNTER; } else { // Requires only one counter return ARXConfiguration.REQUIREMENT_COUNTER; } } /** * Return journalist risk threshold, 1 if there is none * @return */ public double getRiskThresholdJournalist() { return 1d / (double)k; } /** * Return marketer risk threshold, 1 if there is none * @return */ public double getRiskThresholdMarketer() { return getRiskThresholdJournalist(); } /** * Return journalist risk threshold, 1 if there is none * @return */ public double getRiskThresholdProsecutor() { if (isAccurate() || derivedK == -1) { return 1d; } else { return 1d / (double)derivedK; } } /** * Returns the specified significance level. * @return */ public double getSignificanceLevel() { return this.significanceLevel; } /** * Returns the calculated type I error. * @return */ public double getType1Error() { return this.type1Error; } @Override @SuppressWarnings("deprecation") public void initialize(DataManager manager, ARXConfiguration config) { super.initialize(manager, config); // TODO: Needed for backwards compatibility of ARX 3.4.0 with previous versions if (this.populationModel != null) { this.populationModel.makeBackwardsCompatible(manager.getDataGeneralized().getDataLength()); } if (this.estimator != null) { // TODO: consider subset/inclusion double samplingFraction = (double)manager.getDataGeneralized().getDataLength() / (double)this.populationModel.getPopulationSize(); // Derive k switch (this.estimator) { case POISSON: this.derivedK = calculateKPoisson(samplingFraction * (double)this.k); break; case ZERO_TRUNCATED_POISSON: this.derivedK = calculateKZeroPoisson(samplingFraction * (double)this.k); break; default: throw new IllegalArgumentException("Unknown estimator: " + this.estimator); } } // Check bounds if (this.derivedK > manager.getDataGeneralized().getDataLength()) { this.derivedK = manager.getDataGeneralized().getDataLength(); } this.derivedK = Math.min(this.k, this.derivedK); } /** * Return true if the population has been modeled explicitly. * This implies that no approximation is performed. * @return */ public boolean isAccurate() { return this.subset != null; } @Override public boolean isAnonymous(Transformation node, HashGroupifyEntry entry) { if (this.estimator == null) { return entry.pcount >= this.k; } else { return entry.count >= this.derivedK; } } @Override public boolean isLocalRecodingSupported() { return !isAccurate(); } @Override public boolean isMinimalClassSizeAvailable() { return this.estimator != null && this.derivedK != -1; } @Override public boolean isSubsetAvailable() { return this.subset != null; } @Override public ElementData render() { ElementData result = new ElementData("k-Map"); result.addProperty("Threshold (k)", k); if (this.estimator != null) { result.addProperty("Estimator", this.estimator.toString()); if (this.derivedK != -1) { result.addProperty("Derived threshold", this.derivedK); } if (this.populationModel != null) { result.addProperty("Population", this.populationModel.getPopulationSize()); } } return result; } @Override public String toString() { String value = "(" + this.k + ")-map"; if (this.estimator != null) { if (derivedK == -1){ value += " estimated as (unknown)-anonymity (" + this.estimator + ")"; } else { value += " estimated as (" + this.derivedK + ")-anonymity (" + this.estimator + ")"; } } return value; } /** * Calculates k, based on Poisson distribution. * @param lambda * @return */ private int calculateKPoisson(double lambda) { final double threshold = 1d - this.significanceLevel; final PoissonDistribution distribution = new PoissonDistribution(lambda); int counter = 0; double value = 0; while (value < threshold) { // value += (Math.pow(lambda, counter) * Math.exp(-lambda)) / ArithmeticUtils.factorial(counter); value = distribution.cumulativeProbability(counter); counter++; // Break if the estimated k is equal or greater than the given k, as this makes no sense. if (counter >= this.k) { // We are 100% sure that the dataset fulfills k-map value = 1d; break; } } this.type1Error = 1d - value; return counter + 1; } /** * Calculates k, based on Zero-truncated Poisson distribution. * https://en.wikipedia.org/wiki/Zero-truncated_Poisson_distribution * * @param lambda * @return */ private int calculateKZeroPoisson(double lambda) { final double threshold = 1d - this.significanceLevel; final PoissonDistribution distribution = new PoissonDistribution(lambda); final double v2 = 1d - distribution.probability(0); int counter = 1; double value = 0d; while (value < threshold) { // value2 += ((Math.pow(lambda, counter)) / (Math.exp(lambda) - 1)) * ArithmeticUtils.factorial(counter); value += distribution.probability(counter) / v2; counter++; // Break if the estimated k is equal or greater than the given k, as this makes no sense. if (counter >= this.k) { // We are 100% sure that the dataset fulfills k-map value = 1d; break; } } this.type1Error = 1d - value; return counter; } }