/*
 * ARX: Powerful Data Anonymization
 * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.deidentifier.arx.criteria;

import java.io.IOException;
import java.io.ObjectInputStream;

import org.deidentifier.arx.certificate.elements.ElementData;
import org.deidentifier.arx.framework.check.distribution.Distribution;
import org.deidentifier.arx.framework.check.groupify.HashGroupifyEntry;
import org.deidentifier.arx.framework.lattice.Transformation;

/**
 * The entropy l-diversity privacy model.
 *
 * @author Fabian Prasser
 * @author Florian Kohlmayer
 * @author Sebastian Stammler
 */
public class EntropyLDiversity extends LDiversity {

    /**
     * Enumeration of entropy estimators for the entropy-l-diversity privacy model.
     * Each constant holds the logarithm substitute \psi used for entropy
     * estimation via the formula
     * $H = \psi(N) - 1/N \sum n \psi(n)$
     *
     * @author Sebastian Stammler
     */
    public enum EntropyEstimator {

        /** The Shannon estimator */
        SHANNON(new IPsi() { public double f(int n) { return Math.log(n); } }, "Shannon"),

        /** The Grassberger estimator */
        GRASSBERGER(new IPsi() { public double f(int n) { return G(n); } }, "Grassberger");

        /**
         * Inner functional interface that allows each enum constant to carry
         * its own logarithm substitute as a function.
         *
         * @author Sebastian Stammler
         */
        private interface IPsi {
            public double f(int n);
        }

        /** The logarithm substitute of this estimator */
        private final IPsi psi;

        /** The name */
        private final String name;

        /**
         * Holds precomputed values of G_n for 1 <= n <= 100.<br>
         * G_1 = G_PRECOMPUTED[0].
         * For n > 1, we have G_{2n+1} := G_{2n}, so only the values for even
         * indices are stored: G_{2n} = G_PRECOMPUTED[n].
         */
        private static final double[] G_PRECOMPUTED = {
            -1.2703628454614782, // G_1
            0.7296371545385218,  // G_2
            1.3963038212051886,  // G_4
            1.7963038212051885,  // G_6
            2.0820181069194743,  // G_8
            2.3042403291416966,  // G_10
            2.4860585109598783,  // G_12
            2.639904664806032,   // G_14
            2.7732379981393653,  // G_16
            2.8908850569628948,  // G_18
            2.9961482148576315,  // G_20
            3.091386310095727,   // G_22
            3.178342831834857,   // G_24
            3.2583428318348573,  // G_26
            3.3324169059089312,  // G_28
            3.4013824231503107,  // G_30
            3.4658985521825687,  // G_32
            3.5265046127886293,  // G_34
            3.5836474699314866,  // G_36
            3.6377015239855406,  // G_38
            3.6889835752675917,  // G_40
            3.7377640630724698,  // G_42
            3.7842756909794466,  // G_44
            3.8287201354238913,  // G_46
            3.871273326913253,   // G_48
            3.912089653443865,   // G_50
            3.951305339718375,   // G_52
            3.9890411887749786,  // G_54
            4.025404825138615,   // G_56
            4.060492544436861,   // G_58
            4.094390849521607,   // G_60
            4.127177734767508,   // G_62
            4.1589237665135395,  // G_64
            4.18969299728277,    // G_66
            4.219543743551427,   // G_68
            4.248529250797804,   // G_70
            4.276698264882311,   // G_72
            4.304095525156284,   // G_74
            4.33076219182295,    // G_76
            4.356736217796977,   // G_78
            4.382052673493179,   // G_80
            4.40674403151787,    // G_82
            4.430840417060039,   // G_84
            4.454369828824745,   // G_86
            4.477358334571871,   // G_88
            4.49983024468423,    // G_90
            4.521808266662253,   // G_92
            4.543313643006339,   // G_94
            4.564366274585286,   // G_96
            4.5849848312863175,  // G_98
            4.605186851488337,   // G_100
        };

        /** Coefficient 1/24 of the digamma expansion used in G(n) */
        private static final double s1 = 1d / 24;

        /** Coefficient 7/960 of the digamma expansion used in G(n) */
        private static final double s2 = 7d / 960;

        /** Coefficient 31/8064 of the digamma expansion used in G(n) */
        private static final double s3 = 31d / 8064;

        /**
         * Calculates the Grassberger entropy correction term G_n:<br>
         * <br>
         * $$G_{2n+1} := G_{2n} = -\gamma - \log 2 + \sum_{k=1}^n 2/(2k-1)$$
         * <br>
         * The first 100 values are precomputed. Beyond that, an expansion of
         * the digamma function at infinity is used.
         *
         * @param n > 0 (not checked!)
         * @return G_n
         */
        private static double G(int n) {
            if (n <= 100) {
                return G_PRECOMPUTED[(n - n % 2) / 2];
            }
            n -= n % 2; // Make n even
            // Compute in double to avoid integer overflow of (n/2)^2 for large n
            final double half = n / 2;
            final double m = 1d / (half * half);
            return Math.log(n) + m * (s1 - m * (s2 - m * s3));
        }

        /**
         * Creates a new instance
         *
         * @param psi
         * @param name
         */
        private EntropyEstimator(IPsi psi, String name) {
            this.psi = psi;
            this.name = name;
        }

        /**
         * Applies the logarithm substitute of this estimator.
         *
         * The naive Shannon and the Grassberger estimators differ only in
         * using log or G, respectively, for \psi in the entropy formula
         * $H = \psi(N) - 1/N \sum n \psi(n)$
         * where N is the size of the set, the sum runs over all values of the
         * sensitive attribute, and n is the frequency of the current value.
         *
         * @param n
         * @return The logarithm substitute of the estimator applied to n
         */
        public double psi(int n) {
            return psi.f(n);
        }

        @Override
        public String toString() {
            return name;
        }
    }

    /** SVUID */
    private static final long serialVersionUID = -354688551915634000L;

    /** The entropy estimator to be used */
    private EntropyEstimator estimator;

    /**
     * Creates a new instance of the entropy l-diversity model as proposed in:<br>
     * Machanavajjhala A, Kifer D, Gehrke J. l-diversity: Privacy beyond k-anonymity.<br>
     * Transactions on Knowledge Discovery from Data (TKDD). 2007;1(1):3.
     *
     * @param attribute
     * @param l
     */
    public EntropyLDiversity(String attribute, double l) {
        super(attribute, l, false, true);
        this.estimator = EntropyEstimator.SHANNON;
    }

    /**
     * Creates a new instance of the entropy-l-diversity privacy model,
     * specifying the entropy estimator to be used. Two estimators are available:
     * <ul>
     * <li>
     * SHANNON for the usual naive Shannon estimator: this amounts to the
     * original entropy-l-diversity definition by Machanavajjhala et al.
     * </li>
     * <li>
     * GRASSBERGER for the corrected Grassberger estimator as proposed in:<br>
     * P Grassberger. Entropy Estimates from Insufficient Samplings.
     * https://arxiv.org/abs/physics/0307138v2<br>
     * This estimator generally accepts more sets as being entropy-l-diverse
     * than the naive Shannon estimator and thus increases data utility. It
     * also guarantees a more consistent meaning of the security parameter l
     * across different data sets. For details, see:<br>
     * S Stammler, S Katzenbeisser, K Hamacher.
     * Correcting Finite Sampling Issues in Entropy l-diversity.
     * Privacy in Statistical Databases 2016. LNCS Vol. 9867, pp. 135-146.
     * </li>
     * </ul>
     *
     * @param attribute The sensitive attribute
     * @param l Security parameter
     * @param estimator Entropy estimator (SHANNON or GRASSBERGER)
     */
    public EntropyLDiversity(String attribute, double l, EntropyEstimator estimator) {
        super(attribute, l, false, true);
        this.estimator = estimator;
    }

    @Override
    public EntropyLDiversity clone() {
        return new EntropyLDiversity(this.getAttribute(), this.getL(), this.getEstimator());
    }

    /**
     * Returns the entropy estimator used by this instance
     * @return The estimator
     */
    public EntropyEstimator getEstimator() {
        return estimator;
    }

    @Override
    public boolean isAnonymous(Transformation node, HashGroupifyEntry entry) {

        Distribution d = entry.distributions[index];

        // If fewer than l distinct values are present, the class cannot be l-diverse
        if (d.size() < minSize) { return false; }

        // Sum of the frequencies in the distribution (= number of elements)
        final int total = entry.count;

        // H >= log(l) is equivalent to sum n*psi(n) <= N * (psi(N) - log(l)),
        // so the sum below must stay smaller than this constant term
        final double C = total * (estimator.psi(total) - Math.log(l));

        double sum1 = 0d;
        final int[] buckets = d.getBuckets();
        for (int i = 0; i < buckets.length; i += 2) {
            if (buckets[i] != -1) { // Bucket not empty
                final int frequency = buckets[i + 1];
                sum1 += frequency * estimator.psi(frequency);
                // If the sum grows beyond C, we can abort the loop early
                if (C < sum1) { return false; }
            }
        }

        // The loop completed, so the sum stayed below C
        return true;
    }

    @Override
    public boolean isLocalRecodingSupported() {
        return true;
    }

    @Override
    public ElementData render() {
        ElementData result = new ElementData("Entropy l-diversity");
        result.addProperty("Attribute", attribute);
        result.addProperty("Threshold (l)", this.l);
        result.addProperty("Entropy estimator", this.estimator.toString());
        return result;
    }

    @Override
    public String toString() {
        return estimator.toString().toLowerCase() + "-entropy-" + l +
               "-diversity for attribute '" + attribute + "'";
    }

    /**
     * Custom de-serialization.
     *
     * If we de-serialize an older object where the entropy estimator
     * could not be chosen, set the estimator to the default: Shannon.
     *
     * @param ois
     * @throws ClassNotFoundException
     * @throws IOException
     */
    private void readObject(ObjectInputStream ois) throws ClassNotFoundException, IOException {
        // Default de-serialization
        ois.defaultReadObject();
        // Set the default estimator when de-serializing an older object
        if (this.estimator == null) {
            this.estimator = EntropyEstimator.SHANNON;
        }
    }
}
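
/*
 * Editorial sketch, not part of the ARX code base: the small demo class below
 * illustrates the estimator formula H = psi(N) - 1/N * sum n*psi(n) and the
 * threshold H >= log(l) checked by isAnonymous() above, using only the public
 * EntropyEstimator API from this file. The class name and the toy frequency
 * counts are hypothetical.
 */
class EntropyEstimatorDemo {

    /** Estimates the entropy of an equivalence class with the given value frequencies */
    private static double entropy(int[] frequencies, EntropyLDiversity.EntropyEstimator estimator) {
        int total = 0;
        for (int n : frequencies) {
            total += n;
        }
        double sum = 0d;
        for (int n : frequencies) {
            sum += n * estimator.psi(n);
        }
        // H = psi(N) - 1/N * sum n*psi(n)
        return estimator.psi(total) - sum / total;
    }

    public static void main(String[] args) {
        // Toy class: one sensitive value occurring 7 times next to two singletons.
        // For l = 2, the Shannon estimate falls just below log(2) while the
        // Grassberger estimate stays above it, illustrating that the Grassberger
        // estimator accepts more classes as entropy-l-diverse.
        int[] frequencies = { 7, 1, 1 };
        double l = 2d;
        for (EntropyLDiversity.EntropyEstimator estimator : EntropyLDiversity.EntropyEstimator.values()) {
            double h = entropy(frequencies, estimator);
            System.out.printf("%s: H = %.4f, log(l) = %.4f, l-diverse: %b%n",
                              estimator, h, Math.log(l), h >= Math.log(l));
        }
    }
}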