// EntropyLDiversity.java example
//
// Explorer
// ARX-master
// - src
/*
 * ARX: Powerful Data Anonymization
 * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.deidentifier.arx.criteria;

import java.io.IOException;
import java.io.ObjectInputStream;

import org.deidentifier.arx.certificate.elements.ElementData;
import org.deidentifier.arx.framework.check.distribution.Distribution;
import org.deidentifier.arx.framework.check.groupify.HashGroupifyEntry;
import org.deidentifier.arx.framework.lattice.Transformation;

/**
 * The entropy l-diversity privacy model.
 *
 * @author Fabian Prasser
 * @author Florian Kohlmayer
 * @author Sebastian Stammler
 */
public class EntropyLDiversity extends LDiversity {

    /**
     * Enumerator of entropy estimators for the entropy-l-diversity privacy model.
     * Each constant holds the logarithm substitute \psi that is plugged into the
     * entropy estimation formula
     *   $H = \psi(N) - 1/N \sum n \psi (n)$
     *
     * @author Sebastian Stammler
     *
     */
    public enum EntropyEstimator {

        /** The Shannon estimator: \psi(n) = log(n) */
        SHANNON(new IPsi() {
            @Override
            public double f(int n) {
                return Math.log(n);
            }
        }, "Shannon"),

        /** The Grassberger estimator: \psi(n) = G_n */
        GRASSBERGER(new IPsi() {
            @Override
            public double f(int n) {
                return G(n);
            }
        }, "Grassberger");

        /**
         * In Java, we need to use an inner functional interface
         * to have an enumerator of functions.
         *
         * @author Sebastian Stammler
         */
        private interface IPsi {
            /**
             * Evaluates the logarithm substitute
             * @param n The count
             * @return \psi(n)
             */
            public double f(int n);
        }

        /** Our inner function */
        private final IPsi            psi;
        /** The name, as returned by toString() */
        private final String          name;

        /**
         * Holds precomputed values of G_n for 1 <= n <= 100<br>
         * It is G_1 = G_PRECOMPUTED[0].
         * For n>1, we have G_{2n+1} := G_{2n}, so we only store the values for even index:
         * G_{2n} = G_PRECOMPUTED[n]
         */
        final private static double [] G_PRECOMPUTED = {
                -1.2703628454614782, // G_1
                0.7296371545385218, // G_2
                1.3963038212051886, // G_4
                1.7963038212051885, // G_6
                2.0820181069194743, // G_8
                2.3042403291416966, // G_10
                2.4860585109598783, // G_12
                2.639904664806032, // G_14
                2.7732379981393653, // G_16
                2.8908850569628948, // G_18
                2.9961482148576315, // G_20
                3.091386310095727, // G_22
                3.178342831834857, // G_24
                3.2583428318348573, // G_26
                3.3324169059089312, // G_28
                3.4013824231503107, // G_30
                3.4658985521825687, // G_32
                3.5265046127886293, // G_34
                3.5836474699314866, // G_36
                3.6377015239855406, // G_38
                3.6889835752675917, // G_40
                3.7377640630724698, // G_42
                3.7842756909794466, // G_44
                3.8287201354238913, // G_46
                3.871273326913253, // G_48
                3.912089653443865, // G_50
                3.951305339718375, // G_52
                3.9890411887749786, // G_54
                4.025404825138615, // G_56
                4.060492544436861, // G_58
                4.094390849521607, // G_60
                4.127177734767508, // G_62
                4.1589237665135395, // G_64
                4.18969299728277, // G_66
                4.219543743551427, // G_68
                4.248529250797804, // G_70
                4.276698264882311, // G_72
                4.304095525156284, // G_74
                4.33076219182295, // G_76
                4.356736217796977, // G_78
                4.382052673493179, // G_80
                4.40674403151787, // G_82
                4.430840417060039, // G_84
                4.454369828824745, // G_86
                4.477358334571871, // G_88
                4.49983024468423, // G_90
                4.521808266662253, // G_92
                4.543313643006339, // G_94
                4.564366274585286, // G_96
                4.5849848312863175, // G_98
                4.605186851488337, // G_100
        };

        /** Coefficient of m in the expansion used by G(n) for n > 100 */
        private static final double s1 = 1d/24;
        /** Coefficient of m^2 in the expansion used by G(n) for n > 100 */
        private static final double s2 = 7d/960;
        /** Coefficient of m^3 in the expansion used by G(n) for n > 100 */
        private static final double s3 = 31d/8064;

        /**
         * Calculates the Grassberger entropy correction term G_n<br>
         * <br>
         * $$G_{2n+1} := G_{2n} = -\gamma -\log2 +\sum_{k=1}^n 2/(2k-1)$$
         * The first 100 values are precomputed. After that, an expansion of the Digamma function
         * at infinity is used: with m = 1/(n/2)^2 for even n,
         * $$G_n \approx \log n + s1 m - s2 m^2 + s3 m^3$$
         *
         * @param n > 0 (not checked!)
         * @return G_n
         */
        private static double G(int n) {

            // Small arguments: table lookup. Integer division maps 2k and 2k+1 to the same slot.
            if (n <= 100) {
                return G_PRECOMPUTED[n / 2];
            }

            // Make n even, because G_{2k+1} = G_{2k}
            final int even = n - n % 2;

            // The square must be formed in double arithmetic: as an int product it overflows
            // for n/2 > 46340 and even becomes 0 for n/2 = 65536, which made m (and G) infinite.
            final double half = even / 2;
            final double m = 1d / (half * half);

            return Math.log(even) + m * (s1 - m * (s2 - m * s3));
        }

        /**
         * Creates a new instance
         * @param psi The logarithm substitute of this estimator
         * @param name The name of this estimator
         */
        private EntropyEstimator(IPsi psi, String name) {
            this.psi = psi;
            this.name = name;
        }

        /**
         * The logarithm substitute of the current estimator
         *
         * The difference in estimating the entropy by the naive Shannon or Grassberger
         * estimator is actually using log or G for \psi in the entropy formula
         *    $H = \psi(N) - 1/N \sum n \psi(n)$
         * where N is the size of the set and the sum goes over all values of the
         * sensitive attribute, n is the count of the current sensitive attribute
         *
         * @param n The count, expected to be > 0 (not checked)
         * @return The logarithm substitute of the estimator
         */
        public double psi(int n) {
            return psi.f(n);
        }

        @Override
        public String toString() {
            return name;
        }
    }
    
    /** Serialization version UID */
    private static final long   serialVersionUID = -354688551915634000L;

    /** Entropy estimator to be used; replaced by SHANNON in readObject() if it is null after de-serialization */
    private EntropyEstimator estimator;
    
    /**
     * Creates a new instance of the entropy l-diversity model that uses the
     * Shannon estimator, i.e. the model as proposed in:<br>
     * Machanavajjhala A, Kifer D, Gehrke J. l-diversity: Privacy beyond k-anonymity.<br>
     * Transactions on Knowledge Discovery from Data (TKDD). 2007;1(1):3.
     *
     * @param attribute The sensitive attribute
     * @param l The parameter l
     */
    public EntropyLDiversity(String attribute, double l){
        // Same super call and field assignment as before, routed through the general constructor
        this(attribute, l, EntropyEstimator.SHANNON);
    }

    /**
     * Creates a new instance of the entropy-l-diversity privacy model,
     * specifying the entropy estimator to be used.
     * Two estimators are available:<br>
     * <ul>
     *   <li> 
     *   SHANNON for the usual naive Shannon estimator:
     *   this amounts to the original entropy-l-diversity definition by Machanavajjhala.
     *   </li>
     *   <li>
     *   GRASSBERGER for the corrected Grassberger estimator as proposed in:
     *   P Grassberger. Entropy Estimates from Insufficient Samplings.
     *   https://arxiv.org/abs/physics/0307138v2<br>
     *   This estimator generally accepts more sets as being entropy-l-diverse than
     *   the naive Shannon estimator, thus increases data utility.
     *   It also guarantees a more consistent meaning of the security
     *   parameter l between different data sets. For details take a look at:
     *   S Stammler, S Katzenbeisser, K Hamacher.
     *   Correcting Finite Sampling Issues in Entropy l-diversity.
     *   Privacy in Statistical Databases 2016. LNCS Vol. 9867 pp 135-146
     *   </li>
     * </ul>
     *   
     * @param attribute The sensitive attribute
     * @param l Security parameter
     * @param estimator Entropy estimator (SHANNON or GRASSBERGER), must not be null
     * @throws IllegalArgumentException If no estimator is specified
     */
    public EntropyLDiversity(String attribute, double l, EntropyEstimator estimator) {
        super(attribute, l, false, true);
        // Fail fast: a missing estimator would otherwise only surface as a
        // NullPointerException when the model is evaluated, rendered or printed
        if (estimator == null) {
            throw new IllegalArgumentException("Entropy estimator must not be null");
        }
        this.estimator = estimator;
    }

	@Override
    public EntropyLDiversity clone() {
        return new EntropyLDiversity(this.getAttribute(), this.getL(), this.getEstimator());
    }

    /**
     * Returns the entropy estimator used by this instance.
     *
     * @return The entropy estimator of this model
     */
    public EntropyEstimator getEstimator() {
        return this.estimator;
    }

    /**
     * Checks whether the given equivalence class is entropy-l-diverse with regard
     * to the configured estimator. The condition
     *   $\psi(N) - 1/N \sum n \psi(n) >= \log l$
     * is evaluated in the rearranged form
     *   $\sum n \psi(n) <= N (\psi(N) - \log l)$
     * where N is the size of the class and n are the frequencies of the
     * sensitive values in the class.
     *
     * @param node The transformation (not used by this check)
     * @param entry The equivalence class
     * @return Whether the class fulfills the model
     */
    @Override
    public boolean isAnonymous(Transformation node, HashGroupifyEntry entry) {

        Distribution d = entry.distributions[index];

        // If less than l values are present skip
        if (d.size() < minSize) { return false; }

        // Sum of the frequencies in distribution (=number of elements)
        final int total = entry.count;
        // The sum over all buckets must not exceed this constant term
        final double C = total * (estimator.psi(total) - Math.log(l));

        // Aborting on a partial sum is only sound if the partial sums can never decrease
        // again, i.e. if no term is negative. This holds for the Shannon estimator
        // (log(1) = 0), but not for the Grassberger estimator, where G_1 < 0: buckets with
        // frequency 1 reduce the sum, so the decision must be based on the complete sum.
        final boolean monotone = estimator.psi(1) >= 0d;
        double sum1 = 0d;

        // The bucket array is read pairwise: entry i marks the bucket (-1 = empty),
        // entry i + 1 holds the frequency
        final int[] buckets = d.getBuckets();
        for (int i = 0; i < buckets.length; i += 2) {
            if (buckets[i] != -1) { // bucket not empty
                final int frequency = buckets[i + 1];
                sum1 += frequency * estimator.psi(frequency);
                // If the sum grows over C and cannot shrink again, we can abort the loop earlier.
                if (monotone && C < sum1) { return false; }
            }
        }

        // Decide on the complete sum
        return !(C < sum1);
    }

    @Override
    public boolean isLocalRecodingSupported() {
        // Entropy l-diversity unconditionally reports support for local recoding
        return true;
    }
    
    /**
     * Renders this model as an element with three properties:
     * the attribute, the threshold l and the name of the entropy estimator.
     */
    @Override
    public ElementData render() {
        final ElementData element = new ElementData("Entropy l-diversity");
        element.addProperty("Attribute", this.attribute);
        element.addProperty("Threshold (l)", l);
        element.addProperty("Entropy estimator", estimator.toString());
        return element;
    }
    
    /**
     * Returns a description consisting of the lower-cased estimator name,
     * the parameter l and the attribute.
     */
    @Override
    public String toString() {
        final StringBuilder builder = new StringBuilder();
        builder.append(estimator.toString().toLowerCase());
        builder.append("-entropy-").append(l);
        builder.append("-diversity for attribute '").append(attribute).append("'");
        return builder.toString();
    }

    /**
     * Custom de-serialization.
     * 
     * Objects written by older versions, in which the entropy estimator
     * could not be chosen, carry no estimator. For those the default
     * estimator (Shannon) is installed after the regular field restore.
     * 
     * @param ois The stream to read this object from
     * @throws ClassNotFoundException
     * @throws IOException
     */
    private void readObject(ObjectInputStream ois) throws ClassNotFoundException, IOException {

        // Restore all fields the regular way first
        ois.defaultReadObject();

        // Nothing more to do if the stream provided an estimator
        if (this.estimator != null) {
            return;
        }

        // Older object without estimator: fall back to the default
        this.estimator = EntropyEstimator.SHANNON;
    }
}