MetricSDNMEntropyBasedInformationLoss.java example

Explorer
ARX-master
- src
/*
 * ARX: Powerful Data Anonymization
 * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.deidentifier.arx.metric.v2;

import org.deidentifier.arx.ARXConfiguration;
import org.deidentifier.arx.DataDefinition;
import org.deidentifier.arx.certificate.elements.ElementData;
import org.deidentifier.arx.framework.check.distribution.DistributionAggregateFunction;
import org.deidentifier.arx.framework.check.groupify.HashGroupify;
import org.deidentifier.arx.framework.check.groupify.HashGroupifyEntry;
import org.deidentifier.arx.framework.data.Data;
import org.deidentifier.arx.framework.data.DataManager;
import org.deidentifier.arx.framework.data.GeneralizationHierarchy;
import org.deidentifier.arx.framework.lattice.Transformation;
import org.deidentifier.arx.metric.InformationLossWithBound;
import org.deidentifier.arx.metric.MetricConfiguration;

/**
 * This class implements the entropy-based information loss model proposed in:<br>
 * A Game Theoretic Framework for Analyzing Re-Identification Risk.
 * Zhiyu Wan, Yevgeniy Vorobeychik, Weiyi Xia, Ellen Wright Clayton,
 * Murat Kantarcioglu, Ranjit Ganta, Raymond Heatherly, Bradley A. Malin
 * PLOS|ONE. 2015.
 * 
 * 
 * @author Fabian Prasser
 */
public class MetricSDNMEntropyBasedInformationLoss extends AbstractMetricSingleDimensional {

    /** SVUID*/
    private static final long serialVersionUID = -2443537745262162075L;

    /**
     * Implements the entropy-based IL model. Ignores record suppression. Returns the loss for exactly one record.
     * 
     * @param transformation The transformation; its generalization levels are used to look up the domain shares
     * @param entry The equivalence class for which the loss is computed
     * @param shares Domain shares, one per generalized dimension
     * @param microaggregationFunctions Functions for microaggregated attributes. May be <code>null</code>,
     *            in which case microaggregated attributes do not contribute to the result
     * @param microaggregationStartIndex Index into <code>entry.distributions</code> of the distribution
     *            belonging to the first microaggregation function
     * @param maxIL The maximal information loss, as returned by
     *            {@link #getMaximalEntropyBasedInformationLoss(DomainShare[], int[])}
     * @return The information loss of one record, clamped to the range [0, 1]
     * @throws IllegalStateException If the computed value is NaN or deviates from the range [0, 1]
     *             by 0.001 or more
     */
    public static double getEntropyBasedInformationLoss(Transformation transformation, 
                                                        HashGroupifyEntry entry,
                                                        DomainShare[] shares,
                                                        DistributionAggregateFunction[] microaggregationFunctions,
                                                        int microaggregationStartIndex,
                                                        double maxIL) {

        // We transform the formula, to make evaluating it more efficient.
        //
        // With maxIL = log(size_1 * size_2 * ... * size_n) we define
        // IL = [-log( 1 / (share_1 * size_1) ) - log ( 1 / (share_2 * size_2) ) ... - log( 1 / (share_n * size_n) ) ] / maxIL
        //
        // Step 1:
        //
        // IL = [log(share_1 * size_1 ) + log (share_2 * size_2 ) ... + log( share_n * size_n) ] / maxIL
        //
        // Step 2:
        //
        // IL = [log(share_1 * share_2 * ... * share_n) + log(size_1 * size_2 * ... * size_n) ] / maxIL
        //
        // Step 3:
        // 
        // IL = [log(share_1 * share_2 * ... * share_n) + maxIL ] / maxIL
        //
        // Step 4:
        // 
        // IL = log(share_1 * share_2 * ... * share_n) / maxIL + 1
        //
        // For attributes transformed with microaggregation, we set share_i to 1/#distinct-values-in-eq-class and size_i to the #distinct-values-in-dataset

        // Product of the shares of all generalized dimensions
        int[] generalization = transformation.getGeneralization();
        double infoLoss = 1d;
        for (int dimension = 0; dimension < shares.length; dimension++) {
            int value = entry.key[dimension];
            int level = generalization[dimension];
            infoLoss *= shares[dimension].getShare(value, level);
        }

        // Factors contributed by microaggregated attributes, if any
        if (microaggregationFunctions != null) {
            for (int dimension = 0; dimension < microaggregationFunctions.length; dimension++) {
                infoLoss *= microaggregationFunctions[dimension].getInformationLoss(entry.distributions[microaggregationStartIndex + dimension]);
            }
        }
        
        // Numerator of step 4
        double logInfoLoss = Math.log10(infoLoss);
        
        // Degenerate case: maxIL is zero when the product of all domain sizes is 1. If the
        // product of all shares is 1 as well, step 4 evaluates to 0/0 + 1 = NaN, which the
        // sanity check below would reject. The original definition then sums up log(1) terms
        // only, so we report a loss of zero. Inconsistent inputs (non-zero numerator with a
        // zero maxIL) are still caught by the sanity check.
        if (maxIL == 0d && logInfoLoss == 0d) {
            return 0d;
        }
        
        // Finalize
        double result = logInfoLoss / maxIL + 1d;
        
        // Sanity check: tolerate small floating-point deviations only
        if (Double.isNaN(result) || result <= -0.001d || result >= +1.001d) {
            throw new IllegalStateException("Value (" + result + ") out of range [0,1]");
        }
        
        // Fix rounding problems
        result = result < 0d ? 0d : result;
        result = result > 1d ? 1d : result;
        
        // Return
        return result;
    }

    /**
     * Returns the maximal entropy-based information loss, which is the base-10 logarithm
     * of the product of all domain sizes.
     * 
     * @param domainShares For generalized attributes
     * @param domainSizes For microaggregated attributes
     * @return log10(size_1 * size_2 * ... * size_n)
     */
    public static double getMaximalEntropyBasedInformationLoss(DomainShare[] domainShares,
                                                               int[] domainSizes) {
        double maxIL = 1d;
        for (DomainShare share : domainShares) {
            maxIL *= share.getDomainSize();
        }
        for (int size : domainSizes) {
            maxIL *= size;
        }
        maxIL = Math.log10(maxIL);
        return maxIL;
    }

    /** Domain shares for each dimension. Set in initializeInternal(). */
    private DomainShare[]                     shares;

    /** MaxIL, the normalization constant of the model. Set in initializeInternal(). */
    private double                            maxIL;

    /**
     * Creates a new instance. Default constructor which treats all transformation methods equally.
     */
    public MetricSDNMEntropyBasedInformationLoss() {
        this(0.5d);
    }

    /**
     * Creates a new instance.
     * 
     * @param gsFactor A factor [0,1] weighting generalization and suppression.
     *            The default value is 0.5, which means that generalization
     *            and suppression will be treated equally. A factor of 0
     *            will favor suppression, and a factor of 1 will favor
     *            generalization. The values in between can be used for
     *            balancing both methods.
     */
    public MetricSDNMEntropyBasedInformationLoss(double gsFactor) {
        super(true, false, false, gsFactor);
    }

    /**
     * Returns the maximal information loss, which equals the number of tuples.
     * 
     * @return The maximal information loss
     * @throws IllegalStateException If the number of tuples is not available yet
     */
    @Override
    public ILSingleDimensional createMaxInformationLoss() {
        Double rows = getNumTuples();
        if (rows == null) {
            throw new IllegalStateException("Metric must be initialized first");
        } else {
            return new ILSingleDimensional(rows);
        }
    }

    /**
     * Returns the minimal information loss, which is zero.
     * 
     * @return The minimal information loss
     */
    @Override
    public ILSingleDimensional createMinInformationLoss() {
        return new ILSingleDimensional(0d);
    }

    /**
     * Returns the configuration of this metric. It is built from the
     * generalization/suppression factor and the aggregate function of this instance.
     * 
     * @return The configuration
     */
    public MetricConfiguration getConfiguration() {
        return new MetricConfiguration(false, 
                                       super.getGeneralizationSuppressionFactor(), 
                                       false, 
                                       0.0d, 
                                       this.getAggregateFunction());
    }

    @Override
    public String getName() {
        return "Entropy-based information loss";
    }

    @Override
    public boolean isAbleToHandleMicroaggregation() {
        return true;
    }

    @Override
    public boolean isGSFactorSupported() {
        return true;
    }

    /**
     * Renders the properties of this metric: monotonicity for the configured
     * maximal number of outliers, generalization factor and suppression factor.
     * 
     * @param config The configuration from which the maximal number of outliers is read
     * @return The rendered element
     */
    @Override
    public ElementData render(ARXConfiguration config) {
        ElementData result = new ElementData("Entropy-based information loss");
        result.addProperty("Monotonic", this.isMonotonic(config.getMaxOutliers()));
        result.addProperty("Generalization factor", this.getGeneralizationFactor());
        result.addProperty("Suppression factor", this.getSuppressionFactor());
        return result;
    }
    
    @Override
    public String toString() {
        return "EntropyBasedInformationLoss";
    }

    /**
     * Computes the information loss and a bound over all equivalence classes of the groupify.
     * Classes that are not outliers contribute their weighted entropy-based loss, outliers
     * contribute their size weighted with the suppression factor. The bound always uses the
     * weighted entropy-based loss, regardless of whether a class is an outlier.
     * 
     * @param transformation The transformation
     * @param g The groupify holding the equivalence classes
     * @return Information loss with bound
     */
    @Override
    protected ILSingleDimensionalWithBound getInformationLossInternal(Transformation transformation, HashGroupify g) {
        
        // Prepare
        double real = 0;
        double bound = 0;
        double gFactor = super.getGeneralizationFactor();
        double sFactor = super.getSuppressionFactor();
        HashGroupifyEntry entry = g.getFirstEquivalenceClass();
        DistributionAggregateFunction[] microaggregationFunctions = super.getMicroaggregationFunctions();
        int microaggregationStartIndex = super.getMicroaggregationStartIndex();

        // Compute
        while (entry != null) {
            
            // Empty classes do not contribute
            if (entry.count > 0) {
                
                // Per-record loss, multiplied by the number of records in the class
                double loss = entry.count * getEntropyBasedInformationLoss(  transformation,
                                                                             entry,
                                                                             shares,
                                                                             microaggregationFunctions,
                                                                             microaggregationStartIndex,
                                                                             maxIL);
                real += entry.isNotOutlier ? gFactor * loss : sFactor * entry.count;
                bound += gFactor * loss;
            }
            entry = entry.nextOrdered;
        }
        
        // Return
        return super.createInformationLoss(real, bound);
    }

    /**
     * Computes the information loss and a bound for a single equivalence class, using the
     * same rules as the variant that processes a complete groupify.
     * 
     * @param transformation The transformation
     * @param entry The equivalence class
     * @return Information loss with bound
     */
    @Override
    protected InformationLossWithBound<ILSingleDimensional> getInformationLossInternal(Transformation transformation,
                                                                                       HashGroupifyEntry entry) {
        
        DistributionAggregateFunction[] microaggregationFunctions = super.getMicroaggregationFunctions();
        int microaggregationStartIndex = super.getMicroaggregationStartIndex();
        double gFactor = super.getGeneralizationFactor();
        double sFactor = super.getSuppressionFactor();
        
        // Unweighted loss of all records in the class
        double bound = entry.count * getEntropyBasedInformationLoss(  transformation,
                                                                      entry,
                                                                      shares,
                                                                      microaggregationFunctions,
                                                                      microaggregationStartIndex,
                                                                      maxIL);
        double loss = entry.isNotOutlier ? gFactor * bound : sFactor * entry.count;
        return super.createInformationLoss(loss, gFactor * bound);
    }

    /**
     * Always returns <code>null</code>.
     * NOTE(review): presumably this signals that no bound can be derived without
     * a groupify - confirm against the callers in the superclass.
     * 
     * @param transformation The transformation
     * @return <code>null</code>
     */
    @Override
    protected ILSingleDimensional getLowerBoundInternal(Transformation transformation) {
        return null;
    }

    /**
     * Computes a bound over all equivalence classes of the groupify. In contrast to
     * getInformationLossInternal(), no microaggregation functions are passed to the
     * loss function, so only the domain shares of generalized attributes are evaluated.
     * 
     * @param transformation The transformation
     * @param groupify The groupify holding the equivalence classes
     * @return The bound
     */
    @Override
    protected ILSingleDimensional getLowerBoundInternal(Transformation transformation,
                                                        HashGroupify groupify) {

        // Compute
        double bound = 0;
        double gFactor = super.getGeneralizationFactor();
        HashGroupifyEntry entry = groupify.getFirstEquivalenceClass();
        while (entry != null) {
            
            // Empty classes do not contribute
            bound += entry.count == 0 ? 0d : gFactor * entry.count * getEntropyBasedInformationLoss(  transformation,
                                                                                                      entry,
                                                                                                      shares,
                                                                                                      null,
                                                                                                      0,
                                                                                                      maxIL);
            entry = entry.nextOrdered;
        }
        
        // Return
        return new ILSingleDimensional(bound);
    }
    
    /**
     * For subclasses.
     * 
     * @return The domain shares obtained during initialization
     */
    protected DomainShare[] getShares() {
        return this.shares;
    }

    /**
     * Initializes the superclass, then obtains the domain shares from the data manager
     * and derives the maximal information loss from them.
     */
    @Override
    protected void initializeInternal(final DataManager manager,
                                      final DataDefinition definition,
                                      final Data input,
                                      final GeneralizationHierarchy[] hierarchies,
                                      final ARXConfiguration config) {

        // Prepare weights
        super.initializeInternal(manager, definition, input, hierarchies, config);

        // Compute domain shares
        this.shares =  manager.getDomainShares();

        // Calculate MaxIL
        this.maxIL = getMaximalEntropyBasedInformationLoss(this.shares, super.getMicroaggregationDomainSizes());
    }
}