RiskModelAttributes.java example

Explorer
ARX-master
- src
/*
 * ARX: Powerful Data Anonymization
 * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.deidentifier.arx.risk;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.deidentifier.arx.ARXConfiguration;
import org.deidentifier.arx.ARXPopulationModel;
import org.deidentifier.arx.ARXSolverConfiguration;
import org.deidentifier.arx.DataHandleInternal;
import org.deidentifier.arx.common.WrappedBoolean;
import org.deidentifier.arx.common.WrappedInteger;
import org.deidentifier.arx.exceptions.ComputationInterruptedException;

/**
 * A class for analyzing attribute-related risks. Calculates alpha-distinction and
 * alpha-separation as described in R. Motwani et al.
 * "Efficient algorithms for masking and finding quasi-identifiers" Proc. VLDB Conf., 2007.
 * <p>
 * All non-empty subsets of the given set of attributes are evaluated eagerly in the
 * constructor, so the amount of work grows exponentially with the number of attributes.
 *
 * @author Fabian Prasser
 * @author Maximilian Zitzmann
 */
public class RiskModelAttributes {

    /**
     * Risks associated with a certain quasi-identifier, i.e. with one non-empty
     * combination of attributes. Instances are ordered by the number of attributes
     * first, then by distinction, then by separation and finally by the textual
     * representation of the attribute list.
     *
     * @author Fabian Prasser
     * @author Maximilian Zitzmann
     */
    public final class QuasiIdentifierRisk implements Comparable<QuasiIdentifierRisk> {

        /** The attributes forming this quasi-identifier */
        private final List<String> identifier;
        /** Alpha distinction of this quasi-identifier */
        private final double       alphaDistinction;
        /** Alpha separation of this quasi-identifier */
        private final double       alphaSeparation;

        /**
         * Creates a new instance and immediately computes distinction and separation
         * from the equivalence class model obtained for the given attributes.
         *
         * @param identifier the attributes forming the quasi-identifier; the list is
         *                   stored as-is (no copy is made)
         */
        private QuasiIdentifierRisk(List<String> identifier) {

            // Store identifier
            this.identifier = identifier;

            // Calculate distribution of class sizes for exactly these attributes
            RiskModelHistogram histogram = new RiskEstimateBuilder(population,
                                                                   handle,
                                                                   new HashSet<String>(identifier),
                                                                   stop,
                                                                   solverconfig,
                                                                   arxconfig).getEquivalenceClassModel();

            // Calculate distinction and separation
            this.alphaDistinction = getAlphaDistinction(histogram);
            this.alphaSeparation = getAlphaSeparation(histogram);
        }

        /**
         * Orders by (1) number of attributes, (2) distinction, (3) separation and
         * (4) the string representation of the attribute list, each ascending.
         */
        @Override
        public int compareTo(QuasiIdentifierRisk other) {

            // Compare size
            int cmp = Integer.compare(this.identifier.size(), other.identifier.size());
            if (cmp != 0) {
                return cmp;
            }

            // Compare distinction
            cmp = Double.compare(this.alphaDistinction, other.alphaDistinction);
            if (cmp != 0) {
                return cmp;
            }

            // Compare separation
            cmp = Double.compare(this.alphaSeparation, other.alphaSeparation);
            if (cmp != 0) {
                return cmp;
            }

            // Compare lexicographically (tie-breaker)
            return this.identifier.toString().compareTo(other.identifier.toString());
        }

        /**
         * Returns the alpha distinction parameter of this quasi-identifier
         *
         * @return the alpha distinction
         */
        public double getDistinction() {
            return alphaDistinction;
        }

        /**
         * Returns the attributes in this quasi-identifier. The internal list is
         * returned directly, not a copy.
         *
         * @return the identifier
         */
        public List<String> getIdentifier() {
            return identifier;
        }

        /**
         * Returns the alpha separation parameter of this quasi-identifier
         *
         * @return the alpha separation
         */
        public double getSeparation() {
            return alphaSeparation;
        }
    }

    /** Stop flag; when its value is true, the computation is aborted via {@link #checkInterrupt()} */
    private final WrappedBoolean         stop;
    /** Results, sorted according to {@link QuasiIdentifierRisk#compareTo(QuasiIdentifierRisk)} */
    private final QuasiIdentifierRisk[]  risks;
    /** Just needed for creating risk models */
    private final ARXPopulationModel     population;
    /** Data handle */
    private final DataHandleInternal     handle;
    /** Just needed for creating risk models */
    private final ARXSolverConfiguration solverconfig;
    /** Just needed for creating risk models */
    private final ARXConfiguration       arxconfig;

    /**
     * Creates a new instance and computes the risks of every non-empty subset of the
     * given attributes.
     *
     * @param population     population model, passed on to the risk estimate builder
     * @param handle         data handle; also used to order attributes by column index
     * @param identifiers    the attributes whose non-empty subsets are analyzed
     * @param stop           stop flag, polled during the computation
     * @param percentageDone progress indicator, updated to a value in [0, 100] after each subset
     * @param solverconfig   solver configuration, passed on to the risk estimate builder
     * @param arxconfig      ARX configuration, passed on to the risk estimate builder
     * @throws ComputationInterruptedException if the stop flag is set during the computation
     */
    RiskModelAttributes(final ARXPopulationModel population,
                        final DataHandleInternal handle,
                        final Set<String> identifiers,
                        final WrappedBoolean stop,
                        final WrappedInteger percentageDone,
                        final ARXSolverConfiguration solverconfig,
                        final ARXConfiguration arxconfig) {

        this.population = population;
        this.handle = handle;
        this.stop = stop;
        this.solverconfig = solverconfig;
        this.arxconfig = arxconfig;

        // Build the list of quasi-identifiers, each ordered by column index
        List<List<String>> qis = new ArrayList<>();
        for (Set<String> set : getPowerSet(identifiers)) {

            // Exclude empty set
            if (set.isEmpty()) {
                continue;
            }

            // Create and sort by column index
            List<String> qi = new ArrayList<String>(set);
            Collections.sort(qi, new Comparator<String>() {
                @Override
                public int compare(String o1, String o2) {
                    return Integer.compare(handle.getColumnIndexOf(o1),
                                           handle.getColumnIndexOf(o2));
                }
            });
            qis.add(qi);
        }

        // Compute risk estimates for all non-empty elements of the power set
        this.risks = new QuasiIdentifierRisk[qis.size()];
        int done = 0;
        for (List<String> qi : qis) {
            checkInterrupt();
            this.risks[done++] = new QuasiIdentifierRisk(qi);

            // Report the fraction of subsets that have been completed so far,
            // so that the value reaches 100 once the last subset is done
            percentageDone.value = (int) Math.round((double) done / (double) qis.size() * 100d);
        }

        // Now sort the results
        Arrays.sort(this.risks);
    }

    /**
     * Returns the quasi-identifiers, sorted in ascending order as defined by
     * {@link QuasiIdentifierRisk#compareTo(QuasiIdentifierRisk)}. The internal array
     * is returned directly, not a copy.
     *
     * @return the risks of all analyzed quasi-identifiers
     */
    public QuasiIdentifierRisk[] getAttributeRisks() {
        return this.risks;
    }

    /**
     * Checks for interrupts
     *
     * @throws ComputationInterruptedException if the stop flag is set
     */
    private void checkInterrupt() {
        if (stop.value) {
            throw new ComputationInterruptedException();
        }
    }

    /**
     * Calculates the Gaussian sum formula
     *
     * @param n the number to sum to
     * @return the sum from 1 to n, i.e. n * (n + 1) / 2
     */
    private double gaussianSum(double n) {
        return (n * (n + 1d)) / 2d;
    }

    /**
     * We calculate a value alpha in [0,1] such that the set of attributes becomes a key
     * after the removal of a fraction of at most 1-alpha of the records in the table.
     * This equals the number of distinct combinations of values (= number of eqClasses) / number of all records.
     *
     * @param histogram the distribution of equivalence class sizes
     * @return the calculated alpha distinction
     */
    private double getAlphaDistinction(RiskModelHistogram histogram) {

        // This is almost trivial
        return histogram.getNumClasses() / histogram.getNumRecords();
    }

    /**
     * Two records are separated by the QI if they do not share the same quasi-identifying values.
     * From the set of all possible combinations of records, this method returns the fraction alpha (in [0, 1])
     * of all combinations which are separated by the current QI.
     * <p>
     * NOTE(review): if the histogram reports fewer than two records, there are no pairs
     * of records and the division below is 0/0, which yields NaN - confirm that callers
     * can deal with this.
     *
     * @param histogram the distribution of equivalence class sizes
     * @return the calculated alpha separation
     * @throws ComputationInterruptedException if the stop flag is set during the computation
     */
    private double getAlphaSeparation(RiskModelHistogram histogram) {

        // Obtain class sizes, stored as consecutive pairs of (size, multiplicity)
        int[] classes = histogram.getHistogram();

        // Number of unordered pairs of records: when we compare 4 records (only in one
        // direction, i.e. "a" to "b" but not "b" to "a") we have 3 + 2 + 1 comparisons,
        // which is the Gaussian sum of (number of records - 1)
        double totalNumberOfComparisons = gaussianSum(histogram.getNumRecords() - 1d);

        // Number of pairs of records that differ in at least one attribute value
        double separatedRecords = 0;

        // Records in classes which have not been processed yet; nothing processed so far
        double numberRecordsLeft = histogram.getNumRecords();

        // For each class-size
        for (int i = 0; i < classes.length; i += 2) {

            // Obtain size and multiplicity of that class
            double classSize = classes[i];
            double classMultiplicity = classes[i + 1];

            // Remove the records of the current class size from the remaining records
            numberRecordsLeft -= classSize * classMultiplicity;

            // All records in classes of the current size are different from all remaining records
            double separatedRecordsCurrentClass = classMultiplicity * classSize * numberRecordsLeft;

            // Moreover, all records in each class of the current size are different from all
            // records in other classes of the same size: (m choose 2) pairs of classes with
            // size * size pairs of records each
            separatedRecordsCurrentClass += ((classMultiplicity - 1d) * classMultiplicity * (classSize * classSize)) / 2d;

            // Add number of separated pairs to result
            separatedRecords += separatedRecordsCurrentClass;

            // Check interrupt
            checkInterrupt();
        }

        // Alpha separation is the fraction of all pairs of records which are separated
        // by the subset of attributes
        return separatedRecords / totalNumberOfComparisons;
    }

    /**
     * Returns the power set of the given set, including the empty set. The result
     * is computed recursively: the power set of all elements but one is extended by
     * adding that element to a copy of each of its members.
     *
     * @param originalSet the set to compute the power set for; it is not modified
     * @return a new set containing all subsets of the given set
     * @throws ComputationInterruptedException if the stop flag is set during the computation
     */
    private <T> Set<Set<T>> getPowerSet(Set<T> originalSet) {
        checkInterrupt();
        Set<Set<T>> sets = new HashSet<Set<T>>();

        // Base case: the power set of the empty set contains only the empty set
        if (originalSet.isEmpty()) {
            sets.add(new HashSet<T>());
            return sets;
        }

        // Split into one element and the rest
        List<T> list = new ArrayList<T>(originalSet);
        T head = list.get(0);
        Set<T> rest = new HashSet<T>(list.subList(1, list.size()));

        // Each subset of the rest occurs once with and once without the head element
        for (Set<T> set : getPowerSet(rest)) {
            checkInterrupt();
            Set<T> newSet = new HashSet<T>();
            newSet.add(head);
            newSet.addAll(set);
            sets.add(newSet);
            sets.add(set);
        }
        return sets;
    }
}