/*
 * ARX: Powerful Data Anonymization
 * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.deidentifier.arx.risk;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.deidentifier.arx.ARXConfiguration;
import org.deidentifier.arx.ARXPopulationModel;
import org.deidentifier.arx.ARXSolverConfiguration;
import org.deidentifier.arx.DataHandleInternal;
import org.deidentifier.arx.common.WrappedBoolean;
import org.deidentifier.arx.common.WrappedInteger;
import org.deidentifier.arx.exceptions.ComputationInterruptedException;

/**
 * A class for analyzing attribute-related risks. Calculates alpha-distinction and
 * alpha-separation as described in R. Motwani et al.
 * "Efficient algorithms for masking and finding quasi-identifiers" Proc. VLDB Conf., 2007.
 * <p>
 * All results are computed eagerly in the constructor: one {@link QuasiIdentifierRisk}
 * is created for every non-empty subset of the given attributes, so the amount of work
 * grows exponentially with the number of attributes.
 *
 * @author Fabian Prasser
 * @author Maximilian Zitzmann
 */
public class RiskModelAttributes {

    /**
     * Risks associated with a certain quasi-identifier, i.e. a certain combination
     * of attributes.
     *
     * @author Fabian Prasser
     * @author Maximilian Zitzmann
     */
    public final class QuasiIdentifierRisk implements Comparable<QuasiIdentifierRisk> {

        /** The attributes forming this quasi-identifier */
        private final List<String> identifier;

        /** Alpha distinction of this quasi-identifier */
        private final double       alphaDistinction;

        /** Alpha separation of this quasi-identifier */
        private final double       alphaSeparation;

        /**
         * Creates a new instance and immediately computes distinction and separation
         * from the equivalence class histogram induced by the given attributes.
         *
         * @param identifier the attributes forming the quasi-identifier; the list is stored as-is
         */
        private QuasiIdentifierRisk(List<String> identifier) {

            // Store identifier
            this.identifier = identifier;

            // Calculate distribution of class sizes
            RiskModelHistogram histogram = new RiskEstimateBuilder(population,
                                                                   handle,
                                                                   new HashSet<String>(identifier),
                                                                   stop,
                                                                   solverconfig,
                                                                   arxconfig).getEquivalenceClassModel();

            // Calculate distinction and separation
            this.alphaDistinction = getAlphaDistinction(histogram);
            this.alphaSeparation = getAlphaSeparation(histogram);
        }

        /**
         * Orders quasi-identifiers by (1) number of attributes, (2) alpha distinction,
         * (3) alpha separation and finally (4) the string representation of the
         * attribute list, each in ascending order.
         */
        @Override
        public int compareTo(QuasiIdentifierRisk other) {

            // Compare size
            int cmp = Integer.compare(this.identifier.size(), other.identifier.size());
            if (cmp != 0) {
                return cmp;
            }

            // Compare distinction
            cmp = Double.compare(this.alphaDistinction, other.alphaDistinction);
            if (cmp != 0) {
                return cmp;
            }

            // Compare separation
            cmp = Double.compare(this.alphaSeparation, other.alphaSeparation);
            if (cmp != 0) {
                return cmp;
            }

            // Compare lexicographically
            return this.identifier.toString().compareTo(other.identifier.toString());
        }

        /**
         * Returns the alpha distinction parameter of this quasi-identifier
         *
         * @return the alpha distinction
         */
        public double getDistinction() {
            return alphaDistinction;
        }

        /**
         * Returns the attributes in this quasi-identifier
         *
         * @return the identifier
         */
        public List<String> getIdentifier() {
            return identifier;
        }

        /**
         * Returns the alpha separation parameter of this quasi-identifier
         *
         * @return the alpha separation
         */
        public double getSeparation() {
            return alphaSeparation;
        }
    }

    /** Stop flag */
    private final WrappedBoolean         stop;

    /** Results */
    private final QuasiIdentifierRisk[]  risks;

    /** Just needed for creating risk models */
    private final ARXPopulationModel     population;

    /** Data handle */
    private final DataHandleInternal     handle;

    /** Just needed for creating risk models */
    private final ARXSolverConfiguration solverconfig;

    /** Just needed for creating risk models */
    private final ARXConfiguration       arxconfig;

    /**
     * Creates a new instance and computes the risks of all non-empty subsets
     * of the given attributes.
     *
     * @param population     passed through to the risk estimate builder
     * @param handle         the data handle; also used to order attributes by column index
     * @param identifiers    the attributes whose power set is analyzed
     * @param stop           stop flag; when set, the computation is aborted
     * @param percentageDone receives the progress in percent after each analyzed subset
     * @param solverconfig   passed through to the risk estimate builder
     * @param arxconfig      passed through to the risk estimate builder
     * @throws ComputationInterruptedException if the stop flag is set during the computation
     */
    RiskModelAttributes(final ARXPopulationModel population,
                        final DataHandleInternal handle,
                        final Set<String> identifiers,
                        final WrappedBoolean stop,
                        final WrappedInteger percentageDone,
                        final ARXSolverConfiguration solverconfig,
                        final ARXConfiguration arxconfig) {

        this.population = population;
        this.handle = handle;
        this.stop = stop;
        this.solverconfig = solverconfig;
        this.arxconfig = arxconfig;

        // Orders attribute names by their column index in the data handle
        final Comparator<String> byColumnIndex = new Comparator<String>() {
            @Override
            public int compare(String o1, String o2) {
                return Integer.compare(handle.getColumnIndexOf(o1), handle.getColumnIndexOf(o2));
            }
        };

        // Collect all non-empty subsets, each as a list sorted by column index
        List<List<String>> qis = new ArrayList<>();
        for (Set<String> set : getPowerSet(identifiers)) {
            if (set.isEmpty()) {
                continue;
            }
            List<String> qi = new ArrayList<String>(set);
            Collections.sort(qi, byColumnIndex);
            qis.add(qi);
        }

        // Compute risk estimates for all elements in the power set
        Map<List<String>, QuasiIdentifierRisk> scores = new HashMap<>();
        final double total = qis.size();
        int done = 0;
        for (List<String> qi : qis) {
            checkInterrupt();
            scores.put(qi, new QuasiIdentifierRisk(qi));

            // Count the subset that has just been finished *before* reporting,
            // so that the progress reaches 100 after the last subset
            done++;
            percentageDone.value = (int) Math.round((double) done / total * 100d);
        }

        // Now create sorted array
        this.risks = scores.values().toArray(new QuasiIdentifierRisk[scores.size()]);
        Arrays.sort(this.risks);
    }

    /**
     * Returns the quasi-identifiers, sorted by risk
     * (see {@link QuasiIdentifierRisk#compareTo(QuasiIdentifierRisk)} for the order).
     *
     * @return the internal result array (not a copy)
     */
    public QuasiIdentifierRisk[] getAttributeRisks() {
        return this.risks;
    }

    /**
     * Checks for interrupts
     *
     * @throws ComputationInterruptedException if the stop flag is set
     */
    private void checkInterrupt() {
        if (stop.value) {
            throw new ComputationInterruptedException();
        }
    }

    /**
     * Calculates the Gaussian sum formula
     *
     * @param n the number to sum to
     * @return the sum from 1 to n
     */
    private double gaussianSum(double n) {
        return (n * (n + 1d)) / 2d;
    }

    /**
     * We calculate a value alpha in [0,1] such that the set of attributes becomes a key
     * after the removal of a fraction of at most 1-alpha of the records in the table.
     * This equals the number of distinct combinations of values (= number of equivalence
     * classes) divided by the number of all records.
     *
     * @param histogram the distribution of equivalence class sizes
     * @return the calculated alpha distinction
     */
    private double getAlphaDistinction(RiskModelHistogram histogram) {
        // Widen both operands to double first, so that the quotient is never
        // truncated by an integer division
        double numClasses = histogram.getNumClasses();
        double numRecords = histogram.getNumRecords();
        return numClasses / numRecords;
    }

    /**
     * Two records are separated by the QI if they do not share the same quasi-identifying values.
     * From the set of all possible combinations of records, this method returns the fraction alpha
     * (in [0, 1]) of all combinations which are separated by the current QI.
     * <p>
     * Note: for a histogram with fewer than two records the number of record pairs is zero,
     * so the division below has a zero denominator and the result is not a finite number.
     *
     * @param histogram the distribution of equivalence class sizes
     * @return the calculated alpha separation
     * @throws ComputationInterruptedException if the stop flag is set during the computation
     */
    private double getAlphaSeparation(RiskModelHistogram histogram) {

        // Class sizes, read below as consecutive pairs [size, multiplicity, size, multiplicity, ...]
        int[] classes = histogram.getHistogram();

        // Number of unordered record pairs: comparing n records in one direction only
        // ("a" with "b", but not "b" with "a") takes (n - 1) + (n - 2) + ... + 1 comparisons
        double totalNumberOfComparisons = gaussianSum(histogram.getNumRecords() - 1d);

        // Number of record pairs found to be separated so far
        double separatedRecords = 0;

        // Records in entries of the histogram that have not been visited yet
        double numberRecordsLeft = histogram.getNumRecords();

        // For each class-size
        for (int i = 0; i < classes.length; i += 2) {

            // Obtain size and multiplicity of that class
            double classSize = classes[i];
            double classMultiplicity = classes[i + 1];

            // Remove the records of the current entry from the pool of remaining records
            numberRecordsLeft -= classSize * classMultiplicity;

            // All records in classes of the current size are different from all remaining records
            double separatedRecordsCurrentClass = classMultiplicity * classSize * numberRecordsLeft;

            // Moreover, all records in each class of the current size are different from all
            // records in the other classes of the same size: (m choose 2) pairs of classes,
            // each contributing size * size pairs of records
            separatedRecordsCurrentClass += ((classMultiplicity - 1d) * classMultiplicity * (classSize * classSize)) / 2d;

            // Add number of separated pairs to result
            separatedRecords += separatedRecordsCurrentClass;

            // Check interrupt
            checkInterrupt();
        }

        // Alpha separation is the fraction of all record pairs separated by the attributes
        return separatedRecords / totalNumberOfComparisons;
    }

    /**
     * Returns the power set of the given set, including the empty set. The power set
     * is built recursively: every subset of the set without its first element is
     * added once as-is and once extended by that first element.
     *
     * @param originalSet the set to build the power set for
     * @return a new set containing all subsets of the given set
     * @throws ComputationInterruptedException if the stop flag is set during the computation
     */
    private <T> Set<Set<T>> getPowerSet(Set<T> originalSet) {
        checkInterrupt();
        Set<Set<T>> sets = new HashSet<Set<T>>();

        // Base case: the only subset of the empty set is the empty set
        if (originalSet.isEmpty()) {
            sets.add(new HashSet<T>());
            return sets;
        }

        // Split into first element and remainder
        List<T> list = new ArrayList<T>(originalSet);
        T head = list.get(0);
        Set<T> rest = new HashSet<T>(list.subList(1, list.size()));

        // Combine each subset of the remainder with and without the first element
        for (Set<T> set : getPowerSet(rest)) {
            checkInterrupt();
            Set<T> newSet = new HashSet<T>();
            newSet.add(head);
            newSet.addAll(set);
            sets.add(newSet);
            sets.add(set);
        }
        return sets;
    }
}