/*
 * ARX: Powerful Data Anonymization
 * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.deidentifier.arx.risk;

import java.util.Arrays;
import java.util.Set;

import org.deidentifier.arx.DataHandleInternal;
import org.deidentifier.arx.common.Groupify;
import org.deidentifier.arx.common.Groupify.Group;
import org.deidentifier.arx.common.TupleWrapper;
import org.deidentifier.arx.common.WrappedBoolean;
import org.deidentifier.arx.common.WrappedInteger;
import org.deidentifier.arx.exceptions.ComputationInterruptedException;

/**
 * This class implements risk measures as proposed by El Emam in
 * "Guide to the De-Identification of Personal Health Information",
 * "Measuring the Probability of Re-Identification".
 * <p>
 * All three summaries (prosecutor, journalist and marketer risk) are computed
 * eagerly in the constructor and exposed through getters afterwards.
 *
 * @author Fabian Prasser
 */
public class RiskModelSampleSummary {

    /**
     * Journalist risk
     *
     * @author Fabian Prasser
     */
    public static class JournalistRisk extends RiskSummary {

        /**
         * Creates a new instance
         *
         * @param rA Proportion of records with risk above threshold
         * @param rB Maximum probability of re-identification
         * @param rC Proportion of records that can be re-identified on average
         */
        protected JournalistRisk(double rA, double rB, double rC) {
            super(rA, rB, rC);
        }
    }

    /**
     * Marketer risk
     *
     * @author Fabian Prasser
     */
    public static class MarketerRisk {

        /** Proportion of records that can be re-identified on average */
        private final double rC;

        /**
         * Creates a new instance
         *
         * @param rC Proportion of records that can be re-identified on average
         */
        protected MarketerRisk(double rC) {
            this.rC = rC;
        }

        /**
         * Proportion of records that can be re-identified on average
         *
         * @return The success rate
         */
        public double getSuccessRate() {
            return rC;
        }
    }

    /**
     * Prosecutor risk
     *
     * @author Fabian Prasser
     */
    public static class ProsecutorRisk extends RiskSummary {

        /**
         * Creates a new instance
         *
         * @param rA Proportion of records with risk above threshold
         * @param rB Maximum probability of re-identification
         * @param rC Proportion of records that can be re-identified on average
         */
        protected ProsecutorRisk(double rA, double rB, double rC) {
            super(rA, rB, rC);
        }
    }

    /**
     * A set of derived risk estimates
     *
     * @author Fabian Prasser
     */
    public static class RiskSummary {

        /** Proportion of records with risk above threshold */
        private final double rA;

        /** Maximum probability of re-identification */
        private final double rB;

        /** Proportion of records that can be re-identified on average */
        private final double rC;

        /**
         * Creates a new instance
         *
         * @param rA Proportion of records with risk above threshold
         * @param rB Maximum probability of re-identification
         * @param rC Proportion of records that can be re-identified on average
         */
        protected RiskSummary(double rA, double rB, double rC) {
            this.rA = rA;
            this.rB = rB;
            this.rC = rC;
        }

        /**
         * Maximum probability of re-identification
         *
         * @return The highest risk
         */
        public double getHighestRisk() {
            return rB;
        }

        /**
         * Proportion of records with risk above threshold
         *
         * @return The proportion of records at risk
         */
        public double getRecordsAtRisk() {
            return rA;
        }

        /**
         * Proportion of records that can be re-identified on average
         *
         * @return The success rate
         */
        public double getSuccessRate() {
            return rC;
        }
    }

    /**
     * Share of the overall progress, in percent, covered by each of the three
     * risk computations (they start at 90%, 93.3% and 96.6%, respectively).
     */
    private static final double RISK_PHASE_PERCENT = 3.3d;

    /** Prosecutor risk */
    private final ProsecutorRisk prosecutorRisk;

    /** Journalist risk */
    private final JournalistRisk journalistRisk;

    /** Marketer risk */
    private final MarketerRisk   marketerRisk;

    /** Acceptable highest probability of re-identification for a single record */
    private final double         threshold;

    /**
     * Creates a new instance and computes all risk summaries. If the handle
     * provides a superset, the superset is used as the population; otherwise
     * the sample serves as its own population.
     *
     * @param handle Handle
     * @param identifiers Identifiers
     * @param threshold Acceptable highest probability of re-identification for a single record
     * @param stop Stop flag
     * @param progress Progress
     * @throws ComputationInterruptedException If the stop flag is set during the computation
     */
    public RiskModelSampleSummary(DataHandleInternal handle,
                                  Set<String> identifiers,
                                  double threshold,
                                  WrappedBoolean stop,
                                  WrappedInteger progress) {

        // Init: must be assigned first, because the risk computations below read it
        this.threshold = threshold;

        // Prepare: grouping accounts for the first 90% of the overall progress
        Groupify<TupleWrapper> sample;
        Groupify<TupleWrapper> population;
        if (handle.getSuperset() != null) {
            sample = getGroups(handle, identifiers, 0d, 0.45d, stop, progress, false);
            population = getGroups(handle.getSuperset(), identifiers, 0.45d, 0.45d, stop, progress, true);
        } else {
            sample = getGroups(handle, identifiers, 0d, 0.9d, stop, progress, false);
            population = sample;
        }

        // Compute: the three risk models share the remaining progress
        this.prosecutorRisk = getProsecutorRisk(population, sample, 0.9d, stop, progress);
        this.journalistRisk = getJournalistRisk(population, sample, 0.933d, stop, progress);
        this.marketerRisk = getMarketerRisk(population, sample, 0.966d, stop, progress);
    }

    /**
     * Returns the journalist risk
     *
     * @return The journalist risk
     */
    public JournalistRisk getJournalistRisk() {
        return journalistRisk;
    }

    /**
     * Returns the marketer risk
     *
     * @return The marketer risk
     */
    public MarketerRisk getMarketerRisk() {
        return marketerRisk;
    }

    /**
     * Returns the prosecutor risk
     *
     * @return The prosecutor risk
     */
    public ProsecutorRisk getProsecutorRisk() {
        return prosecutorRisk;
    }

    /**
     * Returns the user-defined risk threshold for individual records
     *
     * @return The threshold
     */
    public double getThreshold() {
        return threshold;
    }

    /**
     * Publishes a progress value, rounded to the nearest integer. The shared
     * value is only written when it actually changes.
     * <p>
     * NOTE(review): progress.value is assumed to be a percentage in [0, 100];
     * this follows from the 3.3 increments used next to the 0.9 / 0.933 / 0.966
     * offsets, but should be confirmed against the consumers of the progress object.
     *
     * @param progress Progress
     * @param percent Progress in percent
     */
    private static void updateProgress(WrappedInteger progress, double percent) {
        int value = (int) Math.round(percent);
        if (value != progress.value) {
            progress.value = value;
        }
    }

    /**
     * Returns the size of the class in the population that corresponds to the
     * given class from the sample. If the sample is its own population, this is
     * simply the size of the class itself.
     *
     * @param population Classes in the population
     * @param sample Classes in the sample
     * @param group Class from the sample
     * @return Size of the matching class in the population
     */
    private static int getGroupSizeInPopulation(Groupify<TupleWrapper> population,
                                                Groupify<TupleWrapper> sample,
                                                Group<TupleWrapper> group) {
        if (population == sample) {
            return group.getCount();
        }
        return population.get(group.getElement()).getCount();
    }

    /**
     * Computes the equivalence classes by wrapping each row of the handle into a
     * {@link TupleWrapper} over the (sorted) column indices of the given
     * attributes and adding it to a {@link Groupify}.
     *
     * @param handle Handle providing the rows
     * @param qis Attributes defining the equivalence classes
     * @param offset Fraction (0..1) of the overall progress at which this step starts
     * @param factor Fraction (0..1) of the overall progress covered by this step
     * @param stop Stop flag
     * @param progress Progress
     * @param ignoreOutliers Passed through to the constructor of {@link TupleWrapper}
     * @return The equivalence classes
     * @throws NullPointerException If the handle or the set of attributes is null
     * @throws IllegalArgumentException If an attribute is not contained in the handle
     * @throws ComputationInterruptedException If the stop flag is set during the computation
     */
    private Groupify<TupleWrapper> getGroups(DataHandleInternal handle,
                                             Set<String> qis,
                                             double offset,
                                             double factor,
                                             WrappedBoolean stop,
                                             WrappedInteger progress,
                                             boolean ignoreOutliers) {

        /* ********************************
         * Check
         * ********************************/
        if (handle == null) {
            throw new NullPointerException("Handle is null");
        }
        if (qis == null) {
            throw new NullPointerException("Quasi identifiers must not be null");
        }
        for (String q : qis) {
            if (handle.getColumnIndexOf(q) == -1) {
                throw new IllegalArgumentException(q + " is not an attribute");
            }
        }

        /* ********************************
         * Build equivalence classes
         * ********************************/
        final int[] indices = new int[qis.size()];
        int index = 0;
        for (final String attribute : qis) {
            indices[index++] = handle.getColumnIndexOf(attribute);
        }
        Arrays.sort(indices);

        // Calculate equivalence classes
        final int numRows = handle.getNumRows();
        final int capacity = Math.max(numRows / 10, 10);
        Groupify<TupleWrapper> map = new Groupify<TupleWrapper>(capacity);
        for (int row = 0; row < numRows; row++) {

            // Track progress: offset and factor are fractions, progress is reported in percent
            updateProgress(progress, (offset + (double) row / (double) numRows * factor) * 100d);

            map.add(new TupleWrapper(handle, indices, row, ignoreOutliers));

            // Stop, if required
            if (stop.value) {
                throw new ComputationInterruptedException();
            }
        }

        // Return
        return map;
    }

    /**
     * Computes the journalist risk: like the prosecutor risk, but class sizes are
     * taken from the population instead of the sample. The success rate is the
     * maximum of (number of classes / sum of population class sizes) and the
     * average of (sample class size / population class size) over all records.
     * If there are no unsuppressed records, all risks are reported as zero.
     *
     * @param population Classes in the population
     * @param sample Classes in the sample
     * @param offset Fraction (0..1) of the overall progress at which this step starts
     * @param stop Stop flag
     * @param progress Progress
     * @return The journalist risk
     * @throws ComputationInterruptedException If the stop flag is set during the computation
     */
    private JournalistRisk getJournalistRisk(Groupify<TupleWrapper> population,
                                             Groupify<TupleWrapper> sample,
                                             double offset,
                                             WrappedBoolean stop,
                                             WrappedInteger progress) {

        // Init
        double recordsAtRisk = 0d;
        double numRecordsInSample = 0d;
        double numClassesInSample = 0d;
        double sumOfPopulationSizes = 0d;
        double sumOfRatios = 0d;
        int smallestClassSizeInPopulation = Integer.MAX_VALUE;
        final int maxindex = sample.size();
        int index = 0;

        // For each group
        for (Group<TupleWrapper> group = sample.first(); group != null; group = group.next()) {

            // Track progress
            updateProgress(progress, offset * 100d + (double) index++ / (double) maxindex * RISK_PHASE_PERCENT);

            // Only process unsuppressed records
            if (!group.getElement().isOutlier()) {

                int groupSizeInSample = group.getCount();
                int groupSizeInPopulation = getGroupSizeInPopulation(population, sample, group);

                // Compute rA
                if (1d / groupSizeInPopulation > threshold) {
                    recordsAtRisk += groupSizeInSample;
                }

                // Compute rB
                smallestClassSizeInPopulation = Math.min(smallestClassSizeInPopulation, groupSizeInPopulation);

                // Compute rC
                numClassesInSample++;
                numRecordsInSample += groupSizeInSample;
                sumOfPopulationSizes += groupSizeInPopulation;
                sumOfRatios += (double) groupSizeInSample / (double) groupSizeInPopulation;
            }

            // Stop, if required
            if (stop.value) {
                throw new ComputationInterruptedException();
            }
        }

        // Nothing to re-identify: avoid 0/0 = NaN and a meaningless 1/Integer.MAX_VALUE
        if (numRecordsInSample == 0d) {
            return new JournalistRisk(0d, 0d, 0d);
        }

        // Finalize rA
        double rA = recordsAtRisk / numRecordsInSample;

        // Compute rB: the smallest class in the population yields the highest risk
        double rB = 1d / smallestClassSizeInPopulation;

        // Compute rC
        double rC1 = numClassesInSample / sumOfPopulationSizes;
        double rC2 = sumOfRatios / numRecordsInSample;
        double rC = Math.max(rC1, rC2);

        // Return
        return new JournalistRisk(rA, rB, rC);
    }

    /**
     * Computes the marketer risk: the average of (sample class size / population
     * class size) over all unsuppressed records of the sample. If there are no
     * unsuppressed records, the risk is reported as zero.
     *
     * @param population Classes in the population
     * @param sample Classes in the sample
     * @param offset Fraction (0..1) of the overall progress at which this step starts
     * @param stop Stop flag
     * @param progress Progress
     * @return The marketer risk
     * @throws ComputationInterruptedException If the stop flag is set during the computation
     */
    private MarketerRisk getMarketerRisk(Groupify<TupleWrapper> population,
                                         Groupify<TupleWrapper> sample,
                                         double offset,
                                         WrappedBoolean stop,
                                         WrappedInteger progress) {

        // Init
        double sumOfRatios = 0d;
        double numRecordsInSample = 0d;
        final int maxindex = sample.size();
        int index = 0;

        // For each group
        for (Group<TupleWrapper> group = sample.first(); group != null; group = group.next()) {

            // Track progress
            updateProgress(progress, offset * 100d + (double) index++ / (double) maxindex * RISK_PHASE_PERCENT);

            // Only process unsuppressed records
            if (!group.getElement().isOutlier()) {

                int groupSizeInSample = group.getCount();
                int groupSizeInPopulation = getGroupSizeInPopulation(population, sample, group);

                // Compute rC
                numRecordsInSample += groupSizeInSample;
                sumOfRatios += (double) groupSizeInSample / (double) groupSizeInPopulation;
            }

            // Stop, if required
            if (stop.value) {
                throw new ComputationInterruptedException();
            }
        }

        // Nothing to re-identify: avoid 0/0 = NaN
        if (numRecordsInSample == 0d) {
            return new MarketerRisk(0d);
        }

        // Compute rC and return
        return new MarketerRisk(sumOfRatios / numRecordsInSample);
    }

    /**
     * Computes the prosecutor risk, which is derived from the class sizes in the
     * sample only. If there are no unsuppressed records, all risks are reported
     * as zero.
     *
     * @param population Classes in the population (not used by this risk model)
     * @param sample Classes in the sample
     * @param offset Fraction (0..1) of the overall progress at which this step starts
     * @param stop Stop flag
     * @param progress Progress
     * @return The prosecutor risk
     * @throws ComputationInterruptedException If the stop flag is set during the computation
     */
    private ProsecutorRisk getProsecutorRisk(Groupify<TupleWrapper> population,
                                             Groupify<TupleWrapper> sample,
                                             double offset,
                                             WrappedBoolean stop,
                                             WrappedInteger progress) {

        // Init
        double recordsAtRisk = 0d;
        double numRecords = 0d;
        double numClasses = 0d;
        int smallestClassSize = Integer.MAX_VALUE;
        final int maxindex = sample.size();
        int index = 0;

        // For each group
        for (Group<TupleWrapper> group = sample.first(); group != null; group = group.next()) {

            // Track progress
            updateProgress(progress, offset * 100d + (double) index++ / (double) maxindex * RISK_PHASE_PERCENT);

            // Only process unsuppressed records
            if (!group.getElement().isOutlier()) {

                int groupSize = group.getCount();

                // Compute rA
                if (1d / groupSize > threshold) {
                    recordsAtRisk += groupSize;
                }

                // Compute rB
                smallestClassSize = Math.min(smallestClassSize, groupSize);

                // Compute rC
                numClasses++;
                numRecords += groupSize;
            }

            // Stop, if required
            if (stop.value) {
                throw new ComputationInterruptedException();
            }
        }

        // Nothing to re-identify: avoid 0/0 = NaN and a meaningless 1/Integer.MAX_VALUE
        if (numRecords == 0d) {
            return new ProsecutorRisk(0d, 0d, 0d);
        }

        // Finalize rA
        double rA = recordsAtRisk / numRecords;

        // Compute rB: the smallest class yields the highest risk
        double rB = 1d / smallestClassSize;

        // Compute rC
        double rC = numClasses / numRecords;

        // Return
        return new ProsecutorRisk(rA, rB, rC);
    }
}