/*
* ARX: Powerful Data Anonymization
* Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deidentifier.arx.test;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.deidentifier.arx.ARXAnonymizer;
import org.deidentifier.arx.ARXConfiguration;
import org.deidentifier.arx.ARXPopulationModel;
import org.deidentifier.arx.ARXResult;
import org.deidentifier.arx.AttributeType.Hierarchy;
import org.deidentifier.arx.Data;
import org.deidentifier.arx.DataHandle;
import org.deidentifier.arx.criteria.KAnonymity;
import org.deidentifier.arx.io.CSVHierarchyInput;
import org.deidentifier.arx.metric.Metric;
import org.deidentifier.arx.metric.Metric.AggregateFunction;
import org.deidentifier.arx.risk.RiskModelPopulationUniqueness;
import org.deidentifier.arx.risk.RiskModelPopulationUniqueness.PopulationUniquenessModel;
import org.junit.Test;
/**
* Test for risk metrics.
*
* @author Fabian Prasser
* @author Florian Kohlmayer
*/
public class TestRiskMetrics {
/**
* Returns the data object for a given dataset
*
* @param dataset the dataset
* @return the data object
* @throws IOException Signals that an I/O exception has occurred.
*/
private static Data getDataObject(final String dataset) throws IOException {
final Data data = Data.create(dataset, StandardCharsets.UTF_8, ';');
// Read generalization hierachies
final FilenameFilter hierarchyFilter = new FilenameFilter() {
@Override
public boolean accept(final File dir, final String name) {
if (name.matches(dataset.substring(dataset.lastIndexOf("/") + 1, dataset.length() - 4) + "_hierarchy_(.)+.csv")) {
return true;
} else {
return false;
}
}
};
final File testDir = new File(dataset.substring(0, dataset.lastIndexOf("/")));
final File[] genHierFiles = testDir.listFiles(hierarchyFilter);
final Pattern pattern = Pattern.compile("_hierarchy_(.*?).csv");
for (final File file : genHierFiles) {
final Matcher matcher = pattern.matcher(file.getName());
if (matcher.find()) {
final CSVHierarchyInput hier = new CSVHierarchyInput(file, StandardCharsets.UTF_8, ';');
final String attributeName = matcher.group(1);
// use all found attribute hierarchies as qis
data.getDefinition().setAttributeType(attributeName, Hierarchy.create(hier.getHierarchy()));
}
}
return data;
}
/**
* Test average risk using the example dataset.
*/
@Test
public void testAverageRisk() {
DataProvider provider = new DataProvider();
provider.createDataDefinition();
// Risk before anonymization
double risk = provider.getData().getHandle().getRiskEstimator(ARXPopulationModel.create(provider.getData().getHandle().getNumRows(), 0.1d)).getSampleBasedReidentificationRisk().getAverageRisk();
assertTrue("Is: " + risk, risk == 1.0d);
// Risk after anonymization
risk = getAnonymizedData(provider.getData()).getRiskEstimator(ARXPopulationModel.create(provider.getData().getHandle().getNumRows(), 0.1d)).getSampleBasedReidentificationRisk().getAverageRisk();
assertTrue("Is: " + risk, risk == 0.42857142857142855);
}
/**
* Test average risk using the adult dataset.
*
* @throws IOException Signals that an I/O exception has occurred.
*/
@Test
public void testAverageRisk2() throws IOException {
Data data = getDataObject("./data/adult.csv");
// Risk before anonymization
double risk = data.getHandle().getRiskEstimator(ARXPopulationModel.create(data.getHandle().getNumRows(), 0.1d)).getSampleBasedReidentificationRisk().getAverageRisk();
assertTrue("Is: " + risk, risk == 0.6465751607983555d);
// Risk after anonymization
risk = getAnonymizedData(data).getRiskEstimator(ARXPopulationModel.create(data.getHandle().getNumRows(), 0.1d)).getSampleBasedReidentificationRisk().getAverageRisk();
assertTrue("Is: " + risk, risk == 0.001922949406538028);
}
/**
* Test decision rule using the test dataset.
*/
@Test
public void testDecisionRule() {
DataProvider provider = new DataProvider();
provider.createDataDefinition();
DataHandle handle = provider.getData().getHandle();
RiskModelPopulationUniqueness model = handle.getRiskEstimator(ARXPopulationModel.create(handle.getNumRows(), 0.2d)).getPopulationBasedUniquenessRisk();
double populationUniqueness = model.getFractionOfUniqueTuplesDankar();
double sampleUniqueness = handle.getRiskEstimator(ARXPopulationModel.create(handle.getNumRows(), 0.1d)).getSampleBasedUniquenessRisk().getFractionOfUniqueTuples();
// Risk before anonymization
assertTrue(sampleUniqueness + " / " + populationUniqueness, compareUniqueness(populationUniqueness, 1.0d) == 0);
assertTrue(sampleUniqueness + " / " + populationUniqueness, compareUniqueness(populationUniqueness, sampleUniqueness) <= 0);
final ARXAnonymizer anonymizer = new ARXAnonymizer();
final ARXConfiguration config = ARXConfiguration.create();
config.addPrivacyModel(new KAnonymity(2));
config.setMaxOutliers(0d);
ARXResult result = null;
try {
result = anonymizer.anonymize(provider.getData(), config);
} catch (IOException e) {
e.printStackTrace();
}
final DataHandle outHandle = result.getOutput(false);
populationUniqueness = outHandle.getRiskEstimator(ARXPopulationModel.create(provider.getData().getHandle().getNumRows(), 0.1d)).getPopulationBasedUniquenessRisk().getFractionOfUniqueTuplesDankar();
assertTrue("Is: " + populationUniqueness, compareUniqueness(populationUniqueness, 0) == 0);
}
/**
* Test decision rule using the adult dataset.
*
* @throws IOException Signals that an I/O exception has occurred.
*/
@Test
public void testDecisionRule2() throws IOException {
Data data = getDataObject("./data/adult.csv");
DataHandle handle = data.getHandle();
RiskModelPopulationUniqueness model = handle.getRiskEstimator(ARXPopulationModel.create(handle.getNumRows(), 0.1d)).getPopulationBasedUniquenessRisk();
double sampleUniqueness = handle.getRiskEstimator(ARXPopulationModel.create(handle.getNumRows(), 0.1d)).getSampleBasedUniquenessRisk().getFractionOfUniqueTuples();
double populationUniqueness = model.getFractionOfUniqueTuplesDankar();
if (model.getPopulationUniquenessModel() == PopulationUniquenessModel.PITMAN) {
assertTrue(populationUniqueness + "/" + sampleUniqueness, compareUniqueness(populationUniqueness, 0.27684993883653597) == 0);
} else if (model.getPopulationUniquenessModel() == PopulationUniquenessModel.ZAYATZ) {
assertTrue(populationUniqueness + "/" + sampleUniqueness, compareUniqueness(populationUniqueness, 0.3207402393466189) == 0);
} else {
fail("Unexpected convergence of SNB");
}
assertTrue(populationUniqueness + "/" + sampleUniqueness, compareUniqueness(populationUniqueness, sampleUniqueness) <= 0);
model = handle.getRiskEstimator(ARXPopulationModel.create(handle.getNumRows(), 0.2d)).getPopulationBasedUniquenessRisk();
populationUniqueness = model.getFractionOfUniqueTuplesDankar();
assertTrue(populationUniqueness + "/" + sampleUniqueness, compareUniqueness(populationUniqueness, 0.3577099234829125d) == 0);
assertTrue(populationUniqueness + "/" + sampleUniqueness, compareUniqueness(populationUniqueness, sampleUniqueness) <= 0);
model = handle.getRiskEstimator(ARXPopulationModel.create(handle.getNumRows(), 0.01d)).getPopulationBasedUniquenessRisk();
populationUniqueness = model.getFractionOfUniqueTuplesDankar();
assertTrue(populationUniqueness + "/" + sampleUniqueness, compareUniqueness(populationUniqueness, 0.1446083531167384) == 0);
assertTrue(populationUniqueness + "/" + sampleUniqueness, compareUniqueness(populationUniqueness, sampleUniqueness) <= 0);
model = handle.getRiskEstimator(ARXPopulationModel.create(handle.getNumRows(), 1d)).getPopulationBasedUniquenessRisk();
populationUniqueness = model.getFractionOfUniqueTuplesDankar();
assertTrue(populationUniqueness + "/" + sampleUniqueness, compareUniqueness(populationUniqueness, 0.5142895033485844) == 0);
assertTrue(populationUniqueness + "/" + sampleUniqueness, compareUniqueness(populationUniqueness, sampleUniqueness) == 0);
}
/**
* Compares two uniqueness measures with four significant digits
* @param val1
* @param val2
* @return
*/
private int compareUniqueness(double val1, double val2) {
return Integer.compare((int) (val1 * 10000d), (int) (val2 * 10000d));
}
/**
* Test highest individual risk using the test dataset.
*/
@Test
public void testHighestIndividualRisk() {
DataProvider provider = new DataProvider();
provider.createDataDefinition();
// Risk before anonymization
assertTrue(provider.getData().getHandle().getRiskEstimator(ARXPopulationModel.create(provider.getData().getHandle().getNumRows(), 0.1d)).getSampleBasedReidentificationRisk().getHighestRisk() == 1.0d);
// Risk after anonymization
assertTrue(getAnonymizedData(provider.getData()).getRiskEstimator(ARXPopulationModel.create(provider.getData().getHandle().getNumRows(), 0.1d)).getSampleBasedReidentificationRisk().getHighestRisk() == 0.5d);
}
/**
* Test highest individual risk using the adult dataset.
*
* @throws IOException Signals that an I/O exception has occurred.
*/
@Test
public void testHighestIndividualRisk2() throws IOException {
Data data = getDataObject("./data/adult.csv");
// Risk before anonymization
assertTrue(data.getHandle().getRiskEstimator(ARXPopulationModel.create(data.getHandle().getNumRows(), 0.1d)).getSampleBasedReidentificationRisk().getHighestRisk() == 1.0d);
// Risk after anonymization
assertTrue(getAnonymizedData(data).getRiskEstimator(ARXPopulationModel.create(data.getHandle().getNumRows(), 0.1d)).getSampleBasedReidentificationRisk().getHighestRisk() == 0.5d);
}
/**
* 2-Anonymizes the given data. No suppression allowed.
*
* @param data the data
* @return the anonymized data
*/
private DataHandle getAnonymizedData(Data data) {
final ARXAnonymizer anonymizer = new ARXAnonymizer();
final ARXConfiguration config = ARXConfiguration.create();
config.addPrivacyModel(new KAnonymity(2));
config.setMaxOutliers(0d);
config.setQualityModel(Metric.createLossMetric(AggregateFunction.RANK));
ARXResult result = null;
try {
result = anonymizer.anonymize(data, config);
} catch (IOException e) {
e.printStackTrace();
}
final DataHandle outHandle = result.getOutput(false);
return outHandle;
}
}