/*
* ARX: Powerful Data Anonymization
* Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deidentifier.arx.metric.v2;
import org.deidentifier.arx.ARXConfiguration;
import org.deidentifier.arx.DataDefinition;
import org.deidentifier.arx.certificate.elements.ElementData;
import org.deidentifier.arx.framework.check.distribution.DistributionAggregateFunction;
import org.deidentifier.arx.framework.check.groupify.HashGroupify;
import org.deidentifier.arx.framework.check.groupify.HashGroupifyEntry;
import org.deidentifier.arx.framework.data.Data;
import org.deidentifier.arx.framework.data.DataManager;
import org.deidentifier.arx.framework.data.GeneralizationHierarchy;
import org.deidentifier.arx.framework.lattice.Transformation;
import org.deidentifier.arx.metric.InformationLossWithBound;
import org.deidentifier.arx.metric.MetricConfiguration;
/**
* This class implements the entropy-based information loss model proposed in:<br>
* A Game Theoretic Framework for Analyzing Re-Identification Risk.
* Zhiyu Wan, Yevgeniy Vorobeychik, Weiyi Xia, Ellen Wright Clayton,
* Murat Kantarcioglu, Ranjit Ganta, Raymond Heatherly, Bradley A. Malin
* PLOS|ONE. 2015.
*
*
* @author Fabian Prasser
*/
public class MetricSDNMEntropyBasedInformationLoss extends AbstractMetricSingleDimensional {

    /** SVUID */
    private static final long serialVersionUID = -2443537745262162075L;

    /**
     * Implements the entropy-based IL model. Ignores record suppression. Returns the
     * normalized loss in [0, 1] for exactly one record of the given equivalence class.
     *
     * @param transformation the transformation, providing the generalization level per dimension
     * @param entry the equivalence class for which the loss is computed
     * @param shares domain shares of the generalized attributes, one per dimension
     * @param microaggregationFunctions aggregate functions of the microaggregated attributes; may be null
     * @param microaggregationStartIndex index of the first microaggregated attribute within entry.distributions
     * @param maxIL the maximal entropy-based information loss, used for normalization
     * @return the information loss for one record, clamped to [0, 1]
     */
    public static double getEntropyBasedInformationLoss(Transformation transformation,
                                                        HashGroupifyEntry entry,
                                                        DomainShare[] shares,
                                                        DistributionAggregateFunction[] microaggregationFunctions,
                                                        int microaggregationStartIndex,
                                                        double maxIL) {

        // We transform the formula, to make evaluating it more efficient.
        //
        // With maxIL = log(size_1 * size_2 * ... * size_n) we define
        // IL = [-log( 1 / (share_1 * size_1) ) - log ( 1 / (share_2 * size_2) ) ... - log( 1 / (share_n * size_n) ) ] / maxIL
        //
        // Step 1:
        //
        // IL = [log(share_1 * size_1 ) + log (share_2 * size_2 ) ... + log( share_n * size_n) ] / maxIL
        //
        // Step 2:
        //
        // IL = [log(share_1 * share_2 * ... * share_n) + log(size_1 * size_2 * ... * size_n) ] / maxIL
        //
        // Step 3:
        //
        // IL = [log(share_1 * share_2 * ... * share_n) + maxIL ] / maxIL
        //
        // Step 4:
        //
        // IL = log(share_1 * share_2 * ... * share_n) / maxIL + 1
        //
        // For attributes transformed with microaggregation, we set share_i to
        // 1/#distinct-values-in-eq-class and size_i to the #distinct-values-in-dataset

        // Multiply the domain shares of all generalized attributes
        int[] generalization = transformation.getGeneralization();
        double infoLoss = 1d;
        for (int dimension = 0; dimension < shares.length; dimension++) {
            int value = entry.key[dimension];
            int level = generalization[dimension];
            infoLoss *= shares[dimension].getShare(value, level);
        }

        // Multiply in the loss contributed by each microaggregated attribute, if any
        if (microaggregationFunctions != null) {
            for (int dimension=0; dimension<microaggregationFunctions.length; dimension++){
                infoLoss *= microaggregationFunctions[dimension].getInformationLoss(entry.distributions[microaggregationStartIndex + dimension]);
            }
        }

        // Finalize: step 4 of the transformed formula above
        double result = Math.log10(infoLoss) / maxIL + 1d;

        // Sanity check. A small tolerance (0.001) is allowed for floating-point
        // rounding errors before the value is considered genuinely out of range.
        if (Double.isNaN(result) || result <= -0.001d || result >= +1.001d) {
            throw new IllegalStateException("Value (" + result + ") out of range [0,1]");
        }

        // Clamp minor rounding errors into [0, 1]
        result = result < 0d ? 0d : result;
        result = result > 1d ? 1d : result;

        // Return
        return result;
    }

    /**
     * Returns the maximal entropy-based information loss: log10 of the product of
     * all domain sizes (generalized and microaggregated attributes combined).
     *
     * @param domainShares For generalized attributes
     * @param domainSizes For microaggregated attributes
     * @return the maximal information loss
     */
    public static double getMaximalEntropyBasedInformationLoss(DomainShare[] domainShares,
                                                               int[] domainSizes) {
        double maxIL = 1d;
        for (DomainShare share : domainShares) {
            maxIL *= share.getDomainSize();
        }
        for (int size : domainSizes) {
            maxIL *= size;
        }
        maxIL = Math.log10(maxIL);
        return maxIL;
    }

    /** Domain shares for each dimension. */
    private DomainShare[] shares;

    /** MaxIL: cached maximal information loss, used to normalize per-record losses. */
    private double maxIL;

    /**
     * Creates a new instance. Default constructor which treats all transformation methods equally.
     */
    public MetricSDNMEntropyBasedInformationLoss() {
        this(0.5d);
    }

    /**
     * Creates a new instance.
     *
     * @param gsFactor A factor [0,1] weighting generalization and suppression.
     *                 The default value is 0.5, which means that generalization
     *                 and suppression will be treated equally. A factor of 0
     *                 will favor suppression, and a factor of 1 will favor
     *                 generalization. The values in between can be used for
     *                 balancing both methods.
     */
    public MetricSDNMEntropyBasedInformationLoss(double gsFactor) {
        super(true, false, false, gsFactor);
    }

    @Override
    public ILSingleDimensional createMaxInformationLoss() {
        // The maximal loss equals the number of records: each record loses at most 1
        Double rows = getNumTuples();
        if (rows == null) {
            throw new IllegalStateException("Metric must be initialized first");
        } else {
            return new ILSingleDimensional(rows);
        }
    }

    @Override
    public ILSingleDimensional createMinInformationLoss() {
        return new ILSingleDimensional(0d);
    }

    /**
     * Returns the configuration of this metric.
     *
     * @return the metric configuration
     */
    public MetricConfiguration getConfiguration() {
        return new MetricConfiguration(false,
                                       super.getGeneralizationSuppressionFactor(),
                                       false,
                                       0.0d,
                                       this.getAggregateFunction());
    }

    @Override
    public String getName() {
        return "Entropy-based information loss";
    }

    @Override
    public boolean isAbleToHandleMicroaggregation() {
        return true;
    }

    @Override
    public boolean isGSFactorSupported() {
        return true;
    }

    @Override
    public ElementData render(ARXConfiguration config) {
        ElementData result = new ElementData("Entropy-based information loss");
        result.addProperty("Monotonic", this.isMonotonic(config.getMaxOutliers()));
        result.addProperty("Generalization factor", this.getGeneralizationFactor());
        result.addProperty("Suppression factor", this.getSuppressionFactor());
        return result;
    }

    @Override
    public String toString() {
        return "EntropyBasedInformationLoss";
    }

    @Override
    protected ILSingleDimensionalWithBound getInformationLossInternal(Transformation transformation, HashGroupify g) {

        // Prepare
        double real = 0;
        double bound = 0;
        double gFactor = super.getGeneralizationFactor();
        double sFactor = super.getSuppressionFactor();
        HashGroupifyEntry entry = g.getFirstEquivalenceClass();
        DistributionAggregateFunction[] microaggregationFunctions = super.getMicroaggregationFunctions();
        int microaggregationStartIndex = super.getMicroaggregationStartIndex();

        // Compute: iterate over all equivalence classes. Suppressed classes contribute
        // the maximal per-record loss (1) weighted by the suppression factor; the bound
        // ignores suppression and always applies the generalization factor.
        while (entry != null) {
            if (entry.count > 0) {
                double loss = entry.count * getEntropyBasedInformationLoss( transformation,
                                                                            entry,
                                                                            shares,
                                                                            microaggregationFunctions,
                                                                            microaggregationStartIndex,
                                                                            maxIL);
                real += entry.isNotOutlier ? gFactor * loss : sFactor * entry.count;
                bound += gFactor * loss;
            }
            entry = entry.nextOrdered;
        }

        // Return
        return super.createInformationLoss(real, bound);
    }

    @Override
    protected InformationLossWithBound<ILSingleDimensional> getInformationLossInternal(Transformation transformation,
                                                                                       HashGroupifyEntry entry) {
        // Same weighting scheme as above, for a single equivalence class
        DistributionAggregateFunction[] microaggregationFunctions = super.getMicroaggregationFunctions();
        int microaggregationStartIndex = super.getMicroaggregationStartIndex();
        double gFactor = super.getGeneralizationFactor();
        double sFactor = super.getSuppressionFactor();
        double bound = entry.count * getEntropyBasedInformationLoss( transformation,
                                                                     entry,
                                                                     shares,
                                                                     microaggregationFunctions,
                                                                     microaggregationStartIndex,
                                                                     maxIL);
        double loss = entry.isNotOutlier ? gFactor * bound : sFactor * entry.count;
        return super.createInformationLoss(loss, gFactor * bound);
    }

    @Override
    protected ILSingleDimensional getLowerBoundInternal(Transformation transformation) {
        // No lower bound can be computed from the transformation alone
        return null;
    }

    @Override
    protected ILSingleDimensional getLowerBoundInternal(Transformation transformation,
                                                        HashGroupify groupify) {

        // Compute: sum of generalization-weighted losses, ignoring suppression
        // (microaggregation functions are also excluded from the bound)
        double bound = 0;
        double gFactor = super.getGeneralizationFactor();
        HashGroupifyEntry entry = groupify.getFirstEquivalenceClass();
        while (entry != null) {
            bound += entry.count == 0 ? 0d : gFactor * entry.count * getEntropyBasedInformationLoss( transformation,
                                                                                                     entry,
                                                                                                     shares,
                                                                                                     null,
                                                                                                     0,
                                                                                                     maxIL);
            entry = entry.nextOrdered;
        }

        // Return
        return new ILSingleDimensional(bound);
    }

    /**
     * For subclasses.
     *
     * @return the domain shares for each dimension
     */
    protected DomainShare[] getShares() {
        return this.shares;
    }

    @Override
    protected void initializeInternal(final DataManager manager,
                                      final DataDefinition definition,
                                      final Data input,
                                      final GeneralizationHierarchy[] hierarchies,
                                      final ARXConfiguration config) {

        // Prepare weights
        super.initializeInternal(manager, definition, input, hierarchies, config);

        // Compute domain shares
        this.shares = manager.getDomainShares();

        // Calculate MaxIL
        this.maxIL = getMaximalEntropyBasedInformationLoss(this.shares, super.getMicroaggregationDomainSizes());
    }
}