/*
* ARX: Powerful Data Anonymization
* Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deidentifier.arx.metric.v2;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.deidentifier.arx.ARXConfiguration;
import org.deidentifier.arx.DataDefinition;
import org.deidentifier.arx.RowSet;
import org.deidentifier.arx.certificate.elements.ElementData;
import org.deidentifier.arx.framework.check.groupify.HashGroupify;
import org.deidentifier.arx.framework.check.groupify.HashGroupifyEntry;
import org.deidentifier.arx.framework.data.Data;
import org.deidentifier.arx.framework.data.DataManager;
import org.deidentifier.arx.framework.data.GeneralizationHierarchy;
import org.deidentifier.arx.framework.lattice.Transformation;
import org.deidentifier.arx.metric.MetricConfiguration;
/**
 * This class implements the KL Divergence metric.
 * Ashwin Machanavajjhala, Daniel Kifer, Johannes Gehrke, Muthuramakrishnan Venkitasubramaniam:
 * L-diversity: Privacy beyond k-anonymity
 * ACM Transactions on Knowledge Discovery from Data (TKDD), Volume 1 Issue 1, March 2007
 * <p>
 * The loss of a transformation is the sum, over all input records with a non-zero input
 * frequency, of {@code p * log2(p / q)}. Here {@code p} is the relative frequency of the
 * record's tuple in the input and {@code q} is the relative size of the equivalence class
 * the record ends up in, divided by the area covered by that class. All suppressed
 * (outlier) records are treated as one class covering the maximal area.
 *
 * @author Fabian Prasser
 */
public class MetricSDNMKLDivergence extends AbstractMetricSingleDimensional {

    /**
     * Wraps a tuple so that it can be used as a hash map key with value-based
     * equality and a hash code that is computed only once.
     */
    class TupleWrapper {

        /** The wrapped tuple. The array is not copied. */
        private final int[] tuple;

        /** Hash code of the tuple, computed at construction time. */
        private final int hash;

        /**
         * Creates a new wrapper.
         *
         * @param tuple the tuple to wrap; it must not be modified while the wrapper
         *              is used as a map key, because the hash code is cached
         */
        public TupleWrapper(int[] tuple) {
            this.tuple = tuple;
            this.hash = Arrays.hashCode(tuple);
        }

        /**
         * Compares the wrapped tuples element by element.
         *
         * @param other the object to compare with
         * @return {@code true} if {@code other} is a {@code TupleWrapper} whose tuple has
         *         the same contents; {@code false} otherwise, including for {@code null}
         *         and for objects of other types
         */
        @Override
        public boolean equals(Object other) {
            if (this == other) {
                return true;
            }
            // Guards against null and foreign types instead of failing on the cast
            if (!(other instanceof TupleWrapper)) {
                return false;
            }
            return Arrays.equals(this.tuple, ((TupleWrapper) other).tuple);
        }

        /**
         * Returns the cached hash code of the wrapped tuple.
         */
        @Override
        public int hashCode() {
            return hash;
        }
    }

    /** SUID. */
    private static final long serialVersionUID = -4918601543733931921L;

    /** Natural logarithm of 2, used to convert natural logarithms to base 2. */
    private static final double LOG2 = Math.log(2);

    /**
     * Computes the logarithm to base 2.
     *
     * @param num the value
     * @return {@code Math.log(num)} divided by the natural logarithm of 2
     */
    static final double log2(final double num) {
        return Math.log(num) / LOG2;
    }

    /** Total number of tuples, depends on existence of research subset. {@code null} until initialized. */
    private Double tuples = null;

    /** Domain shares for each dimension. */
    private DomainShare[] shares;

    /** Maximum value of the metric. {@code null} until initialized. */
    private Double max = null;

    /** Tuple matcher, used to look up the equivalence class of an input row. */
    private TupleMatcher matcher = null;

    /** Relative frequency of each input row's tuple; rows that are not considered keep the value 0. */
    private double[] inputDistribution = null;

    /** Maximal area: the product of the domain sizes of all dimensions. */
    private double maximalArea = 0d;

    /**
     * Default constructor.
     */
    public MetricSDNMKLDivergence(){
        super(true, false, false);
    }

    /**
     * Returns the maximal information loss computed during initialization.
     *
     * @return the maximal loss
     * @throws IllegalStateException if the metric has not been initialized yet
     */
    @Override
    public ILSingleDimensional createMaxInformationLoss() {
        if (max == null) {
            throw new IllegalStateException("Metric must be initialized first");
        }
        return new ILSingleDimensional(max);
    }

    /**
     * Returns the minimal information loss, which is zero.
     */
    @Override
    public ILSingleDimensional createMinInformationLoss() {
        return new ILSingleDimensional(0);
    }

    /**
     * Returns the configuration of this metric.
     *
     * @return a newly created configuration object
     */
    public MetricConfiguration getConfiguration() {
        return new MetricConfiguration(false,                // monotonic
                                       0.5d,                 // gs-factor
                                       false,                // precomputed
                                       0.0d,                 // precomputation threshold
                                       AggregateFunction.SUM // aggregate function
        );
    }

    /**
     * Returns the name of this metric.
     */
    @Override
    public String getName() {
        return "KL-Divergence";
    }

    /**
     * Renders this metric as an element with a single "Monotonic" property.
     *
     * @param config the configuration from which the maximal number of outliers is read
     * @return the rendered element
     */
    @Override
    public ElementData render(ARXConfiguration config) {
        ElementData result = new ElementData("KL divergence");
        result.addProperty("Monotonic", this.isMonotonic(config.getMaxOutliers()));
        return result;
    }

    /**
     * Returns the name of this metric.
     */
    @Override
    public String toString() {
        return "KL-Divergence";
    }

    /**
     * Returns the area covered by a generalized tuple: the product, over all dimensions,
     * of the share of the output value at the given generalization level multiplied by
     * the domain size of the dimension.
     *
     * @param output         the generalized tuple
     * @param generalization the generalization level of each dimension
     * @return the area
     */
    private double getArea(int[] output, int[] generalization) {
        double result = 1d;
        for (int dimension = 0; dimension < output.length; dimension++) {
            DomainShare share = this.shares[dimension];
            result *= share.getShare(output[dimension], generalization[dimension]) * share.getDomainSize();
        }
        return result;
    }

    /**
     * Computes the KL divergence between the input distribution and the distribution
     * induced by the given transformation.
     *
     * @param node the transformation
     * @param g    the equivalence classes resulting from the transformation
     * @return the information loss
     */
    @Override
    protected ILSingleDimensionalWithBound getInformationLossInternal(Transformation node, HashGroupify g) {

        // Obtain number of outliers: all suppressed records form one common class
        double outliers = 0d;
        for (HashGroupifyEntry m = g.getFirstEquivalenceClass(); m != null; m = m.nextOrdered) {
            if (!m.isNotOutlier) {
                outliers += m.count;
            }
        }

        // Loop-invariant values, obtained once instead of once per row
        final int[] generalization = node.getGeneralization();
        final double total = this.tuples;

        // For each tuple
        double result = 0d;
        for (int row = 0; row < this.inputDistribution.length; row++) {

            // Only rows that are present in the input distribution contribute
            final double inputFrequency = this.inputDistribution[row];
            if (inputFrequency == 0d) {
                continue;
            }

            // Relative class size, spread over the area covered by the class
            HashGroupifyEntry entry = this.matcher.getEntry(row, generalization, g);
            double outputFrequency = entry.isNotOutlier ? entry.count : outliers;
            outputFrequency /= total;
            outputFrequency /= entry.isNotOutlier ? getArea(entry.key, generalization) : maximalArea;

            // Compute KL-Divergence
            result += inputFrequency * log2(inputFrequency / outputFrequency);
        }

        // Return
        return new ILSingleDimensionalWithBound(result);
    }

    /**
     * Returns the number of records in the given equivalence class, both as the
     * information loss and as its bound.
     *
     * @param node  the transformation (not used)
     * @param entry the equivalence class
     * @return the information loss
     */
    @Override
    protected ILSingleDimensionalWithBound getInformationLossInternal(Transformation node, HashGroupifyEntry entry) {
        return new ILSingleDimensionalWithBound(entry.count, entry.count);
    }

    /**
     * This metric does not provide a lower bound.
     *
     * @param node the transformation (not used)
     * @return always {@code null}
     */
    @Override
    protected ILSingleDimensional getLowerBoundInternal(Transformation node) {
        return null;
    }

    /**
     * This metric does not provide a lower bound.
     *
     * @param node the transformation (not used)
     * @param g    the equivalence classes (not used)
     * @return always {@code null}
     */
    @Override
    protected ILSingleDimensional getLowerBoundInternal(Transformation node,
                                                        HashGroupify g) {
        return null;
    }

    /**
     * Initializes the metric: computes the domain shares, the number of tuples, the
     * maximal area, the input distribution and the maximal information loss.
     *
     * @param manager     passed on to the superclass
     * @param definition  provides the hierarchy of each attribute
     * @param input       the input data
     * @param hierarchies the generalization hierarchies, one per dimension
     * @param config      the configuration, used to determine the number of records
     *                    and the research subset
     */
    @Override
    protected void initializeInternal(final DataManager manager,
                                      final DataDefinition definition,
                                      final Data input,
                                      final GeneralizationHierarchy[] hierarchies,
                                      final ARXConfiguration config) {

        // Prepare weights
        super.initializeInternal(manager, definition, input, hierarchies, config);

        // Compute domain shares
        this.shares = new DomainShare[hierarchies.length];
        for (int i = 0; i < shares.length; i++) {

            // Extract info
            String attribute = input.getHeader()[i];
            String[][] hierarchy = definition.getHierarchy(attribute);
            this.shares[i] = new DomainShareMaterialized(hierarchy,
                                                         input.getDictionary().getMapping()[i],
                                                         hierarchies[i].getArray());
        }

        // Determine total number of tuples
        this.tuples = (double)super.getNumRecords(config, input);
        final double total = this.tuples;
        RowSet subset = super.getSubset(config);

        // Obtain the data once instead of once per row
        final int[][] data = input.getArray();
        final int rows = input.getDataLength();

        // Tuple matcher
        this.matcher = new TupleMatcher(hierarchies, data);

        // Areamax
        this.maximalArea = 1d;
        for (int dimension = 0; dimension < this.shares.length; dimension++) {
            maximalArea *= this.shares[dimension].getDomainSize();
        }

        // Groupify: count the occurrences of each distinct tuple among the considered rows
        Map<TupleWrapper, Integer> groupify = new HashMap<TupleWrapper, Integer>();
        for (int row = 0; row < rows; row++) {
            if (subset == null || subset.contains(row)) {
                TupleWrapper wrapper = new TupleWrapper(data[row]);
                Integer count = groupify.get(wrapper);
                groupify.put(wrapper, count == null ? 1 : count + 1);
            }
        }

        // Build input distribution and compute max. The maximum is accumulated in a
        // primitive to avoid re-boxing the field on every row; the order of the
        // additions is unchanged.
        double maximum = 0d;
        this.inputDistribution = new double[data.length];
        for (int row = 0; row < rows; row++) {
            if (subset == null || subset.contains(row)) {
                TupleWrapper wrapper = new TupleWrapper(data[row]);
                double frequency = groupify.get(wrapper).doubleValue() / total;
                this.inputDistribution[row] = frequency;
                maximum += frequency * log2(frequency * maximalArea);
            }
        }
        this.max = maximum;
    }
}