/*
* ARX: Powerful Data Anonymization
* Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deidentifier.arx;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.deidentifier.arx.ARXAnonymizer.Result;
import org.deidentifier.arx.ARXLattice.ARXNode;
import org.deidentifier.arx.criteria.PrivacyCriterion;
import org.deidentifier.arx.exceptions.RollbackRequiredException;
import org.deidentifier.arx.framework.check.NodeChecker;
import org.deidentifier.arx.framework.check.TransformedData;
import org.deidentifier.arx.framework.check.distribution.DistributionAggregateFunction;
import org.deidentifier.arx.framework.data.Data;
import org.deidentifier.arx.framework.data.DataManager;
import org.deidentifier.arx.framework.data.Dictionary;
import org.deidentifier.arx.framework.lattice.SolutionSpace;
import org.deidentifier.arx.framework.lattice.Transformation;
import org.deidentifier.arx.metric.Metric;
/**
 * Encapsulates the results of an execution of the ARX algorithm.
 *
 * @author Fabian Prasser
 * @author Florian Kohlmayer
 */
public class ARXResult {

    /** Handle that currently holds the shared (non-forked) output buffer, or null if unlocked. */
    private DataHandle bufferLockedByHandle = null;

    /** Transformation for which the shared output buffer is currently locked, or null if unlocked. */
    private ARXNode bufferLockedByNode = null;

    /** The node checker used to apply transformations to the data. */
    private final NodeChecker checker;

    /** The config. */
    private final ARXConfiguration config;

    /** The data definition. */
    private final DataDefinition definition;

    /** Wall clock runtime of the anonymization process. */
    private final long duration;

    /** The lattice. */
    private final ARXLattice lattice;

    /** The data manager. */
    private final DataManager manager;

    /** The global optimum. */
    private final ARXNode optimalNode;

    /** The registry. */
    private final DataRegistry registry;

    /** The solution space. */
    private final SolutionSpace solutionSpace;

    /**
     * Internal constructor for deserialization.
     *
     * @param handle the input handle; will be updated and locked in place
     * @param definition the data definition
     * @param lattice the lattice of transformations
     * @param historySize history size for the node checker
     * @param snapshotSizeSnapshot relative snapshot size for the node checker
     * @param snapshotSizeDataset relative dataset snapshot size for the node checker
     * @param metric the utility metric; will be initialized here
     * @param config the configuration; will be initialized here
     * @param optimum the global optimum to set in the lattice
     * @param time wall-clock duration of the anonymization process
     * @param solutionSpace the solution space
     */
    public ARXResult(final DataHandle handle,
                     final DataDefinition definition,
                     final ARXLattice lattice,
                     final int historySize,
                     final double snapshotSizeSnapshot,
                     final double snapshotSizeDataset,
                     final Metric<?> metric,
                     final ARXConfiguration config,
                     final ARXNode optimum,
                     final long time,
                     final SolutionSpace solutionSpace) {

        // Set registry and definition
        ((DataHandleInput) handle).setDefinition(definition);
        handle.getRegistry().createInputSubset(config);

        // Set optimum in lattice
        lattice.access().setOptimum(optimum);

        // Extract data
        final String[] header = ((DataHandleInput) handle).header;
        final int[][] dataArray = ((DataHandleInput) handle).data;
        final Dictionary dictionary = ((DataHandleInput) handle).dictionary;
        final DataManager manager = new DataManager(header,
                                                    dataArray,
                                                    dictionary,
                                                    handle.getDefinition(),
                                                    config.getPrivacyModels(),
                                                    getAggregateFunctions(handle.getDefinition()));

        // Update handle
        ((DataHandleInput) handle).update(manager.getDataGeneralized().getArray(),
                                          manager.getDataAnalyzed().getArray(),
                                          manager.getDataStatic().getArray());

        // Lock handle
        ((DataHandleInput) handle).setLocked(true);

        // Initialize
        config.initialize(manager);

        // Initialize the metric
        metric.initialize(manager, definition, manager.getDataGeneralized(), manager.getHierarchies(), config);

        // Create a node checker
        final NodeChecker checker = new NodeChecker(manager,
                                                    metric,
                                                    config.getInternalConfiguration(),
                                                    historySize,
                                                    snapshotSizeDataset,
                                                    snapshotSizeSnapshot,
                                                    solutionSpace);

        // Initialize the result
        this.registry = handle.getRegistry();
        this.manager = manager;
        this.checker = checker;
        this.definition = definition;
        this.config = config;
        this.lattice = lattice;
        this.optimalNode = lattice.getOptimum();
        this.duration = time;
        this.solutionSpace = solutionSpace;
    }

    /**
     * Creates a new instance.
     *
     * @param registry the data registry
     * @param manager the data manager
     * @param checker the node checker
     * @param definition the data definition
     * @param config the configuration
     * @param lattice the lattice of transformations
     * @param duration wall-clock duration of the anonymization process
     * @param solutionSpace the solution space
     */
    protected ARXResult(DataRegistry registry,
                        DataManager manager,
                        NodeChecker checker,
                        DataDefinition definition,
                        ARXConfiguration config,
                        ARXLattice lattice,
                        long duration,
                        SolutionSpace solutionSpace) {
        this.registry = registry;
        this.manager = manager;
        this.checker = checker;
        this.definition = definition;
        this.config = config;
        this.lattice = lattice;
        this.optimalNode = lattice.getOptimum();
        this.duration = duration;
        this.solutionSpace = solutionSpace;
    }

    /**
     * Returns the configuration used.
     *
     * @return the configuration
     */
    public ARXConfiguration getConfiguration() {
        return config;
    }

    /**
     * Gets the global optimum.
     *
     * @return the global optimum, or null if no solution was found
     */
    public ARXNode getGlobalOptimum() {
        return optimalNode;
    }

    /**
     * Returns a handle to the data obtained by applying the optimal transformation. This method will not copy the buffer,
     * i.e., only one instance can be obtained for each transformation. All previous handles for output data will be invalidated when a new handle is
     * obtained. Use this only if you know exactly what you are doing.<br>
     * <br>
     * This method is obsolete. Please use getOutput() instead.
     *
     * @return a handle to the output data, or null if no solution was found
     */
    @Deprecated
    public DataHandle getHandle() {
        if (optimalNode == null) { return null; }
        return getOutput(optimalNode, false);
    }

    /**
     * Returns a handle to data obtained by applying the given transformation. This method will not copy the buffer,
     * i.e., only one instance can be obtained for each transformation. All previous handles for output data will be invalidated when a new handle is
     * obtained. Use this only if you know exactly what you are doing.<br>
     * <br>
     * This method is obsolete. Please use getOutput(...) instead.
     *
     * @param node the transformation
     *
     * @return a handle to the output data
     */
    @Deprecated
    public DataHandle getHandle(ARXNode node) {
        return getOutput(node, false);
    }

    /**
     * Returns the lattice.
     *
     * @return the lattice of transformations
     */
    public ARXLattice getLattice() {
        return lattice;
    }

    /**
     * Returns a handle to the data obtained by applying the optimal transformation. This method will fork the buffer,
     * allowing to obtain multiple handles to different representations of the data set. Note that only one instance can
     * be obtained for each transformation.
     *
     * @return a handle to the output data, or null if no solution was found
     */
    public DataHandle getOutput() {
        if (optimalNode == null) { return null; }
        return getOutput(optimalNode, true);
    }

    /**
     * Returns a handle to data obtained by applying the given transformation. This method will fork the buffer,
     * allowing to obtain multiple handles to different representations of the data set. Note that only one instance can
     * be obtained for each transformation.
     *
     * @param node the transformation
     *
     * @return a handle to the output data
     */
    public DataHandle getOutput(ARXNode node) {
        return getOutput(node, true);
    }

    /**
     * Returns a handle to data obtained by applying the given transformation. This method allows controlling whether
     * the underlying buffer is copied or not. Setting the flag to true will fork the buffer for every handle, allowing to
     * obtain multiple handles to different representations of the data set. When setting the flag to false, all previous
     * handles for output data will be invalidated when a new handle is obtained.
     *
     * @param node the transformation
     * @param fork Set this flag to false, only if you know exactly what you are doing.
     *
     * @return a handle to the output data
     */
    public DataHandle getOutput(ARXNode node, boolean fork) {

        // A forked handle must never be created while the shared buffer is locked
        if (fork && bufferLockedByHandle != null) {
            throw new RuntimeException("The buffer is currently locked by another handle");
        }

        // Non-forking access: reuse the locked handle if it matches, otherwise release the lock
        if (!fork && bufferLockedByHandle != null) {
            if (bufferLockedByNode == node && !((DataHandleOutput) bufferLockedByHandle).isOptimized()) {
                return bufferLockedByHandle;
            } else {
                registry.release(bufferLockedByHandle);
                bufferLockedByHandle = null;
                bufferLockedByNode = null;
            }
        }

        // Reuse an existing handle for this transformation, unless it has been locally optimized
        DataHandle handle = registry.getOutputHandle(node);
        if (handle != null) {
            if (!((DataHandleOutput) handle).isOptimized()) {
                return handle;
            } else {
                registry.release(handle);
            }
        }

        // Apply the transformation
        final Transformation transformation = solutionSpace.getTransformation(node.getTransformation());
        TransformedData information = checker.applyTransformation(transformation);
        transformation.setChecked(information.properties);

        // Store check results in the lattice node, if not already definite
        if (!node.isChecked() || node.getHighestScore().compareTo(node.getLowestScore()) != 0) {

            node.access().setChecked(true);
            if (transformation.hasProperty(solutionSpace.getPropertyAnonymous())) {
                node.access().setAnonymous();
            } else {
                node.access().setNotAnonymous();
            }
            node.access().setHighestScore(transformation.getInformationLoss());
            node.access().setLowestScore(transformation.getInformationLoss());
            node.access().setLowerBound(transformation.getLowerBound());
            lattice.estimateInformationLoss();
        }

        // Clone the buffers, so multiple handles can coexist
        if (fork) {
            information.bufferGeneralized = information.bufferGeneralized.clone();
            information.bufferMicroaggregated = information.bufferMicroaggregated.clone();
        }

        // Create
        DataHandleOutput result = new DataHandleOutput(this,
                                                       registry,
                                                       manager,
                                                       information.bufferGeneralized,
                                                       information.bufferMicroaggregated,
                                                       node,
                                                       definition,
                                                       config);

        // Lock the shared buffer for non-forking access
        if (!fork) {
            bufferLockedByHandle = result;
            bufferLockedByNode = node;
        }

        // Return
        return result;
    }

    /**
     * Returns a handle to the data obtained by applying the optimal transformation. This method allows controlling whether
     * the underlying buffer is copied or not. Setting the flag to true will fork the buffer for every handle, allowing to
     * obtain multiple handles to different representations of the data set. When setting the flag to false, all previous
     * handles for output data will be invalidated when a new handle is obtained.
     *
     * @param fork Set this flag to false, only if you know exactly what you are doing.
     *
     * @return a handle to the output data, or null if no solution was found
     */
    public DataHandle getOutput(boolean fork) {
        if (optimalNode == null) { return null; }
        return getOutput(optimalNode, fork);
    }

    /**
     * Returns the execution time (wall clock).
     *
     * @return the duration of the anonymization process
     */
    public long getTime() {
        return duration;
    }

    /**
     * Returns whether local recoding can be applied to the given handle
     * @param handle the handle to check
     * @return true, if local recoding is applicable
     */
    public boolean isOptimizable(DataHandle handle) {

        // Check, if output
        if (!(handle instanceof DataHandleOutput)) {
            return false;
        }

        // Extract
        DataHandleOutput output = (DataHandleOutput) handle;

        // Check, if input matches
        if (output.getInputBuffer() == null || !output.getInputBuffer().equals(this.checker.getInputBuffer())) {
            return false;
        }

        // Check if optimizable: all privacy models must support local recoding
        for (PrivacyCriterion c : config.getPrivacyModels()) {
            if (!c.isLocalRecodingSupported()) {
                return false;
            }
        }

        // Check, if there are enough outliers
        int outliers = 0;
        for (int row = 0; row < output.getNumRows(); row++) {
            if (output.isOutlier(row)) {
                outliers++;
            }
        }

        // Check minimal group size
        if (config.getMinimalGroupSize() != Integer.MAX_VALUE && outliers < config.getMinimalGroupSize()) {
            return false;
        }

        // Check, if there are any outliers
        if (outliers == 0) {
            return false;
        }

        // Yes, we probably can do this
        return true;
    }

    /**
     * Indicates if a result is available.
     *
     * @return true, if a global optimum has been found
     */
    public boolean isResultAvailable() {
        return optimalNode != null;
    }

    /**
     * This method optimizes the given data output with local recoding to improve its utility
     * @param handle the output handle to optimize
     * @return The number of optimized records
     * @throws RollbackRequiredException if the handle must be rebuilt to guarantee privacy
     */
    public int optimize(DataHandle handle) throws RollbackRequiredException {
        return this.optimize(handle, 0.5d, newNoOpListener());
    }

    /**
     * This method optimizes the given data output with local recoding to improve its utility
     * @param handle the output handle to optimize
     * @param gsFactor A factor [0,1] weighting generalization and suppression.
     *                 The default value is 0.5, which means that generalization
     *                 and suppression will be treated equally. A factor of 0
     *                 will favor suppression, and a factor of 1 will favor
     *                 generalization. The values in between can be used for
     *                 balancing both methods.
     * @return The number of optimized records
     * @throws RollbackRequiredException if the handle must be rebuilt to guarantee privacy
     */
    public int optimize(DataHandle handle, double gsFactor) throws RollbackRequiredException {
        return this.optimize(handle, gsFactor, newNoOpListener());
    }

    /**
     * This method optimizes the given data output with local recoding to improve its utility
     * @param handle the output handle to optimize
     * @param gsFactor A factor [0,1] weighting generalization and suppression.
     *                 The default value is 0.5, which means that generalization
     *                 and suppression will be treated equally. A factor of 0
     *                 will favor suppression, and a factor of 1 will favor
     *                 generalization. The values in between can be used for
     *                 balancing both methods.
     * @param listener a progress listener
     * @return The number of optimized records
     * @throws RollbackRequiredException if the handle must be rebuilt to guarantee privacy
     */
    public int optimize(DataHandle handle, double gsFactor, ARXListener listener) throws RollbackRequiredException {

        // Check if null
        if (listener == null) {
            throw new NullPointerException("Listener must not be null");
        }

        // Check if null
        if (handle == null) {
            throw new NullPointerException("Handle must not be null");
        }

        // Check bounds
        if (gsFactor < 0d || gsFactor > 1d) {
            throw new IllegalArgumentException("Generalization/suppression factor must be in [0, 1]");
        }

        // Check if output
        if (!(handle instanceof DataHandleOutput)) {
            throw new IllegalArgumentException("Local recoding can only be applied to output data");
        }

        // Check if optimizable
        if (!isOptimizable(handle)) {
            return 0;
        }

        // Extract
        DataHandleOutput output = (DataHandleOutput) handle;

        // Check, if input matches
        if (output.getInputBuffer() == null || !output.getInputBuffer().equals(this.checker.getInputBuffer())) {
            throw new IllegalArgumentException("This output data is not associated to the correct input data");
        }

        // We are now ready, to go
        // Collect the indices of all suppressed records; only these will be recoded
        RowSet rowset = RowSet.create(output.getNumRows());
        for (int row = 0; row < output.getNumRows(); row++) {
            if (output.isOutlier(row)) {
                rowset.add(row);
            }
        }

        // Everything that is used from here on, needs to be either
        // (a) state-less, or
        // (b) a fresh copy of the original configuration.

        // We start by creating a projected instance of the configuration
        // - All privacy models will be cloned
        // - Subsets will be projected accordingly
        // - Utility measures will be cloned
        ARXConfiguration config = this.config.getInstanceForLocalRecoding(rowset, gsFactor);

        // In the data definition, only microaggregation functions maintain a state, but these
        // are cloned, when cloning the definition
        // TODO: This is probably not necessary, because they are used from the data manager,
        //       which in turn creates a clone by itself
        DataDefinition definition = this.definition.clone();

        // Clone the data manager
        DataManager manager = this.manager.getSubsetInstance(rowset);

        // Create an anonymizer
        // TODO: May this object stores some values that should be transferred?
        ARXAnonymizer anonymizer = new ARXAnonymizer();
        anonymizer.setListener(listener);

        // Anonymize
        Result result = null;
        try {
            result = anonymizer.anonymize(manager, definition, config);
        } catch (IOException e) {
            // This should not happen at this point in time, as data has already been read from the source.
            // Preserve the cause so the unexpected failure remains diagnosable.
            throw new RuntimeException("Internal error", e);
        }

        // Break, if no solution has been found
        if (result.optimum == null) {
            return 0;
        }

        // Else, merge the results back into the given handle
        TransformedData data = result.checker.applyTransformation(result.optimum, output.getOutputBufferMicroaggregated().getDictionary());
        int newIndex = -1;
        int[][] oldGeneralized = output.getOutputBufferGeneralized().getArray();
        int[][] oldMicroaggregated = output.getOutputBufferMicroaggregated().getArray();
        int[][] newGeneralized = data.bufferGeneralized.getArray();
        int[][] newMicroaggregated = data.bufferMicroaggregated.getArray();

        try {
            int optimized = 0;
            for (int oldIndex = 0; oldIndex < rowset.length(); oldIndex++) {
                if (rowset.contains(oldIndex)) {
                    newIndex++;
                    if (oldGeneralized != null && oldGeneralized.length != 0) {
                        System.arraycopy(newGeneralized[newIndex], 0, oldGeneralized[oldIndex], 0, newGeneralized[newIndex].length);
                        // A record counts as optimized if it is no longer suppressed
                        optimized += (newGeneralized[newIndex][0] & Data.OUTLIER_MASK) != 0 ? 0 : 1;
                    }
                    if (oldMicroaggregated != null && oldMicroaggregated.length != 0) {
                        System.arraycopy(newMicroaggregated[newIndex], 0, oldMicroaggregated[oldIndex], 0, newMicroaggregated[newIndex].length);
                    }
                }
            }

            // Update data types
            output.updateDataTypes(result.optimum.getGeneralization());

            // Mark as optimized
            if (optimized != 0) {
                output.setOptimized(true);
            }

            // Return
            return optimized;

        // If anything happens in the above block, the operation needs to be rolled back, because
        // the buffer might be in an inconsistent state
        } catch (Exception e) {
            throw new RollbackRequiredException("Handle must be rebuild to guarantee privacy", e);
        }
    }

    /**
     * This method optimizes the given data output with local recoding to improve its utility
     * @param handle the output handle to optimize
     * @param gsFactor A factor [0,1] weighting generalization and suppression.
     *                 The default value is 0.5, which means that generalization
     *                 and suppression will be treated equally. A factor of 0
     *                 will favor suppression, and a factor of 1 will favor
     *                 generalization. The values in between can be used for
     *                 balancing both methods.
     * @param maxIterations The maximal number of iterations to perform
     * @param adaptionFactor Is added to the gsFactor when reaching a fixpoint
     * @throws RollbackRequiredException if the handle must be rebuilt to guarantee privacy
     */
    public void optimizeIterative(DataHandle handle,
                                  double gsFactor,
                                  int maxIterations,
                                  double adaptionFactor) throws RollbackRequiredException {
        this.optimizeIterative(handle, gsFactor, maxIterations, adaptionFactor, newNoOpListener());
    }

    /**
     * This method optimizes the given data output with local recoding to improve its utility
     * @param handle the output handle to optimize
     * @param gsFactor A factor [0,1] weighting generalization and suppression.
     *                 The default value is 0.5, which means that generalization
     *                 and suppression will be treated equally. A factor of 0
     *                 will favor suppression, and a factor of 1 will favor
     *                 generalization. The values in between can be used for
     *                 balancing both methods.
     * @param maxIterations The maximal number of iterations to perform
     * @param adaptionFactor Is added to the gsFactor when reaching a fixpoint
     * @param listener a progress listener
     * @throws RollbackRequiredException if the handle must be rebuilt to guarantee privacy
     */
    public void optimizeIterative(final DataHandle handle,
                                  double gsFactor,
                                  final int maxIterations,
                                  final double adaptionFactor,
                                  final ARXListener listener) throws RollbackRequiredException {

        if (gsFactor < 0d || gsFactor > 1d) {
            throw new IllegalArgumentException("Generalization/suppression factor must be in [0, 1]");
        }
        if (adaptionFactor < 0d || adaptionFactor > 1d) {
            throw new IllegalArgumentException("Adaption factor must be in [0, 1]");
        }
        if (maxIterations <= 0) {
            throw new IllegalArgumentException("Max. iterations must be > zero");
        }

        // Outer loop
        int iterations = 0;
        int optimized = Integer.MAX_VALUE;
        double totalAdaption = 0d;
        // Estimated total number of iterations, used to scale progress reports to [0, 1].
        // With unbounded iterations, the number of possible gsFactor adaptions is used instead.
        final double max = maxIterations != Integer.MAX_VALUE ? maxIterations : (1d - gsFactor) / adaptionFactor;
        while (isOptimizable(handle) && iterations < maxIterations && optimized > 0) {

            // Create a wrapped listener that maps per-iteration progress to overall progress.
            // Guards against NaN/Infinity, which occur when adaptionFactor is zero.
            final double base = maxIterations != Integer.MAX_VALUE ? iterations : totalAdaption / adaptionFactor;
            ARXListener wrapper = new ARXListener() {
                @Override
                public void progress(double progress) {
                    double _max = (max > 1d && !Double.isInfinite(max) && !Double.isNaN(max) ? max : 1d);
                    double _base = (base > 0d && !Double.isInfinite(base) && !Double.isNaN(base) ? base : 0d);
                    double value = (progress + _base) / _max;
                    listener.progress(value);
                }
            };

            // Perform individual optimization
            optimized = optimize(handle, gsFactor, wrapper);

            // Try to adapt, if possible
            if (optimized == 0 && adaptionFactor > 0d) {
                gsFactor += adaptionFactor;
                totalAdaption += adaptionFactor;

                // If valid, try again
                if (gsFactor <= 1d) {
                    optimized = Integer.MAX_VALUE;
                }
            }
            iterations++;
        }
    }

    /**
     * Returns a map of all microaggregation functions defined for quasi-identifiers.
     * Static, because it is invoked from a constructor before the instance is fully initialized.
     *
     * @param definition the data definition to extract the functions from
     * @return a map from attribute name to its microaggregation function
     */
    private static Map<String, DistributionAggregateFunction> getAggregateFunctions(DataDefinition definition) {
        Map<String, DistributionAggregateFunction> result = new HashMap<String, DistributionAggregateFunction>();
        for (String key : definition.getQuasiIdentifiersWithMicroaggregation()) {
            result.put(key, definition.getMicroAggregationFunction(key).getFunction());
        }
        return result;
    }

    /**
     * Creates a listener that ignores all progress events. Used as the default
     * by the convenience overloads of optimize() and optimizeIterative().
     *
     * @return a no-op progress listener
     */
    private static ARXListener newNoOpListener() {
        return new ARXListener() {
            @Override
            public void progress(double progress) {
                // Empty by design
            }
        };
    }

    /**
     * Releases the buffer.
     *
     * @param handle the handle releasing the shared output buffer
     */
    protected void releaseBuffer(DataHandleOutput handle) {
        if (handle == bufferLockedByHandle) {
            bufferLockedByHandle = null;
            bufferLockedByNode = null;
        }
    }
}