/*
 * ARX: Powerful Data Anonymization
 * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.deidentifier.arx;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.deidentifier.arx.ARXAnonymizer.Result;
import org.deidentifier.arx.ARXLattice.ARXNode;
import org.deidentifier.arx.criteria.PrivacyCriterion;
import org.deidentifier.arx.exceptions.RollbackRequiredException;
import org.deidentifier.arx.framework.check.NodeChecker;
import org.deidentifier.arx.framework.check.TransformedData;
import org.deidentifier.arx.framework.check.distribution.DistributionAggregateFunction;
import org.deidentifier.arx.framework.data.Data;
import org.deidentifier.arx.framework.data.DataManager;
import org.deidentifier.arx.framework.data.Dictionary;
import org.deidentifier.arx.framework.lattice.SolutionSpace;
import org.deidentifier.arx.framework.lattice.Transformation;
import org.deidentifier.arx.metric.Metric;

/**
 * Encapsulates the results of an execution of the ARX algorithm.
 *
 * @author Fabian Prasser
 * @author Florian Kohlmayer
 */
public class ARXResult {

    /** The handle, if any, that currently holds the lock on the buffer. */
    private DataHandle bufferLockedByHandle = null;

    /** The node, if any, for which the buffer is currently locked. */
    private ARXNode bufferLockedByNode = null;

    /** The node checker. */
    private final NodeChecker checker;

    /** The configuration. */
    private final ARXConfiguration config;

    /** The data definition. */
    private final DataDefinition definition;

    /** Wall-clock execution time. */
    private final long duration;

    /** The lattice. */
    private final ARXLattice lattice;

    /** The data manager. */
    private final DataManager manager;

    /** The global optimum. */
    private final ARXNode optimalNode;

    /** The registry. */
    private final DataRegistry registry;

    /** The solution space. */
    private final SolutionSpace solutionSpace;
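
    /*
     * Instances of this class are obtained from ARXAnonymizer. A minimal usage
     * sketch (file name, separator, and the k = 5 parameter are illustrative
     * assumptions, not part of this class; anonymize(...) and save(...) throw
     * IOException):
     *
     *   Data data = Data.create("input.csv", StandardCharsets.UTF_8, ';');
     *   ARXConfiguration config = ARXConfiguration.create();
     *   config.addPrivacyModel(new KAnonymity(5));
     *   ARXResult result = new ARXAnonymizer().anonymize(data, config);
     *   if (result.isResultAvailable()) {
     *       result.getOutput().save("output.csv", ';');
     *   }
     */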
    /**
     * Internal constructor for deserialization.
     *
     * @param handle
     * @param definition
     * @param lattice
     * @param historySize
     * @param snapshotSizeSnapshot
     * @param snapshotSizeDataset
     * @param metric
     * @param config
     * @param optimum
     * @param time
     * @param solutionSpace
     */
    public ARXResult(final DataHandle handle,
                     final DataDefinition definition,
                     final ARXLattice lattice,
                     final int historySize,
                     final double snapshotSizeSnapshot,
                     final double snapshotSizeDataset,
                     final Metric<?> metric,
                     final ARXConfiguration config,
                     final ARXNode optimum,
                     final long time,
                     final SolutionSpace solutionSpace) {

        // Set registry and definition
        ((DataHandleInput) handle).setDefinition(definition);
        handle.getRegistry().createInputSubset(config);

        // Set optimum in lattice
        lattice.access().setOptimum(optimum);

        // Extract data
        final String[] header = ((DataHandleInput) handle).header;
        final int[][] dataArray = ((DataHandleInput) handle).data;
        final Dictionary dictionary = ((DataHandleInput) handle).dictionary;
        final DataManager manager = new DataManager(header,
                                                    dataArray,
                                                    dictionary,
                                                    handle.getDefinition(),
                                                    config.getPrivacyModels(),
                                                    getAggregateFunctions(handle.getDefinition()));

        // Update handle
        ((DataHandleInput) handle).update(manager.getDataGeneralized().getArray(),
                                          manager.getDataAnalyzed().getArray(),
                                          manager.getDataStatic().getArray());

        // Lock handle
        ((DataHandleInput) handle).setLocked(true);

        // Initialize
        config.initialize(manager);

        // Initialize the metric
        metric.initialize(manager, definition, manager.getDataGeneralized(), manager.getHierarchies(), config);

        // Create a node checker
        final NodeChecker checker = new NodeChecker(manager,
                                                    metric,
                                                    config.getInternalConfiguration(),
                                                    historySize,
                                                    snapshotSizeDataset,
                                                    snapshotSizeSnapshot,
                                                    solutionSpace);

        // Initialize the result
        this.registry = handle.getRegistry();
        this.manager = manager;
        this.checker = checker;
        this.definition = definition;
        this.config = config;
        this.lattice = lattice;
        this.optimalNode = lattice.getOptimum();
        this.duration = time;
        this.solutionSpace = solutionSpace;
    }

    /**
     * Creates a new instance.
     *
     * @param registry
     * @param manager
     * @param checker
     * @param definition
     * @param config
     * @param lattice
     * @param duration
     * @param solutionSpace
     */
    protected ARXResult(DataRegistry registry,
                        DataManager manager,
                        NodeChecker checker,
                        DataDefinition definition,
                        ARXConfiguration config,
                        ARXLattice lattice,
                        long duration,
                        SolutionSpace solutionSpace) {
        this.registry = registry;
        this.manager = manager;
        this.checker = checker;
        this.definition = definition;
        this.config = config;
        this.lattice = lattice;
        this.optimalNode = lattice.getOptimum();
        this.duration = duration;
        this.solutionSpace = solutionSpace;
    }

    /**
     * Returns the configuration used.
     *
     * @return
     */
    public ARXConfiguration getConfiguration() {
        return config;
    }

    /**
     * Gets the global optimum.
     *
     * @return the global optimum
     */
    public ARXNode getGlobalOptimum() {
        return optimalNode;
    }
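
    /*
     * Note: the deprecated getHandle() variants below reuse a single, shared
     * buffer, whereas the getOutput() variants fork it by default. Migration
     * sketch (given an ARXResult named "result"):
     *
     *   DataHandle handle = result.getHandle(); // deprecated, locks the buffer
     *   DataHandle handle = result.getOutput(); // preferred, forks the buffer
     */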
    /**
     * Returns a handle to the data obtained by applying the optimal transformation. This method does not
     * copy the buffer, i.e., only one instance can be obtained for each transformation. All previous handles
     * for output data are invalidated when a new handle is obtained. Use this only if you know exactly what
     * you are doing.<br>
     * <br>
     * This method is obsolete. Please use getOutput() instead.
     *
     * @return
     */
    @Deprecated
    public DataHandle getHandle() {
        if (optimalNode == null) { return null; }
        return getOutput(optimalNode, false);
    }

    /**
     * Returns a handle to data obtained by applying the given transformation. This method does not copy the
     * buffer, i.e., only one instance can be obtained for each transformation. All previous handles for
     * output data are invalidated when a new handle is obtained. Use this only if you know exactly what you
     * are doing.<br>
     * <br>
     * This method is obsolete. Please use getOutput(...) instead.
     *
     * @param node the transformation
     *
     * @return
     */
    @Deprecated
    public DataHandle getHandle(ARXNode node) {
        return getOutput(node, false);
    }

    /**
     * Returns the lattice.
     *
     * @return
     */
    public ARXLattice getLattice() {
        return lattice;
    }

    /**
     * Returns a handle to the data obtained by applying the optimal transformation. This method forks the
     * buffer, which makes it possible to obtain multiple handles to different representations of the data
     * set. Note that only one instance can be obtained for each transformation.
     *
     * @return
     */
    public DataHandle getOutput() {
        if (optimalNode == null) { return null; }
        return getOutput(optimalNode, true);
    }

    /**
     * Returns a handle to data obtained by applying the given transformation. This method forks the buffer,
     * which makes it possible to obtain multiple handles to different representations of the data set. Note
     * that only one instance can be obtained for each transformation.
     *
     * @param node the transformation
     *
     * @return
     */
    public DataHandle getOutput(ARXNode node) {
        return getOutput(node, true);
    }
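
    /*
     * Because getOutput(node) forks the buffer, several transformations can be
     * materialized side by side. A sketch (the choice of nodes is an
     * illustrative assumption; getSuccessors() may be empty for the top node):
     *
     *   ARXNode optimum = result.getGlobalOptimum();
     *   DataHandle first  = result.getOutput(optimum);
     *   DataHandle second = result.getOutput(optimum.getSuccessors()[0]);
     *   // Both handles remain valid
     */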
    /**
     * Returns a handle to data obtained by applying the given transformation. This method allows controlling
     * whether the underlying buffer is copied or not. Setting the flag to true forks the buffer for every
     * handle, which makes it possible to obtain multiple handles to different representations of the data
     * set. When setting the flag to false, all previous handles for output data are invalidated when a new
     * handle is obtained.
     *
     * @param node the transformation
     * @param fork Set this flag to false only if you know exactly what you are doing.
     *
     * @return
     */
    public DataHandle getOutput(ARXNode node, boolean fork) {

        // Check lock
        if (fork && bufferLockedByHandle != null) {
            throw new RuntimeException("The buffer is currently locked by another handle");
        }

        // Release lock
        if (!fork && bufferLockedByHandle != null) {
            if (bufferLockedByNode == node && !((DataHandleOutput) bufferLockedByHandle).isOptimized()) {
                return bufferLockedByHandle;
            } else {
                registry.release(bufferLockedByHandle);
                bufferLockedByHandle = null;
                bufferLockedByNode = null;
            }
        }

        DataHandle handle = registry.getOutputHandle(node);
        if (handle != null) {
            if (!((DataHandleOutput) handle).isOptimized()) {
                return handle;
            } else {
                registry.release(handle);
            }
        }

        // Apply the transformation
        final Transformation transformation = solutionSpace.getTransformation(node.getTransformation());
        TransformedData information = checker.applyTransformation(transformation);
        transformation.setChecked(information.properties);

        // Store
        if (!node.isChecked() || node.getHighestScore().compareTo(node.getLowestScore()) != 0) {
            node.access().setChecked(true);
            if (transformation.hasProperty(solutionSpace.getPropertyAnonymous())) {
                node.access().setAnonymous();
            } else {
                node.access().setNotAnonymous();
            }
            node.access().setHighestScore(transformation.getInformationLoss());
            node.access().setLowestScore(transformation.getInformationLoss());
            node.access().setLowerBound(transformation.getLowerBound());
            lattice.estimateInformationLoss();
        }

        // Clone if needed
        if (fork) {
            information.bufferGeneralized = information.bufferGeneralized.clone();
            information.bufferMicroaggregated = information.bufferMicroaggregated.clone();
        }

        // Create
        DataHandleOutput result = new DataHandleOutput(this,
                                                       registry,
                                                       manager,
                                                       information.bufferGeneralized,
                                                       information.bufferMicroaggregated,
                                                       node,
                                                       definition,
                                                       config);

        // Lock
        if (!fork) {
            bufferLockedByHandle = result;
            bufferLockedByNode = node;
        }

        // Return
        return result;
    }

    /**
     * Returns a handle to the data obtained by applying the optimal transformation. This method allows
     * controlling whether the underlying buffer is copied or not. Setting the flag to true forks the buffer
     * for every handle, which makes it possible to obtain multiple handles to different representations of
     * the data set. When setting the flag to false, all previous handles for output data are invalidated
     * when a new handle is obtained.
     *
     * @param fork Set this flag to false only if you know exactly what you are doing.
     *
     * @return
     */
    public DataHandle getOutput(boolean fork) {
        if (optimalNode == null) { return null; }
        return getOutput(optimalNode, fork);
    }
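
    /*
     * With fork = false the single underlying buffer is reused, so obtaining a
     * new handle invalidates the previous one (sketch; node1 and node2 are
     * hypothetical transformations):
     *
     *   DataHandle h1 = result.getOutput(node1, false);
     *   DataHandle h2 = result.getOutput(node2, false); // h1 is now invalid
     */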
    /**
     * Returns the execution time (wall clock).
     *
     * @return
     */
    public long getTime() {
        return duration;
    }

    /**
     * Returns whether local recoding can be applied to the given handle.
     *
     * @param handle
     * @return
     */
    public boolean isOptimizable(DataHandle handle) {

        // Check if output
        if (!(handle instanceof DataHandleOutput)) {
            return false;
        }

        // Extract
        DataHandleOutput output = (DataHandleOutput) handle;

        // Check if input matches
        if (output.getInputBuffer() == null || !output.getInputBuffer().equals(this.checker.getInputBuffer())) {
            return false;
        }

        // Check if optimizable
        for (PrivacyCriterion c : config.getPrivacyModels()) {
            if (!c.isLocalRecodingSupported()) {
                return false;
            }
        }

        // Check if there are enough outliers
        int outliers = 0;
        for (int row = 0; row < output.getNumRows(); row++) {
            if (output.isOutlier(row)) {
                outliers++;
            }
        }

        // Check minimal group size
        if (config.getMinimalGroupSize() != Integer.MAX_VALUE && outliers < config.getMinimalGroupSize()) {
            return false;
        }

        // Check if there are any outliers
        if (outliers == 0) {
            return false;
        }

        // Yes, we probably can do this
        return true;
    }

    /**
     * Indicates if a result is available.
     *
     * @return
     */
    public boolean isResultAvailable() {
        return optimalNode != null;
    }

    /**
     * Optimizes the given data output with local recoding to improve its utility.
     *
     * @param handle
     * @return The number of optimized records
     * @throws RollbackRequiredException
     */
    public int optimize(DataHandle handle) throws RollbackRequiredException {
        return this.optimize(handle, 0.5d, new ARXListener() {
            @Override
            public void progress(double progress) {
                // Empty by design
            }
        });
    }

    /**
     * Optimizes the given data output with local recoding to improve its utility.
     *
     * @param handle
     * @param gsFactor A factor in [0, 1] weighting generalization and suppression. The default is 0.5,
     *                 which means that generalization and suppression are treated equally. A factor of 0
     *                 favors suppression, and a factor of 1 favors generalization. Values in between can
     *                 be used to balance both methods.
     * @return The number of optimized records
     * @throws RollbackRequiredException
     */
    public int optimize(DataHandle handle, double gsFactor) throws RollbackRequiredException {
        return this.optimize(handle, gsFactor, new ARXListener() {
            @Override
            public void progress(double progress) {
                // Empty by design
            }
        });
    }
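
    /*
     * Local recoding sketch (the rollback handling shown is an assumption about
     * caller-side usage, not prescribed by this class):
     *
     *   DataHandle output = result.getOutput();
     *   try {
     *       if (result.isOptimizable(output)) {
     *           int optimized = result.optimize(output, 0.5d);
     *       }
     *   } catch (RollbackRequiredException e) {
     *       // The buffer may be inconsistent: rebuild the handle
     *       output = result.getOutput();
     *   }
     */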
    /**
     * Optimizes the given data output with local recoding to improve its utility.
     *
     * @param handle
     * @param gsFactor A factor in [0, 1] weighting generalization and suppression. The default is 0.5,
     *                 which means that generalization and suppression are treated equally. A factor of 0
     *                 favors suppression, and a factor of 1 favors generalization. Values in between can
     *                 be used to balance both methods.
     * @param listener
     * @return The number of optimized records
     * @throws RollbackRequiredException
     */
    public int optimize(DataHandle handle, double gsFactor, ARXListener listener) throws RollbackRequiredException {

        // Check if null
        if (listener == null) {
            throw new NullPointerException("Listener must not be null");
        }

        // Check if null
        if (handle == null) {
            throw new NullPointerException("Handle must not be null");
        }

        // Check bounds
        if (gsFactor < 0d || gsFactor > 1d) {
            throw new IllegalArgumentException("Generalization/suppression factor must be in [0, 1]");
        }

        // Check if output
        if (!(handle instanceof DataHandleOutput)) {
            throw new IllegalArgumentException("Local recoding can only be applied to output data");
        }

        // Check if optimizable
        if (!isOptimizable(handle)) {
            return 0;
        }

        // Extract
        DataHandleOutput output = (DataHandleOutput) handle;

        // Check if input matches
        if (output.getInputBuffer() == null || !output.getInputBuffer().equals(this.checker.getInputBuffer())) {
            throw new IllegalArgumentException("This output data is not associated to the correct input data");
        }

        // We are now ready to go.
        // Collect input and row indices
        RowSet rowset = RowSet.create(output.getNumRows());
        for (int row = 0; row < output.getNumRows(); row++) {
            if (output.isOutlier(row)) {
                rowset.add(row);
            }
        }

        // Everything that is used from here on needs to be either
        // (a) stateless, or
        // (b) a fresh copy of the original configuration.

        // We start by creating a projected instance of the configuration:
        // - All privacy models will be cloned
        // - Subsets will be projected accordingly
        // - Utility measures will be cloned
        ARXConfiguration config = this.config.getInstanceForLocalRecoding(rowset, gsFactor);

        // In the data definition, only microaggregation functions maintain a state, but these
        // are cloned when cloning the definition.
        // TODO: This is probably not necessary, because they are used from the data manager,
        // which in turn creates a clone by itself
        DataDefinition definition = this.definition.clone();

        // Clone the data manager
        DataManager manager = this.manager.getSubsetInstance(rowset);

        // Create an anonymizer
        // TODO: Maybe this object stores some values that should be transferred?
        ARXAnonymizer anonymizer = new ARXAnonymizer();
        anonymizer.setListener(listener);

        // Anonymize
        Result result = null;
        try {
            result = anonymizer.anonymize(manager, definition, config);
        } catch (IOException e) {
            // This should not happen at this point in time, as data has already been read from the source
            throw new RuntimeException("Internal error", e);
        }

        // Break, if no solution has been found
        if (result.optimum == null) {
            return 0;
        }

        // Else, merge the results back into the given handle
        TransformedData data = result.checker.applyTransformation(result.optimum,
                                                                  output.getOutputBufferMicroaggregated().getDictionary());
        int newIndex = -1;
        int[][] oldGeneralized = output.getOutputBufferGeneralized().getArray();
        int[][] oldMicroaggregated = output.getOutputBufferMicroaggregated().getArray();
        int[][] newGeneralized = data.bufferGeneralized.getArray();
        int[][] newMicroaggregated = data.bufferMicroaggregated.getArray();
        try {
            int optimized = 0;
            for (int oldIndex = 0; oldIndex < rowset.length(); oldIndex++) {
                if (rowset.contains(oldIndex)) {
                    newIndex++;
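                    // Copy the re-anonymized row back into the original buffers. A row whose
                    // first generalized value still carries the OUTLIER_MASK bit remains
                    // suppressed and is therefore not counted as optimized.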
                    if (oldGeneralized != null && oldGeneralized.length != 0) {
                        System.arraycopy(newGeneralized[newIndex], 0, oldGeneralized[oldIndex], 0, newGeneralized[newIndex].length);
                        optimized += (newGeneralized[newIndex][0] & Data.OUTLIER_MASK) != 0 ? 0 : 1;
                    }
                    if (oldMicroaggregated != null && oldMicroaggregated.length != 0) {
                        System.arraycopy(newMicroaggregated[newIndex], 0, oldMicroaggregated[oldIndex], 0, newMicroaggregated[newIndex].length);
                    }
                }
            }

            // Update data types
            output.updateDataTypes(result.optimum.getGeneralization());

            // Mark as optimized
            if (optimized != 0) {
                output.setOptimized(true);
            }

            // Return
            return optimized;

            // If anything happens in the above block, the operation needs to be rolled back,
            // because the buffer might be in an inconsistent state
        } catch (Exception e) {
            throw new RollbackRequiredException("Handle must be rebuilt to guarantee privacy", e);
        }
    }

    /**
     * Optimizes the given data output with local recoding to improve its utility. Delegates to the
     * variant below using a no-op progress listener.
     *
     * @param handle
     * @param gsFactor A factor in [0, 1] weighting generalization and suppression. The default is 0.5,
     *                 which means that generalization and suppression are treated equally. A factor of 0
     *                 favors suppression, and a factor of 1 favors generalization. Values in between can
     *                 be used to balance both methods.
     * @param maxIterations The maximum number of iterations to perform
     * @param adaptionFactor Added to the gsFactor whenever a fixpoint is reached
     * @throws RollbackRequiredException
     */
    public void optimizeIterative(DataHandle handle,
                                  double gsFactor,
                                  int maxIterations,
                                  double adaptionFactor) throws RollbackRequiredException {
        this.optimizeIterative(handle, gsFactor, maxIterations, adaptionFactor, new ARXListener() {
            @Override
            public void progress(double progress) {
                // Empty by design
            }
        });
    }
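
    /*
     * Iterative optimization sketch (parameter values are illustrative
     * assumptions): start favoring suppression with gsFactor = 0.05, perform at
     * most 100 iterations, and add 0.05 to gsFactor whenever a fixpoint is
     * reached:
     *
     *   result.optimizeIterative(output, 0.05d, 100, 0.05d);
     */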
    /**
     * Optimizes the given data output with local recoding to improve its utility.
     *
     * @param handle
     * @param gsFactor A factor in [0, 1] weighting generalization and suppression. The default is 0.5,
     *                 which means that generalization and suppression are treated equally. A factor of 0
     *                 favors suppression, and a factor of 1 favors generalization. Values in between can
     *                 be used to balance both methods.
     * @param maxIterations The maximum number of iterations to perform
     * @param adaptionFactor Added to the gsFactor whenever a fixpoint is reached
     * @param listener
     * @throws RollbackRequiredException
     */
    public void optimizeIterative(final DataHandle handle,
                                  double gsFactor,
                                  final int maxIterations,
                                  final double adaptionFactor,
                                  final ARXListener listener) throws RollbackRequiredException {

        if (gsFactor < 0d || gsFactor > 1d) {
            throw new IllegalArgumentException("Generalization/suppression factor must be in [0, 1]");
        }
        if (adaptionFactor < 0d || adaptionFactor > 1d) {
            throw new IllegalArgumentException("Adaption factor must be in [0, 1]");
        }
        if (maxIterations <= 0) {
            throw new IllegalArgumentException("Max. iterations must be > zero");
        }

        // Outer loop
        int iterations = 0;
        int optimized = Integer.MAX_VALUE;
        double totalAdaption = 0d;
        final double max = maxIterations != Integer.MAX_VALUE ? maxIterations : (1d - gsFactor) / adaptionFactor;
        while (isOptimizable(handle) && iterations < maxIterations && optimized > 0) {

            // Create a wrapped listener
            final double base = maxIterations != Integer.MAX_VALUE ? iterations : totalAdaption / adaptionFactor;
            ARXListener wrapper = new ARXListener() {
                @Override
                public void progress(double progress) {
                    double _max = (max > 1d && !Double.isInfinite(max) && !Double.isNaN(max)) ? max : 1d;
                    double _base = (base > 0d && !Double.isInfinite(base) && !Double.isNaN(base)) ? base : 0d;
                    double value = (progress + _base) / _max;
                    listener.progress(value);
                }
            };

            // Perform individual optimization
            optimized = optimize(handle, gsFactor, wrapper);

            // Try to adapt, if possible
            if (optimized == 0 && adaptionFactor > 0d) {
                gsFactor += adaptionFactor;
                totalAdaption += adaptionFactor;

                // If valid, try again
                if (gsFactor <= 1d) {
                    optimized = Integer.MAX_VALUE;
                }
            }
            iterations++;
        }
    }

    /**
     * Returns a map of all microaggregation functions.
     *
     * @param definition
     * @return
     */
    private Map<String, DistributionAggregateFunction> getAggregateFunctions(DataDefinition definition) {
        Map<String, DistributionAggregateFunction> result = new HashMap<String, DistributionAggregateFunction>();
        for (String key : definition.getQuasiIdentifiersWithMicroaggregation()) {
            result.put(key, definition.getMicroAggregationFunction(key).getFunction());
        }
        return result;
    }

    /**
     * Releases the buffer.
     *
     * @param handle
     */
    protected void releaseBuffer(DataHandleOutput handle) {
        if (handle == bufferLockedByHandle) {
            bufferLockedByHandle = null;
            bufferLockedByNode = null;
        }
    }
}