/* * ARX: Powerful Data Anonymization * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.deidentifier.arx.framework.check.groupify; import org.deidentifier.arx.ARXConfiguration.ARXConfigurationInternal; import org.deidentifier.arx.RowSet; import org.deidentifier.arx.criteria.DPresence; import org.deidentifier.arx.criteria.Inclusion; import org.deidentifier.arx.criteria.PrivacyCriterion; import org.deidentifier.arx.criteria.SampleBasedCriterion; import org.deidentifier.arx.framework.check.distribution.Distribution; import org.deidentifier.arx.framework.check.distribution.DistributionAggregateFunction; import org.deidentifier.arx.framework.data.Data; import org.deidentifier.arx.framework.data.Dictionary; import org.deidentifier.arx.framework.lattice.Transformation; import org.deidentifier.arx.metric.Metric; import com.carrotsearch.hppc.ObjectIntOpenHashMap; /** * A hash groupify operator. It implements a hash table with chaining and keeps * track of additional properties per equivalence class * * @author Fabian Prasser * @author Florian Kohlmayer */ public class HashGroupify { /** Criteria. */ private final PrivacyCriterion[] classBasedCriteria; /** The current number of outliers. */ private int currentNumOutliers; /** The entry array. */ private HashGroupifyEntry[] hashTableBuckets; /** Current number of elements. */ private int hashTableElementCount; /** The first entry. */ private HashGroupifyEntry hashTableFirstEntry; /** The last entry. */ private HashGroupifyEntry hashTableLastEntry; /** Load factor. */ private final float hashTableLoadFactor = 0.75f; /** Maximum number of elements that can be put in this map before having to rehash. */ private int hashTableThreshold; /** Do we ensure optimality for sample-based criteria */ private final boolean heuristicForSampleBasedCriteria; /** The parameter k, if k-anonymity is contained in the set of criteria. */ private final int minimalClassSize; /** Is the result k-anonymous?. */ private boolean minimalClassSizeFulfilled; /** True, if the contained d-presence criterion is not inclusion. */ private final boolean privacyModelContainsDPresence; /** The research subset, if d-presence is contained in the set of criteria. */ private final RowSet privacyModelDefinesSubset; /** Is the result anonymous. */ private boolean privacyModelFulfilled; /** Criteria. */ private final SampleBasedCriterion[] sampleBasedCriteria; /** Allowed tuple outliers. */ private final int suppressionLimit; /** Utility measure */ private final Metric<?> utilityMeasure; /** * Constructs a new hash groupify operator. * * @param capacity The capacity * @param config The config */ public HashGroupify(int capacity, final ARXConfigurationInternal config) { // Set capacity capacity = HashTableUtil.calculateCapacity(capacity); this.hashTableElementCount = 0; this.hashTableBuckets = new HashGroupifyEntry[capacity]; this.hashTableThreshold = HashTableUtil.calculateThreshold(hashTableBuckets.length, hashTableLoadFactor); // Set params this.currentNumOutliers = 0; this.suppressionLimit = config.getAbsoluteMaxOutliers(); this.utilityMeasure = config.getQualityModel(); this.heuristicForSampleBasedCriteria = config.isUseHeuristicForSampleBasedCriteria(); // Extract research subset if (config.getSubset() != null) { this.privacyModelDefinesSubset = config.getSubset().getSet(); } else { this.privacyModelDefinesSubset = null; } // Extract criteria this.classBasedCriteria = config.getClassBasedPrivacyModelsAsArray(); this.sampleBasedCriteria = config.getSampleBasedPrivacyModelsAsArray(); this.minimalClassSize = config.getMinimalGroupSize(); // Sanity check: by convention, d-presence must be the first criterion // See analyze() and isAnonymous(Entry) for more details for (int i = 1; i < classBasedCriteria.length; i++) { if (classBasedCriteria[i] instanceof DPresence) { throw new RuntimeException("D-Presence must be the first criterion in the array"); } } // Remember, if (real) d-presence is part of the criteria that must be enforced privacyModelContainsDPresence = (classBasedCriteria.length > 0 && (classBasedCriteria[0] instanceof DPresence) && !(classBasedCriteria[0] instanceof Inclusion)); } /** * Adds a tuple from the buffer * @param generalized * @param other * @param representative * @param count * @param pcount */ public void addFromBuffer(int[] generalized, int[] other, int representative, int count, int pcount) { // Add final int hash = HashTableUtil.hashcode(generalized); final HashGroupifyEntry entry = addInternal(generalized, hash, representative, count, pcount); // Is a other attribute provided if (other != null) { if (entry.distributions == null) { entry.distributions = new Distribution[other.length]; // TODO: Improve! for (int i = 0; i < entry.distributions.length; i++) { entry.distributions[i] = new Distribution(); } } // Only add other value if in research subset if (privacyModelDefinesSubset == null || privacyModelDefinesSubset.contains(representative)) { // TODO: Improve! for (int i = 0; i < entry.distributions.length; i++) { entry.distributions[i].add(other[i]); } } } } /** * Adds an entry from another groupify operator * @param generalized * @param distributions * @param representative * @param count * @param pcount */ public void addFromGroupify(int[] generalized, Distribution[] distributions, int representative, int count, int pcount) { // Add final int hash = HashTableUtil.hashcode(generalized); final HashGroupifyEntry entry = addInternal(generalized, hash, representative, count, pcount); // Is a distribution provided if (distributions != null) { if (entry.distributions == null) { entry.distributions = distributions; } else { // TODO: Improve! for (int i = 0; i < entry.distributions.length; i++) { entry.distributions[i].merge(distributions[i]); } } } } /** * Adds a class from a snapshot * @param generalized * @param elements * @param frequencies * @param representative * @param count * @param pcount */ public void addFromSnapshot(int[] generalized, int[][] elements, int[][] frequencies, int representative, int count, int pcount) { // Add final int hash = HashTableUtil.hashcode(generalized); final HashGroupifyEntry entry = addInternal(generalized, hash, representative, count, pcount); // Is a distribution provided if (elements != null) { if (entry.distributions == null) { entry.distributions = new Distribution[elements.length]; // TODO: Improve! for (int i = 0; i < entry.distributions.length; i++) { entry.distributions[i] = new Distribution(elements[i], frequencies[i]); } } else { // TODO: Improve! for (int i = 0; i < entry.distributions.length; i++) { entry.distributions[i].merge(elements[i], frequencies[i]); } } } } /** * Returns the entry for the given tuple * @param tuple * @return */ public HashGroupifyEntry getEntry(int[] tuple) { final int hash = HashTableUtil.hashcode(tuple); int index = hash & (hashTableBuckets.length - 1); return findEntry(tuple, index, hash); } /** * Returns the first entry * @return */ public HashGroupifyEntry getFirstEquivalenceClass() { return hashTableFirstEntry; } /** * Returns the current size in terms of classes * @return */ public int getNumberOfEquivalenceClasses() { return hashTableElementCount; } /** * Returns whether the current state of the dataset fulfills the minimal class-size property * @return */ public boolean isMinimalClassSizeFulfilled() { return minimalClassSize != Integer.MAX_VALUE && minimalClassSizeFulfilled; } /** * Returns whether the current state of the dataset fulfills the privacy model * @return */ public boolean isPrivacyModelFulfilled() { return privacyModelFulfilled; } /** * Microaggregates all according attributes * @param data * @param start * @param num * @param functions * @param map * @param header * @param dictionary * @return */ public Data performMicroaggregation(int[][] data, int start, int num, DistributionAggregateFunction[] functions, int[] map, String[] header, Dictionary dictionary) { // Prepare result Data result = new Data(new int[data.length][num], header, map, dictionary); // TODO: To improve performance, microaggregation and marking of outliers could be performed in one pass ObjectIntOpenHashMap<Distribution> cache = new ObjectIntOpenHashMap<Distribution>(); for (int row = 0; row < data.length; row++) { if (privacyModelDefinesSubset == null || privacyModelDefinesSubset.contains(row)) { final int[] key = data[row]; final int hash = HashTableUtil.hashcode(key); final int index = hash & (hashTableBuckets.length - 1); HashGroupifyEntry m = hashTableBuckets[index]; while ((m != null) && ((m.hashcode != hash) || !equalsIgnoringOutliers(key, m.key))) { m = m.next; } if (m == null) { throw new RuntimeException("Invalid state! Groupify the data before microaggregation!"); } int dimension = 0; result.getArray()[row] = new int[num]; for (int i = start; i < start + num; i++) { if (!cache.containsKey(m.distributions[i])) { String value = functions[dimension].aggregate(m.distributions[i]); int code = result.getDictionary().register(dimension, value); cache.put(m.distributions[i], code); } result.getArray()[row][dimension] = cache.get(m.distributions[i]); dimension++; } } } // Finalize result.getDictionary().finalizeAll(); // Returns the result return result; } /** * Marks all outliers in the given (generalized subset of the) input datasets * @param data */ public void performSuppression(final int[][] data) { for (int row = 0; row < data.length; row++) { final int[] key = data[row]; if (privacyModelDefinesSubset == null || privacyModelDefinesSubset.contains(row)) { final int hash = HashTableUtil.hashcode(key); final int index = hash & (hashTableBuckets.length - 1); HashGroupifyEntry m = hashTableBuckets[index]; while ((m != null) && ((m.hashcode != hash) || !equalsIgnoringOutliers(key, m.key))) { m = m.next; } if (m == null) { throw new RuntimeException("Invalid state! Groupify the data before marking outliers!"); } if (!m.isNotOutlier) { key[0] |= Data.OUTLIER_MASK; } } else { key[0] |= Data.OUTLIER_MASK; } } } /** * Analyzes the current state * @param transformation * @param force */ public void stateAnalyze(Transformation transformation, boolean force) { if (force) analyzeAll(transformation); else analyzeWithEarlyAbort(transformation); } /** * Clears all entries */ public void stateClear() { if (hashTableElementCount > 0) { this.hashTableElementCount = 0; this.currentNumOutliers = 0; this.hashTableFirstEntry = null; this.hashTableLastEntry = null; HashTableUtil.nullifyArray(hashTableBuckets); } } /** * This method will reset all flags that indicate that equivalence classes are suppressed. */ public void stateResetSuppression() { HashGroupifyEntry entry = hashTableFirstEntry; while (entry != null) { entry.isNotOutlier = true; entry = entry.nextOrdered; } this.currentNumOutliers = 0; } /** * Internal adder method. * * @param generalized the key * @param hash the hash * @param representative * @param count * @param pcount * @return the hash groupify entry */ private HashGroupifyEntry addInternal(final int[] generalized, final int hash, final int representative, int count, final int pcount) { // Find or create entry int index = hash & (hashTableBuckets.length - 1); HashGroupifyEntry entry = findEntry(generalized, index, hash); if (entry == null) { if (++hashTableElementCount > hashTableThreshold) { rehash(); index = hash & (hashTableBuckets.length - 1); } entry = createEntry(generalized, index, hash, representative); } // If we enforce d-presence and the tuple is not contained in the research subset: set its count to zero count = (privacyModelDefinesSubset != null && !privacyModelDefinesSubset.contains(representative)) ? 0 : count; // Track size: private table for d-presence, overall table, else entry.count += count; // Indirectly check if we enforce d-presence if (privacyModelDefinesSubset != null) { // Increase size of tuples from public table entry.pcount += pcount; // This is a tuple from the research subset, but the class is not represented by a tuple from the subset. // Or this is a tuple from the subset with a representative that is smaller than the current representative of the tuple (which is also from the subset) // Reset its representative, which is necessary for rollup / history, because // otherwise subset.contains(tupleID) could potentially return false. // Moreover, we *must* always represent classes by its minimal representative to ensure that roll-ups and snapshots can be // utilized correctly. This is guaranteed, if there is no research subset, and needs to be enforced explicitly, if there is one. // // Consider the following scenario // // 1. Tuple from G1 (Not in subset) // 2. Tuple from G2 (Not in subset) // 3. Tuple from G2 <-Representative // 4. Tuple from G1 <-Representative // // We assume that G1 and G2 collapse in the next grouping operation. // // If we iterate over the whole dataset and always choose the last element, the group is represented by tuple 4 // If we iterate over a snapshot, G1 will be iterated over before G2 (although it has the larger representative), resetting the representative index 3 // // To prevent this, we always choose the smallest index: entry.representative = (count > 0 && (entry.count == count || entry.representative < representative)) ? representative : entry.representative; } // Compute current total number of outliers, if k-anonymity is contained in the set of criteria // TODO: Replace with conditional moves if (entry.count >= minimalClassSize) { if (!entry.isNotOutlier) { entry.isNotOutlier = true; currentNumOutliers -= (entry.count - count); } } else { currentNumOutliers += count; } // Return return entry; } /** * Analyzes the content of the hash table. Checks the privacy criteria against each class. * @param transformation */ private void analyzeAll(Transformation transformation) { // We have only checked k-anonymity so far minimalClassSizeFulfilled = (currentNumOutliers <= suppressionLimit); // Iterate over all classes boolean dpresent = true; currentNumOutliers = 0; HashGroupifyEntry entry = hashTableFirstEntry; while (entry != null) { // Check for anonymity int anonymous = isPrivacyModelFulfilled(transformation, entry); // Determine outliers if (anonymous != -1) { // Note: If d-presence exists, it is stored at criteria[0] by convention. // If it fails, isAnonymous(entry) thus returns 1. // Tuples from the public table that have no matching candidates in the private table // and that do not fulfill d-presence cannot be suppressed. In this case, the whole // transformation must be considered to not fulfill the privacy criteria. if (privacyModelContainsDPresence && entry.count == 0 && anonymous == 1) { dpresent = false; } currentNumOutliers += entry.count; } // We only suppress classes that are contained in the research subset entry.isNotOutlier = entry.count != 0 ? (anonymous == -1) : true; // Next class entry = entry.nextOrdered; } this.analyzeSampleBasedCriteria(transformation, false); this.privacyModelFulfilled = (currentNumOutliers <= suppressionLimit) && dpresent; } /** * Analyze sample-based criteria * @param transformation * @param earlyAbort May we perform an early abort, if we reach the threshold * @return */ private void analyzeSampleBasedCriteria(Transformation transformation, boolean earlyAbort) { // Nothing to do if (this.sampleBasedCriteria.length == 0) { return; } // Build a distribution HashGroupifyDistribution distribution = new HashGroupifyDistribution(heuristicForSampleBasedCriteria ? null : utilityMeasure, transformation, this.hashTableFirstEntry); // For each criterion for (SampleBasedCriterion criterion : this.sampleBasedCriteria) { // Enforce criterion.enforce(distribution, earlyAbort ? this.suppressionLimit : Integer.MAX_VALUE); // Early abort this.currentNumOutliers = distribution.getNumSuppressedRecords(); if (earlyAbort && currentNumOutliers > suppressionLimit) { return; } } } /** * Analyzes the content of the hash table. Checks the privacy criteria against each class. * @param transformation */ private void analyzeWithEarlyAbort(Transformation transformation) { // We have only checked k-anonymity so far minimalClassSizeFulfilled = (currentNumOutliers <= suppressionLimit); // Abort early, if only k-anonymity was specified if (classBasedCriteria.length == 0 && sampleBasedCriteria.length == 0) { privacyModelFulfilled = minimalClassSizeFulfilled; return; } // Abort early, if k-anonymity sub-criterion is not fulfilled // CAUTION: This leaves GroupifyEntry.isNotOutlier and currentOutliers in an inconsistent state // for non-anonymous transformations if (minimalClassSize != Integer.MAX_VALUE && !minimalClassSizeFulfilled) { privacyModelFulfilled = false; return; } // Iterate over all classes currentNumOutliers = 0; HashGroupifyEntry entry = hashTableFirstEntry; while (entry != null) { // Check for anonymity int anonymous = isPrivacyModelFulfilled(transformation, entry); // Determine outliers if (anonymous != -1) { // Note: If d-presence exists, it is stored at criteria[0] by convention. // If it fails, isAnonymous(entry) thus returns 1. // Tuples from the public table that have no matching candidates in the private table // and that do not fulfill d-presence cannot be suppressed. In this case, the whole // transformation must be considered to not fulfill the privacy criteria. // CAUTION: This leaves GroupifyEntry.isNotOutlier and currentOutliers in an inconsistent state // for non-anonymous transformations if (privacyModelContainsDPresence && entry.count == 0 && anonymous == 1) { this.privacyModelFulfilled = false; return; } currentNumOutliers += entry.count; // Break as soon as too many classes are not anonymous // CAUTION: This leaves GroupifyEntry.isNotOutlier and currentOutliers in an inconsistent state // for non-anonymous transformations if (currentNumOutliers > suppressionLimit) { this.privacyModelFulfilled = false; return; } } // We only suppress classes that are contained in the research subset entry.isNotOutlier = entry.count != 0 ? (anonymous == -1) : true; // Next class entry = entry.nextOrdered; } this.analyzeSampleBasedCriteria(transformation, true); this.privacyModelFulfilled = (currentNumOutliers <= suppressionLimit); } /** * Creates a new entry. * * @param key * the key * @param index * the index * @param hash * the hash * @param line * the line * @return the hash groupify entry */ private HashGroupifyEntry createEntry(final int[] key, final int index, final int hash, final int line) { final HashGroupifyEntry entry = new HashGroupifyEntry(key, hash); entry.next = hashTableBuckets[index]; entry.representative = line; hashTableBuckets[index] = entry; if (hashTableFirstEntry == null) { hashTableFirstEntry = entry; hashTableLastEntry = entry; } else { hashTableLastEntry.nextOrdered = entry; hashTableLastEntry = entry; } return entry; } /** * TODO: Ugly!. * * @param a * @param a2 * @return */ private boolean equalsIgnoringOutliers(final int[] a, final int[] a2) { for (int i = 0; i < a.length; i++) { if (a[i] != (a2[i] & Data.REMOVE_OUTLIER_MASK)) { return false; } } return true; } /** * Returns the according entry. * * @param key * the key * @param index * the index * @param keyHash * the key hash * @return the hash groupify entry */ private HashGroupifyEntry findEntry(final int[] key, final int index, final int keyHash) { HashGroupifyEntry m = hashTableBuckets[index]; while ((m != null) && ((m.hashcode != keyHash) || !HashTableUtil.equals(key, m.key))) { m = m.next; } return m; } /** * Checks whether the given entry is anonymous. * @param transformation * @param entry * @return * @returns -1, if all criteria are fulfilled, 0, if minimal group size is not fulfilled, (index+1) if criteria[index] is not fulfilled */ private int isPrivacyModelFulfilled(Transformation transformation, HashGroupifyEntry entry) { // Check minimal group size if (minimalClassSize != Integer.MAX_VALUE && entry.count < minimalClassSize) { return 0; } // Check other criteria // Note: The d-presence criterion must be checked first to ensure correct handling of d-presence with tuple suppression. // This is currently ensured by convention. See ARXConfiguration.getCriteriaAsArray(); for (int i = 0; i < classBasedCriteria.length; i++) { if (!classBasedCriteria[i].isAnonymous(transformation, entry)) { return i + 1; } } return -1; } /** * Rehashes this operator. */ private void rehash() { final int length = HashTableUtil.calculateCapacity((hashTableBuckets.length == 0 ? 1 : hashTableBuckets.length << 1)); final HashGroupifyEntry[] newData = new HashGroupifyEntry[length]; HashGroupifyEntry entry = hashTableFirstEntry; while (entry != null) { final int index = entry.hashcode & (length - 1); entry.next = newData[index]; newData[index] = entry; entry = entry.nextOrdered; } hashTableBuckets = newData; hashTableThreshold = HashTableUtil.calculateThreshold(hashTableBuckets.length, hashTableLoadFactor); } }