/*
* ARX: Powerful Data Anonymization
* Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deidentifier.arx.framework.check.groupify;
import org.deidentifier.arx.ARXConfiguration.ARXConfigurationInternal;
import org.deidentifier.arx.RowSet;
import org.deidentifier.arx.criteria.DPresence;
import org.deidentifier.arx.criteria.Inclusion;
import org.deidentifier.arx.criteria.PrivacyCriterion;
import org.deidentifier.arx.criteria.SampleBasedCriterion;
import org.deidentifier.arx.framework.check.distribution.Distribution;
import org.deidentifier.arx.framework.check.distribution.DistributionAggregateFunction;
import org.deidentifier.arx.framework.data.Data;
import org.deidentifier.arx.framework.data.Dictionary;
import org.deidentifier.arx.framework.lattice.Transformation;
import org.deidentifier.arx.metric.Metric;
import com.carrotsearch.hppc.ObjectIntOpenHashMap;
/**
* A hash groupify operator. It implements a hash table with chaining and keeps
* track of additional properties per equivalence class
*
* @author Fabian Prasser
* @author Florian Kohlmayer
*/
public class HashGroupify {
/** Class-based privacy criteria; by convention, d-presence (if present) is at index 0 (checked in the constructor). */
private final PrivacyCriterion[] classBasedCriteria;
/** The current number of outliers (suppressed records). */
private int currentNumOutliers;
/** The entry array: bucket heads of the chained hash table. */
private HashGroupifyEntry[] hashTableBuckets;
/** Current number of elements (equivalence classes) in the table. */
private int hashTableElementCount;
/** The first entry in insertion order (head of the nextOrdered list). */
private HashGroupifyEntry hashTableFirstEntry;
/** The last entry in insertion order (tail of the nextOrdered list). */
private HashGroupifyEntry hashTableLastEntry;
/** Load factor. */
private final float hashTableLoadFactor = 0.75f;
/** Maximum number of elements that can be put in this map before having to rehash. */
private int hashTableThreshold;
/** Do we ensure optimality for sample-based criteria (heuristic skips the utility measure when building the distribution) */
private final boolean heuristicForSampleBasedCriteria;
/** The parameter k, if k-anonymity is contained in the set of criteria. Integer.MAX_VALUE appears to act as the "not set" sentinel — see isMinimalClassSizeFulfilled(). */
private final int minimalClassSize;
/** Is the result k-anonymous? Set by stateAnalyze(). */
private boolean minimalClassSizeFulfilled;
/** True, if the contained d-presence criterion is not inclusion. */
private final boolean privacyModelContainsDPresence;
/** The research subset, if d-presence is contained in the set of criteria; null otherwise. */
private final RowSet privacyModelDefinesSubset;
/** Is the result anonymous? Set by stateAnalyze(). */
private boolean privacyModelFulfilled;
/** Sample-based privacy criteria. */
private final SampleBasedCriterion[] sampleBasedCriteria;
/** Allowed tuple outliers (absolute suppression limit). */
private final int suppressionLimit;
/** Utility measure */
private final Metric<?> utilityMeasure;
/**
 * Constructs a new hash groupify operator.
 *
 * @param capacity the initial capacity (rounded up via HashTableUtil.calculateCapacity)
 * @param config the configuration from which criteria, subset and limits are extracted
 * @throws IllegalArgumentException if a d-presence criterion is not the first
 *             class-based criterion (required by convention, see analyzeAll() and
 *             isPrivacyModelFulfilled(Transformation, HashGroupifyEntry))
 */
public HashGroupify(int capacity, final ARXConfigurationInternal config) {

    // Set up the hash table
    capacity = HashTableUtil.calculateCapacity(capacity);
    this.hashTableElementCount = 0;
    this.hashTableBuckets = new HashGroupifyEntry[capacity];
    this.hashTableThreshold = HashTableUtil.calculateThreshold(hashTableBuckets.length, hashTableLoadFactor);

    // Set params
    this.currentNumOutliers = 0;
    this.suppressionLimit = config.getAbsoluteMaxOutliers();
    this.utilityMeasure = config.getQualityModel();
    this.heuristicForSampleBasedCriteria = config.isUseHeuristicForSampleBasedCriteria();

    // Extract research subset, if one is defined
    if (config.getSubset() != null) {
        this.privacyModelDefinesSubset = config.getSubset().getSet();
    } else {
        this.privacyModelDefinesSubset = null;
    }

    // Extract criteria
    this.classBasedCriteria = config.getClassBasedPrivacyModelsAsArray();
    this.sampleBasedCriteria = config.getSampleBasedPrivacyModelsAsArray();
    this.minimalClassSize = config.getMinimalGroupSize();

    // Sanity check: by convention, d-presence must be the first criterion.
    // See analyze() and isAnonymous(Entry) for more details.
    // This is a violated precondition on the supplied configuration, so throw
    // IllegalArgumentException rather than a raw RuntimeException; callers
    // catching RuntimeException are unaffected, as it is a subclass.
    for (int i = 1; i < classBasedCriteria.length; i++) {
        if (classBasedCriteria[i] instanceof DPresence) {
            throw new IllegalArgumentException("D-Presence must be the first criterion in the array");
        }
    }

    // Remember, if (real) d-presence is part of the criteria that must be enforced
    privacyModelContainsDPresence = (classBasedCriteria.length > 0 &&
                                     (classBasedCriteria[0] instanceof DPresence) &&
                                     !(classBasedCriteria[0] instanceof Inclusion));
}
/**
 * Adds a tuple read from the data buffer to this operator.
 *
 * @param generalized the generalized quasi-identifier values (the class key)
 * @param other additional attribute values to be tracked as distributions; may be null
 * @param representative the row index of this tuple
 * @param count the number of occurrences in the (private) table
 * @param pcount the number of occurrences in the public table
 */
public void addFromBuffer(int[] generalized, int[] other, int representative, int count, int pcount) {

    // Find or create the equivalence class
    final HashGroupifyEntry entry = addInternal(generalized, HashTableUtil.hashcode(generalized), representative, count, pcount);

    // Nothing more to do without additional attributes
    if (other == null) {
        return;
    }

    // Lazily create one frequency distribution per additional attribute
    if (entry.distributions == null) {
        final Distribution[] distributions = new Distribution[other.length];
        // TODO: Improve!
        for (int dim = 0; dim < distributions.length; dim++) {
            distributions[dim] = new Distribution();
        }
        entry.distributions = distributions;
    }

    // Values are only recorded for tuples contained in the research subset
    if (privacyModelDefinesSubset == null || privacyModelDefinesSubset.contains(representative)) {
        // TODO: Improve!
        for (int dim = 0; dim < entry.distributions.length; dim++) {
            entry.distributions[dim].add(other[dim]);
        }
    }
}
/**
 * Adds an entry from another groupify operator.
 *
 * @param generalized the generalized quasi-identifier values (the class key)
 * @param distributions pre-built frequency distributions to adopt or merge; may be null
 * @param representative the row index representing the class
 * @param count the number of occurrences in the (private) table
 * @param pcount the number of occurrences in the public table
 */
public void addFromGroupify(int[] generalized, Distribution[] distributions, int representative, int count, int pcount) {

    // Find or create the equivalence class
    final HashGroupifyEntry entry = addInternal(generalized, HashTableUtil.hashcode(generalized), representative, count, pcount);

    // Nothing more to do without distributions
    if (distributions == null) {
        return;
    }

    if (entry.distributions == null) {
        // First occurrence: adopt the provided distributions as-is
        entry.distributions = distributions;
    } else {
        // Merge the provided distributions into the existing ones
        // TODO: Improve!
        for (int dim = 0; dim < entry.distributions.length; dim++) {
            entry.distributions[dim].merge(distributions[dim]);
        }
    }
}
/**
 * Adds a class from a snapshot.
 *
 * @param generalized the generalized quasi-identifier values (the class key)
 * @param elements per-attribute value arrays from the snapshot; may be null
 * @param frequencies per-attribute frequency arrays matching {@code elements}
 * @param representative the row index representing the class
 * @param count the number of occurrences in the (private) table
 * @param pcount the number of occurrences in the public table
 */
public void addFromSnapshot(int[] generalized, int[][] elements, int[][] frequencies, int representative, int count, int pcount) {

    // Find or create the equivalence class
    final HashGroupifyEntry entry = addInternal(generalized, HashTableUtil.hashcode(generalized), representative, count, pcount);

    // Nothing more to do without snapshot data
    if (elements == null) {
        return;
    }

    if (entry.distributions == null) {
        // First occurrence: materialize distributions directly from the snapshot
        final Distribution[] distributions = new Distribution[elements.length];
        // TODO: Improve!
        for (int dim = 0; dim < distributions.length; dim++) {
            distributions[dim] = new Distribution(elements[dim], frequencies[dim]);
        }
        entry.distributions = distributions;
    } else {
        // Merge the snapshot into the existing distributions
        // TODO: Improve!
        for (int dim = 0; dim < entry.distributions.length; dim++) {
            entry.distributions[dim].merge(elements[dim], frequencies[dim]);
        }
    }
}
/**
 * Returns the entry for the given tuple, or null if no such class exists.
 *
 * @param tuple the generalized tuple to look up
 * @return the matching equivalence class, or null
 */
public HashGroupifyEntry getEntry(int[] tuple) {
    final int hash = HashTableUtil.hashcode(tuple);
    return findEntry(tuple, hash & (hashTableBuckets.length - 1), hash);
}
/**
 * Returns the first equivalence class in insertion order, or null if the
 * operator is empty. Subsequent classes are reachable via nextOrdered.
 *
 * @return the first entry, or null
 */
public HashGroupifyEntry getFirstEquivalenceClass() {
    return this.hashTableFirstEntry;
}
/**
 * Returns the current size of this operator in terms of equivalence classes.
 *
 * @return the number of classes currently stored
 */
public int getNumberOfEquivalenceClasses() {
    return this.hashTableElementCount;
}
/**
 * Returns whether the current state of the dataset fulfills the minimal
 * class-size property. Always false when no minimal class size is configured
 * (sentinel Integer.MAX_VALUE).
 *
 * @return true, if a minimal class size is configured and fulfilled
 */
public boolean isMinimalClassSizeFulfilled() {
    return minimalClassSizeFulfilled && (minimalClassSize != Integer.MAX_VALUE);
}
/**
 * Returns whether the current state of the dataset fulfills the privacy model,
 * as determined by the last call to stateAnalyze().
 *
 * @return true, if the privacy model is fulfilled
 */
public boolean isPrivacyModelFulfilled() {
    return this.privacyModelFulfilled;
}
/**
 * Microaggregates all according attributes and returns the result as a new Data object.
 * For each row contained in the research subset (or every row, if no subset is defined),
 * the row's equivalence class is looked up and each of its distributions is aggregated
 * via the matching function; equal Distribution objects share one dictionary code via a cache.
 *
 * @param data the groupified input rows (keys may carry the outlier flag)
 * @param start index of the first attribute (column) to microaggregate
 * @param num number of attributes to microaggregate
 * @param functions one aggregate function per microaggregated attribute (indexed 0..num-1)
 * @param map column map passed through to the resulting Data object
 * @param header column header passed through to the resulting Data object
 * @param dictionary dictionary into which aggregated values are registered
 * @return a new Data object holding the aggregated values; rows outside the subset are left untouched
 */
public Data performMicroaggregation(int[][] data,
                                    int start,
                                    int num,
                                    DistributionAggregateFunction[] functions,
                                    int[] map,
                                    String[] header,
                                    Dictionary dictionary) {

    // Prepare result
    Data result = new Data(new int[data.length][num], header, map, dictionary);

    // TODO: To improve performance, microaggregation and marking of outliers could be performed in one pass
    // Cache: Distribution object -> dictionary code, so identical distributions are aggregated once.
    // NOTE(review): cache is keyed on the Distribution object itself — presumably
    // hashing/equality behave as intended for shared Distribution instances; confirm
    // whether Distribution overrides equals()/hashCode().
    ObjectIntOpenHashMap<Distribution> cache = new ObjectIntOpenHashMap<Distribution>();
    for (int row = 0; row < data.length; row++) {
        if (privacyModelDefinesSubset == null || privacyModelDefinesSubset.contains(row)) {
            final int[] key = data[row];
            final int hash = HashTableUtil.hashcode(key);
            final int index = hash & (hashTableBuckets.length - 1);
            // Find the row's class; stored keys may carry the outlier flag, so compare with masking
            HashGroupifyEntry m = hashTableBuckets[index];
            while ((m != null) && ((m.hashcode != hash) || !equalsIgnoringOutliers(key, m.key))) {
                m = m.next;
            }
            if (m == null) { throw new RuntimeException("Invalid state! Groupify the data before microaggregation!"); }
            int dimension = 0;
            // NOTE(review): the result rows were already allocated as new int[data.length][num]
            // above — this per-row allocation looks redundant; confirm Data does not alias rows.
            result.getArray()[row] = new int[num];
            for (int i = start; i < start + num; i++) {
                // Aggregate each distribution once; reuse the registered dictionary code afterwards
                if (!cache.containsKey(m.distributions[i])) {
                    String value = functions[dimension].aggregate(m.distributions[i]);
                    int code = result.getDictionary().register(dimension, value);
                    cache.put(m.distributions[i], code);
                }
                result.getArray()[row][dimension] = cache.get(m.distributions[i]);
                dimension++;
            }
        }
    }

    // Finalize
    result.getDictionary().finalizeAll();

    // Returns the result
    return result;
}
/**
 * Marks all outliers in the given (generalized subset of the) input dataset by
 * setting the outlier bit on each suppressed record's first value.
 *
 * @param data the groupified rows to mark in place
 */
public void performSuppression(final int[][] data) {

    for (int row = 0; row < data.length; row++) {
        final int[] record = data[row];

        // Records outside the research subset are always suppressed
        if (privacyModelDefinesSubset != null && !privacyModelDefinesSubset.contains(row)) {
            record[0] |= Data.OUTLIER_MASK;
            continue;
        }

        // Locate the record's class, ignoring outlier bits in stored keys
        final int hash = HashTableUtil.hashcode(record);
        HashGroupifyEntry entry = hashTableBuckets[hash & (hashTableBuckets.length - 1)];
        while (entry != null && (entry.hashcode != hash || !equalsIgnoringOutliers(record, entry.key))) {
            entry = entry.next;
        }
        if (entry == null) {
            throw new RuntimeException("Invalid state! Groupify the data before marking outliers!");
        }

        // Mark the record if its class is suppressed
        if (!entry.isNotOutlier) {
            record[0] |= Data.OUTLIER_MASK;
        }
    }
}
/**
 * Analyzes the current state against the privacy model.
 *
 * @param transformation the transformation that produced the current grouping
 * @param force if true, all classes are analyzed; otherwise the analysis may
 *            abort early as soon as the model is known to be violated
 */
public void stateAnalyze(Transformation transformation, boolean force) {
    if (force) {
        analyzeAll(transformation);
    } else {
        analyzeWithEarlyAbort(transformation);
    }
}
/**
 * Clears all entries and resets the outlier count. A no-op when the operator
 * is already empty.
 */
public void stateClear() {
    if (hashTableElementCount == 0) {
        return;
    }
    this.hashTableElementCount = 0;
    this.currentNumOutliers = 0;
    this.hashTableFirstEntry = null;
    this.hashTableLastEntry = null;
    // Drop all bucket chains
    HashTableUtil.nullifyArray(hashTableBuckets);
}
/**
 * Resets all flags that indicate that equivalence classes are suppressed, and
 * zeroes the outlier count.
 */
public void stateResetSuppression() {
    for (HashGroupifyEntry entry = hashTableFirstEntry; entry != null; entry = entry.nextOrdered) {
        entry.isNotOutlier = true;
    }
    this.currentNumOutliers = 0;
}
/**
 * Internal adder method: finds or creates the equivalence class for the given key,
 * then updates its counts, its representative and the global outlier count.
 *
 * @param generalized the key (generalized quasi-identifier values)
 * @param hash the precomputed hash of the key
 * @param representative the row index of the tuple being added
 * @param count the number of occurrences in the (private) table
 * @param pcount the number of occurrences in the public table (only used with d-presence)
 * @return the hash groupify entry for the class
 */
private HashGroupifyEntry addInternal(final int[] generalized, final int hash, final int representative, int count, final int pcount) {

    // Find or create entry
    int index = hash & (hashTableBuckets.length - 1);
    HashGroupifyEntry entry = findEntry(generalized, index, hash);
    if (entry == null) {
        if (++hashTableElementCount > hashTableThreshold) {
            rehash();
            // The table has grown: recompute the bucket index
            index = hash & (hashTableBuckets.length - 1);
        }
        entry = createEntry(generalized, index, hash, representative);
    }

    // If we enforce d-presence and the tuple is not contained in the research subset: set its count to zero
    count = (privacyModelDefinesSubset != null && !privacyModelDefinesSubset.contains(representative)) ? 0 : count;

    // Track size: private table for d-presence, overall table, else
    entry.count += count;

    // Indirectly check if we enforce d-presence
    if (privacyModelDefinesSubset != null) {

        // Increase size of tuples from public table
        entry.pcount += pcount;

        // This is a tuple from the research subset, but the class is not represented by a tuple from the subset.
        // Or this is a tuple from the subset with a representative that is smaller than the current representative of the tuple (which is also from the subset)
        // Reset its representative, which is necessary for rollup / history, because
        // otherwise subset.contains(tupleID) could potentially return false.
        // Moreover, we *must* always represent classes by its minimal representative to ensure that roll-ups and snapshots can be
        // utilized correctly. This is guaranteed, if there is no research subset, and needs to be enforced explicitly, if there is one.
        //
        // Consider the following scenario
        //
        // 1. Tuple from G1 (Not in subset)
        // 2. Tuple from G2 (Not in subset)
        // 3. Tuple from G2 <-Representative
        // 4. Tuple from G1 <-Representative
        //
        // We assume that G1 and G2 collapse in the next grouping operation.
        //
        // If we iterate over the whole dataset and always choose the last element, the group is represented by tuple 4
        // If we iterate over a snapshot, G1 will be iterated over before G2 (although it has the larger representative), resetting the representative index 3
        //
        // To prevent this, we always choose the smallest index:
        // NOTE(review): the comment above says the *smallest* index is chosen, but the
        // condition below replaces the representative when entry.representative < representative,
        // i.e. it keeps the *larger* subset index (or the first subset tuple, when
        // entry.count == count). Confirm which semantics is intended before changing anything.
        entry.representative = (count > 0 && (entry.count == count || entry.representative < representative)) ? representative : entry.representative;
    }

    // Compute current total number of outliers, if k-anonymity is contained in the set of criteria
    // TODO: Replace with conditional moves
    if (entry.count >= minimalClassSize) {
        // The class has just reached (or already exceeds) minimal size: if it was still
        // counted as outlying, remove its previously counted tuples from the total
        if (!entry.isNotOutlier) {
            entry.isNotOutlier = true;
            currentNumOutliers -= (entry.count - count);
        }
    } else {
        // Class still below minimal size: the new tuples count as outliers
        currentNumOutliers += count;
    }

    // Return
    return entry;
}
/**
 * Analyzes the content of the hash table. Checks the privacy criteria against each class.
 * Unlike analyzeWithEarlyAbort(Transformation), this variant always visits every class and
 * leaves consistent per-entry suppression flags behind.
 *
 * @param transformation the transformation that produced the current grouping
 */
private void analyzeAll(Transformation transformation) {

    // We have only checked k-anonymity so far
    minimalClassSizeFulfilled = (currentNumOutliers <= suppressionLimit);

    // Iterate over all classes
    boolean dpresent = true;
    currentNumOutliers = 0;
    HashGroupifyEntry entry = hashTableFirstEntry;
    while (entry != null) {

        // Check for anonymity: -1 = all criteria fulfilled, 0 = minimal class size violated,
        // i+1 = classBasedCriteria[i] violated (see isPrivacyModelFulfilled(Transformation, Entry))
        int anonymous = isPrivacyModelFulfilled(transformation, entry);

        // Determine outliers
        if (anonymous != -1) {

            // Note: If d-presence exists, it is stored at criteria[0] by convention.
            // If it fails, isAnonymous(entry) thus returns 1.
            // Tuples from the public table that have no matching candidates in the private table
            // and that do not fulfill d-presence cannot be suppressed. In this case, the whole
            // transformation must be considered to not fulfill the privacy criteria.
            if (privacyModelContainsDPresence && entry.count == 0 && anonymous == 1) {
                dpresent = false;
            }

            currentNumOutliers += entry.count;
        }

        // We only suppress classes that are contained in the research subset
        // (count == 0 means no subset tuples, so the class is never suppressed)
        entry.isNotOutlier = entry.count != 0 ? (anonymous == -1) : true;

        // Next class
        entry = entry.nextOrdered;
    }
    // Sample-based criteria may suppress further records (updates currentNumOutliers)
    this.analyzeSampleBasedCriteria(transformation, false);
    this.privacyModelFulfilled = (currentNumOutliers <= suppressionLimit) && dpresent;
}
/**
 * Enforces all sample-based criteria against the current set of classes,
 * updating the number of suppressed records.
 *
 * @param transformation the transformation that produced the current grouping
 * @param earlyAbort if true, enforcement may stop once the suppression limit is exceeded
 */
private void analyzeSampleBasedCriteria(Transformation transformation, boolean earlyAbort) {

    // Nothing to do
    if (this.sampleBasedCriteria.length == 0) {
        return;
    }

    // Build a distribution; with the heuristic enabled no utility measure is used
    final Metric<?> metric = heuristicForSampleBasedCriteria ? null : utilityMeasure;
    final HashGroupifyDistribution distribution = new HashGroupifyDistribution(metric,
                                                                               transformation,
                                                                               this.hashTableFirstEntry);

    // Enforce each criterion in turn
    final int threshold = earlyAbort ? this.suppressionLimit : Integer.MAX_VALUE;
    for (final SampleBasedCriterion criterion : this.sampleBasedCriteria) {
        criterion.enforce(distribution, threshold);

        // Track suppressed records and potentially abort early
        this.currentNumOutliers = distribution.getNumSuppressedRecords();
        if (earlyAbort && currentNumOutliers > suppressionLimit) {
            return;
        }
    }
}
/**
 * Analyzes the content of the hash table. Checks the privacy criteria against each class.
 * This variant aborts as soon as the model is known to be violated; several abort paths
 * deliberately leave per-entry flags and the outlier count inconsistent (see CAUTION notes),
 * which is only acceptable because the transformation is then rejected as a whole.
 *
 * @param transformation the transformation that produced the current grouping
 */
private void analyzeWithEarlyAbort(Transformation transformation) {

    // We have only checked k-anonymity so far
    minimalClassSizeFulfilled = (currentNumOutliers <= suppressionLimit);

    // Abort early, if only k-anonymity was specified
    if (classBasedCriteria.length == 0 && sampleBasedCriteria.length == 0) {
        privacyModelFulfilled = minimalClassSizeFulfilled;
        return;
    }

    // Abort early, if k-anonymity sub-criterion is not fulfilled
    // CAUTION: This leaves GroupifyEntry.isNotOutlier and currentOutliers in an inconsistent state
    // for non-anonymous transformations
    if (minimalClassSize != Integer.MAX_VALUE && !minimalClassSizeFulfilled) {
        privacyModelFulfilled = false;
        return;
    }

    // Iterate over all classes
    currentNumOutliers = 0;
    HashGroupifyEntry entry = hashTableFirstEntry;
    while (entry != null) {

        // Check for anonymity: -1 = all criteria fulfilled, 0 = minimal class size violated,
        // i+1 = classBasedCriteria[i] violated (see isPrivacyModelFulfilled(Transformation, Entry))
        int anonymous = isPrivacyModelFulfilled(transformation, entry);

        // Determine outliers
        if (anonymous != -1) {

            // Note: If d-presence exists, it is stored at criteria[0] by convention.
            // If it fails, isAnonymous(entry) thus returns 1.
            // Tuples from the public table that have no matching candidates in the private table
            // and that do not fulfill d-presence cannot be suppressed. In this case, the whole
            // transformation must be considered to not fulfill the privacy criteria.
            // CAUTION: This leaves GroupifyEntry.isNotOutlier and currentOutliers in an inconsistent state
            // for non-anonymous transformations
            if (privacyModelContainsDPresence && entry.count == 0 && anonymous == 1) {
                this.privacyModelFulfilled = false;
                return;
            }
            currentNumOutliers += entry.count;

            // Break as soon as too many classes are not anonymous
            // CAUTION: This leaves GroupifyEntry.isNotOutlier and currentOutliers in an inconsistent state
            // for non-anonymous transformations
            if (currentNumOutliers > suppressionLimit) {
                this.privacyModelFulfilled = false;
                return;
            }
        }

        // We only suppress classes that are contained in the research subset
        // (count == 0 means no subset tuples, so the class is never suppressed)
        entry.isNotOutlier = entry.count != 0 ? (anonymous == -1) : true;

        // Next class
        entry = entry.nextOrdered;
    }
    // Sample-based criteria may suppress further records (updates currentNumOutliers),
    // and may themselves abort early against the suppression limit
    this.analyzeSampleBasedCriteria(transformation, true);
    this.privacyModelFulfilled = (currentNumOutliers <= suppressionLimit);
}
/**
 * Creates a new entry, prepends it to the bucket's collision chain and appends
 * it to the global insertion-ordered list.
 *
 * @param key the class key
 * @param index the bucket index
 * @param hash the precomputed hash of the key
 * @param line the row index used as the initial representative
 * @return the newly created entry
 */
private HashGroupifyEntry createEntry(final int[] key, final int index, final int hash, final int line) {
    final HashGroupifyEntry entry = new HashGroupifyEntry(key, hash);
    entry.representative = line;

    // Prepend to the bucket's collision chain
    entry.next = hashTableBuckets[index];
    hashTableBuckets[index] = entry;

    // Append to the insertion-ordered list
    if (hashTableFirstEntry == null) {
        hashTableFirstEntry = entry;
    } else {
        hashTableLastEntry.nextOrdered = entry;
    }
    hashTableLastEntry = entry;
    return entry;
}
/**
 * Compares a plain key against a stored key, masking out the outlier flag in
 * the stored key before comparing each dimension.
 *
 * @param key the probe key (without outlier bits)
 * @param storedKey the key kept in the hash table (may carry the outlier flag)
 * @return true, if all dimensions match after masking
 */
private boolean equalsIgnoringOutliers(final int[] key, final int[] storedKey) {
    final int length = key.length;
    for (int dim = 0; dim < length; dim++) {
        if ((storedKey[dim] & Data.REMOVE_OUTLIER_MASK) != key[dim]) {
            return false;
        }
    }
    return true;
}
/**
 * Returns the entry with the given key from the bucket's collision chain, or
 * null if no exact match exists.
 *
 * @param key the class key to look for
 * @param index the bucket index
 * @param keyHash the precomputed hash of the key
 * @return the matching entry, or null
 */
private HashGroupifyEntry findEntry(final int[] key, final int index, final int keyHash) {
    HashGroupifyEntry entry = hashTableBuckets[index];
    while (entry != null) {
        // Cheap hash comparison first, full key comparison only on a hash match
        if (entry.hashcode == keyHash && HashTableUtil.equals(key, entry.key)) {
            break;
        }
        entry = entry.next;
    }
    return entry;
}
/**
 * Checks whether the given entry is anonymous with regard to the minimal class
 * size and all class-based criteria.
 *
 * @param transformation the transformation that produced the current grouping
 * @param entry the equivalence class to check
 * @return -1, if all criteria are fulfilled; 0, if the minimal group size is not
 *         fulfilled; (index+1), if classBasedCriteria[index] is not fulfilled
 */
private int isPrivacyModelFulfilled(Transformation transformation, HashGroupifyEntry entry) {

    // Check minimal group size first
    if (minimalClassSize != Integer.MAX_VALUE && entry.count < minimalClassSize) {
        return 0;
    }

    // Check class-based criteria.
    // Note: The d-presence criterion must be checked first to ensure correct handling of
    // d-presence with tuple suppression. This is currently ensured by convention.
    // See ARXConfiguration.getCriteriaAsArray();
    int result = 0;
    for (final PrivacyCriterion criterion : classBasedCriteria) {
        result++;
        if (!criterion.isAnonymous(transformation, entry)) {
            return result;
        }
    }
    return -1;
}
/**
 * Rehashes this operator: doubles the bucket array and redistributes all
 * entries by walking the insertion-ordered list.
 */
private void rehash() {

    // Grow to (at least) double the current capacity
    final int newCapacity = HashTableUtil.calculateCapacity(hashTableBuckets.length == 0 ? 1 : hashTableBuckets.length << 1);
    final HashGroupifyEntry[] newBuckets = new HashGroupifyEntry[newCapacity];

    // Re-chain every entry into its new bucket
    for (HashGroupifyEntry entry = hashTableFirstEntry; entry != null; entry = entry.nextOrdered) {
        final int index = entry.hashcode & (newCapacity - 1);
        entry.next = newBuckets[index];
        newBuckets[index] = entry;
    }

    hashTableBuckets = newBuckets;
    hashTableThreshold = HashTableUtil.calculateThreshold(hashTableBuckets.length, hashTableLoadFactor);
}
}