/*
* ARX: Powerful Data Anonymization
* Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deidentifier.arx.framework.data;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import org.deidentifier.arx.DataDefinition;
import org.deidentifier.arx.DataGeneralizationScheme;
import org.deidentifier.arx.DataSubset;
import org.deidentifier.arx.DataType;
import org.deidentifier.arx.RowSet;
import org.deidentifier.arx.aggregates.HierarchyBuilder;
import org.deidentifier.arx.aggregates.HierarchyBuilderIntervalBased;
import org.deidentifier.arx.aggregates.HierarchyBuilderRedactionBased;
import org.deidentifier.arx.criteria.EDDifferentialPrivacy;
import org.deidentifier.arx.criteria.HierarchicalDistanceTCloseness;
import org.deidentifier.arx.criteria.PrivacyCriterion;
import org.deidentifier.arx.framework.check.distribution.DistributionAggregateFunction;
import org.deidentifier.arx.framework.check.distribution.DistributionAggregateFunction.DistributionAggregateFunctionGeneralization;
import org.deidentifier.arx.metric.v2.DomainShare;
import org.deidentifier.arx.metric.v2.DomainShareInterval;
import org.deidentifier.arx.metric.v2.DomainShareMaterialized;
import org.deidentifier.arx.metric.v2.DomainShareRedaction;
import cern.colt.Sorting;
import cern.colt.function.IntComparator;
import com.carrotsearch.hppc.IntObjectOpenHashMap;
import com.carrotsearch.hppc.IntOpenHashSet;
/**
* Holds all data needed for the anonymization process.
*
* @author Fabian Prasser
* @author Florian Kohlmayer
*/
public class DataManager {
/**
 * Internal representation of attribute types. Quasi-identifiers are split
 * into the ones to which generalization is applied and the ones to which
 * microaggregation is applied. The constant values are used as tags in the
 * flattened column map built by the constructor (map[i*2]) and switched on
 * in encode(), so they must not be changed.
 *
 * @author Florian Kohlmayer
 * @author Fabian Prasser
 *
 */
public static class AttributeTypeInternal {
    /** Identifying attribute: dropped entirely during encoding */
    public static final int IDENTIFYING = 3;
    /** Insensitive attribute: stored unmodified in the static buffer */
    public static final int INSENSITIVE = 2;
    /** Quasi-identifier to which generalization is applied */
    public static final int QUASI_IDENTIFYING_GENERALIZED = 0;
    /** Quasi-identifier to which microaggregation is applied */
    public static final int QUASI_IDENTIFYING_MICROAGGREGATED = 4;
    /** Sensitive attribute: stored in the analyzed buffer */
    public static final int SENSITIVE = 1;
}
/** The data to be analyzed: sensitive attributes first, then microaggregated ones. */
private final Data dataAnalyzed;
/** The data which is generalized */
private final Data dataGeneralized;
/** The data which is insensitive (kept as-is) */
private final Data dataStatic;
/** The data definition */
private final DataDefinition definition;
/** The domain shares, built lazily by getDomainShares() */
private DomainShare[] shares;
/** The original input header. */
private final String[] header;
/** The generalization hierarchies, one per generalized QI. */
private final GeneralizationHierarchy[] hierarchiesGeneralized;
/** The hierarchy heights for each generalized QI. */
private final int[] hierarchiesHeights;
/** Generalization hierarchies for sensitive attributes (hierarchical t-closeness). */
private final Map<String, GeneralizationHierarchy> hierarchiesSensitive;
/** The data types of sensitive attributes. */
private final Map<String, DataType<?>> dataTypesSensitive;
/** The indexes of sensitive attributes within the analyzed buffer. */
private final Map<String, Integer> indexesSensitive;
/** The maximum generalization level for each QI. */
private final int[] maxLevels;
/** The microaggregation functions, one per microaggregated attribute. */
private final DistributionAggregateFunction[] microaggregationFunctions;
/** Header for microaggregated attributes */
private final String[] microaggregationHeader;
/** Maps microaggregated attributes to their input column indices */
private final int[] microaggregationMap;
/** Dictionary domain sizes for the microaggregated attributes */
private final int[] microaggregationDomainSizes;
/** The number of microaggregation attributes in the analyzed buffer */
private final int microaggregationNumAttributes;
/** The start index of the microaggregation attributes in the analyzed buffer */
private final int microaggregationStartIndex;
/** The minimum generalization level for each QI. */
private final int[] minLevels;
/** The research subset, if any. */
private RowSet subset = null;
/** The size of the research subset. */
private int subsetSize = 0;
/**
 * Creates a new data manager from pre-encoded data. Splits the input columns
 * into three buffers (generalized QIs, analyzed attributes, insensitive
 * attributes), builds generalization hierarchies, initializes
 * microaggregation functions and extracts the research subset, if any.
 *
 * @param header the original column names, in input order
 * @param data the pre-encoded data, indexed as data[row][column]
 * @param dictionary the dictionary used for pre-encoding
 * @param definition the attribute definitions
 * @param criteria the privacy criteria to prepare for
 * @param functions microaggregation functions, keyed by attribute name
 */
public DataManager(final String[] header,
                   final int[][] data,
                   final Dictionary dictionary,
                   final DataDefinition definition,
                   final Set<PrivacyCriterion> criteria,
                   final Map<String, DistributionAggregateFunction> functions) {

    // Store columns for reordering the output
    this.header = header;
    this.definition = definition;

    // Attribute names, grouped by their internal type
    Set<String> attributesGeneralized = definition.getQuasiIdentifiersWithGeneralization();
    Set<String> attributesSensitive = definition.getSensitiveAttributes();
    Set<String> attributesMicroaggregated = definition.getQuasiIdentifiersWithMicroaggregation();
    Set<String> attributesInsensitive = definition.getInsensitiveAttributes();

    // Init dictionary: one per output buffer. The analyzed buffer holds the
    // sensitive attributes first, followed by the microaggregated ones.
    final Dictionary dictionaryGeneralized = new Dictionary(attributesGeneralized.size());
    final Dictionary dictionaryAnalyzed = new Dictionary(attributesSensitive.size() + attributesMicroaggregated.size());
    final Dictionary dictionaryStatic = new Dictionary(attributesInsensitive.size());

    // Init maps for reordering the output (buffer column -> input column)
    final int[] mapGeneralized = new int[dictionaryGeneralized.getNumDimensions()];
    final int[] mapAnalyzed = new int[dictionaryAnalyzed.getNumDimensions()];
    final int[] mapStatic = new int[dictionaryStatic.getNumDimensions()];
    this.microaggregationMap = new int[attributesMicroaggregated.size()];

    // Indexes: one running counter per buffer; microaggregated attributes
    // start after the sensitive ones within the analyzed buffer
    this.microaggregationStartIndex = attributesSensitive.size();
    this.microaggregationNumAttributes = attributesMicroaggregated.size();
    int indexStatic = 0;
    int indexGeneralized = 0;
    int indexAnalyzed = 0;
    int indexSensitive = 0;
    int indexMicroaggregated = this.microaggregationStartIndex;
    int counter = 0;

    // A map for column indices. map[i*2]=attribute type, map[i*2+1]=index position.
    final int[] map = new int[header.length * 2];
    final String[] headerGH = new String[dictionaryGeneralized.getNumDimensions()];
    final String[] headerDI = new String[dictionaryAnalyzed.getNumDimensions()];
    final String[] headerIS = new String[dictionaryStatic.getNumDimensions()];
    this.microaggregationHeader = new String[attributesMicroaggregated.size()];
    this.dataTypesSensitive = new HashMap<>();

    // Classify each input column, record its target buffer position and
    // register its dictionary values with the corresponding dictionary
    for (final String column : header) {
        final int idx = counter * 2;
        if (attributesGeneralized.contains(column)) {
            map[idx] = AttributeTypeInternal.QUASI_IDENTIFYING_GENERALIZED;
            map[idx + 1] = indexGeneralized;
            mapGeneralized[indexGeneralized] = counter;
            dictionaryGeneralized.registerAll(indexGeneralized, dictionary, counter);
            headerGH[indexGeneralized] = header[counter];
            indexGeneralized++;
        } else if (attributesMicroaggregated.contains(column)) {
            map[idx] = AttributeTypeInternal.QUASI_IDENTIFYING_MICROAGGREGATED;
            map[idx + 1] = indexMicroaggregated;
            mapAnalyzed[indexMicroaggregated] = counter;
            dictionaryAnalyzed.registerAll(indexMicroaggregated, dictionary, counter);
            headerDI[indexMicroaggregated] = header[counter];
            indexMicroaggregated++;
            microaggregationMap[indexAnalyzed] = counter;
            microaggregationHeader[indexAnalyzed] = header[counter];
            indexAnalyzed++;
        } else if (attributesInsensitive.contains(column)) {
            map[idx] = AttributeTypeInternal.INSENSITIVE;
            map[idx + 1] = indexStatic;
            mapStatic[indexStatic] = counter;
            dictionaryStatic.registerAll(indexStatic, dictionary, counter);
            headerIS[indexStatic] = header[counter];
            indexStatic++;
        } else if (attributesSensitive.contains(column)) {
            map[idx] = AttributeTypeInternal.SENSITIVE;
            map[idx + 1] = indexSensitive;
            mapAnalyzed[indexSensitive] = counter;
            dictionaryAnalyzed.registerAll(indexSensitive, dictionary, counter);
            headerDI[indexSensitive] = header[counter];
            indexSensitive++;
            dataTypesSensitive.put(column, definition.getDataType(column));
        } else {
            // TODO: CHECK: Changed default? - now all undefined attributes
            // are identifying! Previously they were considered sensitive?
            map[idx] = AttributeTypeInternal.IDENTIFYING;
            map[idx + 1] = -1;
        }
        counter++;
    }

    // encode Data: split rows into the three buffers according to the map
    final Data[] ddata = encode(data,
                                map,
                                mapGeneralized,
                                mapAnalyzed,
                                mapStatic,
                                dictionaryGeneralized,
                                dictionaryAnalyzed,
                                dictionaryStatic,
                                headerGH,
                                headerDI,
                                headerIS);
    dataGeneralized = ddata[0];
    dataAnalyzed = ddata[1];
    dataStatic = ddata[2];

    // Initialize minlevels
    minLevels = new int[attributesGeneralized.size()];
    hierarchiesHeights = new int[attributesGeneralized.size()];
    maxLevels = new int[attributesGeneralized.size()];

    // Build hierarchies for the generalized QIs; every generalized QI must
    // have a hierarchy defined
    hierarchiesGeneralized = new GeneralizationHierarchy[attributesGeneralized.size()];
    for (int i = 0; i < header.length; i++) {
        final int idx = i * 2;
        if (attributesGeneralized.contains(header[i]) &&
            map[idx] == AttributeTypeInternal.QUASI_IDENTIFYING_GENERALIZED) {
            final int dictionaryIndex = map[idx + 1];
            final String name = header[i];
            if (definition.getHierarchy(name) != null) {
                hierarchiesGeneralized[dictionaryIndex] = new GeneralizationHierarchy(name,
                                                                                     definition.getHierarchy(name),
                                                                                     dictionaryIndex,
                                                                                     dictionaryGeneralized);
            } else {
                throw new IllegalStateException("No hierarchy available for attribute (" + header[i] + ")");
            }
            // Initialize hierarchy height and minimum / maximum
            // generalization
            hierarchiesHeights[dictionaryIndex] = hierarchiesGeneralized[dictionaryIndex].getArray()[0].length;
            final Integer minGenLevel = definition.getMinimumGeneralization(name);
            minLevels[dictionaryIndex] = minGenLevel == null ? 0 : minGenLevel;
            final Integer maxGenLevel = definition.getMaximumGeneralization(name);
            maxLevels[dictionaryIndex] = maxGenLevel == null ? hierarchiesHeights[dictionaryIndex] - 1 : maxGenLevel;
        }
    }

    // Change min & max, when using (e,d)-DP: the generalization scheme fixes
    // each QI to a single level. Only the first such criterion is considered.
    for (PrivacyCriterion c : criteria) {
        if (c instanceof EDDifferentialPrivacy) {
            DataGeneralizationScheme scheme = ((EDDifferentialPrivacy)c).getGeneralizationScheme();
            for (int i = 0; i < header.length; i++) {
                final int idx = i * 2;
                if (attributesGeneralized.contains(header[i]) &&
                    map[idx] == AttributeTypeInternal.QUASI_IDENTIFYING_GENERALIZED) {
                    minLevels[map[idx + 1]] = scheme.getGeneralizationLevel(header[i], definition);
                    maxLevels[map[idx + 1]] = scheme.getGeneralizationLevel(header[i], definition);
                }
            }
            break;
        }
    }

    // Build map with hierarchies for sensitive attributes
    // (needed for t-closeness with hierarchical ground-distance)
    Map<String, String[][]> sensitiveHierarchies = new HashMap<String, String[][]>();
    for (PrivacyCriterion c : criteria) {
        if (c instanceof HierarchicalDistanceTCloseness) {
            HierarchicalDistanceTCloseness t = (HierarchicalDistanceTCloseness) c;
            sensitiveHierarchies.put(t.getAttribute(), t.getHierarchy().getHierarchy());
        }
    }

    // Build generalization hierarchies for sensitive attributes
    hierarchiesSensitive = new HashMap<String, GeneralizationHierarchy>();
    indexesSensitive = new HashMap<String, Integer>();
    int index = 0;
    for (int i = 0; i < header.length; i++) {
        final String name = header[i];
        final int idx = i * 2;
        if (sensitiveHierarchies.containsKey(name) &&
            map[idx] == AttributeTypeInternal.SENSITIVE) {
            final int dictionaryIndex = map[idx + 1];
            final String[][] hiers = sensitiveHierarchies.get(name);
            if (hiers != null) {
                hierarchiesSensitive.put(name, new GeneralizationHierarchy(name,
                                                                           hiers,
                                                                           dictionaryIndex,
                                                                           dictionaryAnalyzed));
            }
        }
        // Store index for sensitive attributes
        if (attributesSensitive.contains(header[i])) {
            indexesSensitive.put(name, index);
            index++;
        }
    }

    // Build map with hierarchies for microaggregated attributes that use
    // generalization-based aggregate functions
    Map<String, String[][]> maHierarchies = new HashMap<String, String[][]>();
    for (String attribute : functions.keySet()) {
        if (functions.get(attribute) instanceof DistributionAggregateFunctionGeneralization) {
            maHierarchies.put(attribute, definition.getHierarchy(attribute));
        }
    }

    // Build generalization hierarchies for microaggregated attributes
    // NOTE(review): 'index' is reset here but not used in the loop below
    Map<String, int[][]> hierarchiesMA = new HashMap<String, int[][]>();
    index = 0;
    for (int i = 0; i < header.length; i++) {
        final String name = header[i];
        final int idx = i * 2;
        if (maHierarchies.containsKey(name) &&
            map[idx] == AttributeTypeInternal.QUASI_IDENTIFYING_MICROAGGREGATED) {
            final int dictionaryIndex = map[idx + 1];
            final String[][] hiers = maHierarchies.get(name);
            if (hiers != null) {
                hierarchiesMA.put(name, new GeneralizationHierarchy(name,
                                                                    hiers,
                                                                    dictionaryIndex,
                                                                    dictionaryAnalyzed).map);
            }
        }
    }

    // finalize dictionary: no new values may be registered afterwards
    dictionaryGeneralized.finalizeAll();
    dictionaryAnalyzed.finalizeAll();
    dictionaryStatic.finalizeAll();

    // Init microaggregation functions; every microaggregated attribute must
    // have a function defined
    microaggregationFunctions = new DistributionAggregateFunction[attributesMicroaggregated.size()];
    microaggregationDomainSizes = new int[attributesMicroaggregated.size()];
    for (int i = 0; i < header.length; i++) {
        final int idx = i * 2;
        if (attributesMicroaggregated.contains(header[i]) &&
            map[idx] == AttributeTypeInternal.QUASI_IDENTIFYING_MICROAGGREGATED) {
            final int dictionaryIndex = map[idx + 1] - microaggregationStartIndex;
            final String name = header[i];
            if (definition.getMicroAggregationFunction(name) != null) {
                microaggregationDomainSizes[dictionaryIndex] = dictionaryAnalyzed.getMapping()[dictionaryIndex + microaggregationStartIndex].length;
                microaggregationFunctions[dictionaryIndex] = functions.get(name);
                microaggregationFunctions[dictionaryIndex].initialize(dictionaryAnalyzed.getMapping()[dictionaryIndex + microaggregationStartIndex],
                                                                      definition.getDataType(name),
                                                                      hierarchiesMA.get(name));
            } else {
                throw new IllegalStateException("No microaggregation function defined for attribute (" +
                                                header[i] + ")");
            }
        }
    }

    // Store research subset: the first criterion providing one wins
    for (PrivacyCriterion c : criteria) {
        if (c instanceof EDDifferentialPrivacy) {
            ((EDDifferentialPrivacy) c).initialize(this, null);
        }
        if (c.isSubsetAvailable()) {
            DataSubset _subset = c.getDataSubset();
            if (_subset != null) {
                subset = _subset.getSet();
                subsetSize = _subset.getArray().length;
                break;
            }
        }
    }
}
/**
 * Creates a projected instance by adopting all state from an existing
 * manager. Used by {@link #getSubsetInstance(RowSet)}.
 *
 * @param definition the data definition
 * @param dataAnalyzed the analyzed data buffer
 * @param dataGeneralized the generalized data buffer
 * @param dataStatic the static data buffer
 * @param header the original input header
 * @param hierarchiesGeneralized hierarchies of the generalized QIs
 * @param hierarchiesHeights heights of the generalized hierarchies
 * @param hierarchiesSensitive hierarchies of sensitive attributes
 * @param indexesSensitive indexes of sensitive attributes
 * @param maxLevels maximum generalization levels
 * @param microaggregationFunctions microaggregation functions
 * @param microaggregationHeader header of microaggregated attributes
 * @param microaggregationMap map of microaggregated attributes
 * @param microaggregationDomainSizes domain sizes of microaggregated attributes
 * @param microaggregationNumAttributes number of microaggregated attributes
 * @param microaggregationStartIndex start index of microaggregated attributes
 * @param minLevels minimum generalization levels
 * @param dataTypesSensitive data types of sensitive attributes
 */
protected DataManager(DataDefinition definition,
                      Data dataAnalyzed,
                      Data dataGeneralized,
                      Data dataStatic,
                      String[] header,
                      GeneralizationHierarchy[] hierarchiesGeneralized,
                      int[] hierarchiesHeights,
                      Map<String, GeneralizationHierarchy> hierarchiesSensitive,
                      Map<String, Integer> indexesSensitive,
                      int[] maxLevels,
                      DistributionAggregateFunction[] microaggregationFunctions,
                      String[] microaggregationHeader,
                      int[] microaggregationMap,
                      int[] microaggregationDomainSizes,
                      int microaggregationNumAttributes,
                      int microaggregationStartIndex,
                      int[] minLevels,
                      Map<String, DataType<?>> dataTypesSensitive) {

    // Definition and buffers
    this.definition = definition;
    this.header = header;
    this.dataAnalyzed = dataAnalyzed;
    this.dataGeneralized = dataGeneralized;
    this.dataStatic = dataStatic;

    // Generalization state
    this.hierarchiesGeneralized = hierarchiesGeneralized;
    this.hierarchiesHeights = hierarchiesHeights;
    this.minLevels = minLevels;
    this.maxLevels = maxLevels;

    // Sensitive-attribute state
    this.hierarchiesSensitive = hierarchiesSensitive;
    this.indexesSensitive = indexesSensitive;
    this.dataTypesSensitive = dataTypesSensitive;

    // Microaggregation state
    this.microaggregationFunctions = microaggregationFunctions;
    this.microaggregationDomainSizes = microaggregationDomainSizes;
    this.microaggregationHeader = microaggregationHeader;
    this.microaggregationMap = microaggregationMap;
    this.microaggregationNumAttributes = microaggregationNumAttributes;
    this.microaggregationStartIndex = microaggregationStartIndex;

    // The subset is deliberately not adopted: it is only read by
    // getDistribution() and getTree(), and the projected instance delegates
    // those methods to the original data manager
    this.subset = null;
    this.subsetSize = 0;
}
/**
 * Returns the buffer holding the attributes that will be analyzed
 * (sensitive and microaggregated attributes).
 *
 * @return the analyzed data
 */
public Data getDataAnalyzed() {
    return this.dataAnalyzed;
}
/**
 * Returns the buffer holding the quasi-identifiers that will be generalized.
 *
 * @return the generalized data
 */
public Data getDataGeneralized() {
    return this.dataGeneralized;
}
/**
 * Returns the buffer holding the insensitive attributes, which are kept
 * unmodified.
 *
 * @return the static data
 */
public Data getDataStatic() {
    return this.dataStatic;
}
/**
 * Returns the relative frequency distribution of the attribute at the given
 * column index in the given data array. If a research subset is defined,
 * only rows contained in the subset are counted.
 *
 * @param data the encoded data, data[row][column]
 * @param index the column index
 * @param distinctValues the number of distinct values of the attribute
 * @return relative frequencies, indexed by encoded value
 */
public double[] getDistribution(int[][] data, int index, int distinctValues) {

    // Count occurrences per encoded value, restricted to the subset if any
    final int[] counts = new int[distinctValues];
    for (int row = 0; row < data.length; row++) {
        if (subset == null || subset.contains(row)) {
            counts[data[row][index]]++;
        }
    }

    // Normalize counts to relative frequencies
    final double numRecords = subset == null ? data.length : subsetSize;
    final double[] result = new double[distinctValues];
    for (int value = 0; value < distinctValues; value++) {
        result[value] = (double) counts[value] / numRecords;
    }
    return result;
}
/**
 * Returns the distribution of the given sensitive attribute in the original
 * dataset. Required for t-closeness.
 *
 * @param attribute a sensitive attribute
 * @return relative frequencies, indexed by encoded value
 * @throws IllegalArgumentException if the attribute is not sensitive
 */
public double[] getDistribution(String attribute) {

    // Check: a single lookup replaces containsKey() + get()
    final Integer index = indexesSensitive.get(attribute);
    if (index == null) {
        throw new IllegalArgumentException("Attribute " + attribute + " is not sensitive");
    }

    // Delegate to the array-based variant
    final int distinctValues = dataAnalyzed.getDictionary().getMapping()[index].length;
    return getDistribution(dataAnalyzed.getArray(), index, distinctValues);
}
/**
 * Returns the domain shares for all generalized quasi-identifiers, building
 * them lazily on first access. Depending on how the hierarchy of an
 * attribute was created, a redaction-based, interval-based or materialized
 * share implementation is chosen.
 *
 * @return one {@link DomainShare} per column of the generalized data
 */
public DomainShare[] getDomainShares() {

    // Build on-demand
    if (this.shares == null) {

        // Compute domain shares. Hoist the header lookup out of the loop.
        final String[] generalizedHeader = dataGeneralized.getHeader();
        this.shares = new DomainShare[generalizedHeader.length];
        for (int i = 0; i < shares.length; i++) {

            // Extract info
            String attribute = generalizedHeader[i];
            String[][] hierarchy = definition.getHierarchy(attribute);
            HierarchyBuilder<?> builder = definition.getHierarchyBuilder(attribute);

            // Create shares for redaction-based hierarchies.
            // Note: instanceof is false for null, so no explicit null check is needed.
            if (builder instanceof HierarchyBuilderRedactionBased &&
                ((HierarchyBuilderRedactionBased<?>) builder).isDomainPropertiesAvailable()) {
                this.shares[i] = new DomainShareRedaction((HierarchyBuilderRedactionBased<?>) builder);

            // Create shares for interval-based hierarchies
            } else if (builder instanceof HierarchyBuilderIntervalBased) {
                this.shares[i] = new DomainShareInterval<>((HierarchyBuilderIntervalBased<?>) builder,
                                                           hierarchiesGeneralized[i].getArray(),
                                                           dataGeneralized.getDictionary().getMapping()[i]);

            // Create fallback-shares for materialized hierarchies
            } else {
                this.shares[i] = new DomainShareMaterialized(hierarchy,
                                                             dataGeneralized.getDictionary().getMapping()[i],
                                                             hierarchiesGeneralized[i].getArray());
            }
        }
    }

    // Return
    return this.shares;
}
/**
 * Returns the original data header (column names in input order).
 *
 * @return the header
 */
public String[] getHeader() {
    return this.header;
}
/**
 * Returns the heights of the generalization hierarchies, one per QI.
 * (The typo in the method name is retained for API compatibility.)
 *
 * @return the hierarchy heights
 */
public int[] getHierachiesHeights() {
    return this.hierarchiesHeights;
}
/**
 * Returns the generalization hierarchies, one per generalized QI.
 *
 * @return the hierarchies
 */
public GeneralizationHierarchy[] getHierarchies() {
    return this.hierarchiesGeneralized;
}
/**
 * Returns the maximum levels for the generalization.
 *
 * @return the maximum level for each QI
 */
public int[] getHierarchiesMaxLevels() {
    return this.maxLevels;
}
/**
 * Returns the minimum levels for the generalization.
 *
 * @return the minimum level for each QI
 */
public int[] getHierarchiesMinLevels() {
    return this.minLevels;
}
/**
 * Returns the dictionary domain sizes of the microaggregated attributes.
 *
 * @return the domain sizes
 */
public int[] getMicroaggregationDomainSizes() {
    return this.microaggregationDomainSizes;
}
/**
 * Returns the microaggregation functions, one per microaggregated attribute.
 *
 * @return the functions
 */
public DistributionAggregateFunction[] getMicroaggregationFunctions() {
    return this.microaggregationFunctions;
}
/**
 * Returns the header of the microaggregated attributes.
 *
 * @return the header
 */
public String[] getMicroaggregationHeader() {
    return this.microaggregationHeader;
}
/**
 * Returns the map from microaggregated attributes to input column indices.
 *
 * @return the map
 */
public int[] getMicroaggregationMap() {
    return this.microaggregationMap;
}
/**
 * Returns the number of attributes to which microaggregation will be applied
 * in the analyzed data buffer.
 *
 * @return the number of microaggregated attributes
 */
public int getMicroaggregationNumAttributes() {
    return this.microaggregationNumAttributes;
}
/**
 * Returns the start index of the attributes to which microaggregation will
 * be applied in the analyzed data buffer.
 *
 * @return the start index
 */
public int getMicroaggregationStartIndex() {
    return this.microaggregationStartIndex;
}
/**
 * Returns the order of the encoded values of the given sensitive attribute,
 * sorted according to its data type. Required for t-closeness.
 *
 * @param attribute a sensitive attribute
 * @return the encoded values, sorted by data-type order
 * @throws IllegalArgumentException if the attribute is not sensitive
 * @throws IllegalStateException if values cannot be parsed by the data type
 */
public int[] getOrder(String attribute) {

    // Check
    if (!indexesSensitive.containsKey(attribute)) {
        throw new IllegalArgumentException("Attribute " + attribute + " is not sensitive");
    }

    // Prepare: the dictionary maps encoded values to their string form
    final String[] values = dataAnalyzed.getDictionary().getMapping()[indexesSensitive.get(attribute)];
    final DataType<?> type = this.dataTypesSensitive.get(attribute);

    // Start from the identity permutation
    final int[] order = new int[values.length];
    for (int code = 0; code < order.length; code++) {
        order[code] = code;
    }

    // Sort encoded values by comparing the strings they represent
    Sorting.mergeSort(order, 0, order.length, new IntComparator() {
        @Override
        public int compare(int first, int second) {
            try {
                return type.compare(values[first], values[second]);
            } catch (NumberFormatException | ParseException e) {
                throw new IllegalStateException(e);
            }
        }
    });

    // Return
    return order;
}
/**
 * Returns an instance of this data manager that is projected onto the given
 * row set.
 *
 * @param rowset the rows to project onto
 * @return the projected data manager
 */
public DataManager getSubsetInstance(RowSet rowset) {

    // Microaggregation functions carry state, so the projection gets clones
    final int numFunctions = this.microaggregationFunctions.length;
    final DistributionAggregateFunction[] clonedFunctions = new DistributionAggregateFunction[numFunctions];
    for (int i = 0; i < numFunctions; i++) {
        clonedFunctions[i] = this.microaggregationFunctions[i].clone();
    }

    // Everything else is shared with this instance
    return new DataManagerSubset(this,
                                 this.dataAnalyzed.getSubsetInstance(rowset),
                                 this.dataGeneralized.getSubsetInstance(rowset),
                                 this.dataStatic.getSubsetInstance(rowset),
                                 this.header,
                                 this.hierarchiesGeneralized,
                                 this.hierarchiesHeights,
                                 this.hierarchiesSensitive,
                                 this.indexesSensitive,
                                 this.maxLevels,
                                 clonedFunctions,
                                 this.microaggregationHeader,
                                 this.microaggregationMap,
                                 this.microaggregationDomainSizes,
                                 this.microaggregationNumAttributes,
                                 this.microaggregationStartIndex,
                                 this.minLevels,
                                 this.dataTypesSensitive);
}
/**
 * Returns a tree for the given attribute at the index within the given data array, using the given hierarchy.
 * The resulting tree can be used to calculate the earth mover's distance with hierarchical ground-distance.
 *
 * Layout of the returned array (serialized tree):
 * [0] = number of counted elements, [1] = number of leafs, [2] = tree height,
 * followed by one frequency slot per leaf, one extra slot per leaf
 * (initialized to -1), and then one record per inner node consisting of:
 * child count, level, one offset per child, pos_e (0), neg_e (0).
 *
 * @param data the encoded data, data[row][column]
 * @param index the column index of the attribute
 * @param hierarchy the encoded hierarchy, hierarchy[value][level]
 * @return tree
 */
public int[] getTree(int[][] data,
                     int index,
                     int[][] hierarchy) {

    // Header: element count (subset-aware), leaf count, height
    final int totalElementsP = subset == null ? data.length : subsetSize;
    final int height = hierarchy[0].length - 1;
    final int numLeafs = hierarchy.length;

    // TODO: Size could be calculated?!
    final ArrayList<Integer> treeList = new ArrayList<Integer>();
    treeList.add(totalElementsP);
    treeList.add(numLeafs);
    treeList.add(height);

    // Init all freq to 0
    for (int i = 0; i < numLeafs; i++) {
        treeList.add(0);
    }

    // Count frequencies: one slot per leaf, starting at offset 3,
    // restricted to the subset if one is defined
    final int offsetLeafs = 3;
    for (int i = 0; i < data.length; i++) {
        if (subset == null || subset.contains(i)) {
            int previousFreq = treeList.get(data[i][index] + offsetLeafs);
            previousFreq++;
            treeList.set(data[i][index] + offsetLeafs, previousFreq);
        }
    }

    // Init extras: one slot per leaf, initialized to -1
    for (int i = 0; i < numLeafs; i++) {
        treeList.add(-1);
    }

    // Temporary class for nodes
    class TNode {
        IntOpenHashSet children = new IntOpenHashSet();
        int level = 0;
        int offset = 0;
    }

    final int offsetsExtras = offsetLeafs + numLeafs;
    final IntObjectOpenHashMap<TNode> nodes = new IntObjectOpenHashMap<TNode>();
    final ArrayList<ArrayList<TNode>> levels = new ArrayList<ArrayList<TNode>>();

    // Init levels: one bucket per hierarchy level
    for (int i = 0; i < hierarchy[0].length; i++) {
        levels.add(new ArrayList<TNode>());
    }

    // Build nodes. Node ids are made unique across levels by offsetting the
    // encoded value with (level * dictionary size)
    int offset = dataAnalyzed.getDictionary().getMapping()[index].length;
    for (int i = 0; i < hierarchy[0].length; i++) {
        for (int j = 0; j < hierarchy.length; j++) {
            final int nodeID = hierarchy[j][i] + i * offset;
            TNode curNode = null;
            if (!nodes.containsKey(nodeID)) {
                curNode = new TNode();
                curNode.level = i;
                nodes.put(nodeID, curNode);
                final ArrayList<TNode> level = levels.get(curNode.level);
                level.add(curNode);
            } else {
                curNode = nodes.get(nodeID);
            }
            if (i > 0) { // first add child
                curNode.children.add(hierarchy[j][i - 1] + (i - 1) * offset);
            }
        }
    }

    // For all nodes: serialize inner nodes level by level. Children at
    // level 0 are referenced via the extras section; deeper children via
    // their already-assigned record offsets.
    for (final ArrayList<TNode> level : levels) {
        for (final TNode node : level) {
            if (node.level > 0) { // only inner nodes
                node.offset = treeList.size();
                treeList.add(node.children.size());
                treeList.add(node.level);
                // Iterate the open-addressed hash set via its backing arrays
                final int[] keys = node.children.keys;
                final boolean[] allocated = node.children.allocated;
                for (int i = 0; i < allocated.length; i++) {
                    if (allocated[i]) {
                        treeList.add(node.level == 1 ? keys[i] + offsetsExtras
                                : nodes.get(keys[i]).offset);
                    }
                }
                treeList.add(0); // pos_e
                treeList.add(0); // neg_e
            }
        }
    }

    // Copy into a plain int array
    final int[] treeArray = new int[treeList.size()];
    int count = 0;
    for (final int val : treeList) {
        treeArray[count++] = val;
    }
    return treeArray;
}
/**
 * Returns the tree for the given sensitive attribute, if a generalization
 * hierarchy is associated. The resulting tree can be used to calculate the
 * earth mover's distance with hierarchical ground-distance.
 *
 * @param attribute a sensitive attribute with an associated hierarchy
 * @return tree
 * @throws IllegalArgumentException if no hierarchy is associated
 */
public int[] getTree(String attribute) {

    // A single lookup replaces containsKey() + get(); the constructor only
    // ever stores non-null hierarchies
    final GeneralizationHierarchy hierarchy = hierarchiesSensitive.get(attribute);
    if (hierarchy == null) {
        throw new IllegalArgumentException("Attribute " + attribute + " is not sensitive");
    }
    return getTree(dataAnalyzed.getArray(), indexesSensitive.get(attribute), hierarchy.map);
}
/**
 * Splits the pre-encoded input rows into the three internal buffers:
 * generalized quasi-identifiers, analyzed attributes (sensitive and
 * microaggregated) and insensitive attributes. Identifying attributes are
 * dropped. A buffer whose header is empty stays null.
 *
 * @param data the pre-encoded input, data[row][column]
 * @param map column map: map[i*2]=attribute type, map[i*2+1]=buffer position
 * @param mapGeneralized buffer-to-input column map for generalized data
 * @param mapAnalyzed buffer-to-input column map for analyzed data
 * @param mapStatic buffer-to-input column map for static data
 * @param dictionaryGeneralized dictionary for generalized data
 * @param dictionaryAnalyzed dictionary for analyzed data
 * @param dictionaryStatic dictionary for static data
 * @param headerGeneralized header for generalized data
 * @param headerAnalyzed header for analyzed data
 * @param headerStatic header for static data
 * @return array of {generalized, analyzed, static} data objects
 */
private Data[] encode(final int[][] data,
                      final int[] map,
                      final int[] mapGeneralized,
                      final int[] mapAnalyzed,
                      final int[] mapStatic,
                      final Dictionary dictionaryGeneralized,
                      final Dictionary dictionaryAnalyzed,
                      final Dictionary dictionaryStatic,
                      final String[] headerGeneralized,
                      final String[] headerAnalyzed,
                      final String[] headerStatic) {

    // Allocate target buffers; empty column groups remain null
    final int[][] valsGH = headerGeneralized.length == 0 ? null : new int[data.length][];
    final int[][] valsDI = headerAnalyzed.length == 0 ? null : new int[data.length][];
    final int[][] valsIS = headerStatic.length == 0 ? null : new int[data.length][];

    // Distribute each row over the buffers
    for (int row = 0; row < data.length; row++) {
        final int[] tuple = data[row];
        final int[] tupleGH = headerGeneralized.length == 0 ? null : new int[headerGeneralized.length];
        final int[] tupleDI = headerAnalyzed.length == 0 ? null : new int[headerAnalyzed.length];
        final int[] tupleIS = headerStatic.length == 0 ? null : new int[headerStatic.length];
        for (int column = 0; column < tuple.length; column++) {
            final int type = map[column * 2];
            final int position = map[column * 2 + 1];
            if (type == AttributeTypeInternal.QUASI_IDENTIFYING_GENERALIZED) {
                tupleGH[position] = tuple[column];
            } else if (type == AttributeTypeInternal.INSENSITIVE) {
                tupleIS[position] = tuple[column];
            } else if (type == AttributeTypeInternal.SENSITIVE ||
                       type == AttributeTypeInternal.QUASI_IDENTIFYING_MICROAGGREGATED) {
                // Both groups share the analyzed buffer
                tupleDI[position] = tuple[column];
            }
            // AttributeTypeInternal.IDENTIFYING: dropped
        }
        if (valsGH != null) valsGH[row] = tupleGH;
        if (valsIS != null) valsIS[row] = tupleIS;
        if (valsDI != null) valsDI[row] = tupleDI;
    }

    // Wrap the buffers into data objects
    return new Data[] { new Data(valsGH, headerGeneralized, mapGeneralized, dictionaryGeneralized),
                        new Data(valsDI, headerAnalyzed, mapAnalyzed, dictionaryAnalyzed),
                        new Data(valsIS, headerStatic, mapStatic, dictionaryStatic) };
}
/**
 * Returns the data definition.
 *
 * @return the definition
 */
protected DataDefinition getDataDefinition() {
    return this.definition;
}
}