/*
 * ARX: Powerful Data Anonymization
 * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.deidentifier.arx.framework.data;

import java.text.ParseException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.deidentifier.arx.DataDefinition;
import org.deidentifier.arx.DataGeneralizationScheme;
import org.deidentifier.arx.DataSubset;
import org.deidentifier.arx.DataType;
import org.deidentifier.arx.RowSet;
import org.deidentifier.arx.aggregates.HierarchyBuilder;
import org.deidentifier.arx.aggregates.HierarchyBuilderIntervalBased;
import org.deidentifier.arx.aggregates.HierarchyBuilderRedactionBased;
import org.deidentifier.arx.criteria.EDDifferentialPrivacy;
import org.deidentifier.arx.criteria.HierarchicalDistanceTCloseness;
import org.deidentifier.arx.criteria.PrivacyCriterion;
import org.deidentifier.arx.framework.check.distribution.DistributionAggregateFunction;
import org.deidentifier.arx.framework.check.distribution.DistributionAggregateFunction.DistributionAggregateFunctionGeneralization;
import org.deidentifier.arx.metric.v2.DomainShare;
import org.deidentifier.arx.metric.v2.DomainShareInterval;
import org.deidentifier.arx.metric.v2.DomainShareMaterialized;
import org.deidentifier.arx.metric.v2.DomainShareRedaction;

import cern.colt.Sorting;
import cern.colt.function.IntComparator;

import com.carrotsearch.hppc.IntObjectOpenHashMap;
import com.carrotsearch.hppc.IntOpenHashSet;

/**
 * Holds all data needed for the anonymization process.
 *
 * @author Fabian Prasser
 * @author Florian Kohlmayer
 */
public class DataManager {

    /**
     * Internal representation of attribute types. Quasi-identifiers are split
     * into the ones to which generalization is applied and the ones to which
     * microaggregation is applied.
     *
     * @author Florian Kohlmayer
     * @author Fabian Prasser
     */
    public static class AttributeTypeInternal {
        public static final int IDENTIFYING                       = 3;
        public static final int INSENSITIVE                       = 2;
        public static final int QUASI_IDENTIFYING_GENERALIZED     = 0;
        public static final int QUASI_IDENTIFYING_MICROAGGREGATED = 4;
        public static final int SENSITIVE                         = 1;
    }

    /** The data to be analyzed. */
    private final Data dataAnalyzed;

    /** The data which is generalized. */
    private final Data dataGeneralized;

    /** The data which is insensitive. */
    private final Data dataStatic;

    /** The data definition. */
    private final DataDefinition definition;

    /** The domain shares. */
    private DomainShare[] shares;

    /** The original input header. */
    private final String[] header;

    /** The generalization hierarchies for the quasi-identifiers. */
    private final GeneralizationHierarchy[] hierarchiesGeneralized;

    /** The hierarchy heights for each QI. */
    private final int[] hierarchiesHeights;

    /** The hierarchies of the sensitive attributes. */
    private final Map<String, GeneralizationHierarchy> hierarchiesSensitive;

    /** The data types of sensitive attributes. */
    private final Map<String, DataType<?>> dataTypesSensitive;

    /** The indexes of sensitive attributes. */
    private final Map<String, Integer> indexesSensitive;
    /** The maximum level for each QI. */
    private final int[] maxLevels;

    /** The microaggregation functions. */
    private final DistributionAggregateFunction[] microaggregationFunctions;

    /** Header for microaggregated attributes. */
    private final String[] microaggregationHeader;

    /** Map for microaggregated attributes. */
    private final int[] microaggregationMap;

    /** Domain sizes for microaggregated attributes. */
    private final int[] microaggregationDomainSizes;

    /** The number of microaggregated attributes in dataAnalyzed. */
    private final int microaggregationNumAttributes;

    /** The start index of the microaggregated attributes in dataAnalyzed. */
    private final int microaggregationStartIndex;

    /** The minimum level for each QI. */
    private final int[] minLevels;

    /** The research subset, if any. */
    private RowSet subset = null;

    /** The size of the research subset. */
    private int subsetSize = 0;

    /**
     * Creates a new data manager from pre-encoded data.
     *
     * @param header
     * @param data
     * @param dictionary
     * @param definition
     * @param criteria
     * @param functions
     */
    public DataManager(final String[] header,
                       final int[][] data,
                       final Dictionary dictionary,
                       final DataDefinition definition,
                       final Set<PrivacyCriterion> criteria,
                       final Map<String, DistributionAggregateFunction> functions) {

        // Store columns for reordering the output
        this.header = header;
        this.definition = definition;

        Set<String> attributesGeneralized = definition.getQuasiIdentifiersWithGeneralization();
        Set<String> attributesSensitive = definition.getSensitiveAttributes();
        Set<String> attributesMicroaggregated = definition.getQuasiIdentifiersWithMicroaggregation();
        Set<String> attributesInsensitive = definition.getInsensitiveAttributes();

        // Init dictionaries
        final Dictionary dictionaryGeneralized = new Dictionary(attributesGeneralized.size());
        final Dictionary dictionaryAnalyzed = new Dictionary(attributesSensitive.size() + attributesMicroaggregated.size());
        final Dictionary dictionaryStatic = new Dictionary(attributesInsensitive.size());

        // Init maps for reordering the output
        final int[] mapGeneralized = new int[dictionaryGeneralized.getNumDimensions()];
        final int[] mapAnalyzed = new int[dictionaryAnalyzed.getNumDimensions()];
        final int[] mapStatic = new int[dictionaryStatic.getNumDimensions()];
        this.microaggregationMap = new int[attributesMicroaggregated.size()];

        // Indexes
        this.microaggregationStartIndex = attributesSensitive.size();
        this.microaggregationNumAttributes = attributesMicroaggregated.size();
        int indexStatic = 0;
        int indexGeneralized = 0;
        int indexAnalyzed = 0;
        int indexSensitive = 0;
        int indexMicroaggregated = this.microaggregationStartIndex;
        int counter = 0;

        // A map for column indices: map[i * 2] = attribute type, map[i * 2 + 1] = index position.
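        // For illustration, a hypothetical header {"age", "zip", "disease", "name"},
        // with age and zip generalized, disease sensitive and name left undefined
        // (and hence identifying), would yield:
        // {QUASI_IDENTIFYING_GENERALIZED, 0, QUASI_IDENTIFYING_GENERALIZED, 1,
        //  SENSITIVE, 0, IDENTIFYING, -1}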
        final int[] map = new int[header.length * 2];
        final String[] headerGH = new String[dictionaryGeneralized.getNumDimensions()];
        final String[] headerDI = new String[dictionaryAnalyzed.getNumDimensions()];
        final String[] headerIS = new String[dictionaryStatic.getNumDimensions()];
        this.microaggregationHeader = new String[attributesMicroaggregated.size()];
        this.dataTypesSensitive = new HashMap<>();

        for (final String column : header) {
            final int idx = counter * 2;
            if (attributesGeneralized.contains(column)) {
                map[idx] = AttributeTypeInternal.QUASI_IDENTIFYING_GENERALIZED;
                map[idx + 1] = indexGeneralized;
                mapGeneralized[indexGeneralized] = counter;
                dictionaryGeneralized.registerAll(indexGeneralized, dictionary, counter);
                headerGH[indexGeneralized] = header[counter];
                indexGeneralized++;
            } else if (attributesMicroaggregated.contains(column)) {
                map[idx] = AttributeTypeInternal.QUASI_IDENTIFYING_MICROAGGREGATED;
                map[idx + 1] = indexMicroaggregated;
                mapAnalyzed[indexMicroaggregated] = counter;
                dictionaryAnalyzed.registerAll(indexMicroaggregated, dictionary, counter);
                headerDI[indexMicroaggregated] = header[counter];
                indexMicroaggregated++;
                microaggregationMap[indexAnalyzed] = counter;
                microaggregationHeader[indexAnalyzed] = header[counter];
                indexAnalyzed++;
            } else if (attributesInsensitive.contains(column)) {
                map[idx] = AttributeTypeInternal.INSENSITIVE;
                map[idx + 1] = indexStatic;
                mapStatic[indexStatic] = counter;
                dictionaryStatic.registerAll(indexStatic, dictionary, counter);
                headerIS[indexStatic] = header[counter];
                indexStatic++;
            } else if (attributesSensitive.contains(column)) {
                map[idx] = AttributeTypeInternal.SENSITIVE;
                map[idx + 1] = indexSensitive;
                mapAnalyzed[indexSensitive] = counter;
                dictionaryAnalyzed.registerAll(indexSensitive, dictionary, counter);
                headerDI[indexSensitive] = header[counter];
                indexSensitive++;
                dataTypesSensitive.put(column, definition.getDataType(column));
            } else {
                // TODO: CHECK: Changed default? - now all undefined attributes
                // are identifying! Previously they were considered sensitive?
                map[idx] = AttributeTypeInternal.IDENTIFYING;
                map[idx + 1] = -1;
            }
            counter++;
        }

        // Encode data
        final Data[] ddata = encode(data,
                                    map,
                                    mapGeneralized,
                                    mapAnalyzed,
                                    mapStatic,
                                    dictionaryGeneralized,
                                    dictionaryAnalyzed,
                                    dictionaryStatic,
                                    headerGH,
                                    headerDI,
                                    headerIS);
        dataGeneralized = ddata[0];
        dataAnalyzed = ddata[1];
        dataStatic = ddata[2];

        // Initialize min levels, hierarchy heights and max levels
        minLevels = new int[attributesGeneralized.size()];
        hierarchiesHeights = new int[attributesGeneralized.size()];
        maxLevels = new int[attributesGeneralized.size()];

        // Build hierarchies for the generalized quasi-identifiers
        hierarchiesGeneralized = new GeneralizationHierarchy[attributesGeneralized.size()];
        for (int i = 0; i < header.length; i++) {
            final int idx = i * 2;
            if (attributesGeneralized.contains(header[i]) &&
                map[idx] == AttributeTypeInternal.QUASI_IDENTIFYING_GENERALIZED) {
                final int dictionaryIndex = map[idx + 1];
                final String name = header[i];
                if (definition.getHierarchy(name) != null) {
                    hierarchiesGeneralized[dictionaryIndex] = new GeneralizationHierarchy(name,
                                                                                          definition.getHierarchy(name),
                                                                                          dictionaryIndex,
                                                                                          dictionaryGeneralized);
                } else {
                    throw new IllegalStateException("No hierarchy available for attribute (" + header[i] + ")");
                }
                // Initialize hierarchy height and minimum / maximum generalization
                hierarchiesHeights[dictionaryIndex] = hierarchiesGeneralized[dictionaryIndex].getArray()[0].length;
                final Integer minGenLevel = definition.getMinimumGeneralization(name);
                minLevels[dictionaryIndex] = minGenLevel == null ? 0 : minGenLevel;
                final Integer maxGenLevel = definition.getMaximumGeneralization(name);
                maxLevels[dictionaryIndex] = maxGenLevel == null ? hierarchiesHeights[dictionaryIndex] - 1 : maxGenLevel;
            }
        }

        // Change min & max levels when using (e,d)-DP
        for (PrivacyCriterion c : criteria) {
            if (c instanceof EDDifferentialPrivacy) {
                DataGeneralizationScheme scheme = ((EDDifferentialPrivacy) c).getGeneralizationScheme();
                for (int i = 0; i < header.length; i++) {
                    final int idx = i * 2;
                    if (attributesGeneralized.contains(header[i]) &&
                        map[idx] == AttributeTypeInternal.QUASI_IDENTIFYING_GENERALIZED) {
                        minLevels[map[idx + 1]] = scheme.getGeneralizationLevel(header[i], definition);
                        maxLevels[map[idx + 1]] = scheme.getGeneralizationLevel(header[i], definition);
                    }
                }
                break;
            }
        }

        // Build map with hierarchies for sensitive attributes
        Map<String, String[][]> sensitiveHierarchies = new HashMap<String, String[][]>();
        for (PrivacyCriterion c : criteria) {
            if (c instanceof HierarchicalDistanceTCloseness) {
                HierarchicalDistanceTCloseness t = (HierarchicalDistanceTCloseness) c;
                sensitiveHierarchies.put(t.getAttribute(), t.getHierarchy().getHierarchy());
            }
        }

        // Build generalization hierarchies for sensitive attributes
        hierarchiesSensitive = new HashMap<String, GeneralizationHierarchy>();
        indexesSensitive = new HashMap<String, Integer>();
        int index = 0;
        for (int i = 0; i < header.length; i++) {
            final String name = header[i];
            final int idx = i * 2;
            if (sensitiveHierarchies.containsKey(name) &&
                map[idx] == AttributeTypeInternal.SENSITIVE) {
                final int dictionaryIndex = map[idx + 1];
                final String[][] hiers = sensitiveHierarchies.get(name);
                if (hiers != null) {
                    hierarchiesSensitive.put(name, new GeneralizationHierarchy(name,
                                                                               hiers,
                                                                               dictionaryIndex,
                                                                               dictionaryAnalyzed));
                }
            }
            // Store index for sensitive attributes
            if (attributesSensitive.contains(header[i])) {
                indexesSensitive.put(name, index);
                index++;
            }
        }

        // Build map with hierarchies for microaggregated attributes
        Map<String, String[][]> maHierarchies = new HashMap<String, String[][]>();
        for (String attribute : functions.keySet()) {
            if (functions.get(attribute) instanceof DistributionAggregateFunctionGeneralization) {
                maHierarchies.put(attribute, definition.getHierarchy(attribute));
            }
        }

        // Build generalization hierarchies for microaggregated attributes
        Map<String, int[][]> hierarchiesMA = new HashMap<String, int[][]>();
        index = 0;
        for (int i = 0; i < header.length; i++) {
            final String name = header[i];
            final int idx = i * 2;
            if (maHierarchies.containsKey(name) &&
                map[idx] == AttributeTypeInternal.QUASI_IDENTIFYING_MICROAGGREGATED) {
                final int dictionaryIndex = map[idx + 1];
                final String[][] hiers = maHierarchies.get(name);
                if (hiers != null) {
                    hierarchiesMA.put(name, new GeneralizationHierarchy(name,
                                                                        hiers,
                                                                        dictionaryIndex,
                                                                        dictionaryAnalyzed).map);
                }
            }
        }

        // Finalize dictionaries
        dictionaryGeneralized.finalizeAll();
        dictionaryAnalyzed.finalizeAll();
        dictionaryStatic.finalizeAll();

        // Init microaggregation functions
        microaggregationFunctions = new DistributionAggregateFunction[attributesMicroaggregated.size()];
        microaggregationDomainSizes = new int[attributesMicroaggregated.size()];
        for (int i = 0; i < header.length; i++) {
            final int idx = i * 2;
            if (attributesMicroaggregated.contains(header[i]) &&
                map[idx] == AttributeTypeInternal.QUASI_IDENTIFYING_MICROAGGREGATED) {
                final int dictionaryIndex = map[idx + 1] - microaggregationStartIndex;
                final String name = header[i];
                if (definition.getMicroAggregationFunction(name) != null) {
                    microaggregationDomainSizes[dictionaryIndex] = dictionaryAnalyzed.getMapping()[dictionaryIndex + microaggregationStartIndex].length;
                    microaggregationFunctions[dictionaryIndex] = functions.get(name);
                    microaggregationFunctions[dictionaryIndex].initialize(dictionaryAnalyzed.getMapping()[dictionaryIndex + microaggregationStartIndex],
                                                                          definition.getDataType(name),
                                                                          hierarchiesMA.get(name));
                } else {
                    throw new IllegalStateException("No microaggregation function defined for attribute (" + header[i] + ")");
                }
            }
        }

        // Store research subset
        for (PrivacyCriterion c : criteria) {
            if (c instanceof EDDifferentialPrivacy) {
                ((EDDifferentialPrivacy) c).initialize(this, null);
            }
            if (c.isSubsetAvailable()) {
                DataSubset _subset = c.getDataSubset();
                if (_subset != null) {
                    subset = _subset.getSet();
                    subsetSize = _subset.getArray().length;
                    break;
                }
            }
        }
    }

    /**
     * For creating a projected instance.
     *
     * @param definition
     * @param dataAnalyzed
     * @param dataGeneralized
     * @param dataStatic
     * @param header
     * @param hierarchiesGeneralized
     * @param hierarchiesHeights
     * @param hierarchiesSensitive
     * @param indexesSensitive
     * @param maxLevels
     * @param microaggregationFunctions
     * @param microaggregationHeader
     * @param microaggregationMap
     * @param microaggregationDomainSizes
     * @param microaggregationNumAttributes
     * @param microaggregationStartIndex
     * @param minLevels
     * @param dataTypesSensitive
     */
    protected DataManager(DataDefinition definition,
                          Data dataAnalyzed,
                          Data dataGeneralized,
                          Data dataStatic,
                          String[] header,
                          GeneralizationHierarchy[] hierarchiesGeneralized,
                          int[] hierarchiesHeights,
                          Map<String, GeneralizationHierarchy> hierarchiesSensitive,
                          Map<String, Integer> indexesSensitive,
                          int[] maxLevels,
                          DistributionAggregateFunction[] microaggregationFunctions,
                          String[] microaggregationHeader,
                          int[] microaggregationMap,
                          int[] microaggregationDomainSizes,
                          int microaggregationNumAttributes,
                          int microaggregationStartIndex,
                          int[] minLevels,
                          Map<String, DataType<?>> dataTypesSensitive) {
        this.definition = definition;
        this.dataAnalyzed = dataAnalyzed;
        this.dataGeneralized = dataGeneralized;
        this.dataStatic = dataStatic;
        this.header = header;
        this.hierarchiesGeneralized = hierarchiesGeneralized;
        this.hierarchiesHeights = hierarchiesHeights;
        this.hierarchiesSensitive = hierarchiesSensitive;
        this.indexesSensitive = indexesSensitive;
        this.maxLevels = maxLevels;
        this.microaggregationFunctions = microaggregationFunctions;
        this.microaggregationDomainSizes = microaggregationDomainSizes;
        this.microaggregationHeader = microaggregationHeader;
        this.microaggregationMap = microaggregationMap;
        this.microaggregationNumAttributes = microaggregationNumAttributes;
        this.microaggregationStartIndex = microaggregationStartIndex;
        this.minLevels = minLevels;
        this.dataTypesSensitive = dataTypesSensitive;

        // Both variables are only used by getDistribution() and getTree().
        // The projected instance delegates these methods to the original data manager.
        this.subset = null;
        this.subsetSize = 0;
    }

    /**
     * Returns the input data that will be analyzed.
     *
     * @return the data
     */
    public Data getDataAnalyzed() {
        return dataAnalyzed;
    }

    /**
     * Returns the input data that will be generalized.
     *
     * @return the data
     */
    public Data getDataGeneralized() {
        return dataGeneralized;
    }

    /**
     * Returns the static input data.
     *
     * @return the data
     */
    public Data getDataStatic() {
        return dataStatic;
    }

    /**
     * Returns the distribution of the attribute in the data array at the given index.
     *
     * @param data the encoded data array
     * @param index the column index of the attribute
     * @param distinctValues the number of distinct values of the attribute
     * @return the relative frequency of each value
     */
    public double[] getDistribution(int[][] data, int index, int distinctValues) {

        // Initialize counts: iterate over all rows or the subset
        final int[] cardinalities = new int[distinctValues];
        for (int i = 0; i < data.length; i++) {
            if (subset == null || subset.contains(i)) {
                cardinalities[data[i][index]]++;
            }
        }

        // Compute distribution
        final double total = subset == null ? data.length : subsetSize;
        final double[] distribution = new double[cardinalities.length];
        for (int i = 0; i < distribution.length; i++) {
            distribution[i] = (double) cardinalities[i] / total;
        }
        return distribution;
    }
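    // A worked example for getDistribution() above, assuming a hypothetical encoded
    // column {0, 0, 1}, two distinct values and no subset: the counts are {2, 1},
    // so the returned distribution is {2.0 / 3.0, 1.0 / 3.0}.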
    /**
     * Returns the distribution of the given sensitive attribute in the original dataset.
     * Required for t-closeness.
     *
     * @param attribute
     * @return distribution
     */
    public double[] getDistribution(String attribute) {

        // Check
        if (!indexesSensitive.containsKey(attribute)) {
            throw new IllegalArgumentException("Attribute " + attribute + " is not sensitive");
        }

        // Prepare
        int index = indexesSensitive.get(attribute);
        int distinctValues = dataAnalyzed.getDictionary().getMapping()[index].length;
        int[][] data = dataAnalyzed.getArray();

        // Calculate and return
        return getDistribution(data, index, distinctValues);
    }

    /**
     * Returns the domain shares of all generalized attributes, building them on demand.
     *
     * @return the domain shares
     */
    public DomainShare[] getDomainShares() {

        // Build on demand
        if (this.shares == null) {

            // Compute domain shares
            this.shares = new DomainShare[dataGeneralized.getHeader().length];
            for (int i = 0; i < shares.length; i++) {

                // Extract info
                String attribute = dataGeneralized.getHeader()[i];
                String[][] hierarchy = definition.getHierarchy(attribute);
                HierarchyBuilder<?> builder = definition.getHierarchyBuilder(attribute);

                // Create shares for redaction-based hierarchies
                if (builder != null && (builder instanceof HierarchyBuilderRedactionBased) &&
                    ((HierarchyBuilderRedactionBased<?>) builder).isDomainPropertiesAvailable()) {
                    this.shares[i] = new DomainShareRedaction((HierarchyBuilderRedactionBased<?>) builder);

                // Create shares for interval-based hierarchies
                } else if (builder != null && (builder instanceof HierarchyBuilderIntervalBased)) {
                    this.shares[i] = new DomainShareInterval<>((HierarchyBuilderIntervalBased<?>) builder,
                                                               hierarchiesGeneralized[i].getArray(),
                                                               dataGeneralized.getDictionary().getMapping()[i]);

                // Create fallback shares for materialized hierarchies
                } else {
                    this.shares[i] = new DomainShareMaterialized(hierarchy,
                                                                 dataGeneralized.getDictionary().getMapping()[i],
                                                                 hierarchiesGeneralized[i].getArray());
                }
            }
        }

        // Return
        return this.shares;
    }

    /**
     * Returns the original data header.
     *
     * @return
     */
    public String[] getHeader() {
        return header;
    }

    /**
     * Returns the heights of the hierarchies of the quasi-identifiers.
     *
     * @return
     */
    public int[] getHierachiesHeights() {
        return hierarchiesHeights;
    }

    /**
     * Returns the generalization hierarchies of the quasi-identifiers.
     *
     * @return the hierarchies
     */
    public GeneralizationHierarchy[] getHierarchies() {
        return hierarchiesGeneralized;
    }

    /**
     * Returns the maximum levels for the generalization.
     *
     * @return the maximum level for each QI
     */
    public int[] getHierarchiesMaxLevels() {
        return maxLevels;
    }

    /**
     * Returns the minimum levels for the generalizations.
     *
     * @return
     */
    public int[] getHierarchiesMinLevels() {
        return minLevels;
    }

    /**
     * Returns the domain sizes for the corresponding buffer.
     *
     * @return
     */
    public int[] getMicroaggregationDomainSizes() {
        return microaggregationDomainSizes;
    }

    /**
     * Returns the microaggregation functions.
     *
     * @return
     */
    public DistributionAggregateFunction[] getMicroaggregationFunctions() {
        return microaggregationFunctions;
    }

    /**
     * Returns the header for the corresponding buffer.
     *
     * @return
     */
    public String[] getMicroaggregationHeader() {
        return microaggregationHeader;
    }

    /**
     * Returns the map for the corresponding buffer.
     *
     * @return
     */
    public int[] getMicroaggregationMap() {
        return microaggregationMap;
    }

    /**
     * Gets the number of attributes to which microaggregation will be applied
     * in dataAnalyzed.
     *
     * @return
     */
    public int getMicroaggregationNumAttributes() {
        return microaggregationNumAttributes;
    }

    /**
     * Gets the start index of the attributes to which microaggregation will be
     * applied in dataAnalyzed.
     *
     * @return
     */
    public int getMicroaggregationStartIndex() {
        return microaggregationStartIndex;
    }
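    // A hypothetical example for getOrder() below: for a dictionary {"1", "10", "2"}
    // and an integer data type, the comparator orders values numerically, so the
    // returned order is {0, 2, 1}.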
    /**
     * Returns the order of the given sensitive attribute in the original dataset.
     * Required for t-closeness.
     *
     * @param attribute
     * @return order
     */
    public int[] getOrder(String attribute) {

        // Check
        if (!indexesSensitive.containsKey(attribute)) {
            throw new IllegalArgumentException("Attribute " + attribute + " is not sensitive");
        }

        // Prepare
        final String[] dictionary = dataAnalyzed.getDictionary().getMapping()[indexesSensitive.get(attribute)];
        final DataType<?> type = this.dataTypesSensitive.get(attribute);

        // Init
        int[] order = new int[dictionary.length];
        for (int i = 0; i < order.length; i++) {
            order[i] = i;
        }

        // Sort
        Sorting.mergeSort(order, 0, order.length, new IntComparator() {
            @Override
            public int compare(int arg0, int arg1) {
                String value1 = dictionary[arg0];
                String value2 = dictionary[arg1];
                try {
                    return type.compare(value1, value2);
                } catch (NumberFormatException | ParseException e) {
                    throw new IllegalStateException(e);
                }
            }
        });

        // Return
        return order;
    }

    /**
     * Returns an instance of this data manager that is projected onto the given row set.
     *
     * @param rowset
     * @return
     */
    public DataManager getSubsetInstance(RowSet rowset) {
        DistributionAggregateFunction[] microaggregationFunctions = new DistributionAggregateFunction[this.microaggregationFunctions.length];
        for (int i = 0; i < this.microaggregationFunctions.length; i++) {
            microaggregationFunctions[i] = this.microaggregationFunctions[i].clone();
        }
        return new DataManagerSubset(this,
                                     this.dataAnalyzed.getSubsetInstance(rowset),
                                     this.dataGeneralized.getSubsetInstance(rowset),
                                     this.dataStatic.getSubsetInstance(rowset),
                                     this.header,
                                     this.hierarchiesGeneralized,
                                     this.hierarchiesHeights,
                                     this.hierarchiesSensitive,
                                     this.indexesSensitive,
                                     this.maxLevels,
                                     microaggregationFunctions,
                                     this.microaggregationHeader,
                                     this.microaggregationMap,
                                     this.microaggregationDomainSizes,
                                     this.microaggregationNumAttributes,
                                     this.microaggregationStartIndex,
                                     this.minLevels,
                                     this.dataTypesSensitive);
    }

    /**
     * Returns a tree for the given attribute at the index within the given data array,
     * using the given hierarchy. The resulting tree can be used to calculate the
     * earth mover's distance with hierarchical ground-distance.
     *
     * @param data
     * @param index
     * @param hierarchy
     * @return tree
     */
    public int[] getTree(int[][] data, int index, int[][] hierarchy) {

        final int totalElementsP = subset == null ? data.length : subsetSize;
        final int height = hierarchy[0].length - 1;
        final int numLeafs = hierarchy.length;

        // TODO: Size could be calculated?!
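        // Note on the layout of the flattened tree built below: [0] holds the number
        // of counted records, [1] the number of leafs, [2] the height, followed by
        // one frequency slot per leaf, one extra slot per leaf (initialized to -1)
        // and, for each inner node: the number of children, the level, one entry per
        // child (the extra slot of a leaf, or the offset of an inner node) and two
        // working slots (pos_e, neg_e).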
        final ArrayList<Integer> treeList = new ArrayList<Integer>();
        treeList.add(totalElementsP);
        treeList.add(numLeafs);
        treeList.add(height);

        // Init all freq to 0
        for (int i = 0; i < numLeafs; i++) {
            treeList.add(0);
        }

        // Count frequencies
        final int offsetLeafs = 3;
        for (int i = 0; i < data.length; i++) {
            if (subset == null || subset.contains(i)) {
                int previousFreq = treeList.get(data[i][index] + offsetLeafs);
                previousFreq++;
                treeList.set(data[i][index] + offsetLeafs, previousFreq);
            }
        }

        // Init extras
        for (int i = 0; i < numLeafs; i++) {
            treeList.add(-1);
        }

        // Temporary class for nodes
        class TNode {
            IntOpenHashSet children = new IntOpenHashSet();
            int            level    = 0;
            int            offset   = 0;
        }

        final int offsetsExtras = offsetLeafs + numLeafs;
        final IntObjectOpenHashMap<TNode> nodes = new IntObjectOpenHashMap<TNode>();
        final ArrayList<ArrayList<TNode>> levels = new ArrayList<ArrayList<TNode>>();

        // Init levels
        for (int i = 0; i < hierarchy[0].length; i++) {
            levels.add(new ArrayList<TNode>());
        }

        // Build nodes
        int offset = dataAnalyzed.getDictionary().getMapping()[index].length;
        for (int i = 0; i < hierarchy[0].length; i++) {
            for (int j = 0; j < hierarchy.length; j++) {
                final int nodeID = hierarchy[j][i] + i * offset;
                TNode curNode = null;
                if (!nodes.containsKey(nodeID)) {
                    curNode = new TNode();
                    curNode.level = i;
                    nodes.put(nodeID, curNode);
                    final ArrayList<TNode> level = levels.get(curNode.level);
                    level.add(curNode);
                } else {
                    curNode = nodes.get(nodeID);
                }
                if (i > 0) {
                    // First add child
                    curNode.children.add(hierarchy[j][i - 1] + (i - 1) * offset);
                }
            }
        }

        // For all nodes
        for (final ArrayList<TNode> level : levels) {
            for (final TNode node : level) {
                if (node.level > 0) { // Only inner nodes
                    node.offset = treeList.size();
                    treeList.add(node.children.size());
                    treeList.add(node.level);
                    final int[] keys = node.children.keys;
                    final boolean[] allocated = node.children.allocated;
                    for (int i = 0; i < allocated.length; i++) {
                        if (allocated[i]) {
                            treeList.add(node.level == 1 ? keys[i] + offsetsExtras
                                                         : nodes.get(keys[i]).offset);
                        }
                    }
                    treeList.add(0); // pos_e
                    treeList.add(0); // neg_e
                }
            }
        }

        final int[] treeArray = new int[treeList.size()];
        int count = 0;
        for (final int val : treeList) {
            treeArray[count++] = val;
        }
        return treeArray;
    }

    /**
     * Returns the tree for the given sensitive attribute, if a generalization hierarchy
     * is associated. The resulting tree can be used to calculate the earth mover's
     * distance with hierarchical ground-distance.
     *
     * @param attribute
     * @return tree
     */
    public int[] getTree(String attribute) {
        if (!hierarchiesSensitive.containsKey(attribute)) {
            throw new IllegalArgumentException("Attribute " + attribute + " is not sensitive");
        }
        final int[][] data = dataAnalyzed.getArray();
        final int index = indexesSensitive.get(attribute);
        return getTree(data, index, hierarchiesSensitive.get(attribute).map);
    }

    /**
     * Encodes the data.
     *
     * @param data
     * @param map
     * @param mapGeneralized
     * @param mapAnalyzed
     * @param mapStatic
     * @param dictionaryGeneralized
     * @param dictionaryAnalyzed
     * @param dictionaryStatic
     * @param headerGeneralized
     * @param headerAnalyzed
     * @param headerStatic
     * @return
     */
    private Data[] encode(final int[][] data,
                          final int[] map,
                          final int[] mapGeneralized,
                          final int[] mapAnalyzed,
                          final int[] mapStatic,
                          final Dictionary dictionaryGeneralized,
                          final Dictionary dictionaryAnalyzed,
                          final Dictionary dictionaryStatic,
                          final String[] headerGeneralized,
                          final String[] headerAnalyzed,
                          final String[] headerStatic) {

        // Parse the dataset
        final int[][] valsGH = headerGeneralized.length == 0 ? null : new int[data.length][];
        final int[][] valsDI = headerAnalyzed.length == 0 ? null : new int[data.length][];
        final int[][] valsIS = headerStatic.length == 0 ? null : new int[data.length][];

        int index = 0;
        for (final int[] tuple : data) {

            // Process a tuple
            final int[] tupleGH = headerGeneralized.length == 0 ? null : new int[headerGeneralized.length];
            final int[] tupleDI = headerAnalyzed.length == 0 ? null : new int[headerAnalyzed.length];
            final int[] tupleIS = headerStatic.length == 0 ? null : new int[headerStatic.length];
            for (int i = 0; i < tuple.length; i++) {
                final int idx = i * 2;
                int aType = map[idx];
                final int iPos = map[idx + 1];
                switch (aType) {
                case AttributeTypeInternal.QUASI_IDENTIFYING_GENERALIZED:
                    tupleGH[iPos] = tuple[i];
                    break;
                case AttributeTypeInternal.IDENTIFYING:
                    // Ignore
                    break;
                case AttributeTypeInternal.INSENSITIVE:
                    tupleIS[iPos] = tuple[i];
                    break;
                case AttributeTypeInternal.QUASI_IDENTIFYING_MICROAGGREGATED:
                    tupleDI[iPos] = tuple[i];
                    break;
                case AttributeTypeInternal.SENSITIVE:
                    tupleDI[iPos] = tuple[i];
                    break;
                }
            }
            if (valsGH != null) valsGH[index] = tupleGH;
            if (valsIS != null) valsIS[index] = tupleIS;
            if (valsDI != null) valsDI[index] = tupleDI;
            index++;
        }

        // Build data objects
        final Data[] result = { new Data(valsGH, headerGeneralized, mapGeneralized, dictionaryGeneralized),
                                new Data(valsDI, headerAnalyzed, mapAnalyzed, dictionaryAnalyzed),
                                new Data(valsIS, headerStatic, mapStatic, dictionaryStatic) };
        return result;
    }

    /**
     * Returns the data definition.
     *
     * @return
     */
    protected DataDefinition getDataDefinition() {
        return this.definition;
    }
}