/* * ARX: Powerful Data Anonymization * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.deidentifier.arx.risk; import java.util.Arrays; import java.util.Comparator; import java.util.Set; import org.deidentifier.arx.DataHandleInternal; import org.deidentifier.arx.common.Groupify; import org.deidentifier.arx.common.Groupify.Group; import org.deidentifier.arx.common.TupleWrapper; import org.deidentifier.arx.common.WrappedBoolean; import org.deidentifier.arx.common.WrappedInteger; import org.deidentifier.arx.exceptions.ComputationInterruptedException; import com.carrotsearch.hppc.IntIntOpenHashMap; /** * This class encapsulates information about equivalence classes in a data set * * @author Fabian Prasser */ public class RiskModelHistogram { /** The equivalence classes */ private int[] equivalenceClasses; /** Summary */ private double avgClassSize; /** Summary */ private double numRecords; /** Summary */ private double numClasses; /** * Creates a new instance from the given distribution * * @param distribution */ public RiskModelHistogram(final IntIntOpenHashMap distribution) { this.convertAndAnalyze(distribution, new WrappedBoolean(), new WrappedInteger()); } /** * Creates a new instance by analyzing the given data handle. * IMPORTANT: Suppressed records will be ignored! * * @param handle * @param qis */ RiskModelHistogram(final DataHandleInternal handle, final Set<String> qis, final WrappedBoolean stop, final WrappedInteger progress, double factor) { /* ******************************** * Check * ********************************/ if (handle == null) { throw new NullPointerException("Handle is null"); } if (qis == null) { throw new NullPointerException("Quasi identifiers must not be null"); } for (String q : qis) { if (handle.getColumnIndexOf(q) == -1) { throw new IllegalArgumentException(q + " is not an attribute"); } } /* ******************************** * Build equivalence classes * ********************************/ final int[] indices = new int[qis.size()]; int index = 0; for (final String attribute : qis) { indices[index++] = handle.getColumnIndexOf(attribute); } Arrays.sort(indices); // Calculate equivalence classes int capacity = handle.getNumRows() / 10; capacity = capacity > 10 ? capacity : 10; Groupify<TupleWrapper> map = new Groupify<TupleWrapper>(capacity); int numRows = handle.getNumRows(); for (int row = 0; row < numRows; row++) { int prog = (int) Math.round((double) row / (double) numRows * factor * 80d); if (prog != progress.value) { progress.value = prog; } if (!handle.isOutlier(row)) { TupleWrapper tuple = new TupleWrapper(handle, indices, row, false); map.add(tuple); } if (stop.value) { throw new ComputationInterruptedException(); } } // Group by size IntIntOpenHashMap grouped = new IntIntOpenHashMap(); int i = 0; int size = map.size(); Group<TupleWrapper> element = map.first(); while (element != null) { int prog = (int) Math.round((80d + (double) i++ / (double) size * 20d) * factor); if (prog != progress.value) { progress.value = prog; } grouped.putOrAdd(element.getCount(), 1, 1); element = element.next(); if (stop.value) { throw new ComputationInterruptedException(); } } map = null; convertAndAnalyze(grouped, stop, progress); } /** * Returns a property of the class distribution * * @return the avgClassSize */ public double getAvgClassSize() { return avgClassSize; } /** * Returns class-size[idx], class-count[idx+1],... ordered ascending by * class size * * @return the histogram */ public int[] getHistogram() { return equivalenceClasses; } /** * Returns a property of the class distribution * * @return the numClasses */ public double getNumClasses() { return numClasses; } /** * Returns a property of the class distribution * * @return the numRecords */ public double getNumRecords() { return numRecords; } /** * Convert and analyze * * @param grouped * @param stop * @param progress */ private void convertAndAnalyze(IntIntOpenHashMap grouped, final WrappedBoolean stop, final WrappedInteger progress) { // Convert int[][] temp = new int[grouped.size()][2]; int idx = 0; final int[] values2 = grouped.values; final int[] keys2 = grouped.keys; final boolean[] states2 = grouped.allocated; for (int i = 0; i < states2.length; i++) { if (states2[i]) { temp[idx++] = new int[] { keys2[i], values2[i] }; } if (stop.value) { throw new ComputationInterruptedException(); } } grouped = null; // Sort ascending by size Arrays.sort(temp, new Comparator<int[]>() { public int compare(int[] o1, int[] o2) { if (stop.value) { throw new ComputationInterruptedException(); } return Integer.compare(o1[0], o2[0]); } }); // Convert and analyze int numClasses = 0; int numTuples = 0; this.equivalenceClasses = new int[temp.length * 2]; idx = 0; for (int[] entry : temp) { this.equivalenceClasses[idx++] = entry[0]; this.equivalenceClasses[idx++] = entry[1]; numClasses += entry[1]; numTuples += entry[0] * entry[1]; if (stop.value) { throw new ComputationInterruptedException(); } } this.numRecords = numTuples; this.numClasses = numClasses; this.avgClassSize = this.numRecords / this.numClasses; } }