/* * ARX: Powerful Data Anonymization * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.deidentifier.arx.risk; import org.apache.commons.math3.distribution.HypergeometricDistribution; import org.deidentifier.arx.ARXPopulationModel; import org.deidentifier.arx.common.WrappedBoolean; import org.deidentifier.arx.common.WrappedInteger; /** * This class implements the ZayatzModel based on equivalence classes, for * details see the paper ESTIMATION OF THE NUMBER OF UNIQUE POPULATION ELEMENTS * USING A SAMPLE, Zayatz, 1991 * * @author Fabian Prasser * @author Michael Schneider * @version 1.0 */ class ModelZayatz extends RiskModelPopulation { /** Resulting estimate */ private final double numUniques; /** * Creates a new instance * * @param model * @param histogram * @param stop */ ModelZayatz(ARXPopulationModel model, final RiskModelHistogram histogram, final WrappedBoolean stop) { super(histogram, model, stop, new WrappedInteger()); int[] _histogram = getHistogram().getHistogram(); double uniqueness = computeConditionalUniqueness(_histogram, getPopulationSize(), getSampleSize(), getNumClasses()); this.numUniques = getNumClassesOfSize(1) * uniqueness / getSamplingFraction(); } /** * Returns the number of uniques * * @return */ public double getNumUniques() { return this.numUniques; } /** * Estimates the probability that an equivalence class of size 1 in the * sample was chosen from an equivalence class of size 1 in the population * * @param classes * @param populationSize * @param sampleSize * @param numClasses * @return */ private double computeConditionalUniqueness(int[] classes, double populationSize, double sampleSize, double numClasses) { int numClassesOfSize1 = classes[0] == 1 ? classes[1] : 0; double temp = 0; int param1 = (int) populationSize; if (populationSize > Integer.MAX_VALUE) { param1 = Integer.MAX_VALUE; // TODO: This is an error: overflow } int param2 = (int) sampleSize; if (sampleSize > Integer.MAX_VALUE) { param2 = Integer.MAX_VALUE; // TODO: This is an error: overflow } for (int i = 0; i < classes.length; i += 2) { int size = classes[i]; int count = classes[i + 1]; HypergeometricDistribution distribution = new HypergeometricDistribution(param1, size, param2); temp += (count / ((double) numClasses)) * distribution.probability(1); checkInterrupt(); } HypergeometricDistribution distribution = new HypergeometricDistribution(param1, 1, param2); return (((double) numClassesOfSize1 / ((double) numClasses)) * (distribution.probability(1))) / temp; } }