/* * ARX: Powerful Data Anonymization * Copyright 2012 - 2017 Fabian Prasser, Florian Kohlmayer and contributors * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.deidentifier.arx.framework.check.distribution; import java.util.Arrays; import org.deidentifier.arx.framework.check.groupify.HashTableUtil; /** * This class can be utilized to track the distributions of values. It is backed by a hash table * implementing open addressing with linear probing. * * @author Fabian Prasser * @author Florian Kohlmayer */ public class Distribution { /** The size. */ private int size; /** The threshold used for rehashing. */ private int threshold; /** The elements. Even index contains value, odd index contains frequency */ private int[] elements; /** The sorted element array - used for history entries only. */ private int[] packedElements; /** The sorted frequency array - used for history entries only. */ private int[] packedFrequencies; /** The loadfactor. */ private final static float LOADFACTOR = 0.75f; /** The initial default capacity of the hashtable. */ private static final int DEFAULT_CAPACITY = 8; // power of two /** * Default constructor. */ public Distribution() { this(DEFAULT_CAPACITY); } /** * Constructor used to create frequency set from a history entry. * * @param element * @param frequency */ public Distribution(final int[] element, final int[] frequency) { this(element.length); for (int i = 0; i < element.length; i++) { if (element[i] != -1) { this.add(element[i], frequency[i]); } } } /** * Constructor using next power of two starting at capacity as initial * capacity. * * @param capacity */ private Distribution(int capacity) { capacity = HashTableUtil.calculateCapacity(capacity); size = 0; elements = new int[capacity << 1]; Arrays.fill(elements, -1); threshold = HashTableUtil.calculateThreshold(capacity, LOADFACTOR); } /** * Adds a element to the hashtable. Frequency value 1. * * @param element */ public final void add(final int element) { this.add(element, 1); } /** * Clears the table. */ public void clear() { Arrays.fill(elements, -1); size = 0; } /** * Gets all buckets of the hash table. * * @return */ public int[] getBuckets() { return elements; } /** * Gets all elements of the packed table. * * @return */ public int[] getPackedElements() { return packedElements; } /** * Gets the frequency of the packed table. * * @return */ public int[] getPackedFrequency() { return packedFrequencies; } /** * Merges two frequency sets. * * @param other */ public void merge(final Distribution other) { final int[] otherElements = other.elements; for (int i = 0; i < otherElements.length; i += 2) { if (otherElements[i] != -1) { this.add(otherElements[i], otherElements[i + 1]); } } } /** * Merge a frequency set with a history entry. * * @param elements * @param frequency */ public void merge(final int[] elements, final int[] frequency) { for (int i = 0; i < elements.length; i++) { if (elements[i] != -1) { this.add(elements[i], frequency[i]); } } } /** * Packs the frequency table; removes null values and generates * sortedElements and sortedFrequency arrays. In case a collission occured * this method also sorts the elements. */ public void pack() { final int[] sortedelements = new int[size]; final int[] sortedfrequency = new int[size]; if (size > 0) { // compress & copy int count = 0; for (int i = 0; i < elements.length; i += 2) { if (elements[i] != -1) { // bucket not empty sortedelements[count] = elements[i]; sortedfrequency[count] = elements[i + 1]; count++; } } } this.packedElements = sortedelements; this.packedFrequencies = sortedfrequency; } /** * Gets the current size. * * @return */ public int size() { return size; } @Override public String toString() { StringBuilder builder = new StringBuilder(); builder.append("Distribution ["); boolean first = true; for (int i=0; i<elements.length; i+=2) { if (elements[i] != -1) { builder.append(first ? "" : ","); builder.append(elements[i]).append("=").append(elements[i+1]); first = false; } } builder.append("]"); return builder.toString(); } /** * Adds an element with the given frequency. * * @param element * @param value */ private void add(final int element, final int value) { final int mask = (elements.length - 1); int index = (element & ((elements.length >> 1) - 1)) << 1; // start at home bucket while (true) { if (elements[index] == -1) { // empty bucket, not found elements[index] = element; elements[index + 1] = value; size++; if (size > threshold) { rehash(); } break; } else if (elements[index] == element) { // element found elements[index + 1] += value; break; } index = (index + 2) & mask; // next bucket } } /** * Rehashes the frequency set table. */ private void rehash() { final int capacity = HashTableUtil.calculateCapacity(elements.length); final int[] newelements = new int[capacity << 1]; Arrays.fill(newelements, -1); final int mask = (newelements.length - 1); for (int i = 0; i < elements.length; i += 2) { if (elements[i] != -1) { // bucket not empty int index = (elements[i] & ((newelements.length >> 1) - 1)) << 1; while (true) { if (newelements[index] == -1) { // empty bucket, not found newelements[index] = elements[i]; newelements[index + 1] = elements[i + 1]; break; } index = (index + 2) & mask; // next bucket } } } threshold = (int) (capacity * LOADFACTOR); elements = newelements; } }