/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.example.table.internal; import java.io.Serializable; import java.util.Arrays; import com.rapidminer.example.Tools; /** * Super class for sparse chunks with medium sparsity. Stores only values different from the default * value. {@link #set(int, double)} returns {@code true} if the chunk is filled more than * {@link #getMaximalDensity(int)}. Should only be extended by two classes in order to be fast. * * @author Jan Czogalla * @since 7.3.1 */ abstract class AbstractMediumSparsityChunk implements Serializable { private static final long serialVersionUID = 1L; private static final int MIN_NON_EMPTY_SIZE = 8; /** * the maximal row that can occur. This is a power of 2 minus 1, so instead of * {@code % (MAX_SIZE+1)} we can do {@code & MAX_SIZE}. */ private static final int MAX_SIZE = AutoColumnUtils.CHUNK_SIZE - 1; private int[] indices = AutoColumnUtils.EMPTY_INTEGER_ARRAY; protected int valueCount; private double defaultValue; private int ensuredCount; private byte[] bloomFilter = AutoColumnUtils.EMPTY_BYTE_ARRAY; /** * 2^(AbstractAutoColumn.CHUNK_SIZE_EXP - bloomShift) = number of bits in bloomFilter = * bloomFilter.length * 2^3 * * This constant is adjusted by {@link #growBloomFilter()} so that this stays true when the * bloomFilter grows */ private int bloomShift = AutoColumnUtils.CHUNK_SIZE_EXP - 3 + 1; private int bloomMult = (int) Math.pow(2, bloomShift) + 1; AbstractMediumSparsityChunk(double defaultValue) { this.defaultValue = defaultValue; } /** * Returns the value stored for this row. * * @param row * the row for which to obtain the stored value * @return the value stored for row */ public final double get(int row) { int index = getIndex(row, true); return index < 0 ? defaultValue : getValue(index); } /** * Sets the value for the given row. Returns {@code true} if after this set the sparse chunk is * too full, i.e. its density is bigger than * {@link AutoColumnUtils#THRESHOLD_HIGH_SPARSITY_MAXIMAL_DENSITY}. Note that the density check only * works if the total size was {@link #ensure}d before. * * @param row * the row for which to set the value * @param value * the value to store * @return {@code true} if the maximal density is reached */ public final boolean set(int row, double value) { if (Tools.isDefault(defaultValue, value)) { int index = getIndex(row, true); // index not set, default value => do nothing if (index < 0) { return false; } // remove existing index removeIndex(index); return false; } boolean tooFull = false; int index = getIndex(row, false); if (index < 0) { // insert new index // see Arrays.binarySearch index = -index - 1; insertIndex(index); // check density if (valueCount / (double) ensuredCount > getMaximalDensity(row)) { tooFull = true; } // fill bloom filter int hash = hashForBloom(row); bloomFilter[hash >>> 3] |= 1 << (hash & 7); } // set index indices[index] = row; // set value in base column setValue(index, value); return tooFull; } /** * Returns the maximal density of this chunk depending on whether it stores integer or double * values. * * @return the maximal density for this chunk */ protected abstract double getMaximalDensity(int row); /** * The index returned by binary search or -1 if lookup is {@code true} and mayContain returns * {@link false}. Returns only a positive number if the row was found in {@link #indices}. If * called with lookup {@code false} then a returned negative index encodes where to insert this * new row (see {@link Arrays#binarySearch}). * * @param row * the row to search for * @param lookup * whether to look in the bloom filter, use {@code true} if the exact negative index * is not necessary * @return the index where row is found or a negative index */ private int getIndex(int row, boolean lookup) { // if no row inserted yet, return // bloom filter first, if only lookup if (valueCount == 0 || lookup && !mayContain(row)) { return -1; } // if new row is bigger than the biggest or no row inserted yet, // no binary search is necessary if (row > indices[valueCount - 1]) { return -valueCount - 1; } return Arrays.binarySearch(indices, 0, valueCount, row); } /** * Sets the total size. * * @param size * the expected size */ public final void ensure(int size) { ensuredCount = size; } /** * Checks the bloom filter if the set of rows might contain this row. * * @param row * the row to look for * @return {@code false} if this row was never added before, {@code true} if it is unknown */ private boolean mayContain(int row) { int hash = hashForBloom(row); return (bloomFilter[hash >>> 3] & 1 << (hash & 7)) != 0; } /** * Calculates as hash value for row using the multiply-shift hash of Dietzfelbinger. */ private int hashForBloom(int row) { // h: [2^w] -> [2^l], h(x) = (a*x mod 2^w) / 2^(w-l) = (a*x & (2^w -1))>>>(w-l) return (bloomMult * row & MAX_SIZE) >>> bloomShift; } /** * Grows and shifts the indices and value arrays so that there is a new place at index. */ private void insertIndex(int index) { int[] tmp = checkedGrow(); AutoColumnUtils.copy(indices, tmp, index, index, index + 1, valueCount); indices = tmp; insertValueIndex(index, indices.length); valueCount++; } /** * Removes the given index from the indices and value arrays. */ private void removeIndex(int index) { int[] tmp = checkedShrink(); AutoColumnUtils.copy(indices, tmp, index, index + 1, index, tmp.length); indices = tmp; removeValueIndex(index, indices.length); // overwrite duplicate last row with MAX_VALUE indices[indices.length - 1] = Integer.MAX_VALUE; valueCount--; } /** * Enlarges the {@link #indices} array if necessary. */ private int[] checkedGrow() { growBloomFilter(); int length = indices.length; if (valueCount < length) { return indices; } // grow int newLength = length == 0 ? MIN_NON_EMPTY_SIZE : length + (length >> 1); return new int[newLength]; } /** * Ensures that the {@link #bloomFilter} contains at least twice as many bits as the value * count. If growing the bloom filter is necessary the values for the hash function are * recalculated and the bloom filter is rehashed. */ private void growBloomFilter() { int length = bloomFilter.length; // maintain 1:2 relation for inserted indices vs bloom filter size if (valueCount >> 2 < length) { return; } length = length == 0 ? 1 : length << 1; bloomFilter = new byte[length]; bloomShift--; bloomMult = (int) Math.pow(2, bloomShift) + 1; // rehash bloom filter for (int i = 0; i < valueCount; i++) { int hash = hashForBloom(indices[i]); bloomFilter[hash >>> 3] |= 1 << (hash & 7); } } /** * Checks if the {@link #indices} array is too empty and shrinks it if necessary. */ private int[] checkedShrink() { int length = indices.length; if (length >> 1 >= MIN_NON_EMPTY_SIZE && valueCount - 1 <= length >> 2) { // shrink return new int[length >> 1]; } return indices; } /** * Removes the given index from the values array and sets its length. * * @param index * the index to remove * @param length * the desired array length */ abstract void removeValueIndex(int index, int length); /** * Inserts a new place in the values array at the given index and ensures that the array has the * given length. * * @param index * the index to insert * @param length * the desired array length */ abstract void insertValueIndex(int index, int length); /** * Returns the value stored at the given index. * * @param index * the index to look up * @return the value for the index */ abstract double getValue(int index); /** * Sets the value at position index of the values array. * * @param index * the index where to set the value * @param value * the value to store */ abstract void setValue(int index, double value); }