/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.example.table.internal;
import java.util.Arrays;
/**
* Utility class for auto column calculations.
*
* @author Gisa Schaefer
* @since 7.3.1
*/
final class AutoColumnUtils {
/** Chunks of 128MB of double or 64MB of integer values */
static final int CHUNK_SIZE_EXP = 24;
/** size of a chunk, always a power of 2 */
static final int CHUNK_SIZE = 1 << CHUNK_SIZE_EXP;
static final int CHUNK_MODULO_MASK = -1 >>> 32 - CHUNK_SIZE_EXP;
/** the maximal number of chunks per column */
static final int MAXIMAL_CHUNKS = Integer.MAX_VALUE / CHUNK_SIZE + 1;
/** determines after how many values the check for sparse is done */
static final int THRESHOLD_CHECK_FOR_SPARSE = 2048;
/** the threshold densitity to change to sparse representation in auto mode */
static final double THRESHOLD_HIGH_SPARSITY_DENSITY = 0.01;
/**
* the threshold densitity to change to sparse representation in memory-optimized mode for
* double values
*/
static final double THRESHOLD_DOUBLE_MEDIUM_SPARSITY_DENSITY = 0.5;
/**
* the threshold densitity to change to sparse representation in memory-optimized mode for
* integer values
*/
static final double THRESHOLD_INTEGER_MEDIUM_SPARSITY_DENSITY = 0.4;
/** the maximal density a sparse column in auto mode should have */
static final double THRESHOLD_HIGH_SPARSITY_MAXIMAL_DENSITY = 0.02;
/** the maximal density a sparse double column in memory-optimized mode should have */
static final double THRESHOLD_DOUBLE_MEDIUM_SPARSITY_MAXIMAL_DENSITY = 0.55;
/** the maximal density a sparse integer column in memory-optimized mode should have */
static final double THRESHOLD_INTEGER_MEDIUM_SPARSITY_MAXIMAL_DENSITY = 0.45;
/** empty double array to use for empty chunks */
static final double[] EMPTY_DOUBLE_ARRAY = new double[0];
/** empty int array to use for empty chunks */
static final int[] EMPTY_INTEGER_ARRAY = new int[0];
/** empty byte array to use for empty chunks */
static final byte[] EMPTY_BYTE_ARRAY = new byte[0];
/** the missing values for integers */
static final int INTEGER_NAN = Integer.MIN_VALUE;
private AutoColumnUtils() {
// Utility class constructor
}
/**
* Container for the result of {@link AutoColumnUtils#checkDensity(double[])}. Stores the
* density and the most frequent value.
*
* @author Gisa Schaefer
*/
static class DensityResult {
final double density;
final double mostFrequentValue;
DensityResult(double density, double mostFrequentValue) {
this.density = density;
this.mostFrequentValue = mostFrequentValue;
}
}
/**
* Calculates the most frequent value in the first {@link #THRESHOLD_CHECK_FOR_SPARSE} entries
* of the data array and the density of the other values.
*
* @param data
* the array to check
* @return the density of the other values and the most frequent value
*/
static DensityResult checkDensity(double[] data) {
int length = AutoColumnUtils.THRESHOLD_CHECK_FOR_SPARSE;
double[] sorted = Arrays.copyOf(data, length);
Arrays.sort(sorted);
double mostFrequent = sorted[0];
int mostFrequentCount = 1;
double currentValue = mostFrequent;
int currentCount = mostFrequentCount;
for (int i = 1; i < length; i++) {
if (sorted[i] == currentValue) {
currentCount++;
if (currentCount > mostFrequentCount) {
mostFrequentCount = currentCount;
mostFrequent = currentValue;
}
} else {
currentValue = sorted[i];
currentCount = 1;
}
}
double density = 1.0 - mostFrequentCount / (double) length;
return new DensityResult(density, mostFrequent);
}
/**
* Calculates the most frequent value in the in the first {@link #THRESHOLD_CHECK_FOR_SPARSE}
* entries of the data array and the density of the other values.
*
* @param data
* the array to check
* @return the density of the other values and the most frequent value
*/
static DensityResult checkDensity(int[] data) {
int length = AutoColumnUtils.THRESHOLD_CHECK_FOR_SPARSE;
int[] sorted = Arrays.copyOf(data, length);
Arrays.sort(sorted);
int mostFrequent = sorted[0];
int mostFrequentCount = 1;
int currentValue = mostFrequent;
int currentCount = mostFrequentCount;
for (int i = 1; i < length; i++) {
if (sorted[i] == currentValue) {
currentCount++;
if (currentCount > mostFrequentCount) {
mostFrequentCount = currentCount;
mostFrequent = currentValue;
}
} else {
currentValue = sorted[i];
currentCount = 1;
}
}
double density = 1.0 - mostFrequentCount / (double) length;
return new DensityResult(density, mostFrequent == INTEGER_NAN ? Double.NaN : mostFrequent);
}
/**
* Copies the end of src after srcOff to dest starting at destOff. Copies the beginning of src
* until index to dest until if src and dest are not the same array.
*
* @param src
* the source array
* @param dest
* the destination array
* @param index
* the index until which dest should be the same as source
* @param srcOff
* from where on to copy the end of src
* @param destOff
* from where on to paste the end of src into dest
* @param length
* the length to which src is filled
*/
static void copy(Object src, Object dest, int index, int srcOff, int destOff, int length) {
// copy indices after index if exist
if (length - srcOff > 0) {
System.arraycopy(src, srcOff, dest, destOff, length - srcOff);
}
if (src != dest && index != 0) {
// copy indices before index
System.arraycopy(src, 0, dest, 0, index);
}
}
}