package com.thinkbiganalytics.spark.dataprofiler.columns;
/*-
* #%L
* thinkbig-spark-job-profiler-app
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration;
import com.thinkbiganalytics.spark.dataprofiler.model.MetricType;
import com.thinkbiganalytics.spark.dataprofiler.output.OutputRow;
import org.apache.spark.sql.types.StructField;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nonnull;
/**
* Class to hold profile statistics for columns of float data type <br>
* [Hive data type: FLOAT]
*/
@SuppressWarnings("serial")
public class FloatColumnStatistics extends StandardColumnStatistics {
/* Float specific metrics */
private float max;
private float min;
private double sum;
private double mean;
private double stddev;
private double variance;
/* Other variables */
private double sumOfSquares;
private long oldTotalCount;
private long oldNullCount;
private double oldMean;
private double oldStdDev;
private double oldSumOfSquares;
private float columnFloatValue;
/**
* One-argument constructor
*
* @param columnField field schema
*/
public FloatColumnStatistics(StructField columnField, @Nonnull final ProfilerConfiguration profilerConfiguration) {
super(columnField, profilerConfiguration);
max = Float.MIN_VALUE;
min = Float.MAX_VALUE;
sum = 0.0d;
mean = 0.0d;
stddev = 0.0d;
variance = 0.0d;
sumOfSquares = 0.0d;
oldTotalCount = 0L;
oldNullCount = 0L;
oldMean = 0.0d;
oldStdDev = 0.0d;
oldSumOfSquares = 0.0d;
columnFloatValue = 0.0f;
}
/**
* Calculate float-specific statistics by accommodating the value and frequency/count
*/
@Override
public void accomodate(Object columnValue, Long columnCount) {
accomodateCommon(columnValue, columnCount);
if (columnValue != null) {
columnFloatValue = Float.valueOf(String.valueOf(columnValue));
if (max < columnFloatValue) {
max = columnFloatValue;
}
if (min > columnFloatValue) {
min = columnFloatValue;
}
sum += (columnFloatValue * columnCount);
oldMean = 0.0d;
oldSumOfSquares = 0.0d;
for (int i = 1; i <= columnCount; i++) {
oldMean = mean;
oldSumOfSquares = sumOfSquares;
mean = oldMean + ((columnFloatValue - oldMean) / (totalCount - columnCount + i - nullCount));
sumOfSquares = oldSumOfSquares + ((columnFloatValue - mean) * (columnFloatValue - oldMean));
}
variance = sumOfSquares / (totalCount - nullCount);
stddev = Math.sqrt(variance);
}
}
/**
* Combine with another column statistics
*/
@Override
public void combine(StandardColumnStatistics v_columnStatistics) {
saveMetricsForStdDevCalc();
combineCommon(v_columnStatistics);
FloatColumnStatistics vFloat_columnStatistics = (FloatColumnStatistics) v_columnStatistics;
max = Math.max(max, vFloat_columnStatistics.max);
min = Math.min(min, vFloat_columnStatistics.min);
sum += vFloat_columnStatistics.sum;
mean = sum / (totalCount - nullCount);
double term1 = (totalCount - nullCount) * Math.pow(stddev, 2);
double term2 = (vFloat_columnStatistics.totalCount - vFloat_columnStatistics.nullCount) * Math.pow(vFloat_columnStatistics.stddev, 2);
double term3 = (totalCount - nullCount) * Math.pow((mean - vFloat_columnStatistics.mean), 2);
double term4 = (vFloat_columnStatistics.totalCount - vFloat_columnStatistics.nullCount) * Math.pow((vFloat_columnStatistics.mean - mean), 2);
double term5 = (totalCount - nullCount) + (vFloat_columnStatistics.totalCount - vFloat_columnStatistics.nullCount);
stddev = Math.sqrt((term1 + term2 + term3 + term4) / term5);
stddev = getCombinedStdDev(vFloat_columnStatistics);
variance = Math.pow(stddev, 2);
}
/*
* Save values for running statistical calculations
*/
private void saveMetricsForStdDevCalc() {
oldTotalCount = totalCount;
oldNullCount = nullCount;
oldMean = mean;
oldStdDev = stddev;
}
/*
* Get combined standard deviations from two standard deviations
*/
private double getCombinedStdDev(FloatColumnStatistics vFloat_columnStatistics) {
double meanComb = (
((oldTotalCount - oldNullCount) * oldMean) +
((vFloat_columnStatistics.totalCount - vFloat_columnStatistics.nullCount) * vFloat_columnStatistics.mean)
)
/ ((oldTotalCount - oldNullCount) + (vFloat_columnStatistics.totalCount - vFloat_columnStatistics.nullCount));
double term1 = (oldTotalCount - oldNullCount) * Math.pow(oldStdDev, 2);
double term2 = (vFloat_columnStatistics.totalCount - vFloat_columnStatistics.nullCount) * Math.pow(vFloat_columnStatistics.stddev, 2);
double term3 = (oldTotalCount - oldNullCount) * Math.pow((oldMean - meanComb), 2);
double term4 = (vFloat_columnStatistics.totalCount - vFloat_columnStatistics.nullCount) * Math.pow((vFloat_columnStatistics.mean - meanComb), 2);
double term5 = (oldTotalCount - oldNullCount) + (vFloat_columnStatistics.totalCount - vFloat_columnStatistics.nullCount);
return (Math.sqrt((term1 + term2 + term3 + term4) / term5));
}
/**
* Print statistics to console
*/
@Override
public String getVerboseStatistics() {
return "{\n" + getVerboseStatisticsCommon()
+ "\n"
+ "FloatColumnStatistics ["
+ "max=" + max
+ ", min=" + min
+ ", sum=" + sum
+ ", mean=" + df.format(mean)
+ ", stddev=" + df.format(stddev)
+ ", variance=" + df.format(variance)
+ "]\n}";
}
/**
* Write statistics for output result table
*/
@Override
public List<OutputRow> getStatistics() {
final List<OutputRow> rows = new ArrayList<>();
writeStatisticsCommon(rows);
if (allNulls()) {
min = 0;
max = 0;
sum = 0;
mean = 0;
stddev = 0;
variance = 0;
}
rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MAX), String.valueOf(max)));
rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MIN), String.valueOf(min)));
rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.SUM), String.valueOf(sum)));
rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MEAN), String.valueOf(mean)));
rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.STDDEV), String.valueOf(stddev)));
rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.VARIANCE), String.valueOf(variance)));
return rows;
}
/**
* Get maximum value
*
* @return max value
*/
public float getMax() {
return max;
}
/**
* Get minimum value
*
* @return min value
*/
public float getMin() {
return min;
}
/**
* Get sum
*
* @return sum
*/
public double getSum() {
return sum;
}
/**
* Get mean (average)
*
* @return mean
*/
public double getMean() {
return mean;
}
/**
* Get standard deviation (population)
*
* @return standard deviation (population)
*/
public double getStddev() {
return stddev;
}
/**
* Get variance (population)
*
* @return variance (population)
*/
public double getVariance() {
return variance;
}
}