package com.thinkbiganalytics.spark.dataprofiler.columns; /*- * #%L * thinkbig-spark-job-profiler-app * %% * Copyright (C) 2017 ThinkBig Analytics * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration; import com.thinkbiganalytics.spark.dataprofiler.model.MetricType; import com.thinkbiganalytics.spark.dataprofiler.output.OutputRow; import org.apache.spark.sql.types.StructField; import java.util.ArrayList; import java.util.List; import javax.annotation.Nonnull; /** * Class to hold profile statistics for columns of short data type <br> * [Hive data type: SMALLINT] */ @SuppressWarnings("serial") public class ShortColumnStatistics extends StandardColumnStatistics { /* Short specific metrics */ private short max; private short min; private long sum; private double mean; private double stddev; private double variance; /* Other variables */ private double sumOfSquares; private long oldTotalCount; private long oldNullCount; private double oldMean; private double oldStdDev; private double oldSumOfSquares; private short columnShortValue; /** * One-argument constructor * * @param columnField field schema */ public ShortColumnStatistics(StructField columnField, @Nonnull final ProfilerConfiguration profilerConfiguration) { super(columnField, profilerConfiguration); max = Short.MIN_VALUE; min = Short.MAX_VALUE; sum = 0L; mean = 0.0d; stddev = 0.0d; variance = 0.0d; sumOfSquares = 0.0d; oldTotalCount = 0L; oldNullCount = 0L; oldMean = 0.0d; oldStdDev = 0.0d; oldSumOfSquares = 0.0d; columnShortValue = 0; } /** * Calculate short-specific statistics by accommodating the value and frequency/count */ @Override public void accomodate(Object columnValue, Long columnCount) { accomodateCommon(columnValue, columnCount); if (columnValue != null) { columnShortValue = Short.valueOf(String.valueOf(columnValue)); if (max < columnShortValue) { max = columnShortValue; } if (min > columnShortValue) { min = columnShortValue; } sum += (columnShortValue * columnCount); oldMean = 0.0d; oldSumOfSquares = 0.0d; for (int i = 1; i <= columnCount; i++) { oldMean = mean; oldSumOfSquares = sumOfSquares; mean = oldMean + ((columnShortValue - oldMean) / (totalCount - columnCount + i - nullCount)); sumOfSquares = oldSumOfSquares + ((columnShortValue - mean) * (columnShortValue - oldMean)); } variance = sumOfSquares / (totalCount - nullCount); stddev = Math.sqrt(variance); } } /** * Combine with another column statistics */ @Override public void combine(StandardColumnStatistics v_columnStatistics) { saveMetricsForStdDevCalc(); combineCommon(v_columnStatistics); ShortColumnStatistics vShort_columnStatistics = (ShortColumnStatistics) v_columnStatistics; max = (short) Math.max(max, vShort_columnStatistics.max); min = (short) Math.min(min, vShort_columnStatistics.min); sum += vShort_columnStatistics.sum; mean = (double) sum / (totalCount - nullCount); stddev = getCombinedStdDev(vShort_columnStatistics); variance = Math.pow(stddev, 2); } /* * Save values for running statistical calculations */ private void saveMetricsForStdDevCalc() { oldTotalCount = totalCount; oldNullCount = nullCount; oldMean = mean; oldStdDev = stddev; } /* * Get combined standard deviations from two standard deviations */ private double getCombinedStdDev(ShortColumnStatistics vShort_columnStatistics) { double meanComb = ( ((oldTotalCount - oldNullCount) * oldMean) + ((vShort_columnStatistics.totalCount - vShort_columnStatistics.nullCount) * vShort_columnStatistics.mean) ) / ((oldTotalCount - oldNullCount) + (vShort_columnStatistics.totalCount - vShort_columnStatistics.nullCount)); double term1 = (oldTotalCount - oldNullCount) * Math.pow(oldStdDev, 2); double term2 = (vShort_columnStatistics.totalCount - vShort_columnStatistics.nullCount) * Math.pow(vShort_columnStatistics.stddev, 2); double term3 = (oldTotalCount - oldNullCount) * Math.pow((oldMean - meanComb), 2); double term4 = (vShort_columnStatistics.totalCount - vShort_columnStatistics.nullCount) * Math.pow((vShort_columnStatistics.mean - meanComb), 2); double term5 = (oldTotalCount - oldNullCount) + (vShort_columnStatistics.totalCount - vShort_columnStatistics.nullCount); return (Math.sqrt((term1 + term2 + term3 + term4) / term5)); } /** * Print statistics to console */ @Override public String getVerboseStatistics() { return "{\n" + getVerboseStatisticsCommon() + "\n" + "ShortColumnStatistics [" + "max=" + max + ", min=" + min + ", sum=" + sum + ", mean=" + df.format(mean) + ", stddev=" + df.format(stddev) + ", variance=" + df.format(variance) + "]\n}"; } /** * Write statistics for output result table */ @Override public List<OutputRow> getStatistics() { final List<OutputRow> rows = new ArrayList<>(); writeStatisticsCommon(rows); if (allNulls()) { min = 0; max = 0; sum = 0; mean = 0; stddev = 0; variance = 0; } rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MAX), String.valueOf(max))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MIN), String.valueOf(min))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.SUM), String.valueOf(sum))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MEAN), String.valueOf(mean))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.STDDEV), String.valueOf(stddev))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.VARIANCE), String.valueOf(variance))); return rows; } /** * Get maximum value * * @return max value */ public short getMax() { return max; } /** * Get minimum value * * @return min value */ public short getMin() { return min; } /** * Get sum * * @return sum */ public long getSum() { return sum; } /** * Get mean (average) * * @return mean */ public double getMean() { return mean; } /** * Get standard deviation (population) * * @return standard deviation (population) */ public double getStddev() { return stddev; } /** * Get variance (population) * * @return variance (population) */ public double getVariance() { return variance; } }