package com.thinkbiganalytics.spark.dataprofiler.columns; /*- * #%L * thinkbig-spark-job-profiler-app * %% * Copyright (C) 2017 ThinkBig Analytics * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.thinkbiganalytics.spark.dataprofiler.ColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration; import com.thinkbiganalytics.spark.dataprofiler.model.MetricType; import com.thinkbiganalytics.spark.dataprofiler.output.OutputRow; import com.thinkbiganalytics.spark.dataprofiler.topn.TopNDataItem; import com.thinkbiganalytics.spark.dataprofiler.topn.TopNDataList; import org.apache.spark.sql.types.StructField; import java.io.Serializable; import java.text.DecimalFormat; import java.util.List; import javax.annotation.Nonnull; /** * Class to hold common profile statistics for columns of all data types */ @SuppressWarnings("serial") public abstract class StandardColumnStatistics implements ColumnStatistics, Serializable { /* Schema information for column */ final StructField columnField; /* Other variables */ final DecimalFormat df; private final TopNDataList topNValues; /* Common metrics for all data types */ long nullCount; long totalCount; private long uniqueCount; private double percNullValues; private double percUniqueValues; private double percDuplicateValues; private ProfilerConfiguration profilerConfiguration; /** * One-argument constructor * * @param columnField field schema */ protected StandardColumnStatistics(StructField columnField, @Nonnull final ProfilerConfiguration profilerConfiguration) { this.columnField = columnField; nullCount = 0; totalCount = 0; uniqueCount = 0; percNullValues = 0.0d; percUniqueValues = 0.0d; percDuplicateValues = 0.0d; this.profilerConfiguration = profilerConfiguration; topNValues = new TopNDataList(profilerConfiguration.getNumberOfTopNValues()); df = new DecimalFormat(getDecimalFormatPattern()); } /** * Calculate common statistics by accommodating the value and frequency/count * * @param columnValue value * @param columnCount frequency/count */ void accomodateCommon(Object columnValue, Long columnCount) { totalCount += columnCount; uniqueCount += 1; if (columnValue == null) { nullCount += columnCount; } doPercentageCalculationsCommon(); topNValues.add(columnValue, columnCount); } /** * Combine with another column statistics * * @param v_columnStatistics column statistics to combine with */ void combineCommon(StandardColumnStatistics v_columnStatistics) { totalCount += v_columnStatistics.totalCount; uniqueCount += v_columnStatistics.uniqueCount; nullCount += v_columnStatistics.nullCount; doPercentageCalculationsCommon(); for (TopNDataItem dataItem : v_columnStatistics.topNValues.getTopNDataItemsForColumn()) { topNValues.add(dataItem.getValue(), dataItem.getCount()); } } /** * Write column's schema information for output result table */ private void writeColumnSchemaInformation(@Nonnull final List<OutputRow> rows) { rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.COLUMN_DATATYPE), String.valueOf(columnField.dataType()))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.COLUMN_NULLABLE), String.valueOf(columnField.nullable()))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.COLUMN_METADATA), String.valueOf(columnField.metadata()))); } /** * Print column's schema information to console * * @return schema information */ private String getVerboseColumnSchemaInformation() { return "ColumnInfo [" + "name=" + columnField.name() + ", datatype=" + columnField.dataType().simpleString() + ", nullable=" + columnField.nullable() + ", metadata=" + columnField.metadata() + "]"; } /** * Write top n rows in column for output result table */ private void writeTopNInformation(@Nonnull final List<OutputRow> rows) { rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.TOP_N_VALUES), topNValues.printTopNItems())); } /** * Print top n rows in column to console * * @return top n rows */ private String getVerboseTopNInformation() { return "Top " + profilerConfiguration.getNumberOfTopNValues() + " values [\n" + topNValues.printTopNItems() + "]"; } /** * Write common statistics information for output result table */ void writeStatisticsCommon(@Nonnull final List<OutputRow> rows) { writeColumnSchemaInformation(rows); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.NULL_COUNT), String.valueOf(nullCount))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.TOTAL_COUNT), String.valueOf(totalCount))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.UNIQUE_COUNT), String.valueOf(uniqueCount))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.PERC_NULL_VALUES), df.format(percNullValues))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.PERC_UNIQUE_VALUES), df.format(percUniqueValues))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.PERC_DUPLICATE_VALUES), df.format(percDuplicateValues))); writeTopNInformation(rows); } /** * Print common statistics information to console * * @return common statistics */ String getVerboseStatisticsCommon() { return getVerboseColumnSchemaInformation() + "\n" + "CommonStatistics [" + "nullCount=" + nullCount + ", totalCount=" + totalCount + ", uniqueCount=" + uniqueCount + ", percNullValues=" + df.format(percNullValues) + ", percUniqueValues=" + df.format(percUniqueValues) + ", percDuplicateValues=" + df.format(percDuplicateValues) + "]" + "\n" + getVerboseTopNInformation(); } /* * Do percentage calculations for common metrics */ private void doPercentageCalculationsCommon() { percNullValues = ((double) nullCount / totalCount) * 100; percUniqueValues = ((double) uniqueCount / totalCount) * 100; percDuplicateValues = 100.0d - percUniqueValues; } /* * Build format to display decimals up to configured number of digits */ private String getDecimalFormatPattern() { StringBuilder format = new StringBuilder(); format.append("#."); for (int i = 0; i < profilerConfiguration.getDecimalDigitsToDisplayConsoleOutput(); i++) { format.append("#"); } return format.toString(); } /** * Get null count * * @return null count */ public long getNullCount() { return nullCount; } /** * Get total count (includes nulls and empty values) * * @return total count */ public long getTotalCount() { return totalCount; } /** * Get unique count (null and empty are considered a unique value each) * * @return unique count */ public long getUniqueCount() { return uniqueCount; } /** * Get percentage of null values * * @return percentage of null values */ public double getPercNullValues() { return percNullValues; } /** * Get percentage of unique values * * @return percentage of unique values */ public double getPercUniqueValues() { return percUniqueValues; } /** * Get percentage of duplicate values * * @return percentage of duplicate values */ public double getPercDuplicateValues() { return percDuplicateValues; } /** * Get top n values (in order of frequency) * * @return top n values */ public TopNDataList getTopNValues() { return topNValues; } /* * Methods to be implemented by data type specific column statistics classes that: * 1) extend this class * 2) may implement additional metrics * */ public abstract void accomodate(Object columnValue, Long columnCount); public abstract void combine(StandardColumnStatistics v_columnStatistics); public abstract List<OutputRow> getStatistics(); public abstract String getVerboseStatistics(); /* * Check if all values are null */ boolean allNulls() { return (totalCount == nullCount); } }