package com.thinkbiganalytics.spark.dataprofiler.columns; /*- * #%L * thinkbig-spark-job-profiler-app * %% * Copyright (C) 2017 ThinkBig Analytics * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration; import com.thinkbiganalytics.spark.dataprofiler.model.MetricType; import com.thinkbiganalytics.spark.dataprofiler.output.OutputRow; import org.apache.spark.sql.types.StructField; import java.util.ArrayList; import java.util.List; import javax.annotation.Nonnull; /** * Class to hold profile statistics for columns of string data type <br> * [Hive data types: STRING, VARCHAR] */ @SuppressWarnings("serial") public class StringColumnStatistics extends StandardColumnStatistics { /* String specific metrics */ private int maxLength; private int minLength; private String longestString; private String shortestString; private long emptyCount; private double percEmptyValues; private String minStringCase; private String maxStringCase; private String minStringICase; private String maxStringICase; /* Other variables */ private String columnStringValue; private int columnStringLength; private boolean initializationFlag; /** * One-argument constructor * * @param columnField field schema */ public StringColumnStatistics(StructField columnField, @Nonnull final ProfilerConfiguration profilerConfiguration) { super(columnField, profilerConfiguration); maxLength = Integer.MIN_VALUE; minLength = Integer.MAX_VALUE; String EMPTY_STRING = ""; longestString = EMPTY_STRING; shortestString = EMPTY_STRING; emptyCount = 0; percEmptyValues = 0.0d; initializationFlag = false; minStringCase = EMPTY_STRING; maxStringCase = EMPTY_STRING; minStringICase = EMPTY_STRING; maxStringICase = EMPTY_STRING; columnStringValue = EMPTY_STRING; columnStringLength = 0; } /** * Calculate string-specific statistics by accommodating the value and frequency/count */ @Override public void accomodate(Object columnValue, Long columnCount) { accomodateCommon(columnValue, columnCount); if (columnValue != null) { columnStringValue = String.valueOf(columnValue); columnStringLength = columnStringValue.length(); if (maxLength < columnStringLength) { maxLength = columnStringLength; longestString = columnStringValue; } /* Empty strings not considered for: * - minLength, shortestString metrics * - minStringCase, maxStringCase, minStringICase, maxStringICase metrics */ if (!columnStringValue.isEmpty()) { if (minLength > columnStringLength) { minLength = columnStringLength; shortestString = columnStringValue; } if (!initializationFlag) { minStringCase = columnStringValue; maxStringCase = columnStringValue; minStringICase = columnStringValue; maxStringICase = columnStringValue; initializationFlag = true; } else { if (minStringCase.compareTo(columnStringValue) > 0) { minStringCase = columnStringValue; } if (maxStringCase.compareTo(columnStringValue) < 0) { maxStringCase = columnStringValue; } if (minStringICase.compareToIgnoreCase(columnStringValue) > 0) { minStringICase = columnStringValue; } if (maxStringICase.compareToIgnoreCase(columnStringValue) < 0) { maxStringICase = columnStringValue; } } } if (columnStringValue.isEmpty()) { emptyCount += columnCount; } doPercentageCalculations(); } } /** * Combine with another column statistics */ @Override public void combine(StandardColumnStatistics v_columnStatistics) { combineCommon(v_columnStatistics); StringColumnStatistics vString_columnStatistics = (StringColumnStatistics) v_columnStatistics; maxLength = Math.max(maxLength, vString_columnStatistics.maxLength); /* Empty strings not considered for: * - minLength, shortestString metrics * - minStringCase, maxStringCase, minStringICase, maxStringICase metrics */ if ((minLength != Integer.MAX_VALUE) && (vString_columnStatistics.minLength != Integer.MAX_VALUE)) { if (minLength > vString_columnStatistics.minLength) { minLength = vString_columnStatistics.minLength; shortestString = vString_columnStatistics.shortestString; } } else if (minLength == Integer.MAX_VALUE) { minLength = vString_columnStatistics.minLength; shortestString = vString_columnStatistics.shortestString; } /* kept for readability else if (vString_columnStatistics.minLength == Integer.MAX_VALUE) { //no operation }*/ else { minLength = 0; } if ((initializationFlag) && (vString_columnStatistics.initializationFlag)) { if (minStringCase.compareTo(vString_columnStatistics.minStringCase) > 0) { minStringCase = vString_columnStatistics.minStringCase; } if (maxStringCase.compareTo(vString_columnStatistics.maxStringCase) < 0) { maxStringCase = vString_columnStatistics.maxStringCase; } if (minStringICase.compareToIgnoreCase(vString_columnStatistics.minStringICase) > 0) { minStringICase = vString_columnStatistics.minStringICase; } if (maxStringICase.compareToIgnoreCase(vString_columnStatistics.maxStringICase) < 0) { maxStringICase = vString_columnStatistics.maxStringICase; } } else if (!initializationFlag) { minStringCase = vString_columnStatistics.minStringCase; maxStringCase = vString_columnStatistics.maxStringCase; minStringICase = vString_columnStatistics.minStringICase; maxStringICase = vString_columnStatistics.maxStringICase; } /* kept for readability else if (!vString_columnStatistics.initializationFlag) { //no operation. } else { //no operation. } */ if (longestString.length() < vString_columnStatistics.longestString.length()) { longestString = vString_columnStatistics.longestString; } emptyCount += vString_columnStatistics.emptyCount; doPercentageCalculations(); } /* * Calculate percentage metrics */ private void doPercentageCalculations() { percEmptyValues = ((double) emptyCount / totalCount) * 100; } /** * Write statistics for output result table */ @Override public List<OutputRow> getStatistics() { final List<OutputRow> rows = new ArrayList<>(); writeStatisticsCommon(rows); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MAX_LENGTH), String.valueOf(maxLength))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MIN_LENGTH), String.valueOf(minLength))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.LONGEST_STRING), String.valueOf(longestString))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.SHORTEST_STRING), String.valueOf(shortestString))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.EMPTY_COUNT), String.valueOf(emptyCount))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.PERC_EMPTY_VALUES), df.format(percEmptyValues))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MIN_STRING_CASE), String.valueOf(minStringCase))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MAX_STRING_CASE), String.valueOf(maxStringCase))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MIN_STRING_ICASE), String.valueOf(minStringICase))); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MAX_STRING_ICASE), String.valueOf(maxStringICase))); return rows; } /** * Print statistics to console */ @Override public String getVerboseStatistics() { return "{\n" + getVerboseStatisticsCommon() + "\n" + "StringColumnStatistics [" + "maxLength=" + maxLength + ", minLength=" + minLength + ", longestString=" + longestString + ", shortestString=" + shortestString + ", emptyCount=" + emptyCount + ", percEmptyValues=" + df.format(percEmptyValues) + ", minStringCaseSensitive=" + minStringCase + ", maxStringCaseSensitive=" + maxStringCase + ", minStringCaseInsensitive=" + minStringICase + ", maxStringCaseInsensitive=" + maxStringICase + "]\n}"; } /** * Get length of longest string * * @return max length */ public int getMaxLength() { return maxLength; } /** * Get length of shortest string * * @return min length */ public int getMinLength() { return minLength; } /** * Get value of longest string * * @return longest string */ public String getLongestString() { return longestString; } /** * Get value of shortest string (empty string is still considered a string) * * @return shortest string */ public String getShortestString() { return shortestString; } /** * Get count of empty strings * * @return empty string count */ public long getEmptyCount() { return emptyCount; } /** * Get percentage of empty strings * * @return perc empty strings */ public double getPercEmptyValues() { return percEmptyValues; } /** * Get min string (lexical) (case-sensitive) * * @return min string */ public String getMinStringCase() { return minStringCase; } /** * Get max string (lexical) (case-sensitive) * * @return max string */ public String getMaxStringCase() { return maxStringCase; } /** * Get min string (lexical) (case-insensitive) * * @return min string */ public String getMinStringICase() { return minStringICase; } /** * Get max string (lexical) (case-insensitive) * * @return max string */ public String getMaxStringICase() { return maxStringICase; } }