package com.thinkbiganalytics.spark.dataprofiler.model;

/*-
 * #%L
 * thinkbig-spark-job-profiler-app
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.thinkbiganalytics.spark.dataprofiler.ColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration;
import com.thinkbiganalytics.spark.dataprofiler.StatisticsModel;
import com.thinkbiganalytics.spark.dataprofiler.columns.BigDecimalColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.columns.BooleanColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.columns.ByteColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.columns.DateColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.columns.DoubleColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.columns.FloatColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.columns.IntegerColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.columns.LongColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.columns.ShortColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.columns.StandardColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.columns.StringColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.columns.UnsupportedColumnStatistics;

import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.StructField;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.util.HashMap;
import java.util.Map;

import javax.annotation.Nonnull;

/**
 * Class to store the profile statistics.
 */
@SuppressWarnings("serial")
public class StandardStatisticsModel implements Serializable, StatisticsModel {

    private static final Logger log = LoggerFactory.getLogger(StandardStatisticsModel.class);

    private final Map<Integer, StandardColumnStatistics> columnStatisticsMap = new HashMap<>();

    @Nonnull
    private final ProfilerConfiguration profilerConfiguration;

    public StandardStatisticsModel(@Nonnull final ProfilerConfiguration profilerConfiguration) {
        this.profilerConfiguration = profilerConfiguration;
    }
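    /*
     * Illustrative usage (a sketch, not part of the original source): the profiler is
     * expected to call add(...) once per distinct (columnIndex, columnValue) pair, along
     * with the number of times that value occurs. The names "model" and "schema" below
     * are hypothetical.
     *
     *     StandardStatisticsModel model = new StandardStatisticsModel(profilerConfiguration);
     *     // value "foo" occurred 3 times in column 0
     *     model.add(0, "foo", 3L, schema.fields()[0]);
     */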
    /**
     * Include a column value in calculation of profile statistics for the column.
     *
     * @param columnIndex numeric index of column (0-based)
     * @param columnValue value in column
     * @param columnCount number of times value is found in column
     * @param columnField schema information of the column
     */
    public void add(Integer columnIndex, Object columnValue, Long columnCount, StructField columnField) {

        StandardColumnStatistics newColumnStatistics;
        DataType columnDataType = columnField.dataType();

        switch (columnDataType.simpleString()) {

            /* === Group 1 === */

            /*
             * Hive datatype:     TINYINT
             * SparkSQL datatype: tinyint
             * Java datatype:     Byte
             */
            case "tinyint":
                newColumnStatistics = new ByteColumnStatistics(columnField, profilerConfiguration);
                break;

            /*
             * Hive datatype:     SMALLINT
             * SparkSQL datatype: smallint
             * Java datatype:     Short
             */
            case "smallint":
                newColumnStatistics = new ShortColumnStatistics(columnField, profilerConfiguration);
                break;

            /*
             * Hive datatype:     INT
             * SparkSQL datatype: int
             * Java datatype:     Integer
             */
            case "int":
                newColumnStatistics = new IntegerColumnStatistics(columnField, profilerConfiguration);
                break;

            /*
             * Hive datatype:     BIGINT
             * SparkSQL datatype: bigint
             * Java datatype:     Long
             */
            case "bigint":
                newColumnStatistics = new LongColumnStatistics(columnField, profilerConfiguration);
                break;

            /* === Group 2 === */

            /*
             * Hive datatype:     FLOAT
             * SparkSQL datatype: float
             * Java datatype:     Float
             */
            case "float":
                newColumnStatistics = new FloatColumnStatistics(columnField, profilerConfiguration);
                break;

            /*
             * Hive datatype:     DOUBLE
             * SparkSQL datatype: double
             * Java datatype:     Double
             */
            case "double":
                newColumnStatistics = new DoubleColumnStatistics(columnField, profilerConfiguration);
                break;

            /* === Group 3 === */

            /*
             * Hive datatypes:    STRING, VARCHAR
             * SparkSQL datatype: string
             * Java datatype:     String
             */
            case "string":
                newColumnStatistics = new StringColumnStatistics(columnField, profilerConfiguration);
                break;

            /* === Group 4 === */

            /*
             * Hive datatype:     BOOLEAN
             * SparkSQL datatype: boolean
             * Java datatype:     Boolean
             */
            case "boolean":
                newColumnStatistics = new BooleanColumnStatistics(columnField, profilerConfiguration);
                break;

            /* === Group 5 === */

            /*
             * Hive datatype:     DATE
             * SparkSQL datatype: date
             * Java datatype:     java.sql.Date
             */
            case "date":
                newColumnStatistics = new DateColumnStatistics(columnField, profilerConfiguration);
                break;

            /*
             * Hive datatype:     TIMESTAMP
             * SparkSQL datatype: timestamp
             * Java datatype:     java.sql.Timestamp
             */
            case "timestamp":
                newColumnStatistics = new TimestampColumnStatistics(columnField, profilerConfiguration);
                break;

            /* === Group 6 === */
            default:

                /*
                 * Hive datatype:     DECIMAL
                 * SparkSQL datatype: decimal
                 * Java datatype:     java.math.BigDecimal
                 *
                 * Handle the decimal type here since it comes with scale and precision, e.g. decimal(7,5)
                 */
                String decimalTypeRegex = "decimal\\S+";
                if (columnDataType.simpleString().matches(decimalTypeRegex)) {
                    newColumnStatistics = new BigDecimalColumnStatistics(columnField, profilerConfiguration);
                }
                /*
                 * Hive datatypes: CHAR, BINARY, ARRAY, MAP, STRUCT, UNIONTYPE
                 */
                else {
                    if (log.isWarnEnabled()) {
                        log.warn("[PROFILER-INFO] Unsupported data type: {}", columnDataType.simpleString());
                    }
                    newColumnStatistics = new UnsupportedColumnStatistics(columnField, profilerConfiguration);
                }
        }

        // Register the statistics object on first sight of the column, then fold in the value.
        StandardColumnStatistics currentColumnStatistics = columnStatisticsMap.get(columnIndex);
        if (currentColumnStatistics == null) {
            currentColumnStatistics = newColumnStatistics;
            columnStatisticsMap.put(columnIndex, currentColumnStatistics);
        }
        currentColumnStatistics.accomodate(columnValue, columnCount);
    }
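    /*
     * Illustrative usage (a sketch, not part of the original source): combine(...) is the
     * natural merge step when partial models are built per Spark partition and then
     * reduced into a single model. The name "partialModels" is hypothetical.
     *
     *     StandardStatisticsModel merged = partialModels.reduce((a, b) -> {
     *         a.combine(b);
     *         return a;
     *     });
     */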
    /**
     * Combine another statistics model into this one.
     *
     * @param statisticsModel model to combine with
     */
    public void combine(StandardStatisticsModel statisticsModel) {

        for (Map.Entry<Integer, StandardColumnStatistics> entry : statisticsModel.columnStatisticsMap.entrySet()) {
            StandardColumnStatistics columnStatistics = columnStatisticsMap.get(entry.getKey());

            if (columnStatistics != null) {
                // Both models have seen this column: merge the accumulated statistics.
                columnStatistics.combine(entry.getValue());
            } else {
                // Only the other model has seen this column: adopt its statistics as-is.
                columnStatisticsMap.put(entry.getKey(), entry.getValue());
            }
        }
    }

    /**
     * Build a printable representation of the profile statistics.
     *
     * @return profile model string
     */
    private String printModel() {
        StringBuilder sb = new StringBuilder();
        sb.append("====== Statistics Model ======").append("\n");

        for (Map.Entry<Integer, StandardColumnStatistics> entry : columnStatisticsMap.entrySet()) {
            sb.append("=== Column #").append(entry.getKey()).append("\n");
            sb.append(entry.getValue().getVerboseStatistics()).append("\n");
        }

        sb.append("==============================");
        return sb.toString();
    }

    /**
     * Get the profile statistics as a printable string.
     */
    @Override
    public String toString() {
        return printModel();
    }

    /**
     * Get the column statistics map (column index mapped to column statistics).
     *
     * @return column statistics map
     */
    @Override
    @SuppressWarnings({"unchecked", "squid:S1905"})
    public Map<Integer, ColumnStatistics> getColumnStatisticsMap() {
        // Raw-type cast is safe for callers: every StandardColumnStatistics is a ColumnStatistics.
        return (Map) columnStatisticsMap;
    }
}