package com.thinkbiganalytics.spark.dataprofiler.columns; /*- * #%L * thinkbig-spark-job-profiler-app * %% * Copyright (C) 2017 ThinkBig Analytics * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration; import com.thinkbiganalytics.spark.dataprofiler.model.MetricType; import com.thinkbiganalytics.spark.dataprofiler.output.OutputRow; import org.apache.commons.lang3.StringUtils; import org.apache.spark.sql.types.StructField; import java.sql.Timestamp; import java.util.ArrayList; import java.util.List; import javax.annotation.Nonnull; import javax.annotation.Nullable; /** * Class to hold profile statistics for columns of timestamp data type <br> * [Hive data type: TIMESTAMP] */ @SuppressWarnings("serial") public class TimestampColumnStatistics extends StandardColumnStatistics { /** * Records the maximum value of the column */ @Nullable private Timestamp maxTimestamp; /** * Records the minimum value of the column */ @Nullable private Timestamp minTimestamp; /** * Constructs a {@code TimestampColumnStatistics} for profiling the the specified field. * * @param columnField the field to be profiled * @param profilerConfiguration the profiler configuration */ public TimestampColumnStatistics(@Nonnull final StructField columnField, @Nonnull final ProfilerConfiguration profilerConfiguration) { super(columnField, profilerConfiguration); } /** * Adds the specified value to the statistics for this column. * * @param columnValue the column value to be profiled * @param columnCount the number of rows containing the value */ @Override public void accomodate(@Nullable final Object columnValue, @Nonnull Long columnCount) { // Update common statistics accomodateCommon(columnValue, columnCount); // Update timestamp-specific statistics String stringValue = (columnValue != null) ? columnValue.toString() : null; if (!StringUtils.isEmpty(stringValue)) { Timestamp timestamp = Timestamp.valueOf(stringValue); if (maxTimestamp == null || maxTimestamp.before(timestamp)) { maxTimestamp = timestamp; } if (minTimestamp == null || minTimestamp.after(timestamp)) { minTimestamp = timestamp; } } } /** * Merges the specified statistics into this object. * * @param v_columnStatistics the statistics to be merged */ @Override public void combine(@Nonnull final StandardColumnStatistics v_columnStatistics) { // Combine common statistics combineCommon(v_columnStatistics); // Combine timestamp-specific statistics TimestampColumnStatistics vTimestamp_columnStatistics = (TimestampColumnStatistics) v_columnStatistics; if (maxTimestamp == null || (vTimestamp_columnStatistics.maxTimestamp != null && maxTimestamp.before(vTimestamp_columnStatistics.maxTimestamp))) { maxTimestamp = vTimestamp_columnStatistics.maxTimestamp; } if (minTimestamp == null || (vTimestamp_columnStatistics.minTimestamp != null && minTimestamp.after(vTimestamp_columnStatistics.minTimestamp))) { minTimestamp = vTimestamp_columnStatistics.minTimestamp; } } /** * Returns the statistics as a string. * * @return the statistics */ @Nonnull @Override public String getVerboseStatistics() { return "{\n" + getVerboseStatisticsCommon() + "\n" + "TimestampColumnStatistics [maxTimestamp=" + (maxTimestamp != null ? maxTimestamp : "") + ", minTimestamp=" + (minTimestamp != null ? minTimestamp : "") + "]\n}"; } /** * Writes the statistics to an output table. */ @Override public List<OutputRow> getStatistics() { final List<OutputRow> rows = new ArrayList<>(); // Write common statistics writeStatisticsCommon(rows); // Write timestamp-specific statistics rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MAX_TIMESTAMP), (maxTimestamp != null) ? maxTimestamp.toString() : "")); rows.add(new OutputRow(columnField.name(), String.valueOf(MetricType.MIN_TIMESTAMP), (minTimestamp != null) ? minTimestamp.toString() : "")); return rows; } /** * Get latest timestamp * * @return latest timestamp */ @Nullable public Timestamp getMaxTimestamp() { return maxTimestamp; } /** * Get earliest timestamp * * @return earliest timestamp */ @Nullable public Timestamp getMinTimestamp() { return minTimestamp; } }