package com.thinkbiganalytics.spark.dataprofiler.testcases; /*- * #%L * thinkbig-spark-job-profiler-app * %% * Copyright (C) 2017 ThinkBig Analytics * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration; import com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.core.ProfilerTest; import com.thinkbiganalytics.spark.dataprofiler.output.OutputRow; import org.apache.spark.sql.types.DataTypes; import org.joda.time.DateTime; import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; import java.sql.Timestamp; import java.util.List; /** * Timestamp Column Statistics Test Case */ public class TimestampColumnCase2Test extends ProfilerTest { @BeforeClass public static void setUpClass() { System.out.println("\t*** Starting run for TimestampColumnCase2Test ***"); } @AfterClass public static void tearDownClass() { System.out.println("\t*** Completed run for TimestampColumnCase2Test ***"); } /** * Verify accommodating column values. */ @Test public void accomodate() { final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration(); // Test with a null value TimestampColumnStatistics stats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration); stats.accomodate(null, 1L); Assert.assertNull(stats.getMaxTimestamp()); Assert.assertNull(stats.getMinTimestamp()); // Test with uninitialized max & min stats.accomodate("2016-06-27 14:04:30", 1L); Timestamp ts1 = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 30).getMillis()); Assert.assertEquals(ts1, stats.getMaxTimestamp()); Assert.assertEquals(ts1, stats.getMinTimestamp()); // Test with a later timestamp stats.accomodate("2016-06-27 14:04:31", 1L); Timestamp ts2 = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 31).getMillis()); Assert.assertEquals(ts2, stats.getMaxTimestamp()); Assert.assertEquals(ts1, stats.getMinTimestamp()); // Test with an earlier timestamp stats.accomodate("2016-06-27 14:04:29", 1L); Timestamp ts3 = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 29).getMillis()); Assert.assertEquals(ts2, stats.getMaxTimestamp()); Assert.assertEquals(ts3, stats.getMinTimestamp()); } /** * Verify combining statistics. */ @Test public void combine() { final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration(); // Test when 'this' is empty TimestampColumnStatistics other = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration); TimestampColumnStatistics stats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration); other.accomodate("2016-06-27 14:04:30", 1L); stats.combine(other); Timestamp ts1 = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 30).getMillis()); Assert.assertEquals(ts1, stats.getMaxTimestamp()); Assert.assertEquals(ts1, stats.getMinTimestamp()); // Test when other is empty other = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration); stats.combine(other); Assert.assertEquals(ts1, stats.getMaxTimestamp()); Assert.assertEquals(ts1, stats.getMinTimestamp()); // Test when other has later timestamp other.accomodate("2016-06-27 14:04:31", 1L); stats.combine(other); Timestamp ts2 = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 31).getMillis()); Assert.assertEquals(ts2, stats.getMaxTimestamp()); Assert.assertEquals(ts1, stats.getMinTimestamp()); // Test when other has earlier timestamp other.accomodate("2016-06-27 14:04:29", 1L); stats.combine(other); Timestamp ts3 = new Timestamp(new DateTime(2016, 6, 27, 14, 4, 29).getMillis()); Assert.assertEquals(ts2, stats.getMaxTimestamp()); Assert.assertEquals(ts3, stats.getMinTimestamp()); } /** * Verify statistics string. */ @Test public void getVerboseStatistics() { final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration(); // Test when empty TimestampColumnStatistics stats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration); String expected = "{\nColumnInfo [name=ts, datatype=timestamp, nullable=true, metadata={}]\n" + "CommonStatistics [nullCount=0, totalCount=0, uniqueCount=0, percNullValues=0, percUniqueValues=0, percDuplicateValues=0]\n" + "Top 3 values [\n]\n" + "TimestampColumnStatistics [maxTimestamp=, minTimestamp=]\n}"; Assert.assertEquals(expected, stats.getVerboseStatistics()); // Test with multiple values stats.accomodate("", 1L); stats.accomodate("2016-06-27 14:04:29", 1L); stats.accomodate("2016-06-27 14:04:30", 1L); stats.accomodate("2016-06-27 14:04:31", 1L); stats.accomodate(null, 1L); expected = "{\nColumnInfo [name=ts, datatype=timestamp, nullable=true, metadata={}]\n" + "CommonStatistics [nullCount=1, totalCount=5, uniqueCount=5, percNullValues=20, percUniqueValues=100, percDuplicateValues=0]\n" + "Top 3 values [\n1^A^A1^B2^A2016-06-27 14:04:29^A1^B3^A2016-06-27 14:04:30^A1^B]\n" + "TimestampColumnStatistics [maxTimestamp=2016-06-27 14:04:31.0, minTimestamp=2016-06-27 14:04:29.0]\n}"; Assert.assertEquals(expected, stats.getVerboseStatistics()); } /** * Verify writing statistics. */ @Test public void writeStatistics() { final ProfilerConfiguration profilerConfiguration = new ProfilerConfiguration(); // Test when empty TimestampColumnStatistics stats = new TimestampColumnStatistics(DataTypes.createStructField("ts", DataTypes.TimestampType, true), profilerConfiguration); List<OutputRow> rows = stats.getStatistics(); Assert.assertEquals(12, rows.size()); Assert.assertEquals("OutputRow [columnName=ts, metricType=COLUMN_DATATYPE, metricValue=TimestampType]", rows.get(0).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=COLUMN_NULLABLE, metricValue=true]", rows.get(1).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=COLUMN_METADATA, metricValue={}]", rows.get(2).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=NULL_COUNT, metricValue=0]", rows.get(3).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=TOTAL_COUNT, metricValue=0]", rows.get(4).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=UNIQUE_COUNT, metricValue=0]", rows.get(5).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=PERC_NULL_VALUES, metricValue=0]", rows.get(6).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=PERC_UNIQUE_VALUES, metricValue=0]", rows.get(7).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=PERC_DUPLICATE_VALUES, metricValue=0]", rows.get(8).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=TOP_N_VALUES, metricValue=]", rows.get(9).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=MAX_TIMESTAMP, metricValue=]", rows.get(10).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=MIN_TIMESTAMP, metricValue=]", rows.get(11).toString()); // Test with multiple values stats.accomodate("", 1L); stats.accomodate("2016-06-27 14:04:29", 1L); stats.accomodate("2016-06-27 14:04:30", 1L); stats.accomodate("2016-06-27 14:04:31", 1L); stats.accomodate(null, 1L); rows = stats.getStatistics(); Assert.assertEquals(12, rows.size()); Assert.assertEquals("OutputRow [columnName=ts, metricType=COLUMN_DATATYPE, metricValue=TimestampType]", rows.get(0).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=COLUMN_NULLABLE, metricValue=true]", rows.get(1).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=COLUMN_METADATA, metricValue={}]", rows.get(2).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=NULL_COUNT, metricValue=1]", rows.get(3).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=TOTAL_COUNT, metricValue=5]", rows.get(4).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=UNIQUE_COUNT, metricValue=5]", rows.get(5).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=PERC_NULL_VALUES, metricValue=20]", rows.get(6).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=PERC_UNIQUE_VALUES, metricValue=100]", rows.get(7).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=PERC_DUPLICATE_VALUES, metricValue=0]", rows.get(8).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=TOP_N_VALUES, metricValue=1^A^A1^B2^A2016-06-27 14:04:29^A1^B3^A2016-06-27 14:04:30^A1^B]", rows.get(9).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=MAX_TIMESTAMP, metricValue=2016-06-27 14:04:31.0]", rows.get(10).toString()); Assert.assertEquals("OutputRow [columnName=ts, metricType=MIN_TIMESTAMP, metricValue=2016-06-27 14:04:29.0]", rows.get(11).toString()); } }