package com.thinkbiganalytics.spark.dataprofiler.core;

/*-
 * #%L
 * thinkbig-spark-job-profiler-app
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.thinkbiganalytics.spark.DataSet;
import com.thinkbiganalytics.spark.SparkContextService;
import com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration;
import com.thinkbiganalytics.spark.dataprofiler.StatisticsModel;
import com.thinkbiganalytics.spark.dataprofiler.columns.StandardColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.config.ProfilerConfig;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.junit.After;
import org.junit.Before;
import org.junit.runner.RunWith;
import org.springframework.context.annotation.ComponentScan;
import org.springframework.test.context.ActiveProfiles;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;

import java.math.BigDecimal;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

import javax.inject.Inject;

/**
 * Base class for profiler tests. Builds a 15-column test data set once,
 * profiles it, and shares the resulting column statistics with subclasses.
 */
@RunWith(SpringJUnit4ClassRunner.class)
@ComponentScan(basePackages = {"com.thinkbiganalytics"})
@ContextConfiguration(classes = {ProfilerConfig.class, SpringTestConfigV1.class, SpringTestConfigV2.class})
@ActiveProfiles("spark-v1")
public abstract class ProfilerTest {

    public static final String EMPTY_STRING = "";
    public static final double epsilon = 0.0001d;
    public static final double epsilon2 = 3000.0d;  // only used for the variance of long columns, since those values are extremely large

    // columnStatsMap is static so that it can be shared between multiple subclasses
    protected static Map<Integer, StandardColumnStatistics> columnStatsMap;

    private JavaSparkContext sc;

    @Inject
    private com.thinkbiganalytics.spark.dataprofiler.Profiler profiler;

    @Inject
    private SparkContextService scs;

    @Inject
    private SQLContext sqlContext;

    @Before
    @SuppressWarnings("unchecked")
    public void setUp() {
        if (columnStatsMap == null) {
            StructField[] schemaFields = new StructField[15];
            schemaFields[0] = DataTypes.createStructField("id", DataTypes.IntegerType, true);
            schemaFields[1] = DataTypes.createStructField("firstname", DataTypes.StringType, true);
            schemaFields[2] = DataTypes.createStructField("lastname", DataTypes.StringType, true);
            schemaFields[3] = DataTypes.createStructField("age", DataTypes.IntegerType, true);
            schemaFields[4] = DataTypes.createStructField("description", DataTypes.StringType, true);
            schemaFields[5] = DataTypes.createStructField("height", DataTypes.DoubleType, true);
            schemaFields[6] = DataTypes.createStructField("joindate", DataTypes.DateType, true);
            schemaFields[7] = DataTypes.createStructField("lifemember", DataTypes.BooleanType, true);
            schemaFields[8] = DataTypes.createStructField("lastlogin", DataTypes.TimestampType, true);
            schemaFields[9] = DataTypes.createStructField("phash", DataTypes.LongType, true);
            schemaFields[10] = DataTypes.createStructField("weight", DataTypes.FloatType, true);
            schemaFields[11] = DataTypes.createStructField("credits", DataTypes.ShortType, true);
            schemaFields[12] = DataTypes.createStructField("ccode", DataTypes.ByteType, true);
            schemaFields[13] = DataTypes.createStructField("score", DataTypes.createDecimalType(7, 5), true);  // decimal with precision 7, scale 5
            schemaFields[14] = DataTypes.createStructField("favoritepet", DataTypes.StringType, true);

            StructType schema = DataTypes.createStructType(schemaFields);

            // Rows deliberately mix nulls, empty strings, and duplicate values to exercise the profiled statistics
            List<Row> rows = new ArrayList<>();
            rows.add(RowFactory.create(1, "Jon", "Wright", 14, "Jon::Wright", 5.85d, Date.valueOf("2010-05-04"), Boolean.TRUE,
                                       Timestamp.valueOf("2008-05-06 23:10:10"), 1456890911L, 40.2f, (short) 100, (byte) 99,
                                       new BigDecimal(String.valueOf(1.567)), "Cat"));
            rows.add(RowFactory.create(2, "Jon", "Hudson", null, "Jon::Hudson", 5.85d, Date.valueOf("1990-10-25"), null,
                                       Timestamp.valueOf("2011-01-08 11:25:45"), 7638962135L, 110.5f, (short) 100, (byte) 99,
                                       new BigDecimal(String.valueOf(8.223)), "alligator"));
            rows.add(RowFactory.create(3, "Rachael", "Hu", 40, "Rachael::Hu", 6.22d, Date.valueOf("1990-10-25"), Boolean.TRUE,
                                       Timestamp.valueOf("2011-01-08 11:25:45"), 2988626110L, 160.7f, (short) 1400, (byte) 99,
                                       new BigDecimal(String.valueOf(1.567)), "Alpaca"));
            rows.add(RowFactory.create(4, EMPTY_STRING, EMPTY_STRING, 40, null, null, Date.valueOf("1956-11-12"), Boolean.TRUE,
                                       Timestamp.valueOf("2008-05-06 23:10:10"), 2988626110L, null, null, (byte) 99,
                                       null, "Cat"));
            rows.add(RowFactory.create(5, "Rachael", EMPTY_STRING, 22, "Rachael::", 5.85d, Date.valueOf("2005-12-24"), Boolean.FALSE,
                                       Timestamp.valueOf("2008-05-06 23:10:10"), 8260467621L, 160.7f, (short) 100, null,
                                       new BigDecimal(String.valueOf(4.343)), "Zebra"));
            rows.add(RowFactory.create(6, "Elizabeth", "Taylor", 40, "Elizabeth::Taylor", 5.85d, Date.valueOf("2011-08-08"), null,
                                       Timestamp.valueOf("2016-01-14 14:20:20"), 8732866249L, null, (short) 1400, null,
                                       new BigDecimal(String.valueOf(4.343)), "ZEBRA"));
            rows.add(RowFactory.create(7, "Jon", "Taylor", 18, "Jon::Taylor", null, Date.valueOf("2011-08-08"), Boolean.TRUE,
                                       Timestamp.valueOf("2011-01-08 11:25:45"), 2988626110L, 110.5f, (short) 500, (byte) 40,
                                       new BigDecimal(String.valueOf(4.343)), null));
            rows.add(RowFactory.create(8, "Rachael", EMPTY_STRING, 22, "Rachael::", 4.37d, Date.valueOf("2011-08-08"), Boolean.FALSE,
                                       Timestamp.valueOf("2008-05-06 23:10:10"), 8782348100L, null, null, null,
                                       null, "albatross"));
            rows.add(RowFactory.create(9, EMPTY_STRING, "Edmundson Jr", 11, "::Edmundson Jr", 4.88d, Date.valueOf("2007-06-07"), Boolean.FALSE,
                                       Timestamp.valueOf("2007-03-16 08:24:37"), null, 155.3f, (short) 0, (byte) 99,
                                       new BigDecimal(String.valueOf(1.567)), EMPTY_STRING));
            rows.add(RowFactory.create(10, "Jon", EMPTY_STRING, 65, "Jon::", null, Date.valueOf("1975-04-04"), Boolean.TRUE,
                                       Timestamp.valueOf("2007-03-16 08:24:31"), null, 180.6f, (short) 5000, (byte) 2,
                                       new BigDecimal(String.valueOf(4.343)), "Cat"));

            final JavaSparkContext javaSparkContext = JavaSparkContext.fromSparkContext(sqlContext.sparkContext());
            JavaRDD<Row> dataRDD = javaSparkContext.parallelize(rows);
            DataSet dataDF = scs.toDataSet(sqlContext.createDataFrame(dataRDD, schema));

            /* Enable to debug contents of test data */
            /*
            for (Row r : dataRDD.collect()) {
                System.out.println(r.toString());
            }
            */
            // Profile once and cache the per-column statistics for all subclasses
            StatisticsModel statsModel = profiler.profile(dataDF, new ProfilerConfiguration());
            columnStatsMap = (statsModel != null)
                             ? (Map) statsModel.getColumnStatisticsMap()
                             : Collections.<Integer, StandardColumnStatistics>emptyMap();
        }
    }

    @After
    public void tearDown() {
        if (sc != null) {
            sc.close();
            sc = null;
        }
    }
}
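
/*
 * A minimal sketch (not part of the original suite) of how a concrete subclass
 * would consume the shared columnStatsMap. The class name, the tested column
 * index, and the expected column count are illustrative assumptions; real
 * subclasses would assert on the specific values exposed by
 * StandardColumnStatistics.
 *
 * class ExampleColumnProfileTest extends ProfilerTest {
 *
 *     @org.junit.Test
 *     public void profilesAllFifteenColumns() {
 *         // setUp() built a 15-field schema, so we expect one statistics
 *         // entry per column.
 *         org.junit.Assert.assertEquals(15, columnStatsMap.size());
 *     }
 *
 *     @org.junit.Test
 *     public void idColumnStatisticsArePresent() {
 *         // Keys follow schema order: index 0 is the "id" column.
 *         org.junit.Assert.assertNotNull(columnStatsMap.get(0));
 *     }
 * }
 */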