package com.thinkbiganalytics.spark.dataprofiler.core; /*- * #%L * kylo-spark-job-profiler-app * %% * Copyright (C) 2017 ThinkBig Analytics * %% * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * #L% */ import com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration; import com.thinkbiganalytics.spark.dataprofiler.StatisticsModel; import com.thinkbiganalytics.spark.dataprofiler.columns.BigDecimalColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.columns.BooleanColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.columns.ByteColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.columns.DateColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.columns.DoubleColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.columns.FloatColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.columns.IntegerColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.columns.LongColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.columns.ShortColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.columns.StandardColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.columns.StringColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.columns.TimestampColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.columns.UnsupportedColumnStatistics; import com.thinkbiganalytics.spark.dataprofiler.output.OutputWriter; import 
com.thinkbiganalytics.spark.dataprofiler.output.OutputRow;
import com.thinkbiganalytics.spark.dataprofiler.topn.TopNDataItem;
import com.thinkbiganalytics.spark.dataprofiler.topn.TopNDataList;

import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.hive.HiveContext;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;

import java.util.ArrayList;
import java.util.List;

import javax.annotation.Nonnull;

/**
 * Spring configuration declaring the profiler's configuration bean and the SQL context bean.
 */
@Configuration
public class ProfilerApp {

    /**
     * Creates the profiler configuration bean.
     *
     * @return a new {@link ProfilerConfiguration} built with its no-argument constructor
     */
    @Bean
    public ProfilerConfiguration profilerConfiguration() {
        return new ProfilerConfiguration();
    }

    /**
     * Creates the SQL context bean.
     *
     * <p>A new {@link SparkContext} is constructed from a fresh {@link SparkConf} that has Kryo serialization enabled, and is wrapped in a
     * {@link HiveContext}. The {@code spark.sql.dialect} setting of that context is taken from the profiler configuration.</p>
     *
     * @param profilerConfiguration the profiler configuration supplying the SQL dialect
     * @return the Hive context, exposed as a {@link SQLContext}
     */
    @Bean
    public SQLContext sqlContext(final ProfilerConfiguration profilerConfiguration) {
        final SparkConf conf = configureEfficientSerialization(new SparkConf());

        final HiveContext hiveContext = new HiveContext(new SparkContext(conf));
        hiveContext.setConf("spark.sql.dialect", profilerConfiguration.getSqlDialect());
        return hiveContext;
    }

    /**
     * Configure efficient serialization via Kryo.
     *
     * <p>Sets {@code spark.serializer} to the Kryo serializer and registers the profiler's statistics, top-N and output classes with it.</p>
     *
     * @param conf the Spark configuration to modify in place
     * @return the same {@code conf} instance that was passed in
     */
    @Nonnull
    private SparkConf configureEfficientSerialization(@Nonnull final SparkConf conf) {
        conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

        final List<Class<?>> kryoClasses = new ArrayList<>();

        // Column statistics implementations
        kryoClasses.add(StandardColumnStatistics.class);
        kryoClasses.add(BigDecimalColumnStatistics.class);
        kryoClasses.add(BooleanColumnStatistics.class);
        kryoClasses.add(ByteColumnStatistics.class);
        kryoClasses.add(DateColumnStatistics.class);
        kryoClasses.add(DoubleColumnStatistics.class);
        kryoClasses.add(FloatColumnStatistics.class);
        kryoClasses.add(IntegerColumnStatistics.class);
        kryoClasses.add(LongColumnStatistics.class);
        kryoClasses.add(ShortColumnStatistics.class);
        kryoClasses.add(StringColumnStatistics.class);
        kryoClasses.add(TimestampColumnStatistics.class);
        kryoClasses.add(UnsupportedColumnStatistics.class);

        // Model, top-N and output classes
        kryoClasses.add(StatisticsModel.class);
        kryoClasses.add(TopNDataItem.class);
        kryoClasses.add(TopNDataList.class);
        kryoClasses.add(OutputRow.class);
        kryoClasses.add(OutputWriter.class);

        // Let the collection produce the typed array instead of copying element by element.
        conf.registerKryoClasses(kryoClasses.toArray(new Class<?>[kryoClasses.size()]));
        return conf;
    }
}