package com.thinkbiganalytics.spark.dataprofiler.core;

/*-
 * #%L
 * thinkbig-spark-job-profiler-app
 * %%
 * Copyright (C) 2017 ThinkBig Analytics
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */

import com.thinkbiganalytics.hive.util.HiveUtils;
import com.thinkbiganalytics.policy.FieldPolicy;
import com.thinkbiganalytics.spark.DataSet;
import com.thinkbiganalytics.spark.SparkContextService;
import com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration;
import com.thinkbiganalytics.spark.dataprofiler.StatisticsModel;
import com.thinkbiganalytics.spark.dataprofiler.output.OutputWriter;
import com.thinkbiganalytics.spark.policy.FieldPolicyLoader;

import org.apache.commons.lang.StringUtils;
import org.apache.spark.sql.SQLContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.AnnotationConfigApplicationContext;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import javax.annotation.Nonnull;
import javax.annotation.Nullable;

/**
 * Generate data profile statistics for a table/query, and write result to a table
 */
public class Profiler {

    private static final Logger log = LoggerFactory.getLogger(Profiler.class);

    /** Partition key value meaning "do not filter the input table by partition". */
    private static final String ALL_PARTITIONS = "ALL";

    private final FieldPolicyLoader loader;
    private final com.thinkbiganalytics.spark.dataprofiler.Profiler profiler;
    private final ProfilerConfiguration profilerConfiguration;
    private final SparkContextService sparkContextService;
    private final SQLContext sqlContext;

    /**
     * Main entry point into program.
     *
     * <p>Builds a Spring context by scanning {@code com.thinkbiganalytics.spark}, looks up the
     * collaborators by type, and delegates to {@link #run(String[])}.</p>
     *
     * @param args list of command line arguments (see {@link #showCommandLineArgs()})
     */
    public static void main(String[] args) {
        final ApplicationContext ctx = new AnnotationConfigApplicationContext("com.thinkbiganalytics.spark");
        final Profiler profiler = new Profiler(ctx.getBean(FieldPolicyLoader.class),
                                               ctx.getBean(com.thinkbiganalytics.spark.dataprofiler.Profiler.class),
                                               ctx.getBean(ProfilerConfiguration.class),
                                               ctx.getBean(SparkContextService.class),
                                               ctx.getBean(SQLContext.class));
        profiler.run(args);
    }

    /**
     * Creates a profiler application with its collaborators.
     *
     * @param loader                loads the field policy map from the policy file path argument
     * @param profiler              computes the statistics model for a data set
     * @param profilerConfiguration configuration that is updated from the command line arguments
     * @param sparkContextService   used to execute the profiling query
     * @param sqlContext            SQL context passed to the query execution and the output writer
     */
    public Profiler(FieldPolicyLoader loader, com.thinkbiganalytics.spark.dataprofiler.Profiler profiler, ProfilerConfiguration profilerConfiguration,
                    SparkContextService sparkContextService, SQLContext sqlContext) {
        this.loader = loader;
        this.profiler = profiler;
        this.profilerConfiguration = profilerConfiguration;
        this.sparkContextService = sparkContextService;
        this.sqlContext = sqlContext;
    }

    /**
     * Validates the arguments, runs the resulting query, profiles the result and writes the
     * statistics through {@link OutputWriter}.
     *
     * <p>Returns without doing any work when the arguments are invalid or when there is no
     * query to run.</p>
     *
     * @param args list of command line arguments
     */
    public void run(String[] args) {
        /* Check command line arguments and get query to run. */
        final String queryString = checkCommandLineArgs(args);
        if (queryString == null) {
            return;
        }

        /* Run query and get result */
        log.info("[PROFILER-INFO] Analyzing profile statistics for: [{}]", queryString);
        final DataSet resultDF = sparkContextService.sql(sqlContext, queryString);

        /* Get profile statistics and write to table */
        final StatisticsModel statisticsModel = profiler.profile(resultDF, profilerConfiguration);
        if (statisticsModel != null) {
            OutputWriter.writeModel(statisticsModel, profilerConfiguration, sqlContext, sparkContextService);
        } else {
            log.info("[PROFILER-INFO] No data to process. Hence, no profile statistics generated.");
        }

        /* Wrap up */
        log.info("[PROFILER-INFO] Profiling finished.");
    }

    /**
     * Check command line arguments and update the profiler configuration from them.
     *
     * @param args list of command line arguments
     * @return query to run (null if the arguments are invalid or no field is marked for profiling)
     */
    @Nullable
    private String checkCommandLineArgs(final String[] args) {
        if (log.isInfoEnabled()) {
            log.info("Running Spark Profiler with the following command line {} args (comma separated): {}", args.length, StringUtils.join(args, ","));
        }

        if (args.length < 5) {
            log.error("Invalid number of command line arguments ({})", args.length);
            showCommandLineArgs();
            return null;
        }

        final String profileObjectType = args[0];
        final String profileObjectDesc = args[1];
        final String profileOutputTable = args[3];
        final String fieldPolicyJsonPath = args[4];
        final String inputAndOutputTablePartitionKey = (args.length >= 6) ? args[5] : ALL_PARTITIONS;

        // A non-numeric value is reported like every other invalid argument instead of
        // terminating the job with an uncaught NumberFormatException.
        final int n;
        try {
            n = Integer.parseInt(args[2]);
        } catch (final NumberFormatException e) {
            log.error("Illegal command line argument for n for top_n values ({})", args[2]);
            showCommandLineArgs();
            return null;
        }

        final Map<String, FieldPolicy> policyMap = loader.loadFieldPolicy(fieldPolicyJsonPath);

        final String query;
        switch (profileObjectType) {
            case "table":
                query = buildTableQuery(profileObjectDesc, policyMap, inputAndOutputTablePartitionKey);
                if (query == null) {
                    log.warn("No fields are marked for profiling in field policy ({}); no query will be run", fieldPolicyJsonPath);
                }
                break;
            case "query":
                query = profileObjectDesc;
                break;
            default:
                log.error("Illegal command line argument for object type ({})", profileObjectType);
                showCommandLineArgs();
                return null;
        }

        if (n <= 0) {
            log.error("Illegal command line argument for n for top_n values ({})", n);
            showCommandLineArgs();
            return null;
        }
        profilerConfiguration.setNumberOfTopNValues(n);

        if (!setOutputTableDBAndName(profileOutputTable, profilerConfiguration)) {
            log.error("Illegal command line argument for output table ({})", profileOutputTable);
            showCommandLineArgs();
            return null;
        }

        profilerConfiguration.setInputAndOutputTablePartitionKey(inputAndOutputTablePartitionKey);
        return query;
    }

    /**
     * Builds the select statement for profiling a table.
     *
     * <p>Only fields whose policy has profiling enabled are selected. When the partition key is
     * not {@code ALL} (case-insensitive), the query is restricted to that value of the configured
     * input table partition column.</p>
     *
     * @param tableDescription table reference, either {@code table} or {@code database.table}
     * @param policyMap        field policies as returned by the field policy loader
     * @param partitionKey     partition key value, or {@code ALL} for no partition filter
     * @return the query, or null if no field is marked for profiling
     */
    @Nullable
    private String buildTableQuery(@Nonnull final String tableDescription, @Nonnull final Map<String, FieldPolicy> policyMap, @Nullable final String partitionKey) {
        // Quote source table; split at the first dot only so the remainder is treated as the table part
        final String[] tableRef = tableDescription.split("\\.", 2);
        final String safeTable = (tableRef.length == 1) ? HiveUtils.quoteIdentifier(tableRef[0]) : HiveUtils.quoteIdentifier(tableRef[0], tableRef[1]);

        final List<String> profiledColumns = new ArrayList<>();
        for (final FieldPolicy fieldPolicy : policyMap.values()) {
            if (fieldPolicy.isProfile()) {
                // Locale.ROOT keeps the lower-casing independent of the JVM default locale
                profiledColumns.add(HiveUtils.quoteIdentifier(fieldPolicy.getField().toLowerCase(Locale.ROOT)));
            }
        }
        if (profiledColumns.isEmpty()) {
            return null;
        }

        final StringBuilder sql = new StringBuilder("select ")
            .append(StringUtils.join(profiledColumns, ','))
            .append(" from ")
            .append(safeTable);
        if (partitionKey != null && !ALL_PARTITIONS.equalsIgnoreCase(partitionKey)) {
            sql.append(" where ")
                .append(HiveUtils.quoteIdentifier(profilerConfiguration.getInputTablePartitionColumnName()))
                .append(" = ")
                .append(HiveUtils.quoteString(partitionKey));
        }
        return sql.toString();
    }

    /**
     * Set output database and table on the configuration.
     *
     * <p>A name without a dot sets only the table name and leaves the configured database
     * untouched; {@code database.table} sets both.</p>
     *
     * @param profileOutputTable    output table argument, {@code table} or {@code database.table}
     * @param profilerConfiguration configuration to update
     * @return false if the argument splits into more than two dot-separated parts, true otherwise
     */
    private boolean setOutputTableDBAndName(@Nonnull final String profileOutputTable, @Nonnull final ProfilerConfiguration profilerConfiguration) {
        final String[] tableNameParts = profileOutputTable.split("\\.");

        if (tableNameParts.length == 1) {
            // output db remains as configured
            profilerConfiguration.setOutputTableName(tableNameParts[0]);
            return true;
        }
        if (tableNameParts.length == 2) {
            profilerConfiguration.setOutputDbName(tableNameParts[0]);
            profilerConfiguration.setOutputTableName(tableNameParts[1]);
            return true;
        }
        return false;
    }

    /**
     * Show required command-line arguments.
     */
    private void showCommandLineArgs() {
        log.info("*** \nInfo: Required command line arguments:\n"
                 + "1. object type: valid values are {table, query}\n"
                 + "2. object description: valid values are {<database.table>, <query>}\n"
                 + "3. n for top_n values: valid value is {<integer>}\n"
                 + "4. output table: valid values are {<table>, <database.table>}\n"
                 + "5. full path to policy file\n"
                 + "Info: Optional command line argument:\n"
                 + "6. partition_key: valid value is {<string>}\n\n"
                 + "(Note: Only alphanumeric and underscore characters for table names and partition key)"
                 + "\n***");
    }
}