package com.thinkbiganalytics.spark.dataprofiler.output;
/*-
* #%L
* thinkbig-spark-job-profiler-app
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.thinkbiganalytics.hive.util.HiveUtils;
import com.thinkbiganalytics.spark.DataSet;
import com.thinkbiganalytics.spark.SparkContextService;
import com.thinkbiganalytics.spark.dataprofiler.ColumnStatistics;
import com.thinkbiganalytics.spark.dataprofiler.ProfilerConfiguration;
import com.thinkbiganalytics.spark.dataprofiler.StatisticsModel;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SQLContext;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import javax.annotation.Nonnull;
/**
* Writes profile statistics results to a Hive table.
*/
@SuppressWarnings("serial")
public class OutputWriter implements Serializable {
/**
* Write the profile statistics to Hive.
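*
* <p>A minimal usage sketch; the {@code sqlContext} and {@code scs} instances are assumed to be supplied
* by the surrounding profiler job, and the variable names are illustrative only:</p>
* <pre>{@code
* StatisticsModel model = ...;      // statistics produced by the profiler
* ProfilerConfiguration conf = ...; // output db/table/partition key populated
* OutputWriter.writeModel(model, conf, sqlContext, scs);
* }</pre>
*
* @param model                 profiler statistics to persist
* @param profilerConfiguration output database, table, and partition settings
* @param sqlContext            Spark SQL context used to build and register the result rows
* @param scs                   Spark context service used to create the DataSet and run the Hive SQL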
*/
public static void writeModel(@Nonnull final StatisticsModel model, @Nonnull final ProfilerConfiguration profilerConfiguration, @Nonnull final SQLContext sqlContext,
@Nonnull final SparkContextService scs) {
final OutputWriter writer = new OutputWriter(profilerConfiguration);
for (final ColumnStatistics column : model.getColumnStatisticsMap().values()) {
writer.addRows(column.getStatistics());
}
writer.writeResultToTable(sqlContext, scs);
}
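/**
* Rows accumulated from each profiled column; written out together by {@link #writeResultToTable}.
*/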
private final List<OutputRow> outputRows = new ArrayList<>();
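/**
* Configuration providing the output database, table, partition column, and partition key.
*/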
@Nonnull
private final ProfilerConfiguration profilerConfiguration;
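/**
* Instances are created only through {@link #writeModel}, which accumulates the rows and performs the write.
*/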
private OutputWriter(@Nonnull final ProfilerConfiguration profilerConfiguration) {
this.profilerConfiguration = profilerConfiguration;
}
/**
* Check that the output configuration (database, table, partition column, and partition key) has been set.
*
* @return true if all four output settings are non-null
*/
private boolean checkOutputConfigSettings() {
return profilerConfiguration.getOutputDbName() != null
&& profilerConfiguration.getOutputTableName() != null
&& profilerConfiguration.getOutputTablePartitionColumnName() != null
&& profilerConfiguration.getInputAndOutputTablePartitionKey() != null;
}
/**
* Add multiple rows to be written to the output table
*
* @param rows list of rows to write
*/
public void addRows(List<OutputRow> rows) {
outputRows.addAll(rows);
}
/**
* Write result to Hive table
*
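* @param sqlContext Spark SQL context used to build and register the result rows
* @param scs        Spark context service used to create the DataSet and run the Hive SQL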
* @return true if the metrics were written; false if the output configuration or Spark SQL context was missing
*/
@SuppressWarnings("unchecked")
public boolean writeResultToTable(@Nonnull final SQLContext sqlContext, @Nonnull final SparkContextService scs) {
boolean retVal = false;
if (!checkOutputConfigSettings()) {
System.out.println("Error writing result: Output database/table/partition column/partition key not set.");
} else if (sqlContext == null) {
System.out.println("Error writing result: Spark context is not available.");
} else {
@SuppressWarnings("squid:S2095") final JavaRDD<OutputRow> outputRowsRDD = JavaSparkContext.fromSparkContext(sqlContext.sparkContext()).parallelize(outputRows);
DataSet outputRowsDF = scs.toDataSet(sqlContext, outputRowsRDD, OutputRow.class);
// Spark's saveAsTable() cannot write directly into a specific Hive partition,
// so register the rows as a temporary table and INSERT them into the partitioned output table
String tempTable = profilerConfiguration.getOutputTableName() + "_" + System.currentTimeMillis();
outputRowsDF.registerTempTable(tempTable);
createOutputTableIfNotExists(sqlContext, scs);
writeResultToOutputTable(sqlContext, scs, tempTable);
retVal = true;
}
return retVal;
}
/**
* Create the output table if it does not already exist
*/
private void createOutputTableIfNotExists(@Nonnull final SQLContext sqlContext, @Nonnull final SparkContextService scs) {
String createTableSQL = "CREATE TABLE IF NOT EXISTS " + HiveUtils.quoteIdentifier(profilerConfiguration.getOutputDbName(), profilerConfiguration.getOutputTableName()) + "\n"
+ "(columnname STRING, metrictype STRING, metricvalue STRING)\n"
+ "PARTITIONED BY (" + HiveUtils.quoteIdentifier(profilerConfiguration.getOutputTablePartitionColumnName()) + " STRING)\n"
+ "ROW FORMAT DELIMITED\n"
+ "FIELDS TERMINATED BY ','\n"
+ "STORED AS TEXTFILE";
scs.sql(sqlContext, createTableSQL);
}
/**
* Insert the profile rows from the temporary table into the partitioned output table
*/
private void writeResultToOutputTable(@Nonnull final SQLContext sqlContext, @Nonnull final SparkContextService scs, @Nonnull final String tempTable) {
String insertTableSQL = "INSERT INTO TABLE " + HiveUtils.quoteIdentifier(profilerConfiguration.getOutputDbName(), profilerConfiguration.getOutputTableName())
+ " PARTITION (" + HiveUtils.quoteIdentifier(profilerConfiguration.getOutputTablePartitionColumnName()) + "="
+ HiveUtils.quoteString(profilerConfiguration.getInputAndOutputTablePartitionKey()) + ")"
+ " SELECT columnname,metrictype,metricvalue FROM " + HiveUtils.quoteIdentifier(tempTable);
scs.sql(sqlContext, insertTableSQL);
System.out.println("[PROFILER-INFO] Metrics written to Hive table: "
+ profilerConfiguration.getOutputDbName() + "." + profilerConfiguration.getOutputTableName()
+ " Partition: (" + profilerConfiguration.getOutputTablePartitionColumnName() + "='" + profilerConfiguration.getInputAndOutputTablePartitionKey() + "')"
+ " [" + outputRows.size() + " rows]");
}
}