package com.thinkbiganalytics.spark.datavalidator;
/*-
* #%L
* thinkbig-spark-validate-cleanse-app
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.beust.jcommander.JCommander;
import com.beust.jcommander.internal.Lists;
import com.thinkbiganalytics.annotations.AnnotatedFieldProperty;
import com.thinkbiganalytics.annotations.AnnotationFieldNameResolver;
import com.thinkbiganalytics.hive.util.HiveUtils;
import com.thinkbiganalytics.policy.BaseFieldPolicy;
import com.thinkbiganalytics.policy.FieldPolicy;
import com.thinkbiganalytics.policy.FieldPolicyBuilder;
import com.thinkbiganalytics.policy.PolicyProperty;
import com.thinkbiganalytics.policy.standardization.AcceptsEmptyValues;
import com.thinkbiganalytics.policy.standardization.StandardizationPolicy;
import com.thinkbiganalytics.policy.validation.ValidationPolicy;
import com.thinkbiganalytics.policy.validation.ValidationResult;
import com.thinkbiganalytics.spark.DataSet;
import com.thinkbiganalytics.spark.SparkContextService;
import com.thinkbiganalytics.spark.datavalidator.functions.SumPartitionLevelCounts;
import com.thinkbiganalytics.spark.policy.FieldPolicyLoader;
import com.thinkbiganalytics.spark.util.InvalidFormatException;
import com.thinkbiganalytics.spark.validation.HCatDataType;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang3.reflect.FieldUtils;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.hive.HiveContext;
import org.apache.spark.sql.types.DataType;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.storage.StorageLevel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.ApplicationContext;
import org.springframework.context.annotation.AnnotationConfigApplicationContext;
import org.springframework.stereotype.Component;
import java.io.Serializable;
import java.lang.reflect.ParameterizedType;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Vector;
/**
* Cleanses and validates a table of strings according to defined field-level policies. Records are split into good and bad.
* <p> blog.cloudera.com/blog/2015/07/how-to-do-data-quality-checks-using-apache-spark-dataframes/
*/
@Component
public class Validator implements Serializable {
private static final Logger log = LoggerFactory.getLogger(Validator.class);
@Autowired
IValidatorStrategy validatorStrategy;
public void setValidatorStrategy(IValidatorStrategy strategy) {
this.validatorStrategy = strategy;
}
/*
Valid validation result
*/
protected static final ValidationResult VALID_RESULT = new ValidationResult();
private static final String REJECT_REASON_COL = "dlp_reject_reason";
private static final String VALID_INVALID_COL = "dlp_valid";
private static final String PROCESSING_DTTM_COL = "processing_dttm";
/* Initialize Spark */
private HiveContext hiveContext;
// Optimization to write directly from dataframe to the table vs. temporary table (not tested with < 1.6.x)
private boolean useDirectInsert = true;
/*
Valid target schema
*/
private String validTableName;
private String invalidTableName;
private String feedTablename;
private String refTablename;
private String profileTableName;
private String qualifiedProfileName;
private String targetDatabase;
private String partition;
private FieldPolicy[] policies;
private HCatDataType[] schema;
private Map<String, FieldPolicy> policyMap = new HashMap<>();
/*
Cache for performance. Validators accept different parameter types (numeric, string, etc.) so we need to resolve the type using reflection
*/
private Map<Class, Class> validatorParamType = new HashMap<>();
@Autowired
private SparkContextService scs;
@Autowired
private FieldPolicyLoader loader;
/**
* Path to the file containing the JSON for the field policies. If called from NiFi, the Validate processor passes it in
* as a command-line argument. The JSON should conform to the array of FieldPolicy objects found in the
* thinkbig-field-policy-rest-model module.
*/
private String fieldPolicyJsonPath;
private CommandLineParams params;
static CommandLineParams parseRemainingParameters(String[] args, int from) {
CommandLineParams params = new CommandLineParams();
new JCommander(params, Arrays.copyOfRange(args, from, args.length));
return params;
}
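/**
* Entry point when submitted as a Spark application. Expects the four positional arguments described in the usage
* message below, optionally followed by {@code --hiveConf}, {@code --storageLevel} and {@code --numPartitions} flags.
* An illustrative invocation (the jar name, paths and partition value here are assumptions, not defined by this class):
* <pre>
* spark-submit --class com.thinkbiganalytics.spark.datavalidator.Validator \
*   validate-cleanse-app.jar mydb myfeed 20170101000000 /tmp/field-policy.json \
*   --hiveConf hive.exec.dynamic.partition=true --storageLevel MEMORY_AND_DISK --numPartitions 10
* </pre>
*/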
public static void main(String[] args) {
log.info("Running Spark Validator with the following command line args (comma separated):" + StringUtils.join(args, ","));
// Check how many arguments were passed in
if (args.length < 4) {
System.out.println("Proper Usage is: <targetDatabase> <entity> <partition> <path-to-policy-file>");
System.out.println("You can optionally add: --hiveConf hive.setting=value --hiveConf hive.other.setting=value");
System.out.println("You can optionally add: --storageLevel rdd_persistence_level_value");
System.out.println("You can optionally add: --numPartitions number_of_rdd_partitions");
System.out.println("You provided " + args.length + " args which are (comma separated): " + StringUtils.join(args, ","));
System.exit(1);
}
try {
ApplicationContext ctx = new AnnotationConfigApplicationContext("com.thinkbiganalytics.spark");
Validator app = ctx.getBean(Validator.class);
app.setArguments(args[0], args[1], args[2], args[3]);
app.addParameters(parseRemainingParameters(args, 4));
app.doValidate();
} catch (Exception e) {
log.error("Failed to run validator", e);
System.exit(1);
}
}
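/**
* Derives the working table names from the target database and entity name: the source is {@code <entity>_feed} and the
* results are written to {@code <entity>_valid}, {@code <entity>_invalid} and {@code <entity>_profile}.
*/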
public void setArguments(String targetDatabase, String entity, String partition, String fieldPolicyJsonPath) {
this.validTableName = entity + "_valid";
this.invalidTableName = entity + "_invalid";
this.profileTableName = entity + "_profile";
this.feedTablename = HiveUtils.quoteIdentifier(targetDatabase, entity + "_feed");
this.refTablename = HiveUtils.quoteIdentifier(targetDatabase, validTableName);
this.qualifiedProfileName = HiveUtils.quoteIdentifier(targetDatabase, profileTableName);
this.partition = partition;
this.targetDatabase = targetDatabase;
this.fieldPolicyJsonPath = fieldPolicyJsonPath;
}
protected HiveContext getHiveContext() {
return hiveContext;
}
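/**
* Runs the validation job: reads the requested processing_dttm partition from the feed table, cleanses and validates
* each row against the field policies, splits the rows into valid and invalid sets, writes them to the corresponding
* target tables, and records the validation counts in the profile table.
*/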
public void doValidate() {
try {
SparkContext sparkContext = SparkContext.getOrCreate();
hiveContext = new HiveContext(sparkContext);
for (Param param : params.getHiveParams()) {
log.info("Adding Hive parameter {}={}", param.getName(), param.getValue());
hiveContext.setConf(param.getName(), param.getValue());
}
log.info("Deployment Mode - " + sparkContext.getConf().get("spark.submit.deployMode"));
policyMap = loader.loadFieldPolicy(fieldPolicyJsonPath);
// Extract fields from a source table
StructField[] fields = resolveSchema();
this.schema = resolveDataTypes(fields);
this.policies = resolvePolicies(fields);
String selectStmt = toSelectFields();
String sql = "SELECT " + selectStmt + " FROM " + feedTablename + " WHERE processing_dttm = '" + partition + "'";
log.info("Executing query {}", sql);
DataSet sourceDF = scs.sql(getHiveContext(), sql);
JavaRDD<Row> sourceRDD = sourceDF.javaRDD();
// Derive the schema for the _invalid table from the feed table (adds the reject reason and processing_dttm columns)
StructType invalidSchema = createModifiedSchema(feedTablename, false);
// Derive the schema for the _valid table; fields with standardization policies are typed as String
StructType validSchema = createModifiedSchema(feedTablename, true);
log.info("invalidSchema {}", invalidSchema);
log.info("validSchema {}", validSchema);
log.info("Persistence level: {}", params.getStorageLevel());
// Validate and cleanse input rows
JavaRDD<CleansedRowResult> cleansedRowResultRDD;
if (params.getNumPartitions() <= 0) {
cleansedRowResultRDD = sourceRDD.map(new Function<Row, CleansedRowResult>() {
@Override
public CleansedRowResult call(Row row) throws Exception {
return cleanseAndValidateRow(row);
}
}).persist(StorageLevel.fromString(params.getStorageLevel()));
} else {
log.info("Partition count: " + params.getNumPartitions());
cleansedRowResultRDD = sourceRDD.repartition(params.getNumPartitions()).map(new Function<Row, CleansedRowResult>() {
@Override
public CleansedRowResult call(Row row) throws Exception {
return cleanseAndValidateRow(row);
}
}).persist(StorageLevel.fromString(params.getStorageLevel()));
}
// Derive a new RDD containing only the valid rows
JavaRDD<Row> validResultRDD = cleansedRowResultRDD.filter(new Function<CleansedRowResult, Boolean>() {
@Override
public Boolean call(CleansedRowResult cleansedRowResult) throws Exception {
return cleansedRowResult.rowIsValid;
}
}).map(new Function<CleansedRowResult, Row>() {
@Override
public Row call(CleansedRowResult cleansedRowResult) throws Exception {
return cleansedRowResult.row;
}
});
// Derive a new RDD containing only the invalid rows
JavaRDD<Row> invalidResultRDD = cleansedRowResultRDD.filter(new Function<CleansedRowResult, Boolean>() {
@Override
public Boolean call(CleansedRowResult cleansedRowResult) throws Exception {
return !cleansedRowResult.rowIsValid;
}
}).map(new Function<CleansedRowResult, Row>() {
@Override
public Row call(CleansedRowResult cleansedRowResult) throws Exception {
return cleansedRowResult.row;
}
});
// Counts of invalid columns, total valid rows and total invalid rows
long[] fieldInvalidCounts = cleansedRowResultsValidationCounts(cleansedRowResultRDD, schema.length);
//Create the 2 new Data Frames for the invalid and valid results
final DataSet invalidDF = scs.toDataSet(getHiveContext(), invalidResultRDD, invalidSchema);
final DataSet validatedDF = scs.toDataSet(getHiveContext(), validResultRDD, validSchema);
DataSet invalidDataFrame = null;
// ensure the dataframe matches the correct schema
if (useDirectInsert) {
invalidDataFrame = invalidDF;
} else {
invalidDataFrame = invalidDF.drop(PROCESSING_DTTM_COL).toDF();
}
writeToTargetTable(invalidDataFrame, invalidTableName);
log.info("wrote values to the invalid Table {}", invalidTableName);
// Write out the valid records, dropping the reject reason column (and processing_dttm when not using direct insert)
DataSet validDataFrame = null;
if (useDirectInsert) {
validDataFrame = validatedDF.drop(REJECT_REASON_COL).toDF();
} else {
validDataFrame = validatedDF.drop(REJECT_REASON_COL).drop(PROCESSING_DTTM_COL).toDF();
}
writeToTargetTable(validDataFrame, validTableName);
log.info("wrote values to the valid Table {}", validTableName);
long validCount = fieldInvalidCounts[schema.length];
long invalidCount = fieldInvalidCounts[schema.length + 1];
cleansedRowResultRDD.unpersist();
log.info("Valid count {} invalid count {}", validCount, invalidCount);
// Record the validation stats
writeStatsToProfileTable(validCount, invalidCount, fieldInvalidCounts);
} catch (Exception e) {
log.error("Failed to perform validation", e);
System.exit(1);
}
}
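/**
* Builds the SELECT column list from the field policies, aliasing each feed field to its target field name and always
* appending {@code processing_dttm}. For example (illustrative field names), a policy mapping feed field {@code cust_id}
* to target field {@code customer_id} contributes {@code `cust_id` as `customer_id`} to the list.
*/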
protected String toSelectFields(FieldPolicy[] fieldPolicies) {
List<String> fields = new ArrayList<>();
log.info("Building select statement for # of policies {}", fieldPolicies.length);
for (int i = 0; i < fieldPolicies.length; i++) {
if (fieldPolicies[i].getField() != null) {
log.info("policy [{}] name {} feedName {}", i, fieldPolicies[i].getField(), fieldPolicies[i].getFeedField());
String feedField = StringUtils.defaultIfEmpty(fieldPolicies[i].getFeedField(), fieldPolicies[i].getField());
fields.add("`" + feedField + "` as `" + fieldPolicies[i].getField() + "`");
}
}
fields.add("`processing_dttm`");
return StringUtils.join(fields.toArray(new String[0]), ",");
}
private String toSelectFields() {
return toSelectFields(this.policies);
}
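/**
* Records the validation statistics in the profile table partition as (columnname, metrictype, metricvalue) rows:
* the overall TOTAL_COUNT, VALID_COUNT and INVALID_COUNT, plus an INVALID_COUNT row per column.
*/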
private void writeStatsToProfileTable(long validCount, long invalidCount, long[] fieldInvalidCounts) {
try {
// Create a temporary table that can be used to copy data from. Writing directly to the partition from a spark dataframe doesn't work.
String tempTable = profileTableName + "_" + System.currentTimeMillis();
// TODO: Refactor this into something common with the profile table
List<StructField> fields = new ArrayList<>();
fields.add(DataTypes.createStructField("columnname", DataTypes.StringType, true));
fields.add(DataTypes.createStructField("metrictype", DataTypes.StringType, true));
fields.add(DataTypes.createStructField("metricvalue", DataTypes.StringType, true));
StructType statsSchema = DataTypes.createStructType(fields);
final ArrayList<String> csvRows = new ArrayList<>();
csvRows.add("(ALL),TOTAL_COUNT," + Long.toString(validCount + invalidCount));
csvRows.add("(ALL),VALID_COUNT," + Long.toString(validCount));
csvRows.add("(ALL),INVALID_COUNT," + Long.toString(invalidCount));
// Write a CSV row for each column
for (int i = 0; i < fieldInvalidCounts.length; i++) {
if (i < schema.length) {
String csvRow = schema[i].getName() + ",INVALID_COUNT," + Long.toString(fieldInvalidCounts[i]);
csvRows.add(csvRow);
}
}
JavaSparkContext jsc = new JavaSparkContext(SparkContext.getOrCreate());
JavaRDD<Row> statsRDD = jsc.parallelize(csvRows)
.map(new Function<String, Row>() {
@Override
public Row call(String s) throws Exception {
return RowFactory.create(s.split("\\,"));
}
});
DataSet df = scs.toDataSet(getHiveContext(), statsRDD, statsSchema);
df.registerTempTable(tempTable);
String insertSQL = "INSERT OVERWRITE TABLE " + qualifiedProfileName
+ " PARTITION (processing_dttm='" + partition + "')"
+ " SELECT columnname, metrictype, metricvalue FROM " + HiveUtils.quoteIdentifier(tempTable);
log.info("Writing profile stats {}", insertSQL);
scs.sql(getHiveContext(), insertSQL);
} catch (Exception e) {
log.error("Failed to insert validation stats", e);
throw new RuntimeException(e);
}
}
/**
* Creates a new RDD schema based on the source schema plus two additional columns: the processing partition column and the rejection reason.
*
* @param sourceTable      the table whose structure should be parsed
* @param validTableSchema true if the schema is for the _valid table; any field with standardization rules then becomes a String column in the resulting DataFrame
* @return the schema structure
*/
private StructType createModifiedSchema(String sourceTable, boolean validTableSchema) {
// Extract schema from the source table
StructType schema = scs.toDataSet(getHiveContext(), sourceTable).schema();
StructField[] fields = schema.fields();
List<StructField> fieldsList = new Vector<>();
// Build the list of feed field names defined in the policy map once, along with the subset that has
// standardization policies, instead of rescanning the policy map for every source field
List<String> policyMapFeedFieldNames = new ArrayList<>();
List<String> fieldsWithStandardizers = new ArrayList<>();
for (Map.Entry<String, FieldPolicy> policyMapItem : policyMap.entrySet()) {
String fieldName = policyMapItem.getValue().getFeedField().toLowerCase();
policyMapFeedFieldNames.add(fieldName);
if (policyMapItem.getValue().hasStandardizationPolicies()) {
fieldsWithStandardizers.add(fieldName);
}
}
for (int i = 0; i < fields.length; i++) {
String lowerFieldName = fields[i].name().toLowerCase();
if (policyMapFeedFieldNames.contains(lowerFieldName)) {
log.info("Adding field {}", fields[i].name());
// If the field has a standardization policy and is part of the valid table, set its type to String
if (validTableSchema && fieldsWithStandardizers.contains(lowerFieldName)) {
StructField field = fields[i];
field = new StructField(field.name(), DataTypes.StringType, field.nullable(), field.metadata());
fieldsList.add(field);
} else {
fieldsList.add(fields[i]);
}
} else {
log.warn("Feed table field {} is not present in policy map", fields[i].name().toLowerCase());
}
}
// Append the processing partition column last, then insert the reject reason column just before it
fieldsList.add(new StructField(PROCESSING_DTTM_COL, DataTypes.StringType, true, Metadata.empty()));
// fieldsList.add(fieldsList.size() - 1, new StructField(VALID_INVALID_COL, DataTypes.StringType, true, Metadata.empty()));
fieldsList.add(fieldsList.size() - 1, new StructField(REJECT_REASON_COL, DataTypes.StringType, true, Metadata.empty()));
return new StructType(fieldsList.toArray(new StructField[0]));
}
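/**
* Writes the dataframe into the target table partition. With direct insert enabled, the dataframe (which carries the
* processing_dttm column) is written straight into the table using dynamic partitioning; otherwise the data is
* registered as a temporary table and copied into the partition with an INSERT OVERWRITE statement.
*/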
private void writeToTargetTable(DataSet sourceDF, String targetTable) throws Exception {
final String qualifiedTable = HiveUtils.quoteIdentifier(targetDatabase, targetTable);
// Direct insert into the table partition vs. writing into a temporary table
if (useDirectInsert) {
getHiveContext().setConf("hive.exec.dynamic.partition", "true");
getHiveContext().setConf("hive.exec.dynamic.partition.mode", "nonstrict");
// Required for ORC and Parquet
getHiveContext().setConf("set hive.optimize.index.filter", "false");
sourceDF.writeToTable(PROCESSING_DTTM_COL, qualifiedTable);
return;
} else {
// Legacy way: Create a temporary table we can use to copy data from. Writing directly to the partition from a spark dataframe doesn't work.
String tempTable = targetTable + "_" + System.currentTimeMillis();
sourceDF.registerTempTable(tempTable);
// Insert the data into the partition
final String sql = "INSERT OVERWRITE TABLE " + qualifiedTable + " PARTITION (processing_dttm='" + partition + "') SELECT * FROM " + HiveUtils.quoteIdentifier(tempTable);
log.info("Writing to target {}", sql);
scs.sql(getHiveContext(), sql);
}
}
/**
* Spark function to perform both cleansing and validation of a data row based on data policies and the target datatype
*/
private CleansedRowResult cleanseAndValidateRow(Row row) {
int nulls = 1;
// Create placeholder for the new values plus one extra column for the reject reason
Object[] newValues = new Object[schema.length + 1];
boolean rowValid = true;
String sbRejectReason = null;
List<ValidationResult> results = null;
boolean[] columnsValid = new boolean[schema.length];
// Iterate through columns to cleanse and validate
for (int idx = 0; idx < schema.length; idx++) {
ValidationResult result = VALID_RESULT;
FieldPolicy fieldPolicy = policies[idx];
HCatDataType dataType = schema[idx];
boolean columnValid = true;
// Extract the value (allowing for null or missing field for odd-ball data)
Object val = (idx == row.length() || row.isNullAt(idx) ? null : row.get(idx));
// Handle complex types by passing them through
if (dataType.isUnchecked()) {
if (val == null) {
nulls++;
}
newValues[idx] = val;
} else {
Object fieldValue = val;
if (fieldValue == null) {
nulls++;
}
StandardizationAndValidationResult standardizationAndValidationResult = standardizeAndValidateField(fieldPolicy, fieldValue, dataType);
result = standardizationAndValidationResult.getFinalValidationResult();
//only apply the standardized result value if the routine is valid
fieldValue = result.isValid() ? standardizationAndValidationResult.getFieldValue() : fieldValue;
newValues[idx] = fieldValue;
if (!result.isValid()) {
rowValid = false;
results = (results == null ? new Vector<ValidationResult>() : results);
results.addAll(standardizationAndValidationResult.getValidationResults());
//results.add(result);
columnValid = false;
}
}
// Record whether this column was valid
columnsValid[idx] = columnValid;
}
// Mark the row invalid if all values were null; that indicates a blank line in the file
if (nulls >= schema.length) {
rowValid = false;
results = (results == null ? new Vector<ValidationResult>() : results);
results.add(ValidationResult.failRow("empty", "Row is empty"));
}
// Convert the reject reasons to JSON
sbRejectReason = toJSONArray(results);
// Record the results in the appended columns, move processing partition value last
newValues[schema.length] = newValues[schema.length - 1]; //PROCESSING_DTTM_COL
newValues[schema.length - 1] = sbRejectReason; //REJECT_REASON_COL
// newValues[schema.length - 1] = (rowValid ? "1" : "0"); //VALID_INVALID_COL
CleansedRowResult cleansedRowResult = new CleansedRowResult();
cleansedRowResult.row = RowFactory.create(newValues);
cleansedRowResult.columnsValid = columnsValid;
cleansedRowResult.rowIsValid = rowValid;
return cleansedRowResult;
}
/**
* Counts the invalid values per column plus the total number of valid and invalid rows for a {@code JavaRDD<CleansedRowResult>}
*/
public long[] cleansedRowResultsValidationCounts(JavaRDD<CleansedRowResult> cleansedRowResultJavaRDD, int schemaLength) {
final int schemaLen = schemaLength;
// Maps each partition in the JavaRDD<CleansedRowResults> to a long[] of invalid column counts and total valid/invalid counts
JavaRDD<long[]> partitionCounts = validatorStrategy.getCleansedRowResultPartitionCounts(cleansedRowResultJavaRDD, schemaLen);
// Sums up all partitions validation counts into one long[]
long[] finalCounts = partitionCounts.reduce(new SumPartitionLevelCounts());
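// Layout of the summed counts: indexes 0..schemaLength-1 hold the per-column invalid counts,
// index schemaLength holds the total valid row count and index schemaLength+1 the total invalid row count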
return finalCounts;
}
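/**
* Serializes the validation results as a JSON array string; returns an empty string when the list is null.
*/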
private String toJSONArray(List<ValidationResult> results) {
// Convert the reject reasons to a JSON array string
StringBuffer sb = null;
if (results != null) {
sb = new StringBuffer("[");
for (ValidationResult result : results) {
if (sb.length() > 1) {
sb.append(",");
}
sb.append(result.toJSON());
}
sb.append("]");
}
return (sb == null ? "" : sb.toString());
}
/**
* Perform validation using both schema validation and the validation policies
*/
protected ValidationResult finalValidationCheck(FieldPolicy fieldPolicy, HCatDataType fieldDataType, String fieldValue) {
boolean isEmpty = (StringUtils.isEmpty(fieldValue));
if (isEmpty) {
ValidationPolicy validator;
if ((validator = fieldPolicy.getNotNullValidator()) != null) {
ValidationResult result = validateValue(validator, fieldDataType, fieldValue, -1);
if (result != VALID_RESULT) {
return result;
}
}
} else if (!fieldPolicy.shouldSkipSchemaValidation()) {
if (!fieldDataType.isValueConvertibleToType(fieldValue)) {
return ValidationResult
.failField("incompatible", fieldDataType.getName(),
"Not convertible to " + fieldDataType.getNativeType());
}
}
return VALID_RESULT;
}
/**
* Extract the @PolicyProperty annotated fields
*
* @param policy the policy (validator or standardizer) to parse
* @return a string of the fields and their values
*/
private String getFieldPolicyDetails(BaseFieldPolicy policy) {
//cache the list
AnnotationFieldNameResolver annotationFieldNameResolver = new AnnotationFieldNameResolver(PolicyProperty.class);
List<AnnotatedFieldProperty> list = annotationFieldNameResolver.getProperties(policy.getClass());
StringBuffer sb = null;
for (AnnotatedFieldProperty<PolicyProperty> annotatedFieldProperty : list) {
PolicyProperty prop = annotatedFieldProperty.getAnnotation();
String value = null;
if (sb == null) {
sb = new StringBuffer();
} else {
sb.append(",");
}
sb.append(StringUtils.isBlank(prop.displayName()) ? prop.name() : prop.displayName());
try {
Object fieldValue = FieldUtils.readField(annotatedFieldProperty.getField(), policy, true);
if (fieldValue != null) {
value = fieldValue.toString();
}
} catch (IllegalAccessException e) {
log.debug("Unable to read the value of policy property {}", annotatedFieldProperty.getField().getName());
}
sb.append(" = ");
sb.append(value == null ? "<null> " : value);
}
return sb != null ? sb.toString() : "";
}
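/**
* Applies a single validation policy to a field value, first converting the value to its native type when the
* validator does not expect a String parameter. Returns {@link #VALID_RESULT} when the value passes, a field-rule
* failure when the validator rejects it, and an "incompatible" failure when the value cannot be converted.
*/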
protected ValidationResult validateValue(ValidationPolicy validator, HCatDataType fieldDataType, String fieldValue, Integer idx) {
try {
// Resolve the type of parameter required by the validator. A cache is used to avoid cost of reflection.
Class expectedParamClazz = resolveValidatorParamType(validator);
Object nativeValue = fieldValue;
if (expectedParamClazz != String.class) {
nativeValue = fieldDataType.toNativeValue(fieldValue);
}
if (!validator.validate(nativeValue)) {
//get any fields in this validator annotated with PolicyProperty
return ValidationResult
.failFieldRule("rule", fieldDataType.getName(), validator.getClass().getSimpleName(),
"Rule violation");
}
return VALID_RESULT;
} catch (InvalidFormatException | ClassCastException e) {
return ValidationResult
.failField("incompatible", fieldDataType.getName(),
"Not convertible to " + fieldDataType.getNativeType());
}
}
/* Resolve the type of param required by the validator. A cache is used to avoid cost of reflection */
protected Class resolveValidatorParamType(ValidationPolicy validator) {
Class expectedParamClazz = validatorParamType.get(validator.getClass());
if (expectedParamClazz == null) {
// Cache for future references
Object t = validator.getClass().getGenericInterfaces()[0];
if (t instanceof ParameterizedType) {
ParameterizedType type = (ParameterizedType) t;
expectedParamClazz = (Class) type.getActualTypeArguments()[0];
} else {
expectedParamClazz = String.class;
}
validatorParamType.put(validator.getClass(), expectedParamClazz);
}
return expectedParamClazz;
}
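/**
* Applies the field's standardization and validation policies in the order they are defined: standardizers rewrite the
* working value (skipped for empty values unless the standardizer accepts them), validators are evaluated against the
* current value, and processing stops at the first validation failure. A final schema/not-null check is always applied
* at the end.
*/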
protected StandardizationAndValidationResult standardizeAndValidateField(FieldPolicy fieldPolicy, Object value, HCatDataType dataType) {
StandardizationAndValidationResult result = new StandardizationAndValidationResult(value);
List<BaseFieldPolicy> fieldPolicies = fieldPolicy.getAllPolicies();
int idx = 0;
for (BaseFieldPolicy p : fieldPolicies) {
if (p instanceof StandardizationPolicy) {
StandardizationPolicy standardizationPolicy = (StandardizationPolicy) p;
boolean isEmpty = ((value == null) || (StringUtils.isEmpty(value.toString())));
boolean shouldStandardize = true;
if (isEmpty && !(standardizationPolicy instanceof AcceptsEmptyValues)) {
shouldStandardize = false;
}
if (!standardizationPolicy.accepts(value)) {
shouldStandardize = false;
}
if (shouldStandardize) {
Object newValue = standardizationPolicy.convertRawValue(result.getFieldValue());
result.setFieldValue(newValue != null ? newValue.toString() : newValue);
}
}
if (p instanceof ValidationPolicy) {
ValidationPolicy validationPolicy = (ValidationPolicy) p;
ValidationResult validationResult = validateValue(validationPolicy, dataType, result.getFieldValueForValidation(), idx);
//only need to add those that are invalid
if (validationResult != VALID_RESULT) {
result.addValidationResult(validationResult);
break; // stop processing further policies once an invalid result is found
}
}
}
ValidationResult finalValidationCheck = finalValidationCheck(fieldPolicy, dataType, result.getFieldValueForValidation());
if (finalValidationCheck != VALID_RESULT) {
result.addValidationResult(finalValidationCheck);
}
return result;
}
/**
* Converts the table schema into the corresponding data type structures
*/
protected HCatDataType[] resolveDataTypes(StructField[] fields) {
List<HCatDataType> cols = new Vector<>();
for (StructField field : fields) {
String colName = field.name();
String dataType = field.dataType().simpleString();
cols.add(HCatDataType.createFromDataType(colName, dataType));
}
return cols.toArray(new HCatDataType[0]);
}
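/**
* Reads the column structure of the target valid table; these fields drive the data types and policies used during validation.
*/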
protected StructField[] resolveSchema() {
StructType schema = scs.toDataSet(hiveContext, refTablename).schema();
return schema.fields();
}
/**
* Returns an array of field-level policies for data validation and cleansing
*/
protected FieldPolicy[] resolvePolicies(StructField[] fields) {
List<FieldPolicy> pols = new Vector<>();
for (StructField field : fields) {
String colName = field.name();
FieldPolicy policy = policyMap.get(colName);
if (policy == null) {
policy = FieldPolicyBuilder.SKIP_VALIDATION;
}
pols.add(policy);
}
return pols.toArray(new FieldPolicy[0]);
}
private void addParameters(CommandLineParams params) {
this.params = params;
}
}