package com.thinkbiganalytics.spark.datavalidator;
/*-
* #%L
* thinkbig-spark-validate-cleanse-app
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.thinkbiganalytics.policy.BaseFieldPolicy;
import com.thinkbiganalytics.policy.FieldPoliciesJsonTransformer;
import com.thinkbiganalytics.policy.FieldPolicy;
import com.thinkbiganalytics.policy.FieldPolicyBuilder;
import com.thinkbiganalytics.policy.standardization.LowercaseStandardizer;
import com.thinkbiganalytics.policy.standardization.MaskLeavingLastFourDigitStandardizer;
import com.thinkbiganalytics.policy.standardization.SimpleRegexReplacer;
import com.thinkbiganalytics.policy.standardization.StandardizationPolicy;
import com.thinkbiganalytics.policy.standardization.UppercaseStandardizer;
import com.thinkbiganalytics.policy.validation.CharacterValidator;
import com.thinkbiganalytics.policy.validation.EmailValidator;
import com.thinkbiganalytics.policy.validation.LookupValidator;
import com.thinkbiganalytics.policy.validation.NotNullValidator;
import com.thinkbiganalytics.policy.validation.RangeValidator;
import com.thinkbiganalytics.policy.validation.ValidationPolicy;
import com.thinkbiganalytics.policy.validation.ValidationResult;
import com.thinkbiganalytics.spark.validation.HCatDataType;
import org.junit.Before;
import org.junit.Test;
import java.net.URL;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
public class ValidatorTest {
private Validator validator;
@Before
public void setUp() throws Exception {
URL url = getClass().getClassLoader().getResource("example-policy.json");
Path pathToPolicyFile = (url != null) ? Paths.get(url.toURI()) : Paths.get("");
validator = new Validator();
validator.setArguments("emp", "sampletable", "20001", pathToPolicyFile.toString());
}
@Test
public void testParseRemainingParameters() {
String[] args = {"targetDatabase", "entity", "partition", "path-to-policy-file", "-h", "hive.setting.1=value.1", "--hiveConf", "hive.setting.2=value.2"};
CommandLineParams params = Validator.parseRemainingParameters(args, 4);
List<Param> hiveParams = params.getHiveParams();
Param first = hiveParams.get(0);
assertEquals("hive.setting.1", first.getName());
assertEquals("value.1", first.getValue());
Param second = hiveParams.get(1);
assertEquals("hive.setting.2", second.getName());
assertEquals("value.2", second.getValue());
}
@Test
public void testParseRemainingParametersStorageLevel() {
String[] args = {"targetDatabase", "entity", "partition", "path-to-policy-file", "--storageLevel", "MEMORY_ONLY"};
CommandLineParams params = Validator.parseRemainingParameters(args, 4);
String storageLevel = params.getStorageLevel();
assertEquals("MEMORY_ONLY", storageLevel);
}
@Test
public void testDefaultStorageLevel() {
String[] args = {"targetDatabase", "entity", "partition", "path-to-policy-file"};
CommandLineParams params = Validator.parseRemainingParameters(args, 4);
String defaultStorageLevel = params.getStorageLevel();
assertEquals("MEMORY_AND_DISK", defaultStorageLevel);
}
@Test
public void testParseRemainingParametersNumPartitions() {
String[] args = {"targetDatabase", "entity", "partition", "path-to-policy-file", "--storageLevel", "MEMORY_ONLY", "--numPartitions", "10"};
CommandLineParams params = Validator.parseRemainingParameters(args, 4);
Integer numRDDPartitions = params.getNumPartitions();
assertEquals("10", String.valueOf(numRDDPartitions));
}
@Test
public void testDefaultNumPartitions() {
String[] args = {"targetDatabase", "entity", "partition", "path-to-policy-file"};
CommandLineParams params = Validator.parseRemainingParameters(args, 4);
Integer defaultRDDPartitions = params.getNumPartitions();
assertEquals("-1", String.valueOf(defaultRDDPartitions));
}
@Test
public void testParseRemainingParameters_missingParameters() {
String[] args = {"targetDatabase", "entity", "partition", "path-to-policy-file"};
CommandLineParams params = Validator.parseRemainingParameters(args, 4);
List<Param> hiveParams = params.getHiveParams();
assertTrue(hiveParams.isEmpty());
}
@Test
public void testValidateRange() {
assertEquals(Validator.VALID_RESULT, rangeValidate(1, 100, "decimal", "1"));
assertEquals(Validator.VALID_RESULT, rangeValidate(1, 100, "double", "100.0"));
assertEquals(Validator.VALID_RESULT, rangeValidate(1, 100, "decimal", "15.55"));
assertEquals(Validator.VALID_RESULT, rangeValidate(1, 100, "int", "50"));
assertEquals(Validator.VALID_RESULT, rangeValidate(-50, 0, "real", "-32.123"));
assertTrue(!rangeValidate(1, 100, "decimal", "0").isValid());
assertTrue(!rangeValidate(1, 100, "double", "0").isValid());
assertTrue(!rangeValidate(1, 100, "int", "0").isValid());
}
private ValidationResult rangeValidate(Number min, Number max, String dataType, String value) {
RangeValidator validatorPolicy = new RangeValidator(min, max);
List<BaseFieldPolicy> policies = new ArrayList<>();
policies.add(validatorPolicy);
FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName("field1").feedFieldName("field1").addPolicies(policies).build();
StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, value, HCatDataType.createFromDataType("field1", dataType));
return result.getFinalValidationResult();
}
@Test
public void standardizeRegex() {
SimpleRegexReplacer standardizer = new SimpleRegexReplacer("(?i)foo", "bar");
String fieldName = "field1";
List<BaseFieldPolicy> policies = new ArrayList<>();
policies.add(standardizer);
FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");
StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, "aafooaa", fieldDataType);
assertEquals(result.getFieldValue(),"aabaraa");
result = validator.standardizeAndValidateField(fieldPolicy, null, fieldDataType);
assertNull(result.getFieldValue());
result = validator.standardizeAndValidateField(fieldPolicy, "", fieldDataType);
assertEquals(result.getFieldValue(),"");
}
@Test
public void standardizeAndValidate() {
String fieldName = "field1";
List<BaseFieldPolicy> policies = new ArrayList<>();
policies.add(new SimpleRegexReplacer("(?i)foo", "bar"));
policies.add(new LookupValidator("aabaraa"));
policies.add(new SimpleRegexReplacer("(?i)bar", "test"));
policies.add(new LookupValidator("aatestaa"));
FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");
StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, "aafooaa", fieldDataType);
assertEquals(result.getFieldValue(), "aatestaa");
assertEquals(Validator.VALID_RESULT,result.getFinalValidationResult());
}
@Test
public void invalidStandardizeAndValidate() {
String fieldName = "field1";
List<BaseFieldPolicy> policies = new ArrayList<>();
policies.add(new SimpleRegexReplacer("(?i)foo", "bar"));
policies.add(new LookupValidator("blah"));
policies.add(new SimpleRegexReplacer("(?i)bar", "test"));
policies.add(new LookupValidator("aatestaa"));
FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");
StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, "aafooaa", fieldDataType);
assertEquals("aabaraa",result.getFieldValue() );
assertNotEquals(Validator.VALID_RESULT,result.getFinalValidationResult());
}
@Test
public void nullValueStandardizeAndValidate() {
String fieldValue = null;
String fieldName = "field1";
List<BaseFieldPolicy> policies = new ArrayList<>();
policies.add(new SimpleRegexReplacer("(?i)foo", "bar"));
policies.add(new LookupValidator("blah"));
policies.add(new SimpleRegexReplacer("(?i)bar", "test"));
policies.add(new LookupValidator("aatestaa"));
FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");
StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, null, fieldDataType);
assertNotEquals(Validator.VALID_RESULT,result.getFinalValidationResult());
}
@Test
public void mixedStandardizeAndValidate() {
String fieldValue = "TeSt_fiELd";
String fieldName = "field1";
List<BaseFieldPolicy> policies = new ArrayList<>();
policies.add(UppercaseStandardizer.instance());
policies.add(new CharacterValidator("UPPERCASE"));
policies.add(LowercaseStandardizer.instance());
policies.add(new CharacterValidator("LOWERCASE"));
policies.add(UppercaseStandardizer.instance());
policies.add(new CharacterValidator("UPPERCASE"));
policies.add(LowercaseStandardizer.instance());
policies.add(new CharacterValidator("LOWERCASE"));
FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName(fieldName).feedFieldName(fieldName).build();
HCatDataType fieldDataType = HCatDataType.createFromDataType(fieldName, "string");
StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, fieldValue, fieldDataType);
assertEquals(Validator.VALID_RESULT,result.getFinalValidationResult());
assertEquals("test_field", result.getFieldValue());
}
@Test
public void testValidateNotNull() {
assertNotEquals(Validator.VALID_RESULT, notNullValidate("string", null, false, false));
}
private ValidationResult notNullValidate(String dataType, String value, boolean allowEmptyString, boolean trimString) {
NotNullValidator validatorPolicy = new NotNullValidator(allowEmptyString, trimString);
List<BaseFieldPolicy> policies = new ArrayList<>();
policies.add(validatorPolicy);
FieldPolicy fieldPolicy = FieldPolicyBuilder.newBuilder().addPolicies(policies).tableName("emp").fieldName("field1").feedFieldName("field1").build();
StandardizationAndValidationResult result = validator.standardizeAndValidateField(fieldPolicy, value, HCatDataType.createFromDataType("field1", dataType));
return result.getFinalValidationResult();
}
@Test
public void testPolicyMap() {
String fieldPolicyJson = "[{\"profile\":true,\"index\":false,\"fieldName\":\"fieldA\",\"feedFieldName\":\"fieldA\",\"standardization\":null,\"validation\":[{\"name\":\"Not Null\","
+ "\"displayName\":null,\"description\":null,\"shortDescription\":null,\"properties\":[{\"name\":\"EMPTY_STRING\",\"displayName\":null,\"value\":\"false\",\"values\":null,\"placeholder\":null,\"type\":null,\"hint\":null,\"objectProperty\":\"allowEmptyString\",\"selectableValues\":[],\"required\":false,\"group\":null,\"groupOrder\":null,\"layout\":null,\"hidden\":false,\"pattern\":null,\"patternInvalidMessage\":null},{\"name\":\"TRIM_STRING\",\"displayName\":null,\"value\":\"true\",\"values\":null,\"placeholder\":null,\"type\":null,\"hint\":null,\"objectProperty\":\"trimString\",\"selectableValues\":[],\"required\":false,\"group\":null,\"groupOrder\":null,\"layout\":null,\"hidden\":false,\"pattern\":null,\"patternInvalidMessage\":null}],\"objectClassType\":\"com.thinkbiganalytics.policy.validation.NotNullValidator\",\"objectShortClassType\":\"NotNullValidator\",\"propertyValuesDisplayString\":null,\"regex\":null,\"type\":null}]},{\"profile\":true,\"index\":false,\"fieldName\":\"id\",\"feedFieldName\":\"id\",\"standardization\":null,\"validation\":null},{\"profile\":true,\"index\":false,\"fieldName\":\"email\",\"feedFieldName\":\"email\",\"standardization\":null,\"validation\":null},{\"profile\":true,\"index\":false,\"fieldName\":\"gender\",\"feedFieldName\":\"gender\",\"standardization\":null,\"validation\":null},{\"profile\":true,\"index\":false,\"fieldName\":\"ip_address\",\"feedFieldName\":\"ip_address\",\"standardization\":null,\"validation\":null},{\"profile\":true,\"index\":false,\"fieldName\":\"credit_card\",\"feedFieldName\":\"credit_card\",\"standardization\":null,\"validation\":null},{\"profile\":true,\"index\":false,\"fieldName\":\"country\",\"feedFieldName\":\"country\",\"standardization\":null,\"validation\":null},{\"profile\":true,\"index\":false,\"fieldName\":\"birthdate\",\"feedFieldName\":\"birthdate\",\"standardization\":null,\"validation\":null},{\"profile\":true,\"index\":false,\"fieldName\":\"salary\",\"feedFieldName\":\"salary\",\"standardization\":null,\"validation\":null},{\"profile\":true,\"index\":false,\"fieldName\":\"fieldB\",\"feedFieldName\":\"fieldB\",\"standardization\":null,\"validation\":null}]";
Map<String, FieldPolicy> policyMap = new FieldPoliciesJsonTransformer(fieldPolicyJson).buildPolicies();
assertEquals(policyMap.size(), 10);
}
}