package com.thinkbiganalytics.spark.validation;
/*-
* #%L
* thinkbig-spark-validate-cleanse-api
* %%
* Copyright (C) 2017 ThinkBig Analytics
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import com.thinkbiganalytics.policy.validation.DateValidator;
import com.thinkbiganalytics.policy.validation.TimestampValidator;
import com.thinkbiganalytics.spark.util.InvalidFormatException;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.HashMap;
import java.util.Map;
/**
* Performs validation of a string value to ensure it is convertible to the give Hive column type. The class ensures the
* precision, range, and data type are compatible.
*/
public class HCatDataType implements Cloneable, Serializable {
private static HCatDataType UNCHECKED_TYPE = new HCatDataType();
private static Map<String, HCatDataType> dataTypes = new HashMap();
// Build static rules around the various column types
static {
dataTypes.put("tinyint", new HCatDataType((int) Byte.MIN_VALUE, (int) Byte.MAX_VALUE));
dataTypes.put("smallint", new HCatDataType((int) Short.MIN_VALUE, (int) Short.MAX_VALUE));
dataTypes.put("int", new HCatDataType(Integer.MIN_VALUE, Integer.MAX_VALUE));
dataTypes.put("bigint", new HCatDataType(BigInteger.class));
dataTypes.put("decimal", new HCatDataType(BigDecimal.class));
dataTypes.put("string", new HCatDataType(Long.MAX_VALUE));
dataTypes.put("varchar", new HCatDataType(65355L));
dataTypes.put("char", new HCatDataType(255L));
// We use -MAX_VALUE because MIN_VALUE is actually minimum positive non-zero value
dataTypes.put("float", new HCatDataType(-Float.MAX_VALUE, Float.MAX_VALUE));
dataTypes.put("double", new HCatDataType(-Double.MAX_VALUE, Double.MAX_VALUE));
dataTypes.put("real", new HCatDataType(-Double.MAX_VALUE, Double.MAX_VALUE));
dataTypes.put("date", new HCatDataType(Date.class));
dataTypes.put("timestamp", new HCatDataType(Timestamp.class));
}
/**
* Whether the value is numeric
*/
boolean isnumeric;
/**
* Class instance returned from converting string to native of this type
*/
private Class convertibleType;
/**
* Whether this is string type
*/
private boolean isstring;
/**
* whether this is unchecked type (essentially any type not defined)
*/
private boolean unchecked;
/**
* Minimum size of number
*/
private Comparable min;
/**
* Maximum size of number
*/
private Comparable max;
/**
* Max length of string
*/
private long maxlength;
/**
* Number of digits supported (precision)
*/
private Integer digits;
/**
* Name of the field (set after clone)
*/
private String name;
/**
* Hive data type (set after clone)
*/
private String nativeType;
private HCatDataType() {
this.unchecked = true;
this.convertibleType = String.class;
}
private HCatDataType(long length) {
this.maxlength = length;
this.isstring = true;
this.convertibleType = String.class;
}
private HCatDataType(Integer min, Integer max) {
this.isnumeric = true;
this.min = min;
this.max = max;
this.convertibleType = Integer.class;
}
private HCatDataType(Double min, Double max) {
this.isnumeric = true;
this.min = min;
this.max = max;
this.convertibleType = Double.class;
}
private HCatDataType(Float min, Float max) {
this.isnumeric = true;
this.min = min;
this.max = max;
this.convertibleType = Float.class;
}
private HCatDataType(Class clazz) {
this.convertibleType = clazz;
if (clazz == Date.class || clazz == Timestamp.class) {
this.isnumeric = false;
} else {
this.isnumeric = true;
BigDecimal minDecimal = new BigDecimal(Long.MIN_VALUE);
BigDecimal maxDecimal = new BigDecimal(Long.MAX_VALUE);
if (clazz == BigInteger.class) {
this.min = minDecimal.toBigInteger();
this.max = maxDecimal.toBigInteger();
} else if (clazz == BigDecimal.class) {
this.min = null;
this.max = null;
} else {
throw new RuntimeException("Invalid class for constructor " + clazz.getCanonicalName());
}
}
}
/**
* Generate a data type validator for the given hive data type
*
* @param columnType the defined hive column type
*/
public static HCatDataType createFromDataType(String name, String columnType) {
Integer decSize = null;
Integer decDigits = null;
Long strLen = null;
String dataType = columnType.toLowerCase();
// Extract precision and scale portion as in Decimal(8,2) or varchar(255)
int idx = columnType.indexOf("(");
if (idx > -1) {
dataType = columnType.substring(0, idx);
String precisionPart = columnType.substring(idx + 1, columnType.length() - 1);
String[] parts = precisionPart.split(",");
if (parts.length == 2) {
decSize = Integer.parseInt(parts[0]);
decDigits = Integer.parseInt(parts[1]);
} else {
strLen = Long.parseLong(parts[0]);
}
}
HCatDataType hcatType = dataTypes.get(dataType);
if (hcatType != null) {
try {
hcatType = (HCatDataType) hcatType.clone();
hcatType.name = name;
hcatType.nativeType = dataType;
// Determine min max based on column precision
if (decSize != null) {
hcatType.digits = decDigits;
hcatType.max = new BigDecimal(generateRepeatingCharacters(decSize, '9')
+ "."
+ generateRepeatingCharacters(decDigits,'9'));
hcatType.min = ((BigDecimal) hcatType.max).negate();
} else if (strLen != null) {
hcatType.maxlength = strLen;
}
} catch (CloneNotSupportedException e) {
throw new RuntimeException("Unexpected clone exception", e);
}
}
return (hcatType == null ? UNCHECKED_TYPE : hcatType);
}
private static String generateRepeatingCharacters(int repeatTimes, char repeatChar) {
if ((repeatTimes <= 0)) {
return "";
}
StringBuffer retVal = new StringBuffer();
for (int i = 0; i < repeatTimes; i++) {
retVal.append(repeatChar);
}
return retVal.toString();
}
public static Map<String, HCatDataType> getDataTypes() {
return dataTypes;
}
private int getNumberOfDecimalPlaces(BigDecimal bigDecimal) {
String string = bigDecimal.stripTrailingZeros().toPlainString();
int index = string.indexOf(".");
return index < 0 ? 0 : string.length() - index - 1;
}
private int getNumberOfDecimalPlaces(Double dbl) {
return getNumberOfDecimalPlaces(new BigDecimal(dbl));
}
public Class getConvertibleType() {
return this.convertibleType;
}
public <T extends Comparable> T toNativeValue(String val) throws InvalidFormatException {
try {
if (StringUtils.isEmpty(val) || convertibleType == String.class) {
return (unchecked || isstring ? (T) val : null);
} else if (convertibleType == Integer.class) {
return (T) new Integer(val);
} else if (convertibleType == Double.class) {
return (T) new Double(val);
} else if (convertibleType == Float.class) {
return (T) new Float(val);
} else if (convertibleType == BigInteger.class) {
return (T) new BigInteger(val);
} else if (convertibleType == BigDecimal.class) {
return (T) new BigDecimal(val);
} else {
throw new RuntimeException("Unexpected conversion [" + convertibleType + "] for value [" + val + "]");
}
} catch (NumberFormatException | ClassCastException e) {
throw new InvalidFormatException(this, val);
}
}
/**
* Validate scale (digits following the decimal)
*
* @param val the native value
* @return whether passes validation
*/
private boolean validatePrecision(Comparable val) {
if (convertibleType == BigDecimal.class) {
if (getNumberOfDecimalPlaces((BigDecimal) val) > digits) {
return false;
}
}
return true;
}
/**
* Tests whether the string value can be converted to the hive data type defined by this class. If it is not convertible then
* hive will not be able to show the value
*
* @param val the string value
* @return whether value is valid
*/
public boolean isValueConvertibleToType(String val) {
return isValueConvertibleToType(val, false);
}
public boolean isValueConvertibleToType(String val, boolean enforcePrecision) {
try {
if (val != null && !isnumeric) {
if (convertibleType == Timestamp.class) {
return new TimestampValidator(true).validate(val);
} else if (convertibleType == Date.class) {
return DateValidator.instance().validate(val);
}
}
Comparable nativeValue = toNativeValue(val);
if (nativeValue != null) {
if (isnumeric) {
if (min != null && min.compareTo(nativeValue) > 0) {
return false;
}
if (max != null && max.compareTo(nativeValue) < 0) {
return false;
}
if (digits != null && !(!enforcePrecision || (enforcePrecision && validatePrecision(nativeValue)))) {
return false;
}
} else if (isstring) {
if (val.length() > maxlength) {
return false;
}
}
}
} catch (InvalidFormatException | ClassCastException | IllegalArgumentException e) {
return false;
}
return true;
}
@Override
public Object clone() throws CloneNotSupportedException {
return super.clone();
}
public String getName() {
return name;
}
public String getNativeType() {
return nativeType;
}
public boolean isUnchecked() {
return unchecked;
}
}