/*******************************************************************************
* Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
*******************************************************************************/
package hydrograph.engine.spark.helper;
import hydrograph.engine.core.constants.Constants;
import hydrograph.engine.spark.datasource.utils.TypeCast;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
/**
 * Static helpers for reading and writing records whose columns are a mix of
 * delimited and fixed-width fields.
 *
 * <p>A record layout is described by two parallel arrays: {@code lengthsAndDelimiters}
 * holds, per field, either a width (as a decimal string) or a delimiter, and
 * {@code lengthsAndDelimitersType} holds a type name per field. A field is treated
 * as fixed-width when its type name contains {@code "Integer"}; otherwise it is
 * treated as delimited.
 *
 * <p>All methods are static and keep no state between calls.
 *
 * @author Bitwise
 */
public final class DelimitedAndFixedWidthHelper {

    private static final Logger LOG = LoggerFactory
            .getLogger(DelimitedAndFixedWidthHelper.class);

    /** Substring of a field's type name that marks the field as fixed-width. */
    private static final String FIXED_WIDTH_TYPE_MARKER = "Integer";

    /** Separator used by {@link #arrayToString(String[])} and {@link #stringToArray(String)}. */
    private static final String ARRAY_SEPARATOR = "comma";

    /** Characters that {@link #maskRegexChar(String)} prefixes with a backslash. */
    private static final String REGEX_META_CHARS = "|.+$*?^-)([]{}";

    /**
     * @deprecated no longer read or written by this class; {@link #createLine} now keeps
     *             its position in a local variable so that concurrent calls cannot
     *             interfere. Retained only so existing package-level references compile.
     */
    @Deprecated
    static int counter = 0;

    /**
     * @deprecated no longer read or written by this class; see {@link #counter}.
     */
    @Deprecated
    static boolean isFixedWidthField = false;

    private DelimitedAndFixedWidthHelper() {
    }

    /**
     * Splits a raw line into tokens according to the layout and converts every token
     * to the data type of the schema field at the same position.
     *
     * @param schema                   schema whose i-th field describes the i-th token
     * @param line                     raw record text; an empty string yields an array of nulls
     * @param lengthsAndDelimiters     per-field width or delimiter
     * @param lengthsAndDelimitersType per-field type name (fixed-width when it contains "Integer")
     * @param safe                     when {@code true}, a value that cannot be converted becomes
     *                                 {@code null}; when {@code false}, conversion failure is an error
     * @param quote                    quote string, or empty for none
     * @param dateFormats              per-field date format handed to the type conversion
     * @return one converted value per layout entry
     * @throws RuntimeException if tokenizing or (with {@code safe == false}) conversion fails;
     *                          the message carries the line, the schema and the layout
     * @throws Exception        declared for backward compatibility
     */
    public static Object[] getFields(
            StructType schema, String line,
            String[] lengthsAndDelimiters, String[] lengthsAndDelimitersType,
            boolean safe, String quote, List<FastDateFormat> dateFormats) throws Exception {
        if (line.isEmpty()) {
            return new Object[lengthsAndDelimiters.length];
        }
        try {
            String[] tokens = generateTokensFromRawData(line,
                    lengthsAndDelimiters, lengthsAndDelimitersType, quote);
            return coerceParsedTokens(line, tokens, safe, schema, dateFormats);
        } catch (Exception e) {
            throw new RuntimeException("Exception while generating tokens.\nLine being parsed: "
                    + line + "\nFields: " + getFieldsFromSchema(schema)
                    + "\nLengths and delimiters in scheme: "
                    + Arrays.toString(lengthsAndDelimiters)
                    + "\nDatatypes in scheme: "
                    + getTypesFromSchema(schema)
                    + "\nSafe was set to: " + safe + "\n Error being -> ", e);
        }
    }

    /** Returns the data types of all schema fields as a comma-separated list (for diagnostics). */
    private static String getTypesFromSchema(StructType schema) {
        StringBuilder types = new StringBuilder();
        int fieldCount = schema.fields().length;
        for (int i = 0; i < fieldCount; i++) {
            if (i > 0) {
                types.append(", ");
            }
            types.append(schema.apply(i).dataType().toString());
        }
        return types.toString();
    }

    /** Returns the names of all schema fields as a comma-separated list (for diagnostics). */
    private static String getFieldsFromSchema(StructType schema) {
        StringBuilder names = new StringBuilder();
        int fieldCount = schema.fields().length;
        for (int i = 0; i < fieldCount; i++) {
            if (i > 0) {
                names.append(", ");
            }
            names.append(schema.apply(i).name());
        }
        return names.toString();
    }

    /**
     * Cuts {@code line} into one raw token per layout entry, consuming the line left to right.
     * Fixed-width entries take a prefix of the given width; delimited entries take the text
     * up to the next delimiter, ignoring delimiters enclosed in the quote string.
     *
     * @throws IllegalArgumentException  if a non-final delimited field's delimiter is not found
     * @throws IndexOutOfBoundsException if the line is shorter than a fixed-width field
     */
    private static String[] generateTokensFromRawData(
            String line, String[] lengthsAndDelimiters,
            String[] lengthsAndDelimitersType, String quote) {
        final int fieldCount = lengthsAndDelimiters.length;
        final String[] tokens = new String[fieldCount];
        final String maskedQuote = quote == null ? "" : maskRegexChar(quote);
        // The quote as it literally appears in the data (regex escapes stripped again).
        final String literalQuote = maskedQuote.replace("\\", "");
        String remaining = line;

        for (int i = 0; i < fieldCount; i++) {
            final boolean lastField = i == fieldCount - 1;
            final String identifier = maskRegexChar(lengthsAndDelimiters[i]);

            if (isFixedWidthField(lengthsAndDelimitersType, i)) {
                int width = Integer.parseInt(identifier);
                tokens[i] = remaining.substring(0, width);
                if (!lastField) {
                    remaining = remaining.substring(width);
                }
                continue;
            }

            String splitRegex = identifier;
            if (!"".equals(maskedQuote) && remaining.contains(literalQuote)) {
                // Creation of RegEx to split data based on delimiter
                // ignoring the delimiter present in data based on
                // presence of quote char
                splitRegex = identifier + "(?=(?:[^" + maskedQuote + "]*" + maskedQuote
                        + "[^" + maskedQuote + "]*[^" + maskedQuote + identifier + "]*"
                        + maskedQuote + ")*(?![^" + maskedQuote + "]*" + maskedQuote + "))";
            }

            // A limit of 2 yields [token, rest] and, unlike the no-limit form, keeps empty
            // fields, so a line made only of delimiters (e.g. ",,") is still consumed
            // one field at a time instead of stalling on the first field.
            String[] parts = remaining.split(splitRegex, 2);
            tokens[i] = parts[0].replace(literalQuote, "");
            if (!lastField) {
                if (parts.length < 2) {
                    throw new IllegalArgumentException("Delimiter '" + lengthsAndDelimiters[i]
                            + "' of field at position " + i + " not found in remaining input: "
                            + remaining);
                }
                remaining = parts[1];
            }
        }
        return tokens;
    }

    /**
     * Converts every token to the data type of the schema field at the same position.
     * Tokens of non-string fields are trimmed before conversion. The input array is not modified.
     *
     * @throws RuntimeException if a token cannot be converted and {@code safe} is {@code false}
     */
    private static Object[] coerceParsedTokens(
            String line, Object[] tokens, boolean safe,
            StructType schema, List<FastDateFormat> dateFormats) {
        Object[] result = new Object[tokens.length];
        for (int i = 0; i < tokens.length; i++) {
            Object value = tokens[i];
            try {
                if (!schema.apply(i).dataType().simpleString().equalsIgnoreCase("String")) {
                    value = value.toString().trim();
                }
                result[i] = TypeCast.inputValue(value.toString(), schema.apply(i).dataType(),
                        schema.apply(i).nullable(), "null", true, dateFormats.get(i));
            } catch (Exception exception) {
                if (!safe) {
                    throw new RuntimeException(getSafeMessage(value, i, schema)
                            + "\n Line being parsed => " + line, exception);
                }
                // Safe mode: the field is deliberately left null; keep a trace for debugging.
                result[i] = null;
                if (LOG.isDebugEnabled()) {
                    LOG.debug(getSafeMessage(value, i, schema)
                            + "; storing null because safe is set", exception);
                }
            }
        }
        return result;
    }

    /** Builds the conversion-failure message, tolerating a position that has no schema field. */
    private static String getSafeMessage(
            Object object, int i, StructType schema) {
        try {
            return "field " + schema.apply(i).name() + " cannot be coerced from : " + object
                    + " to: " + schema.apply(i).dataType();
        } catch (RuntimeException unavailable) {
            return "field pos " + i + " cannot be coerced from: " + object
                    + ", pos has no corresponding field name or coercion type";
        }
    }

    /**
     * Serializes one tuple into an output line according to the layout.
     *
     * <p>Fixed-width values are truncated or right-padded with {@code filler} to the
     * configured width (or rejected when {@code strict} is set). Delimited values are
     * followed by their delimiter, after the escapes {@code \n}, {@code \t}, {@code \r}
     * and {@code \xHH} in the delimiter have been decoded; a value that contains its
     * delimiter is wrapped in {@code quote} when a quote is configured.
     *
     * <p>The {@code lengthsAndDelimiters} array is not modified.
     *
     * @param strTuple                 field values joined by
     *                                 {@code Constants.LENGTHS_AND_DELIMITERS_SEPARATOR}
     * @param lengthsAndDelimiters     per-field width or delimiter
     * @param lengthsAndDelimitersType per-field type name (fixed-width when it contains "Integer")
     * @param strict                   when {@code true}, a fixed-width value whose length differs
     *                                 from the configured width is an error
     * @param filler                   padding character for short fixed-width values
     * @param quote                    quote string, or empty for none
     * @return the serialized line
     * @throws RuntimeException if {@code strict} is set and a fixed-width value has the wrong length
     */
    public static String createLine(String strTuple,
            String[] lengthsAndDelimiters, String[] lengthsAndDelimitersType,
            boolean strict, char filler, String quote) {
        // NOTE(review): split without a limit drops trailing empty strings, so trailing empty
        // fields of the tuple are not written. Left as-is because the tuple's exact format is
        // produced by callers not visible here - confirm whether a limit of -1 is wanted.
        String[] tuple = strTuple.split(Constants.LENGTHS_AND_DELIMITERS_SEPARATOR);
        StringBuilder buffer = new StringBuilder();

        for (int index = 0; index < tuple.length; index++) {
            String value = tuple[index];
            if (isFixedWidthField(lengthsAndDelimitersType, index)) {
                appendFixedWidth(buffer, value, lengthsAndDelimiters[index], strict, filler, tuple);
                continue;
            }
            // Decode the delimiter once and use the decoded form both for the quote
            // decision and for the output, so every call behaves the same way.
            String delimiter = resolveOutputDelimiter(lengthsAndDelimiters[index]);
            if (quoteCharPresent(quote)) {
                value = appendQuoteChars(value, quote, delimiter);
            }
            buffer.append(value).append(delimiter);
        }
        return buffer.toString();
    }

    /** Appends {@code value} fitted to the width given by {@code widthSpec}. */
    private static void appendFixedWidth(StringBuilder buffer, String value, String widthSpec,
            boolean strict, char filler, String[] tuple) {
        int width = Integer.parseInt(widthSpec);
        int valueLength = value.length();
        if (valueLength != width && strict) {
            throw new RuntimeException(
                    "Fixed width field write error. Field "
                            + value
                            + " has length "
                            + valueLength
                            + " whereas provided is "
                            + widthSpec
                            + ". Set strict to false and provide filler to overide such errors if this is expected behaviour. "
                            + "The prospect tuple is : "
                            + Arrays.toString(tuple));
        }
        if (valueLength >= width) {
            buffer.append(value, 0, width);
        } else {
            buffer.append(value);
            appendFiller(buffer, filler, width - valueLength);
        }
    }

    /** Decodes the escapes {@code \n}, {@code \t}, {@code \r} and {@code \xHH} in a delimiter. */
    private static String resolveOutputDelimiter(String delimiter) {
        String resolved = delimiter;
        if (resolved.contentEquals("\\n")) {
            resolved = "\n";
        } else if (resolved.contentEquals("\\t")) {
            resolved = "\t";
        } else if (resolved.contentEquals("\\r")) {
            resolved = "\r";
        }
        return parseHex(resolved);
    }

    private static boolean quoteCharPresent(String quote) {
        return quote != null && !quote.isEmpty();
    }

    /** Wraps {@code value} in {@code quote} when it contains {@code delimiter}. */
    private static String appendQuoteChars(String value, String quote, String delimiter) {
        return value.contains(delimiter) ? quote + value + quote : value;
    }

    private static boolean isFixedWidthField(String[] lengthsAndDelimitersType,
            int position) {
        return lengthsAndDelimitersType[position].contains(FIXED_WIDTH_TYPE_MARKER);
    }

    private static void appendFiller(StringBuilder buffer, char filler, int times) {
        for (int i = 0; i < times; i++) {
            buffer.append(filler);
        }
    }

    /**
     * Tells whether the last layout entry is a newline, given either as the newline
     * character or as the two-character escape {@code \n}.
     *
     * @return {@code false} for a null or empty array
     */
    public static boolean isLastFieldNewLine(String[] lengthsAndDelimiters) {
        if (lengthsAndDelimiters == null || lengthsAndDelimiters.length == 0) {
            return false;
        }
        String last = lengthsAndDelimiters[lengthsAndDelimiters.length - 1];
        return "\n".equals(last) || "\\n".equals(last);
    }

    /**
     * Tells whether any layout entry contains a newline character or equals the
     * two-character escape {@code \n}.
     */
    public static boolean hasaNewLineField(String[] lengthsAndDelimiters) {
        for (String entry : lengthsAndDelimiters) {
            if (entry.contains("\n") || entry.contentEquals("\\n")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether the text contains a newline character or the two-character
     * escape {@code \n}. (Either test also covers the CRLF forms.)
     */
    public static boolean containsNewLine(String outputLine) {
        return outputLine.contains("\n") || outputLine.contains("\\n");
    }

    /**
     * Decodes the escapes {@code \r\n}, {@code \n}, {@code \t} and {@code \xHH} in an
     * identifier. Every kind of escape present is decoded, including when several
     * kinds occur in the same identifier.
     */
    public static String modifyIdentifier(String identifier) {
        // Each step works on the result of the previous one so that no
        // replacement is lost when several escapes are present.
        String decoded = identifier
                .replace("\\r\\n", "\r\n")
                .replace("\\n", "\n")
                .replace("\\t", "\t");
        if (decoded.contains("\\x")) {
            decoded = parseHex(decoded);
        }
        return decoded;
    }

    /**
     * Applies {@link #modifyIdentifier(String)} to every element, in place.
     *
     * @return the same array instance that was passed in
     */
    public static String[] modifyIdentifier(String[] identifiers) {
        for (int i = 0; i < identifiers.length; i++) {
            identifiers[i] = modifyIdentifier(identifiers[i]);
        }
        return identifiers;
    }

    /**
     * Extracts the part of the buffered text that can be emitted.
     *
     * <p>When the last layout entry is a newline (or a fixed-width field of width 1),
     * the buffer minus its final character is returned. Otherwise, if the layout or the
     * buffer contains a newline, everything before the last newline-separated segment
     * is returned; if there is no such complete part, the result is empty.
     *
     * @param sb                   buffered text
     * @param lengthsAndDelimiters per-field width or delimiter
     * @return the text to emit, possibly empty
     */
    public static String spillOneLineToOutput(String sb,
            String[] lengthsAndDelimiters) {
        if (isLastFieldNewLine(lengthsAndDelimiters)
                || isLastFixedWidthFieldNewLineField(lengthsAndDelimiters)) {
            return sb.isEmpty() ? sb : sb.substring(0, sb.length() - 1);
        }
        if (!hasaNewLineField(lengthsAndDelimiters) && !containsNewLine(sb)) {
            return "";
        }
        String[] segments = sb.split("\n");
        StringBuilder line = new StringBuilder();
        // Join every segment except the last one; with fewer than two segments
        // there is nothing complete to emit and the result stays empty.
        for (int i = 0; i < segments.length - 1; i++) {
            if (i > 0) {
                line.append('\n');
            }
            line.append(segments[i]);
        }
        return line.toString();
    }

    /**
     * Tells whether the last layout entry parses as the integer 1.
     *
     * @return {@code false} when the array is null or empty or the entry is not a number
     */
    public static boolean isLastFixedWidthFieldNewLineField(
            String[] lengthsAndDelimiters) {
        if (lengthsAndDelimiters == null || lengthsAndDelimiters.length == 0) {
            return false;
        }
        try {
            return Integer.parseInt(lengthsAndDelimiters[lengthsAndDelimiters.length - 1]) == 1;
        } catch (NumberFormatException notANumber) {
            return false;
        }
    }

    /**
     * Turns a delimiter or quote into a regular-expression fragment that matches it
     * literally: {@code \xHH} escapes are decoded first, then each of the characters
     * {@code | . + $ * ? ^ - ) ( [ ] { }} is prefixed with a backslash. Backslashes
     * themselves are left untouched.
     *
     * <p>The method is not idempotent: masking an already masked string escapes it again.
     */
    public static String maskRegexChar(
            String singleChar) {
        // Decode hex before escaping so that a metacharacter written as \xHH is escaped too.
        String literal = singleChar.contains("\\x") ? parseHex(singleChar) : singleChar;
        StringBuilder masked = new StringBuilder(literal.length() + 4);
        for (int i = 0; i < literal.length(); i++) {
            char c = literal.charAt(i);
            if (REGEX_META_CHARS.indexOf(c) >= 0) {
                masked.append('\\');
            }
            masked.append(c);
        }
        return masked.toString();
    }

    /**
     * Applies {@link #maskRegexChar(String)} to every element, in place.
     *
     * @return the same array instance that was passed in
     */
    public static String[] checkIfDelimiterIsRegexChar(
            String[] lengthsAndDelimiters) {
        for (int i = 0; i < lengthsAndDelimiters.length; i++) {
            lengthsAndDelimiters[i] = maskRegexChar(lengthsAndDelimiters[i]);
        }
        return lengthsAndDelimiters;
    }

    /**
     * Concatenates the elements, appending the literal text {@code comma} after each one.
     * {@link #stringToArray(String)} splits on the same text.
     */
    public static String arrayToString(String[] lengthsAndDelimiters) {
        StringBuilder joined = new StringBuilder();
        for (String entry : lengthsAndDelimiters) {
            joined.append(entry).append(ARRAY_SEPARATOR);
        }
        return joined.toString();
    }

    /** Splits a string on the literal text {@code comma}. */
    public static String[] stringToArray(String string) {
        return string.split(ARRAY_SEPARATOR);
    }

    /**
     * Replaces every {@code \xHH} escape (two hexadecimal digits) by the character with
     * that code; the two-character escape {@code \t} is treated as {@code \x09}.
     * A {@code \x} with nothing after it is dropped.
     *
     * @throws IllegalArgumentException if {@code \x} is followed by fewer than two characters
     *                                  or by characters that are not hexadecimal digits
     */
    public static String parseHex(String input) {
        final int NO_OF_DIGITS = 2;
        String normalized = input.replace("\\t", "\\x09");
        String[] tokens = normalized.split("\\\\x");
        StringBuilder result = new StringBuilder(normalized.length());
        for (int i = 0; i < tokens.length; i++) {
            String token = tokens[i];
            // The first token is the text before the first \x (empty when the input
            // starts with \x), so it never carries hex digits.
            if (i == 0 || token.isEmpty()) {
                result.append(token);
                continue;
            }
            if (token.length() < NO_OF_DIGITS) {
                throw new IllegalArgumentException("Malformed hex escape \\x" + token
                        + " in: " + input + " (expected " + NO_OF_DIGITS + " hex digits)");
            }
            result.append(hexToChar(token.substring(0, NO_OF_DIGITS)));
            result.append(token, NO_OF_DIGITS, token.length());
        }
        return result.toString();
    }

    private static char hexToChar(String hex) {
        return (char) Short.parseShort(hex, 16);
    }
}