/*******************************************************************************
* Copyright 2017 Capital One Services, LLC and Bitwise, Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License
*******************************************************************************/
package hydrograph.engine.cascading.scheme;
import cascading.tap.TapException;
import cascading.tuple.Fields;
import cascading.tuple.Tuple;
import cascading.tuple.coerce.Coercions;
import cascading.tuple.coerce.StringCoerce;
import cascading.tuple.type.CoercibleType;
import cascading.tuple.type.DateType;
import hydrograph.engine.core.utilities.GeneralUtilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.reflect.Type;
import java.util.Arrays;
@SuppressWarnings("rawtypes")
public class DelimitedAndFixedWidthHelper {
private DelimitedAndFixedWidthHelper() {
}
static int counter = 0;
static boolean isFixedWidthField = false;
private static final Logger LOG = LoggerFactory
.getLogger(DelimitedAndFixedWidthHelper.class);
public static Object[] getFields(Fields sourceFields, String line,
String[] lengthsAndDelimiters, String[] lengthsAndDelimitersType,
Type[] types, boolean safe, String quote) {
if (!line.equals("")) {
try {
String[] tokens = generateTokensFromRawData(line,
lengthsAndDelimiters, lengthsAndDelimitersType, quote);
Type[] fieldDataTypes = types;
if (fieldDataTypes == null) {
fieldDataTypes = new Type[sourceFields.size()];
for (int i = 0; i < sourceFields.size(); i++) {
fieldDataTypes[i] = String.class;
}
}
return coerceParsedTokens(sourceFields, line, safe, tokens,
fieldDataTypes, Coercions.coercibleArray(
sourceFields.size(), fieldDataTypes));
} catch (Exception e) {
LOG.error(
"Exception while generating tokens.\nLine being parsed: "
+ line + "\nFields: " + sourceFields
+ "\nLengths and delimiters in scheme: "
+ Arrays.toString(lengthsAndDelimiters)
+ "\nDatatypes in scheme: "
+ Arrays.toString(types)
+ "\nSafe was set to: " + safe, e);
throw new RuntimeException(e);
}
} else {
return new Object[lengthsAndDelimiters.length];
}
}
private static String[] generateTokensFromRawData(String line,
String[] lengthsAndDelimiters, String[] lengthsAndDelimitersType,
String quote) {
String tokens[] = new String[lengthsAndDelimiters.length];
String strings[];
String identifier;
quote = DelimitedAndFixedWidthHelper.maskRegexChar(quote);
for (int i = 0; i < lengthsAndDelimiters.length; i++) {
identifier = DelimitedAndFixedWidthHelper
.maskRegexChar(lengthsAndDelimiters[i]);
if (lengthsAndDelimitersType[i].contains("Integer")) {
tokens[i] = line.substring(0, Integer.parseInt(identifier));
if (i != (lengthsAndDelimiters.length - 1))
line = line.substring(Integer.parseInt(identifier));
} else {
if (!"".equals(quote) && line.contains(quote.replace("\\", ""))) {
// Creation of RegEx to split data based on delimiter
// ignoring the delimiter present in data based on
// presence of quote char
identifier = identifier + "(?=(?:[^" + quote + "]*" + quote
+ "[^" + quote + "]*[^" + quote + identifier + "]*"
+ quote + ")*(?![^" + quote + "]*" + quote + "))";
}
strings = line.split(identifier);
if (strings.length != 0) {
tokens[i] = ((strings)[0]).replace(quote.replace("\\", ""),
"");
if (i != (lengthsAndDelimiters.length - 1))
line = (line.split(identifier, 2))[1];
} else {
tokens[i] = "";
}
}
}
return tokens;
}
private static Object[] coerceParsedTokens(Fields sourceFields,
String line, boolean safe, String[] tokens, Type[] fieldDataTypes,
CoercibleType[] coercions) {
Object[] coercedTokens = new Object[tokens.length];
for (int i = 0; i < tokens.length; i++) {
try {
if (coercions[i] instanceof StringCoerce) {
coercedTokens[i] = coercions[i].canonical(tokens[i]);
} else if (coercions[i] instanceof DateType
&& tokens[i] != null && !"".equals(tokens[i])) {
coercedTokens[i] = coercions[i].canonical(tokens[i]);
} else
coercedTokens[i] = coercions[i].canonical(tokens[i].trim()
.length() > 0 ? tokens[i].trim() : null);
} catch (Exception exception) {
String message = "field " + sourceFields.get(i)
+ " cannot be coerced from : " + tokens[i] + " to: "
+ fieldDataTypes[i];
coercedTokens[i] = null;
if (!safe) {
throw new TapException(message, exception, new Tuple(line));
}
}
}
return coercedTokens;
}
public static StringBuilder createLine(Tuple tuple,
String[] lengthsAndDelimiters, String[] lengthsAndDelimitersType,
boolean strict, char filler, Type[] types, String quote) {
counter = 0;
StringBuilder buffer = new StringBuilder();
for (Object value : tuple) {
isFixedWidthField = false;
isFixedWidthField = isFixedWidthField(lengthsAndDelimitersType,
counter);
// to apply datatype while writing the file
if (types != null) {
if (types[counter] instanceof DateType) {
value = Coercions.coercibleTypeFor(types[counter]).coerce(
value, String.class);
} else {
value = Coercions.coercibleTypeFor(types[counter])
.canonical(value);
}
}
if (value == null) {
value = "";
}
if (isFixedWidthField) {
int lengthDifference = value.toString().length()
- Integer.parseInt(lengthsAndDelimiters[counter]);
if (lengthDifference == 0) {
buffer.append(value);
counter++;
continue;
} else if (lengthDifference > 0) {
if (strict) {
throw new TapException(
"Fixed width field write error. Field "
+ value
+ " has length "
+ value.toString().length()
+ " whereas provided is "
+ lengthsAndDelimiters[counter]
+ ". Set strict to false and provide filler to overide such errors if this is expected behaviour.",
new Tuple(tuple));
}
buffer.append(value.toString().substring(0,
Integer.parseInt(lengthsAndDelimiters[counter])));
counter++;
continue;
} else if (lengthDifference < 0) {
if (strict) {
throw new TapException(
"Fixed width field write error. Field "
+ value
+ " has length "
+ value.toString().length()
+ " whereas provided is "
+ lengthsAndDelimiters[counter]
+ ". Set strict to false and provide filler to overide such errors if this is expected behaviour.",
new Tuple(tuple));
}
try {
if (isNumeric(value)) {
appendZero(buffer, lengthDifference * -1);
buffer.append(value);
} else {
buffer.append(value);
appendFiller(buffer, filler, lengthDifference * -1);
}
} catch (IOException e) {
LOG.error("", e);
throw new RuntimeException(e);
}
counter++;
continue;
}
}
if (quoteCharPresent(quote)) {
value = appendQuoteChars(value, quote,
lengthsAndDelimiters[counter]);
}
buffer.append(value);
if (lengthsAndDelimiters[counter].contentEquals("\\n"))
lengthsAndDelimiters[counter] = "\n";
if (lengthsAndDelimiters[counter].contentEquals("\\t"))
lengthsAndDelimiters[counter] = "\t";
if (lengthsAndDelimiters[counter].contentEquals("\\r"))
lengthsAndDelimiters[counter] = "\r";
buffer.append(GeneralUtilities
.parseHex(lengthsAndDelimiters[counter]));
counter++;
}
return buffer;
}
private static boolean quoteCharPresent(String quote) {
return !quote.equals("");
}
private static Object appendQuoteChars(Object value, String quote,
String lengthsAndDelimiters) {
if (value instanceof String && ((String) value).contains(lengthsAndDelimiters)) {
value = quote + ((String) value) + quote;
}
return value;
}
private static boolean isFixedWidthField(String[] lengthsAndDelimitersType,
int counter) {
return lengthsAndDelimitersType[counter].contains("Integer");
}
private static boolean isNumeric(Object value) {
return value instanceof Number;
}
private static void appendZero(Appendable buffer, int times)
throws IOException {
char filler = ' ';
for (int i = 0; i < times; i++) {
buffer.append(filler);
}
}
private static void appendFiller(Appendable buffer, char filler, int times)
throws IOException {
for (int i = 0; i < times; i++) {
buffer.append(filler);
}
}
public static boolean isLastFieldNewLine(String[] lengthsAndDelimiters) {
return lengthsAndDelimiters[lengthsAndDelimiters.length - 1]
.matches("\n")
|| lengthsAndDelimiters[lengthsAndDelimiters.length - 1]
.contentEquals("\\n");
}
public static boolean hasaNewLineField(String[] lengthsAndDelimiters) {
for (String string : lengthsAndDelimiters) {
if (string.contains("\n") || string.contentEquals("\\n"))
return true;
}
return false;
}
public static String modifyIdentifier(String identifier) {
String string = identifier;
if (identifier.contains("\\r\\n")) {
string = identifier.replace("\\r\\n", "\r\n");
} else if (identifier.contains("\\n")) {
string = identifier.replace("\\n", "\n");
}
if (identifier.contains("\\t")) {
string = identifier.replace("\\t", "\t");
}
if (identifier.contains("\\x")) {
string = GeneralUtilities.parseHex(identifier);
}
return string;
}
public static String[] modifyIdentifier(String[] identifiers) {
for (int i = 0; i < identifiers.length; i++) {
identifiers[i] = modifyIdentifier(identifiers[i]);
}
return identifiers;
}
public static String spillOneLineToOutput(StringBuilder sb,
String[] lengthsAndDelimiters) {
String line = "";
if (!isLastFieldNewLine(lengthsAndDelimiters)
&& !isLastFixedWidthFieldNewLineField(lengthsAndDelimiters)) {
if (hasaNewLineField(lengthsAndDelimiters)) {
String[] splits = sb.toString().split("\n");
for (int i = 0; i < splits.length; i++) {
if (i != splits.length - 1) {
line += splits[i];
line += "\n";
}
}
return line.substring(0, line.length() - 1);
}
} else {
sb.replace(sb.length() - 1, sb.length(), "");
return sb.toString();
}
return line;
}
public static boolean isLastFixedWidthFieldNewLineField(
String[] lengthsAndDelimiters) {
try {
return Integer
.parseInt(lengthsAndDelimiters[lengthsAndDelimiters.length - 1]) == 1;
} catch (Exception e) {
return false;
}
}
public static String maskRegexChar(
String singleChar) {
String string = singleChar;
if (singleChar.contains("|")) {
string = singleChar.replace("|", "\\|");
}
if (singleChar.contains(".")) {
string = singleChar.replace(".", "\\.");
}
if (singleChar.contains("+")) {
string = singleChar.replace("+", "\\+");
}
if (singleChar.contains("$")) {
string = singleChar.replace("$", "\\$");
}
if (singleChar.contains("*")) {
string = singleChar.replace("*", "\\*");
}
if (singleChar.contains("?")) {
string = singleChar.replace("?", "\\?");
}
if (singleChar.contains("^")) {
string = singleChar.replace("^", "\\^");
}
if (singleChar.contains("-")) {
string = singleChar.replace("-", "\\-");
}
if (singleChar.contains("\\x")) {
string = GeneralUtilities
.parseHex(singleChar);
}
return string;
}
public static String[] checkIfDelimiterIsRegexChar(
String[] lengthsAndDelimiters) {
for (int i = 0; i < lengthsAndDelimiters.length; i++)
lengthsAndDelimiters[i] = maskRegexChar(lengthsAndDelimiters[i]);
return lengthsAndDelimiters;
}
public static String arrayToString(String[] lengthsAndDelimiters) {
String string = "";
for (String str : lengthsAndDelimiters) {
string += str;
string += "comma";
}
return string;
}
public static String[] stringToArray(String string) {
return string.split("comma");
}
}