/******************************************************************************* * Copyright 2017 Capital One Services, LLC and Bitwise, Inc. * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *******************************************************************************/ package hydrograph.engine.spark.datasource.delimited; import hydrograph.engine.spark.datasource.utils.TypeCast; import org.apache.commons.lang3.time.FastDateFormat; import org.apache.spark.sql.types.StructType; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.Serializable; import java.lang.reflect.Type; import java.util.Arrays; import java.util.List; import java.util.regex.Pattern; /** * The Class HydrographDelimitedParser. * * @author Bitwise * */ public class HydrographDelimitedParser implements Serializable { static final String SPECIAL_REGEX_CHARS = "([\\]\\[|.*<>\\\\$^?()=!+])"; static final String QUOTED_REGEX_FORMAT = "%2$s(?=(?:[^%1$s]*%1$s[^%1$s]*[^%1$s%2$s]*%1$s)*(?![^%1$s]*%1$s))"; static final String CLEAN_REGEX_FORMAT = "^(?:%1$s)(.*)(?:%1$s)$"; static final String ESCAPE_REGEX_FORMAT = "(%1$s%1$s)"; private static final long serialVersionUID = 4546944494735373827L; private static Logger LOG= LoggerFactory.getLogger(HydrographDelimitedParser.class); protected Pattern splitPattern; protected Pattern cleanPattern; protected Pattern escapePattern; protected String delimiter; protected String quote; protected boolean strict = true; // need to cache value across resets protected boolean enforceStrict = true; protected int numValues; protected Type[] types; protected boolean safe = true; protected StructType schema; protected List<FastDateFormat> dateFormats; public HydrographDelimitedParser(String delimiter, String quote, Class[] types, List<FastDateFormat> dateFormats,StructType schema) { reset(delimiter, quote, types, strict, safe, dateFormats, schema); } public HydrographDelimitedParser(String delimiter, String quote, Class[] types, boolean strict, boolean safe, List<FastDateFormat> dateFormats, StructType schema) { reset(delimiter, quote, types, strict, safe, dateFormats, schema/*, null, null*/); } public void reset(String delimiter, String quote, Type[] types, boolean strict, boolean safe, List<FastDateFormat> dateFormats, StructType schema) { if (delimiter == null || delimiter.isEmpty()) throw new IllegalArgumentException("delimiter may not be null or empty"); if (delimiter.equals(quote)) throw new IllegalArgumentException("delimiter and quote character may not be the same value, got: '" + delimiter + "'"); this.delimiter = delimiter; this.strict = strict; this.safe = safe; this.schema = schema; this.dateFormats = dateFormats; if (quote != null && !quote.isEmpty()) // if empty, leave null this.quote = quote; if (types != null && types.length == 0) this.types = null; if (types != null) this.types = Arrays.copyOf(types, types.length); this.numValues = schema.length(); this.enforceStrict = this.strict; splitPattern = createSplitPatternFor(this.delimiter, this.quote); cleanPattern = createCleanPatternFor(this.quote); escapePattern = createEscapePatternFor(this.quote); } public String getDelimiter() { return delimiter; } public String getQuote() { return quote; } /** * Method createEscapePatternFor creates a regex {@link Pattern} cleaning quote escapes from a String. * <p/> * If {@code quote} is null or empty, a null value will be returned; * * @param quote of type String * @return Pattern */ public Pattern createEscapePatternFor(String quote) { if (quote == null || quote.isEmpty()) return null; return Pattern.compile(String.format(ESCAPE_REGEX_FORMAT, quote)); } /** * Method createCleanPatternFor creates a regex {@link Pattern} for removing quote characters from a String. * <p/> * If {@code quote} is null or empty, a null value will be returned; * * @param quote of type String * @return Pattern */ public Pattern createCleanPatternFor(String quote) { if (quote == null || quote.isEmpty()) return null; return Pattern.compile(String.format(CLEAN_REGEX_FORMAT, quote)); } /** * Method createSplitPatternFor creates a regex {@link Pattern} for splitting a line of text into its component * parts using the given delimiter and quote Strings. {@code quote} may be null. * * @param delimiter of type String * @param quote of type String * @return Pattern */ public Pattern createSplitPatternFor(String delimiter, String quote) { String escapedDelimiter = delimiter.replaceAll(SPECIAL_REGEX_CHARS, "\\\\$1"); if (quote == null || quote.isEmpty()) return Pattern.compile(escapedDelimiter); else return Pattern.compile(String.format(QUOTED_REGEX_FORMAT, quote, escapedDelimiter)); } /** * Method createSplit will split the given {@code value} with the given {@code splitPattern}. * * @param value of type String * @param splitPattern of type Pattern * @param numValues of type int * @return String[] */ public String[] createSplit(String value, Pattern splitPattern, int numValues) { return splitPattern.split(value, numValues); } /** * Method cleanSplit will return a quote free array of String values, the given {@code split} array * will be updated in place. * <p/> * If {@code cleanPattern} is null, quote cleaning will not be performed, but all empty String values * will be replaces with a {@code null} value. * * @param split of type Object[] * @param cleanPattern of type Pattern * @param escapePattern of type Pattern * @param quote of type String * @return Object[] as a convenience */ public Object[] cleanSplit(Object[] split, Pattern cleanPattern, Pattern escapePattern, String quote) { if (cleanPattern != null) { for (int i = 0; i < split.length; i++) { split[i] = cleanPattern.matcher((String) split[i]).replaceAll("$1"); split[i] = escapePattern.matcher((String) split[i]).replaceAll(quote); } } for (int i = 0; i < split.length; i++) { if (((String) split[i]).isEmpty()) split[i] = null; } return split; } public Object[] parseLine(String line) { Object[] split = onlyParseLine(line); split = cleanParsedLine(split); return coerceParsedLine( line, split ); } private Object[] coerceParsedLine(String line, Object[] split) { Object[] result = new Object[split.length]; for (int i = 0; i < split.length; i++) { try { split[i] = !schema.apply(i).dataType().simpleString().equalsIgnoreCase("String") ? split[i].toString().trim() : split[i]; result[i] = TypeCast.inputValue(split[i].toString(), schema.apply(i).dataType(), schema.apply(i).nullable(), "null", true, dateFormats.get(i)); } catch (Exception exception) { result[i] = null; if (!safe) { LOG.error(getSafeMessage(split[i], i) + "\n Line being parsed => " + line); throw new RuntimeException(getSafeMessage(split[i], i) + "\n Line being parsed => " + line,exception); } } } split = result; return split; } protected Object[] cleanParsedLine(Object[] split) { return cleanSplit(split, cleanPattern, escapePattern, quote); } private String getSafeMessage(Object object, int i) { try { return "field " + schema.apply(i).name() + " cannot be coerced from : " + object + " to: " + schema.apply(i).dataType(); } catch( Throwable throwable ) { return "field pos " + i + " cannot be coerced from: " + object + ", pos has no corresponding field name or coercion type"; } } protected Object[] onlyParseLine(String line) { Object[] split = createSplit(line, splitPattern, numValues == 0 ? 0 : -1); if (numValues != 0 && split.length != numValues) { if( enforceStrict ){ LOG.error(getParseMessage( split ) ); throw new RuntimeException( getParseMessage( split ) ); // trap actual line data } Object[] array = new Object[numValues]; Arrays.fill(array, ""); System.arraycopy(split, 0, array, 0, Math.min(numValues, split.length)); split = array; } return split; } private String getParseMessage( Object[] split ) { return "did not parse correct number of values from input data, expected: " + numValues + ", got: " + split.length + ":" + Arrays.toString(split); } }