/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.drill.exec.store.easy.text.compliant; import org.apache.drill.exec.store.easy.text.TextFormatPlugin.TextFormatConfig; import com.google.common.base.Charsets; import com.univocity.parsers.common.TextParsingException; public class TextParsingSettings { static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(TextParsingSettings.class); public static final TextParsingSettings DEFAULT = new TextParsingSettings(); private String emptyValue = null; private boolean parseUnescapedQuotes = true; private byte quote = b('"'); private byte quoteEscape = b('"'); private byte delimiter = b(','); private byte comment = b('#'); private long maxCharsPerColumn = Character.MAX_VALUE; private byte normalizedNewLine = b('\n'); private byte[] newLineDelimiter = {normalizedNewLine}; private boolean ignoreLeadingWhitespaces = false; private boolean ignoreTrailingWhitespaces = false; private String lineSeparatorString = "\n"; private boolean skipFirstLine = false; private boolean headerExtractionEnabled = false; private boolean useRepeatedVarChar = true; private int numberOfRecordsToRead = -1; public void set(TextFormatConfig config){ this.quote = bSafe(config.getQuote(), "quote"); this.quoteEscape = bSafe(config.getEscape(), "escape"); this.newLineDelimiter = config.getLineDelimiter().getBytes(Charsets.UTF_8); this.delimiter = bSafe(config.getFieldDelimiter(), "fieldDelimiter"); this.comment = bSafe(config.getComment(), "comment"); this.skipFirstLine = config.isSkipFirstLine(); this.headerExtractionEnabled = config.isHeaderExtractionEnabled(); if (this.headerExtractionEnabled) { // In case of header TextRecordReader will use set of VarChar vectors vs RepeatedVarChar this.useRepeatedVarChar = false; } } public byte getComment(){ return comment; } public boolean isSkipFirstLine() { return skipFirstLine; } public void setSkipFirstLine(boolean skipFirstLine) { this.skipFirstLine = skipFirstLine; } public boolean isUseRepeatedVarChar() { return useRepeatedVarChar; } public void setUseRepeatedVarChar(boolean useRepeatedVarChar) { this.useRepeatedVarChar = useRepeatedVarChar; } private static byte bSafe(char c, String name){ if(c > Byte.MAX_VALUE) { throw new IllegalArgumentException(String.format("Failure validating configuration option %s. Expected a " + "character between 0 and 127 but value was actually %d.", name, (int) c)); } return (byte) c; } private static byte b(char c){ return (byte) c; } public byte[] getNewLineDelimiter() { return newLineDelimiter; } /** * Returns the character used for escaping values where the field delimiter is part of the value. Defaults to '"' * @return the quote character */ public byte getQuote() { return quote; } /** * Defines the character used for escaping values where the field delimiter is part of the value. Defaults to '"' * @param quote the quote character */ public void setQuote(byte quote) { this.quote = quote; } public String getLineSeparatorString(){ return lineSeparatorString; } /** * Identifies whether or not a given character is used for escaping values where the field delimiter is part of the value * @param ch the character to be verified * @return true if the given character is the character used for escaping values, false otherwise */ public boolean isQuote(byte ch) { return this.quote == ch; } /** * Returns the character used for escaping quotes inside an already quoted value. Defaults to '"' * @return the quote escape character */ public byte getQuoteEscape() { return quoteEscape; } /** * Defines the character used for escaping quotes inside an already quoted value. Defaults to '"' * @param quoteEscape the quote escape character */ public void setQuoteEscape(byte quoteEscape) { this.quoteEscape = quoteEscape; } /** * Identifies whether or not a given character is used for escaping quotes inside an already quoted value. * @param ch the character to be verified * @return true if the given character is the quote escape character, false otherwise */ public boolean isQuoteEscape(byte ch) { return this.quoteEscape == ch; } /** * Returns the field delimiter character. Defaults to ',' * @return the field delimiter character */ public byte getDelimiter() { return delimiter; } /** * Defines the field delimiter character. Defaults to ',' * @param delimiter the field delimiter character */ public void setDelimiter(byte delimiter) { this.delimiter = delimiter; } /** * Identifies whether or not a given character represents a field delimiter * @param ch the character to be verified * @return true if the given character is the field delimiter character, false otherwise */ public boolean isDelimiter(byte ch) { return this.delimiter == ch; } /** * Returns the String representation of an empty value (defaults to null) * * <p>When reading, if the parser does not read any character from the input, and the input is within quotes, the empty is used instead of an empty string * * @return the String representation of an empty value */ public String getEmptyValue() { return emptyValue; } /** * Sets the String representation of an empty value (defaults to null) * * <p>When reading, if the parser does not read any character from the input, and the input is within quotes, the empty is used instead of an empty string * * @param emptyValue the String representation of an empty value */ public void setEmptyValue(String emptyValue) { this.emptyValue = emptyValue; } /** * Indicates whether the CSV parser should accept unescaped quotes inside quoted values and parse them normally. Defaults to {@code true}. * @return a flag indicating whether or not the CSV parser should accept unescaped quotes inside quoted values. */ public boolean isParseUnescapedQuotes() { return parseUnescapedQuotes; } /** * Configures how to handle unescaped quotes inside quoted values. If set to {@code true}, the parser will parse the quote normally as part of the value. * If set the {@code false}, a {@link TextParsingException} will be thrown. Defaults to {@code true}. * @param parseUnescapedQuotes indicates whether or not the CSV parser should accept unescaped quotes inside quoted values. */ public void setParseUnescapedQuotes(boolean parseUnescapedQuotes) { this.parseUnescapedQuotes = parseUnescapedQuotes; } /** * Indicates whether or not the first valid record parsed from the input should be considered as the row containing the names of each column * @return true if the first valid record parsed from the input should be considered as the row containing the names of each column, false otherwise */ public boolean isHeaderExtractionEnabled() { return headerExtractionEnabled; } /** * Defines whether or not the first valid record parsed from the input should be considered as the row containing the names of each column * @param headerExtractionEnabled a flag indicating whether the first valid record parsed from the input should be considered as the row containing the names of each column */ public void setHeaderExtractionEnabled(boolean headerExtractionEnabled) { this.headerExtractionEnabled = headerExtractionEnabled; } /** * The number of valid records to be parsed before the process is stopped. A negative value indicates there's no limit (defaults to -1). * @return the number of records to read before stopping the parsing process. */ public int getNumberOfRecordsToRead() { return numberOfRecordsToRead; } /** * Defines the number of valid records to be parsed before the process is stopped. A negative value indicates there's no limit (defaults to -1). * @param numberOfRecordsToRead the number of records to read before stopping the parsing process. */ public void setNumberOfRecordsToRead(int numberOfRecordsToRead) { this.numberOfRecordsToRead = numberOfRecordsToRead; } public long getMaxCharsPerColumn() { return maxCharsPerColumn; } public void setMaxCharsPerColumn(long maxCharsPerColumn) { this.maxCharsPerColumn = maxCharsPerColumn; } public void setComment(byte comment) { this.comment = comment; } public byte getNormalizedNewLine() { return normalizedNewLine; } public void setNormalizedNewLine(byte normalizedNewLine) { this.normalizedNewLine = normalizedNewLine; } public boolean isIgnoreLeadingWhitespaces() { return ignoreLeadingWhitespaces; } public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) { this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces; } public boolean isIgnoreTrailingWhitespaces() { return ignoreTrailingWhitespaces; } public void setIgnoreTrailingWhitespaces(boolean ignoreTrailingWhitespaces) { this.ignoreTrailingWhitespaces = ignoreTrailingWhitespaces; } }