// TextParsingSettings.java example
//
// Explorer
// drill-master
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.store.easy.text.compliant;

import org.apache.drill.exec.store.easy.text.TextFormatPlugin.TextFormatConfig;

import com.google.common.base.Charsets;
import com.univocity.parsers.common.TextParsingException;

/**
 * Mutable bag of options used when parsing delimited text: the quote, quote-escape, field-delimiter
 * and comment bytes, the line delimiter, whitespace handling, header extraction and a few limits.
 *
 * <p>Values can be loaded in bulk from a {@link TextFormatConfig} through {@link #set(TextFormatConfig)}
 * or adjusted one at a time through the individual setters.
 *
 * <p>Nothing in this class is synchronized and every instance, including {@link #DEFAULT}, is mutable.
 */
public class TextParsingSettings {
  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(TextParsingSettings.class);

  /**
   * An instance carrying only the built-in default values. It is an ordinary mutable object, so calling a
   * setter on it changes it for every other user of this constant.
   */
  public static final TextParsingSettings DEFAULT = new TextParsingSettings();

  private String emptyValue = null;
  private boolean parseUnescapedQuotes = true;
  private byte quote = b('"');
  private byte quoteEscape = b('"');
  private byte delimiter = b(',');
  private byte comment = b('#');

  private long maxCharsPerColumn = Character.MAX_VALUE;
  private byte normalizedNewLine = b('\n');
  // Must stay below normalizedNewLine: the initializer reads that field.
  private byte[] newLineDelimiter = {normalizedNewLine};
  private boolean ignoreLeadingWhitespaces = false;
  private boolean ignoreTrailingWhitespaces = false;
  private String lineSeparatorString = "\n";
  private boolean skipFirstLine = false;

  private boolean headerExtractionEnabled = false;
  private boolean useRepeatedVarChar = true;
  private int numberOfRecordsToRead = -1;

  /**
   * Copies the quote, escape, line delimiter, field delimiter, comment, skip-first-line and
   * header-extraction options from the given format configuration into this object.
   *
   * <p>The line delimiter is stored both as UTF-8 bytes ({@link #getNewLineDelimiter()}) and as the
   * original String ({@link #getLineSeparatorString()}), so the two views always describe the same
   * separator after this call.
   *
   * <p>When header extraction is enabled the repeated-VarChar flag is switched off. It is never switched
   * back on here, so a value of {@code false} set earlier is left untouched.
   *
   * @param config the format configuration to read from
   * @throws IllegalArgumentException if the quote, escape, field delimiter or comment character is
   *         greater than 127
   */
  public void set(TextFormatConfig config){
    this.quote = bSafe(config.getQuote(), "quote");
    this.quoteEscape = bSafe(config.getEscape(), "escape");

    final String lineDelimiter = config.getLineDelimiter();
    this.newLineDelimiter = lineDelimiter.getBytes(Charsets.UTF_8);
    // Keep the String form in step with the byte form; otherwise it would still report the
    // built-in "\n" even though a different delimiter has been configured.
    this.lineSeparatorString = lineDelimiter;

    this.delimiter = bSafe(config.getFieldDelimiter(), "fieldDelimiter");
    this.comment = bSafe(config.getComment(), "comment");
    this.skipFirstLine = config.isSkipFirstLine();
    this.headerExtractionEnabled = config.isHeaderExtractionEnabled();
    if (this.headerExtractionEnabled) {
      // In case of header TextRecordReader will use set of VarChar vectors vs RepeatedVarChar
      this.useRepeatedVarChar = false;
    }
  }

  /**
   * Returns the comment character. Defaults to '#'
   * @return the comment character
   */
  public byte getComment(){
    return comment;
  }

  /**
   * Returns the skip-first-line flag. Defaults to {@code false}.
   * @return the current value of the skip-first-line flag
   */
  public boolean isSkipFirstLine() {
    return skipFirstLine;
  }

  /**
   * Sets the skip-first-line flag.
   * @param skipFirstLine the new value of the flag
   */
  public void setSkipFirstLine(boolean skipFirstLine) {
    this.skipFirstLine = skipFirstLine;
  }

  /**
   * Returns the repeated-VarChar flag. Defaults to {@code true}; {@link #set(TextFormatConfig)} turns it
   * off when header extraction is enabled.
   * @return the current value of the repeated-VarChar flag
   */
  public boolean isUseRepeatedVarChar() {
    return useRepeatedVarChar;
  }

  /**
   * Sets the repeated-VarChar flag.
   * @param useRepeatedVarChar the new value of the flag
   */
  public void setUseRepeatedVarChar(boolean useRepeatedVarChar) {
    this.useRepeatedVarChar = useRepeatedVarChar;
  }


  /**
   * Narrows a configuration character to a byte, rejecting anything above 127.
   * A {@code char} is unsigned, so only the upper bound needs checking.
   *
   * @param c the character to convert
   * @param name the configuration option name, used only in the error message
   * @return {@code c} as a byte
   * @throws IllegalArgumentException if {@code c} is greater than {@link Byte#MAX_VALUE}
   */
  private static byte bSafe(char c, String name){
    if(c > Byte.MAX_VALUE) {
      throw new IllegalArgumentException(String.format("Failure validating configuration option %s.  Expected a "
          + "character between 0 and 127 but value was actually %d.", name, (int) c));
    }
    return (byte) c;
  }

  /**
   * Casts a character to a byte without any range check; used for the built-in default values.
   */
  private static byte b(char c){
    return (byte) c;
  }

  /**
   * Returns the bytes of the line delimiter. Defaults to a single '\n' byte.
   *
   * <p>The returned array is the internal one, not a copy; callers should treat it as read-only.
   *
   * @return the line delimiter bytes
   */
  public byte[] getNewLineDelimiter() {
    return newLineDelimiter;
  }

  /**
   * Returns the character used for escaping values where the field delimiter is part of the value. Defaults to '"'
   * @return the quote character
   */
  public byte getQuote() {
    return quote;
  }

  /**
   * Defines the character used for escaping values where the field delimiter is part of the value. Defaults to '"'
   * @param quote the quote character
   */
  public void setQuote(byte quote) {
    this.quote = quote;
  }

  /**
   * Returns the line separator as a String. Defaults to "\n"; after {@link #set(TextFormatConfig)} it is
   * the line delimiter taken from that configuration.
   * @return the line separator String
   */
  public String getLineSeparatorString(){
    return lineSeparatorString;
  }


  /**
   * Identifies whether or not a given character is used for escaping values where the field delimiter is part of the value
   * @param ch the character to be verified
   * @return true if the given character is the character used for escaping values, false otherwise
   */
  public boolean isQuote(byte ch) {
    return this.quote == ch;
  }

  /**
   * Returns the character used for escaping quotes inside an already quoted value. Defaults to '"'
   * @return the quote escape character
   */
  public byte getQuoteEscape() {
    return quoteEscape;
  }

  /**
   * Defines the character used for escaping quotes inside an already quoted value. Defaults to '"'
   * @param quoteEscape the quote escape character
   */
  public void setQuoteEscape(byte quoteEscape) {
    this.quoteEscape = quoteEscape;
  }

  /**
   * Identifies whether or not a given character is used for escaping quotes inside an already quoted value.
   * @param ch the character to be verified
   * @return true if the given character is the quote escape character, false otherwise
   */
  public boolean isQuoteEscape(byte ch) {
    return this.quoteEscape == ch;
  }

  /**
   * Returns the field delimiter character. Defaults to ','
   * @return the field delimiter character
   */
  public byte getDelimiter() {
    return delimiter;
  }

  /**
   * Defines the field delimiter character. Defaults to ','
   * @param delimiter the field delimiter character
   */
  public void setDelimiter(byte delimiter) {
    this.delimiter = delimiter;
  }

  /**
   * Identifies whether or not a given character represents a field delimiter
   * @param ch the character to be verified
   * @return true if the given character is the field delimiter character, false otherwise
   */
  public boolean isDelimiter(byte ch) {
    return this.delimiter == ch;
  }

  /**
   * Returns the String representation of an empty value (defaults to null)
   *
   * <p>When reading, if the parser does not read any character from the input, and the input is within quotes, the empty value is used instead of an empty string
   *
   * @return the String representation of an empty value
   */
  public String getEmptyValue() {
    return emptyValue;
  }

  /**
   * Sets the String representation of an empty value (defaults to null)
   *
   * <p>When reading, if the parser does not read any character from the input, and the input is within quotes, the empty value is used instead of an empty string
   *
   * @param emptyValue the String representation of an empty value
   */
  public void setEmptyValue(String emptyValue) {
    this.emptyValue = emptyValue;
  }


  /**
   * Indicates whether the CSV parser should accept unescaped quotes inside quoted values and parse them normally. Defaults to {@code true}.
   * @return a flag indicating whether or not the CSV parser should accept unescaped quotes inside quoted values.
   */
  public boolean isParseUnescapedQuotes() {
    return parseUnescapedQuotes;
  }

  /**
   * Configures how to handle unescaped quotes inside quoted values. If set to {@code true}, the parser will parse the quote normally as part of the value.
   * If set to {@code false}, a {@link TextParsingException} will be thrown. Defaults to {@code true}.
   * @param parseUnescapedQuotes indicates whether or not the CSV parser should accept unescaped quotes inside quoted values.
   */
  public void setParseUnescapedQuotes(boolean parseUnescapedQuotes) {
    this.parseUnescapedQuotes = parseUnescapedQuotes;
  }

  /**
   * Indicates whether or not the first valid record parsed from the input should be considered as the row containing the names of each column
   * @return true if the first valid record parsed from the input should be considered as the row containing the names of each column, false otherwise
   */
  public boolean isHeaderExtractionEnabled() {
    return headerExtractionEnabled;
  }

  /**
   * Defines whether or not the first valid record parsed from the input should be considered as the row containing the names of each column
   *
   * <p>Unlike {@link #set(TextFormatConfig)}, this setter does not touch the repeated-VarChar flag.
   *
   * @param headerExtractionEnabled a flag indicating whether the first valid record parsed from the input should be considered as the row containing the names of each column
   */
  public void setHeaderExtractionEnabled(boolean headerExtractionEnabled) {
    this.headerExtractionEnabled = headerExtractionEnabled;
  }

  /**
   * The number of valid records to be parsed before the process is stopped. A negative value indicates there's no limit (defaults to -1).
   * @return the number of records to read before stopping the parsing process.
   */
  public int getNumberOfRecordsToRead() {
    return numberOfRecordsToRead;
  }

  /**
   * Defines the number of valid records to be parsed before the process is stopped. A negative value indicates there's no limit (defaults to -1).
   * @param numberOfRecordsToRead the number of records to read before stopping the parsing process.
   */
  public void setNumberOfRecordsToRead(int numberOfRecordsToRead) {
    this.numberOfRecordsToRead = numberOfRecordsToRead;
  }

  /**
   * Returns the maximum number of characters per column. Defaults to {@link Character#MAX_VALUE}.
   * @return the configured maximum number of characters per column
   */
  public long getMaxCharsPerColumn() {
    return maxCharsPerColumn;
  }

  /**
   * Sets the maximum number of characters per column. The value is stored as given, without validation.
   * @param maxCharsPerColumn the new maximum number of characters per column
   */
  public void setMaxCharsPerColumn(long maxCharsPerColumn) {
    this.maxCharsPerColumn = maxCharsPerColumn;
  }

  /**
   * Defines the comment character. Defaults to '#'
   * @param comment the comment character
   */
  public void setComment(byte comment) {
    this.comment = comment;
  }

  /**
   * Returns the normalized new-line character. Defaults to '\n'
   * @return the normalized new-line character
   */
  public byte getNormalizedNewLine() {
    return normalizedNewLine;
  }

  /**
   * Defines the normalized new-line character. This does not change the value returned by
   * {@link #getNewLineDelimiter()}, which is a separate field.
   * @param normalizedNewLine the normalized new-line character
   */
  public void setNormalizedNewLine(byte normalizedNewLine) {
    this.normalizedNewLine = normalizedNewLine;
  }

  /**
   * Returns the ignore-leading-whitespaces flag. Defaults to {@code false}.
   * @return the current value of the flag
   */
  public boolean isIgnoreLeadingWhitespaces() {
    return ignoreLeadingWhitespaces;
  }

  /**
   * Sets the ignore-leading-whitespaces flag.
   * @param ignoreLeadingWhitespaces the new value of the flag
   */
  public void setIgnoreLeadingWhitespaces(boolean ignoreLeadingWhitespaces) {
    this.ignoreLeadingWhitespaces = ignoreLeadingWhitespaces;
  }

  /**
   * Returns the ignore-trailing-whitespaces flag. Defaults to {@code false}.
   * @return the current value of the flag
   */
  public boolean isIgnoreTrailingWhitespaces() {
    return ignoreTrailingWhitespaces;
  }

  /**
   * Sets the ignore-trailing-whitespaces flag.
   * @param ignoreTrailingWhitespaces the new value of the flag
   */
  public void setIgnoreTrailingWhitespaces(boolean ignoreTrailingWhitespaces) {
    this.ignoreTrailingWhitespaces = ignoreTrailingWhitespaces;
  }

}