// TextReader.java example (Explorer: drill-master)
/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.drill.exec.store.easy.text.compliant;

import io.netty.buffer.DrillBuf;

import java.io.IOException;

import org.apache.drill.common.exceptions.UserException;

import com.univocity.parsers.common.TextParsingException;
import com.univocity.parsers.csv.CsvParserSettings;

/*******************************************************************************
 * Portions Copyright 2014 uniVocity Software Pty Ltd
 ******************************************************************************/

/**
 * A byte-based Text parser implementation. Builds heavily upon the uniVocity parsers. Customized for UTF8 parsing and
 * DrillBuf support.
 */
final class TextReader {
  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(TextReader.class);

  /** Sentinel used as the "previous byte" when there is no meaningful previous byte. */
  private static final byte NULL_BYTE = (byte) '\0';

  private final TextParsingContext context;

  /** Record limit; {@code Long.MAX_VALUE} when the settings report -1. */
  private final long recordsToRead;
  private final TextParsingSettings settings;

  private final TextInput input;
  private final TextOutput output;
  /** Scratch buffer holding the whitespace that follows a quoted value. */
  private final DrillBuf workBuf;

  /** The byte most recently read from {@link #input}. */
  private byte ch;

  // index of the field within this record
  private int fieldIndex;

  /** Behavior settings **/
  private final boolean ignoreTrailingWhitespace;
  private final boolean ignoreLeadingWhitespace;
  private final boolean parseUnescapedQuotes;

  /** Key Characters **/
  private final byte comment;
  private final byte delimiter;
  private final byte quote;
  private final byte quoteEscape;
  private final byte newLine;

  /**
   * Creates a reader over the given input that writes parsed fields to the given output.
   * The design follows the uniVocity CSV parser (configured there through {@link CsvParserSettings});
   * this reader takes its configuration from {@code settings}, which must be fully initialized.
   * @param settings  the parser configuration
   * @param input  input stream
   * @param output  interface to produce output record batch
   * @param workBuf  working buffer to handle whitespaces
   */
  public TextReader(TextParsingSettings settings, TextInput input, TextOutput output, DrillBuf workBuf) {
    this.context = new TextParsingContext(input, output);
    this.workBuf = workBuf;
    this.settings = settings;

    // -1 means "no limit"
    this.recordsToRead = settings.getNumberOfRecordsToRead() == -1 ? Long.MAX_VALUE : settings.getNumberOfRecordsToRead();

    this.ignoreTrailingWhitespace = settings.isIgnoreTrailingWhitespaces();
    this.ignoreLeadingWhitespace = settings.isIgnoreLeadingWhitespaces();
    this.parseUnescapedQuotes = settings.isParseUnescapedQuotes();
    this.delimiter = settings.getDelimiter();
    this.quote = settings.getQuote();
    this.quoteEscape = settings.getQuoteEscape();
    this.newLine = settings.getNormalizedNewLine();
    this.comment = settings.getComment();

    this.input = input;
    this.output = output;
  }

  /**
   * @return  the output interface this reader writes to
   */
  public TextOutput getOutput(){
    return output;
  }

  /**
   * Checks whether the given byte is a white space. As per the univocity text reader
   * any ASCII value <= ' ' is considered a white space. Since byte in Java is signed,
   * negative values (bytes >= 0x80) are explicitly excluded.
   * @param b  byte to test
   * @return  true if {@code b} is in the range 0..' ' inclusive
   */
  static boolean isWhite(byte b){
    return b <= ' ' && b > -1;
  }

  /**
   * Converts a raw byte to a char for use in error messages. Without this, string
   * concatenation would print the byte's numeric value instead of the character.
   */
  private static char toDisplayChar(byte b) {
    return (char) (b & 0xFF);
  }

  /**
   * Informs the output interface that we are starting a new record batch.
   */
  public void resetForNextBatch(){
    output.startBatch();
  }

  /**
   * @return  the current position as reported by the input
   */
  public long getPos(){
    return input.getPos();
  }

  /**
   * Function encapsulates parsing an entire record, delegates parsing of the
   * fields to parseField() function.
   * The start of the record is marked on the input before parsing so that the record's
   * text can be recovered later (it is used for error reporting in handleException()).
   * If the stream ends mid-record, the record is still finished as long as at least one
   * field was written or the output reports row data; otherwise the end-of-stream
   * exception is propagated.
   * @return  true once the record has been handed to the output (the visible code never returns false)
   * @throws IOException
   */
  private boolean parseRecord() throws IOException {
    final byte newLine = this.newLine;
    final TextInput input = this.input;

    input.mark();

    fieldIndex = 0;
    if (isWhite(ch) && ignoreLeadingWhitespace) {
      skipWhitespace();
    }

    int fieldsWritten = 0;
    try{
      boolean earlyTerm = false;
      while (ch != newLine) {
        earlyTerm = !parseField();
        fieldsWritten++;
        if (ch != newLine) {
          // step over the delimiter that ended the field
          ch = input.nextChar();
          if (ch == newLine) {
            // a delimiter immediately followed by the newline: emit a trailing empty field
            output.startField(fieldsWritten++);
            output.endEmptyField();
            break;
          }
        }
        if(earlyTerm){
          // the output does not want the rest of this record; discard the remainder of the line
          if(ch != newLine){
            input.skipLines(1);
          }
          break;
        }
      }
    }catch(StreamFinishedPseudoException e){
      // if we've written part of a field or all of a field, we should send this row.
      if(fieldsWritten == 0 && !output.rowHasData()){
        throw e;
      }
    }

    output.finishRecord();
    return true;
  }

  /**
   * Function parses an individual field up to the delimiter (or newline), handing each byte to
   * the output's whitespace-ignoring append.
   * @throws IOException
   */
  private void parseValueIgnore() throws IOException {
    final byte newLine = this.newLine;
    final byte delimiter = this.delimiter;
    final TextOutput output = this.output;
    final TextInput input = this.input;

    byte ch = this.ch;
    while (ch != delimiter && ch != newLine) {
      output.appendIgnoringWhitespace(ch);
      ch = input.nextChar();
    }
    this.ch = ch;
  }

  /**
   * Function parses an individual field and appends all characters till the delimiter (or newline)
   * to the output, including white spaces.
   * @throws IOException
   */
  private void parseValueAll() throws IOException {
    final byte newLine = this.newLine;
    final byte delimiter = this.delimiter;
    final TextOutput output = this.output;
    final TextInput input = this.input;

    byte ch = this.ch;
    while (ch != delimiter && ch != newLine) {
      output.append(ch);
      ch = input.nextChar();
    }
    this.ch = ch;
  }

  /**
   * Function simply delegates the parsing of a single unquoted field to the actual implementation,
   * chosen by the trailing-whitespace setting.
   * @throws IOException
   */
  private void parseValue() throws IOException {
    if (ignoreTrailingWhitespace) {
      parseValueIgnore();
    }else{
      parseValueAll();
    }
  }

  /**
   * Recursive function invoked when a quote is encountered. Function also
   * handles the case when there are non-white space characters in the field
   * after the quoted value.
   * On normal return, {@link #ch} holds the delimiter or newline that ended the field.
   * @param prev  previous byte read
   * @throws IOException
   * @throws TextParsingException  on an unescaped quote when unescaped quotes are not allowed, or on
   *                               unexpected content after the closing quote
   */
  private void parseQuotedValue(byte prev) throws IOException {
    final byte newLine = this.newLine;
    final byte delimiter = this.delimiter;
    final TextOutput output = this.output;
    final TextInput input = this.input;
    final byte quote = this.quote;

    ch = input.nextCharNoNewLineCheck();

    // the value ends at a quote that is followed by a delimiter, a newline or whitespace
    while (!(prev == quote && (ch == delimiter || ch == newLine || isWhite(ch)))) {
      if (ch != quote) {
        if (prev == quote) { // unescaped quote detected
          if (parseUnescapedQuotes) {
            output.append(quote);
            output.append(ch);
            parseQuotedValue(ch);
            break;
          } else {
            throw new TextParsingException(
                context,
                "Unescaped quote character '"
                    + toDisplayChar(quote)
                    + "' inside quoted value of CSV field. To allow unescaped quotes, set 'parseUnescapedQuotes' to 'true' in the CSV parser settings. Cannot parse CSV input.");
          }
        }
        output.append(ch);
        prev = ch;
      } else if (prev == quoteEscape) {
        // escaped quote: emit a single quote and forget the escape
        output.append(quote);
        prev = NULL_BYTE;
      } else {
        // a quote that is either the closing quote or an unescaped one; decided on the next byte
        prev = ch;
      }
      ch = input.nextCharNoNewLineCheck();
    }

    // Handles whitespaces after quoted value:
    // Whitespaces are ignored (see isWhite()) if they are not used as delimiters
    // For example, in tab-separated files (TSV files), '\t' is used as delimiter and should not be ignored
    // Content after whitespaces may be parsed if 'parseUnescapedQuotes' is enabled.
    // isWhite() is used rather than a bare "ch <= ' '" so that negative bytes (non-ASCII UTF-8 data)
    // are treated as content and not silently swallowed as whitespace.
    if (ch != newLine && isWhite(ch) && ch != delimiter) {
      final DrillBuf workBuf = this.workBuf;
      workBuf.resetWriterIndex();
      do {
        // saves whitespaces after value
        workBuf.writeByte(ch);
        ch = input.nextChar();
        // found a new line, go to next record.
        if (ch == newLine) {
          return;
        }
      } while (isWhite(ch) && ch != delimiter);

      // there's more stuff after the quoted value, not only empty spaces.
      if (!(ch == delimiter || ch == newLine) && parseUnescapedQuotes) {

        output.append(quote);
        for (int i = 0; i < workBuf.writerIndex(); i++) {
          output.append(workBuf.getByte(i));
        }
        // the next character is not the escape character, put it there
        if (ch != quoteEscape) {
          output.append(ch);
        }
        // sets this character as the previous character (may be escaping)
        // calls recursively to keep parsing potentially quoted content
        parseQuotedValue(ch);
      }
    }

    if (!(ch == delimiter || ch == newLine)) {
      throw new TextParsingException(context, "Unexpected character '" + toDisplayChar(ch)
          + "' following quoted value of CSV field. Expecting '" + toDisplayChar(delimiter) + "'. Cannot parse CSV input.");
    }
  }

  /**
   * Captures the entirety of parsing a single field and based on the input delegates to the appropriate function.
   * @return  the value returned by the output when the field is ended; parseRecord() treats
   *          false as a request to stop parsing the current record early
   * @throws IOException
   */
  private final boolean parseField() throws IOException {

    output.startField(fieldIndex++);

    if (isWhite(ch) && ignoreLeadingWhitespace) {
      skipWhitespace();
    }

    if (ch == delimiter) {
      return output.endEmptyField();
    }

    if (ch == quote) {
      parseQuotedValue(NULL_BYTE);
    } else {
      parseValue();
    }

    return output.endField();
  }

  /**
   * Helper function to skip white spaces occurring at the current input stream.
   * Stops at the delimiter or newline even if that byte is itself a white space.
   * @throws IOException
   */
  private void skipWhitespace() throws IOException {
    final byte delimiter = this.delimiter;
    final byte newLine = this.newLine;
    final TextInput input = this.input;

    while (isWhite(ch) && ch != delimiter && ch != newLine) {
      ch = input.nextChar();
    }
  }

  /**
   * Starting point for the reader. Clears the stopped flag and sets up the input interface.
   * @throws IOException
   */
  public final void start() throws IOException {
    context.stopped = false;
    input.start();
  }


  /**
   * Parses the next record from the input. Will skip the line if its a comment,
   * this is required when the file contains headers
   * @return  true if a record was parsed; false if the input is exhausted or the reader has been
   *          stopped (for example after the configured number of records has been read)
   * @throws IOException
   * @throws TextParsingException  if parsing fails; other exceptions are wrapped by handleException()
   */
  public final boolean parseNext() throws IOException {
    try {
      while (!context.stopped) {
        ch = input.nextChar();
        if (ch == comment) {
          input.skipLines(1);
          continue;
        }
        break;
      }

      // Once stopped, nothing new has been read into 'ch'; parsing again would re-use the stale
      // byte and emit an empty record on every call. Report "no more records" instead.
      if (context.stopped) {
        return false;
      }

      final long initialLineNumber = input.lineCount();
      boolean success = parseRecord();
      // a single record must not span more than one input line
      if (initialLineNumber + 1 < input.lineCount()) {
        throw new TextParsingException(context, "Cannot use newline character within quoted string");
      }

      if(success){
        if (recordsToRead > 0 && context.currentRecord() >= recordsToRead) {
          context.stop();
        }
        return true;
      }else{
        return false;
      }

    } catch (StreamFinishedPseudoException ex) {
      stopParsing();
      return false;
    } catch (Exception ex) {
      try {
        throw handleException(ex);
      } finally {
        stopParsing();
      }
    }
  }

  /**
   * Hook invoked when parsing ends (end of stream or failure). Intentionally does nothing at present.
   */
  private void stopParsing(){

  }

  /**
   * Makes line separator characters visible in the given text.
   * @param str  text to transform
   * @param addNewLine  if true, each separator is replaced by its bracketed escaped form followed by the
   *                    original separator and a tab (only the first matching kind of "\r\n", "\n", "\r"
   *                    is handled); if false, every "\n" and "\r" is replaced by its escaped form
   * @return  the transformed text
   */
  private static String displayLineSeparators(String str, boolean addNewLine) {
    // literal replacement; no regular expression is needed for these fixed sequences
    if (addNewLine) {
      if (str.contains("\r\n")) {
        str = str.replace("\r\n", "[\\r\\n]\r\n\t");
      } else if (str.contains("\n")) {
        str = str.replace("\n", "[\\n]\n\t");
      } else {
        str = str.replace("\r", "[\\r]\r\t");
      }
    } else {
      str = str.replace("\n", "\\n");
      str = str.replace("\r", "\\r");
    }
    return str;
  }

  /**
   * Helper method to handle exceptions caught while processing text files and generate better error messages associated with
   * the exception.
   * A {@link TextParsingException} is rethrown unchanged. Any other exception is wrapped in a new
   * {@link TextParsingException} whose message describes the text read since the last mark
   * (over-long input, embedded line separators, null characters); the message is null when none
   * of those conditions apply.
   * @param ex  Exception raised
   * @return  never returns normally; the return type only lets callers write {@code throw handleException(ex)}
   * @throws IOException
   */
  private TextParsingException handleException(Exception ex) throws IOException {

    if (ex instanceof TextParsingException) {
      throw (TextParsingException) ex;
    }

    if (ex instanceof ArrayIndexOutOfBoundsException) {
      ex = UserException
          .dataReadError(ex)
          .message(
              "Drill failed to read your text file.  Drill supports up to %d columns in a text file.  Your file appears to have more than that.",
              RepeatedVarCharOutput.MAXIMUM_NUMBER_COLUMNS)
          .build(logger);
    }

    // Built incrementally; starting from an empty builder (rather than a null String) keeps the
    // literal text "null" out of the message.
    final StringBuilder message = new StringBuilder();
    final String parsed = input.getStringSinceMarkForError();
    if (parsed != null) {
      final char[] chars = parsed.toCharArray();
      final int length = chars.length;
      if (length > settings.getMaxCharsPerColumn()) {
        message.append("Length of parsed input (").append(length)
            .append(") exceeds the maximum number of characters defined in your parser settings (")
            .append(settings.getMaxCharsPerColumn()).append("). ");
      }

      if (parsed.contains("\n") || parsed.contains("\r")) {
        final String visibleContent = displayLineSeparators(parsed, true);
        final String lineSeparator = displayLineSeparators(settings.getLineSeparatorString(), false);
        message.append("\nIdentified line separator characters in the parsed content. This may be the cause of the error. The line separator in your parser settings is set to '")
            .append(lineSeparator).append("'. Parsed content:\n\t").append(visibleContent);
      }

      int nullCharacterCount = 0;
      // ensuring the StringBuilder won't grow over Integer.MAX_VALUE to avoid OutOfMemoryError
      final int maxLength = length > Integer.MAX_VALUE / 2 ? Integer.MAX_VALUE / 2 - 1 : length;
      final StringBuilder escaped = new StringBuilder(maxLength);
      for (int i = 0; i < maxLength; i++) {
        if (chars[i] == '\0') {
          escaped.append('\\');
          escaped.append('0');
          nullCharacterCount++;
        } else {
          escaped.append(chars[i]);
        }
      }

      if (nullCharacterCount > 0) {
        // "\\0" so the message shows backslash-zero rather than embedding a raw NUL character
        message.append("\nIdentified ")
            .append(nullCharacterCount)
            .append(" null characters ('\\0') on parsed content. This may indicate the data is corrupt or its encoding is invalid. Parsed content:\n\t")
            .append(escaped);
      }
    }

    throw new TextParsingException(context, message.length() == 0 ? null : message.toString(), ex);
  }

  /**
   * Finish the processing of a batch, indicates to the output
   * interface to wrap up the batch
   */
  public void finishBatch(){
    output.finishBatch();
  }

  /**
   * Invoked once there are no more records and we are done with the
   * current record reader to clean up state. Closes the input.
   * @throws IOException
   */
  public void close() throws IOException{
    input.close();
  }

}