/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.easy.text.compliant;
import io.netty.buffer.DrillBuf;
import java.io.IOException;
import org.apache.drill.common.exceptions.UserException;
import com.univocity.parsers.common.TextParsingException;
import com.univocity.parsers.csv.CsvParserSettings;
/*******************************************************************************
* Portions Copyright 2014 uniVocity Software Pty Ltd
******************************************************************************/
/**
* A byte-based Text parser implementation. Builds heavily upon the uniVocity parsers. Customized for UTF8 parsing and
* DrillBuf support.
*/
final class TextReader {
  static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(TextReader.class);

  /** Placeholder used as the "previous byte" when there is none (or when it has been consumed as an escape). */
  private static final byte NULL_BYTE = (byte) '\0';

  private final TextParsingContext context;

  /** Maximum number of records to read; {@link Long#MAX_VALUE} when the settings report -1. */
  private final long recordsToRead;
  private final TextParsingSettings settings;
  private final TextInput input;
  private final TextOutput output;

  /** Scratch buffer holding whitespace bytes found after a closing quote. */
  private final DrillBuf workBuf;

  /** The byte most recently read from {@link #input}. */
  private byte ch;

  // index of the field within this record
  private int fieldIndex;

  /** Behavior settings **/
  private final boolean ignoreTrailingWhitespace;
  private final boolean ignoreLeadingWhitespace;
  private final boolean parseUnescapedQuotes;

  /** Key Characters **/
  private final byte comment;
  private final byte delimiter;
  private final byte quote;
  private final byte quoteEscape;
  private final byte newLine;

  /**
   * Creates a reader whose behavior is taken from the supplied settings. The settings are read once, here,
   * and cached in final fields.
   *
   * @param settings the parser configuration
   * @param input input stream
   * @param output interface to produce output record batch
   * @param workBuf working buffer to handle whitespaces
   */
  public TextReader(TextParsingSettings settings, TextInput input, TextOutput output, DrillBuf workBuf) {
    this.context = new TextParsingContext(input, output);
    this.workBuf = workBuf;
    this.settings = settings;
    // -1 means "no limit"
    this.recordsToRead = settings.getNumberOfRecordsToRead() == -1 ? Long.MAX_VALUE : settings.getNumberOfRecordsToRead();
    this.ignoreTrailingWhitespace = settings.isIgnoreTrailingWhitespaces();
    this.ignoreLeadingWhitespace = settings.isIgnoreLeadingWhitespaces();
    this.parseUnescapedQuotes = settings.isParseUnescapedQuotes();
    this.delimiter = settings.getDelimiter();
    this.quote = settings.getQuote();
    this.quoteEscape = settings.getQuoteEscape();
    this.newLine = settings.getNormalizedNewLine();
    this.comment = settings.getComment();
    this.input = input;
    this.output = output;
  }

  /**
   * @return the output interface this reader writes records to
   */
  public TextOutput getOutput() {
    return output;
  }

  /**
   * Checks if the given byte is a white space. As per the univocity text reader any ASCII value &lt;= ' ' is
   * considered a white space. Since byte in Java is signed, negative values (the bytes of multi-byte UTF-8
   * sequences) are explicitly excluded.
   *
   * @param b byte to test
   * @return true if {@code b} is in the range 0..' ' inclusive
   */
  static final boolean isWhite(byte b) {
    return b <= ' ' && b > -1;
  }

  /**
   * Converts a byte to the character with the same unsigned value so that it is rendered as a character,
   * not as a number, when concatenated into an error message.
   */
  private static char asChar(byte b) {
    return (char) (b & 0xFF);
  }

  /**
   * Informs the output interface that we are starting a new record batch.
   */
  public void resetForNextBatch() {
    output.startBatch();
  }

  /**
   * @return the current position as reported by the input
   */
  public long getPos() {
    return input.getPos();
  }

  /**
   * Parses an entire record, delegating the parsing of each field to {@link #parseField()}.
   * The start of the record is marked on the input before parsing; {@link #handleException(Exception)}
   * later asks the input for the text since that mark when building an error message.
   *
   * If the stream finishes in the middle of a record, the partial record is still finished and emitted;
   * the end-of-stream exception is only propagated when nothing at all was written for this record.
   *
   * @return always true when the method returns normally
   * @throws IOException
   */
  private boolean parseRecord() throws IOException {
    final byte newLine = this.newLine;
    final TextInput input = this.input;

    input.mark();
    fieldIndex = 0;
    if (isWhite(ch) && ignoreLeadingWhitespace) {
      skipWhitespace();
    }

    int fieldsWritten = 0;
    try {
      boolean earlyTerm = false;
      while (ch != newLine) {
        // parseField() returns false when the output does not want any more fields from this record
        earlyTerm = !parseField();
        fieldsWritten++;
        if (ch != newLine) {
          // ch is the delimiter: step over it
          ch = input.nextChar();
          if (ch == newLine) {
            // a delimiter immediately followed by a newline means a trailing empty field
            output.startField(fieldsWritten++);
            output.endEmptyField();
            break;
          }
        }
        if (earlyTerm) {
          if (ch != newLine) {
            // discard the remainder of the current line
            input.skipLines(1);
          }
          break;
        }
      }
    } catch (StreamFinishedPseudoException e) {
      // if we've written part of a field or all of a field, we should send this row.
      if (fieldsWritten == 0 && !output.rowHasData()) {
        throw e;
      }
    }

    output.finishRecord();
    return true;
  }

  /**
   * Parses an unquoted field up to the next delimiter or newline, handing every byte to
   * {@link TextOutput#appendIgnoringWhitespace(byte)}.
   *
   * @throws IOException
   */
  private void parseValueIgnore() throws IOException {
    final byte newLine = this.newLine;
    final byte delimiter = this.delimiter;
    final TextOutput output = this.output;
    final TextInput input = this.input;

    // work on a local copy of the current byte and write it back once at the end
    byte ch = this.ch;
    while (ch != delimiter && ch != newLine) {
      output.appendIgnoringWhitespace(ch);
      ch = input.nextChar();
    }
    this.ch = ch;
  }

  /**
   * Parses an unquoted field and appends all bytes up to the delimiter (or newline) to the output,
   * including white spaces.
   *
   * @throws IOException
   */
  private void parseValueAll() throws IOException {
    final byte newLine = this.newLine;
    final byte delimiter = this.delimiter;
    final TextOutput output = this.output;
    final TextInput input = this.input;

    // work on a local copy of the current byte and write it back once at the end
    byte ch = this.ch;
    while (ch != delimiter && ch != newLine) {
      output.append(ch);
      ch = input.nextChar();
    }
    this.ch = ch;
  }

  /**
   * Delegates the parsing of a single unquoted field to the implementation selected by the
   * {@code ignoreTrailingWhitespace} setting.
   *
   * @throws IOException
   */
  private void parseValue() throws IOException {
    if (ignoreTrailingWhitespace) {
      parseValueIgnore();
    } else {
      parseValueAll();
    }
  }

  /**
   * Recursive function invoked when a quote is encountered. Function also handles the case when there are
   * non-white space characters in the field after the quoted value.
   *
   * @param prev previous byte read
   * @throws IOException
   * @throws TextParsingException if an unescaped quote is found and {@code parseUnescapedQuotes} is off, or
   *         if something other than a delimiter or newline follows the quoted value
   */
  private void parseQuotedValue(byte prev) throws IOException {
    final byte newLine = this.newLine;
    final byte delimiter = this.delimiter;
    final TextOutput output = this.output;
    final TextInput input = this.input;
    final byte quote = this.quote;

    ch = input.nextCharNoNewLineCheck();

    // the value ends at a quote that is followed by a delimiter, a newline or whitespace
    while (!(prev == quote && (ch == delimiter || ch == newLine || isWhite(ch)))) {
      if (ch != quote) {
        if (prev == quote) { // unescaped quote detected
          if (parseUnescapedQuotes) {
            output.append(quote);
            output.append(ch);
            parseQuotedValue(ch);
            break;
          } else {
            throw new TextParsingException(
                context,
                "Unescaped quote character '"
                    + asChar(quote)
                    + "' inside quoted value of CSV field. To allow unescaped quotes, set 'parseUnescapedQuotes' to 'true' in the CSV parser settings. Cannot parse CSV input.");
          }
        }
        output.append(ch);
        prev = ch;
      } else if (prev == quoteEscape) {
        // escaped quote: emit one quote and forget the escape so it cannot escape the next byte too
        output.append(quote);
        prev = NULL_BYTE;
      } else {
        prev = ch;
      }
      ch = input.nextCharNoNewLineCheck();
    }

    // Handles whitespaces after quoted value:
    // Whitespaces are ignored if they are not used as delimiters.
    // For example, in tab-separated files (TSV files), '\t' is used as delimiter and should not be ignored
    // Content after whitespaces may be parsed if 'parseUnescapedQuotes' is enabled.
    // isWhite() is used rather than (ch <= ' ') because bytes are signed: the bytes of a multi-byte UTF-8
    // sequence are negative and must not be mistaken for whitespace.
    if (ch != newLine && isWhite(ch) && ch != delimiter) {
      final DrillBuf workBuf = this.workBuf;
      workBuf.resetWriterIndex();
      do {
        // saves whitespaces after value
        workBuf.writeByte(ch);
        ch = input.nextChar();
        // found a new line, go to next record.
        if (ch == newLine) {
          return;
        }
      } while (isWhite(ch) && ch != delimiter);

      // there's more stuff after the quoted value, not only empty spaces.
      if (!(ch == delimiter || ch == newLine) && parseUnescapedQuotes) {
        output.append(quote);
        for (int i = 0; i < workBuf.writerIndex(); i++) {
          output.append(workBuf.getByte(i));
        }
        // the next character is not the escape character, put it there
        if (ch != quoteEscape) {
          output.append(ch);
        }
        // sets this character as the previous character (may be escaping)
        // calls recursively to keep parsing potentially quoted content
        parseQuotedValue(ch);
      }
    }

    if (!(ch == delimiter || ch == newLine)) {
      throw new TextParsingException(context, "Unexpected character '" + asChar(ch)
          + "' following quoted value of CSV field. Expecting '" + asChar(delimiter) + "'. Cannot parse CSV input.");
    }
  }

  /**
   * Captures the entirety of parsing a single field and, based on the current byte, delegates to the
   * appropriate function: empty field, quoted value or plain value.
   *
   * @return the value returned by the output's {@code endEmptyField()} / {@code endField()} call
   * @throws IOException
   */
  private final boolean parseField() throws IOException {
    output.startField(fieldIndex++);

    if (isWhite(ch) && ignoreLeadingWhitespace) {
      skipWhitespace();
    }

    if (ch == delimiter) {
      return output.endEmptyField();
    }

    if (ch == quote) {
      parseQuotedValue(NULL_BYTE);
    } else {
      parseValue();
    }
    return output.endField();
  }

  /**
   * Helper function to skip white spaces occurring at the current input stream. Stops at the delimiter or
   * newline even if that byte is itself a white space.
   *
   * @throws IOException
   */
  private void skipWhitespace() throws IOException {
    final byte delimiter = this.delimiter;
    final byte newLine = this.newLine;
    final TextInput input = this.input;

    while (isWhite(ch) && ch != delimiter && ch != newLine) {
      ch = input.nextChar();
    }
  }

  /**
   * Starting point for the reader. Clears the context's stopped flag and starts the input interface.
   *
   * @throws IOException
   */
  public final void start() throws IOException {
    context.stopped = false;
    input.start();
  }

  /**
   * Parses the next record from the input. Will skip the line if its a comment,
   * this is required when the file contains headers.
   *
   * @return true if a record was parsed; false if the end of the stream was reached first
   * @throws IOException
   * @throws TextParsingException if the record spans more than one line, or if any other failure occurs
   *         while parsing (the original exception is attached as the cause)
   */
  public final boolean parseNext() throws IOException {
    try {
      while (!context.stopped) {
        ch = input.nextChar();
        if (ch == comment) {
          input.skipLines(1);
          continue;
        }
        break;
      }

      final long initialLineNumber = input.lineCount();
      boolean success = parseRecord();
      // a single record must not advance the input by more than one line
      if (initialLineNumber + 1 < input.lineCount()) {
        throw new TextParsingException(context, "Cannot use newline character within quoted string");
      }

      if (success) {
        if (recordsToRead > 0 && context.currentRecord() >= recordsToRead) {
          context.stop();
        }
        return true;
      } else {
        return false;
      }
    } catch (StreamFinishedPseudoException ex) {
      stopParsing();
      return false;
    } catch (Exception ex) {
      try {
        throw handleException(ex);
      } finally {
        stopParsing();
      }
    }
  }

  /**
   * Hook invoked when parsing ends, either at end of stream or on error. Currently does nothing.
   */
  private void stopParsing() {
  }

  /**
   * Makes line separator characters visible in a string destined for an error message.
   *
   * @param str text to transform
   * @param addNewLine if true, each separator is replaced by its bracketed escaped form followed by the
   *        real separator and a tab; if false, separators are simply replaced by their escaped form
   * @return the transformed text
   */
  private static String displayLineSeparators(String str, boolean addNewLine) {
    if (addNewLine) {
      if (str.contains("\r\n")) {
        str = str.replaceAll("\\r\\n", "[\\\\r\\\\n]\r\n\t");
      } else if (str.contains("\n")) {
        str = str.replaceAll("\\n", "[\\\\n]\n\t");
      } else {
        str = str.replaceAll("\\r", "[\\\\r]\r\t");
      }
    } else {
      str = str.replaceAll("\\n", "\\\\n");
      str = str.replaceAll("\\r", "\\\\r");
    }
    return str;
  }

  /**
   * Helper method to handle exceptions caught while processing text files and generate better error messages
   * associated with the exception. The method never returns normally; the return type only exists so that
   * callers can write {@code throw handleException(ex)}.
   *
   * @param ex Exception raised
   * @return never returns
   * @throws TextParsingException always: either {@code ex} itself if it already is one, or a new exception
   *         wrapping {@code ex} together with whatever diagnostics could be derived from the input
   * @throws IOException
   */
  private TextParsingException handleException(Exception ex) throws IOException {
    if (ex instanceof TextParsingException) {
      throw (TextParsingException) ex;
    }

    if (ex instanceof ArrayIndexOutOfBoundsException) {
      ex = UserException
          .dataReadError(ex)
          .message(
              "Drill failed to read your text file.  Drill supports up to %d columns in a text file.  Your file appears to have more than that.",
              RepeatedVarCharOutput.MAXIMUM_NUMBER_COLUMNS)
          .build(logger);
    }

    // Accumulate the diagnostics in a builder; starting from a null String and using += would prefix the
    // message with the literal text "null".
    final StringBuilder message = new StringBuilder();
    String tmp = input.getStringSinceMarkForError();

    // guard so that a missing snippet cannot replace the real failure with a NullPointerException
    if (tmp != null) {
      final char[] chars = tmp.toCharArray();
      final int length = chars.length;

      if (length > settings.getMaxCharsPerColumn()) {
        message.append("Length of parsed input (").append(length)
            .append(") exceeds the maximum number of characters defined in your parser settings (")
            .append(settings.getMaxCharsPerColumn()).append("). ");
      }

      if (tmp.contains("\n") || tmp.contains("\r")) {
        tmp = displayLineSeparators(tmp, true);
        String lineSeparator = displayLineSeparators(settings.getLineSeparatorString(), false);
        message.append("\nIdentified line separator characters in the parsed content. This may be the cause of the error. The line separator in your parser settings is set to '")
            .append(lineSeparator).append("'. Parsed content:\n\t").append(tmp);
      }

      int nullCharacterCount = 0;
      // ensuring the StringBuilder won't grow over Integer.MAX_VALUE to avoid OutOfMemoryError
      int maxLength = length > Integer.MAX_VALUE / 2 ? Integer.MAX_VALUE / 2 - 1 : length;
      StringBuilder s = new StringBuilder(maxLength);
      for (int i = 0; i < maxLength; i++) {
        if (chars[i] == '\0') {
          // make NUL characters visible
          s.append('\\');
          s.append('0');
          nullCharacterCount++;
        } else {
          s.append(chars[i]);
        }
      }
      tmp = s.toString();

      if (nullCharacterCount > 0) {
        // "\\0" so the message shows the two characters \0 rather than embedding a real NUL
        message.append("\nIdentified ")
            .append(nullCharacterCount)
            .append(" null characters ('\\0') on parsed content. This may indicate the data is corrupt or its encoding is invalid. Parsed content:\n\t")
            .append(tmp);
      }
    }

    // when nothing specific was identified the message stays null
    throw new TextParsingException(context, message.length() == 0 ? null : message.toString(), ex);
  }

  /**
   * Finish the processing of a batch, indicates to the output
   * interface to wrap up the batch
   */
  public void finishBatch() {
    output.finishBatch();
  }

  /**
   * Invoked once there are no more records and we are done with the
   * current record reader to clean up state. Closes the input.
   *
   * @throws IOException
   */
  public void close() throws IOException {
    input.close();
  }
}