/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.csv;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.QuoteMode;
import org.apache.nifi.components.AllowableValue;
import org.apache.nifi.components.PropertyDescriptor;
import org.apache.nifi.components.PropertyValue;
import org.apache.nifi.controller.ConfigurationContext;
import org.apache.nifi.processor.util.StandardValidators;
/**
 * Property descriptors and helper methods shared by the CSV components in this package.
 * The descriptors describe how CSV data is laid out, and {@link #createCSVFormat(ConfigurationContext)}
 * turns a configured context into an Apache Commons CSV {@link CSVFormat}.
 */
public class CSVUtils {

    // ---- Allowable values for the "CSV Format" property ----

    static final AllowableValue CUSTOM = new AllowableValue("custom", "Custom Format",
        "The format of the CSV is configured by using the properties of this Controller Service, such as Value Separator");
    static final AllowableValue RFC_4180 = new AllowableValue("rfc-4180", "RFC 4180",
        "CSV data follows the RFC 4180 Specification defined at https://tools.ietf.org/html/rfc4180");
    static final AllowableValue EXCEL = new AllowableValue("excel", "Microsoft Excel",
        "CSV data follows the format used by Microsoft Excel");
    static final AllowableValue TDF = new AllowableValue("tdf", "Tab-Delimited",
        "CSV data is Tab-Delimited instead of Comma Delimited");
    static final AllowableValue INFORMIX_UNLOAD = new AllowableValue("informix-unload", "Informix Unload",
        "The format used by Informix when issuing the UNLOAD TO file_name command");
    // The value must differ from INFORMIX_UNLOAD's value; createCSVFormat selects the format by comparing
    // values, so a shared value would make this option impossible to select.
    static final AllowableValue INFORMIX_UNLOAD_CSV = new AllowableValue("informix-unload-csv", "Informix Unload Escape Disabled",
        "The format used by Informix when issuing the UNLOAD TO file_name command with escaping disabled");
    static final AllowableValue MYSQL = new AllowableValue("mysql", "MySQL Format",
        "CSV data follows the format used by MySQL");

    // ---- Properties describing the CSV layout ----

    static final PropertyDescriptor CSV_FORMAT = new PropertyDescriptor.Builder()
        .name("CSV Format")
        .description("Specifies which \"format\" the CSV data is in, or specifies if custom formatting should be used.")
        .expressionLanguageSupported(false)
        .allowableValues(CUSTOM, RFC_4180, EXCEL, TDF, MYSQL, INFORMIX_UNLOAD, INFORMIX_UNLOAD_CSV)
        .defaultValue(CUSTOM.getValue())
        .required(true)
        .build();

    static final PropertyDescriptor VALUE_SEPARATOR = new PropertyDescriptor.Builder()
        .name("Value Separator")
        .description("The character that is used to separate values/fields in a CSV Record")
        .addValidator(new SingleCharacterValidator())
        .expressionLanguageSupported(false)
        .defaultValue(",")
        .required(true)
        .build();

    static final PropertyDescriptor QUOTE_CHAR = new PropertyDescriptor.Builder()
        .name("Quote Character")
        .description("The character that is used to quote values so that escape characters do not have to be used")
        .addValidator(new SingleCharacterValidator())
        .expressionLanguageSupported(false)
        .defaultValue("\"")
        .required(true)
        .build();

    static final PropertyDescriptor SKIP_HEADER_LINE = new PropertyDescriptor.Builder()
        .name("Skip Header Line")
        .description("Specifies whether or not the first line of CSV should be considered a Header and skipped. If the Schema Access Strategy "
            + "indicates that the columns must be defined in the header, then this property will be ignored, since the header must always be "
            + "present and won't be processed as a Record. Otherwise, this property should be 'true' if the first non-comment line of CSV "
            + "contains header information that needs to be ignored.")
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(false)
        .allowableValues("true", "false")
        .defaultValue("false")
        .required(true)
        .build();

    static final PropertyDescriptor COMMENT_MARKER = new PropertyDescriptor.Builder()
        .name("Comment Marker")
        .description("The character that is used to denote the start of a comment. Any line that begins with this comment will be ignored.")
        .addValidator(new SingleCharacterValidator())
        .expressionLanguageSupported(false)
        .required(false)
        .build();

    static final PropertyDescriptor ESCAPE_CHAR = new PropertyDescriptor.Builder()
        .name("Escape Character")
        .description("The character that is used to escape characters that would otherwise have a specific meaning to the CSV Parser.")
        .addValidator(new SingleCharacterValidator())
        .expressionLanguageSupported(false)
        .defaultValue("\\")
        .required(true)
        .build();

    static final PropertyDescriptor NULL_STRING = new PropertyDescriptor.Builder()
        .name("Null String")
        .description("Specifies a String that, if present as a value in the CSV, should be considered a null field instead of using the literal value.")
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(false)
        .required(false)
        .build();

    static final PropertyDescriptor TRIM_FIELDS = new PropertyDescriptor.Builder()
        .name("Trim Fields")
        .description("Whether or not white space should be removed from the beginning and end of fields")
        .expressionLanguageSupported(false)
        .allowableValues("true", "false")
        .defaultValue("true")
        .required(true)
        .build();

    // ---- CSV Format fields for writers only ----

    // The values below are passed to QuoteMode.valueOf(...) in buildCustomFormat, so each one
    // has to match the name of a QuoteMode constant exactly.
    static final AllowableValue QUOTE_ALL = new AllowableValue("ALL", "Quote All Values",
        "All values will be quoted using the configured quote character.");
    static final AllowableValue QUOTE_MINIMAL = new AllowableValue("MINIMAL", "Quote Minimal",
        "Values will be quoted only if they are contain special characters such as newline characters or field separators.");
    static final AllowableValue QUOTE_NON_NUMERIC = new AllowableValue("NON_NUMERIC", "Quote Non-Numeric Values",
        "Values will be quoted unless the value is a number.");
    static final AllowableValue QUOTE_NONE = new AllowableValue("NONE", "Do Not Quote Values",
        "Values will not be quoted. Instead, all special characters will be escaped using the configured escape character.");

    static final PropertyDescriptor QUOTE_MODE = new PropertyDescriptor.Builder()
        .name("Quote Mode")
        .description("Specifies how fields should be quoted when they are written")
        .expressionLanguageSupported(false)
        .allowableValues(QUOTE_ALL, QUOTE_MINIMAL, QUOTE_NON_NUMERIC, QUOTE_NONE)
        .defaultValue(QUOTE_MINIMAL.getValue())
        .required(true)
        .build();

    static final PropertyDescriptor TRAILING_DELIMITER = new PropertyDescriptor.Builder()
        .name("Include Trailing Delimiter")
        .description("If true, a trailing delimiter will be added to each CSV Record that is written. If false, the trailing delimiter will be omitted.")
        .expressionLanguageSupported(false)
        .allowableValues("true", "false")
        .defaultValue("false")
        .required(true)
        .build();

    static final PropertyDescriptor RECORD_SEPARATOR = new PropertyDescriptor.Builder()
        .name("Record Separator")
        .description("Specifies the characters to use in order to separate CSV Records")
        .addValidator(StandardValidators.NON_EMPTY_VALIDATOR)
        .expressionLanguageSupported(false)
        // Stored as the two characters backslash + 'n'; unescape() converts it to a real newline when used.
        .defaultValue("\\n")
        .required(true)
        .build();

    static final PropertyDescriptor INCLUDE_HEADER_LINE = new PropertyDescriptor.Builder()
        .name("Include Header Line")
        .description("Specifies whether or not the CSV column names should be written out as the first line.")
        .allowableValues("true", "false")
        .defaultValue("true")
        .required(true)
        .build();

    /**
     * Creates the {@link CSVFormat} selected by the {@link #CSV_FORMAT} property.
     * <p>
     * When the property is {@link #CUSTOM}, the format is assembled from the individual properties of the
     * context. For every other recognized value the corresponding predefined {@link CSVFormat} constant is
     * returned as-is, so the remaining properties are not consulted. The comparison is case-insensitive.
     *
     * @param context the configuration to read the property values from
     * @return the matching format, or {@link CSVFormat#DEFAULT} if the configured value is not recognized
     */
    static CSVFormat createCSVFormat(final ConfigurationContext context) {
        final String formatName = context.getProperty(CSV_FORMAT).getValue();
        if (formatName.equalsIgnoreCase(CUSTOM.getValue())) {
            return buildCustomFormat(context);
        }

        if (formatName.equalsIgnoreCase(RFC_4180.getValue())) {
            return CSVFormat.RFC4180;
        } else if (formatName.equalsIgnoreCase(EXCEL.getValue())) {
            return CSVFormat.EXCEL;
        } else if (formatName.equalsIgnoreCase(TDF.getValue())) {
            return CSVFormat.TDF;
        } else if (formatName.equalsIgnoreCase(MYSQL.getValue())) {
            return CSVFormat.MYSQL;
        } else if (formatName.equalsIgnoreCase(INFORMIX_UNLOAD.getValue())) {
            return CSVFormat.INFORMIX_UNLOAD;
        } else if (formatName.equalsIgnoreCase(INFORMIX_UNLOAD_CSV.getValue())) {
            return CSVFormat.INFORMIX_UNLOAD_CSV;
        } else {
            return CSVFormat.DEFAULT;
        }
    }

    /**
     * Reads a property, unescapes it and returns its first character.
     * <p>
     * The value is expected to be non-empty; a missing or empty value makes this method fail with a
     * runtime exception. NOTE(review): the descriptors passed in here use {@code SingleCharacterValidator},
     * which presumably guarantees that - confirm against the validator.
     *
     * @param context  the configuration to read from
     * @param property the property whose value holds the character
     * @return the first character of the unescaped value
     */
    private static char getChar(final ConfigurationContext context, final PropertyDescriptor property) {
        return CSVUtils.unescape(context.getProperty(property).getValue()).charAt(0);
    }

    /**
     * Builds a {@link CSVFormat} from the individual properties of the given context.
     * <p>
     * Reader-related settings (separator, header handling, quote, escape, trim, comment marker, null
     * string) are applied first. The writer-only settings (quote mode, trailing delimiter, record
     * separator) are applied only when the context actually yields a value for them.
     *
     * @param context the configuration to read the property values from
     * @return the assembled format
     */
    private static CSVFormat buildCustomFormat(final ConfigurationContext context) {
        final char valueSeparator = getChar(context, VALUE_SEPARATOR);
        CSVFormat format = CSVFormat.newFormat(valueSeparator)
            .withAllowMissingColumnNames()
            .withIgnoreEmptyLines();

        final PropertyValue skipHeaderPropertyValue = context.getProperty(SKIP_HEADER_LINE);
        if (skipHeaderPropertyValue.getValue() != null && skipHeaderPropertyValue.asBoolean()) {
            format = format.withFirstRecordAsHeader();
        }

        format = format.withQuote(getChar(context, QUOTE_CHAR));
        format = format.withEscape(getChar(context, ESCAPE_CHAR));
        format = format.withTrim(context.getProperty(TRIM_FIELDS).asBoolean());

        if (context.getProperty(COMMENT_MARKER).isSet()) {
            format = format.withCommentMarker(getChar(context, COMMENT_MARKER));
        }
        if (context.getProperty(NULL_STRING).isSet()) {
            format = format.withNullString(CSVUtils.unescape(context.getProperty(NULL_STRING).getValue()));
        }

        // For the writer-only properties, check the underlying value as well as the wrapper: a wrapper
        // holding no value would otherwise fail in QuoteMode.valueOf(null) or when unboxing asBoolean().
        final PropertyValue quoteValue = context.getProperty(QUOTE_MODE);
        if (quoteValue != null && quoteValue.getValue() != null) {
            final QuoteMode quoteMode = QuoteMode.valueOf(quoteValue.getValue());
            format = format.withQuoteMode(quoteMode);
        }

        final PropertyValue trailingDelimiterValue = context.getProperty(TRAILING_DELIMITER);
        if (trailingDelimiterValue != null && trailingDelimiterValue.getValue() != null) {
            final boolean trailingDelimiter = trailingDelimiterValue.asBoolean();
            format = format.withTrailingDelimiter(trailingDelimiter);
        }

        final PropertyValue recordSeparator = context.getProperty(RECORD_SEPARATOR);
        if (recordSeparator != null && recordSeparator.getValue() != null) {
            final String separator = unescape(recordSeparator.getValue());
            format = format.withRecordSeparator(separator);
        }

        return format;
    }

    /**
     * Converts the textual escape sequences {@code \t}, {@code \n} and {@code \r} (a backslash followed
     * by the letter) into the tab, line-feed and carriage-return characters they stand for. All other
     * text, including other backslash sequences, is left untouched.
     *
     * @param input the text to convert; may be {@code null}
     * @return the converted text, or {@code null} if {@code input} is {@code null}
     */
    public static String unescape(final String input) {
        if (input == null) {
            return input;
        }

        return input.replace("\\t", "\t")
            .replace("\\n", "\n")
            .replace("\\r", "\r");
    }
}