/* This file is part of VoltDB.
* Copyright (C) 2008-2017 VoltDB Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with VoltDB. If not, see <http://www.gnu.org/licenses/>.
*/
package org.voltdb.importer.formatter.builtin;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Properties;
import org.supercsv.exception.SuperCsvException;
import org.supercsv.io.CsvListReader;
import org.supercsv.prefs.CsvPreference;
import org.supercsv_voltpatches.tokenizer.Tokenizer;
import org.voltdb.importer.formatter.FormatException;
import org.voltdb.importer.formatter.Formatter;
/**
 * Formatter that parses a single CSV or TSV record, delivered as a {@link ByteBuffer} of UTF-8 bytes,
 * into an array of column values.
 *
 * <p>Recognized properties (all optional): {@code separator}, {@code quotechar}, {@code escape}
 * (each honored only when the trimmed value is exactly one character), {@code strictquotes},
 * {@code trimunquoted}, {@code nowhitespace} (enabled by the value {@code "true"}, case-insensitive),
 * {@code blank} and {@code nullstring}.
 *
 * <p>An instance owns one reusable tokenizer whose source line is replaced on every
 * {@link #transform(ByteBuffer)} call; nothing here synchronizes that state, so a single instance
 * must not be used by several threads at once.
 */
public class VoltSuperCSVFormatter implements Formatter {
    /** String that can be used to indicate NULL value in CSV files */
    public static final String CSV_NULL = "\\N";
    /** Quoted form of {@link #CSV_NULL} that is also treated as a NULL value */
    public static final String QUOTED_CSV_NULL = "\"\\N\"";
    /** Quote character used when the {@code quotechar} property is absent or not a single character */
    public static final char DEFAULT_QUOTE_CHAR = '"';
    /** Escape character used when the {@code escape} property is absent or not a single character */
    public static final char DEFAULT_ESCAPE_CHAR = '\\';
    /**
     * Size limit for each column, handed to the tokenizer.
     */
    public static final long DEFAULT_COLUMN_LIMIT_SIZE = 16777216;

    private final String m_blank;
    private final String m_customNullString;
    private final boolean m_nowhitespace;
    private final boolean m_surroundingSpacesNeedQuotes;
    private final char m_separator;
    private final char m_escape;
    private final boolean m_strictquotes;
    private final VoltCVSTokenizer m_tokenizer;
    CsvListReader m_csvReader;

    /**
     * Builds a formatter for the given format and configuration.
     *
     * @param formatName {@code "csv"} or {@code "tsv"} (case-insensitive); selects the default
     *                   separator (comma or tab)
     * @param prop       formatter properties; {@code null} is treated as no properties
     * @throws IllegalArgumentException if {@code formatName} is neither csv nor tsv
     */
    public VoltSuperCSVFormatter(String formatName, Properties prop) {
        final boolean isCsv = "csv".equalsIgnoreCase(formatName);
        if (!isCsv && !"tsv".equalsIgnoreCase(formatName)) {
            throw new IllegalArgumentException(
                    "Invalid format " + formatName + ", expected \"csv\" or \"tsv\".");
        }
        final Properties props = (prop != null) ? prop : new Properties();

        m_separator = singleCharProperty(props, "separator", isCsv ? ',' : '\t');
        final char quotechar = singleCharProperty(props, "quotechar", DEFAULT_QUOTE_CHAR);
        m_escape = singleCharProperty(props, "escape", DEFAULT_ESCAPE_CHAR);
        m_strictquotes = booleanProperty(props, "strictquotes");
        m_surroundingSpacesNeedQuotes = booleanProperty(props, "trimunquoted");

        String blank = props.getProperty("blank", "").trim();
        m_customNullString = props.getProperty("nullstring", "").trim();
        // A custom null string overrides the blank policy unless blanks are configured as errors.
        if (!m_customNullString.isEmpty() && !"error".equals(blank)) {
            blank = "empty";
        }
        m_blank = blank;
        m_nowhitespace = booleanProperty(props, "nowhitespace");

        CsvPreference.Builder builder = new CsvPreference.Builder(quotechar, m_separator, "\n");
        if (m_surroundingSpacesNeedQuotes) {
            builder.surroundingSpacesNeedQuotes(true);
        }
        CsvPreference csvPreference = builder.build();
        // The reader passed to the tokenizer is a placeholder: lines are supplied through
        // setSourceString() and served by the overridden readLine().
        m_tokenizer = new VoltCVSTokenizer(new StringReader(""), csvPreference, m_strictquotes, m_escape,
                DEFAULT_COLUMN_LIMIT_SIZE, 0);
        m_csvReader = new CsvListReader(m_tokenizer, csvPreference);
    }

    /**
     * Returns the single character configured under {@code key}, or {@code fallback} when the
     * trimmed property value is missing or is not exactly one character long.
     */
    private static char singleCharProperty(Properties props, String key, char fallback) {
        String value = props.getProperty(key, "").trim();
        return (value.length() == 1) ? value.charAt(0) : fallback;
    }

    /** Returns true only when the property value equals {@code "true"}, ignoring case. */
    private static boolean booleanProperty(Properties props, String key) {
        return "true".equalsIgnoreCase(props.getProperty(key, ""));
    }

    /**
     * Parses one record into its column values.
     *
     * <p>The bytes between the buffer's position and limit are decoded as UTF-8. The buffer's own
     * position and limit are not modified, and buffers without an accessible backing array
     * (direct or read-only) are supported.
     *
     * @param payload the record bytes; may be {@code null}
     * @return a {@code String[]} of column values in which NULL markers and blank columns are
     *         {@code null}; or {@code null} when {@code payload} is {@code null} or the reader
     *         yields no row
     * @throws FormatException if the line cannot be parsed or violates the configured
     *                         blank/whitespace rules
     */
    @Override
    public Object[] transform(ByteBuffer payload) throws FormatException {
        if (payload == null) {
            return null;
        }
        m_tokenizer.setSourceString(decodeUtf8(payload));
        List<String> dataList;
        try {
            dataList = m_csvReader.read();
        } catch (IOException | SuperCsvException e) {
            throw new FormatException("Failed to parse csv data", e);
        }
        if (dataList == null) {
            return null;
        }
        String[] data = dataList.toArray(new String[0]);
        normalize(data);
        return data;
    }

    /**
     * Decodes the readable region (position to limit) of {@code payload} as UTF-8 without
     * changing the buffer's state.
     */
    private static String decodeUtf8(ByteBuffer payload) {
        final int length = payload.remaining();
        if (payload.hasArray()) {
            // arrayOffset() maps buffer index 0 into the backing array; position() must be added
            // so that bytes before the current position are not decoded.
            return new String(payload.array(), payload.arrayOffset() + payload.position(), length,
                    StandardCharsets.UTF_8);
        }
        // No accessible backing array (direct or read-only buffer): copy through a duplicate so
        // the caller's position is left untouched.
        byte[] bytes = new byte[length];
        payload.duplicate().get(bytes);
        return new String(bytes, StandardCharsets.UTF_8);
    }

    /**
     * Applies the blank, whitespace and NULL-marker rules to the parsed values in place.
     *
     * @param lineValues parsed column values; entries may be {@code null}
     * @throws FormatException if a blank value is found while {@code blank} is {@code "error"}, or
     *                         a value starts or ends with a space while {@code nowhitespace} is set
     */
    private void normalize(String[] lineValues) throws FormatException {
        for (int i = 0; i < lineValues.length; i++) {
            String value = lineValues[i];
            if (value == null) {
                if ("error".equals(m_blank)) {
                    throw new FormatException("Blank values are not allowed");
                }
                continue;
            }
            if (m_nowhitespace && hasLeadingOrTrailingSpace(value)) {
                throw new FormatException("Whitespace detectet when nowhitespace is used");
            }
            if (m_surroundingSpacesNeedQuotes) {
                value = value.trim();
            }
            lineValues[i] = isNullMarker(value) ? null : value;
        }
    }

    /**
     * Tells whether {@code value} starts or ends with a space character. An empty string has no
     * such character, so it is reported as clean instead of failing on an index lookup.
     */
    private static boolean hasLeadingOrTrailingSpace(String value) {
        return !value.isEmpty()
                && (value.charAt(0) == ' ' || value.charAt(value.length() - 1) == ' ');
    }

    /**
     * Tells whether {@code value} denotes NULL: the custom null string when one is configured,
     * otherwise {@code NULL}, {@link #CSV_NULL} (case-insensitive) or {@link #QUOTED_CSV_NULL}.
     */
    private boolean isNullMarker(String value) {
        if (!m_customNullString.isEmpty()) {
            return m_customNullString.equals(value);
        }
        return "NULL".equals(value) || CSV_NULL.equalsIgnoreCase(value) || QUOTED_CSV_NULL.equals(value);
    }

    /**
     * Tokenizer that serves a single in-memory line instead of reading from its {@link Reader}.
     * Importers transform the source one row at a time; per the original author's note, this is
     * meant to cut the time spent going through a reader and cell processors.
     */
    private static final class VoltCVSTokenizer extends Tokenizer {
        /** The pending line, or {@code null} once it has been handed out. */
        private String m_sourceString;

        public VoltCVSTokenizer(Reader reader, CsvPreference preferences, boolean strictquotes, char escapechar,
                long columnsizelimit, long skipNum) {
            super(reader, preferences, strictquotes, escapechar, columnsizelimit, skipNum);
        }

        /**
         * Returns the pending source line once; every later call returns {@code null} (EOF)
         * until a new line is set.
         */
        @Override
        public String readLine() throws IOException {
            String pending = m_sourceString;
            // set to null to mark EOF
            m_sourceString = null;
            return pending;
        }

        /**
         * Always reports line 1, since each source string is treated as a one-line input.
         */
        @Override
        public int getLineNumber() {
            return 1;
        }

        /** Sets the line that the next {@link #readLine()} call will return. */
        public void setSourceString(String sourceString) {
            m_sourceString = sourceString;
        }
    }
}