VoltSuperCSVFormatter.java example

Explorer
voltdb-master
/* This file is part of VoltDB.
 * Copyright (C) 2008-2017 VoltDB Inc.
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with VoltDB.  If not, see <http://www.gnu.org/licenses/>.
 */

package org.voltdb.importer.formatter.builtin;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Properties;

import org.supercsv.exception.SuperCsvException;
import org.supercsv.io.CsvListReader;
import org.supercsv.prefs.CsvPreference;
import org.supercsv_voltpatches.tokenizer.Tokenizer;
import org.voltdb.importer.formatter.FormatException;
import org.voltdb.importer.formatter.Formatter;

public class VoltSuperCSVFormatter implements Formatter {

    /** String that can be used to indicate NULL value in CSV files */
    public static final String CSV_NULL = "\\N";

    /** String that can be used to indicate NULL value in CSV files */
    public static final String QUOTED_CSV_NULL = "\"\\N\"";

    public static final char DEFAULT_QUOTE_CHAR = '"';

    public static final char DEFAULT_ESCAPE_CHAR = '\\';

    /**
     * Size limit for each column.
     */
    public static final long DEFAULT_COLUMN_LIMIT_SIZE = 16777216;

    private String m_blank;
    private String m_customNullString;
    private boolean m_nowhitespace;
    private boolean m_surroundingSpacesNeedQuotes;
    private char m_separator;
    private char m_escape;
    private boolean m_strictquotes;
    private VoltCVSTokenizer m_tokenizer;
    CsvListReader m_csvReader;

    public VoltSuperCSVFormatter(String formatName, Properties prop) {

        if (!("csv".equalsIgnoreCase(formatName) || "tsv".equalsIgnoreCase(formatName))) {
            throw new IllegalArgumentException(
                    "Invalid format " + formatName + ", expected \"csv\" or \"tsv\".");
        }

        m_separator = "csv".equalsIgnoreCase(formatName) ? ',' : '\t';

        String separatorProp = prop.getProperty("separator", "").trim();
        if (!separatorProp.isEmpty() && separatorProp.length() == 1) {
            m_separator = separatorProp.charAt(0);
        }

        char quotechar = DEFAULT_QUOTE_CHAR;
        String quoteCharProp = prop.getProperty("quotechar", "").trim();
        if (!quoteCharProp.isEmpty() && quoteCharProp.length() == 1) {
            quotechar = quoteCharProp.charAt(0);
        }

        m_escape = DEFAULT_ESCAPE_CHAR;
        String escapeProp = prop.getProperty("escape", "").trim();
        if (!escapeProp.isEmpty() && escapeProp.length() == 1) {
            m_escape = escapeProp.charAt(0);
        }

        m_strictquotes = "true".equalsIgnoreCase(prop.getProperty("strictquotes", ""));
        m_surroundingSpacesNeedQuotes = "true".equalsIgnoreCase(prop.getProperty("trimunquoted", ""));
        m_blank = prop.getProperty("blank", "").trim();

        m_customNullString = prop.getProperty("nullstring", "").trim();
        if (!m_customNullString.isEmpty() && !"error".equals(m_blank)) {
            m_blank = "empty";
        }

        m_nowhitespace = "true".equalsIgnoreCase(prop.getProperty("nowhitespace", ""));

        CsvPreference.Builder builder = new CsvPreference.Builder(quotechar, m_separator, "\n");
        if (m_surroundingSpacesNeedQuotes) {
            builder.surroundingSpacesNeedQuotes(true);
        }
        CsvPreference csvPreference = builder.build();

        m_tokenizer = new VoltCVSTokenizer(new StringReader(""), csvPreference, m_strictquotes, m_escape,
                DEFAULT_COLUMN_LIMIT_SIZE, 0);

        m_csvReader = new CsvListReader(m_tokenizer, csvPreference);
    }

    @Override
    public Object[] transform(ByteBuffer payload) throws FormatException {
        if (payload == null) {
            return null;
        }
        String line = new String(payload.array(), payload.arrayOffset(), payload.limit(), StandardCharsets.UTF_8);
        m_tokenizer.setSourceString(line);
        List<String> dataList;
        try {
            dataList = m_csvReader.read();
        } catch (IOException | SuperCsvException e) {
            throw new FormatException("Failed to parse csv data", e);
        }
        if (dataList == null) return null;
        String[] data = dataList.toArray(new String[0]);
        normalize(data);
        return data;
    }

    private void normalize(String[] lineValues) throws FormatException {

        for (int i = 0; i < lineValues.length; i++) {

            if (lineValues[i] == null) {
                if ("error".equals(m_blank)) {
                    throw new FormatException("Blank values are not allowed");
                }
            } else {
                if (m_nowhitespace && (lineValues[i].charAt(0) == ' '
                        || lineValues[i].charAt(lineValues[i].length() - 1) == ' ')) {
                    throw new FormatException("Whitespace detectet when nowhitespace is used");
                } else if (m_surroundingSpacesNeedQuotes) {
                    lineValues[i] = lineValues[i].trim();
                }

                if (!m_customNullString.isEmpty()) {
                    if (m_customNullString.equals(lineValues[i])) {
                        lineValues[i] = null;
                    }
                } else if ("NULL".equals(lineValues[i]) || CSV_NULL.equalsIgnoreCase(lineValues[i])
                        || QUOTED_CSV_NULL.equals(lineValues[i])) {
                    lineValues[i] = null;
                }
            }
        }
    }

    /**
     * Importers transform the source one row at time. VoltCVSTokenizer will cut significant amount time on processing data
     * via reader and cell processor and improve the performance.
     *
     */
    private class VoltCVSTokenizer extends Tokenizer {

        private String m_sourceString;

        public VoltCVSTokenizer(Reader reader, CsvPreference preferences, boolean strictquotes, char escapechar,
                long columnsizelimit, long skipNum) {
            super(reader, preferences, strictquotes, escapechar, columnsizelimit, skipNum);
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public String readLine() throws IOException {

            String tempStr = m_sourceString;

            //set to null to mark EOF
            m_sourceString = null;

            return tempStr;
        }

        /**
         * {@inheritDoc}
         */
        @Override
        public int getLineNumber() {
            return 1;
        }

        public void setSourceString(String sourceString) {
            m_sourceString = sourceString;
        }
    }
}