// CsvParserFunction.java example
//
// Explorer
// DataCleaner-master
/**
 * DataCleaner (community edition)
 * Copyright (C) 2014 Neopost - Customer Information Management
 *
 * This copyrighted material is made available to anyone wishing to use, modify,
 * copy, or redistribute it subject to the terms and conditions of the GNU
 * Lesser General Public License, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 * for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this distribution; if not, write to:
 * Free Software Foundation, Inc.
 * 51 Franklin Street, Fifth Floor
 * Boston, MA  02110-1301  USA
 */
package org.datacleaner.spark.functions;

import org.apache.metamodel.csv.CsvConfiguration;
import org.apache.spark.api.java.function.Function;

import au.com.bytecode.opencsv.CSVParser;

/**
 * Spark {@link Function} that parses one line of CSV text into an array of
 * values, using the separator, quote and escape characters of the
 * {@link CsvConfiguration} supplied at construction time.
 *
 * <p>
 * Only single-line, UTF-8 encoded CSV is accepted; both restrictions are
 * checked in the constructor so that an unsupported configuration is rejected
 * before any line is parsed.
 * </p>
 */
public final class CsvParserFunction implements Function<String, Object[]> {

    private static final long serialVersionUID = 1L;

    private final CsvConfiguration _csvConfiguration;

    /**
     * Creates a parser function for the given CSV configuration.
     *
     * @param csvConfiguration
     *            the CSV format description; must not be null
     * @throws NullPointerException
     *             if {@code csvConfiguration} is null
     * @throws IllegalStateException
     *             if the configuration allows multiline values, or if its
     *             encoding is anything other than "UTF-8" / "UTF8" (compared
     *             case-insensitively; a null encoding is also rejected)
     */
    public CsvParserFunction(final CsvConfiguration csvConfiguration) {
        if (csvConfiguration == null) {
            throw new NullPointerException("csvConfiguration must not be null");
        }
        if (csvConfiguration.isMultilineValues()) {
            throw new IllegalStateException("Multiline CSV files are not supported");
        }

        final String encoding = csvConfiguration.getEncoding();
        if (!isUtf8(encoding)) {
            throw new IllegalStateException(
                    "Unsupported encoding: '" + encoding + "'. CSV files on Hadoop must be UTF-8 encoded.");
        }

        _csvConfiguration = csvConfiguration;
    }

    /**
     * Tells whether the given encoding name is one of the accepted spellings of
     * UTF-8. The comparison ignores case without consulting the default locale,
     * and a null name is simply reported as not UTF-8.
     */
    private static boolean isUtf8(final String encoding) {
        return "UTF-8".equalsIgnoreCase(encoding) || "UTF8".equalsIgnoreCase(encoding);
    }

    /**
     * Parses a single CSV line.
     *
     * @param csvLine
     *            the line of CSV text to parse
     * @return the values produced by {@link CSVParser#parseLine(String)} for
     *         that line
     * @throws Exception
     *             if the underlying parser fails on the line
     */
    @Override
    public Object[] call(final String csvLine) throws Exception {
        // A parser is built for every call instead of being held in a field.
        // NOTE(review): presumably because CSVParser is not serializable while
        // this function is - confirm before caching it.
        final CSVParser csvParser =
                new CSVParser(_csvConfiguration.getSeparatorChar(), _csvConfiguration.getQuoteChar(),
                        _csvConfiguration.getEscapeChar());
        return csvParser.parseLine(csvLine);
    }

}