/**
* Copyright (C) 2014-2016 LinkedIn Corp. (pinot-core@linkedin.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.linkedin.pinot.core.data.readers;
import com.linkedin.pinot.common.data.FieldSpec;
import com.linkedin.pinot.common.data.Schema;
import com.linkedin.pinot.core.data.GenericRow;
import java.io.FileReader;
import java.io.Reader;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Record reader that parses a CSV file with Apache Commons CSV and produces one
 * {@link GenericRow} per CSV record, filling in a value for every field spec of the
 * supplied {@link Schema}.
 *
 * <p>Optional settings (file format, delimiter, header, date columns and date pattern) come
 * from a {@link CSVRecordReaderConfig}; when no config is given the reader uses
 * {@link CSVFormat#DEFAULT}, a comma delimiter, and the header found in the file.
 *
 * <p>Typical life cycle: construct, {@link #init()}, iterate with {@link #hasNext()} /
 * {@link #next()}, optionally {@link #rewind()}, and finally {@link #close()}.
 */
public class CSVRecordReader extends BaseRecordReader {
  private static final Logger _logger = LoggerFactory.getLogger(CSVRecordReader.class);

  /** Delimiter used when the config is absent or does not specify a usable delimiter. */
  private static final String DEFAULT_DELIMITER = ",";

  /**
   * Unescaped delimiter characters used to split the configured header and multi-value cells.
   * Resolved the same way as the delimiter handed to the CSV parser so both always agree.
   */
  private final String _delimiterString;
  private final String _fileName;
  private final Schema _schema;
  private CSVParser _parser = null;
  private Iterator<CSVRecord> _iterator = null;
  CSVRecordReaderConfig _config = null;

  /**
   * Creates a reader for the given file. No file is opened until {@link #init()} is called.
   *
   * @param dataFile path of the CSV file to read
   * @param recordReaderConfig reader settings; may be {@code null}, otherwise it is cast to
   *     {@link CSVRecordReaderConfig}
   * @param schema schema whose field specs determine which columns are read and how they
   *     are converted
   */
  public CSVRecordReader(String dataFile, RecordReaderConfig recordReaderConfig, Schema schema) {
    super();
    super.initNullCounters(schema);
    _fileName = dataFile;
    _schema = schema;
    _config = (CSVRecordReaderConfig) recordReaderConfig;
    _delimiterString = resolveDelimiter();
  }

  /**
   * Opens the data file and positions the reader before the first record.
   *
   * @throws Exception if the file cannot be opened or the parser cannot be created
   */
  @Override
  public void init() throws Exception {
    final Reader reader = new FileReader(_fileName);
    try {
      _parser = new CSVParser(reader, getFormat());
    } catch (Exception e) {
      // The parser never took ownership of the reader, so release the file handle here.
      try {
        reader.close();
      } catch (Exception ignored) {
        // Best effort only: the failure that prevented parser creation is the one to report.
      }
      throw e;
    }
    _iterator = _parser.iterator();
  }

  /**
   * Closes the current parser (if any) and re-opens the file from the beginning.
   *
   * @throws Exception if closing or re-opening fails
   */
  @Override
  public void rewind() throws Exception {
    close();
    init();
  }

  @Override
  public boolean hasNext() {
    return _iterator.hasNext();
  }

  @Override
  public Schema getSchema() {
    return _schema;
  }

  /** Reads the next record into a newly allocated row. */
  @Override
  public GenericRow next() {
    return next(new GenericRow());
  }

  /**
   * Reads the next CSV record and stores one value per schema field into {@code row}.
   *
   * <p>A null or empty cell increments the null counter of that column; the (possibly null)
   * token is still passed to {@link RecordReaderUtils} for conversion. Multi-value cells are
   * split on the reader's delimiter characters before conversion.
   *
   * @param row row to populate
   * @return the same {@code row} instance
   */
  @Override
  public GenericRow next(GenericRow row) {
    final CSVRecord record = _iterator.next();
    for (final FieldSpec fieldSpec : _schema.getAllFieldSpecs()) {
      final String column = fieldSpec.getName();
      final String token = getValueForColumn(record, column);
      if (token == null || token.isEmpty()) {
        incrementNullCountFor(column);
      }

      final Object value;
      if (fieldSpec.isSingleValueField()) {
        value = RecordReaderUtils.convertToDataType(token, fieldSpec.getDataType());
      } else {
        final String[] tokens = (token != null) ? StringUtils.split(token, _delimiterString) : null;
        value = RecordReaderUtils.convertToDataTypeArray(tokens, fieldSpec.getDataType());
      }
      row.putField(column, value);
    }
    return row;
  }

  /**
   * Closes the underlying parser. Safe to call when {@link #init()} was never invoked.
   *
   * @throws Exception if closing the parser fails
   */
  @Override
  public void close() throws Exception {
    if (_parser != null) {
      _parser.close();
    }
  }

  /**
   * Returns the raw cell for {@code column}, or, for columns the config marks as dates, the
   * parsed date rendered as epoch milliseconds in decimal. Note that a missing or unparseable
   * date therefore comes back as {@code "0"} rather than null/empty.
   */
  private String getValueForColumn(CSVRecord record, String column) {
    if ((_config != null) && _config.columnIsDate(column)) {
      return dateToEpochMillis(record.get(column)).toString();
    }
    return record.get(column);
  }

  /**
   * Parses {@code token} with the configured date pattern.
   *
   * @return milliseconds since the epoch, or {@code 0} when the token is null, no config is
   *     present, or the token does not match the pattern
   */
  private Long dateToEpochMillis(String token) {
    if ((token == null) || (_config == null)) {
      return 0L;
    }

    final String pattern = _config.getCsvDateFormat();
    // SimpleDateFormat is not thread-safe, so a fresh instance is created per call.
    final SimpleDateFormat dateFormat = new SimpleDateFormat(pattern);

    // Propagating this exception up causes a whole bunch of other readers to now throw
    // exceptions. Catch here, and return 0.
    try {
      final Date date = dateFormat.parse(token);
      return date.getTime(); // This is in milliseconds.
    } catch (ParseException e) {
      _logger.warn("Illegal date: '{}'. Expected format: {}", token, pattern);
      return 0L;
    }
  }

  /**
   * Maps the configured format name (case-insensitive) to a predefined {@link CSVFormat}.
   * Missing or unrecognized names fall back to {@link CSVFormat#DEFAULT}.
   */
  private CSVFormat getFormatFromConfig() {
    final String format = (_config != null) ? _config.getCsvFileFormat() : null;
    if (format == null) {
      return CSVFormat.DEFAULT;
    }

    // equalsIgnoreCase keeps the match independent of the JVM's default locale.
    if ("EXCEL".equalsIgnoreCase(format)) {
      return CSVFormat.EXCEL;
    }
    if ("MYSQL".equalsIgnoreCase(format)) {
      return CSVFormat.MYSQL;
    }
    if ("RFC4180".equalsIgnoreCase(format)) {
      return CSVFormat.RFC4180;
    }
    if ("TDF".equalsIgnoreCase(format)) {
      return CSVFormat.TDF;
    }
    return CSVFormat.DEFAULT;
  }

  /**
   * Returns the column names from the configured header, split on the reader's delimiter
   * characters, or {@code null} when no header is configured.
   */
  private String[] getHeaderFromConfig() {
    final String header = (_config != null) ? _config.getCsvHeader() : null;
    if (header == null) {
      return null;
    }
    return StringUtils.split(header, _delimiterString);
  }

  /**
   * Returns the configured delimiter with Java escapes (for example {@code \t}) resolved.
   * Falls back to a comma when there is no config, no delimiter, or an empty delimiter.
   */
  private String resolveDelimiter() {
    final String configured = (_config != null) ? _config.getCsvDelimiter() : null;
    if (configured == null) {
      return DEFAULT_DELIMITER;
    }
    final String unescaped = StringEscapeUtils.unescapeJava(configured);
    if ((unescaped == null) || unescaped.isEmpty()) {
      return DEFAULT_DELIMITER;
    }
    return unescaped;
  }

  /** Returns the single delimiter character handed to the CSV parser. */
  private char getDelimiterFromConfig() {
    // resolveDelimiter() never returns an empty string, so charAt(0) is always valid.
    return resolveDelimiter().charAt(0);
  }

  /**
   * Builds the parser format: the configured base format and delimiter, plus either the
   * configured header or, when none is configured, a header taken from the file itself.
   */
  private CSVFormat getFormat() {
    final CSVFormat base = getFormatFromConfig().withDelimiter(getDelimiterFromConfig());
    final String[] header = getHeaderFromConfig();
    // withHeader() with no arguments tells Commons CSV to read the header from the first record.
    return (header != null) ? base.withHeader(header) : base.withHeader();
  }
}