/**
 *
 */
package edu.washington.escience.myria;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Iterator;

import javax.annotation.Nullable;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang.BooleanUtils;

import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import com.google.common.primitives.Floats;

import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.DateTimeUtils;

/**
 * A {@link TupleReader} that parses CSV data from an {@link InputStream} into
 * {@link TupleBatch}es conforming to a fixed {@link Schema}.
 *
 * <p>Delimiter, quote, and escape characters are configurable; when null, the Apache Commons CSV
 * defaults are used. A configurable number of leading lines can be skipped (e.g. a header row).
 */
public class CsvTupleReader implements TupleReader {

  /** The Schema of the relation stored in this file. */
  @JsonProperty private final Schema schema;
  /** A user-provided field delimiter; if null, the system uses the default comma as delimiter. */
  @JsonProperty private final Character delimiter;
  /** A user-provided quotation mark; if null, the system uses '"'. */
  @JsonProperty private final Character quote;
  /** A user-provided escape character to escape quote and itself; if null, no escape is used. */
  @JsonProperty private final Character escape;
  /** Number of skipped lines at the head of the input (e.g. a header row). */
  @JsonProperty("skip")
  private final Integer numberOfSkippedLines;

  /** Parser used to read the CSV input; owns the underlying stream once opened. */
  private transient CSVParser parser = null;
  /** Iterator over CSV records produced by {@link #parser}. */
  private transient Iterator<CSVRecord> iterator = null;
  /** Holds the tuples that are ready for release. */
  private transient TupleBatchBuffer buffer;
  /** Which line of the file the scanner is currently on. */
  private long lineNumber = 0;

  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /** The logger for debug, trace, etc. messages in this class. */
  private static final org.slf4j.Logger LOGGER =
      org.slf4j.LoggerFactory.getLogger(CsvTupleReader.class);

  /**
   * Constructs a reader with all-default CSV settings.
   *
   * @param schema the schema of the relation; must not be null
   */
  public CsvTupleReader(final Schema schema) {
    this(schema, null, null, null, null);
  }

  /**
   * Constructs a reader with a custom delimiter and otherwise default settings.
   *
   * @param schema the schema of the relation; must not be null
   * @param delimiter the field delimiter, or null for the default comma
   */
  public CsvTupleReader(final Schema schema, final Character delimiter) {
    this(schema, delimiter, null, null, null);
  }

  /**
   * Constructs a fully-configured reader.
   *
   * @param schema the schema of the relation; must not be null
   * @param delimiter the field delimiter, or null for the default comma
   * @param quote the quote character, or null for the default '"'
   * @param escape the escape character, or null for no escaping
   * @param numberOfSkippedLines number of leading lines to skip, or null for 0
   */
  public CsvTupleReader(
      @JsonProperty(value = "schema", required = true) final Schema schema,
      @JsonProperty(value = "delimiter", required = false) @Nullable final Character delimiter,
      @JsonProperty(value = "quote", required = false) @Nullable final Character quote,
      @JsonProperty(value = "escape", required = false) @Nullable final Character escape,
      @JsonProperty(value = "numberOfSkippedLines", required = false) @Nullable
          final Integer numberOfSkippedLines) {
    this.schema = Preconditions.checkNotNull(schema, "schema");
    this.delimiter = MoreObjects.firstNonNull(delimiter, CSVFormat.DEFAULT.getDelimiter());
    this.quote = MoreObjects.firstNonNull(quote, CSVFormat.DEFAULT.getQuoteCharacter());
    /* CSVFormat.DEFAULT has a null escape character, so firstNonNull cannot be used here. */
    this.escape = escape != null ? escape : CSVFormat.DEFAULT.getEscapeCharacter();
    this.numberOfSkippedLines = MoreObjects.firstNonNull(numberOfSkippedLines, 0);
  }

  @Override
  public void open(final InputStream stream) throws IOException, DbException {
    buffer = new TupleBatchBuffer(schema);
    try {
      /* Decode explicitly as UTF-8 instead of the platform default charset. */
      parser =
          new CSVParser(
              new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)),
              CSVFormat.newFormat(delimiter).withQuote(quote).withEscape(escape));
      iterator = parser.iterator();
      /* Guard with hasNext() so a file shorter than the skip count does not throw
       * NoSuchElementException. */
      for (int i = 0; i < numberOfSkippedLines && iterator.hasNext(); i++) {
        iterator.next();
      }
    } catch (IOException e) {
      throw new DbException(e);
    }
    lineNumber = 0;
  }

  @Override
  public TupleBatch readTuples() throws IOException, DbException {
    /* Let's assume that the scanner always starts at the beginning of a line. */
    long lineNumberBegin = lineNumber;

    while ((buffer.numTuples() < buffer.getBatchSize())) {
      lineNumber++;
      if (parser.isClosed()) {
        break;
      }

      try {
        if (!iterator.hasNext()) {
          parser.close();
          break;
        }
      } catch (final RuntimeException e) {
        /* Commons CSV surfaces malformed-input errors from hasNext() as runtime exceptions. */
        throw new DbException("Error parsing row " + lineNumber, e);
      }

      CSVRecord record = iterator.next();
      if (record.size() != schema.numColumns()) {
        throw new DbException(
            "Error parsing row "
                + lineNumber
                + ": Found "
                + record.size()
                + " column(s) but expected "
                + schema.numColumns()
                + " column(s).");
      }
      for (int column = 0; column < schema.numColumns(); ++column) {
        String cell = record.get(column);
        try {
          switch (schema.getColumnType(column)) {
            case BOOLEAN_TYPE:
              /* Always append exactly one value per cell; the previous code appended nothing for
               * falsy cells (e.g. "false", "no"), silently misaligning the column. Numeric cells
               * are truthy iff nonzero; otherwise defer to BooleanUtils ("true"/"yes"/"on"...). */
              Float numeric = Floats.tryParse(cell);
              if (numeric != null) {
                buffer.putBoolean(column, numeric != 0);
              } else {
                buffer.putBoolean(column, BooleanUtils.toBoolean(cell));
              }
              break;
            case DOUBLE_TYPE:
              buffer.putDouble(column, Double.parseDouble(cell));
              break;
            case FLOAT_TYPE:
              buffer.putFloat(column, Float.parseFloat(cell));
              break;
            case INT_TYPE:
              buffer.putInt(column, Integer.parseInt(cell));
              break;
            case LONG_TYPE:
              buffer.putLong(column, Long.parseLong(cell));
              break;
            case STRING_TYPE:
              buffer.putString(column, cell);
              break;
            case DATETIME_TYPE:
              buffer.putDateTime(column, DateTimeUtils.parse(cell));
              break;
            case BLOB_TYPE:
              throw new DbException("Reading BLOB type from csv file is not supported!");
          }
        } catch (final IllegalArgumentException e) {
          throw new DbException(
              "Error parsing column "
                  + column
                  + " of row "
                  + lineNumber
                  + ", expected type: "
                  + schema.getColumnType(column)
                  + ", scanned value: "
                  + cell,
              e);
        }
      }
    }

    LOGGER.debug("Scanned {} input lines", lineNumber - lineNumberBegin);
    return buffer.popAny();
  }

  @Override
  public Schema getSchema() {
    return schema;
  }

  @Override
  public void close() throws IOException {
    /* Close the parser (which closes the underlying reader/stream) instead of just dropping the
     * reference — the previous code leaked the open stream. */
    try {
      if (parser != null && !parser.isClosed()) {
        parser.close();
      }
    } finally {
      parser = null;
      iterator = null;
      if (buffer != null) {
        while (buffer.numTuples() > 0) {
          buffer.popAny();
        }
      }
    }
  }
}