package edu.washington.escience.myria.operator;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;
import javax.annotation.Nullable;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang.BooleanUtils;
import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.primitives.Floats;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.io.DataSource;
import edu.washington.escience.myria.io.FileSource;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.DateTimeUtils;
/**
 * Reads data from a file. For CSV files, the default parser follows RFC 4180 (http://tools.ietf.org/html/rfc4180).
 * However, this operator can be used to scan files with different delimiters, etc.
 *
 * This operator assumes the input is a comma-separated CSV file with one record per line. For input files in other
 * formats, the delimiter needs to be specified, e.g. `\t` for a tab-delimited file or '|' for a pipe-delimited file.
 * Each cell of the input can be enclosed by the default quotation mark '"'. Another quotation mark, such as '\'', can
 * be specified by the user as well. Note that enclosure by quotation marks is not required in the input file.
 *
 */
public final class FileScan extends LeafOperator {
  /** The Schema of the relation stored in this file. */
  private final Schema schema;
  /** Parser used to read the CSV file; created in {@link #init} and closed/released in {@link #cleanup}. */
  private transient CSVParser parser = null;
  /** Iterator over CSV records. */
  private transient Iterator<CSVRecord> iterator = null;
  /** A user-provided file delimiter; if null, the system uses the default comma as delimiter. */
  private final Character delimiter;
  /** A user-provided quotation mark; if null, the system uses '"'. */
  private final Character quote;
  /** A user-provided escape character; if null, escaping is disabled (the {@link CSVFormat#DEFAULT} behavior). */
  private final Character escape;
  /** The data source that will generate the input stream to be read at initialization. */
  private final DataSource source;
  /** Number of lines to skip at the head of the file (e.g. a header). */
  private final Integer numberOfSkippedLines;
  /** Holds the tuples that are ready for release. */
  private transient TupleBatchBuffer buffer;
  /** Which line of the file the scanner is currently on. */
  private long lineNumber = 0;
  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;
  /**
   * The logger for debug, trace, etc. messages in this class.
   */
  private static final org.slf4j.Logger LOGGER = org.slf4j.LoggerFactory.getLogger(FileScan.class);
  /**
   * Construct a new FileScan object to read from the specified file. This file is assumed to be comma-separated and
   * have one record per line. '"' will be used as the quotation mark, and no escape character will be used.
   *
   * @param filename file containing the data to be scanned.
   * @param schema the Schema of the relation contained in the file.
   */
  public FileScan(final String filename, final Schema schema) {
    this(filename, schema, null, null, null, null);
  }
  /**
   * Construct a new FileScan object to read from the specified file. This file is assumed to be comma-separated and
   * have one record per line. '"' will be used as the quotation mark, and no escape character will be used.
   *
   * @param source the data source containing the relation.
   * @param schema the Schema of the relation contained in the file.
   */
  public FileScan(final DataSource source, final Schema schema) {
    this(source, schema, null, null, null, null);
  }
  /**
   * Construct a new FileScan object to read from the specified file. This file is assumed to be comma-separated and
   * have one record per line. If delimiter is non-null, the system uses its value as a delimiter. '"' will be used as
   * the quotation mark, and no escape character will be used.
   *
   * @param filename file containing the data to be scanned.
   * @param schema the Schema of the relation contained in the file.
   * @param delimiter An optional override file delimiter.
   */
  public FileScan(final String filename, final Schema schema, final Character delimiter) {
    this(new FileSource(filename), schema, delimiter, null, null, null);
  }
  /**
   * Construct a new FileScan object to read from the specified file. This file is assumed to be comma-separated and
   * have one record per line. If delimiter is non-null, the system uses its value as a delimiter. '"' will be used as
   * the quotation mark, and no escape character will be used.
   *
   * @param source file containing the data to be scanned.
   * @param schema the Schema of the relation contained in the file.
   * @param delimiter An optional override file delimiter.
   */
  public FileScan(final DataSource source, final Schema schema, final Character delimiter) {
    this(source, schema, delimiter, null, null, null);
  }
  /**
   * Construct a new FileScan object to read from the specified file. This file is assumed to be comma-separated and
   * have one record per line. If delimiter is non-null, the system uses its value as a delimiter. If quote is null, '"'
   * will be used as the quotation mark. If escape is null, no escape character will be used. If numberOfSkippedLines is
   * null, no line will be skipped.
   *
   * @param filename file containing the data to be scanned.
   * @param schema the Schema of the relation contained in the file.
   * @param delimiter An optional override file delimiter.
   * @param quote An optional quote character
   * @param escape An optional escape character.
   * @param numberOfSkippedLines number of lines to be skipped.
   */
  public FileScan(
      final String filename,
      final Schema schema,
      @Nullable final Character delimiter,
      @Nullable final Character quote,
      @Nullable final Character escape,
      @Nullable final Integer numberOfSkippedLines) {
    this(new FileSource(filename), schema, delimiter, quote, escape, numberOfSkippedLines);
  }
  /**
   * Construct a new FileScan object to read from the specified file. This file is assumed to be comma-separated and
   * have one record per line. If delimiter is non-null, the system uses its value as a delimiter. If quote is null, '"'
   * will be used as the quotation mark. If escape is null, no escape character will be used. If numberOfSkippedLines is
   * null, no line will be skipped.
   *
   * @param source the data source containing the relation.
   * @param schema the Schema of the relation contained in the file.
   * @param delimiter An optional override file delimiter.
   * @param quote An optional quote character
   * @param escape An optional escape character.
   * @param numberOfSkippedLines number of lines to be skipped (number of lines in header).
   */
  public FileScan(
      final DataSource source,
      final Schema schema,
      @Nullable final Character delimiter,
      @Nullable final Character quote,
      @Nullable final Character escape,
      @Nullable final Integer numberOfSkippedLines) {
    this.source = Preconditions.checkNotNull(source, "source");
    this.schema = Preconditions.checkNotNull(schema, "schema");
    this.delimiter = MoreObjects.firstNonNull(delimiter, CSVFormat.DEFAULT.getDelimiter());
    this.quote = MoreObjects.firstNonNull(quote, CSVFormat.DEFAULT.getQuoteCharacter());
    /* CSVFormat.DEFAULT has a null escape character, so firstNonNull cannot be used here. */
    this.escape = escape != null ? escape : CSVFormat.DEFAULT.getEscapeCharacter();
    this.numberOfSkippedLines = MoreObjects.firstNonNull(numberOfSkippedLines, 0);
  }
  @Override
  public void cleanup() {
    /* Close the parser (and the underlying input stream) instead of just dropping the reference,
     * otherwise the file handle leaks when the operator is torn down before end-of-file. */
    if (parser != null && !parser.isClosed()) {
      try {
        parser.close();
      } catch (IOException e) {
        /* Best-effort cleanup: log and continue releasing the rest of the operator state. */
        LOGGER.warn("Error closing CSV parser during cleanup", e);
      }
    }
    parser = null;
    iterator = null;
    /* Guard against cleanup being invoked before init() has allocated the buffer. */
    if (buffer != null) {
      while (buffer.numTuples() > 0) {
        buffer.popAny();
      }
    }
  }
  @Override
  protected TupleBatch fetchNextReady() throws DbException, IOException {
    /* Let's assume that the scanner always starts at the beginning of a line. */
    long lineNumberBegin = lineNumber;
    /* Fill the buffer up to one batch, or until the input is exhausted. */
    while ((buffer.numTuples() < buffer.getBatchSize())) {
      lineNumber++;
      if (parser.isClosed()) {
        break;
      }
      try {
        if (!iterator.hasNext()) {
          /* End of input: close eagerly so the next call short-circuits on isClosed(). */
          parser.close();
          break;
        }
      } catch (final RuntimeException e) {
        /* Commons CSV surfaces malformed input from hasNext() as an unchecked exception. */
        throw new DbException("Error parsing row " + lineNumber, e);
      }
      CSVRecord record = iterator.next();
      /* Every record must match the declared schema exactly. */
      if (record.size() != schema.numColumns()) {
        throw new DbException(
            "Error parsing row "
                + lineNumber
                + ": Found "
                + record.size()
                + " column(s) but expected "
                + schema.numColumns()
                + " column(s).");
      }
      for (int column = 0; column < schema.numColumns(); ++column) {
        String cell = record.get(column);
        try {
          switch (schema.getColumnType(column)) {
            case BOOLEAN_TYPE:
              /* Accept both numeric booleans (nonzero == true) and textual ones ("true"/"yes"/"on"). */
              Float f = Floats.tryParse(cell);
              if (f != null) {
                buffer.putBoolean(column, f != 0);
              } else {
                buffer.putBoolean(column, BooleanUtils.toBoolean(cell));
              }
              break;
            case DOUBLE_TYPE:
              buffer.putDouble(column, Double.parseDouble(cell));
              break;
            case FLOAT_TYPE:
              buffer.putFloat(column, Float.parseFloat(cell));
              break;
            case INT_TYPE:
              buffer.putInt(column, Integer.parseInt(cell));
              break;
            case LONG_TYPE:
              buffer.putLong(column, Long.parseLong(cell));
              break;
            case STRING_TYPE:
              buffer.putString(column, cell);
              break;
            case DATETIME_TYPE:
              buffer.putDateTime(column, DateTimeUtils.parse(cell));
              break;
            case BLOB_TYPE:
              /* Blob cells hold a file path; the file's bytes become the column value. */
              buffer.putBlob(column, getFile(cell));
              break;
          }
        } catch (final IllegalArgumentException e) {
          throw new DbException(
              "Error parsing column "
                  + column
                  + " of row "
                  + lineNumber
                  + ", expected type: "
                  + schema.getColumnType(column)
                  + ", scanned value: "
                  + cell,
              e);
        }
      }
    }
    LOGGER.debug("Scanned {} input lines", lineNumber - lineNumberBegin);
    return buffer.popAny();
  }
  @Override
  public Schema generateSchema() {
    return schema;
  }
  @Override
  protected void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    buffer = new TupleBatchBuffer(getSchema());
    try {
      /* NOTE(review): InputStreamReader uses the platform default charset here — presumably the
       * inputs are ASCII/UTF-8; confirm before specifying StandardCharsets.UTF_8 explicitly. */
      parser =
          new CSVParser(
              new BufferedReader(new InputStreamReader(source.getInputStream())),
              CSVFormat.newFormat(delimiter).withQuote(quote).withEscape(escape));
      iterator = parser.iterator();
      /* Skip header lines; the hasNext() guard avoids a raw NoSuchElementException when the
       * file has fewer lines than numberOfSkippedLines. */
      for (int i = 0; i < numberOfSkippedLines && iterator.hasNext(); i++) {
        iterator.next();
      }
    } catch (IOException e) {
      throw new DbException(e);
    }
    lineNumber = 0;
  }
  /**
   * Read the entire contents of the named file into a byte buffer (used for BLOB_TYPE columns,
   * where the CSV cell contains a file path rather than the data itself).
   *
   * @param filename path of the file to read.
   * @return the file's bytes wrapped in a {@link ByteBuffer}.
   * @throws DbException if the file cannot be read.
   */
  protected ByteBuffer getFile(final String filename) throws DbException {
    Preconditions.checkNotNull(filename, "blob filename was null");
    Path path = Paths.get(filename);
    try {
      return ByteBuffer.wrap(Files.readAllBytes(path));
    } catch (IOException e) {
      /* Include the offending path so the failing row can be diagnosed. */
      throw new DbException("Error reading blob file " + filename, e);
    }
  }
}