package edu.washington.escience.myria.operator;
import java.io.BufferedInputStream;
import java.io.DataInput;
import java.io.EOFException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Objects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.io.LittleEndianDataInputStream;
import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.io.DataSource;
import edu.washington.escience.myria.io.FileSource;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;
/**
* Read a SeaFlow EVT/OPP file. See the formats in https://github.com/fribalet/flowPhyto/blob/master/R/Globals.R
*
* This operator implements file format version 3.
*/
public class SeaFlowFileScan extends LeafOperator {
/** Required for Java serialization. */
private static final long serialVersionUID = 1L;
/** The data input file. */
private transient DataInput input;
/** Holds the tuples that are ready for release. */
private transient TupleBatchBuffer buffer;
/** The group number file name. */
private final DataSource source;
/** Which record the reader is currently on. */
private int lineNumber;
/** The expected number of rows in the file. */
private int numRows;
/** The magic number at the end of each line (but the last) in a SeaFlow file. */
private static final int EOL = 10;
/** Schema for all SeaFlow event files. */
private static final Schema OPP_SCHEMA =
new Schema(
ImmutableList.of(
Type.INT_TYPE, // time
Type.INT_TYPE, // pulse_width
Type.INT_TYPE, // D1
Type.INT_TYPE, // D2
Type.INT_TYPE, // fsc_small
Type.INT_TYPE, // fsc_perp
Type.INT_TYPE, // fsc_big
Type.INT_TYPE, // pe
Type.INT_TYPE, // chl_small
Type.INT_TYPE // chl_big
),
ImmutableList.of(
"time",
"pulse_width",
"D1",
"D2",
"fsc_small",
"fsc_perp",
"fsc_big",
"pe",
"chl_small",
"chl_big"));
/** The number of columns in the schema of a SeaFlow EVT/OPP file. */
private static final int NUM_COLUMNS = OPP_SCHEMA.numColumns();
/** The number of bytes in one row of a SeaFlow EVT/OPP file. */
private static final int COLUMN_SIZE = OPP_SCHEMA.numColumns() * 2 + 4;
/**
* Construct a SeaFlowFileScan reading the specified local file.
*
* @param filename the file to be read.
*/
public SeaFlowFileScan(final String filename) {
this(new FileSource(filename));
}
/**
* Construct a SeaFlowFileScan reading the data in the specified data source.
*
* @param source contains the data to be read.
*/
public SeaFlowFileScan(final DataSource source) {
this.source = Objects.requireNonNull(source);
}
@Override
protected final TupleBatch fetchNextReady() throws DbException {
while ((lineNumber < numRows) && (buffer.numTuples() < buffer.getBatchSize())) {
try {
/*
* Every line but the last, including the header, is terminated with a 32-bit unsigned int with the value 10. We
* read the EOL for the header/previous line before the current line to simplify the EOF checking.
*/
Preconditions.checkState(input.readInt() == EOL);
for (int col = 0; col < NUM_COLUMNS; ++col) {
buffer.putInt(col, input.readUnsignedShort());
}
} catch (final IOException e) {
throw new DbException("Exception in line " + lineNumber, e);
}
lineNumber++;
}
if (lineNumber == numRows) {
/* Check for an EOF, and error if not. */
boolean flag = false;
try {
input.readByte();
} catch (EOFException e) {
flag = true;
} catch (IOException e) {
throw new DbException("Error when verifying EOF after line " + lineNumber, e);
}
Preconditions.checkState(
flag, "Was able to read another byte after %s rows, expected EOFException", lineNumber);
}
return buffer.popAny();
}
@Override
protected final void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
buffer = new TupleBatchBuffer(getSchema());
try {
input = new LittleEndianDataInputStream(new BufferedInputStream(source.getInputStream()));
numRows = input.readInt(); /* number of rows */
/* If the source is a FileSource, we can actually check its length. */
if (source instanceof FileSource) {
long length = Files.size(Paths.get(((FileSource) source).getFilename()));
long expectedSize = 4 + numRows * COLUMN_SIZE;
Preconditions.checkArgument(
length == expectedSize,
"Given %s rows, expected a file of length %s, not %s",
numRows,
expectedSize,
length);
}
} catch (IOException e) {
throw new DbException(e);
}
lineNumber = 0;
}
@Override
protected final void cleanup() throws DbException {
buffer.clear();
}
@Override
protected Schema generateSchema() {
return OPP_SCHEMA;
}
}