// SeaFlowFileScan.java example
//
// Explorer
// myria-master
package edu.washington.escience.myria.operator;

import java.io.BufferedInputStream;
import java.io.DataInput;
import java.io.EOFException;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Objects;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.io.LittleEndianDataInputStream;

import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.Type;
import edu.washington.escience.myria.io.DataSource;
import edu.washington.escience.myria.io.FileSource;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;

/**
 * Read a SeaFlow EVT/OPP file. See the formats in https://github.com/fribalet/flowPhyto/blob/master/R/Globals.R
 *
 * This operator implements file format version 3.
 */
public class SeaFlowFileScan extends LeafOperator {

  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;
  /** The data input file. */
  private transient DataInput input;
  /** Holds the tuples that are ready for release. */
  private transient TupleBatchBuffer buffer;

  /** The group number file name. */
  private final DataSource source;
  /** Which record the reader is currently on. */
  private int lineNumber;
  /** The expected number of rows in the file. */
  private int numRows;
  /** The magic number at the end of each line (but the last) in a SeaFlow file. */
  private static final int EOL = 10;

  /** Schema for all SeaFlow event files. */
  private static final Schema OPP_SCHEMA =
      new Schema(
          ImmutableList.of(
              Type.INT_TYPE, // time
              Type.INT_TYPE, // pulse_width
              Type.INT_TYPE, // D1
              Type.INT_TYPE, // D2
              Type.INT_TYPE, // fsc_small
              Type.INT_TYPE, // fsc_perp
              Type.INT_TYPE, // fsc_big
              Type.INT_TYPE, // pe
              Type.INT_TYPE, // chl_small
              Type.INT_TYPE // chl_big
              ),
          ImmutableList.of(
              "time",
              "pulse_width",
              "D1",
              "D2",
              "fsc_small",
              "fsc_perp",
              "fsc_big",
              "pe",
              "chl_small",
              "chl_big"));
  /** The number of columns in the schema of a SeaFlow EVT/OPP file. */
  private static final int NUM_COLUMNS = OPP_SCHEMA.numColumns();
  /** The number of bytes in one row of a SeaFlow EVT/OPP file. */
  private static final int COLUMN_SIZE = OPP_SCHEMA.numColumns() * 2 + 4;

  /**
   * Construct a SeaFlowFileScan reading the specified local file.
   *
   * @param filename the file to be read.
   */
  public SeaFlowFileScan(final String filename) {
    this(new FileSource(filename));
  }

  /**
   * Construct a SeaFlowFileScan reading the data in the specified data source.
   *
   * @param source contains the data to be read.
   */
  public SeaFlowFileScan(final DataSource source) {
    this.source = Objects.requireNonNull(source);
  }

  @Override
  protected final TupleBatch fetchNextReady() throws DbException {
    while ((lineNumber < numRows) && (buffer.numTuples() < buffer.getBatchSize())) {
      try {
        /*
         * Every line but the last, including the header, is terminated with a 32-bit unsigned int with the value 10. We
         * read the EOL for the header/previous line before the current line to simplify the EOF checking.
         */
        Preconditions.checkState(input.readInt() == EOL);
        for (int col = 0; col < NUM_COLUMNS; ++col) {
          buffer.putInt(col, input.readUnsignedShort());
        }
      } catch (final IOException e) {
        throw new DbException("Exception in line " + lineNumber, e);
      }
      lineNumber++;
    }

    if (lineNumber == numRows) {
      /* Check for an EOF, and error if not. */
      boolean flag = false;
      try {
        input.readByte();
      } catch (EOFException e) {
        flag = true;
      } catch (IOException e) {
        throw new DbException("Error when verifying EOF after line " + lineNumber, e);
      }
      Preconditions.checkState(
          flag, "Was able to read another byte after %s rows, expected EOFException", lineNumber);
    }
    return buffer.popAny();
  }

  @Override
  protected final void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    buffer = new TupleBatchBuffer(getSchema());

    try {
      input = new LittleEndianDataInputStream(new BufferedInputStream(source.getInputStream()));
      numRows = input.readInt(); /* number of rows */
      /* If the source is a FileSource, we can actually check its length. */
      if (source instanceof FileSource) {
        long length = Files.size(Paths.get(((FileSource) source).getFilename()));
        long expectedSize = 4 + numRows * COLUMN_SIZE;
        Preconditions.checkArgument(
            length == expectedSize,
            "Given %s rows, expected a file of length %s, not %s",
            numRows,
            expectedSize,
            length);
      }
    } catch (IOException e) {
      throw new DbException(e);
    }

    lineNumber = 0;
  }

  @Override
  protected final void cleanup() throws DbException {
    buffer.clear();
  }

  @Override
  protected Schema generateSchema() {
    return OPP_SCHEMA;
  }
}