FileScan.java example

Explorer
myria-master
package edu.washington.escience.myria.operator;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;

import javax.annotation.Nullable;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang.BooleanUtils;

import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.primitives.Floats;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.io.DataSource;
import edu.washington.escience.myria.io.FileSource;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.storage.TupleUtils;
import edu.washington.escience.myria.util.DateTimeUtils;

/**
 * Reads data from a file. For CSV files, the default parser follows RFC 4180 (http://tools.ietf.org/html/rfc4180).
 * However, this operator can also be used to scan files with different delimiters, quotes, etc.
 *
 * By default this operator assumes the input is a comma-separated file with one record per line. For input files in
 * other formats the delimiter needs to be specified, e.g. `\t` for a tab-delimited file or '|' for a pipe-delimited
 * file. Each cell of the input can be enclosed by the default quotation mark '"'. A different quotation mark such as
 * '\'' can be specified by the user as well. Note that enclosing cells in quotation marks is not required in the input
 * file.
 *
 */
public final class FileScan extends LeafOperator {
  /** The Schema of the relation stored in this file. */
  private final Schema schema;
  /** Parser used to read CSV records from the data source; created in {@link #init}. */
  private transient CSVParser parser = null;
  /** Iterator over CSV records. */
  private transient Iterator<CSVRecord> iterator = null;
  /** A user-provided file delimiter; if not provided, the system uses the default comma as delimiter. */
  private final Character delimiter;
  /** A user-provided quotation mark; if not provided, the system uses '"'. */
  private final Character quote;
  /**
   * A user-provided escape character; if not provided, the escape setting of {@link CSVFormat#DEFAULT} is used. This
   * field may therefore be null (in Apache Commons CSV the default format defines no escape character).
   */
  private final Character escape;
  /** The data source that will generate the input stream to be read at initialization. */
  private final DataSource source;
  /** Number of skipped lines on the head. */
  private final Integer numberOfSkippedLines;
  /** Holds the tuples that are ready for release. */
  private transient TupleBatchBuffer buffer;
  /** Number of records read so far, not counting the skipped header lines. */
  private long lineNumber = 0;

  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;

  /**
   * The logger for debug, trace, etc. messages in this class.
   */
  private static final org.slf4j.Logger LOGGER = org.slf4j.LoggerFactory.getLogger(FileScan.class);

  /**
   * Construct a new FileScan object to read from the specified file. This file is assumed to be comma-separated and
   * have one record per line. '"' will be used as default quotation mark. The escape setting of
   * {@link CSVFormat#DEFAULT} will be used.
   *
   * @param filename file containing the data to be scanned.
   * @param schema the Schema of the relation contained in the file.
   */
  public FileScan(final String filename, final Schema schema) {
    this(filename, schema, null, null, null, null);
  }

  /**
   * Construct a new FileScan object to read from the specified data source. The data is assumed to be comma-separated
   * and have one record per line. '"' will be used as default quotation mark. The escape setting of
   * {@link CSVFormat#DEFAULT} will be used.
   *
   * @param source the data source containing the relation.
   * @param schema the Schema of the relation contained in the file.
   */
  public FileScan(final DataSource source, final Schema schema) {
    this(source, schema, null, null, null, null);
  }

  /**
   * Construct a new FileScan object to read from the specified file. This file is assumed to have one record per
   * line. If delimiter is non-null, the system uses its value as a delimiter, otherwise a comma. '"' will be used as
   * default quotation mark. The escape setting of {@link CSVFormat#DEFAULT} will be used.
   *
   * @param filename file containing the data to be scanned.
   * @param schema the Schema of the relation contained in the file.
   * @param delimiter An optional override file delimiter.
   */
  public FileScan(final String filename, final Schema schema, final Character delimiter) {
    this(new FileSource(filename), schema, delimiter, null, null, null);
  }

  /**
   * Construct a new FileScan object to read from the specified data source. The data is assumed to have one record
   * per line. If delimiter is non-null, the system uses its value as a delimiter, otherwise a comma. '"' will be used
   * as default quotation mark. The escape setting of {@link CSVFormat#DEFAULT} will be used.
   *
   * @param source the data source containing the relation.
   * @param schema the Schema of the relation contained in the file.
   * @param delimiter An optional override file delimiter.
   */
  public FileScan(final DataSource source, final Schema schema, final Character delimiter) {
    this(source, schema, delimiter, null, null, null);
  }

  /**
   * Construct a new FileScan object to read from the specified file. This file is assumed to have one record per
   * line. If delimiter is non-null, the system uses its value as a delimiter, otherwise a comma. If quote is null, '"'
   * will be used as default quotation mark. If escape is null, the escape setting of {@link CSVFormat#DEFAULT} will be
   * used. If numberOfSkippedLines is null, no line will be skipped.
   *
   * @param filename file containing the data to be scanned.
   * @param schema the Schema of the relation contained in the file.
   * @param delimiter An optional override file delimiter.
   * @param quote An optional quote character
   * @param escape An optional escape character.
   * @param numberOfSkippedLines number of lines to be skipped.
   */
  public FileScan(
      final String filename,
      final Schema schema,
      @Nullable final Character delimiter,
      @Nullable final Character quote,
      @Nullable final Character escape,
      @Nullable final Integer numberOfSkippedLines) {
    this(new FileSource(filename), schema, delimiter, quote, escape, numberOfSkippedLines);
  }

  /**
   * Construct a new FileScan object to read from the specified data source. The data is assumed to have one record
   * per line. If delimiter is non-null, the system uses its value as a delimiter, otherwise a comma. If quote is null,
   * '"' will be used as default quotation mark. If escape is null, the escape setting of {@link CSVFormat#DEFAULT}
   * will be used. If numberOfSkippedLines is null, no line will be skipped.
   *
   * @param source the data source containing the relation.
   * @param schema the Schema of the relation contained in the file.
   * @param delimiter An optional override file delimiter.
   * @param quote An optional quote character
   * @param escape An optional escape character.
   * @param numberOfSkippedLines number of lines to be skipped (number of lines in header).
   * @throws NullPointerException if source or schema is null.
   */
  public FileScan(
      final DataSource source,
      final Schema schema,
      @Nullable final Character delimiter,
      @Nullable final Character quote,
      @Nullable final Character escape,
      @Nullable final Integer numberOfSkippedLines) {
    this.source = Preconditions.checkNotNull(source, "source");
    this.schema = Preconditions.checkNotNull(schema, "schema");

    this.delimiter = MoreObjects.firstNonNull(delimiter, CSVFormat.DEFAULT.getDelimiter());
    this.quote = MoreObjects.firstNonNull(quote, CSVFormat.DEFAULT.getQuoteCharacter());
    /* Not firstNonNull: the default escape character may itself be null, which firstNonNull would reject. */
    this.escape = escape != null ? escape : CSVFormat.DEFAULT.getEscapeCharacter();
    this.numberOfSkippedLines = MoreObjects.firstNonNull(numberOfSkippedLines, 0);
  }

  /**
   * Releases the parser (closing it, and with it the reader it wraps, if it is still open) and discards any tuples
   * still held in the buffer. Safe to call even if {@link #init} never ran or failed part-way.
   */
  @Override
  public void cleanup() {
    if (parser != null) {
      try {
        if (!parser.isClosed()) {
          parser.close();
        }
      } catch (final IOException e) {
        /* Best effort: cleanup must not fail just because the input could not be closed cleanly. */
        LOGGER.warn("Error closing CSV parser", e);
      }
    }
    parser = null;
    iterator = null;
    if (buffer != null) {
      while (buffer.numTuples() > 0) {
        buffer.popAny();
      }
    }
  }

  /**
   * Reads records from the parser into the buffer until the buffer holds a full batch or the input is exhausted, then
   * pops whatever the buffer has available.
   *
   * @return the result of {@code buffer.popAny()} after reading.
   * @throws DbException if a record cannot be parsed, has the wrong number of columns, or a cell cannot be converted
   *         to the type of its column.
   * @throws IOException if closing the parser at the end of the input fails.
   */
  @Override
  protected TupleBatch fetchNextReady() throws DbException, IOException {
    /* Let's assume that the scanner always starts at the beginning of a line. */
    final long lineNumberBegin = lineNumber;

    while (buffer.numTuples() < buffer.getBatchSize()) {
      if (parser.isClosed()) {
        break;
      }
      try {
        if (!iterator.hasNext()) {
          parser.close();
          break;
        }
      } catch (final RuntimeException e) {
        /* The record that failed to parse is the one after the last record successfully read. */
        throw new DbException("Error parsing row " + (lineNumber + 1), e);
      }
      final CSVRecord record = iterator.next();
      /* Count the record only once it has actually been read, so the counter stays exact at end of input. */
      lineNumber++;

      if (record.size() != schema.numColumns()) {
        throw new DbException(
            "Error parsing row "
                + lineNumber
                + ": Found "
                + record.size()
                + " column(s) but expected "
                + schema.numColumns()
                + " column(s).");
      }
      for (int column = 0; column < schema.numColumns(); ++column) {
        appendCell(column, record.get(column));
      }
    }

    LOGGER.debug("Scanned {} input lines", lineNumber - lineNumberBegin);

    return buffer.popAny();
  }

  /**
   * Converts one cell to the type of its column and appends it to the buffer.
   *
   * @param column index of the column the cell belongs to.
   * @param cell the raw text of the cell.
   * @throws DbException if the cell cannot be converted to the column type, or (for blob columns) the file named by
   *         the cell cannot be read.
   */
  private void appendCell(final int column, final String cell) throws DbException {
    try {
      switch (schema.getColumnType(column)) {
        case BOOLEAN_TYPE:
          /* Numeric cells are true iff non-zero; anything else is interpreted by BooleanUtils. */
          final Float f = Floats.tryParse(cell);
          if (f != null) {
            buffer.putBoolean(column, f != 0);
          } else {
            buffer.putBoolean(column, BooleanUtils.toBoolean(cell));
          }
          break;
        case DOUBLE_TYPE:
          buffer.putDouble(column, Double.parseDouble(cell));
          break;
        case FLOAT_TYPE:
          buffer.putFloat(column, Float.parseFloat(cell));
          break;
        case INT_TYPE:
          buffer.putInt(column, Integer.parseInt(cell));
          break;
        case LONG_TYPE:
          buffer.putLong(column, Long.parseLong(cell));
          break;
        case STRING_TYPE:
          buffer.putString(column, cell);
          break;
        case DATETIME_TYPE:
          buffer.putDateTime(column, DateTimeUtils.parse(cell));
          break;
        case BLOB_TYPE:
          buffer.putBlob(column, getFile(cell)); // the cell holds the name of the file with the blob contents
          break;
      }
    } catch (final IllegalArgumentException e) {
      /* NumberFormatException is an IllegalArgumentException, so malformed numbers end up here as well. */
      throw new DbException(
          "Error parsing column "
              + column
              + " of row "
              + lineNumber
              + ", expected type: "
              + schema.getColumnType(column)
              + ", scanned value: "
              + cell,
          e);
    }
  }

  @Override
  public Schema generateSchema() {
    return schema;
  }

  /**
   * Creates the tuple buffer, opens the data source with a CSV parser configured with this operator's delimiter,
   * quote and escape characters, and skips the configured number of header records.
   *
   * @param execEnvVars execution environment variables (not used here).
   * @throws DbException if the data source cannot be opened, or if the header records cannot be read, e.g. because
   *         the input ends before all of them have been skipped.
   */
  @Override
  protected void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    buffer = new TupleBatchBuffer(getSchema());
    try {
      /* NOTE(review): the reader uses the platform default charset; consider making the charset explicit. */
      parser =
          new CSVParser(
              new BufferedReader(new InputStreamReader(source.getInputStream())),
              CSVFormat.newFormat(delimiter).withQuote(quote).withEscape(escape));
      iterator = parser.iterator();
    } catch (IOException e) {
      throw new DbException(e);
    }

    for (int i = 0; i < numberOfSkippedLines; i++) {
      final boolean hasHeaderRecord;
      try {
        hasHeaderRecord = iterator.hasNext();
      } catch (final RuntimeException e) {
        throw new DbException("Error parsing skipped line " + (i + 1), e);
      }
      if (!hasHeaderRecord) {
        /* Fail with a descriptive error instead of a bare NoSuchElementException from iterator.next(). */
        throw new DbException(
            "Expected to skip " + numberOfSkippedLines + " line(s) but the input ended after " + i + " line(s).");
      }
      iterator.next();
    }

    lineNumber = 0;
  }

  /**
   * Reads the entire contents of the named file.
   *
   * @param filename path of the file to read.
   * @return a ByteBuffer wrapping the bytes of the file.
   * @throws DbException if the file cannot be read.
   * @throws NullPointerException if filename is null.
   */
  protected ByteBuffer getFile(final String filename) throws DbException {
    Preconditions.checkNotNull(filename, "byte[] filename was null");
    final Path path = Paths.get(filename);
    try {
      return ByteBuffer.wrap(Files.readAllBytes(path));
    } catch (IOException e) {
      throw new DbException(e);
    }
  }
}