package edu.washington.escience.myria.operator;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Iterator;

import javax.annotation.Nullable;

import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.apache.commons.lang.BooleanUtils;

import com.google.common.base.MoreObjects;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.primitives.Floats;

import edu.washington.escience.myria.DbException;
import edu.washington.escience.myria.MyriaConstants;
import edu.washington.escience.myria.Schema;
import edu.washington.escience.myria.io.AmazonS3Source;
import edu.washington.escience.myria.io.DataSource;
import edu.washington.escience.myria.io.FileSource;
import edu.washington.escience.myria.storage.TupleBatch;
import edu.washington.escience.myria.storage.TupleBatchBuffer;
import edu.washington.escience.myria.util.DateTimeUtils;

/**
 * Scans one byte-range partition of a CSV file as part of a parallel ingest. Each worker skips any
 * partial row at the start of its partition and, if its final row is truncated at the partition
 * boundary, keeps reading past the boundary until the row is complete.
 */
public class CSVFileScanFragment extends LeafOperator {

  /** The Schema of the relation stored in this file. */
  private final Schema schema;
  /** Parser used to read the file. */
  private transient CSVParser parser = null;
  /** Iterator over CSV records. */
  private transient Iterator<CSVRecord> iterator = null;
  /** A user-provided field delimiter; if null, the system uses the default comma. */
  private final Character delimiter;
  /** A user-provided quotation mark; if null, the system uses '"'. */
  private final Character quote;
  /** A user-provided escape character to escape the quote and itself; if null, no escape character is used. */
  private final Character escape;
  /** The data source that will generate the input stream to be read at initialization. */
  private final AmazonS3Source source;
  /** Number of lines to skip at the head of the file. */
  private final Integer numberOfSkippedLines;

  /** Holds the tuples that are ready for release. */
  private transient TupleBatchBuffer buffer;
  /** Which line of the file the scanner is currently on. */
  private long lineNumber = 0;

  /** How many bytes past the partition boundary to fetch when completing a truncated final row. */
  private long byteOverlap = MyriaConstants.PARALLEL_INGEST_BYTE_OVERLAP;
  /** The message commons-csv reports when a quoted token is cut off by the end of the stream. */
  private static final String truncatedQuoteErrorMessage =
      "EOF reached before encapsulated token finished";

  /** Whether this worker reads the final partition of the file. */
  private boolean isLastWorker;
  /** The size of the file in bytes. */
  private final long maxByteRange;
  /** The first byte of this worker's partition. */
  private long partitionStartByteRange;
  /** The last byte of this worker's partition. */
  private long partitionEndByteRange;
  /** The byte at which reading actually starts (one byte before the partition, except for the first worker). */
  private long adjustedStartByteRange;
  /** Bytes consumed while skipping the truncated row at the start of the partition. */
  private int byteOffsetFromTruncatedRowAtStart = 0;
  /** Stream over this worker's byte range. */
  private InputStream partitionInputStream;
  /** The CSV record currently being converted into a tuple. */
  private CSVRecord record;
  /** Whether the current record is the last one in this partition. */
  private boolean onLastRow;
  /** Whether the last row has been read in full, including any bytes past the partition boundary. */
  private boolean finishedReadingLastRow;
  /** Whether this partition contains no complete row and should produce no tuples. */
  private boolean flagAsIncomplete;
  /** Whether the byte range was supplied by the caller rather than derived from the worker list. */
  private boolean flagAsRangeSelected;
  /** The workers sharing this scan; used to derive byte ranges when none are supplied. */
  private int[] workerIds;

  /** Required for Java serialization. */
  private static final long serialVersionUID = 1L;
  /** The logger for debug, trace, etc. messages in this class. */
  private static final org.slf4j.Logger LOGGER =
      org.slf4j.LoggerFactory.getLogger(CSVFileScanFragment.class);
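
  /**
   * Construct a fragment that scans the byte range [startByteRange, endByteRange] of the given
   * file with default CSV settings: comma delimiter, '"' quote, no escape character, and no
   * skipped lines.
   *
   * @param filename name of the file to scan
   * @param schema the Schema of the relation stored in the file
   * @param startByteRange the first byte of this worker's partition
   * @param endByteRange the last byte of this worker's partition
   * @param isLastWorker whether this worker reads the final partition of the file
   */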
  public CSVFileScanFragment(
      final String filename,
      final Schema schema,
      final long startByteRange,
      final long endByteRange,
      final boolean isLastWorker) {
    this(filename, schema, startByteRange, endByteRange, isLastWorker, null, null, null, null);
  }

  public CSVFileScanFragment(
      final DataSource source,
      final Schema schema,
      final long startByteRange,
      final long endByteRange,
      final boolean isLastWorker) {
    this(source, schema, startByteRange, endByteRange, isLastWorker, null, null, null, null);
  }

  public CSVFileScanFragment(
      final String filename,
      final Schema schema,
      final long startByteRange,
      final long endByteRange,
      final boolean isLastWorker,
      final Character delimiter) {
    this(
        new FileSource(filename),
        schema,
        startByteRange,
        endByteRange,
        isLastWorker,
        delimiter,
        null,
        null,
        null);
  }

  public CSVFileScanFragment(
      final DataSource source,
      final Schema schema,
      final long startByteRange,
      final long endByteRange,
      final boolean isLastWorker,
      final Character delimiter) {
    this(source, schema, startByteRange, endByteRange, isLastWorker, delimiter, null, null, null);
  }

  public CSVFileScanFragment(
      final String filename,
      final Schema schema,
      final long startByteRange,
      final long endByteRange,
      final boolean isLastWorker,
      @Nullable final Character delimiter,
      @Nullable final Character quote,
      @Nullable final Character escape,
      @Nullable final Integer numberOfSkippedLines) {
    this(
        new FileSource(filename),
        schema,
        startByteRange,
        endByteRange,
        isLastWorker,
        delimiter,
        quote,
        escape,
        numberOfSkippedLines);
  }

  /**
   * Construct a fragment over an explicitly selected byte range. The source must be an
   * {@link AmazonS3Source}, since parallel ingest re-opens the stream at arbitrary offsets.
   */
  public CSVFileScanFragment(
      final DataSource source,
      final Schema schema,
      final long partitionStartByteRange,
      final long partitionEndByteRange,
      final boolean isLastWorker,
      @Nullable final Character delimiter,
      @Nullable final Character quote,
      @Nullable final Character escape,
      @Nullable final Integer numberOfSkippedLines) {
    this.source = (AmazonS3Source) Preconditions.checkNotNull(source, "source");
    this.schema = Preconditions.checkNotNull(schema, "schema");
    this.delimiter = MoreObjects.firstNonNull(delimiter, CSVFormat.DEFAULT.getDelimiter());
    this.quote = MoreObjects.firstNonNull(quote, CSVFormat.DEFAULT.getQuoteCharacter());
    this.escape = escape;
    this.numberOfSkippedLines = MoreObjects.firstNonNull(numberOfSkippedLines, 0);
    this.partitionStartByteRange = partitionStartByteRange;
    this.partitionEndByteRange = partitionEndByteRange;
    this.isLastWorker = isLastWorker;

    maxByteRange = ((AmazonS3Source) source).getFileSize();
    onLastRow = false;
    finishedReadingLastRow = false;
    flagAsIncomplete = false;
    flagAsRangeSelected = true;
  }

  /**
   * Construct a fragment whose byte range is derived at initialization time from this worker's
   * position in {@code workerIds}.
   */
  public CSVFileScanFragment(
      final AmazonS3Source source,
      final Schema schema,
      final int[] workerIds,
      @Nullable final Character delimiter,
      @Nullable final Character quote,
      @Nullable final Character escape,
      @Nullable final Integer numberOfSkippedLines) {
    this.source = Preconditions.checkNotNull(source, "source");
    this.schema = Preconditions.checkNotNull(schema, "schema");
    this.workerIds = workerIds;
    this.delimiter = MoreObjects.firstNonNull(delimiter, CSVFormat.DEFAULT.getDelimiter());
    this.quote = MoreObjects.firstNonNull(quote, CSVFormat.DEFAULT.getQuoteCharacter());
    this.escape = escape;
    this.numberOfSkippedLines = MoreObjects.firstNonNull(numberOfSkippedLines, 0);

    maxByteRange = source.getFileSize();
    onLastRow = false;
    finishedReadingLastRow = false;
    flagAsIncomplete = false;
    flagAsRangeSelected = false;
  }
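
  /**
   * Builds and returns the next batch of tuples read from this worker's partition. If the final
   * row of the partition is truncated at the partition boundary, the missing bytes are fetched
   * past the boundary, in progressively larger chunks, until the row's terminating newline or the
   * end of the file is found.
   */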
  @Override
  protected TupleBatch fetchNextReady() throws IOException, DbException {
    long lineNumberBegin = lineNumber;
    boolean nextRecordTruncated = false;

    while ((buffer.numTuples() < buffer.getBatchSize()) && !flagAsIncomplete) {
      lineNumber++;
      if (parser.isClosed()) {
        break;
      }
      if (nextRecordTruncated) {
        onLastRow = true;
      }

      try {
        if (!onLastRow) {
          record = iterator.next();
        }
      } catch (Exception e) {
        /*
         * FIXME: If we hit an exception for a malformed row (in the case of quotes, for example),
         * we mark this as the last row.
         */
        if (e.getMessage() != null && e.getMessage().contains(truncatedQuoteErrorMessage)) {
          onLastRow = true;
        } else {
          throw e;
        }
      }
      try {
        if (!iterator.hasNext()) {
          onLastRow = true;
        }
      } catch (Exception e) {
        /*
         * FIXME: If we hit an exception for a malformed row (in the case of quotes, for example),
         * we mark nextRecordTruncated as true.
         */
        if (e.getMessage() != null && e.getMessage().contains(truncatedQuoteErrorMessage)) {
          nextRecordTruncated = true;
        } else {
          throw e;
        }
      }

      /*
       * If we are on the last row, read the entire row until we either hit a newline or have read
       * the entire file (this covers the case where a single large row is split among several
       * workers). If the last worker is reading its own last row, the row is simply marked as
       * finished.
       */
      if (onLastRow && !finishedReadingLastRow && !isLastWorker) {
        long trailingStartByte = partitionEndByteRange + 1;
        long trailingEndByte = trailingStartByte + byteOverlap - 1;
        long finalBytePositionFound = trailingStartByte;
        boolean finalLineFound = false;

        while (!finalLineFound) {
          /*
           * If we are within the max byte range, keep checking for a newline. Otherwise, we have
           * reached the end of the file, so mark finalLineFound as true.
           */
          if (trailingEndByte < maxByteRange) {
            InputStream trailingEndInputStream =
                source.getInputStream(trailingStartByte, trailingEndByte);
            int dataChar = trailingEndInputStream.read();
            while (dataChar != -1) {
              char currentChar = (char) dataChar;
              if (currentChar == '\n' || currentChar == '\r') {
                finalLineFound = true;
                break;
              }
              dataChar = trailingEndInputStream.read();
              finalBytePositionFound++;
            }
            trailingEndInputStream.close();
          } else {
            finalLineFound = true;
            finalBytePositionFound = maxByteRange;
          }

          /*
           * If we found the newline, re-create the parser over the full extent of the row.
           * Otherwise, double the byte overlap and extend the trailing range.
           */
          if (finalLineFound) {
            long characterPositionAtBeginningOfRecord =
                (record == null) ? 0 : record.getCharacterPosition();
            InputStream completePartitionStream =
                source.getInputStream(
                    adjustedStartByteRange + byteOffsetFromTruncatedRowAtStart,
                    finalBytePositionFound);
            BufferedReader reader =
                new BufferedReader(new InputStreamReader(completePartitionStream));
            reader.skip(characterPositionAtBeginningOfRecord);
            parser =
                new CSVParser(
                    reader,
                    CSVFormat.newFormat(delimiter).withQuote(quote).withEscape(escape),
                    0,
                    0);
            iterator = parser.iterator();
            record = iterator.next();
            if (nextRecordTruncated) {
              record = iterator.next();
            }
            finishedReadingLastRow = true;
          } else {
            trailingStartByte += byteOverlap;
            byteOverlap *= 2;
            trailingEndByte += byteOverlap;
          }
        }
      } else if (record.size() == schema.numColumns() && onLastRow && isLastWorker) {
        finishedReadingLastRow = true;
      }
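
      /*
       * The trailing search above grows exponentially. For example (assuming an initial overlap of
       * 1024 bytes, with "end" denoting partitionEndByteRange), it scans bytes [end + 1, end + 1024],
       * then [end + 1025, end + 3072], then [end + 3073, end + 7168], doubling the overlap each
       * round until a newline or the end of the file is found.
       */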
      /* If we are on the last row, only emit it once it has been read completely. */
      if (!onLastRow || finishedReadingLastRow) {
        for (int column = 0; column < schema.numColumns(); ++column) {
          String cell = record.get(column);
          try {
            switch (schema.getColumnType(column)) {
              case BOOLEAN_TYPE:
                if (Floats.tryParse(cell) != null) {
                  buffer.putBoolean(column, Floats.tryParse(cell) != 0);
                } else {
                  /* Store a value for both true and false so the column stays aligned. */
                  buffer.putBoolean(column, BooleanUtils.toBoolean(cell));
                }
                break;
              case DOUBLE_TYPE:
                buffer.putDouble(column, Double.parseDouble(cell));
                break;
              case FLOAT_TYPE:
                buffer.putFloat(column, Float.parseFloat(cell));
                break;
              case INT_TYPE:
                buffer.putInt(column, Integer.parseInt(cell));
                break;
              case LONG_TYPE:
                buffer.putLong(column, Long.parseLong(cell));
                break;
              case STRING_TYPE:
                buffer.putString(column, cell);
                break;
              case DATETIME_TYPE:
                buffer.putDateTime(column, DateTimeUtils.parse(cell));
                break;
              case BLOB_TYPE:
                throw new DbException(
                    "Ingesting BLOB via CSV isn't supported. Use the DownloadBlob expression.");
            }
          } catch (final IllegalArgumentException e) {
            throw new DbException(
                "Error parsing column "
                    + column
                    + " of row "
                    + lineNumber
                    + ", expected type: "
                    + schema.getColumnType(column)
                    + ", scanned value: "
                    + cell,
                e);
          }
        }

        /* Once we finish reading the last row, close the parser. */
        if (onLastRow) {
          parser.close();
        }
      }
    }
    LOGGER.debug("Scanned {} input lines", lineNumber - lineNumberBegin);

    return buffer.popAny();
  }

  @Override
  public void cleanup() throws IOException {
    parser = null;
    while (buffer.numTuples() > 0) {
      buffer.popAny();
    }
  }

  @Override
  protected Schema generateSchema() {
    return schema;
  }
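
  /**
   * Opens the stream over this worker's byte range and positions the parser at the first complete
   * row. When no explicit range was selected, the range is derived from this worker's position in
   * workerIds: for example, a 100-byte file split across 4 workers yields the partitions [0, 24],
   * [25, 49], [50, 74], and [75, 99], with the last worker also absorbing any remainder bytes.
   */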
  @Override
  protected void init(final ImmutableMap<String, Object> execEnvVars) throws DbException {
    buffer = new TupleBatchBuffer(getSchema());

    /* If no byte range was selected up front, derive this worker's partition from the worker list. */
    if (!flagAsRangeSelected) {
      int workerID = getNodeID();
      long fileSize = source.getFileSize();
      long currentPartitionSize = fileSize / workerIds.length;

      int workerIndex = 0;
      for (int i = 0; i < workerIds.length; i++) {
        if (workerID == workerIds[i]) {
          workerIndex = i;
        }
      }
      boolean isLastWorker = workerIndex == workerIds.length - 1;
      long startByteRange = currentPartitionSize * workerIndex;
      long endByteRange;
      if (isLastWorker) {
        endByteRange = fileSize - 1;
      } else {
        endByteRange = (currentPartitionSize * (workerIndex + 1)) - 1;
      }
      this.partitionStartByteRange = startByteRange;
      this.partitionEndByteRange = endByteRange;
      this.isLastWorker = isLastWorker;
    }

    try {
      adjustedStartByteRange = partitionStartByteRange;
      /*
       * Optimization: start one byte early so we can tell whether the partition begins exactly at
       * a row boundary, in which case no row is skipped.
       */
      if (partitionStartByteRange != 0) {
        adjustedStartByteRange -= 1;
      }
      partitionInputStream = source.getInputStream(adjustedStartByteRange, partitionEndByteRange);

      /* If the file is empty, mark the partition as incomplete. */
      if (maxByteRange == 0) {
        flagAsIncomplete = true;
      }

      /*
       * If this is not the first worker, read until we hit a newline character, to skip the
       * partial row at the beginning of the partition.
       */
      if (partitionStartByteRange != 0) {
        int firstChar = partitionInputStream.read();
        byteOffsetFromTruncatedRowAtStart = 1;
        if (firstChar != '\n' && firstChar != '\r') {
          boolean newLineFound = false;
          while (!newLineFound) {
            int currentChar = partitionInputStream.read();
            byteOffsetFromTruncatedRowAtStart++;
            if (currentChar == '\n' || currentChar == '\r' || currentChar == -1) {
              newLineFound = true;
              /*
               * If we never reach a newline (this can happen when the partition holds only part of
               * a row), mark the partition as incomplete.
               */
              if (currentChar == -1) {
                flagAsIncomplete = true;
              } else if (currentChar == '\r') {
                /* Consume the '\n' of a "\r\n" pair; otherwise re-open the stream after the '\r'. */
                currentChar = partitionInputStream.read();
                byteOffsetFromTruncatedRowAtStart++;
                if (currentChar != '\n') {
                  byteOffsetFromTruncatedRowAtStart--;
                  partitionInputStream =
                      source.getInputStream(
                          adjustedStartByteRange + byteOffsetFromTruncatedRowAtStart,
                          partitionEndByteRange);
                }
              }
            }
          }
        } else if (firstChar == '\r') {
          int currentChar = partitionInputStream.read();
          byteOffsetFromTruncatedRowAtStart++;
          if (currentChar != '\n') {
            byteOffsetFromTruncatedRowAtStart--;
            partitionInputStream =
                source.getInputStream(
                    adjustedStartByteRange + byteOffsetFromTruncatedRowAtStart,
                    partitionEndByteRange);
          }
        }
      }

      /* If skipping the partial row consumed the whole partition, mark it as incomplete. */
      if (adjustedStartByteRange + byteOffsetFromTruncatedRowAtStart - 1 == partitionEndByteRange) {
        flagAsIncomplete = true;
      }

      /* If the partition is incomplete, do not instantiate the parser. */
      if (!flagAsIncomplete) {
        parser =
            new CSVParser(
                new BufferedReader(new InputStreamReader(partitionInputStream)),
                CSVFormat.newFormat(delimiter).withQuote(quote).withEscape(escape));
        iterator = parser.iterator();

        /* FIXME: For now, we only support cases where all skipped lines fall within the first partition. */
        if (partitionStartByteRange == 0) {
          for (int i = 0; i < numberOfSkippedLines; i++) {
            iterator.next();
          }
        }
      }
    } catch (IOException e) {
      throw new DbException(e);
    }
  }
}
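
/*
 * Usage sketch (illustrative, not part of the class): the coordinator creates one fragment per
 * worker over the same S3 source, and each fragment derives its own byte range from its position
 * in workerIds during init(). Assuming a "source" and "schema" built elsewhere:
 *
 *   CSVFileScanFragment scan =
 *       new CSVFileScanFragment(source, schema, new int[] {1, 2, 3, 4}, null, null, null, null);
 *
 * The four null arguments select the defaults: comma delimiter, '"' quote, no escape character,
 * and no skipped lines.
 */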