// CsvParser.java example
//
// Explorer
// appinventor-sources-master
// - appinventor
// -*- mode: java; c-basic-offset: 2; -*-
// Copyright 2009-2011 Google, All Rights reserved
// Copyright 2011-2012 MIT, All rights reserved
// Released under the Apache License, Version 2.0
// http://www.apache.org/licenses/LICENSE-2.0

package com.google.appinventor.server.util;

import com.google.common.base.Charsets;
import com.google.common.collect.Lists;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.regex.Pattern;

/**
 *
 * Note: this file was copied, more or less verbatim, from:
 *   java/com/google/collaboration/tables/util/CsvParser.java
 * by Debby Wallach on 7/20/2010.
 *
 * More efficient re-implementation of CsvQuotedParser. This is about 5X faster on large files. A
 * CSV cell can be quoted or unquoted. A quoted cell escapes quotes by doubling them. Whitespace
 * between the closing quote and the delimiter is ignored. An unquoted cell ends at the
 * delimiter. The delimiter is comma or newline (\n). A carriage return (\r) before the newline is
 * part of the delimiter; a lone carriage return is also accepted as a row delimiter.
 *
 * Quotes are handled subtly differently in CsvQuotedParser. There, quoted content can appear in
 * unquoted cells. This is not
 * <a href="http://en.wikipedia.org/wiki/Comma-separated_values">standard</a> and hampers error
 * reporting. Consider an unquoted cell value {@code 17" screen}. By allowing quoted content inside
 * cells the parser has to keep looking for the matching end quote. By restricting quotes to cell
 * boundaries many accidental quotes can be reported immediately.
 *
 * <p>Each call to {@link #next()} returns one row as a list of cell values. Read errors and
 * syntax errors are not thrown from {@link #hasNext()} or {@link #next()}; they end the iteration
 * and are remembered, so callers should invoke {@link #throwAnyProblem()} once iteration stops.
 *
 */
public final class CsvParser implements Iterator<List<String>> {

  /**
   * Escaped quotes in quoted cells are doubled.
   */
  private static final Pattern ESCAPED_QUOTE_PATTERN = Pattern.compile("\"\"");

  /**
   * Character buffer for cell parsing. The size limits the largest parsable cell. Specifically, if
   * an unquoted cell and its trailing delimiter exceed this limit, the cell will be split at the
   * limit. Moreover, a quoted large cell will cause a syntax error as when we reach end-of-file
   * without reading a closing quote.
   */
  private final char[] buf = new char[10240];

  private final Reader in;

  /**
   * The beginning of the currently parsed cell in {@code buf}. Everything before it is discarded
   * during compaction. The beginning includes the quote for a quoted cell.
   */
  private int pos;

  /**
   * The end of valid content in {@code buf}.
   */
  private int limit;

  /**
   * Indicates whether more content might be in the reader.
   */
  private boolean opened = true;

  /**
   * Length of a successfully parsed cell. For a quoted cell this includes the closing quote. Set
   * whenever parsing of a cell succeeds. The value should be ignored when cell parsing fails, but
   * is set to -1 to help debugging.
   */
  private int cellLength = -1;

  /**
   * Length of a successfully parsed cell including its trailing delimiter. Set whenever parsing of
   * a cell with trailing delimiter succeeds. The value should be ignored when cell parsing fails,
   * but is set to -1 to help debugging. It is reset to -1 once the cell has been consumed, so a
   * non-negative value means that a parsed cell is waiting at {@code pos}.
   */
  private int delimitedCellLength = -1;

  /**
   * Last exception encountered. Saved here to properly implement {@code Iterator}.
   */
  private IOException lastException;

  /**
   * Number of characters consumed from the reader that are no longer held in {@code buf}, either
   * because they were skipped or because they were discarded during compaction.
   */
  private long previouslyRead;

  /**
   * Creates a parser that reads UTF-8 encoded CSV from the given stream.
   *
   * @param in the stream to parse; it is not closed by this class
   */
  public CsvParser(InputStream in) {
    this.in = new InputStreamReader(in, StandardCharsets.UTF_8);
  }

  /**
   * Discards up to {@code charPosition} characters from the input, stopping early at end of
   * input. The skipped characters are counted by {@link #getCharPosition()}.
   *
   * <p>NOTE(review): the parse buffer is used as scratch space here, so this looks intended to be
   * called before any row is parsed -- confirm against callers.
   *
   * @param charPosition number of characters to skip; values {@code <= 0} skip nothing
   * @throws IOException if reading from the underlying stream fails
   */
  public void skip(long charPosition) throws IOException {
    while (charPosition > 0) {
      // Clamp in long arithmetic before narrowing: casting first turns counts above
      // Integer.MAX_VALUE into zero (endless loop) or a negative length (exception).
      int chunk = (int) Math.min(charPosition, (long) buf.length);
      int n = in.read(buf, 0, chunk);
      if (n < 0) {
        break;
      }
      previouslyRead += n;
      charPosition -= n;
    }
  }

  /**
   * Returns whether another row can be read. Returns {@code false} at end of input and also when
   * the upcoming cell is malformed or reading failed; use {@link #throwAnyProblem()} to tell the
   * cases apart.
   */
  @Override
  public boolean hasNext() {
    if (limit == 0) {
      fill();
    }
    return (pos < limit || indexAfterCompactionAndFilling(pos) < limit) && lookingAtCell();
  }

  /**
   * Returns the cells of the next row. Quoted cells are returned without their surrounding
   * quotes and with doubled quotes collapsed. If a later cell of the row is malformed, the cells
   * parsed so far are returned and the problem is kept for {@link #throwAnyProblem()}.
   *
   * @throws NoSuchElementException if there is no further row
   */
  @Override
  public List<String> next() {
    // Honor the Iterator contract even when the caller did not call hasNext() first.
    if (delimitedCellLength < 0 && !hasNext()) {
      throw new NoSuchElementException();
    }
    List<String> result = new ArrayList<String>();
    boolean trailingComma;
    boolean haveMoreData;
    do {
      // Invariant: pos < limit && lookingAtCell() from hasNext() or previous iteration
      if (buf[pos] != '"') {
        result.add(new String(buf, pos, cellLength));
      } else {
        // Strip the surrounding quotes, then unescape doubled quotes.
        String cell = new String(buf, pos + 1, cellLength - 2);
        result.add(ESCAPED_QUOTE_PATTERN.matcher(cell).replaceAll("\""));
      }
      // Only a comma continues the row; any other delimiter (or end of input) ends it.
      trailingComma = delimitedCellLength > 0 && buf[pos + delimitedCellLength - 1] == ',';
      pos += delimitedCellLength;
      delimitedCellLength = cellLength = -1;
      haveMoreData = pos < limit || indexAfterCompactionAndFilling(pos) < limit;
    } while (trailingComma && haveMoreData && lookingAtCell());
    return result;
  }

  /**
   * Returns the number of characters consumed so far, counting skipped characters and all
   * completely parsed rows.
   */
  public long getCharPosition() {
    return previouslyRead + pos;
  }

  /**
   * Compacts and fills the buffer. Returns the possibly shifted index for the given index.
   */
  private int indexAfterCompactionAndFilling(int i) {
    if (pos > 0) {
      i = compact(i);
    }
    fill();
    return i;
  }

  /**
   * Moves the contents between {@code pos} and {@code limit} to the beginning of {@code buf}.
   * Returns the new position of the given index.
   */
  private int compact(int i) {
    int oldPos = pos;
    pos = 0;
    int toMove = limit - oldPos;
    if (toMove > 0) {
      System.arraycopy(buf, oldPos, buf, 0, toMove);
    }
    limit -= oldPos;
    previouslyRead += oldPos;
    return i - oldPos;
  }

  /**
   * Fills {@code buf} from the reader until the buffer is full or the input ends. A read failure
   * is recorded in {@code lastException} and treated like end of input.
   */
  private void fill() {
    int toFill = buf.length - limit;
    while (opened && toFill > 0) {
      try {
        int n = in.read(buf, limit, toFill);
        if (n == -1) {
          opened = false;
        } else {
          limit += n;
          toFill -= n;
        }
      } catch (IOException e) {
        lastException = e;
        opened = false;
      }
    }
  }

  /**
   * Parses the cell starting at {@code pos}. On success sets {@code cellLength} and
   * {@code delimitedCellLength} and returns {@code true}; on a syntax error records the problem
   * and returns {@code false}.
   */
  private boolean lookingAtCell() {
    return (buf[pos] == '"' ? findUnescapedEndQuote(pos + 1) : findUnquotedCellEnd(pos));
  }

  /**
   * Scans a quoted cell from index {@code i} (just past the opening quote) for its closing quote,
   * skipping doubled quotes, and then checks what follows the closing quote.
   */
  private boolean findUnescapedEndQuote(int i) {
    for (; i < limit || (i = indexAfterCompactionAndFilling(i)) < limit; i++) {
      if (buf[i] == '"') {
        i = checkedIndex(i + 1);
        // A quote that is not followed by a second quote closes the cell.
        if (i == limit || buf[i] != '"') {
          cellLength = i - pos;
          return findDelimOrEnd(i);
        }
      }
    }
    lastException = new IOException("Syntax Error. unclosed quoted cell");
    return false;
  }

  /**
   * Determines that we are looking at the end of a cell, tolerating some whitespace.  Called after
   * consuming the end quote of a quoted cell.
   */
  private boolean findDelimOrEnd(int i) {
    for (; i < limit || (i = indexAfterCompactionAndFilling(i)) < limit; i++) {
      switch (buf[i]) {
        case ' ':
        case '\t':
          // whitespace after closing quote
          continue;
        case '\r':
          // In standard CSV \r\n terminates a cell. However, Macintosh uses one \r instead of \n.
          delimitedCellLength = indexAfterCarriageReturn(i) - pos;
          return true;
        case ',':
        case '\n':
          delimitedCellLength = (checkedIndex(i + 1) - pos);
          return true;
        default:
          lastException = new IOException(
              "Syntax Error: non-whitespace between closing quote and delimiter or end");
          return false;
      }
    }
    // Ran out of buffered input: the cell ends where the valid content ends.
    delimitedCellLength = (limit - pos);
    return true;
  }

  /**
   * Returns the index just past the line terminator that starts with the carriage return at
   * index {@code i}, consuming a directly following newline if one is buffered. The result is
   * relative to the possibly compacted buffer.
   */
  private int indexAfterCarriageReturn(int i) {
    int j = checkedIndex(i + 1);
    // Only inspect buf[j] when it holds valid input: beyond limit the buffer contains stale
    // characters, and j can equal buf.length when the buffer is completely full.
    return (j < limit && buf[j] == '\n') ? checkedIndex(j + 1) : j;
  }

  /**
   * Returns the given index, after trying to read its corresponding buffered character. The
   * resulting index will be shifted if compaction was triggered.
   */
  private int checkedIndex(int i) {
    return i < limit ? i : indexAfterCompactionAndFilling(i);
  }

  /**
   * Scans an unquoted cell starting at index {@code i} for its delimiter. A quote inside the cell
   * is a syntax error.
   */
  private boolean findUnquotedCellEnd(int i) {
    for (; i < limit || (i = indexAfterCompactionAndFilling(i)) < limit; i++) {
      switch (buf[i]) {
        case ',':
        case '\n':
          cellLength = i - pos;
          delimitedCellLength = cellLength + 1;
          return true;
        case '\r':
          // In standard CSV \r\n terminates a cell. However, Macintosh uses one \r instead of \n.
          cellLength = i - pos;
          delimitedCellLength = indexAfterCarriageReturn(i) - pos;
          return true;
        case '"':
          lastException = new IOException("Syntax Error: quote in unquoted cell");
          return false;
      }
    }
    // Ran out of buffered input: the cell ends where the valid content ends.
    delimitedCellLength = cellLength = (limit - pos);
    return true;
  }

  /**
   * Not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }

  /**
   * Throws the most recent read or syntax problem, if any was recorded during parsing.
   *
   * @throws IOException the recorded problem
   */
  public void throwAnyProblem() throws IOException {
    if (lastException != null) {
      throw lastException;
    }
  }
}