// -*- mode: java; c-basic-offset: 2; -*-
// Copyright 2009-2011 Google, All Rights reserved
// Copyright 2011-2012 MIT, All rights reserved
// Released under the Apache License, Version 2.0
// http://www.apache.org/licenses/LICENSE-2.0
package com.google.appinventor.server.util;
import com.google.common.base.Charsets;
import com.google.common.collect.Lists;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.regex.Pattern;
/**
 * Streaming CSV parser that yields one row per call to {@link #next()}; a row is the list of its
 * cell values.
 *
 * <p>Note: this file was copied, more or less verbatim, from:
 * java/com/google/collaboration/tables/util/CsvParser.java
 * by Debby Wallach on 7/20/2010.
 *
 * <p>More efficient re-implementation of CsvQuotedParser. This is about 5X faster on large files. A
 * CSV cell can be quoted or unquoted. A quoted cell escapes quotes by doubling them. Whitespace
 * (space or tab) between the closing quote and the delimiter is ignored. An unquoted cell ends at
 * the delimiter. The delimiter is comma or newline (\n). A carriage return (\r) before the newline
 * is part of the delimiter; a carriage return that is not followed by a newline ends the row on
 * its own.
 *
 * <p>Quotes are handled subtly differently in CsvQuotedParser. There, quoted content can appear in
 * unquoted cells. This is not
 * <a href="http://en.wikipedia.org/wiki/Comma-separated_values">standard</a> and hampers error
 * reporting. Consider an unquoted cell value {@code 17" screen}. By allowing quoted content inside
 * cells the parser has to keep looking for the matching end quote. By restricting quotes to cell
 * boundaries many accidental quotes can be reported immediately.
 *
 * <p>Neither {@link #hasNext()} nor {@link #next()} can throw a checked exception, so I/O failures
 * and syntax errors are recorded instead: iteration stops ({@code hasNext()} returns
 * {@code false}) and the recorded problem is rethrown by {@link #throwAnyProblem()}. Callers
 * should invoke {@code throwAnyProblem()} once iteration has ended.
 */
public final class CsvParser implements Iterator<List<String>> {
  /**
   * Escaped quotes in quoted cells are doubled.
   */
  private static final Pattern ESCAPED_QUOTE_PATTERN = Pattern.compile("\"\"");

  /**
   * Character buffer for cell parsing. The size limits the largest parsable cell. Specifically, if
   * an unquoted cell and its trailing delimiter exceed this limit, the cell will be split at the
   * limit. Moreover, a quoted large cell will cause a syntax error as when we reach end-of-file
   * without reading a closing quote.
   */
  private final char[] buf = new char[10240];

  /** Source of characters; the input stream decoded as UTF-8. */
  private final Reader in;

  /**
   * The beginning of the currently parsed cell in {@code buf}. Everything before it is discarded
   * during compaction. The beginning includes the quote for a quoted cell.
   */
  private int pos;

  /**
   * The end of valid content in {@code buf}.
   */
  private int limit;

  /**
   * Indicates whether more content might be in the reader.
   */
  private boolean opened = true;

  /**
   * Length of a successfully parsed cell. For a quoted cell this includes the opening and the
   * closing quote. Set whenever parsing of a cell succeeds. The value should be ignored when cell
   * parsing fails, but is set to -1 to help debugging.
   */
  private int cellLength = -1;

  /**
   * Length of a successfully parsed cell including its trailing delimiter. Set whenever parsing of
   * a cell with trailing delimiter succeeds, and reset to -1 once the cell has been consumed by
   * {@link #next()}. A negative value therefore means "no cell has been looked at yet".
   */
  private int delimitedCellLength = -1;

  /**
   * First problem encountered (I/O failure or syntax error). Saved here to properly implement
   * {@code Iterator}, whose methods cannot throw checked exceptions.
   */
  private IOException lastException;

  /**
   * Number of characters consumed from the reader that are no longer held in {@code buf}: those
   * dropped by {@link #skip(long)} plus those discarded by compaction.
   */
  private long previouslyRead;

  /**
   * Creates a parser reading UTF-8 encoded CSV from the given stream.
   *
   * @param in stream supplying the CSV bytes; it is not closed by this class
   */
  public CsvParser(InputStream in) {
    this.in = new InputStreamReader(in, Charset.forName("UTF-8"));
  }

  /**
   * Reads and discards up to {@code charPosition} characters from the input, stopping early at
   * end-of-input. The discarded characters count toward {@link #getCharPosition()}.
   *
   * <p>The parse buffer is used as scratch space, so this must be called before parsing starts;
   * any buffered but not yet parsed content would be overwritten.
   *
   * @param charPosition number of characters to skip; non-positive values skip nothing
   * @throws IOException if reading from the underlying stream fails
   */
  public void skip(long charPosition) throws IOException {
    while (charPosition > 0) {
      // Take the minimum in long arithmetic: narrowing charPosition to int first would turn
      // values above Integer.MAX_VALUE into zero or negative read lengths.
      int chunk = (int) Math.min(charPosition, (long) buf.length);
      int n = in.read(buf, 0, chunk);
      if (n < 0) {
        break;
      }
      previouslyRead += n;
      charPosition -= n;
    }
  }

  /**
   * Returns whether another row can be parsed. Returns {@code false} at end-of-input and also when
   * the next cell is malformed or reading failed; use {@link #throwAnyProblem()} to tell these
   * cases apart.
   */
  @Override
  public boolean hasNext() {
    if (limit == 0) {
      fill();
    }
    return (pos < limit || indexAfterCompactionAndFilling(pos) < limit) && lookingAtCell();
  }

  /**
   * Parses and returns the next row. Quoted cells are returned without their surrounding quotes
   * and with doubled quotes collapsed. A row ends at a newline, at a carriage return, at
   * end-of-input, or at the first cell that fails to parse. A comma that is the very last
   * character of the input does not produce a trailing empty cell.
   *
   * @return the cells of the row, in order
   * @throws NoSuchElementException if no further row is available (see {@link #hasNext()})
   */
  @Override
  public List<String> next() {
    // Look ahead ourselves when the caller did not call hasNext() first.
    if (delimitedCellLength < 0 && !hasNext()) {
      throw new NoSuchElementException("no more CSV rows");
    }
    List<String> result = new ArrayList<String>();
    boolean trailingComma;
    boolean haveMoreData;
    do {
      // Invariant: pos < limit && lookingAtCell() from hasNext() or previous iteration
      if (buf[pos] != '"') {
        result.add(new String(buf, pos, cellLength));
      } else {
        // Strip the opening and closing quote, then unescape doubled quotes.
        String cell = new String(buf, pos + 1, cellLength - 2);
        result.add(ESCAPED_QUOTE_PATTERN.matcher(cell).replaceAll("\""));
      }
      // Only a comma continues the row; newline, carriage return and end-of-input end it.
      trailingComma = delimitedCellLength > 0 && buf[pos + delimitedCellLength - 1] == ',';
      pos += delimitedCellLength;
      delimitedCellLength = cellLength = -1;
      haveMoreData = pos < limit || indexAfterCompactionAndFilling(pos) < limit;
    } while (trailingComma && haveMoreData && lookingAtCell());
    return result;
  }

  /**
   * Returns the number of characters consumed so far, counted from the start of the input and
   * including characters dropped by {@link #skip(long)}.
   */
  public long getCharPosition() {
    return previouslyRead + pos;
  }

  /**
   * Compacts and fills the buffer. Returns the possibly shifted index for the given index.
   */
  private int indexAfterCompactionAndFilling(int i) {
    if (pos > 0) {
      i = compact(i);
    }
    fill();
    return i;
  }

  /**
   * Moves the contents between {@code pos} and {@code limit} to the beginning of {@code buf}.
   * Returns the new position of the given index.
   */
  private int compact(int i) {
    int oldPos = pos;
    pos = 0;
    int toMove = limit - oldPos;
    if (toMove > 0) {
      System.arraycopy(buf, oldPos, buf, 0, toMove);
    }
    limit -= oldPos;
    previouslyRead += oldPos;
    return i - oldPos;
  }

  /**
   * Fills {@code buf} from the reader until the buffer is full or the reader is exhausted. A read
   * failure is recorded in {@code lastException} and treated like end-of-input.
   */
  private void fill() {
    int toFill = buf.length - limit;
    while (opened && toFill > 0) {
      try {
        int n = in.read(buf, limit, toFill);
        if (n == -1) {
          opened = false;
        } else {
          limit += n;
          toFill -= n;
        }
      } catch (IOException e) {
        lastException = e;
        opened = false;
      }
    }
  }

  /**
   * Tries to parse the cell starting at {@code pos}. On success sets {@code cellLength} and
   * {@code delimitedCellLength} and returns {@code true}; on a syntax error records the problem
   * and returns {@code false}. Requires {@code pos < limit}.
   */
  private boolean lookingAtCell() {
    return (buf[pos] == '"' ? findUnescapedEndQuote(pos + 1) : findUnquotedCellEnd(pos));
  }

  /**
   * Scans a quoted cell for its closing quote, starting at index {@code i} just after the opening
   * quote. A quote immediately followed by another quote is an escaped quote and is skipped.
   */
  private boolean findUnescapedEndQuote(int i) {
    for (; i < limit || (i = indexAfterCompactionAndFilling(i)) < limit; i++) {
      if (buf[i] == '"') {
        i = checkedIndex(i + 1);
        if (i == limit || buf[i] != '"') {
          cellLength = i - pos;
          return findDelimOrEnd(i);
        }
      }
    }
    return syntaxError("Syntax Error. unclosed quoted cell");
  }

  /**
   * Determines that we are looking at the end of a cell, tolerating some whitespace. Called after
   * consuming the end quote of a quoted cell.
   */
  private boolean findDelimOrEnd(int i) {
    for (; i < limit || (i = indexAfterCompactionAndFilling(i)) < limit; i++) {
      switch (buf[i]) {
        case ' ':
        case '\t':
          // whitespace after closing quote
          continue;
        case '\r':
          // In standard CSV \r\n terminates a cell. However, Macintosh uses one \r instead of \n.
          delimitedCellLength = indexAfterCarriageReturn(i) - pos;
          return true;
        case ',':
        case '\n':
          delimitedCellLength = (checkedIndex(i + 1) - pos);
          return true;
        default:
          return syntaxError(
              "Syntax Error: non-whitespace between closing quote and delimiter or end");
      }
    }
    // End of input directly after the closing quote (and optional whitespace).
    delimitedCellLength = (limit - pos);
    return true;
  }

  /**
   * Returns the given index, after trying to read its corresponding buffered character. The
   * resulting index will be shifted if compaction was triggered. The result equals {@code limit}
   * when no character is available at that index.
   */
  private int checkedIndex(int i) {
    return i < limit ? i : indexAfterCompactionAndFilling(i);
  }

  /**
   * Returns the index just past the line terminator that starts with the carriage return at index
   * {@code cr}: past the following newline if there is one, otherwise past the carriage return
   * itself. The index is relative to the buffer after any compaction this call triggers.
   */
  private int indexAfterCarriageReturn(int cr) {
    int j = checkedIndex(cr + 1);
    // j == limit means nothing follows the \r; buf[j] would then be stale data or out of bounds.
    return (j < limit && buf[j] == '\n') ? checkedIndex(j + 1) : j;
  }

  /**
   * Scans an unquoted cell for its end, starting at index {@code i}. The cell ends at a comma,
   * newline, carriage return, or the end of the buffered input. A quote inside an unquoted cell is
   * a syntax error.
   */
  private boolean findUnquotedCellEnd(int i) {
    for (; i < limit || (i = indexAfterCompactionAndFilling(i)) < limit; i++) {
      switch (buf[i]) {
        case ',':
        case '\n':
          cellLength = i - pos;
          delimitedCellLength = cellLength + 1;
          return true;
        case '\r':
          // In standard CSV \r\n terminates a cell. However, Macintosh uses one \r instead of \n.
          // i - pos is unaffected by compaction, so it is safe to compute before looking ahead.
          cellLength = i - pos;
          delimitedCellLength = indexAfterCarriageReturn(i) - pos;
          return true;
        case '"':
          return syntaxError("Syntax Error: quote in unquoted cell");
      }
    }
    // No delimiter found: the cell extends to the end of the buffered input.
    delimitedCellLength = cellLength = (limit - pos);
    return true;
  }

  /**
   * Records a syntax error unless an earlier problem (for example a read failure that truncated
   * the input) is already recorded, so the root cause is not masked. Always returns
   * {@code false} so callers can {@code return syntaxError(...)}.
   */
  private boolean syntaxError(String message) {
    if (lastException == null) {
      lastException = new IOException(message);
    }
    return false;
  }

  /**
   * Not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }

  /**
   * Rethrows the problem recorded during iteration, if any. Does nothing when parsing has
   * encountered no I/O failure or syntax error.
   *
   * @throws IOException the recorded I/O failure or syntax error
   */
  public void throwAnyProblem() throws IOException {
    if (lastException != null) {
      throw lastException;
    }
  }
}