package jeql.command.io;
import java.util.ArrayList;
import java.util.List;
import jeql.api.error.InvalidInputException;
/**
* Parses a single record in a CSV file into an array of {@link String}s.
*
*
* @author Martin Davis
*
*/
public class CSVRecordParser
{
private static final int CH_QUOTE = 1;
private static final int CH_WHITESPACE = 2;
private static final int CH_DATA = 3;
private static final int CH_SEPARATOR = 4;
private static final int CH_EOL = 5;
private static final int STATE_DATA = 1;
private static final int STATE_BEFORE = 2;
private static final int STATE_QUOTED_DATA = 3;
private static final int STATE_SEEN_QUOTE = 4;
private static final int STATE_AFTER = 5;
private static final String[] strArrayType = new String[0];
private char quote = '"';
private char colSep = ',';
private int loc = 0;
/**
* Controls whether the parsing strictly follows the CSV specification.
* If not in strict mode:
* <ul>
* <li>quotes which occur in the middle of fields are simply scanned as data
* </ul>
*/
private boolean isStrictMode = false;
public CSVRecordParser() {
}
public void setColSep(char separator)
{
this.colSep = separator;
}
/**
*
* @param line
* @return
* @throws IllegalArgumentException if the parsing of a field fails
*/
public String[] parse(String line)
{
loc = 0;
List vals = new ArrayList();
int lineLen = line.length();
while (loc < lineLen) {
vals.add(parseField(line));
}
return (String[]) vals.toArray(strArrayType);
}
private String parseField(String line)
{
StringBuffer data = new StringBuffer();
int state = STATE_BEFORE;
while (true) {
int category = CH_EOL;
if (loc < line.length())
category = categorize(line.charAt(loc));
switch (state) {
case STATE_BEFORE:
switch (category) {
case CH_WHITESPACE:
loc++;
break;
case CH_QUOTE:
loc++;
state = STATE_QUOTED_DATA;
break;
case CH_SEPARATOR:
loc++;
return "";
case CH_DATA:
data.append(line.charAt(loc));
state = STATE_DATA;
loc++;
break;
case CH_EOL:
return null;
}
break;
case STATE_DATA:
switch (category) {
case CH_SEPARATOR:
case CH_EOL:
loc++;
return data.toString();
case CH_QUOTE:
if (isStrictMode) {
throw new InvalidInputException("Malformed field - quote not at beginning of field");
}
else {
data.append(line.charAt(loc));
loc++;
}
break;
case CH_WHITESPACE:
case CH_DATA:
data.append(line.charAt(loc));
loc++;
break;
}
break;
case STATE_QUOTED_DATA:
switch (category) {
case CH_QUOTE:
loc++;
state = STATE_SEEN_QUOTE;
break;
case CH_SEPARATOR:
case CH_WHITESPACE:
case CH_DATA:
data.append(line.charAt(loc));
loc++;
break;
case CH_EOL:
return data.toString();
}
break;
case STATE_SEEN_QUOTE:
switch (category) {
case CH_QUOTE:
// double quote - add to value
loc++;
data.append('"');
state = STATE_QUOTED_DATA;
break;
case CH_SEPARATOR:
case CH_EOL:
// at end of field
loc++;
return data.toString();
case CH_WHITESPACE:
loc++;
state = STATE_AFTER;
case CH_DATA:
throw new InvalidInputException("Malformed field - quote not at end of field");
}
break;
case STATE_AFTER:
switch (category) {
case CH_QUOTE:
throw new InvalidInputException("Malformed field - unexpected quote");
case CH_EOL:
case CH_SEPARATOR:
// at end of field
loc++;
return data.toString();
case CH_WHITESPACE:
// skip trailing whitespace
loc++;
break;
case CH_DATA:
throw new InvalidInputException("Malformed field - unexpected data after quote");
}
}
}
}
public int categorize(char c) {
switch (c) {
case ' ':
case '\r':
case 0xff:
case '\n':
return CH_WHITESPACE;
default:
if (c == quote) {
return CH_QUOTE;
} else if (c == colSep) {
return CH_SEPARATOR;
}
else if ('!' <= c && c <= '~') {
return CH_DATA;
} else if (0x00 <= c && c <= 0x20) {
return CH_WHITESPACE;
} else if (Character.isWhitespace(c)) {
return CH_WHITESPACE;
} else {
return CH_DATA;
}
}
}
}