package water.parser;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.*;
import water.fvec.ParseTime;
import water.util.Log;
public class CsvParser extends CustomParser {
/* Constant to specify that separator is not specified. */
public static final byte AUTO_SEP = -1;
// Decimal point recognized inside numeric tokens. Note: an instance field, not static.
public final byte CHAR_DECIMAL_SEPARATOR = '.';
// Column separator used by this parser instance; copied from the ParserSetup in the constructor.
public final byte CHAR_SEPARATOR;
// Byte value 1 ('^A'), the Hive table column separator. Quoting is not honored when this is the separator.
public static final byte HIVE_SEP = 1;
// ---- State ids of the byte-at-a-time state machine in parallelParse ----
private static final byte SKIP_LINE = 0;
private static final byte EXPECT_COND_LF = 1;
private static final byte EOL = 2;
private static final byte TOKEN = 3;
private static final byte COND_QUOTED_TOKEN = 4;
private static final byte NUMBER = 5;
private static final byte NUMBER_SKIP = 6;
private static final byte NUMBER_SKIP_NO_DOT = 7;
private static final byte NUMBER_FRACTION = 8;
private static final byte NUMBER_EXP = 9;
private static final byte NUMBER_EXP_NEGATIVE = 10;
private static final byte NUMBER_EXP_START = 11;
private static final byte NUMBER_END = 12;
private static final byte STRING = 13;
private static final byte COND_QUOTE = 14;
private static final byte SEPARATOR_OR_EOL = 15;
private static final byte WHITESPACE_BEFORE_TOKEN = 16;
private static final byte STRING_END = 17;
private static final byte COND_QUOTED_NUMBER_END = 18;
private static final byte POSSIBLE_EMPTY_LINE = 19;
private static final byte POSSIBLE_CURRENCY = 20;
// Once the accumulated mantissa reaches this value the parser stops appending digits,
// because multiplying by 10 again could overflow a long.
private static final long LARGEST_DIGIT_NUMBER = Long.MAX_VALUE/10;
/**
 * Builds a CSV parser for the supplied setup.
 * The column separator of this instance is taken from {@code setup._separator}.
 *
 * @param setup parser configuration handed to the superclass and read for the separator
 */
public CsvParser(ParserSetup setup) {
  super(setup);
  this.CHAR_SEPARATOR = setup._separator;
}
/**
 * Creates a new parser from a clone of this parser's setup.
 * When this parser has no setup, {@code null} is handed to the constructor instead.
 */
public CsvParser clone() {
  if (_setup == null) {
    return new CsvParser(null);
  }
  return new CsvParser(_setup.clone());
}
/** Always reports {@code true}: this parser implements {@code parallelParse}. */
@Override
public boolean parallelParseSupported() {
  return true;
}
/**
 * Parses chunk {@code cidx} of the input with a byte-at-a-time state machine and
 * reports every parsed column and line to {@code dout}.
 *
 * <p>Control flow inside the main loop: {@code break NEXT_CHAR} consumes the current
 * byte and moves on to the next one, while {@code continue MAIN_LOOP} re-examines the
 * same byte in the (usually just changed) state. Several states deliberately fall
 * through into the next {@code case}.
 *
 * <p>When the end of the chunk is reached in the middle of a line, the bytes of chunk
 * {@code cidx+1} are fetched and parsing continues there until that line ends. If no
 * further data is available the parser behaves as if a line feed had been seen.
 *
 * @param cidx index of the chunk to parse
 * @param din  source of chunk bytes and of the per-chunk start offsets
 * @param dout receiver of parsed columns and lines
 * @return {@code dout}
 */
@SuppressWarnings("fallthrough")
@Override public final DataOut parallelParse(int cidx, final CustomParser.DataIn din, final CustomParser.DataOut dout) {
// Scratch string token; it is re-pointed at the current buffer for every string column.
ValueString _str = new ValueString();
byte[] bits = din.getChunkData(cidx);
if( bits == null ) return dout;
int offset = din.getChunkDataStart(cidx); // General cursor into the giant array of bytes
final byte[] bits0 = bits; // Bits for chunk0
boolean firstChunk = true; // Have not rolled into the 2nd chunk
byte[] bits1 = null; // Bits for chunk1, loaded lazily.
// Starting state. Are we skipping the first (partial) line, or not? Skip
// a header line, or a partial line if we're in the 2nd and later chunks.
int state = (_setup._header || cidx > 0) ? SKIP_LINE : WHITESPACE_BEFORE_TOKEN;
// If handed a skipping offset, then it points just past the prior partial line.
if( offset >= 0 ) state = WHITESPACE_BEFORE_TOKEN;
else offset = 0; // Else start skipping at the start
// Active quote character (0 when not inside a quoted token).
int quotes = 0;
// Mantissa of the numeric token being parsed.
long number = 0;
// While the mantissa is being read, exp holds the sign (-1 or 1); it is reset to 0
// once the mantissa is complete and from then on holds the decimal exponent.
int exp = 0;
int sgn_exp = 1;
boolean decimal = false;
// First holds the offset of the decimal point, later the count of fraction digits.
int fractionDigits = 0;
int tokenStart = 0; // used for numeric token to backtrace if not successful
int colIdx = 0;
// NOTE(review): no bounds check here; an empty chunk or an offset past the end would throw - confirm callers prevent that.
byte c = bits[offset];
// skip comments for the first chunk (or if not a chunk)
if( cidx == 0 ) {
while (c == '#' || c == '@'/*also treat as comments leading '@' from ARFF format*/) {
// Advance to the line terminator, then step over it (CR LF counts as one terminator).
while ((offset < bits.length) && (bits[offset] != CHAR_CR) && (bits[offset ] != CHAR_LF)) ++offset;
if ((offset+1 < bits.length) && (bits[offset] == CHAR_CR) && (bits[offset+1] == CHAR_LF)) ++offset;
++offset;
if (offset >= bits.length)
return dout;
c = bits[offset];
}
}
dout.newLine();
MAIN_LOOP:
while (true) {
NEXT_CHAR:
switch (state) {
// ---------------------------------------------------------------------
// Discard bytes until an end-of-line byte is seen.
case SKIP_LINE:
if (isEOL(c)) {
state = EOL;
} else {
break NEXT_CHAR;
}
continue MAIN_LOOP;
// ---------------------------------------------------------------------
// A CR was just seen: swallow a directly following LF, otherwise re-examine this byte.
case EXPECT_COND_LF:
state = POSSIBLE_EMPTY_LINE;
if (c == CHAR_LF)
break NEXT_CHAR;
continue MAIN_LOOP;
// ---------------------------------------------------------------------
// Inside a string token: extend it until a quote, or (when unquoted) a separator or EOL.
case STRING:
if (c == quotes) {
state = COND_QUOTE;
break NEXT_CHAR;
}
if (!isEOL(c) && ((quotes != 0) || (c != CHAR_SEPARATOR))) {
_str.addChar();
break NEXT_CHAR;
}
// fallthrough to STRING_END
// ---------------------------------------------------------------------
case STRING_END:
// Skip spaces after the token (unless space itself is the separator).
if ((c != CHAR_SEPARATOR) && (c == CHAR_SPACE))
break NEXT_CHAR;
// we have parsed the string enum correctly
if((_str.get_off() + _str.get_length()) > _str.get_buf().length){ // crossing chunk boundary
assert _str.get_buf() != bits;
_str.addBuff(bits);
}
// A string equal to the column's configured NA string is stored as an invalid (missing) value.
if(_setup._types != null && colIdx < _setup._types.length && _str.equals(_setup._types[colIdx]._naStr))
dout.addInvalidCol(colIdx);
else
dout.addStrCol(colIdx, _str);
_str.set(null, 0, 0);
++colIdx;
state = SEPARATOR_OR_EOL;
// fallthrough to SEPARATOR_OR_EOL
// ---------------------------------------------------------------------
case SEPARATOR_OR_EOL:
if (c == CHAR_SEPARATOR) {
state = WHITESPACE_BEFORE_TOKEN;
break NEXT_CHAR;
}
if (c==CHAR_SPACE)
break NEXT_CHAR;
// fallthrough to EOL
// ---------------------------------------------------------------------
case EOL:
// A still-open quote at end of line invalidates the whole line.
if(quotes != 0){
System.err.println("Unmatched quote char " + ((char)quotes) + " " + (((_str.get_length()+1) < offset && _str.get_off() > 0)?new String(Arrays.copyOfRange(bits,_str.get_off()-1,offset)):""));
dout.invalidLine("Unmatched quote char " + ((char)quotes));
colIdx = 0;
quotes = 0;
}else if (colIdx != 0) {
dout.newLine();
colIdx = 0;
}
state = (c == CHAR_CR) ? EXPECT_COND_LF : POSSIBLE_EMPTY_LINE;
if( !firstChunk )
break MAIN_LOOP; // second chunk only does the first row
break NEXT_CHAR;
// ---------------------------------------------------------------------
// A '$' was just consumed: if a number follows, parse it as a number; otherwise the
// '$' becomes the first character of a string token.
case POSSIBLE_CURRENCY:
if (((c >= '0') && (c <= '9')) || (c == '-') || (c == CHAR_DECIMAL_SEPARATOR) || (c == '+')) {
state = TOKEN;
} else {
_str.set(bits,offset-1,0);
_str.addChar();
if (c == quotes) {
state = COND_QUOTE;
break NEXT_CHAR;
}
if ((quotes != 0) || ((!isEOL(c) && (c != CHAR_SEPARATOR)))) {
state = STRING;
} else {
state = STRING_END;
}
}
continue MAIN_LOOP;
// ---------------------------------------------------------------------
// At the start of a line: consecutive end-of-line bytes (empty lines) are skipped.
case POSSIBLE_EMPTY_LINE:
if (isEOL(c)) {
if (c == CHAR_CR)
state = EXPECT_COND_LF;
break NEXT_CHAR;
}
state = WHITESPACE_BEFORE_TOKEN;
// fallthrough to WHITESPACE_BEFORE_TOKEN
// ---------------------------------------------------------------------
case WHITESPACE_BEFORE_TOKEN:
// Leading spaces (and tabs, unless tab is the separator) are skipped.
if (c == CHAR_SPACE || (c == CHAR_TAB && CHAR_TAB!=CHAR_SEPARATOR)) {
break NEXT_CHAR;
} else if (c == CHAR_SEPARATOR) {
// we have empty token, store as NaN
dout.addInvalidCol(colIdx++);
break NEXT_CHAR;
} else if (isEOL(c)) {
dout.addInvalidCol(colIdx++);
state = EOL;
continue MAIN_LOOP;
}
// fallthrough to COND_QUOTED_TOKEN
// ---------------------------------------------------------------------
case COND_QUOTED_TOKEN:
state = TOKEN;
if( CHAR_SEPARATOR!=HIVE_SEP && // Only allow quoting in CSV not Hive files
((_setup._singleQuotes && c == CHAR_SINGLE_QUOTE) || (c == CHAR_DOUBLE_QUOTE))) {
assert (quotes == 0);
quotes = c;
break NEXT_CHAR;
}
// fallthrough to TOKEN
// ---------------------------------------------------------------------
// First byte of a token: decide between string, number and currency parsing.
case TOKEN:
if(_setup._types != null && colIdx < _setup._types.length && _setup._types[colIdx]._type == ParserSetup.Coltype.STR){
state = STRING; // Do not attempt a number parse, just do a string parse
_str.set(bits, offset, 0);
continue MAIN_LOOP;
} else if (((c >= '0') && (c <= '9')) || (c == '-') || (c == CHAR_DECIMAL_SEPARATOR) || (c == '+')) {
state = NUMBER;
number = 0;
fractionDigits = 0;
decimal = false;
tokenStart = offset;
// exp temporarily carries the sign of the mantissa (-1 or 1).
if (c == '-') {
exp = -1;
break NEXT_CHAR;
} else if(c == '+'){
exp = 1;
break NEXT_CHAR;
} else {
exp = 1;
}
// fallthrough
} else if (c == '$') {
state = POSSIBLE_CURRENCY;
break NEXT_CHAR;
} else {
state = STRING;
_str.set(bits, offset, 0);
continue MAIN_LOOP;
}
// fallthrough to NUMBER
// ---------------------------------------------------------------------
// Integer part of a number.
case NUMBER:
if ((c >= '0') && (c <= '9')) {
if (number >= LARGEST_DIGIT_NUMBER) state = NUMBER_SKIP;
else number = (number*10)+(c-'0');
break NEXT_CHAR;
} else if (c == CHAR_DECIMAL_SEPARATOR) {
state = NUMBER_FRACTION;
// Remember where the decimal point is; converted to a digit count later.
fractionDigits = offset;
decimal = true;
break NEXT_CHAR;
} else if ((c == 'e') || (c == 'E')) {
state = NUMBER_EXP_START;
sgn_exp = 1;
break NEXT_CHAR;
}
// Mantissa finished: apply the sign stored in exp and reset exp to a real exponent of 0.
if (exp == -1) {
number = -number;
}
exp = 0;
// fallthrough to COND_QUOTED_NUMBER_END
// ---------------------------------------------------------------------
// A number inside quotes must be followed by the matching closing quote.
case COND_QUOTED_NUMBER_END:
if ( c == quotes) {
state = NUMBER_END;
quotes = 0;
break NEXT_CHAR;
}
// fallthrough NUMBER_END
case NUMBER_END:
if (c == CHAR_SEPARATOR && quotes == 0) {
exp = exp - fractionDigits;
dout.addNumCol(colIdx,number,exp);
++colIdx;
// do separator state here too
state = WHITESPACE_BEFORE_TOKEN;
break NEXT_CHAR;
} else if (isEOL(c)) {
exp = exp - fractionDigits;
dout.addNumCol(colIdx,number,exp);
// do EOL here for speedup reasons
colIdx = 0;
dout.newLine();
state = (c == CHAR_CR) ? EXPECT_COND_LF : POSSIBLE_EMPTY_LINE;
if( !firstChunk )
break MAIN_LOOP; // second chunk only does the first row
break NEXT_CHAR;
} else if ((c == '%')) {
// A trailing percent sign scales the value by 10^-2.
state = NUMBER_END;
exp -= 2;
break NEXT_CHAR;
} else if ((c != CHAR_SEPARATOR) && ((c == CHAR_SPACE) || (c == CHAR_TAB))) {
state = NUMBER_END;
break NEXT_CHAR;
} else {
// Not a number after all: rewind to the start of the token and re-parse it as a string.
state = STRING;
offset = tokenStart-1;
_str.set(bits,tokenStart,0);
break NEXT_CHAR; // parse as String token now
}
// ---------------------------------------------------------------------
// Mantissa is full: further integer digits are dropped but each one raises the exponent.
case NUMBER_SKIP:
if ((c >= '0') && (c <= '9')) {
exp++;
break NEXT_CHAR;
} else if (c == CHAR_DECIMAL_SEPARATOR) {
state = NUMBER_SKIP_NO_DOT;
break NEXT_CHAR;
} else if ((c == 'e') || (c == 'E')) {
state = NUMBER_EXP_START;
sgn_exp = 1;
break NEXT_CHAR;
}
state = COND_QUOTED_NUMBER_END;
continue MAIN_LOOP;
// ---------------------------------------------------------------------
// Mantissa is full and the decimal point has been passed: further digits are ignored.
case NUMBER_SKIP_NO_DOT:
if ((c >= '0') && (c <= '9')) {
break NEXT_CHAR;
} else if ((c == 'e') || (c == 'E')) {
state = NUMBER_EXP_START;
sgn_exp = 1;
break NEXT_CHAR;
}
state = COND_QUOTED_NUMBER_END;
continue MAIN_LOOP;
// ---------------------------------------------------------------------
// Digits after the decimal point. On leaving this state fractionDigits is converted
// from "offset of the decimal point" to "number of fraction digits consumed".
case NUMBER_FRACTION:
if ((c >= '0') && (c <= '9')) {
if (number >= LARGEST_DIGIT_NUMBER) {
if (decimal)
fractionDigits = offset - 1 - fractionDigits;
if (exp == -1) {
number = -number;
}
exp = 0;
state = NUMBER_SKIP_NO_DOT;
} else {
number = (number*10)+(c-'0');
}
break NEXT_CHAR;
} else if ((c == 'e') || (c == 'E')) {
if (decimal)
fractionDigits = offset - 1 - fractionDigits;
state = NUMBER_EXP_START;
sgn_exp = 1;
break NEXT_CHAR;
}
state = COND_QUOTED_NUMBER_END;
if (decimal)
fractionDigits = offset - fractionDigits-1;
if (exp == -1) {
number = -number;
}
exp = 0;
continue MAIN_LOOP;
// ---------------------------------------------------------------------
// Just after 'e'/'E': finalize the mantissa sign, then read an optional exponent sign.
case NUMBER_EXP_START:
if (exp == -1) {
number = -number;
}
exp = 0;
if (c == '-') {
sgn_exp *= -1;
break NEXT_CHAR;
} else if (c == '+'){
break NEXT_CHAR;
}
if ((c < '0') || (c > '9')){
// No exponent digits: rewind to the start of the token and re-parse it as a string.
state = STRING;
offset = tokenStart-1;
_str.set(bits,tokenStart,0);
break NEXT_CHAR; // parse as String token now
}
state = NUMBER_EXP; // fall through to NUMBER_EXP
// ---------------------------------------------------------------------
// Exponent digits; the exponent sign is applied once the digits end.
case NUMBER_EXP:
if ((c >= '0') && (c <= '9')) {
exp = (exp*10)+(c-'0');
break NEXT_CHAR;
}
exp *= sgn_exp;
state = COND_QUOTED_NUMBER_END;
continue MAIN_LOOP;
// ---------------------------------------------------------------------
// A quote was seen inside a string: a second quote right after it is kept in the
// token and the string continues; anything else ends the quoted token.
case COND_QUOTE:
if (c == quotes) {
_str.addChar();
// _str.skipChar();
state = STRING;
break NEXT_CHAR;
} else {
quotes = 0;
state = STRING_END;
continue MAIN_LOOP;
}
// ---------------------------------------------------------------------
default:
assert (false) : " We have wrong state "+state;
} // end NEXT_CHAR
++offset; // do not need to adjust for offset increase here - the offset is set to tokenStart-1!
if (offset < 0) { // Offset is negative?
assert !firstChunk; // Caused by backing up from 2nd chunk into 1st chunk
firstChunk = true;
bits = bits0;
offset += bits.length;
_str.set(bits,offset,0);
} else if (offset >= bits.length) { // Off end of 1st chunk? Parse into 2nd chunk
// Attempt to get more data.
if( firstChunk && bits1 == null )
bits1 = din.getChunkData(cidx+1);
// if we can't get further we might have been the last one and we must
// commit the latest guy if we had one.
if( !firstChunk || bits1 == null ) { // No more data available or allowed
// If we are mid-parse of something, act like we saw a LF to end the
// current token.
if ((state != EXPECT_COND_LF) && (state != POSSIBLE_EMPTY_LINE)) {
c = CHAR_LF;
if (!firstChunk) Log.warn("Row entry exceeded " + bits.length + " bytes in size, exceeded current parse limit.");
continue MAIN_LOOP;
}
break MAIN_LOOP; // Else we are just done
}
// Now parsing in the 2nd chunk. All offsets relative to the 2nd chunk start.
firstChunk = false;
// Rebase every remembered position by the first chunk's length; positions that lie
// in the first chunk therefore become negative.
if (state == NUMBER_FRACTION)
fractionDigits -= bits.length;
offset -= bits.length;
tokenStart -= bits.length;
bits = bits1; // Set main parsing loop bits
if( bits[0] == CHAR_LF && state == EXPECT_COND_LF )
break MAIN_LOOP; // when the first character we see is a line end
}
c = bits[offset];
if(isEOL(c) && state != COND_QUOTE && quotes != 0) // quoted string having newline character => fail the line!
state = EOL;
} // end MAIN_LOOP
// No column was added to the current line: undo the pending newLine().
if (colIdx == 0)
dout.rollbackLine();
// If offset is still validly within the buffer, save it so the next pass
// can start from there.
if( offset+1 < bits.length ) {
if( state == EXPECT_COND_LF && bits[offset+1] == CHAR_LF ) offset++;
if( offset+1 < bits.length ) din.setChunkDataStart(cidx+1, offset+1 );
}
return dout;
}
// ==========================================================================
// /** Setup of the parser.
// *
// * Simply holds the column names, their length also determines the number of
// * columns, the separator used and whether the CSV file had a header or not.
// */
// public static class Setup extends Iced {
// public final byte _separator;
// public final boolean _header;
// // Row zero is column names.
// // Remaining rows are parsed from the given data, until we run out
// // of data or hit some arbitrary display limit.
// public final String[][] _data;
// public final int _numlines; // Number of lines parsed
// public final byte[] _bits; // The original bits
//
// public Setup(byte separator, boolean header, String[][] data, int numlines, byte[] bits) {
// _separator = separator;
// _header = header;
// _data = data;
// _numlines = numlines;
// _bits = bits;
// }
// public Setup(Setup S, boolean header) {
// _separator = S._separator;
// _header = header;
// _data = S._data;
// _numlines = S._numlines;
// _bits = S._bits;
// }
//
// public int numCols(){return _data == null?-1:_data[0].length;}
//
// @Override public boolean equals( Object o ) {
// if( o == null || !(o instanceof Setup) ) return false;
// if( o == this ) return true;
// Setup s = (Setup)o;
// // "Compatible" setups means same columns and same separators
// return _separator == s._separator &&
// ((_data==null && s._data==null) ||
// (_data[0].length == s._data[0].length));
// }
// @Override public String toString() {
// return "'"+(char)_separator+"' head="+_header+" cols="+(_data==null?-2:(_data[0]==null?-1:_data[0].length));
// }
// }
/** Separators recognized by the parser. New separators can be added to this
 *  list and the separator guessing will automatically try them. The
 *  separators are listed in descending order of probability. Space must
 *  always stay the last entry: it is the fallback when nothing else matches,
 *  because multiple spaces can be used as a single separator.
 *
 *  The reference is {@code final} because the list is never reassigned.
 */
private static final byte[] separators = new byte[] { HIVE_SEP/* '^A', Hive table column separator */, ',', ';', '|', '\t', ' '/*space is last in this list, because we allow multiple spaces*/ };
/** Determines how often each candidate separator occurs in the given line.
 *  The quote state flips on every double-quote byte and on every byte equal to
 *  {@code single_quote}. Bytes inside quotes are not counted, except for the
 *  Hive separator, which is counted even inside quotes.
 *
 *  @param from         the line to inspect
 *  @param single_quote byte value to treat as an additional quote character
 *  @return one count per entry of {@code separators}, in the same order
 */
private static int[] determineSeparatorCounts(String from, int single_quote) {
  final int[] counts = new int[separators.length];
  boolean quoted = false;
  for( byte b : from.getBytes() ) {
    if( b == single_quote || b == CHAR_DOUBLE_QUOTE )
      quoted = !quoted;
    // Inside quotes only the Hive separator is still counted.
    if( quoted && b != HIVE_SEP )
      continue;
    for( int s = 0; s < separators.length; s++ ) {
      if( separators[s] == b )
        counts[s]++;
    }
  }
  return counts;
}
/** Determines the tokens that are inside a line and returns them as strings
 *  in an array. Assumes the given separator.
 *
 *  Leading spaces before a token are skipped. A token may start with a
 *  double quote, or with the byte given as {@code single_quote}. Inside such
 *  a token the separator does not end the token, and a doubled quote
 *  character yields one literal quote character.
 *
 *  @param from         the line to split
 *  @param separator    the column separator byte
 *  @param single_quote byte value to accept as an alternative quote character
 *  @return the tokens of the line; an empty array for an empty line or when
 *          a token is not followed by the separator or the end of the line
 */
private static String[] determineTokens(String from, byte separator, int single_quote) {
  final byte[] bits = from.getBytes();
  // An empty line has no tokens. Returning here also keeps the trailing-separator
  // check at the end of this method from indexing bits[-1].
  if( bits.length == 0 )
    return new String[0];
  ArrayList<String> tokens = new ArrayList<String>();
  int offset = 0;
  int quotes = 0;
  while (offset < bits.length) {
    while ((offset < bits.length) && (bits[offset] == CHAR_SPACE)) ++offset; // skip first whitespace
    if(offset == bits.length)break;
    StringBuilder t = new StringBuilder();
    byte c = bits[offset];
    if ((c == CHAR_DOUBLE_QUOTE) || (c == single_quote)) {
      quotes = c;
      ++offset;
    }
    while (offset < bits.length) {
      c = bits[offset];
      if ((c == quotes)) {
        ++offset;
        // A doubled quote character stands for one literal quote character.
        if ((offset < bits.length) && (bits[offset] == c)) {
          t.append((char)c);
          ++offset;
          continue;
        }
        quotes = 0;
      } else if ((quotes == 0) && ((c == separator) || (c == CHAR_CR) || (c == CHAR_LF))) {
        break;
      } else {
        t.append((char)c);
        ++offset;
      }
    }
    // Running off the end of the input is treated like a line feed.
    c = (offset == bits.length) ? CHAR_LF : bits[offset];
    tokens.add(t.toString());
    if ((c == CHAR_CR) || (c == CHAR_LF) || (offset == bits.length))
      break;
    if (c != separator)
      return new String[0]; // an error
    ++offset; // Skip separator
  }
  // If we have trailing empty columns (split by separators) such as ",,\n"
  // then we did not add the final (empty) column, so the column count will
  // be down by 1. Add an extra empty column here
  if( bits[bits.length-1] == separator && bits[bits.length-1] != CHAR_SPACE)
    tokens.add("");
  return tokens.toArray(new String[tokens.size()]);
}
/** Returns {@code true} only when none of the tokens can be read as a number,
 *  a time or a UUID; the first token that can be read as one of those makes
 *  the result {@code false}.
 */
private static boolean allStrings(String [] line){
  final ValueString scratch = new ValueString();
  for( int i = 0; i < line.length; i++ ) {
    final String token = line[i];
    // Number check: a successful parse means the line is not a pure-string line.
    boolean isNumber = true;
    try {
      Double.parseDouble(token);
    } catch (NumberFormatException e) {
      isNumber = false; // not a number; keep probing the other formats
    }
    if( isNumber )
      return false;
    // Time check.
    if( ParseTime.attemptTimeParse(scratch.setTo(token)) != Long.MIN_VALUE )
      return false;
    // UUID check: both halves are attempted; an offset other than -1 afterwards means a valid UUID.
    ParseTime.attemptUUIDParse0(scratch.setTo(token));
    ParseTime.attemptUUIDParse1(scratch);
    if( scratch.get_off() != -1 )
      return false;
  }
  return true;
}
// Header heuristic: the first line counts as a header iff it consists of
// strings only while the second line contains at least one non-string token.
private static boolean hasHeader(String[] l1, String[] l2) {
  if( !allStrings(l1) )
    return false;
  return !allStrings(l2);
}
/** Guesses the column separator from two sample lines.
 *  Candidates are tried in the order of {@code separators}. The first one that
 *  occurs equally often on both lines, and for which tokenizing both lines
 *  yields exactly (count + 1) tokens, is returned. Otherwise the candidate
 *  that occurs most often on the first line (or space when none occurs) is
 *  checked the same way. {@code AUTO_SEP} is returned when nothing fits.
 */
private static byte guessSeparator(String l1, String l2, int single_quote){
  final int[] counts1 = determineSeparatorCounts(l1, single_quote);
  final int[] counts2 = determineSeparatorCounts(l2, single_quote);
  int best = 0; // index of the most frequent separator on the first line so far
  for( int idx = 0; idx < counts1.length; idx++ ) {
    final int cnt = counts1[idx];
    if( cnt != 0 ) { // separators that never appear are ignored
      if( cnt > counts1[best] )
        best = idx;
      if( cnt == counts2[idx] ) { // same count on both lines: verify by tokenizing
        try {
          String[] toks1 = determineTokens(l1, separators[idx], single_quote);
          String[] toks2 = determineTokens(l2, separators[idx], single_quote);
          if( toks1.length == cnt+1 && toks2.length == counts2[idx]+1 )
            return separators[idx];
          // otherwise token parsing disagrees with the counts; try the next candidate
        } catch (Exception e) { /*pass; try another parse attempt*/ }
      }
    }
  }
  // No candidate had matching counts on both lines. Fall back to the most
  // frequent one on the first line, or to the last separator (space) if none appeared.
  if( counts1[best] == 0 )
    best = separators.length-1;
  if( counts1[best] != 0 ) {
    String[] toks1 = determineTokens(l1, separators[best], single_quote);
    String[] toks2 = determineTokens(l2, separators[best], single_quote);
    boolean consistent = toks1.length == counts1[best]+1 && toks2.length == counts2[best]+1;
    if( consistent )
      return separators[best];
  }
  return AUTO_SEP;
}
/** Guesses the number of columns from the tokenized sample lines.
 *  The length of the first line is used when the setup declares a header or
 *  when no other line is longer than the first one. Otherwise the most common
 *  line length is returned.
 */
private static int guessNcols(ParserSetup setup,String [][] data){
  final int firstLen = data[0].length;
  if( setup._header )
    return firstLen;
  // Is any line longer than the first one?
  boolean anyLonger = false;
  for( String[] row : data )
    anyLonger |= row.length > firstLen;
  // Covers both "all lines have the same length" and "first line is the longest".
  if( !anyLonger )
    return firstLen;
  // Lines differ in length: build a histogram of lengths and pick the most common one.
  HashMap<Integer, Integer> histogram = new HashMap<Integer, Integer>();
  for( String[] row : data ) {
    Integer seen = histogram.get(row.length);
    histogram.put(row.length, seen == null ? 1 : seen+1);
  }
  int bestLen = firstLen;
  int bestCnt = 0;
  for( Map.Entry<Integer, Integer> entry : histogram.entrySet() ) {
    if( entry.getValue() > bestCnt ) {
      bestCnt = entry.getValue();
      bestLen = entry.getKey();
    }
  }
  return bestLen;
}
/** Determines the CSV parser setup from the first lines of the data,
 *  tossing out comments and blank lines.
 *
 *  A separator is either given or guessed: it is selected if two lines contain
 *  the same amount of it and tokenizing them then returns a matching number
 *  of columns.
 *
 *  This overload starts from a default CSV setup and enables header detection.
 */
public static CustomParser.PSetupGuess guessSetup(byte[] bits) {
  final ParserSetup defaults = new ParserSetup(ParserType.CSV);
  return guessSetup(bits, defaults, true);
}
/** Guesses the setup starting from the given {@code setup}, without header detection. */
public static CustomParser.PSetupGuess guessSetup(byte[] bits, ParserSetup setup) {
  final boolean checkHeader = false;
  return guessSetup(bits, setup, checkHeader);
}
/** Guesses separator, column count, header and column types from the start of the data.
 *
 *  Up to 10 non-blank lines are sampled. Lines starting with '@' are always
 *  skipped; lines starting with '#' are skipped unless {@code setup._hashHeader}
 *  is set. When {@code setup._separator} is {@code AUTO_SEP} the separator is
 *  guessed from the sampled lines. Every sampled line is then tokenized and
 *  lines whose token count differs from the guessed column count are reported
 *  as errors. If the guess is valid, the data is additionally stream-parsed to
 *  guess column types; a failure of that step is logged and otherwise ignored.
 *
 *  @param bits        raw bytes from the start of the data
 *  @param setup       user-supplied settings (separator, column count, header, quotes)
 *  @param checkHeader whether to detect a header line heuristically
 *  @return the guessed setup together with the sampled data and any errors
 */
public static CustomParser.PSetupGuess guessSetup(byte[] bits, ParserSetup setup, boolean checkHeader) {
  ArrayList<String> lines = new ArrayList<String>();
  int offset = 0;
  while (offset < bits.length && lines.size() < 10) {
    int lineStart = offset;
    while ((offset < bits.length) && (bits[offset] != CHAR_CR) && (bits[offset] != CHAR_LF)) ++offset;
    int lineEnd = offset;
    ++offset; // step over the line terminator
    if ((offset < bits.length) && (bits[offset] == CHAR_LF)) ++offset;
    if (bits[lineStart] == '#' && !setup._hashHeader) continue; // Ignore comment lines unless header set to start with a hash character
    if (bits[lineStart] == '@') continue; // Ignore ARFF comment lines
    if (lineEnd > lineStart) {
      String str = new String(bits, lineStart, lineEnd-lineStart).trim();
      if (!str.isEmpty()) lines.add(str);
    }
  }
  if (lines.isEmpty())
    return new PSetupGuess(new ParserSetup(ParserType.AUTO,CsvParser.AUTO_SEP,0,false,null,setup._singleQuotes),0,0,null,false,new String[]{"No data!"});
  boolean hasHeader = false;
  final int single_quote = setup._singleQuotes ? CHAR_SINGLE_QUOTE : -1;
  byte sep = setup._separator;
  final String [][] data = new String[lines.size()][];
  int ncols;
  if( lines.size() < 2 ) {
    // Exactly one line (the list is known to be non-empty here).
    if (sep == AUTO_SEP) {
      if (lines.get(0).split(",").length > 2)
        sep = (byte)',';
      else if (lines.get(0).split(" ").length > 2)
        sep = ' ';
      else {
        data[0] = new String[]{lines.get(0)};
        return new PSetupGuess(new ParserSetup(ParserType.CSV,CsvParser.AUTO_SEP,1,false,null,setup._singleQuotes),lines.size(),0,data,false,new String[]{"Failed to guess separator."});
      }
    }
    data[0] = determineTokens(lines.get(0), sep, single_quote);
    ncols = (setup._ncols > 0) ? setup._ncols : data[0].length;
    hasHeader = (checkHeader && allStrings(data[0])) || setup._header;
  } else {
    if (sep == AUTO_SEP) { // first guess the separator
      sep = guessSeparator(lines.get(0), lines.get(1), single_quote);
      if (sep == AUTO_SEP && lines.size() > 2) {
        // Lines 0 and 1 disagree; try the other pairings of the first three lines.
        sep = guessSeparator(lines.get(1), lines.get(2), single_quote);
        if (sep == AUTO_SEP) sep = guessSeparator(lines.get(0), lines.get(2), single_quote);
      }
      if (sep == AUTO_SEP) sep = (byte)' '; // last resort
    }
    for (int i = 0; i < lines.size(); ++i)
      data[i] = determineTokens(lines.get(i), sep, single_quote);
    ncols = (setup._ncols > 0) ? setup._ncols : guessNcols(setup,data);
    if (checkHeader) {
      assert !setup._header;
      assert setup._columnNames == null;
      hasHeader = hasHeader(data[0],data[1]) && (data[0].length == ncols);
    } else if (setup._header) {
      if (setup._columnNames != null) { // we know what the header looks like, check if the current file has matching header
        hasHeader = data[0].length == setup._columnNames.length;
        for (int i = 0; hasHeader && i < data[0].length; ++i)
          hasHeader = data[0][i].equalsIgnoreCase(setup._columnNames[i]);
      } else // otherwise we're told to take the first line as header whatever it might be
        hasHeader = true;
    }
  }
  ParserSetup resSetup = new ParserSetup(ParserType.CSV, sep, ncols,hasHeader, hasHeader?data[0]:null,setup._singleQuotes);
  // Report every sampled line whose token count does not match the column count.
  ArrayList<String> errors = new ArrayList<String>();
  int ilines = 0;
  for (int i = 0; i < data.length; ++i) {
    if (data[i].length != resSetup._ncols) {
      errors.add("error at line " + i + " : incompatible line length. Got " + data[i].length + " columns.");
      ++ilines;
    }
  }
  String [] err = null;
  if (!errors.isEmpty()) {
    err = new String[errors.size()];
    errors.toArray(err);
  }
  PSetupGuess res = new PSetupGuess(resSetup,lines.size()-ilines,ilines,data,setup.isSpecified() || lines.size() > ilines, err);
  if (res._isValid) { // now guess the types
    InputStream is = new ByteArrayInputStream(bits);
    CsvParser p = new CsvParser(res._setup);
    TypeGuesserDataOut dout = new TypeGuesserDataOut(res._setup._ncols);
    try {
      p.streamParse(is, dout);
      res._setup._types = dout.guessTypes();
    } catch (Throwable e) {
      // Type guessing is best-effort: the setup guess is still returned, but the
      // failure is logged instead of being dropped silently.
      Log.warn("CSV column type guessing failed; column types were not guessed: " + e);
    }
  }
  return res;
}
/** Two parsers are compatible when the other one is also a {@code CsvParser}
 *  and both setups agree on the separator and on the number of columns.
 */
@Override public boolean isCompatible(CustomParser p) {
  if( !(p instanceof CsvParser) )
    return false;
  final boolean sameSeparator = p._setup._separator == _setup._separator;
  return sameSeparator && p._setup._ncols == _setup._ncols;
}
}