package water.parser;

import org.apache.commons.lang.math.NumberUtils;
import water.fvec.Vec;
import water.fvec.FileVec;
import water.Key;
import water.util.StringUtils;

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;

import static water.parser.DefaultParserProviders.*;

class CsvParser extends Parser {
  private static final byte GUESS_SEP = ParseSetup.GUESS_SEP;
  private static final int NO_HEADER = ParseSetup.NO_HEADER;
  private static final int GUESS_HEADER = ParseSetup.GUESS_HEADER;
  private static final int HAS_HEADER = ParseSetup.HAS_HEADER;

  CsvParser( ParseSetup ps, Key jobKey ) { super(ps, jobKey); }

  // Parse this one Chunk (in parallel with other Chunks)
  @SuppressWarnings("fallthrough")
  @Override public ParseWriter parseChunk(int cidx, final ParseReader din, final ParseWriter dout) {
    BufferedString str = new BufferedString();
    byte[] bits = din.getChunkData(cidx);
    if( bits == null ) return dout;
    int offset = din.getChunkDataStart(cidx); // General cursor into the giant array of bytes
    final byte[] bits0 = bits;  // Bits for chunk0
    boolean firstChunk = true;  // Have not rolled into the 2nd chunk
    byte[] bits1 = null;        // Bits for chunk1, loaded lazily
    int state;
    boolean isNa = false;
    boolean isAllASCII = true;

    // If handed a skipping offset, then it points just past the prior partial line.
    if( offset >= 0 ) state = WHITESPACE_BEFORE_TOKEN;
    else {
      offset = 0; // Else start skipping at the start
      // Starting state.  Are we skipping the first (partial) line, or not?  Skip
      // a header line, or a partial line if we're in the 2nd and later chunks.
      if (_setup._check_header == ParseSetup.HAS_HEADER || cidx > 0) state = SKIP_LINE;
      else state = WHITESPACE_BEFORE_TOKEN;
    }
    // For parsing ARFF
    if (_setup._parse_type.equals(ARFF_INFO) && _setup._check_header == ParseSetup.HAS_HEADER)
      state = WHITESPACE_BEFORE_TOKEN;

    int quotes = 0;
    long number = 0;
    int exp = 0;
    int sgnExp = 1;
    boolean decimal = false;
    int fractionDigits = 0;
    int tokenStart = 0; // used for a numeric token, to backtrack if the number parse is not successful
    int colIdx = 0;
    byte c = bits[offset];

    // Skip comments for the first chunk (or if not a chunk)
    if( cidx == 0 ) {
      while (c == '#'
             || isEOL(c)
             || c == '@'    // also treat a leading '@' from the ARFF format as a comment
             || c == '%') { // also treat a leading '%' from the ARFF format as a comment
        while ((offset < bits.length) && (bits[offset] != CHAR_CR) && (bits[offset] != CHAR_LF)) {
          // System.out.print(String.format("%c",bits[offset]));
          ++offset;
        }
        if ((offset + 1 < bits.length) && (bits[offset] == CHAR_CR) && (bits[offset + 1] == CHAR_LF)) ++offset;
        ++offset;
        // System.out.println();
        if (offset >= bits.length)
          return dout;
        c = bits[offset];
      }
    }
    dout.newLine();

    final boolean forceable = dout instanceof FVecParseWriter && ((FVecParseWriter) dout)._ctypes != null && _setup._column_types != null;
    MAIN_LOOP:
    while (true) {
      boolean forcedCategorical = forceable && colIdx < _setup._column_types.length && _setup._column_types[colIdx] == Vec.T_CAT;
      boolean forcedString      = forceable && colIdx < _setup._column_types.length && _setup._column_types[colIdx] == Vec.T_STR;

      switch (state) {
        // ---------------------------------------------------------------------
        case SKIP_LINE:
          if (isEOL(c)) {
            state = EOL;
          } else {
            break;
          }
          continue MAIN_LOOP;
        // ---------------------------------------------------------------------
        case EXPECT_COND_LF:
          state = POSSIBLE_EMPTY_LINE;
          if (c == CHAR_LF)
            break;
          continue MAIN_LOOP;
        // ---------------------------------------------------------------------
        case STRING:
          if (c == quotes) {
            state = COND_QUOTE;
            break;
          }
          if (!isEOL(c) && ((quotes != 0) || (c != CHAR_SEPARATOR))) {
            str.addChar();
            if ((c & 0x80) == 128) // value beyond std ASCII
              isAllASCII = false;
            break;
          }
          // fallthrough to STRING_END
        // ---------------------------------------------------------------------
        case STRING_END:
          if ((c != CHAR_SEPARATOR) && (c == CHAR_SPACE))
            break;
          // we have parsed the string categorical correctly
          if ((str.getOffset() + str.length()) > str.getBuffer().length) { // crossing chunk boundary
            assert str.getBuffer() != bits;
            str.addBuff(bits);
          }
          if( !isNa &&
              _setup._na_strings != null &&
              _setup._na_strings.length > colIdx &&
              str.isOneOf(_setup._na_strings[colIdx])) {
            isNa = true;
          }
          if (!isNa) {
            dout.addStrCol(colIdx, str);
            if (!isAllASCII)
              dout.setIsAllASCII(colIdx, isAllASCII);
          } else {
            dout.addInvalidCol(colIdx);
            isNa = false;
          }
          str.set(null, 0, 0);
          isAllASCII = true;
          ++colIdx;
          state = SEPARATOR_OR_EOL;
          // fallthrough to SEPARATOR_OR_EOL
        // ---------------------------------------------------------------------
        case SEPARATOR_OR_EOL:
          if (c == CHAR_SEPARATOR) {
            state = WHITESPACE_BEFORE_TOKEN;
            break;
          }
          if (c == CHAR_SPACE)
            break;
          // fallthrough to EOL
        // ---------------------------------------------------------------------
        case EOL:
          if (quotes != 0) {
            // System.err.println("Unmatched quote char " + ((char)quotes) + " " + (((str.length()+1) < offset && str.getOffset() > 0)?new String(Arrays.copyOfRange(bits,str.getOffset()-1,offset)):""));
            String err = "Unmatched quote char " + ((char) quotes);
            dout.invalidLine(new ParseWriter.ParseErr(err, cidx, dout.lineNum(), offset + din.getGlobalByteOffset()));
            colIdx = 0;
            quotes = 0;
          } else if (colIdx != 0) {
            dout.newLine();
            colIdx = 0;
          }
          state = (c == CHAR_CR) ? EXPECT_COND_LF : POSSIBLE_EMPTY_LINE;
          if( !firstChunk )
            break MAIN_LOOP; // second chunk only does the first row
          break;
        // ---------------------------------------------------------------------
        case POSSIBLE_CURRENCY:
          if (((c >= '0') && (c <= '9')) || (c == '-') || (c == CHAR_DECIMAL_SEP) || (c == '+')) {
            state = TOKEN;
          } else {
            str.set(bits, offset - 1, 0);
            str.addChar();
            if (c == quotes) {
              state = COND_QUOTE;
              break;
            }
            if ((quotes != 0) || (!isEOL(c) && (c != CHAR_SEPARATOR))) {
              state = STRING;
            } else {
              state = STRING_END;
            }
          }
          continue MAIN_LOOP;
        // ---------------------------------------------------------------------
        case POSSIBLE_EMPTY_LINE:
          if (isEOL(c)) {
            if (c == CHAR_CR)
              state = EXPECT_COND_LF;
            break;
          }
          state = WHITESPACE_BEFORE_TOKEN;
          // fallthrough to WHITESPACE_BEFORE_TOKEN
        // ---------------------------------------------------------------------
        case WHITESPACE_BEFORE_TOKEN:
          if (c == CHAR_SPACE || (c == CHAR_TAB && CHAR_TAB != CHAR_SEPARATOR)) {
            break;
          } else if (c == CHAR_SEPARATOR) {
            // we have an empty token, store as NaN
            dout.addInvalidCol(colIdx++);
            break;
          } else if (isEOL(c)) {
            dout.addInvalidCol(colIdx++);
            state = EOL;
            continue MAIN_LOOP;
          }
          // fallthrough to COND_QUOTED_TOKEN
        // ---------------------------------------------------------------------
        case COND_QUOTED_TOKEN:
          state = TOKEN;
          if( CHAR_SEPARATOR != HIVE_SEP && // Only allow quoting in CSV, not Hive files
              ((_setup._single_quotes && c == CHAR_SINGLE_QUOTE) || (c == CHAR_DOUBLE_QUOTE))) {
            assert (quotes == 0);
            quotes = c;
            break;
          }
          // fallthrough to TOKEN
        // ---------------------------------------------------------------------
        case TOKEN:
          if( dout.isString(colIdx) ) { // Forced already to a string col?
            state = STRING;             // Do not attempt a number parse, just do a string parse
            str.set(bits, offset, 0);
            continue MAIN_LOOP;
          } else if (((c >= '0') && (c <= '9')) || (c == '-') || (c == CHAR_DECIMAL_SEP) || (c == '+')) {
            state = NUMBER;
            number = 0;
            fractionDigits = 0;
            decimal = false;
            tokenStart = offset;
            if (c == '-') {
              exp = -1;
              break;
            } else if (c == '+') {
              exp = 1;
              break;
            } else {
              exp = 1;
            }
            // fallthrough
          } else if (c == '$') {
            state = POSSIBLE_CURRENCY;
            break;
          } else {
            state = STRING;
            str.set(bits, offset, 0);
            continue MAIN_LOOP;
          }
          // fallthrough to NUMBER
        // ---------------------------------------------------------------------
        case NUMBER:
          if ((c >= '0') && (c <= '9')) {
            if (number >= LARGEST_DIGIT_NUMBER)
              state = NUMBER_SKIP;
            else
              number = (number * 10) + (c - '0');
            break;
          } else if (c == CHAR_DECIMAL_SEP) {
            state = NUMBER_FRACTION;
            fractionDigits = offset;
            decimal = true;
            break;
          } else if ((c == 'e') || (c == 'E')) {
            state = NUMBER_EXP_START;
            sgnExp = 1;
            break;
          }
          if (exp == -1) {
            number = -number;
          }
          exp = 0;
          // fallthrough to COND_QUOTED_NUMBER_END
        // ---------------------------------------------------------------------
        case COND_QUOTED_NUMBER_END:
          if (c == quotes) {
            state = NUMBER_END;
            quotes = 0;
            break;
          }
          // fallthrough to NUMBER_END
        case NUMBER_END:
          // forced
          if (forcedString || forcedCategorical) {
            state = STRING;
            offset = tokenStart - 1;
            str.set(bits, tokenStart, 0);
            break; // parse as String token now
          }
          if (c == CHAR_SEPARATOR && quotes == 0) {
            exp = exp - fractionDigits;
            dout.addNumCol(colIdx, number, exp);
            ++colIdx;
            // do separator state here too
            state = WHITESPACE_BEFORE_TOKEN;
            break;
          } else if (isEOL(c)) {
            exp = exp - fractionDigits;
            dout.addNumCol(colIdx, number, exp);
            // do EOL here for speedup reasons
            colIdx = 0;
            dout.newLine();
            state = (c == CHAR_CR) ? EXPECT_COND_LF : POSSIBLE_EMPTY_LINE;
            if( !firstChunk )
              break MAIN_LOOP; // second chunk only does the first row
            break;
          } else if (c == '%') {
            state = NUMBER_END;
            exp -= 2;
            break;
          } else if ((c != CHAR_SEPARATOR) && ((c == CHAR_SPACE) || (c == CHAR_TAB))) {
            state = NUMBER_END;
            break;
          } else {
            state = STRING;
            offset = tokenStart - 1;
            str.set(bits, tokenStart, 0);
            break; // parse as String token now
          }
        // ---------------------------------------------------------------------
        case NUMBER_SKIP:
          // The number overflowed the long accumulator; skip further integer digits
          // and compensate with the exponent.
          if ((c >= '0') && (c <= '9')) {
            exp++;
            break;
          } else if (c == CHAR_DECIMAL_SEP) {
            state = NUMBER_SKIP_NO_DOT;
            break;
          } else if ((c == 'e') || (c == 'E')) {
            state = NUMBER_EXP_START;
            sgnExp = 1;
            break;
          }
          state = COND_QUOTED_NUMBER_END;
          continue MAIN_LOOP;
        // ---------------------------------------------------------------------
        case NUMBER_SKIP_NO_DOT:
          // Skip remaining fractional digits once precision is exhausted.
          if ((c >= '0') && (c <= '9')) {
            break;
          } else if ((c == 'e') || (c == 'E')) {
            state = NUMBER_EXP_START;
            sgnExp = 1;
            break;
          }
          state = COND_QUOTED_NUMBER_END;
          continue MAIN_LOOP;
        // ---------------------------------------------------------------------
        case NUMBER_FRACTION:
          if ((c >= '0') && (c <= '9')) {
            if (number >= LARGEST_DIGIT_NUMBER) {
              if (decimal)
                fractionDigits = offset - 1 - fractionDigits;
              if (exp == -1) number = -number;
              exp = 0;
              state = NUMBER_SKIP_NO_DOT;
            } else {
              number = (number * 10) + (c - '0');
            }
            break;
          } else if ((c == 'e') || (c == 'E')) {
            if (decimal)
              fractionDigits = offset - 1 - fractionDigits;
            state = NUMBER_EXP_START;
            sgnExp = 1;
            break;
          }
          state = COND_QUOTED_NUMBER_END;
          if (decimal)
            fractionDigits = offset - fractionDigits - 1;
          if (exp == -1) {
            number = -number;
          }
          exp = 0;
          continue MAIN_LOOP;
        // ---------------------------------------------------------------------
        case NUMBER_EXP_START:
          if (exp == -1) {
            number = -number;
          }
          exp = 0;
          if (c == '-') {
            sgnExp *= -1;
            break;
          } else if (c == '+') {
            break;
          }
          if ((c < '0') || (c > '9')) {
            state = STRING;
            offset = tokenStart - 1;
            str.set(bits, tokenStart, 0);
            break; // parse as String token now
          }
          state = NUMBER_EXP;
          // fallthrough to NUMBER_EXP
        // ---------------------------------------------------------------------
        case NUMBER_EXP:
          if ((c >= '0') && (c <= '9')) {
            exp = (exp * 10) + (c - '0');
            break;
          }
          exp *= sgnExp;
          state = COND_QUOTED_NUMBER_END;
          continue MAIN_LOOP;
        // ---------------------------------------------------------------------
        case COND_QUOTE:
          if (c == quotes) {
            str.addChar();
            state = STRING;
            break;
          } else {
            quotes = 0;
            state = STRING_END;
            continue MAIN_LOOP;
          }
        // ---------------------------------------------------------------------
        default:
          assert (false) : "We have a wrong state " + state;
      } // end NEXT_CHAR

      // System.out.print(String.format("%c",bits[offset]));
      ++offset; // do not need to adjust for the offset increase here - the offset is set to tokenStart-1!
      if (offset < 0) {         // Offset is negative?
        assert !firstChunk;     // Caused by backing up from 2nd chunk into 1st chunk
        firstChunk = true;
        bits = bits0;
        offset += bits.length;
        str.set(bits, offset, 0);
      } else if (offset >= bits.length) { // Off end of 1st chunk?  Parse into 2nd chunk
        // Attempt to get more data.
        if( firstChunk && bits1 == null )
          bits1 = din.getChunkData(cidx + 1);
        // if we can't get further we might have been the last one and we must
        // commit the latest guy if we had one.
        if( !firstChunk || bits1 == null ) { // No more data available or allowed
          // If we are mid-parse of something, act like we saw a LF to end the
          // current token.
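          // (In the two excluded states the line has already been terminated,
          //  so there is no partial token left to flush.)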
          if ((state != EXPECT_COND_LF) && (state != POSSIBLE_EMPTY_LINE)) {
            c = CHAR_LF;
            continue; // MAIN_LOOP;
          }
          break; // MAIN_LOOP;  // Else we are just done
        }
        // Now parsing in the 2nd chunk.  All offsets relative to the 2nd chunk start.
        firstChunk = false;
        if (state == NUMBER_FRACTION)
          fractionDigits -= bits.length;
        offset -= bits.length;
        tokenStart -= bits.length;
        bits = bits1;           // Set main parsing loop bits
        if( bits[0] == CHAR_LF && state == EXPECT_COND_LF )
          break; // MAIN_LOOP; // when the first character we see is a line end
      }
      c = bits[offset];
      if (isEOL(c) && state != COND_QUOTE && quotes != 0) // quoted string containing a newline character => fail the line!
        state = EOL;

    } // end MAIN_LOOP

    if (colIdx == 0) dout.rollbackLine();
    // If offset is still validly within the buffer, save it so the next pass
    // can start from there.
    if( offset + 1 < bits.length ) {
      if( state == EXPECT_COND_LF && bits[offset + 1] == CHAR_LF ) offset++;
      if( offset + 1 < bits.length ) din.setChunkDataStart(cidx + 1, offset + 1);
    }
    return dout;
  }

  @Override protected int fileHasHeader(byte[] bits, ParseSetup ps) {
    boolean hasHdr = true;
    String[] lines = getFirstLines(bits);
    if (lines != null && lines.length > 0) {
      String[] firstLine = determineTokens(lines[0], _setup._separator, _setup._single_quotes);
      if (_setup._column_names != null) {
        for (int i = 0; hasHdr && i < firstLine.length; ++i)
          hasHdr = (_setup._column_names[i] == firstLine[i])
                   || (_setup._column_names[i] != null && _setup._column_names[i].equalsIgnoreCase(firstLine[i]));
      } else { // declared to have a header, but no column names provided; assume the header exists in all files
        _setup._column_names = firstLine;
      }
    } // else FIXME throw an exception
    return hasHdr ? ParseSetup.HAS_HEADER : ParseSetup.NO_HEADER;
    // consider making this insensitive to quotes
  }

  // ==========================================================================
  /** Separators recognized by the CSV parser.  You can add new separators to
   *  this list and the parser will automatically attempt to recognize them.
   *  In case of doubt the separators are listed in descending order of
   *  probability, with space being the last one - space must always be the
   *  last one as it is used only if all others fail, because multiple spaces
   *  can be used as a single separator.
   */
  static final byte HIVE_SEP = 0x1; // '^A', Hive table column separator
  private static byte[] separators = new byte[] { HIVE_SEP, ',', ';', '|', '\t',
      ' ' /* space is last in this list, because we allow multiple spaces */ };

  /** Determines the number of separators in a given line.  Correctly handles quoted tokens. */
  private static int[] determineSeparatorCounts(String from, byte singleQuote) {
    int[] result = new int[separators.length];
    byte[] bits = StringUtils.bytesOf(from);
    boolean inQuote = false;
    for( byte c : bits ) {
      if( (c == singleQuote) || (c == CsvParser.CHAR_DOUBLE_QUOTE) )
        inQuote ^= true;
      if( !inQuote || c == HIVE_SEP )
        for( int i = 0; i < separators.length; ++i )
          if( c == separators[i] )
            ++result[i];
    }
    return result;
  }

  /** Determines the tokens that are inside a line and returns them as strings
   *  in an array.  Assumes the given separator.
   */
  public static String[] determineTokens(String from, byte separator, boolean singleQuotes) {
    final byte singleQuote = singleQuotes ? CsvParser.CHAR_SINGLE_QUOTE : -1;
    return determineTokens(from, separator, singleQuote);
  }

  public static String[] determineTokens(String from, byte separator, byte singleQuote) {
    ArrayList<String> tokens = new ArrayList<>();
    byte[] bits = StringUtils.bytesOf(from);
    int offset = 0;
    int quotes = 0;
    while (offset < bits.length) {
      while ((offset < bits.length) && (bits[offset] == CsvParser.CHAR_SPACE)) ++offset; // skip leading whitespace
      if (offset == bits.length) break;
      StringBuilder t = new StringBuilder();
      byte c = bits[offset];
      if ((c == CsvParser.CHAR_DOUBLE_QUOTE) || (c == singleQuote)) {
        quotes = c;
        ++offset;
      }
      while (offset < bits.length) {
        c = bits[offset];
        if (c == quotes) {
          ++offset;
          if ((offset < bits.length) && (bits[offset] == c)) { // doubled quote == escaped quote
            t.append((char) c);
            ++offset;
            continue;
          }
          quotes = 0;
        } else if( quotes == 0 && ((c == separator) || CsvParser.isEOL(c)) ) {
          break;
        } else {
          t.append((char) c);
          ++offset;
        }
      }
      c = (offset == bits.length) ? CsvParser.CHAR_LF : bits[offset];
      tokens.add(t.toString());
      if( CsvParser.isEOL(c) || (offset == bits.length) )
        break;
      if (c != separator)
        return new String[0]; // an error
      ++offset;               // Skip separator
    }
    // If we have trailing empty columns (split by separators) such as ",,\n"
    // then we did not add the final (empty) column, so the column count will
    // be down by 1.  Add an extra empty column here.
    if( bits.length > 0 && bits[bits.length-1] == separator && bits[bits.length-1] != CsvParser.CHAR_SPACE)
      tokens.add("");
    return tokens.toArray(new String[tokens.size()]);
  }

  public static byte guessSeparator(String l1, String l2, boolean singleQuotes) {
    final byte singleQuote = singleQuotes ? CsvParser.CHAR_SINGLE_QUOTE : -1;
    int[] s1 = determineSeparatorCounts(l1, singleQuote);
    int[] s2 = determineSeparatorCounts(l2, singleQuote);
    // Now we have the counts - if both lines have the same number of
    // separators then we assume it is the separator.  Separators are ordered
    // by their likelihoods.
    int max = 0;
    for( int i = 0; i < s1.length; ++i ) {
      if( s1[i] == 0 ) continue;     // Separator does not appear; ignore it
      if( s1[max] < s1[i] ) max = i; // Largest count sep on 1st line
      if( s1[i] == s2[i] && s1[i] >= s1[max] >> 1 ) { // Sep counts are equal?  And nearly as large as the largest count so far?
        try {
          String[] t1 = determineTokens(l1, separators[i], singleQuote);
          String[] t2 = determineTokens(l2, separators[i], singleQuote);
          if( t1.length != s1[i]+1 || t2.length != s2[i]+1 )
            continue;           // Token parsing fails
          return separators[i];
        } catch( Exception ignore ) { /* pass; try another parse attempt */ }
      }
    }
    // No sep's appeared, or no sep's had equal counts on lines 1 & 2.  If no
    // separators have the same counts, the largest one will be used as the
    // default one.  If there's no largest one, space will be used.
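    // If the fallback below also fails to tokenize both lines consistently, GUESS_SEP is
    // returned and the caller (guessSetup) retries with a different pair of lines or
    // finally defaults to the space separator.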
    if( s1[max] == 0 ) max = separators.length - 1; // Try the last separator (space)
    if( s1[max] != 0 ) {
      String[] t1 = determineTokens(l1, separators[max], singleQuote);
      String[] t2 = determineTokens(l2, separators[max], singleQuote);
      if( t1.length == s1[max]+1 && t2.length == s2[max]+1 )
        return separators[max];
    }
    return GUESS_SEP;
  }

  // Guess the number of columns
  public static int guessNcols( String[] columnNames, String[][] data ) {
    if( columnNames != null ) return columnNames.length;
    int longest = 0;            // Longest line
    for( String[] s : data ) if( s.length > longest ) longest = s.length;
    if( longest == data[0].length )
      return longest;           // 1st line is as long as any other line; take it
    // We don't have lines of the same length, so pick the most common length.
    int[] lengths = new int[longest+1];
    for( String[] s : data ) lengths[s.length]++;
    int maxCnt = 0;             // Most common line length
    for( int i = 0; i <= longest; i++ ) if( lengths[i] > lengths[maxCnt] ) maxCnt = i;
    return maxCnt;
  }

  /** Determines the CSV parser setup from the first few lines.  Also parses
   *  the next few lines, tossing out comments and blank lines.
   *
   *  If the separator is GUESS_SEP, then it is guessed by looking at the
   *  tokenization and column count of the first few lines.
   *
   *  If ncols is -1, then it is guessed similarly to the separator.
   *
   *  singleQuotes is honored in all cases (and not guessed).
   */
  static ParseSetup guessSetup(byte[] bits, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] naStrings) {
    String[] lines = getFirstLines(bits);
    if( lines.length == 0 )
      throw new ParseDataset.H2OParseException("No data!");

    // Guess the separator, columns, & header
    String[] labels;
    final String[][] data = new String[lines.length][];
    if( lines.length == 1 ) {       // Ummm??? Only 1 line?
      if( sep == GUESS_SEP ) {
        if (lines[0].split(",").length > 1) sep = (byte) ',';
        else if (lines[0].split(" ").length > 1) sep = ' ';
        else { // one item, guess its type
          data[0] = new String[]{lines[0]};
          byte[] ctypes = new byte[1];
          String[][] domains = new String[1][];
          if (NumberUtils.isNumber(data[0][0])) {
            ctypes[0] = Vec.T_NUM;
          } else { // non-numeric
            BufferedString str = new BufferedString(data[0][0]);
            if (ParseTime.isTime(str))
              ctypes[0] = Vec.T_TIME;
            else if (ParseUUID.isUUID(str))
              ctypes[0] = Vec.T_UUID;
            else { // give up and guess categorical
              ctypes[0] = Vec.T_CAT;
              domains[0] = new String[]{data[0][0]};
            }
          }
          // FIXME should set a warning message and let fall through
          return new ParseSetup(CSV_INFO, GUESS_SEP, singleQuotes, checkHeader, 1, null, ctypes, domains, naStrings, data, new ParseWriter.ParseErr[0], FileVec.DFLT_CHUNK_SIZE);
        }
      }
      data[0] = determineTokens(lines[0], sep, singleQuotes);
      ncols = (ncols > 0) ? ncols : data[0].length;
      if( checkHeader == GUESS_HEADER ) {
        if (ParseSetup.allStrings(data[0]) && !data[0][0].isEmpty()) {
          labels = data[0];
          checkHeader = HAS_HEADER;
        } else {
          labels = null;
          checkHeader = NO_HEADER;
        }
      }
      else if( checkHeader == HAS_HEADER ) labels = data[0];
      else labels = null;
    } else {                      // 2 or more lines
      // First guess the field separator by counting occurrences in the first few lines
      if( sep == GUESS_SEP ) {    // first guess the separator
        sep = guessSeparator(lines[0], lines[1], singleQuotes);
        if( sep == GUESS_SEP && lines.length > 2 ) {
          sep = guessSeparator(lines[1], lines[2], singleQuotes);
          if( sep == GUESS_SEP ) sep = guessSeparator(lines[0], lines[2], singleQuotes);
        }
        if( sep == GUESS_SEP ) sep = (byte)' '; // Bail out, go for space
      }

      // Tokenize the first few lines using the separator
      for( int i = 0; i < lines.length; ++i )
        data[i] = determineTokens(lines[i], sep, singleQuotes);
      // Guess the column count from the tokenization
      ncols = guessNcols(columnNames, data);

      // Asked to check for a header, so see if the 1st line looks header-ish
      if( checkHeader == HAS_HEADER
          || ( checkHeader == GUESS_HEADER && ParseSetup.hasHeader(data[0], data[1]))) {
        checkHeader = HAS_HEADER;
        labels = data[0];
      } else {
        checkHeader = NO_HEADER;
        labels = columnNames;
      }

      // See if the headers are compatible
      if( columnNames != null && labels != null ) {
        if( labels.length != columnNames.length )
          throw new ParseDataset.H2OParseException("Already have "+columnNames.length+" column labels, but found "+labels.length+" in this file");
        else {
          for( int i = 0; i < labels.length; ++i )
            if( !labels[i].equalsIgnoreCase(columnNames[i]) ) {
              throw new ParseDataset.H2OParseException("Column "+(i+1)+" label '"+labels[i]+"' does not match '"+columnNames[i]+"'");
            }
          labels = columnNames; // Keep prior case & count in any case
        }
      }
    }

    // Assemble the setup understood so far
    ParseSetup resSetup = new ParseSetup(CSV_INFO, sep, singleQuotes, checkHeader, ncols, labels, null, null /*domains*/, naStrings, data);

    // Now guess the types
    if (columnTypes == null || ncols != columnTypes.length) {
      int i = bits.length - 1;
      for(; i > 0; --i) if (bits[i] == '\n') break;
      if (i > 0) bits = Arrays.copyOf(bits, i); // stop at the last full line
      InputStream is = new ByteArrayInputStream(bits);
      CsvParser p = new CsvParser(resSetup, null);
      PreviewParseWriter dout = new PreviewParseWriter(resSetup._number_columns);
      try {
        p.streamParse(is, dout);
        resSetup._column_previews = dout;
        resSetup._errs = dout._errs;
      } catch (Throwable e) {
        throw new RuntimeException(e);
      }
    } else {
      // If the user sets a column type as unknown/bad, guess numeric.
      for (int i = 0; i < columnTypes.length; i++)
        if (columnTypes[i] == Vec.T_BAD) columnTypes[i] = Vec.T_NUM;
      resSetup._column_types = columnTypes;
      resSetup._na_strings = null;
    }

    // Return the final setup
    return resSetup;
  }

  private static String[] getFirstLines(byte[] bits) {
    // Parse up to 10 lines (skipping hash-comments & ARFF comments)
    String[] lines = new String[10]; // Parse 10 lines
    int nlines = 0;
    int offset = 0;
    while( offset < bits.length && nlines < lines.length ) {
      int lineStart = offset;
      while( offset < bits.length && !CsvParser.isEOL(bits[offset]) ) ++offset;
      int lineEnd = offset;
      ++offset;
      // For Windoze, skip a trailing LF after CR
      if( (offset < bits.length) && (bits[offset] == CsvParser.CHAR_LF) ) ++offset;
      if( bits[lineStart] == '#' ) continue; // Ignore comment lines
      if( bits[lineStart] == '%' ) continue; // Ignore ARFF comment lines
      if( bits[lineStart] == '@' ) continue; // Ignore ARFF lines
      if( lineEnd > lineStart ) {
        String str = new String(bits, lineStart, lineEnd - lineStart).trim();
        if( !str.isEmpty() ) lines[nlines++] = str;
      }
    }
    return Arrays.copyOf(lines, nlines);
  }
}