package water.parser; import java.io.*; import java.util.Arrays; import water.Key; import water.fvec.Vec; import water.util.PrettyPrint; import static water.parser.DefaultParserProviders.SVMLight_INFO; class SVMLightParser extends Parser { private static final byte SKIP_TOKEN = 21; private static final byte INVALID_NUMBER = 22; private static final byte QID0 = 23; private static final byte QID1 = 24; // line global states private static final int TGT = 1; private static final int COL = 2; private static final int VAL = 3; SVMLightParser( ParseSetup ps, Key jobkey ) { super(ps, jobkey); } /** Try to parse the bytes as svm light format, return a ParseSetupHandler with type * SVMLight if the input is in svm light format, throw an exception otherwise. */ public static ParseSetup guessSetup(byte [] bytes) { // find the last eof int i = bytes.length-1; while(i > 0 && bytes[i] != '\n') --i; assert i >= 0; InputStream is = new ByteArrayInputStream(Arrays.copyOf(bytes,i)); SVMLightParser p = new SVMLightParser(new ParseSetup(SVMLight_INFO, ParseSetup.GUESS_SEP, false,ParseSetup.GUESS_HEADER,ParseSetup.GUESS_COL_CNT, null,null,null,null,null), null); SVMLightInspectParseWriter dout = new SVMLightInspectParseWriter(); try{ p.streamParse(is, dout); } catch(IOException e) { throw new RuntimeException(e); } if (dout._ncols > 0 && dout._nlines > 0 && dout._nlines > dout._invalidLines) return new ParseSetup(SVMLight_INFO, ParseSetup.GUESS_SEP, false,ParseSetup.NO_HEADER,dout._ncols,null,dout.guessTypes(),null,null,dout._data, dout.removeErrors()); else throw new ParseDataset.H2OParseException("Could not parse file as an SVMLight file."); } public static byte[] col_types(int ncols) { byte[] res = new byte[ncols]; Arrays.fill(res,Vec.T_NUM); return res; } final boolean isWhitespace(byte c){return c == ' ' || c == '\t';} @SuppressWarnings("fallthrough") @Override public final ParseWriter parseChunk(int cidx, final ParseReader din, final ParseWriter dout) { BufferedString _str = new BufferedString(); byte[] bits = din.getChunkData(cidx); if( bits == null ) return dout; final byte[] bits0 = bits; // Bits for chunk0 boolean firstChunk = true; // Have not rolled into the 2nd chunk byte[] bits1 = null; // Bits for chunk1, loaded lazily. int offset = 0; // General cursor into the giant array of bytes // Starting state. Are we skipping the first (partial) line, or not? Skip // a header line, or a partial line if we're in the 2nd and later chunks. int lstate = (cidx > 0)? SKIP_LINE : WHITESPACE_BEFORE_TOKEN; int gstate = TGT; long number = 0; int zeros = 0; int exp = 0; int sgnExp = 1; boolean decimal = false; int fractionDigits = 0; int colIdx = 0; byte c = bits[offset]; // skip comments for the first chunk (or if not a chunk) if( cidx == 0 ) { while (c == '#') { while ((offset < bits.length) && (bits[offset] != CHAR_CR) && (bits[offset ] != CHAR_LF)) ++offset; if ((offset+1 < bits.length) && (bits[offset] == CHAR_CR) && (bits[offset+1] == CHAR_LF)) ++offset; ++offset; if (offset >= bits.length) return dout; c = bits[offset]; } } MAIN_LOOP: while (true) { NEXT_CHAR: switch (lstate) { // --------------------------------------------------------------------- case SKIP_LINE: if (!isEOL(c)) break; // fall through case EOL: if (colIdx != 0) { colIdx = 0; if(lstate != SKIP_LINE) dout.newLine(); } if( !firstChunk ) break MAIN_LOOP; // second chunk only does the first row lstate = (c == CHAR_CR) ? EXPECT_COND_LF : POSSIBLE_EMPTY_LINE; gstate = TGT; break; // --------------------------------------------------------------------- case EXPECT_COND_LF: lstate = POSSIBLE_EMPTY_LINE; if (c == CHAR_LF) break; continue MAIN_LOOP; // --------------------------------------------------------------------- // --------------------------------------------------------------------- // --------------------------------------------------------------------- case POSSIBLE_EMPTY_LINE: if (isEOL(c)) { if (c == CHAR_CR) lstate = EXPECT_COND_LF; break; } lstate = WHITESPACE_BEFORE_TOKEN; // fallthrough to WHITESPACE_BEFORE_TOKEN // --------------------------------------------------------------------- case WHITESPACE_BEFORE_TOKEN: if (isWhitespace(c)) break; if (isEOL(c)){ lstate = EOL; continue MAIN_LOOP; } // fallthrough to TOKEN case TOKEN: if (((c >= '0') && (c <= '9')) || (c == '-') || (c == CHAR_DECIMAL_SEP) || (c == '+')) { lstate = NUMBER; number = 0; fractionDigits = 0; decimal = false; if (c == '-') { exp = -1; break; } else if(c == '+'){ exp = 1; break; } else { exp = 1; } // fallthrough } else if(c == 'q'){ lstate = QID0; } else { // failed, skip the line String err = "Unexpected character, expected number or qid, got '" + new String(Arrays.copyOfRange(bits, offset,Math.min(bits.length,offset+5))) + "...'"; dout.invalidLine(new ParseWriter.ParseErr(err,cidx,dout.lineNum(),offset + din.getGlobalByteOffset())); lstate = SKIP_LINE; continue MAIN_LOOP; } // fallthrough to NUMBER // --------------------------------------------------------------------- case NUMBER: if ((c >= '0') && (c <= '9')) { number = (number*10)+(c-'0'); if (number >= LARGEST_DIGIT_NUMBER) lstate = INVALID_NUMBER; break; } else if (c == CHAR_DECIMAL_SEP) { lstate = NUMBER_FRACTION; fractionDigits = offset; decimal = true; break; } else if ((c == 'e') || (c == 'E')) { lstate = NUMBER_EXP_START; sgnExp = 1; break; } if (exp == -1) { number = -number; } exp = 0; // fallthrough NUMBER_END case NUMBER_END: exp = exp - fractionDigits; switch(gstate){ case COL: if(c == ':'){ if(exp == 0 && number >= colIdx && (int)number == number){ colIdx = (int)number; gstate = VAL; lstate = WHITESPACE_BEFORE_TOKEN; } else { // wrong col Idx, just skip the token and try to continue // col idx is either too small (according to spec, cols must come in strictly increasing order) // or too small (col ids currently must fit into int) String err; if(number <= colIdx) err = "Columns come in non-increasing sequence. Got " + number + " after " + colIdx + ". Rest of the line is skipped."; else if(exp != 0) err = "Got non-integer as column id: " + number*PrettyPrint.pow10(exp) + ". Rest of the line is skipped."; else err = "column index out of range, " + number + " does not fit into integer." + " Rest of the line is skipped."; dout.invalidLine(new ParseWriter.ParseErr(err,cidx,dout.lineNum(),offset + din.getGlobalByteOffset())); lstate = SKIP_LINE; } } else { // we're probably out of sync, skip the rest of the line String err = "Unexpected character after column id: " + c; dout.invalidLine(new ParseWriter.ParseErr(err,cidx,dout.lineNum(),offset + din.getGlobalByteOffset())); lstate = SKIP_LINE; } break NEXT_CHAR; case TGT: case VAL: dout.addNumCol(colIdx++,number,exp); lstate = WHITESPACE_BEFORE_TOKEN; gstate = COL; continue MAIN_LOOP; } // --------------------------------------------------------------------- case NUMBER_FRACTION: if(c == '0'){ ++zeros; break; } if ((c > '0') && (c <= '9')) { if (number < LARGEST_DIGIT_NUMBER) { number = (number*PrettyPrint.pow10i(zeros+1))+(c-'0'); } else { String err = "number " + number + " is out of bounds."; dout.invalidLine(new ParseWriter.ParseErr(err,cidx,dout.lineNum(),offset + din.getGlobalByteOffset())); lstate = SKIP_LINE; } zeros = 0; break; } else if ((c == 'e') || (c == 'E')) { if (decimal) fractionDigits = offset - zeros - 1 - fractionDigits; lstate = NUMBER_EXP_START; sgnExp = 1; zeros = 0; break; } lstate = NUMBER_END; if (decimal) fractionDigits = offset - zeros - fractionDigits-1; if (exp == -1) { number = -number; } exp = 0; zeros = 0; continue MAIN_LOOP; // --------------------------------------------------------------------- case NUMBER_EXP_START: if (exp == -1) { number = -number; } exp = 0; if (c == '-') { sgnExp *= -1; break; } else if (c == '+'){ break; } if ((c < '0') || (c > '9')){ lstate = INVALID_NUMBER; continue MAIN_LOOP; } lstate = NUMBER_EXP; // fall through to NUMBER_EXP // --------------------------------------------------------------------- case NUMBER_EXP: if ((c >= '0') && (c <= '9')) { exp = (exp*10)+(c-'0'); break; } exp *= sgnExp; lstate = NUMBER_END; continue MAIN_LOOP; // --------------------------------------------------------------------- case INVALID_NUMBER: if(gstate == TGT) { // invalid tgt -> skip the whole row lstate = SKIP_LINE; String err = "invalid number (expecting target)"; dout.invalidLine(new ParseWriter.ParseErr(err,cidx,dout.lineNum(),offset + din.getGlobalByteOffset())); continue MAIN_LOOP; } if(gstate == VAL){ // add invalid value and skip until whitespace or eol dout.addInvalidCol(colIdx++); gstate = COL; } case QID0: if(c == 'i'){ lstate = QID1; break; } else { lstate = SKIP_TOKEN; break; } case QID1: if(c == 'd'){ lstate = SKIP_TOKEN; // skip qid for now break; } else { // TODO report an error lstate = SKIP_TOKEN; break; } // fall through case SKIP_TOKEN: if(isEOL(c)) lstate = EOL; else if(isWhitespace(c)) lstate = WHITESPACE_BEFORE_TOKEN; break; default: assert (false) : " We have wrong state "+lstate; } // end NEXT_CHAR ++offset; // do not need to adjust for offset increase here - the offset is set to tokenStart-1! if (offset < 0) { // Offset is negative? assert !firstChunk; // Caused by backing up from 2nd chunk into 1st chunk firstChunk = true; bits = bits0; offset += bits.length; _str.set(bits,offset,0); } else if (offset >= bits.length) { // Off end of 1st chunk? Parse into 2nd chunk // Attempt to get more data. if( firstChunk && bits1 == null ){ bits1 = din.getChunkData(cidx+1); // linePrefix = new String(Arrays.copyOfRange(bits, linestart, bits.length)); } // if we can't get further we might have been the last one and we must // commit the latest guy if we had one. if( !firstChunk || bits1 == null ) { // No more data available or allowed // If we are mid-parse of something, act like we saw a LF to end the // current token. if ((lstate != EXPECT_COND_LF) && (lstate != POSSIBLE_EMPTY_LINE)) { c = CHAR_LF; continue; } break; // Else we are just done } // Now parsing in the 2nd chunk. All offsets relative to the 2nd chunk start. firstChunk = false; if (lstate == NUMBER_FRACTION) fractionDigits -= bits.length; offset -= bits.length; bits = bits1; // Set main parsing loop bits if( bits[0] == CHAR_LF && lstate == EXPECT_COND_LF ) break; // when the first character we see is a line end } c = bits[offset]; } // end MAIN_LOOP return dout; } // -------------------------------------------------------- // Used for previewing datasets. // Fill with zeros not NAs, and grow columns on-demand. private static class SVMLightInspectParseWriter extends PreviewParseWriter { public SVMLightInspectParseWriter() { for (int i = 0; i < MAX_PREVIEW_LINES;++i) _data[i] = new String[MAX_PREVIEW_COLS]; for (String[] datum : _data) Arrays.fill(datum, "0"); } // Expand columns on-demand @Override public void addNumCol(int colIdx, long number, int exp) { _ncols = Math.max(_ncols,colIdx); if(colIdx < MAX_PREVIEW_COLS && _nlines < MAX_PREVIEW_LINES) _data[_nlines][colIdx] = Double.toString(number*PrettyPrint.pow10(exp)); } @Override public void addNumCol(int colIdx, double d) { _ncols = Math.max(_ncols,colIdx); if(colIdx < MAX_PREVIEW_COLS && _nlines < MAX_PREVIEW_LINES) _data[_nlines][colIdx] = Double.toString(d); } public byte[] guessTypes() { return col_types(_ncols); } } }