package water.parser;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import water.Iced;
import water.PrettyPrint;
/**
* Parser for SVM light format.
* @author tomasnykodym
*
*/
public class SVMLightParser extends CustomParser{
// ---------------------------------------------------------------------------
// Scanner states: the values the parse loop's per-character state takes.
// ---------------------------------------------------------------------------
private static final byte SKIP_LINE               =  0;
private static final byte EXPECT_COND_LF          =  1;
private static final byte EOL                     =  2;
private static final byte TOKEN                   =  3;
private static final byte SKIP_TOKEN              =  4;
private static final byte NUMBER                  =  5;
private static final byte NUMBER_FRACTION         =  6;
private static final byte NUMBER_EXP              =  7;
private static final byte INVALID_NUMBER          =  8;
private static final byte NUMBER_EXP_START        =  9;
private static final byte NUMBER_END              = 10;
private static final byte WHITESPACE_BEFORE_TOKEN = 11;
private static final byte POSSIBLE_EMPTY_LINE     = 12;
private static final byte QID0                    = 13;
private static final byte QID1                    = 14;

// ---------------------------------------------------------------------------
// Line-global states: which part of a line the next number belongs to
// (the target, a column id, or the value that follows a column id).
// ---------------------------------------------------------------------------
private static final int TGT = 1;
private static final int COL = 2;
private static final int VAL = 3;

/** Threshold (10^18) at which the accumulated digits of a number are treated as out of range. */
private static final long LARGEST_DIGIT_NUMBER = 1000000000000000000L;

/** Character separating the integer part of a number from its fraction. */
static final char DECIMAL_SEP = '.';
/**
 * Creates a parser for the given setup; the setup is passed straight to the superclass constructor.
 * @param setup parser setup handed to {@link CustomParser}
 */
public SVMLightParser(ParserSetup setup) {
  super(setup);
}
/**
 * Creates a fresh parser instance built from this parser's {@code _setup}.
 * @return a new {@code SVMLightParser} sharing this parser's setup
 */
@Override
public SVMLightParser clone() {
  final SVMLightParser copy = new SVMLightParser(_setup);
  return copy;
}
/**
 * Reports whether this parser supports parallel parsing.
 * @return always {@code true}
 */
@Override
public boolean parallelParseSupported() {
  return true;
}
/**
 * Tries to parse the given bytes as SVMLight data and reports what was found.
 * <p>
 * Only the bytes before the last {@code '\n'} are parsed, so that a trailing, possibly truncated
 * line never reaches the parser. If nothing is left after the cut (empty input, no newline at all,
 * or a lone newline) the parser is not run and the returned guess carries zero lines and columns
 * and is flagged as not valid.
 *
 * @param bytes sample of the data to inspect
 * @return a guess describing the parsed sample; it is flagged valid only when at least one column
 *         and one line were seen and the accepted lines outnumber the invalid ones
 */
public static PSetupGuess guessSetup(byte [] bytes){
  // Find the last end-of-line; anything after it may be an incomplete line.
  int end = bytes.length - 1;
  while(end > 0 && bytes[end] != '\n') --end;
  InspectDataOut dout = new InspectDataOut();
  // end <= 0 means there is no complete line to look at (this includes bytes.length == 0, where
  // end is -1 and Arrays.copyOf would throw NegativeArraySizeException).
  if(end > 0) {
    InputStream is = new ByteArrayInputStream(Arrays.copyOf(bytes, end));
    SVMLightParser p = new SVMLightParser(new ParserSetup(ParserType.SVMLight, CsvParser.AUTO_SEP, false));
    try{p.streamParse(is, dout);}catch(Exception e){throw new RuntimeException(e);}
  }
  boolean valid = dout._ncols > 0 && dout._nlines > 0 && dout._nlines > dout._invalidLines;
  return new PSetupGuess(new ParserSetup(ParserType.SVMLight, CsvParser.AUTO_SEP, dout._ncols,false,null,false),dout._nlines,dout._invalidLines,dout.data(),valid,dout.errors());
}
/**
 * Tells whether the given parser is of the same kind as this one.
 * @param p parser to compare against
 * @return {@code true} iff {@code p} is an {@code SVMLightParser}
 */
@Override
public boolean isCompatible(CustomParser p) {
  final boolean sameKind = p instanceof SVMLightParser;
  return sameKind;
}
/**
 * Parses chunk {@code cidx} of SVMLight data with a character-level state machine and pushes the
 * result into {@code dout}.
 * <p>
 * Each line is read as a leading number (the target, stored at column 0) followed by
 * {@code <column>:<value>} pairs. A column id must be an integer that fits into {@code int} and is
 * not smaller than the next expected column; otherwise the line is reported through
 * {@code dout.invalidLine} and the rest of it is skipped. Tokens starting with {@code 'q'}
 * (e.g. {@code qid:...}) are skipped.
 * <p>
 * Chunk handling: for {@code cidx > 0} parsing starts in {@code SKIP_LINE}, i.e. the first
 * (partial) line is skipped. When the cursor runs off the end of the chunk, chunk {@code cidx+1}
 * is fetched and parsing continues there until the first line end, then stops. For chunk 0,
 * leading lines starting with {@code '#'} are skipped before parsing begins.
 * <p>
 * Convention used throughout the switch: {@code break NEXT_CHAR} consumes the current character
 * and moves to the next one, while {@code continue MAIN_LOOP} re-dispatches the <em>same</em>
 * character under the (usually just changed) state. A transition whose current character may be
 * whitespace or an end-of-line must use {@code continue MAIN_LOOP}, otherwise the separator is
 * silently swallowed.
 *
 * @param cidx index of the chunk to parse
 * @param din  source of chunk bytes
 * @param dout sink receiving parsed values, line ends and error reports
 * @return {@code dout}
 */
@SuppressWarnings("fallthrough")
@Override public final DataOut parallelParse(int cidx, final CustomParser.DataIn din, final CustomParser.DataOut dout) {
  ValueString _str = new ValueString();
  byte[] bits = din.getChunkData(cidx);
  // Nothing to parse; the length check also protects the unconditional bits[0] read below.
  if( bits == null || bits.length == 0 ) return dout;
  final byte[] bits0 = bits;  // Bits for chunk0
  boolean firstChunk = true;  // Have not rolled into the 2nd chunk
  byte[] bits1 = null;        // Bits for chunk1, loaded lazily.
  int offset = 0;             // General cursor into the giant array of bytes
  // Starting state.  Are we skipping the first (partial) line, or not?  Skip
  // a partial line if we're in the 2nd and later chunks.
  int lstate = (cidx > 0)? SKIP_LINE : WHITESPACE_BEFORE_TOKEN;
  int gstate = TGT;           // which part of the line the next number belongs to
  long number = 0;            // digits of the number being read, without decimal point
  int zeros = 0;              // pending (not yet applied) zeros seen in the fraction
  int exp = 0;                // doubles as sign marker (-1 / 1) until the digits are done, then the exponent
  int sgn_exp = 1;            // sign of the exponent
  boolean decimal = false;    // a decimal separator was seen in the current number
  int fractionDigits = 0;     // offset of the decimal separator first, later the number of fraction digits
  int colIdx = 0;             // next column to be written on the current line
  byte c = bits[offset];
  // skip comments for the first chunk (or if not a chunk)
  if( cidx == 0 ) {
    while (c == '#') {
      while ((offset < bits.length) && (bits[offset] != CHAR_CR) && (bits[offset  ] != CHAR_LF)) ++offset;
      if    ((offset+1 < bits.length) && (bits[offset] == CHAR_CR) && (bits[offset+1] == CHAR_LF)) ++offset;
      ++offset;
      if (offset >= bits.length)
        return dout;
      c = bits[offset];
    }
  }
  MAIN_LOOP:
  while (true) {
    NEXT_CHAR:
    switch (lstate) {
      // ---------------------------------------------------------------------
      case SKIP_LINE:
        if (!isEOL(c))
          break NEXT_CHAR;
        // fall through
      case EOL:
        // c is the end-of-line character here
        if (colIdx != 0) {
          colIdx = 0;
          if(lstate != SKIP_LINE) // a skipped line is not committed
            dout.newLine();
        }
        if( !firstChunk )
          break MAIN_LOOP; // second chunk only does the first row
        lstate = (c == CHAR_CR) ? EXPECT_COND_LF : POSSIBLE_EMPTY_LINE;
        gstate = TGT;
        break NEXT_CHAR;
      // ---------------------------------------------------------------------
      case EXPECT_COND_LF:
        // after CR: swallow an immediately following LF, otherwise re-dispatch c
        lstate = POSSIBLE_EMPTY_LINE;
        if (c == CHAR_LF)
          break NEXT_CHAR;
        continue MAIN_LOOP;
      // ---------------------------------------------------------------------
      case POSSIBLE_EMPTY_LINE:
        if (isEOL(c)) {
          if (c == CHAR_CR)
            lstate = EXPECT_COND_LF;
          break NEXT_CHAR;
        }
        lstate = WHITESPACE_BEFORE_TOKEN;
        // fallthrough to WHITESPACE_BEFORE_TOKEN
      // ---------------------------------------------------------------------
      case WHITESPACE_BEFORE_TOKEN:
        if (isWhitespace(c))
          break NEXT_CHAR;
        if (isEOL(c)){
          lstate = EOL;
          continue MAIN_LOOP;
        }
        // fallthrough to TOKEN
      case TOKEN:
        if (((c >= '0') && (c <= '9')) || (c == '-') || (c == DECIMAL_SEP) || (c == '+')) {
          lstate = NUMBER;
          number = 0;
          fractionDigits = 0;
          decimal = false;
          if (c == '-') {
            exp = -1;
            break NEXT_CHAR;
          } else if(c == '+'){
            exp = 1;
            break NEXT_CHAR;
          } else {
            exp = 1;
          }
          // fallthrough
        } else if(c == 'q'){
          // Start of a (possible) qid token.  Must not fall into NUMBER: 'q' is not part of a
          // number and would be reported as a malformed one.
          lstate = QID0;
          break NEXT_CHAR;
        } else { // failed, skip the line
          dout.invalidLine("Unexpected character, expected number or qid, got '" + new String(Arrays.copyOfRange(bits, offset,Math.min(bits.length,offset+5))) + "...'");
          lstate = SKIP_LINE;
          continue MAIN_LOOP;
        }
        // fallthrough to NUMBER
      // ---------------------------------------------------------------------
      case NUMBER:
        if ((c >= '0') && (c <= '9')) {
          number = (number*10)+(c-'0');
          if (number >= LARGEST_DIGIT_NUMBER)
            lstate = INVALID_NUMBER;
          break NEXT_CHAR;
        } else if (c == DECIMAL_SEP) {
          lstate = NUMBER_FRACTION;
          fractionDigits = offset; // remember where the fraction starts
          decimal = true;
          break NEXT_CHAR;
        } else if ((c == 'e') || (c == 'E')) {
          lstate = NUMBER_EXP_START;
          sgn_exp = 1;
          break NEXT_CHAR;
        }
        if (exp == -1) {
          number = -number;
        }
        exp = 0;
        // fallthrough NUMBER_END
      case NUMBER_END:
        exp = exp - fractionDigits;
        switch(gstate){
          case COL:
            if(c == ':'){
              if(exp == 0 && number >= colIdx && (int)number == number){
                colIdx = (int)number;
                gstate = VAL;
                lstate = WHITESPACE_BEFORE_TOKEN;
              } else {
                // wrong col Idx, skip the line.
                // col idx is either too small (cols must not go backwards), not an integer,
                // or too large (col ids currently must fit into int)
                String err = "";
                if(number < colIdx)
                  err = "Columns come in non-increasing sequence. Got " + number + " after " + colIdx + ".";
                else if(exp != 0)
                  err = "Got non-integer as column id: " + number*PrettyPrint.pow10(exp);
                else
                  err = "column index out of range, " + number + " does not fit into integer.";
                dout.invalidLine("invalid column id:" + err);
                lstate = SKIP_LINE;
              }
              break NEXT_CHAR; // the ':' itself is consumed
            }
            // we're probably out of sync, skip the rest of the line
            dout.invalidLine("unexpected character after column id: " + c);
            lstate = SKIP_LINE;
            // Re-dispatch c: it may already be the end of this line, and consuming it here would
            // make SKIP_LINE swallow the following line as well.
            continue MAIN_LOOP;
          case TGT:
          case VAL:
            dout.addNumCol(colIdx++,number,exp);
            lstate = WHITESPACE_BEFORE_TOKEN;
            gstate = COL;
            continue MAIN_LOOP;
        }
      // ---------------------------------------------------------------------
      case NUMBER_FRACTION:
        if(c == '0'){
          ++zeros; // applied lazily, so trailing zeros never inflate the number
          break NEXT_CHAR;
        }
        if ((c > '0') && (c <= '9')) {
          if (number < LARGEST_DIGIT_NUMBER) {
            number = (number*PrettyPrint.pow10i(zeros+1))+(c-'0');
          } else {
            dout.invalidLine("number " + number + " is out of bounds.");
            lstate = SKIP_LINE;
          }
          zeros = 0;
          break NEXT_CHAR;
        } else if ((c == 'e') || (c == 'E')) {
          if (decimal)
            fractionDigits = offset - zeros - 1 - fractionDigits;
          lstate = NUMBER_EXP_START;
          sgn_exp = 1;
          zeros = 0;
          break NEXT_CHAR;
        }
        lstate = NUMBER_END;
        if (decimal)
          fractionDigits = offset - zeros - fractionDigits-1;
        if (exp == -1) {
          number = -number;
        }
        exp = 0;
        zeros = 0;
        continue MAIN_LOOP;
      // ---------------------------------------------------------------------
      case NUMBER_EXP_START:
        if (exp == -1) {
          number = -number;
        }
        exp = 0;
        if (c == '-') {
          sgn_exp *= -1;
          break NEXT_CHAR;
        } else if (c == '+'){
          break NEXT_CHAR;
        }
        if ((c < '0') || (c > '9')){
          lstate = INVALID_NUMBER;
          continue MAIN_LOOP;
        }
        lstate = NUMBER_EXP;  // fall through to NUMBER_EXP
      // ---------------------------------------------------------------------
      case NUMBER_EXP:
        if ((c >= '0') && (c <= '9')) {
          exp = (exp*10)+(c-'0');
          break NEXT_CHAR;
        }
        exp *= sgn_exp;
        lstate = NUMBER_END;
        continue MAIN_LOOP;
      // ---------------------------------------------------------------------
      case INVALID_NUMBER:
        if(gstate == TGT) { // invalid tgt -> skip the whole row
          lstate = SKIP_LINE;
          dout.invalidLine("invalid number (expecting target)");
          continue MAIN_LOOP;
        }
        if(gstate == VAL){ // add invalid value and skip until whitespace or eol
          dout.addInvalidCol(colIdx++);
          gstate = COL;
        }
        // Skip the remainder of the token.  c has not been examined yet and may be the
        // whitespace or end-of-line that terminates it, so re-dispatch it.
        lstate = SKIP_TOKEN;
        continue MAIN_LOOP;
      // ---------------------------------------------------------------------
      case QID0: // saw 'q'
        if(c == 'i'){
          lstate = QID1;
          break NEXT_CHAR;
        }
        // not "qi": skip the token, letting SKIP_TOKEN look at c
        lstate = SKIP_TOKEN;
        continue MAIN_LOOP;
      case QID1: // saw "qi"
        lstate = SKIP_TOKEN; // skip qid for now
        if(c == 'd')
          break NEXT_CHAR;
        // TODO report an error
        continue MAIN_LOOP;
      // ---------------------------------------------------------------------
      case SKIP_TOKEN:
        if(isEOL(c)) {
          // Hand the end-of-line character itself to EOL; consuming it here would make EOL
          // run on (and discard) the first character of the next line.
          lstate = EOL;
          continue MAIN_LOOP;
        }
        if(isWhitespace(c))
          lstate = WHITESPACE_BEFORE_TOKEN;
        break NEXT_CHAR;
      default:
        assert (false) : " We have wrong state "+lstate;
    } // end NEXT_CHAR
    ++offset;
    if (offset < 0) {         // Offset is negative?
      assert !firstChunk;     // Caused by backing up from 2nd chunk into 1st chunk
      firstChunk = true;
      bits = bits0;
      offset += bits.length;
      _str.set(bits,offset,0);
    } else if (offset >= bits.length) { // Off end of 1st chunk?  Parse into 2nd chunk
      // Attempt to get more data.
      if( firstChunk && bits1 == null )
        bits1 = din.getChunkData(cidx+1);
      // if we can't get further we might have been the last one and we must
      // commit the latest guy if we had one.
      if( !firstChunk || bits1 == null ) { // No more data available or allowed
        // If we are mid-parse of something, act like we saw a LF to end the
        // current token.
        if ((lstate != EXPECT_COND_LF) && (lstate != POSSIBLE_EMPTY_LINE)) {
          c = CHAR_LF;  continue MAIN_LOOP;
        }
        break MAIN_LOOP;      // Else we are just done
      }
      // Now parsing in the 2nd chunk.  All offsets relative to the 2nd chunk start.
      firstChunk = false;
      if (lstate == NUMBER_FRACTION)
        fractionDigits -= bits.length; // keep the remembered separator offset relative to the new chunk
      offset -= bits.length;
      bits = bits1;           // Set main parsing loop bits
      if( bits[0] == CHAR_LF && lstate == EXPECT_COND_LF )
        break MAIN_LOOP; // when the first character we see is a line end
    }
    c = bits[offset];
  } // end MAIN_LOOP
  return dout;
}
/**
 * {@link DataOut} sink that only inspects the parsed data: it counts lines, invalid lines and the
 * largest column index seen, keeps a textual preview of at most {@link #MAX_LINES} x
 * {@link #MAX_COLS} cells and remembers the first few error messages.
 */
private static class InspectDataOut extends Iced implements DataOut {
  /** Number of lines committed through {@link #newLine()} (minus rollbacks). */
  public int _nlines;
  /** Largest column index passed to {@code addNumCol} so far. */
  public int _ncols;
  /** Number of lines reported through {@link #invalidLine(String)}. */
  public int _invalidLines;
  public final static int MAX_COLS = 100;
  public final static int MAX_LINES = 10;
  /** Preview cells; every cell starts out as "0" so that columns never mentioned on a line read as zero. */
  private String [][] _data = new String[MAX_LINES][MAX_COLS];
  transient ArrayList<String> _errors = new ArrayList<String>();

  public InspectDataOut() {
    for(int i = 0; i < MAX_LINES;++i)
      Arrays.fill(_data[i],"0");
  }

  /**
   * Returns the preview trimmed to the lines and columns actually seen. The trimmed array replaces
   * the internal one, so repeated calls return the same array.
   * @return preview cells, at most {@code MAX_LINES} rows of at most {@code MAX_COLS} columns
   */
  public String [][] data(){
    // Already trimmed (or nothing to trim).  The length check guards _data[0], which does not
    // exist once the preview has been trimmed to zero rows.
    if(_data.length <= _nlines && (_data.length == 0 || _data[0].length <= _ncols))
      return _data;
    // Clamp at 0: _nlines can go negative through rollbackLine().
    int rows = Math.max(0, Math.min(_data.length, _nlines));
    String [][] res = Arrays.copyOf(_data, rows);
    for(int i = 0; i < res.length; ++i)
      res[i] = Arrays.copyOf(_data[i], Math.min(_data[i].length,_ncols));
    return (_data = res);
  }

  /** Stores a preview cell for the current line; cells outside the preview window are dropped. */
  private void store(int colIdx, String value) {
    if(_nlines < 0 || _nlines >= _data.length) return;
    if(colIdx < 0 || colIdx >= _data[_nlines].length) return;
    _data[_nlines][colIdx] = value;
  }

  @Override public void setColumnNames(String[] names) {}
  @Override public void newLine() {
    ++_nlines;
  }
  @Override public boolean isString(int colIdx) {return false;}
  @Override public void addNumCol(int colIdx, long number, int exp) {
    _ncols = Math.max(_ncols,colIdx);
    store(colIdx, Double.toString(number*PrettyPrint.pow10(exp)));
  }
  @Override public void addNumCol(int colIdx, double d) {
    _ncols = Math.max(_ncols,colIdx);
    // Same bounds as the (long,int) overload: lines beyond the preview window are only counted.
    store(colIdx, Double.toString(d));
  }
  @Override public void addInvalidCol(int colIdx) {}
  @Override public void addStrCol(int colIdx, ValueString str) {}
  @Override public void rollbackLine() {--_nlines;}
  @Override public void invalidLine(String error) {
    ++_invalidLines;
    if(_errors.size() < 10) // keep only the first 10 messages
      _errors.add("error at line " + (_nlines +_invalidLines) + ", cause: " + error);
  }
  @Override public void invalidValue(int linenum, int colnum) {}

  /** @return the recorded error messages, in the order they were reported */
  public String [] errors(){
    String [] res = new String[_errors.size()];
    return _errors.toArray(res);
  }
}
}