package water.parser;

import water.Futures;
import water.Iced;
import water.fvec.AppendableVec;
import water.fvec.NewChunk;
import water.fvec.Vec;
import water.util.ArrayUtils;

import java.util.Arrays;
import java.util.UUID;

/** Parsed data output specialized for fluid vecs.
 *  @author tomasnykodym
 */
public class FVecParseWriter extends Iced implements StreamParseWriter {
  protected AppendableVec[] _vecs;
  protected transient NewChunk[] _nvs;
  protected transient final Categorical[] _categoricals;
  protected transient final byte[] _ctypes;
  long _nLines;
  int _nCols;
  int _col = -1;
  final int _cidx;
  final int _chunkSize;
  ParseErr[] _errs = new ParseErr[0];
  private final Vec.VectorGroup _vg;
  private long _errCnt;

  public FVecParseWriter(Vec.VectorGroup vg, int cidx, Categorical[] categoricals,
                         byte[] ctypes, int chunkSize, AppendableVec[] avs) {
    _ctypes = ctypes;           // Required not-null
    _vecs = avs;
    _nvs = new NewChunk[avs.length];
    for( int i = 0; i < avs.length; ++i )
      _nvs[i] = _vecs[i].chunkForChunkIdx(cidx);
    _categoricals = categoricals;
    _nCols = avs.length;
    _cidx = cidx;
    _vg = vg;
    _chunkSize = chunkSize;
  }

  @Override public FVecParseWriter reduce(StreamParseWriter sdout) {
    FVecParseWriter dout = (FVecParseWriter)sdout;
    _nCols = Math.max(_nCols, dout._nCols); // SVMLight: max of columns
    if( _vecs != dout._vecs ) {
      if( dout._vecs.length > _vecs.length ) { // Swap longer one over the returned value
        AppendableVec[] tmpv = _vecs;
        _vecs = dout._vecs;
        dout._vecs = tmpv;
      }
      for( int i = 0; i < dout._vecs.length; ++i )
        _vecs[i].reduce(dout._vecs[i]);
    }
    _errCnt += dout._errCnt;
    if( _errs.length < 20 && dout._errs.length > 0 ) {
      _errs = ArrayUtils.append(_errs, dout._errs);
      if( _errs.length > 20 ) _errs = Arrays.copyOf(_errs, 20); // Cap stored errors at 20
    }
    return this;
  }

  @Override public FVecParseWriter close() {
    Futures fs = new Futures();
    close(fs);
    fs.blockForPending();
    return this;
  }

  @Override public FVecParseWriter close(Futures fs) {
    if( _nvs == null ) return this; // Might call close twice
    for( int i = 0; i < _nvs.length; i++ ) {
      _nvs[i].close(_cidx, fs);
      _nvs[i] = null;           // Free immediately, don't wait for all columns to close
    }
    _nvs = null;                // Free for GC
    return this;
  }

  @Override public FVecParseWriter nextChunk() {
    return new FVecParseWriter(_vg, _cidx+1, _categoricals, _ctypes, _chunkSize, _vecs);
  }

  @Override public void newLine() {
    if( _col >= 0 ) {
      ++_nLines;
      for( int i = _col+1; i < _nCols; ++i )
        addInvalidCol(i);       // Pad out the rest of the row with NAs
    }
    _col = -1;
  }

  @Override public void addNumCol(int colIdx, long number, int exp) {
    if( colIdx < _nCols ) {
      _nvs[_col = colIdx].addNum(number, exp);
      if( _ctypes != null && _ctypes[colIdx] == Vec.T_BAD ) _ctypes[colIdx] = Vec.T_NUM;
    }
  }

  @Override public final void addInvalidCol(int colIdx) {
    if( colIdx < _nCols ) _nvs[_col = colIdx].addNA();
  }

  @Override public boolean isString(int colIdx) {
    return (colIdx < _nCols) && (_ctypes[colIdx] == Vec.T_CAT || _ctypes[colIdx] == Vec.T_STR);
  }

  @Override public void addStrCol(int colIdx, BufferedString str) {
    if( colIdx < _nvs.length ) {
      if( _ctypes[colIdx] == Vec.T_NUM ) { // Support enforced types
        addInvalidCol(colIdx);
        return;
      }
      if( _ctypes[colIdx] == Vec.T_BAD && ParseTime.isTime(str) ) _ctypes[colIdx] = Vec.T_TIME;
      if( _ctypes[colIdx] == Vec.T_BAD && ParseUUID.isUUID(str) ) _ctypes[colIdx] = Vec.T_UUID;

      if( _ctypes[colIdx] == Vec.T_TIME ) {
        long l = ParseTime.attemptTimeParse(str);
        if( l == Long.MIN_VALUE ) addInvalidCol(colIdx);
        else {
          addNumCol(colIdx, l, 0); // Record time in msec
          _nvs[_col]._timCnt++;    // Count histo of time parse patterns
        }
      } else if( _ctypes[colIdx] == Vec.T_UUID ) { // UUID column? Only allow UUID parses
        UUID uuid = ParseUUID.attemptUUIDParse(str);
        // FIXME: what if colIdx > _nCols
        if( colIdx < _nCols )
          _nvs[_col = colIdx].addUUID(uuid);
      } else if( _ctypes[colIdx] == Vec.T_STR ) {
        _nvs[_col = colIdx].addStr(str);
      } else { // Categoricals
        if( !_categoricals[colIdx].isMapFull() ) {
          int id = _categoricals[_col = colIdx].addKey(str);
          if( _ctypes[colIdx] == Vec.T_BAD && id > 1 ) _ctypes[colIdx] = Vec.T_CAT;
          if( _ctypes[colIdx] == Vec.T_CAT ) {
            // Once the column is known to be categorical, store only the integer id
            // (more efficient than also remembering that the value was categorical).
            _nvs[colIdx].addNum(id, 0);
          } else
            _nvs[colIdx].addCategorical(id);
        } else { // Maxed out categorical map
          throw new ParseDataset.H2OParseException("Exceeded categorical limit on column #" + (colIdx+1)
              + " (using 1-based indexing). Consider reparsing this column as a string.");
        }
      }
    }
  }

  /** Adds a double value to the column. */
  @Override public void addNumCol(int colIdx, double value) {
    if( Double.isNaN(value) ) {
      addInvalidCol(colIdx);
    } else {
      // Decompose the double into a long mantissa and a decimal exponent,
      // e.g. 1.25 becomes (125, -2), i.e. 125*10^-2.
      double d = value;
      int exp = 0;
      long number = (long)d;
      while( number != d ) {
        d *= 10;
        --exp;
        number = (long)d;
      }
      addNumCol(colIdx, number, exp);
    }
  }

  @Override public void setColumnNames(String[] names) {}
  @Override public final void rollbackLine() {}

  @Override public void invalidLine(ParseErr err) {
    addErr(err);
    newLine();
  }

  @Override public void addError(ParseErr err) {
    if( _errs == null ) _errs = new ParseErr[]{err};
    else if( _errs.length < 20 ) _errs = ArrayUtils.append(_errs, err);
    _errCnt++;
  }

  @Override public void setIsAllASCII(int colIdx, boolean b) {
    if( colIdx < _nvs.length ) _nvs[colIdx]._isAllASCII = b;
  }

  @Override public boolean hasErrors() { return _errs != null && _errs.length > 0; }

  @Override public ParseErr[] removeErrors() {
    ParseErr[] res = _errs;
    _errs = null;
    return res;
  }

  @Override public long lineNum() { return _nLines; }

  public void addErr(ParseErr err) {
    if( _errs == null ) _errs = new ParseErr[]{err}; // removeErrors() may have nulled the array
    else if( _errs.length < 20 ) _errs = ArrayUtils.append(_errs, err);
    ++_errCnt;
  }
}
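
// A minimal usage sketch (an illustration, not part of the original file): a
// parser typically creates one writer per input chunk and drives it row by
// row. The locals vg, cidx, categoricals, ctypes, chunkSize and avs below are
// hypothetical placeholders; in H2O they come from the parse setup.
//
//   FVecParseWriter dout = new FVecParseWriter(vg, cidx, categoricals, ctypes, chunkSize, avs);
//   dout.addNumCol(0, 125, -2);                     // column 0: 125*10^-2 == 1.25
//   dout.addStrCol(1, new BufferedString("blue"));  // column 1: dispatched by _ctypes[1]
//   dout.newLine();                                 // end row; pads trailing columns with NAs
//   dout.close();                                   // flush NewChunks into the AppendableVecs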