package water.parser; import water.Iced; import water.fvec.Vec; import water.util.ArrayUtils; import water.util.IcedHashMap; /** Class implementing ParseWriter, on behalf ParseSetup * to examine the contents of a file for guess the column types. */ public class PreviewParseWriter extends Iced implements ParseWriter { protected final static int MAX_PREVIEW_COLS = 100; protected final static int MAX_PREVIEW_LINES = 10; protected int _nlines; protected int _ncols; protected int _invalidLines; private String [] _colNames; protected String [][] _data = new String[MAX_PREVIEW_LINES][]; private IcedHashMap<String,String>[] _domains; //used in leiu of a HashSet int [] _nnums; int [] _nstrings; int [] _ndates; int [] _nUUID; int [] _nzeros; int [] _nempty; protected ParseErr [] _errs = new ParseErr[0]; protected PreviewParseWriter() {} protected PreviewParseWriter(int ncols) { setColumnCount(ncols); } String[] colNames() { return _colNames; } @Override public void setColumnNames(String[] names) { _colNames = names; _data[0] = names; ++_nlines; setColumnCount(names.length); } private void setColumnCount(int n) { // initialize if (_ncols == 0 && n > 0) { _ncols = n; _nzeros = new int[n]; _nstrings = new int[n]; _nUUID = new int[n]; _ndates = new int[n]; _nnums = new int[n]; _nempty = new int[n]; _domains = new IcedHashMap[n]; for(int i = 0; i < n; ++i) _domains[i] = new IcedHashMap<>(); for(int i =0; i < MAX_PREVIEW_LINES; i++) _data[i] = new String[n]; } /*else if (n > _ncols) { // resize _nzeros = Arrays.copyOf(_nzeros, n); _nstrings = Arrays.copyOf(_nstrings, n); _nUUID = Arrays.copyOf(_nUUID, n); _ndates = Arrays.copyOf(_ndates, n); _nnums = Arrays.copyOf(_nnums, n); _nempty = Arrays.copyOf(_nempty, n); _domains = Arrays.copyOf(_domains, n); for (int i=_ncols; i < n; i++) _domains[i] = new HashSet<String>(); for(int i =0; i < MAX_PREVIEW_LINES; i++) _data[i] = Arrays.copyOf(_data[i], n); _ncols = n; }*/ } @Override public void newLine() { ++_nlines; } @Override public boolean isString(int colIdx) { return false; } @Override public void addNumCol(int colIdx, long number, int exp) { if(colIdx < _ncols) { if (number == 0) ++_nzeros[colIdx]; else ++_nnums[colIdx]; if (_nlines < MAX_PREVIEW_LINES) _data[_nlines][colIdx] = Double.toString(number * water.util.PrettyPrint.pow10(exp)); } } @Override public void addNumCol(int colIdx, double d) { if(colIdx < _ncols) { if (d == 0) ++_nzeros[colIdx]; else ++_nnums[colIdx]; if (_nlines < MAX_PREVIEW_LINES) _data[_nlines][colIdx] = Double.toString(d); } } @Override public void addInvalidCol(int colIdx) { if(colIdx < _ncols) { ++_nempty[colIdx]; if (_nlines < MAX_PREVIEW_LINES) _data[_nlines][colIdx] = "NA"; } } @Override public void addStrCol(int colIdx, BufferedString str) { if(colIdx < _ncols) { // Check for time if (ParseTime.isTime(str)) { ++_ndates[colIdx]; return; } //Check for UUID if(ParseUUID.isUUID(str)) { ++_nUUID[colIdx]; return; } //Add string to domains list for later determining string, NA, or categorical ++_nstrings[colIdx]; _domains[colIdx].put(str.toString(),""); if (_nlines < MAX_PREVIEW_LINES) _data[_nlines][colIdx] = str.toString(); } } @Override public void rollbackLine() {--_nlines;} @Override public void setIsAllASCII(int colIdx, boolean b) {} public byte[] guessTypes() { byte[] types = new byte[_ncols]; for (int i = 0; i < _ncols; ++i) { int nonemptyLines = _nlines - _nempty[i] - 1; //During guess, some columns may be shorted one line based on 4M boundary //Very redundant tests, but clearer and not speed critical // is it clearly numeric? if ((_nnums[i] + _nzeros[i]) >= _ndates[i] && (_nnums[i] + _nzeros[i]) >= _nUUID[i] && _nnums[i] >= _nstrings[i]) { // 0s can be an NA among categoricals, ignore types[i] = Vec.T_NUM; continue; } // All same string or empty? if( _domains[i].size() == 1 && _ndates[i]==0 ) { // Obvious NA, or few instances of the single string, declare numeric // else categorical types[i] = (_domains[i].containsKey("NA") || _domains[i].containsKey("na") || _domains[i].containsKey("Na") || _domains[i].containsKey("N/A") || _nstrings[i] < _nnums[i]+_nzeros[i]) ? Vec.T_NUM : Vec.T_CAT; continue; } // with NA, but likely numeric if (_domains[i].size() <= 1 && (_nnums[i] + _nzeros[i]) > _ndates[i] + _nUUID[i]) { types[i] = Vec.T_NUM; continue; } // Datetime if (_ndates[i] > _nUUID[i] && _ndates[i] > (_nnums[i] + _nzeros[i]) && (_ndates[i] > _nstrings[i] || _domains[i].size() <= 1)) { types[i] = Vec.T_TIME; continue; } // UUID if (_nUUID[i] > _ndates[i] && _nUUID[i] > (_nnums[i] + _nzeros[i]) && (_nUUID[i] > _nstrings[i] || _domains[i].size() <= 1)) { types[i] = Vec.T_UUID; continue; } // Strings, almost no dups if (_nstrings[i] > _ndates[i] && _nstrings[i] > _nUUID[i] && _nstrings[i] > (_nnums[i] + _nzeros[i]) && _domains[i].size() >= 0.95 * _nstrings[i]) { types[i] = Vec.T_STR; continue; } // categorical or string? // categorical with 0s for NAs if(_nzeros[i] > 0 && ((_nzeros[i] + _nstrings[i]) >= nonemptyLines) //just strings and zeros for NA (thus no empty lines) && (_domains[i].size() <= 0.95 * _nstrings[i]) ) { // not all unique strings types[i] = Vec.T_CAT; continue; } // categorical mixed with numbers if(_nstrings[i] >= (_nnums[i]+_nzeros[i]) // mostly strings && (_domains[i].size() <= 0.95 * _nstrings[i]) ) { // but not all unique types[i] = Vec.T_CAT; continue; } // All guesses failed types[i] = Vec.T_NUM; } return types; } public String[][] guessNAStrings(byte[] types) { //For now just catch 0's as NA in categoricals String[][] naStrings = new String[_ncols][]; boolean empty = true; for (int i = 0; i < _ncols; ++i) { int nonemptyLines = _nlines - _nempty[i] - 1; //During guess, some columns may be shorted one line (based on 4M boundary) if (types[i] == Vec.T_CAT && _nzeros[i] > 0 && ((_nzeros[i] + _nstrings[i]) >= nonemptyLines) //just strings and zeros for NA (thus no empty lines) && (_domains[i].size() <= 0.95 * _nstrings[i])) { // not all unique strings naStrings[i] = new String[1]; naStrings[i][0] = "0"; empty = false; } } if (empty) return null; else return naStrings; } public static PreviewParseWriter unifyColumnPreviews(PreviewParseWriter prevA, PreviewParseWriter prevB) { if (prevA == null) return prevB; else if (prevB == null) return prevA; else { //sanity checks if (prevA._ncols != prevB._ncols) throw new ParseDataset.H2OParseException("Files conflict in number of columns. " + prevA._ncols + " vs. " + prevB._ncols + "."); prevA._nlines += prevB._nlines; prevA._invalidLines += prevB._invalidLines; for (int i = 0; i < prevA._ncols; i++) { prevA._nnums[i] += prevB._nnums[i]; prevA._nstrings[i] += prevB._nstrings[i]; prevA._ndates[i] += prevB._ndates[i]; prevA._nUUID[i] += prevB._nUUID[i]; prevA._nzeros[i] += prevB._nzeros[i]; prevA._nempty[i] += prevB._nempty[i]; if (prevA._domains[i] != null) { if (prevB._domains[i] != null) for(String s:prevB._domains[i].keySet()) prevA._domains[i].put(s,""); } else if (prevB._domains[i] != null) prevA._domains = prevB._domains; } } return prevA; } @Override public void invalidLine(ParseErr err) { addError(err); ++_invalidLines; } @Override public void addError(ParseErr err) { if(_errs == null) _errs = new ParseErr[]{err}; else if(_errs.length < 20) _errs = ArrayUtils.append(_errs,err); } @Override public boolean hasErrors() {return _errs != null && _errs.length > 0;} @Override public ParseErr[] removeErrors() {return _errs;} @Override public long lineNum() {return _nlines;} }