PreviewParseWriter.java example

Explorer
h2o-3-master
package water.parser;

import water.Iced;
import water.fvec.Vec;
import water.util.ArrayUtils;
import water.util.IcedHashMap;

/**
 * A {@link ParseWriter} used on behalf of ParseSetup to examine the contents of a
 * file and guess the column types.
 *
 * <p>Instead of storing parsed values, it keeps per-column tallies of how many tokens
 * were numbers, zeros, strings, dates, UUIDs or invalid, the set of distinct string
 * tokens seen per column, and the textual form of the first
 * {@link #MAX_PREVIEW_LINES} lines.
 */
public class PreviewParseWriter extends Iced implements ParseWriter {
  protected final static int MAX_PREVIEW_COLS  = 100;
  protected final static int MAX_PREVIEW_LINES = 10;
  /** Once this many errors are retained, {@link #addError(ParseErr)} drops further ones. */
  private final static int MAX_ERRORS = 20;
  /** Distinct-to-total string ratio separating "almost all unique" from "has duplicates". */
  private final static double UNIQUE_RATIO = 0.95;

  protected int _nlines;
  protected int _ncols;
  protected int _invalidLines;
  private   String []   _colNames;
  protected String [][] _data = new String[MAX_PREVIEW_LINES][];
  private IcedHashMap<String,String>[] _domains;  // used in lieu of a HashSet; only the keys matter
  int [] _nnums;
  int [] _nstrings;
  int [] _ndates;
  int [] _nUUID;
  int [] _nzeros;
  int [] _nempty;

  protected ParseErr [] _errs  = new ParseErr[0];

  /** Creates a writer whose column count is fixed later by {@link #setColumnNames(String[])}. */
  protected PreviewParseWriter() {}

  /**
   * Creates a writer with the per-column counters sized up front.
   *
   * @param ncols number of columns; a value {@code <= 0} leaves the writer uninitialized
   */
  protected PreviewParseWriter(int ncols) { setColumnCount(ncols); }

  /** @return the array last passed to {@link #setColumnNames(String[])}, or {@code null} */
  String[] colNames() { return _colNames; }

  /**
   * Records the header row: remembers the names, stores them as preview row 0 and
   * counts the header as one line.
   *
   * @param names column names; its length fixes the column count if none is set yet
   */
  @Override public void setColumnNames(String[] names) {
    _colNames = names;
    // Size the columns BEFORE storing the header: the first successful setColumnCount
    // call re-allocates every preview row and would otherwise wipe out row 0.
    setColumnCount(names.length);
    _data[0] = names;
    ++_nlines;
  }

  /**
   * Allocates the per-column counters, domains and preview rows. Only the first call
   * with {@code n > 0} has any effect; later calls are ignored, so the column count
   * cannot be grown or shrunk once set.
   *
   * @param n number of columns
   */
  @SuppressWarnings("unchecked") // generic array creation; every slot is filled with a typed map below
  private void setColumnCount(int n) {
    if (_ncols != 0 || n <= 0) {
      return;
    }
    _ncols = n;
    _nzeros = new int[n];
    _nstrings = new int[n];
    _nUUID = new int[n];
    _ndates = new int[n];
    _nnums = new int[n];
    _nempty = new int[n];
    _domains = new IcedHashMap[n];
    for (int col = 0; col < n; ++col) {
      _domains[col] = new IcedHashMap<>();
    }
    for (int row = 0; row < MAX_PREVIEW_LINES; row++) {
      _data[row] = new String[n];
    }
  }

  /** Advances the current line counter. */
  @Override public void newLine() { ++_nlines; }

  /** @return always {@code false}; no column is reported as a string column */
  @Override public boolean isString(int colIdx) { return false; }

  /**
   * Tallies a numeric token given as mantissa and decimal exponent. Columns at or
   * beyond the column count are ignored.
   *
   * @param colIdx column index
   * @param number mantissa; a value of 0 is counted as a zero regardless of {@code exp}
   * @param exp    power-of-ten exponent applied to {@code number} for the preview text
   */
  @Override public void addNumCol(int colIdx, long number, int exp) {
    if (colIdx >= _ncols) {
      return;
    }
    countNumber(colIdx, number == 0);
    if (_nlines < MAX_PREVIEW_LINES) {
      _data[_nlines][colIdx] = Double.toString(number * water.util.PrettyPrint.pow10(exp));
    }
  }

  /**
   * Tallies a numeric token given as a double. Columns at or beyond the column count
   * are ignored.
   *
   * @param colIdx column index
   * @param d      parsed value
   */
  @Override public void addNumCol(int colIdx, double d) {
    if (colIdx >= _ncols) {
      return;
    }
    countNumber(colIdx, d == 0);
    if (_nlines < MAX_PREVIEW_LINES) {
      _data[_nlines][colIdx] = Double.toString(d);
    }
  }

  /** Bumps the zero counter or the non-zero number counter of a column. */
  private void countNumber(int colIdx, boolean isZero) {
    if (isZero) {
      ++_nzeros[colIdx];
    } else {
      ++_nnums[colIdx];
    }
  }

  /**
   * Tallies an invalid token for a column and shows it as {@code "NA"} in the preview.
   * Columns at or beyond the column count are ignored.
   *
   * @param colIdx column index
   */
  @Override public void addInvalidCol(int colIdx) {
    if (colIdx >= _ncols) {
      return;
    }
    ++_nempty[colIdx];
    if (_nlines < MAX_PREVIEW_LINES) {
      _data[_nlines][colIdx] = "NA";
    }
  }

  /**
   * Classifies a string token as a date, a UUID or a plain string and tallies it.
   * Plain strings are also added to the column's domain and to the preview. Columns at
   * or beyond the column count are ignored.
   *
   * @param colIdx column index
   * @param str    token text
   */
  @Override public void addStrCol(int colIdx, BufferedString str) {
    if (colIdx >= _ncols) {
      return;
    }
    // NOTE(review): date and UUID tokens are counted but never written to _data, so
    // their preview cells stay null -- confirm this is intended.
    if (ParseTime.isTime(str)) {
      ++_ndates[colIdx];
      return;
    }
    if (ParseUUID.isUUID(str)) {
      ++_nUUID[colIdx];
      return;
    }

    // Remember the distinct value; guessTypes() later uses the domain to tell apart
    // string, NA and categorical columns.
    ++_nstrings[colIdx];
    _domains[colIdx].put(str.toString(), "");

    if (_nlines < MAX_PREVIEW_LINES) {
      _data[_nlines][colIdx] = str.toString();
    }
  }

  /** Steps the current line counter back by one. */
  @Override public void rollbackLine() { --_nlines; }

  /** No-op: this writer does not track ASCII-ness. */
  @Override public void setIsAllASCII(int colIdx, boolean b) {}

  /**
   * Guesses a {@code Vec} type code for every column from the accumulated tallies.
   *
   * @return array of length {@code _ncols} holding one type code per column
   */
  public byte[] guessTypes() {
    byte[] types = new byte[_ncols];
    for (int i = 0; i < _ncols; ++i) {
      types[i] = guessType(i);
    }
    return types;
  }

  /**
   * Guesses the type of a single column. The rules are tried in order and the first
   * match wins; the tests are deliberately redundant, favoring clarity over speed.
   *
   * @param i column index
   * @return the guessed {@code Vec} type code; {@code Vec.T_NUM} if no rule matches
   */
  private byte guessType(int i) {
    final int numeric = _nnums[i] + _nzeros[i];

    // Is it clearly numeric? Zeros are left out of the string comparison because a 0
    // can stand in for NA among categoricals.
    if (numeric >= _ndates[i]
            && numeric >= _nUUID[i]
            && _nnums[i] >= _nstrings[i]) {
      return Vec.T_NUM;
    }

    // All same string, and no dates?
    if (_domains[i].size() == 1 && _ndates[i] == 0) {
      // An obvious NA marker, or only a few instances of the single string: numeric.
      // Otherwise categorical.
      boolean naMarker = _domains[i].containsKey("NA")
              || _domains[i].containsKey("na")
              || _domains[i].containsKey("Na")
              || _domains[i].containsKey("N/A");
      if (naMarker || _nstrings[i] < numeric) {
        return Vec.T_NUM;
      }
      return Vec.T_CAT;
    }

    // With NA, but likely numeric
    if (_domains[i].size() <= 1 && numeric > _ndates[i] + _nUUID[i]) {
      return Vec.T_NUM;
    }

    // Datetime
    if (_ndates[i] > _nUUID[i]
            && _ndates[i] > numeric
            && (_ndates[i] > _nstrings[i] || _domains[i].size() <= 1)) {
      return Vec.T_TIME;
    }

    // UUID
    if (_nUUID[i] > _ndates[i]
            && _nUUID[i] > numeric
            && (_nUUID[i] > _nstrings[i] || _domains[i].size() <= 1)) {
      return Vec.T_UUID;
    }

    // Strings, almost no dups
    if (_nstrings[i] > _ndates[i]
            && _nstrings[i] > _nUUID[i]
            && _nstrings[i] > numeric
            && _domains[i].size() >= UNIQUE_RATIO * _nstrings[i]) {
      return Vec.T_STR;
    }

    // Categorical with 0s for NAs
    if (isCategoricalWithZeroNAs(i)) {
      return Vec.T_CAT;
    }

    // Categorical mixed with numbers: mostly strings, but not all unique
    if (_nstrings[i] >= numeric
            && _domains[i].size() <= UNIQUE_RATIO * _nstrings[i]) {
      return Vec.T_CAT;
    }

    // All guesses failed
    return Vec.T_NUM;
  }

  /**
   * Tests whether a column consists of just repeated strings plus zeros, i.e. the
   * zeros look like NA markers in a categorical column. Shared by
   * {@link #guessTypes()} and {@link #guessNAStrings(byte[])} so both apply the same rule.
   *
   * @param i column index
   * @return {@code true} if the column has zeros, zeros plus strings cover the
   *         non-empty lines, and the strings are not all unique
   */
  private boolean isCategoricalWithZeroNAs(int i) {
    // During guess, some columns may be shorted one line (based on 4M boundary)
    int nonemptyLines = _nlines - _nempty[i] - 1;
    return _nzeros[i] > 0
            && (_nzeros[i] + _nstrings[i]) >= nonemptyLines   // just strings and zeros (thus no empty lines)
            && _domains[i].size() <= UNIQUE_RATIO * _nstrings[i]; // not all unique strings
  }

  /**
   * Guesses per-column NA strings. For now this only catches {@code "0"} used as NA in
   * categorical columns.
   *
   * @param types per-column type codes, indexed like the columns of this writer
   * @return an array with a one-element {@code {"0"}} entry for every matching column
   *         and {@code null} entries elsewhere, or {@code null} if no column matched
   */
  public String[][] guessNAStrings(byte[] types) {
    String[][] naStrings = new String[_ncols][];
    boolean anyFound = false;
    for (int i = 0; i < _ncols; ++i) {
      if (types[i] == Vec.T_CAT && isCategoricalWithZeroNAs(i)) {
        naStrings[i] = new String[]{"0"};
        anyFound = true;
      }
    }
    return anyFound ? naStrings : null;
  }

  /**
   * Merges the tallies of two previews. If either argument is {@code null} the other
   * is returned as is. Otherwise line counts, invalid-line counts, per-column counters
   * and per-column domains of {@code prevB} are folded into {@code prevA}, which is
   * modified in place and returned. Preview rows and errors are not merged.
   *
   * @param prevA first preview, updated in place; may be {@code null}
   * @param prevB second preview; may be {@code null}
   * @return the merged preview ({@code prevA} unless it was {@code null})
   * @throws ParseDataset.H2OParseException if the two previews disagree on column count
   */
  public static PreviewParseWriter unifyColumnPreviews(PreviewParseWriter prevA, PreviewParseWriter prevB) {
    if (prevA == null) return prevB;
    if (prevB == null) return prevA;

    // sanity checks
    if (prevA._ncols != prevB._ncols)
      throw new ParseDataset.H2OParseException("Files conflict in number of columns. "
              + prevA._ncols + " vs. " + prevB._ncols + ".");

    prevA._nlines += prevB._nlines;
    prevA._invalidLines += prevB._invalidLines;
    for (int i = 0; i < prevA._ncols; i++) {
      prevA._nnums[i] += prevB._nnums[i];
      prevA._nstrings[i] += prevB._nstrings[i];
      prevA._ndates[i] += prevB._ndates[i];
      prevA._nUUID[i] += prevB._nUUID[i];
      prevA._nzeros[i] += prevB._nzeros[i];
      prevA._nempty[i] += prevB._nempty[i];

      if (prevB._domains[i] == null) {
        continue; // nothing to merge for this column
      }
      if (prevA._domains[i] == null) {
        // Adopt B's domain for THIS column only. Replacing the whole _domains array
        // here would throw away every column that has already been merged.
        prevA._domains[i] = prevB._domains[i];
      } else {
        for (String s : prevB._domains[i].keySet()) {
          prevA._domains[i].put(s, "");
        }
      }
    }
    return prevA;
  }

  /**
   * Records an error for an invalid line and counts the line as invalid.
   *
   * @param err description of the problem
   */
  @Override
  public void invalidLine(ParseErr err) {
    addError(err);
    ++_invalidLines;
  }

  /**
   * Retains an error, up to {@code MAX_ERRORS} in total; further errors are dropped.
   *
   * @param err description of the problem
   */
  @Override
  public void addError(ParseErr err) {
    if (_errs == null) {
      _errs = new ParseErr[]{err};
    } else if (_errs.length < MAX_ERRORS) {
      _errs = ArrayUtils.append(_errs, err);
    }
  }

  /** @return {@code true} if at least one error has been retained */
  @Override
  public boolean hasErrors() { return _errs != null && _errs.length > 0; }

  /**
   * Returns the retained errors.
   *
   * <p>NOTE(review): despite the name, the errors are not cleared here -- confirm
   * whether callers expect them to be removed.
   *
   * @return the internal error array (not a copy)
   */
  @Override
  public ParseErr[] removeErrors() { return _errs; }

  /** @return the current line counter */
  @Override
  public long lineNum() { return _nlines; }
}