package water.parser;

import java.io.IOException;
import java.io.InputStream;
import java.util.*;

import water.*;
import water.fvec.ParseDataset2.ParseProgressMonitor;

public abstract class CustomParser extends Iced {
  public static final byte CHAR_TAB = '\t';
  public static final byte CHAR_LF = 10;
  public static final byte CHAR_SPACE = ' ';
  public static final byte CHAR_CR = 13;
  public static final byte CHAR_DOUBLE_QUOTE = '"';
  public static final byte CHAR_SINGLE_QUOTE = '\'';

  public final static int MAX_PREVIEW_COLS = 100;
  public final static int MAX_PREVIEW_LINES = 50;
  public final static int STRING_DOMINANCE_RATIO = 4;

  public final ParserSetup _setup;

  public CustomParser(ParserSetup setup) { _setup = setup; }

  public static class PSetupGuess extends Iced {
    public final ParserSetup _setup;
    public final int _invalidLines;
    public final int _validLines;
    public final String[] _errors;
    public Key _setupFromFile;
    public Key _hdrFromFile;
    public String[][] _data;
    public final boolean _isValid;

    public PSetupGuess(ParserSetup ps, int vlines, int ilines, String[][] data, boolean isValid, String[] errors) {
      _setup = ps;
      _invalidLines = ilines;
      _validLines = vlines;
      _errors = errors;
      _data = data;
      _isValid = isValid;
    }

    public Set<String> checkDupColumnNames() {
      return _setup.checkDupColumnNames();
    }

    public final boolean hasErrors() {
      return _errors != null && _errors.length > 0;
    }

    @Override public String toString() {
      if( !_isValid )
        return "Parser setup appears to be broken, got " + _setup.toString();
      else if( hasErrors() )
        return "Parser setup appears to work with some errors, got " + _setup.toString();
      else
        return "Parser setup working fine, got " + _setup.toString();
    }
  }

  public enum ParserType {
    AUTO(false), XLS(false), XLSX(false), CSV(true), SVMLight(true);
    public final boolean parallelParseSupported;
    ParserType(boolean par) { parallelParseSupported = par; }
  }

  public static class ParserSetup extends Iced implements Cloneable {
    public final ParserType _pType;
    public final byte _separator;
    public boolean _header;
    public boolean _hashHeader;
    public boolean _singleQuotes;
    public String[] _columnNames;
    public final int _ncols;

    public enum Coltype { NUM, ZERO, STR, AUTO, INVALID }

    public static class TypeInfo extends Iced {
      Coltype _type;
      ValueString _naStr = new ValueString("");
      boolean _strongGuess;

      public void merge(TypeInfo tinfo) {
        if( _type == Coltype.AUTO || (!_strongGuess && tinfo._strongGuess) ) {
          // copy over stuff from the other
          _type = tinfo._type;
          _naStr = tinfo._naStr;
          _strongGuess = tinfo._strongGuess;
        } else if( tinfo._type != Coltype.AUTO && !_strongGuess ) {
          tinfo._type = Coltype.INVALID;
        } // else just keep mine
      }
    }

    public String[][] domains;
    public double[] _min;
    public double[] _max;
    public int _nnums;
    public int _nstr;
    public int _missing;
    public int _nzeros;
    TypeInfo[] _types;

    public ParserSetup() {
      _pType = ParserType.AUTO;
      _separator = CsvParser.AUTO_SEP;
      _header = false;
      _hashHeader = false;
      _ncols = 0;
      _columnNames = null;
    }

    protected ParserSetup(ParserType t) {
      this(t, CsvParser.AUTO_SEP, 0, false, null, false);
    }

    public ParserSetup(ParserType t, byte sep, boolean header) {
      _pType = t;
      _separator = sep;
      _header = header;
      _hashHeader = false;
      _columnNames = null;
      _ncols = 0;
    }

    public ParserSetup(ParserType t, byte sep, boolean header, boolean hashHeader, boolean singleQuotes) {
      _pType = t;
      _separator = sep;
      _header = header || hashHeader;
      _hashHeader = hashHeader;
      _columnNames = null;
      _ncols = 0;
      _singleQuotes = singleQuotes;
    }
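    // Usage sketch (the argument values are illustrative, not taken from
    // any caller in this file): a setup for comma-separated data with a
    // header row can be built with the constructor above, then turned
    // into a concrete parser via parser() further below.
    //
    //   ParserSetup setup = new ParserSetup(ParserType.CSV, (byte)',',
    //                                       /*header*/      true,
    //                                       /*hashHeader*/  false,
    //                                       /*singleQuotes*/false);
    //   CustomParser p = setup.parser(); // dispatches to CsvParser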
    public ParserSetup(ParserType t, byte sep, int ncolumns, boolean header, String[] columnNames, boolean singleQuotes) {
      _pType = t;
      _separator = sep;
      _ncols = ncolumns;
      _header = header;
      _hashHeader = false;
      _columnNames = columnNames;
      _singleQuotes = singleQuotes;
    }

    public boolean isSpecified() {
      return _pType != ParserType.AUTO && _separator != CsvParser.AUTO_SEP && (_header || _ncols > 0);
    }

    public Set<String> checkDupColumnNames() {
      HashSet<String> conflictingNames = new HashSet<String>();
      // guard against a header-only setup with no names (e.g. after clone())
      if( _header && _columnNames != null ) {
        HashSet<String> uniqueNames = new HashSet<String>();
        for( String n : _columnNames )
          if( !uniqueNames.add(n) ) // add() returns false when n was already present
            conflictingNames.add(n);
      }
      return conflictingNames;
    }

    @Override public ParserSetup clone() {
      return new ParserSetup(_pType, _separator, _ncols, _header, null, _singleQuotes);
    }

    public boolean isCompatible(ParserSetup other) {
      if( other == null || _pType != other._pType ) return false;
      if( _pType == ParserType.CSV && (_separator != other._separator || _ncols != other._ncols) )
        return false;
      if( _types == null ) _types = other._types;
      else if( other._types != null )
        for( int i = 0; i < _types.length; ++i )
          _types[i].merge(other._types[i]);
      return true;
    }

    public CustomParser parser() {
      switch( this._pType ) {
      case CSV:      return new CsvParser(this);
      case SVMLight: return new SVMLightParser(this);
      case XLS:      return new XlsParser(this);
      default:       throw H2O.unimpl();
      }
    }

    @Override public String toString() {
      StringBuilder sb = new StringBuilder(_pType.name());
      switch( _pType ) {
      case SVMLight:
        sb.append(" data with (estimated) " + _ncols + " columns.");
        break;
      case CSV:
        sb.append(" data with " + _ncols + " columns using '" + (char)_separator + "' (\\" + _separator + ") as separator.");
        break;
      case XLS:
        sb.append(" data with " + _ncols + " columns.");
        break;
      case AUTO:
        sb.append("");
        break;
      default:
        throw H2O.unimpl();
      }
      return sb.toString();
    }
  }

  public boolean isCompatible(CustomParser p) {
    return _setup == p._setup || (_setup != null && _setup.isCompatible(p._setup));
  }

  public DataOut parallelParse(int cidx, final DataIn din, final DataOut dout) {
    throw new UnsupportedOperationException();
  }

  public boolean parallelParseSupported() { return false; }

  public DataOut streamParse(final InputStream is, final DataOut dout) throws Exception {
    if( _setup._pType.parallelParseSupported ) {
      StreamData din = new StreamData(is);
      int cidx = 0;
      while( is.available() > 0 )
        parallelParse(cidx++, din, dout);
      parallelParse(cidx++, din, dout); // Parse the remaining partial 32K buffer
    } else {
      throw H2O.unimpl();
    }
    return dout;
  }
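  // Usage sketch for the single-pass overload above (stream and file name
  // are hypothetical): any DataOut implementation can act as the sink,
  // e.g. the CustomInspectDataOut preview collector defined at the bottom
  // of this file.
  //
  //   CustomParser p = setup.parser();
  //   CustomInspectDataOut sink = new CustomInspectDataOut();
  //   p.streamParse(new java.io.FileInputStream("data.csv"), sink);
  //   String[][] preview = sink.data();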
  // ------------------------------------------------------------------------
  // Zipped file; no parallel decompression; decompress into local chunks,
  // parse local chunks; distribute chunks later.
  public DataOut streamParse(final InputStream is, final StreamDataOut dout, ParseProgressMonitor pmon) throws IOException {
    // All output into a fresh pile of NewChunks, one per column
    if( _setup._pType.parallelParseSupported ) {
      StreamData din = new StreamData(is);
      int cidx = 0;
      StreamDataOut nextChunk = dout;
      long lastProgress = pmon.progress();
      while( is.available() > 0 ) {
        if( pmon.progress() > lastProgress ) {
          lastProgress = pmon.progress();
          nextChunk.close();
          if( dout != nextChunk ) dout.reduce(nextChunk);
          nextChunk = nextChunk.nextChunk();
        }
        parallelParse(cidx++, din, nextChunk);
      }
      parallelParse(cidx++, din, nextChunk); // Parse the remaining partial 32K buffer
      nextChunk.close();
      if( dout != nextChunk ) dout.reduce(nextChunk);
    } else {
      throw H2O.unimpl();
    }
    return dout;
  }

  protected static final boolean isWhitespace(byte c) {
    return (c == CHAR_SPACE) || (c == CHAR_TAB);
  }

  protected static final boolean isEOL(byte c) {
    return (c == CHAR_LF) || (c == CHAR_CR);
  }

  public interface DataIn {
    // Get another chunk of byte data
    public abstract byte[] getChunkData( int cidx );
    public abstract int getChunkDataStart( int cidx );
    public abstract void setChunkDataStart( int cidx, int offset );
  }

  public interface DataOut extends Freezable {
    public void setColumnNames(String[] names);
    // Register a newLine from the parser
    public void newLine();
    // True if already forced into a string column (skip number parsing)
    public boolean isString(int colIdx);
    // Add a number column with given digits & exp
    public void addNumCol(int colIdx, long number, int exp);
    // Add a number column with a given double value
    public void addNumCol(int colIdx, double d);
    // Add an invalid / missing entry
    public void addInvalidCol(int colIdx);
    // Add a String column
    public void addStrCol(int colIdx, ValueString str);
    // Final rolling back of partial line
    public void rollbackLine();
    public void invalidLine(String err);
    public void invalidValue(int line, int col);
  }

  public interface StreamDataOut extends DataOut {
    StreamDataOut nextChunk();
    StreamDataOut reduce(StreamDataOut dout);
    StreamDataOut close();
    StreamDataOut close(Futures fs);
    int nChunks();
  }

  public static class StreamData implements CustomParser.DataIn {
    final transient InputStream _is;
    private byte[] _bits0 = new byte[2*1024*1024]; // allows for row lengths up to 2M
    private byte[] _bits1 = new byte[2*1024*1024];
    private int _cidx0 = -1, _cidx1 = -1; // Chunk #s
    private int _coff0 = -1, _coff1 = -1; // Last used byte in a chunk

    public StreamData(InputStream is) { _is = is; }

    @Override public byte[] getChunkData(int cidx) {
      if( cidx == _cidx0 ) return _bits0;
      if( cidx == _cidx1 ) return _bits1;
      assert cidx == _cidx0+1 || cidx == _cidx1+1;
      byte[] bits = _cidx0 < _cidx1 ? _bits0 : _bits1;
      if( _cidx0 < _cidx1 ) { _cidx0 = cidx; _coff0 = -1; }
      else                  { _cidx1 = cidx; _coff1 = -1; }
      // Read as much as the buffer will hold
      int off = 0;
      try {
        while( off < bits.length ) {
          int len = _is.read(bits, off, bits.length-off);
          if( len == -1 ) break;
          off += len;
        }
        assert off == bits.length || _is.available() <= 0;
      } catch( IOException ioe ) {
        throw new RuntimeException(ioe);
      }
      if( off == bits.length ) return bits;
      // Final read is short; cache the short-read
      byte[] bits2 = (off == 0) ? null : Arrays.copyOf(bits, off);
      if( _cidx0 == cidx ) _bits0 = bits2;
      else                 _bits1 = bits2;
      return bits2;
    }
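    // Note on the two-buffer scheme above: the parser may revisit at most
    // the previous chunk when a row straddles a chunk boundary, so exactly
    // two chunks are kept live. With 2MB buffers, a row beginning near the
    // end of chunk k is reassembled from the tail of one buffer and the
    // head of the other; asking for chunk k-1 at that point would trip the
    // assertion above, by design.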
    @Override public int getChunkDataStart(int cidx) {
      if( _cidx0 == cidx ) return _coff0;
      if( _cidx1 == cidx ) return _coff1;
      return 0;
    }

    @Override public void setChunkDataStart(int cidx, int offset) {
      if( _cidx0 == cidx ) _coff0 = offset;
      if( _cidx1 == cidx ) _coff1 = offset;
    }
  }

  public abstract CustomParser clone();

  public String[] headers() { return null; }

  protected static class TypeGuesserDataOut extends Iced implements DataOut {
    transient private HashSet<String>[] _domains;
    int[] _nnums;
    int[] _nstrings;
    int[] _nzeros;
    int _nlines = 0;
    final int _ncols;

    @SuppressWarnings("unchecked")
    public TypeGuesserDataOut(int ncols) {
      _ncols = ncols;
      _domains = new HashSet[ncols];
      _nzeros = new int[ncols];
      _nstrings = new int[ncols];
      _nnums = new int[ncols];
      for( int i = 0; i < ncols; ++i )
        _domains[i] = new HashSet<String>();
    }

    // TODO: ugly quick hack, needs revisit
    public ParserSetup.TypeInfo[] guessTypes() {
      ParserSetup.TypeInfo[] res = new ParserSetup.TypeInfo[_ncols];
      for( int i = 0; i < res.length; ++i )
        res[i] = new ParserSetup.TypeInfo();
      for( int i = 0; i < _ncols; ++i ) {
        if( _domains[i].size() <= 1 ) // only consider enums with multiple distinct strings (otherwise it's probably garbage or an NA marker)
          res[i]._type = ParserSetup.Coltype.NUM;
        else if( _nzeros[i] > 0 && Math.abs(_nzeros[i] + _nstrings[i] - _nlines) <= 1 ) {
          // enum with 0s for NAs
          res[i]._naStr = new ValueString("0");
          res[i]._type = ParserSetup.Coltype.STR;
          res[i]._strongGuess = true;
        } else if( _nstrings[i] >= STRING_DOMINANCE_RATIO*(_nnums[i]+_nzeros[i]) ) {
          // probably generic enum
          res[i]._type = ParserSetup.Coltype.STR;
        }
      }
      return res;
    }

    @Override public void setColumnNames(String[] names) {}
    @Override public void newLine() { ++_nlines; }
    @Override public boolean isString(int colIdx) { return false; }

    @Override public void addNumCol(int colIdx, long number, int exp) {
      if( colIdx < _nnums.length ) {
        if( number == 0 ) ++_nzeros[colIdx];
        else              ++_nnums[colIdx];
      }
    }

    @Override public void addNumCol(int colIdx, double d) {
      if( colIdx < _nnums.length ) {
        if( d == 0 ) ++_nzeros[colIdx];
        else         ++_nnums[colIdx];
      }
    }

    @Override public void addInvalidCol(int colIdx) {}

    @Override public void addStrCol(int colIdx, ValueString str) {
      if( colIdx < _nstrings.length ) {
        ++_nstrings[colIdx];
        _domains[colIdx].add(str.toString());
      }
    }

    @Override public void rollbackLine() { --_nlines; }
    @Override public void invalidLine(String err) {}
    @Override public void invalidValue(int line, int col) {}
  }
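  // Worked example of the guessTypes() heuristic above (numbers are
  // illustrative): over 100 lines, a column with 60 zeros and 40 strings
  // spanning multiple distinct values satisfies |60 + 40 - 100| <= 1, so
  // it is strongly guessed as an enum whose NAs were written as "0". A
  // column with 90 such strings and 10 nonzero numbers instead satisfies
  // 90 >= 4*(10+0) (STRING_DOMINANCE_RATIO = 4) and is weakly guessed as
  // a generic enum.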
  protected static class CustomInspectDataOut extends Iced implements DataOut {
    public int _nlines;
    public int _ncols;
    public int _invalidLines;
    public boolean _header;
    private String[] _colNames;
    private String[][] _data = new String[MAX_PREVIEW_LINES][MAX_PREVIEW_COLS];
    transient ArrayList<String> _errors = new ArrayList<String>(); // must be initialized; invalidLine() appends to it

    public CustomInspectDataOut() {
      for( int i = 0; i < MAX_PREVIEW_LINES; ++i )
        Arrays.fill(_data[i], "NA");
    }

    public String[][] data() {
      // shrink the cached preview to the rows/columns actually seen
      String[][] res = Arrays.copyOf(_data, Math.min(MAX_PREVIEW_LINES, _nlines));
      for( int i = 0; i < res.length; ++i )
        res[i] = Arrays.copyOf(_data[i], Math.min(MAX_PREVIEW_COLS, _ncols));
      return (_data = res);
    }

    @Override public void setColumnNames(String[] names) {
      _colNames = names;
      _data[0] = names;
      ++_nlines;
      _ncols = names.length;
      _header = true;
    }

    @Override public void newLine() { ++_nlines; }
    @Override public boolean isString(int colIdx) { return false; }

    @Override public void addNumCol(int colIdx, long number, int exp) {
      // stay inside the fixed preview buffer: _ncols may exceed MAX_PREVIEW_COLS
      if( colIdx < _ncols && colIdx < MAX_PREVIEW_COLS && _nlines < MAX_PREVIEW_LINES )
        _data[_nlines][colIdx] = Double.toString(number*PrettyPrint.pow10(exp));
    }

    @Override public void addNumCol(int colIdx, double d) {
      if( colIdx < _ncols && colIdx < MAX_PREVIEW_COLS && _nlines < MAX_PREVIEW_LINES )
        _data[_nlines][colIdx] = Double.toString(d);
    }

    @Override public void addInvalidCol(int colIdx) {
      if( colIdx < _ncols && colIdx < MAX_PREVIEW_COLS && _nlines < MAX_PREVIEW_LINES )
        _data[_nlines][colIdx] = "NA";
    }

    @Override public void addStrCol(int colIdx, ValueString str) {
      if( colIdx < _ncols && colIdx < MAX_PREVIEW_COLS && _nlines < MAX_PREVIEW_LINES )
        _data[_nlines][colIdx] = str.toString();
    }

    @Override public void rollbackLine() { --_nlines; }

    @Override public void invalidLine(String err) {
      ++_invalidLines;
      _errors.add("Error at line: " + _nlines + ", reason: " + err);
    }

    @Override public void invalidValue(int linenum, int colnum) {}
  }
}
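// End-to-end sketch of the type-guessing pass this class supports (the
// file name and column count are hypothetical): parse once into a
// TypeGuesserDataOut sink, then read back the per-column guesses. The
// call sits in a context declaring throws Exception, as streamParse does.
//
//   ParserSetup setup = new ParserSetup(ParserType.CSV, (byte)',', true);
//   CustomParser p = setup.parser();
//   TypeGuesserDataOut guesser = new TypeGuesserDataOut(/*ncols*/ 10);
//   p.streamParse(new java.io.FileInputStream("data.csv"), guesser);
//   ParserSetup.TypeInfo[] types = guesser.guessTypes();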