package water.parser;
import java.io.IOException;
import java.io.InputStream;
import java.util.*;
import water.*;
import water.fvec.ParseDataset2.ParseProgressMonitor;
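// High-level flow (a sketch, not the only entry point): a concrete parser is produced
// from a ParserSetup and then driven over an InputStream, emitting cells and rows into a
// DataOut sink. The file name and FileInputStream source below are illustrative only:
//
//   ParserSetup setup = new ParserSetup(ParserType.CSV, (byte)',', /*header*/ true);
//   CustomParser parser = setup.parser();                     // e.g. a CsvParser
//   CustomInspectDataOut preview = new CustomInspectDataOut();
//   parser.streamParse(new FileInputStream("data.csv"), preview);
//   String[][] rows = preview.data();                         // bounded preview grid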
public abstract class CustomParser extends Iced {
public static final byte CHAR_TAB = '\t';
public static final byte CHAR_LF = 10;
public static final byte CHAR_SPACE = ' ';
public static final byte CHAR_CR = 13;
public static final byte CHAR_DOUBLE_QUOTE = '"';
public static final byte CHAR_SINGLE_QUOTE = '\'';
public final static int MAX_PREVIEW_COLS = 100;
public final static int MAX_PREVIEW_LINES = 50;
public final static int STRING_DOMINANCE_RATIO = 4;
public final ParserSetup _setup;
public CustomParser(ParserSetup setup){_setup = setup;}
public static class PSetupGuess extends Iced {
public final ParserSetup _setup;
public final int _invalidLines;
public final int _validLines;
public final String [] _errors;
public Key _setupFromFile;
public Key _hdrFromFile;
public String [][] _data;
public final boolean _isValid;
public PSetupGuess(ParserSetup ps, int vlines, int ilines, String [][] data, boolean isValid, String [] errors){
_setup = ps;
_invalidLines = ilines;
_validLines = vlines;
_errors = errors;
_data = data;
_isValid = isValid;
}
public Set<String> checkDupColumnNames(){
return _setup.checkDupColumnNames();
}
public final boolean hasErrors(){
return _errors != null && _errors.length > 0;
}
@Override public String toString(){
if(!_isValid)
return "Parser setup appears to be broken; best guess was: " + _setup.toString();
else if(hasErrors())
return "Parser setup works, but with some errors; guessed: " + _setup.toString();
else
return "Parser setup looks fine; guessed: " + _setup.toString();
}
}
public enum ParserType {
AUTO(false), XLS(false), XLSX(false), CSV(true), SVMLight(true);
public final boolean parallelParseSupported;
ParserType(boolean par){parallelParseSupported = par;}
}
public static class ParserSetup extends Iced implements Cloneable{
public final ParserType _pType;
public final byte _separator;
public boolean _header;
public boolean _hashHeader;
public boolean _singleQuotes;
public String [] _columnNames;
public final int _ncols;
public enum Coltype {
NUM,ZERO,STR,AUTO,INVALID;
}
public static class TypeInfo extends Iced{
Coltype _type;
ValueString _naStr = new ValueString("");
boolean _strongGuess;
public void merge(TypeInfo tinfo){
if(_type == Coltype.AUTO || (!_strongGuess && tinfo._strongGuess)){ // the other side knows more: adopt its type, NA string and guess strength
_type = tinfo._type;
_naStr = tinfo._naStr;
_strongGuess = tinfo._strongGuess;
} else if(tinfo._type != Coltype.AUTO && !_strongGuess){ // both sides hold weak, concrete guesses: invalidate the other side's
tinfo._type = Coltype.INVALID;
} // else keep this side's guess
}
}
public String [][] domains;
public double [] _min;
public double [] _max;
public int _nnums;
public int _nstr;
public int _missing;
public int _nzeros;
TypeInfo [] _types;
public ParserSetup() {
_pType = ParserType.AUTO;
_separator = CsvParser.AUTO_SEP;
_header = false;
_hashHeader = false;
_ncols = 0;
_columnNames = null;
}
protected ParserSetup(ParserType t) {
this(t,CsvParser.AUTO_SEP,0,false,null,false);
}
public ParserSetup(ParserType t, byte sep, boolean header) {
_pType = t;
_separator = sep;
_header = header;
_hashHeader = false;
_columnNames = null;
_ncols = 0;
}
public ParserSetup(ParserType t, byte sep, boolean header, boolean hashHeader, boolean singleQuotes) {
_pType = t;
_separator = sep;
_header = header || hashHeader;
_hashHeader = hashHeader;
_columnNames = null;
_ncols = 0;
_singleQuotes = singleQuotes;
}
public ParserSetup(ParserType t, byte sep, int ncolumns, boolean header, String [] columnNames, boolean singleQuotes) {
_pType = t;
_separator = sep;
_ncols = ncolumns;
_header = header;
_hashHeader = false;
_columnNames = columnNames;
_singleQuotes = singleQuotes;
}
public boolean isSpecified(){
return _pType != ParserType.AUTO && _separator != CsvParser.AUTO_SEP && (_header || _ncols > 0);
}
public Set<String> checkDupColumnNames(){
HashSet<String> uniqueNames = new HashSet<String>();
HashSet<String> conflictingNames = new HashSet<String>();
if(_header){
for(String n:_columnNames){
if(!uniqueNames.contains(n)){
uniqueNames.add(n);
} else {
conflictingNames.add(n);
}
}
}
return conflictingNames;
}
@Override public ParserSetup clone(){
return new ParserSetup(_pType, _separator, _ncols,_header,null,_singleQuotes);
}
public boolean isCompatible(ParserSetup other){
if(other == null || _pType != other._pType)return false;
if(_pType == ParserType.CSV && (_separator != other._separator || _ncols != other._ncols))
return false;
if(_types == null) _types = other._types;
else if(other._types != null){
for(int i = 0; i < _types.length; ++i)
_types[i].merge(other._types[i]);
}
return true;
}
public CustomParser parser(){
switch(this._pType){
case CSV:
return new CsvParser(this);
case SVMLight:
return new SVMLightParser(this);
case XLS:
return new XlsParser(this);
default:
throw H2O.unimpl();
}
}
@Override public String toString(){
StringBuilder sb = new StringBuilder(_pType.name());
switch(_pType){
case SVMLight:
sb.append(" data with (estimated) " + _ncols + " columns.");
break;
case CSV:
sb.append(" data with " + _ncols + " columns using '" + (char)_separator + "' (\\" + _separator + "04d) as separator.");
break;
case XLS:
sb.append(" data with " + _ncols + " columns.");
break;
case AUTO:
sb.append("");
break;
default:
throw H2O.unimpl();
}
return sb.toString();
}
}
public boolean isCompatible(CustomParser p){return _setup == p._setup || (_setup != null && _setup.isCompatible(p._setup));}
public DataOut parallelParse(int cidx, final DataIn din, final DataOut dout) {throw new UnsupportedOperationException();}
public boolean parallelParseSupported(){return false;}
public DataOut streamParse( final InputStream is, final DataOut dout) throws Exception {
if(_setup._pType.parallelParseSupported){
StreamData din = new StreamData(is);
int cidx=0;
while( is.available() > 0 )
parallelParse(cidx++,din,dout);
parallelParse(cidx++,din,dout); // Parse any data left in the final, partially-filled buffer
} else {
throw H2O.unimpl();
}
return dout;
}
// ------------------------------------------------------------------------
// Zipped file; no parallel decompression; decompress into local chunks,
// parse local chunks; distribute chunks later.
public DataOut streamParse( final InputStream is, final StreamDataOut dout, ParseProgressMonitor pmon) throws IOException {
// All output into a fresh pile of NewChunks, one per column
if(_setup._pType.parallelParseSupported){
StreamData din = new StreamData(is);
int cidx=0;
StreamDataOut nextChunk = dout;
long lastProgress = pmon.progress();
while( is.available() > 0 ){
if (pmon.progress() > lastProgress) {
lastProgress = pmon.progress();
nextChunk.close();
if(dout != nextChunk) dout.reduce(nextChunk);
nextChunk = nextChunk.nextChunk();
}
parallelParse(cidx++,din,nextChunk);
}
parallelParse(cidx++,din,nextChunk); // Parse any data left in the final, partially-filled buffer
nextChunk.close();
if(dout != nextChunk)dout.reduce(nextChunk);
} else {
throw H2O.unimpl();
}
return dout;
}
protected static final boolean isWhitespace(byte c) {
return (c == CHAR_SPACE) || (c == CHAR_TAB);
}
protected static final boolean isEOL(byte c) {
return ((c == CHAR_LF) || (c == CHAR_CR));
}
public interface DataIn {
// Get another chunk of byte data
public abstract byte[] getChunkData( int cidx );
// Offset within chunk cidx at which parsing should resume (set when the previous chunk's
// parse already consumed the leading bytes of this one); values < 0 mean "not set"
public abstract int getChunkDataStart( int cidx );
// Record how far into chunk cidx parsing has progressed
public abstract void setChunkDataStart( int cidx, int offset );
}
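// Sketch of the callback sequence a parser drives on a DataOut sink (the exact order may
// vary by concrete parser; this is only an illustration):
//   dout.setColumnNames(names);                         // once, if a header row is present
//   per data row: addNumCol / addStrCol / addInvalidCol for each cell, then newLine();
//   invalidLine(err) for unparseable rows, rollbackLine() to undo a partial trailing row.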
public interface DataOut extends Freezable {
public void setColumnNames(String [] names);
// Register a newLine from the parser
public void newLine();
// True if already forced into a string column (skip number parsing)
public boolean isString(int colIdx);
// Add a numeric cell given as unscaled digits and a decimal exponent (value = number * 10^exp)
public void addNumCol(int colIdx, long number, int exp);
// Add a numeric cell given directly as a double
public void addNumCol(int colIdx, double d);
// Add an invalid / missing entry
public void addInvalidCol(int colIdx);
// Add a String column
public void addStrCol( int colIdx, ValueString str );
// Final rolling back of partial line
public void rollbackLine();
public void invalidLine(String err);
public void invalidValue(int line, int col);
}
public interface StreamDataOut extends DataOut {
StreamDataOut nextChunk();
StreamDataOut reduce(StreamDataOut dout);
StreamDataOut close();
StreamDataOut close(Futures fs);
int nChunks();
}
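// DataIn backed by a raw InputStream: the stream is read into fixed-size byte buffers and
// handed to the parser one chunk index at a time. Two buffers stay resident so a row that
// straddles a chunk boundary can still be assembled; getChunkData(cidx) recycles whichever
// buffer holds the older chunk to serve the newly requested index.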
public static class StreamData implements CustomParser.DataIn {
final transient InputStream _is;
private byte[] _bits0 = new byte[2*1024*1024]; //allows for row lengths up to 2M
private byte[] _bits1 = new byte[2*1024*1024];
private int _cidx0=-1, _cidx1=-1; // Chunk #s
private int _coff0=-1, _coff1=-1; // Last used byte in a chunk
public StreamData(InputStream is){_is = is;}
@Override public byte[] getChunkData(int cidx) {
if(cidx == _cidx0)return _bits0;
if(cidx == _cidx1)return _bits1;
assert cidx==_cidx0+1 || cidx==_cidx1+1;
byte[] bits = _cidx0<_cidx1 ? _bits0 : _bits1;
if( _cidx0<_cidx1 ) { _cidx0 = cidx; _coff0 = -1; }
else { _cidx1 = cidx; _coff1 = -1; }
// Read as much as the buffer will hold
int off=0;
try {
while( off < bits.length ) {
int len = _is.read(bits,off,bits.length-off);
if( len == -1 ) break;
off += len;
}
assert off == bits.length || _is.available() <= 0;
} catch( IOException ioe ) {
throw new RuntimeException(ioe);
}
if( off == bits.length ) return bits;
// Final read is short; cache the short-read
byte[] bits2 = (off == 0) ? null : Arrays.copyOf(bits,off);
if( _cidx0==cidx ) _bits0 = bits2;
else _bits1 = bits2;
return bits2;
}
@Override public int getChunkDataStart(int cidx) {
if( _cidx0 == cidx ) return _coff0;
if( _cidx1 == cidx ) return _coff1;
return 0;
}
@Override public void setChunkDataStart(int cidx, int offset) {
if( _cidx0 == cidx ) _coff0 = offset;
if( _cidx1 == cidx ) _coff1 = offset;
}
}
public abstract CustomParser clone();
public String [] headers(){return null;}
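// DataOut sink used while guessing column types: it only tallies, per column, how many
// numeric, zero and string cells were seen (plus the distinct strings), so guessTypes()
// can apply its heuristics after a preview parse.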
protected static class TypeGuesserDataOut extends Iced implements DataOut {
transient private HashSet<String> [] _domains;
int [] _nnums;
int [] _nstrings;
int [] _nzeros;
int _nlines = 0;
final int _ncols;
public TypeGuesserDataOut(int ncols){
_ncols = ncols;
_domains = new HashSet[ncols];
_nzeros = new int[ncols];
_nstrings = new int[ncols];
_nnums = new int[ncols];
for(int i = 0; i < ncols; ++i)
_domains[i] = new HashSet<String>();
}
// TODO: ugly quick hack, needs revisit
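// Per-column heuristics, applied in order (a best-effort guess from the preview lines only):
//   1. fewer than two distinct strings seen    -> NUM (a lone string is likely junk or an NA marker)
//   2. zeros + strings account for ~all lines  -> STR, with "0" treated as the NA string (strong guess)
//   3. strings outnumber numeric cells (including zeros) by STRING_DOMINANCE_RATIO (4x) or more -> STR
//   otherwise the column's type is left undecided.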
public ParserSetup.TypeInfo[] guessTypes() {
ParserSetup.TypeInfo [] res = new ParserSetup.TypeInfo[_ncols];
for(int i = 0; i < res.length; ++i)
res[i] = new ParserSetup.TypeInfo();
for(int i = 0; i < _ncols; ++i){
if(_domains[i].size() <= 1) // fewer than two distinct strings: not a real enum (likely junk or an NA marker), default to numeric
res[i]._type = ParserSetup.Coltype.NUM;
else if(_nzeros[i] > 0 && (Math.abs(_nzeros[i] + _nstrings[i] - _nlines) <= 1)) { // enum with 0s for NAs
res[i]._naStr = new ValueString("0");
res[i]._type = ParserSetup.Coltype.STR;
res[i]._strongGuess = true;
} else if(_nstrings[i] >= STRING_DOMINANCE_RATIO*(_nnums[i]+_nzeros[i])) { // probably generic enum
res[i]._type = ParserSetup.Coltype.STR;
}
}
return res;
}
@Override
public void setColumnNames(String[] names) {}
@Override
public void newLine() {
++_nlines;
}
@Override
public boolean isString(int colIdx) {
return false;
}
@Override
public void addNumCol(int colIdx, long number, int exp) {
if(colIdx < _nnums.length) {
if (number == 0)
++_nzeros[colIdx];
else
++_nnums[colIdx];
}
}
@Override
public void addNumCol(int colIdx, double d) {
if(colIdx < _nnums.length) {
if (d == 0)
++_nzeros[colIdx];
else
++_nnums[colIdx];
}
}
@Override
public void addInvalidCol(int colIdx) {
}
@Override
public void addStrCol(int colIdx, ValueString str) {
if(colIdx < _nstrings.length) {
++_nstrings[colIdx];
_domains[colIdx].add(str.toString());
}
}
@Override
public void rollbackLine() {--_nlines;}
@Override
public void invalidLine(String err) {}
@Override
public void invalidValue(int line, int col) {}
}
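// DataOut sink that captures a bounded preview grid (up to MAX_PREVIEW_LINES x MAX_PREVIEW_COLS
// cells) of the parsed data, plus any per-line parse errors, so a proposed ParserSetup can be
// inspected before a full parse is run.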
protected static class CustomInspectDataOut extends Iced implements DataOut {
public int _nlines;
public int _ncols;
public int _invalidLines;
public boolean _header;
private String [] _colNames;
private String [][] _data = new String[MAX_PREVIEW_LINES][MAX_PREVIEW_COLS];
transient ArrayList<String> _errors = new ArrayList<String>(); // initialized so invalidLine() can always append
public CustomInspectDataOut() {
for(int i = 0; i < MAX_PREVIEW_LINES;++i)
Arrays.fill(_data[i],"NA");
}
public String [][] data(){
String [][] res = Arrays.copyOf(_data, Math.min(MAX_PREVIEW_LINES, _nlines));
for(int i = 0; i < res.length; ++i)
res[i] = Arrays.copyOf(_data[i], Math.min(MAX_PREVIEW_COLS,_ncols));
return (_data = res);
}
@Override public void setColumnNames(String[] names) {
_colNames = names;
_data[0] = names;
++_nlines;
_ncols = names.length;
_header = true;
}
@Override public void newLine() {
++_nlines;
}
@Override public boolean isString(int colIdx) {return false;}
@Override public void addNumCol(int colIdx, long number, int exp) {
if(colIdx < _ncols && _nlines < MAX_PREVIEW_LINES)
_data[_nlines][colIdx] = Double.toString(number*PrettyPrint.pow10(exp));
}
@Override public void addNumCol(int colIdx, double d) {
if(colIdx < _ncols) {
_ncols = Math.max(_ncols, colIdx);
if (_nlines < MAX_PREVIEW_LINES && colIdx < MAX_PREVIEW_COLS)
_data[_nlines][colIdx] = Double.toString(d);
}
}
@Override public void addInvalidCol(int colIdx) {
if(colIdx < _ncols && _nlines < MAX_PREVIEW_LINES)
_data[_nlines][colIdx] = "NA";
}
@Override public void addStrCol(int colIdx, ValueString str) {
if(colIdx < _ncols && _nlines < MAX_PREVIEW_LINES)
_data[_nlines][colIdx] = str.toString();
}
@Override public void rollbackLine() {--_nlines;}
@Override public void invalidLine(String err) {
++_invalidLines;
_errors.add("Error at line: " + _nlines + ", reason: " + err);
}
@Override public void invalidValue(int linenum, int colnum) {}
}
}