package water.parser; import water.*; import water.api.schemas3.ParseSetupV3; import water.exceptions.H2OIllegalArgumentException; import water.fvec.*; import water.util.ArrayUtils; import water.util.FileUtils; import water.util.Log; import java.io.BufferedReader; import java.io.IOException; import java.io.StringReader; import java.lang.reflect.Field; import java.util.Arrays; import java.util.HashSet; import static water.parser.DefaultParserProviders.*; /** * A generic configuration and base guesser for a parser. */ public class ParseSetup extends Iced { public static final byte GUESS_SEP = -1; public static final int NO_HEADER = -1; public static final int GUESS_HEADER = 0; public static final int HAS_HEADER = 1; public static final int GUESS_COL_CNT = -1; ParserInfo _parse_type; // CSV, XLS, XSLX, SVMLight, Auto, ARFF, ORC byte _separator; // Field separator, usually comma ',' or TAB or space ' ' // Whether or not single-quotes quote a field. E.g. how do we parse: // raw data: 123,'Mally,456,O'Mally // singleQuotes==True ==> 2 columns: 123 and Mally,456,OMally // singleQuotes==False ==> 4 columns: 123 and 'Mally and 456 and O'Mally boolean _single_quotes; int _check_header; // 1st row: 0: guess, +1 header, -1 data int _number_columns; // Columns to parse String[] _column_names; byte[] _column_types; // Column types String[][] _domains; // Domains for each column (null if numeric) String[][] _na_strings; // Strings for NA in a given column String[][] _data; // First few rows of parsed/tokenized data String [] _fileNames = new String[]{"unknown"}; public void setFileName(String name) {_fileNames[0] = name;} public ParseWriter.ParseErr[] _errs; public int _chunk_size = FileVec.DFLT_CHUNK_SIZE; // Optimal chunk size to be used store values PreviewParseWriter _column_previews = null; public ParseSetup(ParseSetup ps) { this(ps._parse_type, ps._separator, ps._single_quotes, ps._check_header, ps._number_columns, ps._column_names, ps._column_types, ps._domains, ps._na_strings, ps._data, new ParseWriter.ParseErr[0], ps._chunk_size); } public static ParseSetup makeSVMLightSetup(){ return new ParseSetup(SVMLight_INFO, ParseSetup.GUESS_SEP, false,ParseSetup.NO_HEADER,1,null,new byte[]{Vec.T_NUM},null,null,null, new ParseWriter.ParseErr[0]); } // This method was called during guess setup, lot of things are null, like ctypes. // when it is called again, it either contains the guess column types or it will have user defined column types public ParseSetup(ParserInfo parse_type, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, String[][] domains, String[][] naStrings, String[][] data, ParseWriter.ParseErr[] errs, int chunkSize) { _parse_type = parse_type; _separator = sep; _single_quotes = singleQuotes; _check_header = checkHeader; _number_columns = ncols; _column_names = columnNames; _column_types = ctypes; _domains = domains; _na_strings = naStrings; _data = data; _chunk_size = chunkSize; _errs = errs; } /** * Create a ParseSetup with parameters from the client. * * Typically used to guide sampling in the data * to verify chosen settings, and fill in missing settings. * * @param ps Parse setup settings from client */ public ParseSetup(ParseSetupV3 ps) { this(ps.parse_type != null ? ParserService.INSTANCE.getByName(ps.parse_type).info() : GUESS_INFO, ps.separator != 0 ? ps.separator : GUESS_SEP, ps.single_quotes, ps.check_header, GUESS_COL_CNT, ps.column_names, strToColumnTypes(ps.column_types), null, ps.na_strings, null, new ParseWriter.ParseErr[0], ps.chunk_size); } /** * Create a ParseSetup with all parameters except chunk size. * * Typically used by file type parsers for returning final valid results * _chunk_size will be set later using results from all files. */ public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, String[][] domains, String[][] naStrings, String[][] data){ this(parseType, sep, singleQuotes, checkHeader, ncols, columnNames, ctypes, domains, naStrings, data, new ParseWriter.ParseErr[0], FileVec.DFLT_CHUNK_SIZE); } public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, String[][] domains, String[][] naStrings, String[][] data, ParseWriter.ParseErr[] errs) { this(parseType, sep, singleQuotes, checkHeader, ncols, columnNames, ctypes, domains, naStrings, data, errs, FileVec.DFLT_CHUNK_SIZE); } /** * Create a ParseSetup without any column information * * Typically used by file type parsers for returning final invalid results */ public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[][] data, ParseWriter.ParseErr[] errs) { this(parseType, sep, singleQuotes, checkHeader, ncols, null, null, null, null, data, errs, FileVec.DFLT_CHUNK_SIZE); } /** * Create a default ParseSetup * * Used by Ray's schema magic */ public ParseSetup() {} public String[] getColumnNames() { return _column_names; } public String[][] getData() { return _data; } public String[] getColumnTypeStrings() { String[] types = new String[_column_types.length]; for(int i=0; i< types.length; i++) types[i] = Vec.TYPE_STR[_column_types[i]]; return types; } public byte[] getColumnTypes() { return _column_types; } public static byte[] strToColumnTypes(String[] strs) { if (strs == null) return null; byte[] types = new byte[strs.length]; for(int i=0; i< types.length;i++) { switch (strs[i].toLowerCase()) { case "unknown": types[i] = Vec.T_BAD; break; case "uuid": types[i] = Vec.T_UUID; break; case "string": types[i] = Vec.T_STR; break; case "float": case "real": case "double": case "int": case "numeric": types[i] = Vec.T_NUM; break; case "categorical": case "factor": case "enum": types[i] = Vec.T_CAT; break; case "time": types[i] = Vec.T_TIME; break; default: types[i] = Vec.T_BAD; throw new H2OIllegalArgumentException("Provided column type "+ strs[i] + " is unknown. Cannot proceed with parse due to invalid argument."); } } return types; } /** This is a single entry-point to create a parser. * * Should be override in subclasses. */ protected Parser parser(Key jobKey) { ParserProvider pp = ParserService.INSTANCE.getByInfo(_parse_type); if (pp != null) { return pp.createParser(this, jobKey); } throw new H2OIllegalArgumentException("Unknown file type. Parse cannot be completed.", "Attempted to invoke a parser for ParseType:" + _parse_type + ", which doesn't exist."); } /** Return create a final parser-specific setup * for this configuration. * * @param inputKeys inputs * @param demandedSetup setup demanded by a user * * @return a parser specific setup based on demanded setup */ public final ParseSetup getFinalSetup(Key[] inputKeys, ParseSetup demandedSetup) { ParserProvider pp = ParserService.INSTANCE.getByInfo(_parse_type); if (pp != null) { return pp.createParserSetup(inputKeys, demandedSetup); } throw new H2OIllegalArgumentException("Unknown parser configuration! Configuration=" + this); } // Set of duplicated column names HashSet<String> checkDupColumnNames() { HashSet<String> conflictingNames = new HashSet<>(); if( null==_column_names ) return conflictingNames; HashSet<String> uniqueNames = new HashSet<>(); for( String n : _column_names) if( !uniqueNames.add(n) ) conflictingNames.add(n); return conflictingNames; } @Override public String toString() { return _parse_type.toString(); } static boolean allStrings(String [] line){ BufferedString str = new BufferedString(); for( String s : line ) { try { Double.parseDouble(s); return false; // Number in 1st row guesses: No Column Header } catch (NumberFormatException e) { /*Pass - determining if number is possible*/ } str.set(s); if(ParseTime.isTime(str)) return false; if(ParseUUID.isUUID(str)) return false; } return true; } // simple heuristic to determine if we have headers: // return true iff the first line is all strings and second line has at least one number static boolean hasHeader(String[] l1, String[] l2) { return allStrings(l1) && !allStrings(l2); } /** * Used by test harnesses for simple parsing of test data. Presumes * auto-detection for file and separator types. * * @param fkeys Keys to input vectors to be parsed * @param singleQuote single quotes quote fields * @param checkHeader check for a header * @return ParseSetup settings from looking at all files */ public static ParseSetup guessSetup(Key[] fkeys, boolean singleQuote, int checkHeader) { return guessSetup(fkeys, new ParseSetup(GUESS_INFO, GUESS_SEP, singleQuote, checkHeader, GUESS_COL_CNT, null, new ParseWriter.ParseErr[0])); } /** * Discover the parse setup needed to correctly parse all files. * This takes a ParseSetup as guidance. Each file is examined * individually and then results merged. If a conflict exists * between any results all files are re-examined using the * best guess from the first examination. * * @param fkeys Keys to input vectors to be parsed * @param userSetup Setup guidance from user * @return ParseSetup settings from looking at all files */ public static ParseSetup guessSetup( Key[] fkeys, ParseSetup userSetup ) { //Guess setup of each file and collect results GuessSetupTsk t = new GuessSetupTsk(userSetup); t.doAll(fkeys).getResult(); //Calc chunk-size // FIXME: should be a parser specific - or at least parser should be able to override defaults Iced ice = DKV.getGet(fkeys[0]); if (ice instanceof Frame && ((Frame) ice).vec(0) instanceof UploadFileVec) { t._gblSetup._chunk_size = FileVec.DFLT_CHUNK_SIZE; } else { t._gblSetup._chunk_size = FileVec.calcOptimalChunkSize(t._totalParseSize, t._gblSetup._number_columns, t._maxLineLength, Runtime.getRuntime().availableProcessors(), H2O.getCloudSize(), false /*use new heuristic*/, true); } return t._gblSetup; } /** * Try to determine the ParseSetup on a file by file basis * and merge results. */ public static class GuessSetupTsk extends MRTask<GuessSetupTsk> { // Input final ParseSetup _userSetup; boolean _empty = true; // Output public ParseSetup _gblSetup; public long _totalParseSize; public long _maxLineLength; String _file; /** * * @param userSetup ParseSetup to guide examination of files */ public GuessSetupTsk(ParseSetup userSetup) { _userSetup = userSetup; } /** * Runs once on each file to guess that file's ParseSetup * * For ByteVecs, UploadFileVecs, compressed files and small files, * the ParseSetup is guessed from a single DFLT_CHUNK_SIZE chunk from * the start of the file. This is because UploadFileVecs and compressed * files don't allow random sampling, small files don't need it, and * ByteVecs tend to be small. * * For larger NSFFileVecs and HDFSFileVecs 1M samples are taken at the * beginning of every 100M, and an additional sample is taken from the * last chunk of the file. The results of these samples are merged * together (and compared for consistency). * * Sampling more than the first bytes is preferred, since large data sets * with sorted columns may have all the same value in their first bytes, * making for poor type guesses. * */ @Override public void map(Key key) { _file = key.toString(); Iced ice = DKV.getGet(key); if(ice == null) throw new H2OIllegalArgumentException("Missing data","Did not find any data under key " + key); ByteVec bv = (ByteVec)(ice instanceof ByteVec ? ice : ((Frame)ice).vecs()[0]); byte [] bits = ZipUtil.getFirstUnzippedBytes(bv); // The bits can be null if (bits != null && bits.length > 0) { _empty = false; // get file size // float decompRatio = ZipUtil.decompressionRatio(bv); // if (decompRatio > 1.0) // _totalParseSize += bv.length() * decompRatio; // estimate file size // else // avoid numerical distortion of file size when not compressed // since later calculation of chunk size and later number of chunks do not consider the // compression ratio, we should not do that here either. Quick fix proposed by Tomas. Sleek! _totalParseSize += bv.length(); // Check for supported encodings checkEncoding(bits); // Compute the max line length (to help estimate the number of bytes to read per Parse map) _maxLineLength = maxLineLength(bits); if (_maxLineLength==-1) throw new H2OIllegalArgumentException("The first 4MB of the data don't contain any line breaks. Cannot parse."); // only preview 1 DFLT_CHUNK_SIZE for ByteVecs, UploadFileVecs, compressed, and small files /* if (ice instanceof ByteVec || ((Frame)ice).vecs()[0] instanceof UploadFileVec || bv.length() <= FileVec.DFLT_CHUNK_SIZE || decompRatio > 1.0) { */ try { _gblSetup = guessSetup(bv, bits, _userSetup); for(ParseWriter.ParseErr e:_gblSetup._errs) { e._byteOffset += e._cidx*Parser.StreamData.bufSz; e._cidx = 0; e._file = _file; } } catch (ParseDataset.H2OParseException pse) { throw pse.resetMsg(pse.getMessage()+" for "+key); } /* } else { // file is aun uncompressed NFSFileVec or HDFSFileVec & larger than the DFLT_CHUNK_SIZE FileVec fv = (FileVec) ((Frame) ice).vecs()[0]; // reset chunk size to 1M (uncompressed) int chkSize = (int) ((1<<20) /decompRatio); fv.setChunkSize((Frame) ice, chkSize); // guessSetup from first chunk _gblSetup = guessSetup(fv.getPreviewChunkBytes(0), _userSetup); _userSetup._check_header = -1; // remaining chunks shouldn't check for header _userSetup._parse_type = _gblSetup._parse_type; // or guess parse type //preview 1M data every 100M int numChunks = fv.nChunks(); for (int i=100; i < numChunks;i += 100) { bits = fv.getPreviewChunkBytes(i); if (bits != null) _gblSetup = mergeSetups(_gblSetup, guessSetup(bits, _userSetup)); } // grab sample at end of file (if not done by prev loop) if (numChunks % 100 > 1){ bits = fv.getPreviewChunkBytes(numChunks - 1); if (bits != null) _gblSetup = mergeSetups(_gblSetup, guessSetup(bits, _userSetup)); } // return chunk size to DFLT fv.setChunkSize((Frame) ice, FileVec.DFLT_CHUNK_SIZE); } */ // report if multiple files exist in zip archive /* if (ZipUtil.getFileCount(bv) > 1) { if (_gblSetup._errors != null) _gblSetup._errors = Arrays.copyOf(_gblSetup._errors, _gblSetup._errors.length + 1); else _gblSetup._errors = new String[1]; _gblSetup._errors[_gblSetup._errors.length - 1] = "Only single file zip " + "archives are currently supported, only the first file has been parsed. " + "Remaining files have been ignored."; }*/ } if (_gblSetup==null) throw new RuntimeException("This H2O node couldn't find the file(s) to parse. Please check files and/or working directories."); _gblSetup.setFileName(FileUtils.keyToFileName(key)); } /** * Merges ParseSetup results, conflicts, and errors from several files */ @Override public void reduce(GuessSetupTsk other) { if (other._empty) return; if (_gblSetup == null) { _empty = false; _gblSetup = other._gblSetup; assert (_gblSetup != null); return; } _gblSetup = mergeSetups(_gblSetup, other._gblSetup, _file, other._file); _totalParseSize += other._totalParseSize; _maxLineLength = Math.max(_maxLineLength, other._maxLineLength); } @Override public void postGlobal() { if (_gblSetup._column_previews != null && !_gblSetup._parse_type.equals(ARFF_INFO)) { _gblSetup._column_types = _gblSetup._column_previews.guessTypes(); if (_userSetup._na_strings == null) _gblSetup._na_strings = _gblSetup._column_previews.guessNAStrings(_gblSetup._column_types); else _gblSetup._na_strings = _userSetup._na_strings; } // if(_gblSetup._errs != null) for(ParseWriter.ParseErr err:_gblSetup._errs) Log.warn("ParseSetup: " + err.toString()); } private ParseSetup mergeSetups(ParseSetup setupA, ParseSetup setupB, String fileA, String fileB) { // FIXME: have a merge function defined on a specific parser setup (each parser setup is responsible for merge) if (setupA == null) return setupB; if(setupA._parse_type.equals(DefaultParserProviders.SVMLight_INFO) && setupB._parse_type.equals(DefaultParserProviders.SVMLight_INFO)){ // no merging for svm light, all columns are numeric and we take the max of number of columns (it's an estimate anyways) return setupA._number_columns >= setupB._number_columns?setupA:setupB; } ParseSetup mergedSetup = setupA; mergedSetup._check_header = unifyCheckHeader(setupA._check_header, setupB._check_header); mergedSetup._separator = unifyColumnSeparators(setupA._separator, setupB._separator); if (setupA._parse_type.equals(ARFF_INFO) && setupB._parse_type.equals(CSV_INFO)) ;// do nothing parse_type and col_types are already set correctly else if (setupA._parse_type.equals(CSV_INFO) && setupB._parse_type.equals(ARFF_INFO)) { mergedSetup._parse_type = ARFF_INFO; mergedSetup._column_types = setupB._column_types; } else if (setupA.isCompatible(setupB)) { mergedSetup._column_previews = PreviewParseWriter.unifyColumnPreviews(setupA._column_previews, setupB._column_previews); } else throw new ParseDataset.H2OParseException("File type mismatch. Cannot parse files " + setupA.file() + " and " + setupB.file() + " of type " + setupA._parse_type.name() + " and " + setupB._parse_type.name() + " as one dataset."); mergedSetup._column_names = unifyColumnNames(setupA._column_names, setupB._column_names); mergedSetup._number_columns = mergedSetup._parse_type.equals(CSV_INFO) ? Math.max(setupA._number_columns,setupB._number_columns):unifyColumnCount(setupA._number_columns, setupB._number_columns,mergedSetup, fileA, fileB); if (mergedSetup._data.length < PreviewParseWriter.MAX_PREVIEW_LINES) { int n = mergedSetup._data.length; int m = Math.min(PreviewParseWriter.MAX_PREVIEW_LINES, n + setupB._data.length - 1); mergedSetup._data = Arrays.copyOf(mergedSetup._data, m); System.arraycopy(setupB._data, 1, mergedSetup._data, n, m - n); } mergedSetup._errs = ArrayUtils.append(setupA._errs,setupB._errs); mergedSetup._fileNames = ArrayUtils.append(setupA._fileNames,setupB._fileNames); if(mergedSetup._errs.length > 20) mergedSetup._errs = Arrays.copyOf(mergedSetup._errs,20); return mergedSetup; } private static int unifyCheckHeader(int chkHdrA, int chkHdrB){ if (chkHdrA == GUESS_HEADER || chkHdrB == GUESS_HEADER) throw new ParseDataset.H2OParseException("Unable to determine header on a file. Not expected."); if (chkHdrA == HAS_HEADER || chkHdrB == HAS_HEADER) return HAS_HEADER; else return NO_HEADER; } private static byte unifyColumnSeparators(byte sepA, byte sepB) { if( sepA == sepB) return sepA; else if (sepA == GUESS_SEP) return sepB; else if (sepB == GUESS_SEP) return sepA; // TODO: Point out which file is problem throw new ParseDataset.H2OParseException("Column separator mismatch. One file seems to use \"" + (char) sepA + "\" and the other uses \"" + (char) sepB + "\"."); } private int unifyColumnCount(int cntA, int cntB, ParseSetup mergedSetup, String fileA, String fileB) { if (cntA == cntB) return cntA; else if (cntA == 0) return cntB; else if (cntB == 0) return cntA; else { // files contain different numbers of columns ParseWriter.ParseErr err = new ParseWriter.ParseErr(); err._err = "Incompatible number of columns, " + cntA + " != " + cntB; err._file = fileA + ", " + fileB; mergedSetup._errs = ArrayUtils.append(mergedSetup._errs,err); return Math.max(cntA,cntB); } } private static String[] unifyColumnNames(String[] namesA, String[] namesB){ if (namesA == null) return namesB; else if (namesB == null) return namesA; else { for (int i = 0; i < namesA.length; i++) { if (i > namesB.length || !namesA[i].equals(namesB[i])) { // TODO improvement: if files match except for blanks, merge? throw new ParseDataset.H2OParseException("Column names do not match between files."); } } return namesA; } } } private String file() { String [] names = _fileNames; if(names.length > 5) names = Arrays.copyOf(names,5); return Arrays.toString(names); } protected boolean isCompatible(ParseSetup setupB) { return _parse_type.equals(setupB._parse_type) && _parse_type.equals(DefaultParserProviders.SVMLight_INFO) || _number_columns == setupB._number_columns; } /** * Guess everything from a single pile-o-bits. Used in tests, or in initial * parser inspections when the user has not told us anything about separators * or headers. * * @param bits Initial bytes from a parse source * @return ParseSetup settings from looking at all files */ public static ParseSetup guessSetup( ByteVec bv, byte [] bits, ParseSetup userSetup ) { return guessSetup(bv, bits, userSetup._parse_type, userSetup._separator, GUESS_COL_CNT, userSetup._single_quotes, userSetup._check_header, userSetup._column_names, userSetup._column_types, null, null); } public static ParseSetup guessSetup(ByteVec bv, byte [] bits, ParserInfo parserType, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings ) { ParserProvider pp = ParserService.INSTANCE.getByInfo(parserType); if (pp != null) { return pp.guessSetup(bv, bits, sep, ncols, singleQuotes, checkHeader, columnNames, columnTypes, domains, naStrings); } throw new ParseDataset.H2OParseException("Cannot determine file type."); } /** * Cleans up the file name to make .hex name * to be used as a destination key. Eliminates * common file extensions, and replaces odd * characters. * * @param n filename to be cleaned * @return cleaned name */ public static String createHexName(String n) { // blahblahblah/myName.ext ==> myName // blahblahblah/myName.csv.ext ==> myName int sep = n.lastIndexOf(java.io.File.separatorChar); if( sep > 0 ) n = n.substring(sep+1); int dot = n.lastIndexOf('.'); while ( dot > 0 && (n.endsWith("zip") || n.endsWith("gz") || n.endsWith("csv") || n.endsWith("xls") || n.endsWith("txt") || n.endsWith("svm") || n.endsWith("orc") || n.endsWith("arff"))) { n = n.substring(0, dot); dot = n.lastIndexOf('.'); } // "2012_somedata" ==> "X2012_somedata" if( !Character.isJavaIdentifierStart(n.charAt(0)) ) n = "X"+n; // "human%Percent" ==> "human_Percent" char[] cs = n.toCharArray(); for( int i=1; i<cs.length; i++ ) if( !Character.isJavaIdentifierPart(cs[i]) ) cs[i] = '_'; // "myName" ==> "myName.hex" n = new String(cs); int i = 0; String res = n + ".hex"; Key k = Key.make(res); // Renumber to handle dup names while(DKV.get(k) != null) k = Key.make(res = n + ++i + ".hex"); return res; } /** * Reject unsupported encodings * * For the curious, this is hardly a complete test, it only catches the * most polite UTF-16 cases. Switch to jChardet or guessEncoding libraries * for more robust solutions. WARNING: not all UTF-16 files * use BOM to indicate their encoding. Even worse, some datasets may be * made from disparate sources, and could used a mix that wouldn't be * detected by this. * * @param bits data to be examined for encoding */ private static final void checkEncoding(byte[] bits) { if (bits.length >= 2) { if ((bits[0] == (byte) 0xff && bits[1] == (byte) 0xfe) /* UTF-16, little endian */ || (bits[0] == (byte) 0xfe && bits[1] == (byte) 0xff) /* UTF-16, big endian */) { throw new ParseDataset.H2OParseException("UTF16 encoding detected, but is not supported."); } } } /** * Compute the longest line length in an array of bytes * @param bytes Array of bytes (containing 0 or more newlines) * @return The longest line length in the given bytes */ private static final long maxLineLength(byte[] bytes) { if (bytes.length >= 2) { String st = new String(bytes); StringReader sr = new StringReader(st); BufferedReader br = new BufferedReader(sr); String line; long maxLineLength=0; try { while(true) { line = br.readLine(); if (line == null) break; maxLineLength = Math.max(line.length(), maxLineLength); } } catch (IOException e) { return -1; } return maxLineLength; } return -1; } /** * Copies the common setup to another object (that is possibly and extension of the base setup). * Note: this method only copies fields directly declared in ParseSetup class, it doesn't handle * fields that are declared in classes derived from ParseSetup. * @param setup target setup object * @param <T> class derived from ParseSetup * @return the target setup object (for convenience) */ public <T extends ParseSetup> T copyTo(T setup) { try { for (Field field : ParseSetup.class.getDeclaredFields()) { if (! java.lang.reflect.Modifier.isStatic(field.getModifiers())) field.set(setup, field.get(this)); } return setup; } catch (IllegalAccessException e) { throw new RuntimeException(e); } } public ParserInfo getParseType() { return _parse_type; } public ParseSetup setParseType(ParserInfo parse_type) { this._parse_type = parse_type; return this; } public ParseSetup setSeparator(byte separator) { this._separator = separator; return this; } public ParseSetup setSingleQuotes(boolean single_quotes) { this._single_quotes = single_quotes; return this; } public ParseSetup setCheckHeader(int check_header) { this._check_header = check_header; return this; } public ParseSetup setNumberColumns(int number_columns) { this._number_columns = number_columns; return this; } public ParseSetup setColumnNames(String[] column_names) { this._column_names = column_names; return this; } public ParseSetup setColumnTypes(byte[] column_types) { this._column_types = column_types; return this; } public ParseSetup setDomains(String[][] domains) { this._domains = domains; return this; } public ParseSetup setNAStrings(String[][] na_strings) { this._na_strings = na_strings; return this; } public ParseSetup setChunkSize(int chunk_size) { this._chunk_size = chunk_size; return this; } } // ParseSetup state class