package water.parser;
import water.*;
import water.api.schemas3.ParseSetupV3;
import water.exceptions.H2OIllegalArgumentException;
import water.fvec.*;
import water.util.ArrayUtils;
import water.util.FileUtils;
import water.util.Log;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.Field;
import java.util.Arrays;
import java.util.HashSet;
import static water.parser.DefaultParserProviders.*;
/**
* A generic configuration and base guesser for a parser.
*/
public class ParseSetup extends Iced {
  // Sentinel values used before the corresponding setting has been determined.
  public static final byte GUESS_SEP = -1;
  public static final int NO_HEADER = -1;
  public static final int GUESS_HEADER = 0;
  public static final int HAS_HEADER = 1;
  public static final int GUESS_COL_CNT = -1;

  ParserInfo _parse_type;     // CSV, XLS, XSLX, SVMLight, Auto, ARFF, ORC
  byte _separator;            // Field separator, usually comma ',' or TAB or space ' '
  // Whether or not single-quotes quote a field.  E.g. how do we parse:
  // raw data:  123,'Mally,456,O'Mally
  // singleQuotes==True  ==> 2 columns: 123 and Mally,456,OMally
  // singleQuotes==False ==> 4 columns: 123 and 'Mally and 456 and O'Mally
  boolean _single_quotes;
  int _check_header;          // 1st row: 0: guess, +1 header, -1 data
  int _number_columns;        // Columns to parse
  String[] _column_names;     // Column names (may be null when no header is known)
  byte[] _column_types;       // Column types (codes indexing into Vec.TYPE_STR)
  String[][] _domains;        // Domains for each column (null if numeric)
  String[][] _na_strings;     // Strings for NA in a given column
  String[][] _data;           // First few rows of parsed/tokenized data (preview)
  String [] _fileNames = new String[]{"unknown"}; // Source file names merged into this setup

  // Overwrites the first recorded file name (merged setups append further names).
  public void setFileName(String name) {_fileNames[0] = name;}

  public ParseWriter.ParseErr[] _errs;              // Parse errors accumulated while guessing
  public int _chunk_size = FileVec.DFLT_CHUNK_SIZE; // Optimal chunk size to be used store values
  PreviewParseWriter _column_previews = null;       // Per-column previews used to guess types/NAs
  /**
   * Copy constructor: copies all settings from {@code ps} except the accumulated
   * parse errors, which are reset to an empty array.
   * NOTE(review): dropping {@code ps._errs} appears deliberate (a fresh setup starts
   * with a clean error list) — confirm before relying on errors surviving a copy.
   */
  public ParseSetup(ParseSetup ps) {
    this(ps._parse_type,
        ps._separator, ps._single_quotes, ps._check_header, ps._number_columns,
        ps._column_names, ps._column_types, ps._domains, ps._na_strings, ps._data,
        new ParseWriter.ParseErr[0], ps._chunk_size);
  }

  /** Canned setup for SVMLight data: no header, one numeric column, everything else guessed. */
  public static ParseSetup makeSVMLightSetup(){
    return new ParseSetup(SVMLight_INFO, ParseSetup.GUESS_SEP,
        false,ParseSetup.NO_HEADER,1,null,new byte[]{Vec.T_NUM},null,null,null, new ParseWriter.ParseErr[0]);
  }
  // This method was called during guess setup, lot of things are null, like ctypes.
  // when it is called again, it either contains the guess column types or it will have user defined column types
  /**
   * Full constructor: sets every setup field explicitly.
   *
   * @param parse_type  parser family (CSV, ARFF, SVMLight, ...)
   * @param sep         field separator byte, or {@link #GUESS_SEP}
   * @param singleQuotes whether single quotes delimit fields
   * @param checkHeader  {@link #GUESS_HEADER}, {@link #HAS_HEADER} or {@link #NO_HEADER}
   * @param ncols        number of columns, or {@link #GUESS_COL_CNT}
   * @param columnNames  column names (may be null during guessing)
   * @param ctypes       column type codes (may be null during guessing)
   * @param domains      categorical domains per column (null for numeric)
   * @param naStrings    per-column NA markers
   * @param data         preview rows
   * @param errs         accumulated parse errors
   * @param chunkSize    chunk size used to store parsed values
   */
  public ParseSetup(ParserInfo parse_type, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[] columnNames, byte[] ctypes, String[][] domains, String[][] naStrings, String[][] data, ParseWriter.ParseErr[] errs, int chunkSize) {
    _parse_type = parse_type;
    _separator = sep;
    _single_quotes = singleQuotes;
    _check_header = checkHeader;
    _number_columns = ncols;
    _column_names = columnNames;
    _column_types = ctypes;
    _domains = domains;
    _na_strings = naStrings;
    _data = data;
    _chunk_size = chunkSize;
    _errs = errs;
  }
  /**
   * Create a ParseSetup with parameters from the client.
   *
   * Typically used to guide sampling in the data
   * to verify chosen settings, and fill in missing settings.
   *
   * Unset client values fall back to the GUESS_* sentinels; the column count
   * is always re-guessed ({@link #GUESS_COL_CNT}).
   *
   * @param ps Parse setup settings from client
   */
  public ParseSetup(ParseSetupV3 ps) {
    this(ps.parse_type != null ? ParserService.INSTANCE.getByName(ps.parse_type).info() : GUESS_INFO,
        ps.separator != 0 ? ps.separator : GUESS_SEP,
        ps.single_quotes,
        ps.check_header,
        GUESS_COL_CNT,
        ps.column_names, strToColumnTypes(ps.column_types),
        null, ps.na_strings,
        null,
        new ParseWriter.ParseErr[0],
        ps.chunk_size);
  }
  /**
   * Create a ParseSetup with all parameters except chunk size.
   *
   * Typically used by file type parsers for returning final valid results
   * _chunk_size will be set later using results from all files.
   */
  public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int checkHeader,
                    int ncols, String[] columnNames, byte[] ctypes,
                    String[][] domains, String[][] naStrings, String[][] data){
    this(parseType, sep, singleQuotes, checkHeader, ncols, columnNames, ctypes,
        domains, naStrings, data, new ParseWriter.ParseErr[0], FileVec.DFLT_CHUNK_SIZE);
  }

  /** Same as above, but carries pre-existing parse errors along. */
  public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int checkHeader,
                    int ncols, String[] columnNames, byte[] ctypes,
                    String[][] domains, String[][] naStrings, String[][] data, ParseWriter.ParseErr[] errs) {
    this(parseType, sep, singleQuotes, checkHeader, ncols, columnNames, ctypes,
        domains, naStrings, data, errs, FileVec.DFLT_CHUNK_SIZE);
  }

  /**
   * Create a ParseSetup without any column information
   *
   * Typically used by file type parsers for returning final invalid results
   */
  public ParseSetup(ParserInfo parseType, byte sep, boolean singleQuotes, int checkHeader, int ncols, String[][] data, ParseWriter.ParseErr[] errs) {
    this(parseType, sep, singleQuotes, checkHeader, ncols, null, null, null, null, data, errs, FileVec.DFLT_CHUNK_SIZE);
  }

  /**
   * Create a default ParseSetup
   *
   * Used by Ray's schema magic
   */
  public ParseSetup() {}
  public String[] getColumnNames() { return _column_names; }  // may be null when header unknown
  public String[][] getData() { return _data; }               // preview rows

  /** Translates the byte column-type codes into their human-readable names (via Vec.TYPE_STR). */
  public String[] getColumnTypeStrings() {
    String[] types = new String[_column_types.length];
    for(int i=0; i< types.length; i++)
      types[i] = Vec.TYPE_STR[_column_types[i]];
    return types;
  }

  public byte[] getColumnTypes() { return _column_types; }
public static byte[] strToColumnTypes(String[] strs) {
if (strs == null) return null;
byte[] types = new byte[strs.length];
for(int i=0; i< types.length;i++) {
switch (strs[i].toLowerCase()) {
case "unknown": types[i] = Vec.T_BAD; break;
case "uuid": types[i] = Vec.T_UUID; break;
case "string": types[i] = Vec.T_STR; break;
case "float":
case "real":
case "double":
case "int":
case "numeric": types[i] = Vec.T_NUM; break;
case "categorical":
case "factor":
case "enum": types[i] = Vec.T_CAT; break;
case "time": types[i] = Vec.T_TIME; break;
default: types[i] = Vec.T_BAD;
throw new H2OIllegalArgumentException("Provided column type "+ strs[i] + " is unknown. Cannot proceed with parse due to invalid argument.");
}
}
return types;
}
/** This is a single entry-point to create a parser.
*
* Should be override in subclasses. */
protected Parser parser(Key jobKey) {
ParserProvider pp = ParserService.INSTANCE.getByInfo(_parse_type);
if (pp != null) {
return pp.createParser(this, jobKey);
}
throw new H2OIllegalArgumentException("Unknown file type. Parse cannot be completed.",
"Attempted to invoke a parser for ParseType:" + _parse_type + ", which doesn't exist.");
}
/** Return create a final parser-specific setup
* for this configuration.
*
* @param inputKeys inputs
* @param demandedSetup setup demanded by a user
*
* @return a parser specific setup based on demanded setup
*/
public final ParseSetup getFinalSetup(Key[] inputKeys, ParseSetup demandedSetup) {
ParserProvider pp = ParserService.INSTANCE.getByInfo(_parse_type);
if (pp != null) {
return pp.createParserSetup(inputKeys, demandedSetup);
}
throw new H2OIllegalArgumentException("Unknown parser configuration! Configuration=" + this);
}
// Set of duplicated column names
HashSet<String> checkDupColumnNames() {
HashSet<String> conflictingNames = new HashSet<>();
if( null==_column_names ) return conflictingNames;
HashSet<String> uniqueNames = new HashSet<>();
for( String n : _column_names)
if( !uniqueNames.add(n) ) conflictingNames.add(n);
return conflictingNames;
}
  /** Human-readable description: just the parse type. */
  @Override public String toString() {
    return _parse_type.toString();
  }
static boolean allStrings(String [] line){
BufferedString str = new BufferedString();
for( String s : line ) {
try {
Double.parseDouble(s);
return false; // Number in 1st row guesses: No Column Header
} catch (NumberFormatException e) { /*Pass - determining if number is possible*/ }
str.set(s);
if(ParseTime.isTime(str)) return false;
if(ParseUUID.isUUID(str)) return false;
}
return true;
}
  // simple heuristic to determine if we have headers:
  // return true iff the first line is all strings and second line has at least one number
  static boolean hasHeader(String[] l1, String[] l2) {
    return allStrings(l1) && !allStrings(l2);
  }
  /**
   * Used by test harnesses for simple parsing of test data. Presumes
   * auto-detection for file and separator types.
   *
   * @param fkeys Keys to input vectors to be parsed
   * @param singleQuote single quotes quote fields
   * @param checkHeader check for a header
   * @return ParseSetup settings from looking at all files
   */
  public static ParseSetup guessSetup(Key[] fkeys, boolean singleQuote, int checkHeader) {
    // Everything except quoting/header behavior is guessed from the data.
    return guessSetup(fkeys, new ParseSetup(GUESS_INFO, GUESS_SEP, singleQuote, checkHeader, GUESS_COL_CNT, null, new ParseWriter.ParseErr[0]));
  }
  /**
   * Discover the parse setup needed to correctly parse all files.
   * This takes a ParseSetup as guidance.  Each file is examined
   * individually and then results merged.  If a conflict exists
   * between any results all files are re-examined using the
   * best guess from the first examination.
   *
   * @param fkeys Keys to input vectors to be parsed
   * @param userSetup Setup guidance from user
   * @return ParseSetup settings from looking at all files
   */
  public static ParseSetup guessSetup( Key[] fkeys, ParseSetup userSetup ) {
    //Guess setup of each file and collect results
    GuessSetupTsk t = new GuessSetupTsk(userSetup);
    t.doAll(fkeys).getResult();
    //Calc chunk-size
    // FIXME: should be a parser specific - or at least parser should be able to override defaults
    Iced ice = DKV.getGet(fkeys[0]);
    if (ice instanceof Frame && ((Frame) ice).vec(0) instanceof UploadFileVec) {
      // Uploaded files keep the default chunk size; they cannot be re-chunked.
      t._gblSetup._chunk_size = FileVec.DFLT_CHUNK_SIZE;
    } else {
      // Size chunks from total bytes, column count, and longest observed line.
      t._gblSetup._chunk_size = FileVec.calcOptimalChunkSize(t._totalParseSize, t._gblSetup._number_columns, t._maxLineLength,
          Runtime.getRuntime().availableProcessors(), H2O.getCloudSize(), false /*use new heuristic*/, true);
    }
    return t._gblSetup;
  }
  /**
   * Try to determine the ParseSetup on a file by file basis
   * and merge results.
   */
  public static class GuessSetupTsk extends MRTask<GuessSetupTsk> {
    // Input
    final ParseSetup _userSetup;  // user-provided guidance for the per-file guess
    boolean _empty = true;        // true until this task sees at least one non-empty file

    // Output
    public ParseSetup _gblSetup;  // merged setup over all files seen so far
    public long _totalParseSize;  // sum of input sizes; drives chunk-size calculation
    public long _maxLineLength;   // longest line observed across sampled files
    String _file;                 // last file examined (used in error reporting)

    /**
     *
     * @param userSetup ParseSetup to guide examination of files
     */
    public GuessSetupTsk(ParseSetup userSetup) {
      _userSetup = userSetup;
    }
    /**
     * Runs once on each file to guess that file's ParseSetup
     *
     * For ByteVecs, UploadFileVecs, compressed files and small files,
     * the ParseSetup is guessed from a single DFLT_CHUNK_SIZE chunk from
     * the start of the file.  This is because UploadFileVecs and compressed
     * files don't allow random sampling, small files don't need it, and
     * ByteVecs tend to be small.
     *
     * For larger NSFFileVecs and HDFSFileVecs 1M samples are taken at the
     * beginning of every 100M, and an additional sample is taken from the
     * last chunk of the file.  The results of these samples are merged
     * together (and compared for consistency).
     *
     * Sampling more than the first bytes is preferred, since large data sets
     * with sorted columns may have all the same value in their first bytes,
     * making for poor type guesses.
     *
     */
    @Override public void map(Key key) {
      _file = key.toString();
      Iced ice = DKV.getGet(key);
      if(ice == null) throw new H2OIllegalArgumentException("Missing data","Did not find any data under key " + key);
      // The input is stored either as a raw ByteVec or wrapped in a single-vec Frame.
      ByteVec bv = (ByteVec)(ice instanceof ByteVec ? ice : ((Frame)ice).vecs()[0]);
      byte [] bits = ZipUtil.getFirstUnzippedBytes(bv);
      // The bits can be null
      if (bits != null && bits.length > 0) {
        _empty = false;

        // get file size
        // float decompRatio = ZipUtil.decompressionRatio(bv);
        // if (decompRatio > 1.0)
        //   _totalParseSize += bv.length() * decompRatio; // estimate file size
        // else  // avoid numerical distortion of file size when not compressed
        // since later calculation of chunk size and later number of chunks do not consider the
        // compression ratio, we should not do that here either.  Quick fix proposed by Tomas.  Sleek!
        _totalParseSize += bv.length();

        // Check for supported encodings
        checkEncoding(bits);

        // Compute the max line length (to help estimate the number of bytes to read per Parse map)
        _maxLineLength = maxLineLength(bits);
        if (_maxLineLength==-1) throw new H2OIllegalArgumentException("The first 4MB of the data don't contain any line breaks. Cannot parse.");

        // only preview 1 DFLT_CHUNK_SIZE for ByteVecs, UploadFileVecs, compressed, and small files
        /* if (ice instanceof ByteVec
            || ((Frame)ice).vecs()[0] instanceof UploadFileVec
            || bv.length() <= FileVec.DFLT_CHUNK_SIZE
            || decompRatio > 1.0) { */
        try {
          // Guess this file's setup from its first (unzipped) chunk of bytes.
          _gblSetup = guessSetup(bv, bits, _userSetup);
          // Rebase error positions from per-chunk coordinates to absolute file offsets.
          for(ParseWriter.ParseErr e:_gblSetup._errs) {
            e._byteOffset += e._cidx*Parser.StreamData.bufSz;
            e._cidx = 0;
            e._file = _file;
          }
        } catch (ParseDataset.H2OParseException pse) {
          // Re-throw with the offending key appended to the message.
          throw pse.resetMsg(pse.getMessage()+" for "+key);
        }
        /* } else { // file is aun uncompressed NFSFileVec or HDFSFileVec & larger than the DFLT_CHUNK_SIZE
          FileVec fv = (FileVec) ((Frame) ice).vecs()[0];
          // reset chunk size to 1M (uncompressed)
          int chkSize = (int) ((1<<20) /decompRatio);
          fv.setChunkSize((Frame) ice, chkSize);

          // guessSetup from first chunk
          _gblSetup = guessSetup(fv.getPreviewChunkBytes(0), _userSetup);
          _userSetup._check_header = -1; // remaining chunks shouldn't check for header
          _userSetup._parse_type = _gblSetup._parse_type; // or guess parse type

          //preview 1M data every 100M
          int numChunks = fv.nChunks();
          for (int i=100; i < numChunks;i += 100) {
            bits = fv.getPreviewChunkBytes(i);
            if (bits != null)
              _gblSetup = mergeSetups(_gblSetup, guessSetup(bits, _userSetup));
          }

          // grab sample at end of file (if not done by prev loop)
          if (numChunks % 100 > 1){
            bits = fv.getPreviewChunkBytes(numChunks - 1);
            if (bits != null)
              _gblSetup = mergeSetups(_gblSetup, guessSetup(bits, _userSetup));
          }

          // return chunk size to DFLT
          fv.setChunkSize((Frame) ice, FileVec.DFLT_CHUNK_SIZE);
        } */

        // report if multiple files exist in zip archive
        /* if (ZipUtil.getFileCount(bv) > 1) {
          if (_gblSetup._errors != null)
            _gblSetup._errors = Arrays.copyOf(_gblSetup._errors, _gblSetup._errors.length + 1);
          else
            _gblSetup._errors = new String[1];

          _gblSetup._errors[_gblSetup._errors.length - 1] = "Only single file zip " +
              "archives are currently supported, only the first file has been parsed.  " +
              "Remaining files have been ignored.";
        }*/
      }
      if (_gblSetup==null)
        throw new RuntimeException("This H2O node couldn't find the file(s) to parse. Please check files and/or working directories.");
      _gblSetup.setFileName(FileUtils.keyToFileName(key));
    }
/**
* Merges ParseSetup results, conflicts, and errors from several files
*/
@Override
public void reduce(GuessSetupTsk other) {
if (other._empty) return;
if (_gblSetup == null) {
_empty = false;
_gblSetup = other._gblSetup;
assert (_gblSetup != null);
return;
}
_gblSetup = mergeSetups(_gblSetup, other._gblSetup, _file, other._file);
_totalParseSize += other._totalParseSize;
_maxLineLength = Math.max(_maxLineLength, other._maxLineLength);
}
    @Override public void postGlobal() {
      // Final pass after all files are merged: derive column types and NA strings from
      // the accumulated column previews.  ARFF carries explicit types, so it is skipped.
      if (_gblSetup._column_previews != null && !_gblSetup._parse_type.equals(ARFF_INFO)) {
        _gblSetup._column_types = _gblSetup._column_previews.guessTypes();
        if (_userSetup._na_strings == null)
          _gblSetup._na_strings = _gblSetup._column_previews.guessNAStrings(_gblSetup._column_types);
        else
          _gblSetup._na_strings = _userSetup._na_strings;
      }
      // if(_gblSetup._errs != null)
      // Surface every accumulated parse error into the node log.
      for(ParseWriter.ParseErr err:_gblSetup._errs)
        Log.warn("ParseSetup: " + err.toString());
    }
    /**
     * Merges two per-file setups into one; the merge happens IN-PLACE into setupA.
     * Throws when the files are fundamentally incompatible (separator, type, names).
     */
    private ParseSetup mergeSetups(ParseSetup setupA, ParseSetup setupB, String fileA, String fileB) {
      // FIXME: have a merge function defined on a specific parser setup (each parser setup is responsible for merge)
      if (setupA == null) return setupB;
      if(setupA._parse_type.equals(DefaultParserProviders.SVMLight_INFO) && setupB._parse_type.equals(DefaultParserProviders.SVMLight_INFO)){
        // no merging for svm light, all columns are numeric and we take the max of number of columns (it's an estimate anyways)
        return setupA._number_columns >= setupB._number_columns?setupA:setupB;
      }
      ParseSetup mergedSetup = setupA;
      mergedSetup._check_header = unifyCheckHeader(setupA._check_header, setupB._check_header);
      mergedSetup._separator = unifyColumnSeparators(setupA._separator, setupB._separator);
      if (setupA._parse_type.equals(ARFF_INFO) && setupB._parse_type.equals(CSV_INFO))
        ;// do nothing parse_type and col_types are already set correctly
      else if (setupA._parse_type.equals(CSV_INFO) && setupB._parse_type.equals(ARFF_INFO)) {
        // ARFF wins: it carries explicit column type information.
        mergedSetup._parse_type = ARFF_INFO;
        mergedSetup._column_types = setupB._column_types;
      } else if (setupA.isCompatible(setupB)) {
        mergedSetup._column_previews = PreviewParseWriter.unifyColumnPreviews(setupA._column_previews, setupB._column_previews);
      } else
        throw new ParseDataset.H2OParseException("File type mismatch. Cannot parse files " + setupA.file() + " and " + setupB.file() + " of type "
            + setupA._parse_type.name() + " and " + setupB._parse_type.name() + " as one dataset.");
      mergedSetup._column_names = unifyColumnNames(setupA._column_names, setupB._column_names);
      mergedSetup._number_columns = mergedSetup._parse_type.equals(CSV_INFO) ? Math.max(setupA._number_columns,setupB._number_columns):unifyColumnCount(setupA._number_columns, setupB._number_columns,mergedSetup, fileA, fileB);
      if (mergedSetup._data.length < PreviewParseWriter.MAX_PREVIEW_LINES) {
        // Append setupB's preview rows, skipping its first row (treated as header/duplicate).
        // NOTE(review): assumes setupB._data has at least one row — an empty preview would
        // make m < n and break the arraycopy below; TODO confirm callers guarantee this.
        int n = mergedSetup._data.length;
        int m = Math.min(PreviewParseWriter.MAX_PREVIEW_LINES, n + setupB._data.length - 1);
        mergedSetup._data = Arrays.copyOf(mergedSetup._data, m);
        System.arraycopy(setupB._data, 1, mergedSetup._data, n, m - n);
      }
      mergedSetup._errs = ArrayUtils.append(setupA._errs,setupB._errs);
      mergedSetup._fileNames = ArrayUtils.append(setupA._fileNames,setupB._fileNames);
      // Cap the accumulated error list so the merged setup stays small.
      if(mergedSetup._errs.length > 20)
        mergedSetup._errs = Arrays.copyOf(mergedSetup._errs,20);
      return mergedSetup;
    }
private static int unifyCheckHeader(int chkHdrA, int chkHdrB){
if (chkHdrA == GUESS_HEADER || chkHdrB == GUESS_HEADER)
throw new ParseDataset.H2OParseException("Unable to determine header on a file. Not expected.");
if (chkHdrA == HAS_HEADER || chkHdrB == HAS_HEADER) return HAS_HEADER;
else return NO_HEADER;
}
private static byte unifyColumnSeparators(byte sepA, byte sepB) {
if( sepA == sepB) return sepA;
else if (sepA == GUESS_SEP) return sepB;
else if (sepB == GUESS_SEP) return sepA;
// TODO: Point out which file is problem
throw new ParseDataset.H2OParseException("Column separator mismatch. One file seems to use \""
+ (char) sepA + "\" and the other uses \"" + (char) sepB + "\".");
}
private int unifyColumnCount(int cntA, int cntB, ParseSetup mergedSetup, String fileA, String fileB) {
if (cntA == cntB) return cntA;
else if (cntA == 0) return cntB;
else if (cntB == 0) return cntA;
else { // files contain different numbers of columns
ParseWriter.ParseErr err = new ParseWriter.ParseErr();
err._err = "Incompatible number of columns, " + cntA + " != " + cntB;
err._file = fileA + ", " + fileB;
mergedSetup._errs = ArrayUtils.append(mergedSetup._errs,err);
return Math.max(cntA,cntB);
}
}
private static String[] unifyColumnNames(String[] namesA, String[] namesB){
if (namesA == null) return namesB;
else if (namesB == null) return namesA;
else {
for (int i = 0; i < namesA.length; i++) {
if (i > namesB.length || !namesA[i].equals(namesB[i])) {
// TODO improvement: if files match except for blanks, merge?
throw new ParseDataset.H2OParseException("Column names do not match between files.");
}
}
return namesA;
}
}
}
private String file() {
String [] names = _fileNames;
if(names.length > 5)
names = Arrays.copyOf(names,5);
return Arrays.toString(names);
}
  /**
   * Two setups can be merged when both are SVMLight, or when their column counts match.
   * Note the operator precedence: this reads as
   * (sameType &amp;&amp; SVMLight) || sameColumnCount — a column-count match is accepted
   * regardless of parse type; type conflicts are resolved by the caller (mergeSetups).
   */
  protected boolean isCompatible(ParseSetup setupB) {
    return _parse_type.equals(setupB._parse_type) && _parse_type.equals(DefaultParserProviders.SVMLight_INFO) || _number_columns == setupB._number_columns;
  }
  /**
   * Guess everything from a single pile-o-bits.  Used in tests, or in initial
   * parser inspections when the user has not told us anything about separators
   * or headers.
   *
   * @param bits Initial bytes from a parse source
   * @return ParseSetup settings from looking at all files
   */
  public static ParseSetup guessSetup( ByteVec bv, byte [] bits, ParseSetup userSetup ) {
    // Delegate with the user's preferences; the column count is always re-guessed.
    return guessSetup(bv, bits, userSetup._parse_type, userSetup._separator, GUESS_COL_CNT, userSetup._single_quotes, userSetup._check_header, userSetup._column_names, userSetup._column_types, null, null);
  }
public static ParseSetup guessSetup(ByteVec bv, byte [] bits, ParserInfo parserType, byte sep, int ncols, boolean singleQuotes, int checkHeader, String[] columnNames, byte[] columnTypes, String[][] domains, String[][] naStrings ) {
ParserProvider pp = ParserService.INSTANCE.getByInfo(parserType);
if (pp != null) {
return pp.guessSetup(bv, bits, sep, ncols, singleQuotes, checkHeader, columnNames, columnTypes, domains, naStrings);
}
throw new ParseDataset.H2OParseException("Cannot determine file type.");
}
  /**
   * Cleans up the file name to make .hex name
   * to be used as a destination key.  Eliminates
   * common file extensions, and replaces odd
   * characters.
   *
   * @param n filename to be cleaned
   * @return cleaned name
   */
  public static String createHexName(String n) {
    // blahblahblah/myName.ext ==> myName
    // blahblahblah/myName.csv.ext ==> myName
    int sep = n.lastIndexOf(java.io.File.separatorChar);
    if( sep > 0 ) n = n.substring(sep+1);
    int dot = n.lastIndexOf('.');
    // Repeatedly strip known data-file extensions (handles e.g. "name.csv.gz").
    // NOTE(review): the matches are bare suffixes ("zip", not ".zip"), so a name
    // like "report.targz" would also be stripped at its last dot — presumably
    // acceptable; confirm before tightening.
    while ( dot > 0 &&
        (n.endsWith("zip")
         || n.endsWith("gz")
         || n.endsWith("csv")
         || n.endsWith("xls")
         || n.endsWith("txt")
         || n.endsWith("svm")
         || n.endsWith("orc")
         || n.endsWith("arff"))) {
      n = n.substring(0, dot);
      dot = n.lastIndexOf('.');
    }
    // "2012_somedata" ==> "X2012_somedata"
    if( !Character.isJavaIdentifierStart(n.charAt(0)) ) n = "X"+n;
    // "human%Percent" ==> "human_Percent"
    char[] cs = n.toCharArray();
    for( int i=1; i<cs.length; i++ )
      if( !Character.isJavaIdentifierPart(cs[i]) )
        cs[i] = '_';
    // "myName" ==> "myName.hex"
    n = new String(cs);
    int i = 0;
    String res = n + ".hex";
    Key k = Key.make(res);
    // Renumber to handle dup names: myName1.hex, myName2.hex, ...
    while(DKV.get(k) != null)
      k = Key.make(res = n + ++i + ".hex");
    return res;
  }
/**
* Reject unsupported encodings
*
* For the curious, this is hardly a complete test, it only catches the
* most polite UTF-16 cases. Switch to jChardet or guessEncoding libraries
* for more robust solutions. WARNING: not all UTF-16 files
* use BOM to indicate their encoding. Even worse, some datasets may be
* made from disparate sources, and could used a mix that wouldn't be
* detected by this.
*
* @param bits data to be examined for encoding
*/
private static final void checkEncoding(byte[] bits) {
if (bits.length >= 2) {
if ((bits[0] == (byte) 0xff && bits[1] == (byte) 0xfe) /* UTF-16, little endian */ ||
(bits[0] == (byte) 0xfe && bits[1] == (byte) 0xff) /* UTF-16, big endian */) {
throw new ParseDataset.H2OParseException("UTF16 encoding detected, but is not supported.");
}
}
}
/**
* Compute the longest line length in an array of bytes
* @param bytes Array of bytes (containing 0 or more newlines)
* @return The longest line length in the given bytes
*/
private static final long maxLineLength(byte[] bytes) {
if (bytes.length >= 2) {
String st = new String(bytes);
StringReader sr = new StringReader(st);
BufferedReader br = new BufferedReader(sr);
String line;
long maxLineLength=0;
try {
while(true) {
line = br.readLine();
if (line == null) break;
maxLineLength = Math.max(line.length(), maxLineLength);
}
} catch (IOException e) {
return -1;
}
return maxLineLength;
}
return -1;
}
  /**
   * Copies the common setup to another object (that is possibly an extension of the base setup).
   * Note: this method only copies fields directly declared in ParseSetup class, it doesn't handle
   * fields that are declared in classes derived from ParseSetup.
   * Note: the copy is shallow — array fields end up shared between the two objects.
   * @param setup target setup object
   * @param <T> class derived from ParseSetup
   * @return the target setup object (for convenience)
   */
  public <T extends ParseSetup> T copyTo(T setup) {
    try {
      // Reflectively copy every non-static field declared directly on ParseSetup.
      for (Field field : ParseSetup.class.getDeclaredFields()) {
        if (! java.lang.reflect.Modifier.isStatic(field.getModifiers())) field.set(setup, field.get(this));
      }
      return setup;
    } catch (IllegalAccessException e) {
      // Should not happen: fields of this class are accessible from within the class.
      throw new RuntimeException(e);
    }
  }
  public ParserInfo getParseType() {
    return _parse_type;
  }

  // Fluent setters below: each mutates this setup and returns it for chaining.

  public ParseSetup setParseType(ParserInfo parse_type) {
    this._parse_type = parse_type;
    return this;
  }

  public ParseSetup setSeparator(byte separator) {
    this._separator = separator;
    return this;
  }

  public ParseSetup setSingleQuotes(boolean single_quotes) {
    this._single_quotes = single_quotes;
    return this;
  }

  public ParseSetup setCheckHeader(int check_header) {
    this._check_header = check_header;
    return this;
  }

  public ParseSetup setNumberColumns(int number_columns) {
    this._number_columns = number_columns;
    return this;
  }

  public ParseSetup setColumnNames(String[] column_names) {
    this._column_names = column_names;
    return this;
  }

  public ParseSetup setColumnTypes(byte[] column_types) {
    this._column_types = column_types;
    return this;
  }

  public ParseSetup setDomains(String[][] domains) {
    this._domains = domains;
    return this;
  }

  public ParseSetup setNAStrings(String[][] na_strings) {
    this._na_strings = na_strings;
    return this;
  }

  public ParseSetup setChunkSize(int chunk_size) {
    this._chunk_size = chunk_size;
    return this;
  }
} // ParseSetup state class