XlsParser.java example

Explorer
h2o-3-master
package water.parser;

import java.io.*;
import java.util.Arrays;
import java.util.HashMap;
import java.util.ArrayList;
import water.Key;
import water.H2O;
import water.util.UnsafeUtils;

import static water.parser.DefaultParserProviders.XLS_INFO;

class XlsParser extends Parser {
  XlsParser( ParseSetup ps, Key jobKey ) { super(ps, jobKey); }

  @Override
  protected ParseWriter parseChunk(int cidx, final ParseReader din, final ParseWriter dout) { throw H2O.unimpl(); }

  // A Stream, might be a Zip stream
  private InputStream _is;
  // The unpacked data.  We expect we can fully hold the unzipped data.
  private byte[] _buf;
  private int _lim;             // What was read so-far


  // Simple offset / lim over the underlying buffer
  private class Buf {
    final byte[] _buf;
    byte[] _bbuf;
    int _off, _lim;
    Buf( byte[] buf, int off, int size ) throws IOException { _buf = _bbuf = buf; _off = off; _lim = off+size; readAtLeast(_lim); }
    Buf( Buf B, int off, int size ) { _buf = _bbuf = B._bbuf; _off = off; _lim = off+size; assert _lim <= _buf.length; }

    void concat( int off, int size ) throws IOException {
      readAtLeast(off+size);
      if( _off == _lim ) {      // Empty Buf, so concat is really assign
        _off = off; _lim = off+size; 
        return;
      }
      if( off == _lim ) {       // Adjacent, so just extend
        _lim += size; 
        return;
      }
      _bbuf = Arrays.copyOfRange(_bbuf,_off,_lim+size);
      _lim = _lim-_off+size;
      _off = 0;
      System.arraycopy(_buf,off,_bbuf,_lim-size,size);
    }

    char get1(int pos ) { assert _off+pos+1<_lim; return (char)_bbuf[_off+pos]; }
    int get2( int pos ) { assert _off+pos+2<_lim; return UnsafeUtils.get2(_bbuf, _off + pos); }
    int get4( int pos ) { assert _off+pos+4<_lim; return UnsafeUtils.get4(_bbuf,_off+pos); }
    double get8d( int pos ) { assert _off+pos+8<_lim; return UnsafeUtils.get8d(_bbuf,_off+pos); }
    String getStr( int pos, int len ) { return new String(_bbuf,_off+pos,len); }
  }

  // Read & keep in _buf from the unpacked stream at least 'lim' bytes.
  // Toss a range-check if the stream runs dry too soon.
  private void readAtLeast(int lim) throws IOException{
    if( lim <= _lim ) return;   // Already read at least
    if( _buf == null ) _buf = new byte[0];
    if( lim > _buf.length ) { // Need to grow buffer
      int oldlen = _buf.length,  newlen = oldlen;
      if( newlen==0 ) newlen=1024;
      while( newlen < lim ) newlen<<=1;
      _buf = Arrays.copyOf(_buf,newlen);
    }
    // Now read/unzip until lim
    int x;
    while( _lim < lim && (x = _is.read(_buf,_lim,_buf.length-_lim)) != -1 )
      _lim += x;
    if( _lim < lim )
      throw new java.lang.ArrayIndexOutOfBoundsException("not an XLS file: reading at "+lim+" but file is only "+_lim+" bytes");
  }

  // Wrapper to fetch an int at a random offset
  private int get4( int pos ) throws IOException { readAtLeast(pos+4); return UnsafeUtils.get4(_buf,pos); }


  /** Try to parse the bytes as XLS format  */
  public static ParseSetup guessSetup( byte[] bytes ) {
    XlsParser p = new XlsParser(new ParseSetup(XLS_INFO, ParseSetup.GUESS_SEP, false,
                                ParseSetup.GUESS_HEADER, ParseSetup.GUESS_COL_CNT, null, null, null, null, null), null);
    p._buf = bytes;             // No need to copy already-unpacked data; just use it directly
    p._lim = bytes.length;
    PreviewParseWriter dout = new PreviewParseWriter();
    try{ p.streamParse(new ByteArrayInputStream(bytes), dout); } catch(IOException e) { throw new RuntimeException(e); }
    if (dout._ncols > 0 && dout._nlines > 0 && dout._nlines > dout._invalidLines)
      return new ParseSetup(XLS_INFO, ParseSetup.GUESS_SEP, false,
            dout.colNames()==null?ParseSetup.NO_HEADER:ParseSetup.HAS_HEADER,dout._ncols,
                                 dout.colNames(), dout.guessTypes(),null,null,dout._data);
    else throw new ParseDataset.H2OParseException("Could not parse file as an XLS file.");
  }


  /** Ported to Java from excel_reader2.php.
   *  Found at:  http://code.google.com/p/php-excel-reader/downloads/detail?name=php-excel-reader-2.21.zip&can=2&q=
   *
   *  Originally developed by Vadim Tkachenko under the name PHPExcelReader.
   *  (http://sourceforge.net/projects/phpexcelreader)
   *  Based on the Java version by Andy Khan (http://www.andykhan.com).  Now
   *  maintained by David Sanders.  Reads only Biff 7 and Biff 8 formats.
   * 
   *  PHP versions 4 and 5
   * 
   *  LICENSE: This source file is subject to version 3.0 of the PHP license
   *  that is available through the world-wide-web at the following URI:
   *  http://www.php.net/license/3_0.txt.  If you did not receive a copy of
   *  the PHP License and are unable to obtain it through the web, please
   *  send a note to license@php.net so we can mail you a copy immediately.
   */

  private static final int NUM_BIG_BLOCK_DEPOT_BLOCKS_POS = 0x2c;
  private static final int SMALL_BLOCK_DEPOT_BLOCK_POS = 0x3c;
  private static final int ROOT_START_BLOCK_POS = 0x30;
  private static final int BIG_BLOCK_SIZE = 0x200;
  private static final int SMALL_BLOCK_SIZE = 0x40;
  private static final int EXTENSION_BLOCK_POS = 0x44;
  private static final int NUM_EXTENSION_BLOCK_POS = 0x48;
  private static final int PROPERTY_STORAGE_BLOCK_SIZE = 0x80;
  private static final int BIG_BLOCK_DEPOT_BLOCKS_POS = 0x4c;
  private static final int SMALL_BLOCK_THRESHOLD = 0x1000;
  // property storage offsets
  private static final int SIZE_OF_NAME_POS = 0x40;
  private static final int TYPE_POS = 0x42;
  private static final int START_BLOCK_POS = 0x74;
  private static final int SIZE_POS = 0x78;
  private static final byte[] IDENTIFIER_OLE = new byte[] { (byte)0xd0,(byte)0xcf,(byte)0x11,(byte)0xe0,(byte)0xa1,(byte)0xb1,(byte)0x1a,(byte)0xe1 };

  // Breakdown of the OLE structure
  private int _numBigBlockDepotBlocks;
  private int _sbdStartBlock;
  private int _rootStartBlock;
  private int _extensionBlock;
  private int _numExtensionBlocks;
  private int[] _bigBlockChain;
  private int[] _smallBlockChain;
  private ArrayList<Props> _props = new ArrayList<>();
  private static class Props {
    final String _name;
    final int _type, _startBlock, _size;
    Props( String name, int type, int startBlock, int size ) { _name = name; _type = type; _startBlock = startBlock; _size = size; }
  }
  private Props _wrkbook, _rootentry;

  @Override public ParseWriter streamParse( final InputStream is, final ParseWriter dout) throws IOException {
    _is = is;
    // Check for magic first
    readAtLeast(IDENTIFIER_OLE.length);
    for( int i=0; i<IDENTIFIER_OLE.length; i++ ) 
      if( _buf[i] != IDENTIFIER_OLE[i] )
        throw new ParseDataset.H2OParseException("Not a valid XLS file, lacks correct starting bits (aka magic number).");

    _numBigBlockDepotBlocks = get4(NUM_BIG_BLOCK_DEPOT_BLOCKS_POS);
    _sbdStartBlock = get4(SMALL_BLOCK_DEPOT_BLOCK_POS);
    _rootStartBlock = get4(ROOT_START_BLOCK_POS);
    _extensionBlock = get4(EXTENSION_BLOCK_POS);
    _numExtensionBlocks = get4(NUM_EXTENSION_BLOCK_POS);

    int pos = BIG_BLOCK_DEPOT_BLOCKS_POS;
    int bbdBlocks = _numExtensionBlocks == 0 ? _numBigBlockDepotBlocks : (BIG_BLOCK_SIZE - BIG_BLOCK_DEPOT_BLOCKS_POS)/4;
    final int[] bigBlockDepotBlocks = new int[bbdBlocks];
    for( int i = 0; i < bbdBlocks; i++ )
      bigBlockDepotBlocks[i] = get4((pos+=4)-4);

    for( int j = 0; j < _numExtensionBlocks; j++ ) {
      pos = (_extensionBlock + 1) * BIG_BLOCK_SIZE;
      final int blocksToRead = Math.min(_numBigBlockDepotBlocks - bbdBlocks, BIG_BLOCK_SIZE / 4 - 1);
      for( int i = bbdBlocks; i < bbdBlocks + blocksToRead; i++ )
        bigBlockDepotBlocks[i] = get4((pos+=4)-4);
      bbdBlocks += blocksToRead;
      if( bbdBlocks < _numBigBlockDepotBlocks )
        _extensionBlock = get4(pos);
    }

    // readBigBlockDepot
    int index = 0;
    _bigBlockChain = new int[1];
    for( int i = 0; i < _numBigBlockDepotBlocks; i++ ) {
      pos = (bigBlockDepotBlocks[i] + 1) * BIG_BLOCK_SIZE;
      for( int j = 0 ; j < BIG_BLOCK_SIZE / 4; j++ ) {
        _bigBlockChain[index++] = get4((pos+=4)-4);
        if( index==_bigBlockChain.length ) _bigBlockChain = Arrays.copyOf(_bigBlockChain,index<<1);
      }
    }

    // readSmallBlockDepot();
    index = 0;
    int sbdBlock = _sbdStartBlock;
    int[] smallBlockChain = new int[1];
    while( sbdBlock != -2 ) {
      pos = (sbdBlock + 1) * BIG_BLOCK_SIZE;
      for( int j = 0; j < BIG_BLOCK_SIZE / 4; j++ ) {
        smallBlockChain[index++] = get4((pos+=4)-4);
        if( index==smallBlockChain.length ) smallBlockChain = Arrays.copyOf(smallBlockChain,index<<1);
      }
      sbdBlock = _bigBlockChain[sbdBlock];
    }

    // Read workbook & root entries
    __readPropertySets(__readData(_rootStartBlock));
    // Read the workbook - this holds all the csv data
    Buf data = getWorkBook();
    // Parse the workbook
    boolean res = parseWorkbook(data,dout);
    if( !res ) throw new IOException("not an XLS file");

    return dout;
  }

  private Buf __readData(int block) throws IOException {
    Buf data = new Buf(_buf,0,0);
    while( block != -2 )  {
      int pos = (block + 1) * BIG_BLOCK_SIZE;
      data.concat(pos, BIG_BLOCK_SIZE);
      block = _bigBlockChain[block];
    }
    return data;
  }

  // Find the workbook & root entries
  private void __readPropertySets(Buf entry) {
    int offset = 0;
    while( offset < entry._lim ) {
      Buf d = new Buf(entry, offset, PROPERTY_STORAGE_BLOCK_SIZE);
      int nameSize = d.get2(SIZE_OF_NAME_POS);
      int type = d._bbuf[TYPE_POS];
      int startBlock = d.get4(START_BLOCK_POS);
      int size = d.get4(SIZE_POS);
      String name = "";
      for( int i = 0; i < nameSize ; i+=2 ) name += (char)d.get2(i);
      name = name.replaceAll("\0", ""); // remove trailing nul (C string?)
      Props p = new Props(name,type,startBlock,size);
      _props.add(p);
      if( name.equalsIgnoreCase("workbook") || name.equalsIgnoreCase("book") )
        _wrkbook = p;
      if( name.equals("Root Entry") )
        _rootentry = p;
      offset += PROPERTY_STORAGE_BLOCK_SIZE;
    }
  }

  
  private Buf getWorkBook() throws IOException {
    if( _wrkbook._size < SMALL_BLOCK_THRESHOLD ) {
      Buf rootdata = __readData(_rootentry._startBlock);
      Buf streamData = new Buf(rootdata,0,0);
      int block = _wrkbook._startBlock;
      while( block != -2 ) {
        int pos = block * SMALL_BLOCK_SIZE;
        streamData.concat(pos, SMALL_BLOCK_SIZE);
        block = _smallBlockChain[block];
      }
      return streamData;
    } else {
      int numBlocks = _wrkbook._size / BIG_BLOCK_SIZE;
      if( _wrkbook._size % BIG_BLOCK_SIZE != 0 )
        numBlocks++;
      Buf streamData = new Buf(_buf,0,0);
      if( numBlocks == 0 ) return streamData;
      int block = _wrkbook._startBlock;
      while( block != -2 ) {
        int pos = (block + 1) * BIG_BLOCK_SIZE;
        streamData.concat(pos, BIG_BLOCK_SIZE);
        block = _bigBlockChain[block];
      }
      return streamData;
    }
  }


  private static final int SPREADSHEET_EXCEL_READER_BIFF8 = 0x600;
  private static final int SPREADSHEET_EXCEL_READER_BIFF7 = 0x500;
  private static final int SPREADSHEET_EXCEL_READER_WORKBOOKGLOBALS = 0x5;
  private static final int SPREADSHEET_EXCEL_READER_WORKSHEET = 0x10;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_BOF = 0x809;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_EOF = 0x0a;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_BOUNDSHEET = 0x85;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_DIMENSION = 0x200;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_ROW = 0x208;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_DBCELL = 0xd7;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_FILEPASS = 0x2f;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_NOTE = 0x1c;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_TXO = 0x1b6;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_RK = 0x7e;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_RK2 = 0x27e;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_MULRK = 0xbd;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_MULBLANK = 0xbe;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_INDEX = 0x20b;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_SST = 0xfc;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_EXTSST = 0xff;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_CONTINUE = 0x3c;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_LABEL = 0x204;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_LABELSST = 0xfd;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_NUMBER = 0x203;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_NAME = 0x18;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_ARRAY = 0x221;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_STRING = 0x207;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_FORMULA = 0x406;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_FORMULA2 = 0x6;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_FORMAT = 0x41e;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_XF = 0xe0;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_BOOLERR = 0x205;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_FONT = 0x0031;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_PALETTE = 0x0092;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_UNKNOWN = 0xffff;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_NINETEENFOUR = 0x22;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_MERGEDCELLS = 0xE5;
  private static final int SPREADSHEET_EXCEL_READER_UTCOFFSETDAYS = 25569;
  private static final int SPREADSHEET_EXCEL_READER_UTCOFFSETDAYS1904 = 24107;
  private static final int SPREADSHEET_EXCEL_READER_MSINADAY = 86400;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_HYPER = 0x01b8;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_COLINFO = 0x7d;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_DEFCOLWIDTH = 0x55;
  private static final int SPREADSHEET_EXCEL_READER_TYPE_STANDARDWIDTH = 0x99;
  private static final String SPREADSHEET_EXCEL_READER_DEF_NUM_FORMAT = "%s";

  // Excel spreadsheet specific stuff
  private int _version;
  private boolean _nineteenFour;
  private String[] _formatRecords = new String[1];
  private ArrayList<String> _sst = new ArrayList<>();
  private ArrayList<Sheet> _boundsheets = new ArrayList<>();
  private static class XF {
    final int _indexCode;
    enum Type { Date, Number, Other }
    final Type _type;
    XF( int code, Type type ) { _indexCode = code; _type = type; }
  }
  private ArrayList<XF> _xfRecords = new ArrayList<>();

  /** List of default date formats used by Excel */
  private static HashMap<Integer,String> DATEFORMATS = new HashMap<>();
  static {
    DATEFORMATS.put(0xe,"m/d/Y");
    DATEFORMATS.put(0xf,"M-d-Y");
    DATEFORMATS.put(0x10,"d-M");
    DATEFORMATS.put(0x11,"M-Y");
    DATEFORMATS.put(0x12,"h:i a");
    DATEFORMATS.put(0x13,"h:i:s a");
    DATEFORMATS.put(0x14,"H:i");
    DATEFORMATS.put(0x15,"H:i:s");
    DATEFORMATS.put(0x16,"d/m/Y H:i");
    DATEFORMATS.put(0x2d,"i:s");
    DATEFORMATS.put(0x2e,"H:i:s");
    DATEFORMATS.put(0x2f,"i:s.S");
  }

  /** Default number formats used by Excel */
  private static HashMap<Integer,String> NUMBERFORMATS = new HashMap<>();
  static {
    NUMBERFORMATS.put(0x1 ,"0");
    NUMBERFORMATS.put(0x2 ,"0.00");
    NUMBERFORMATS.put(0x3 ,"#,##0");
    NUMBERFORMATS.put(0x4 ,"#,##0.00");
    NUMBERFORMATS.put(0x5 ,"$#,##0;($#,##0)");
    NUMBERFORMATS.put(0x6 ,"$#,##0;[Red]($#,##0)");
    NUMBERFORMATS.put(0x7 ,"$#,##0.00;($#,##0.00)");
    NUMBERFORMATS.put(0x8 ,"$#,##0.00;[Red]($#,##0.00)");
    NUMBERFORMATS.put(0x9 ,"0%");
    NUMBERFORMATS.put(0xa ,"0.00%");
    NUMBERFORMATS.put(0xb ,"0.00E+00");
    NUMBERFORMATS.put(0x25,"#,##0;(#,##0)");
    NUMBERFORMATS.put(0x26,"#,##0;[Red](#,##0)");
    NUMBERFORMATS.put(0x27,"#,##0.00;(#,##0.00)");
    NUMBERFORMATS.put(0x28,"#,##0.00;[Red](#,##0.00)");
    NUMBERFORMATS.put(0x29,"#,##0;(#,##0)");  // Not exactly
    NUMBERFORMATS.put(0x2a,"$#,##0;($#,##0)");  // Not exactly
    NUMBERFORMATS.put(0x2b,"#,##0.00;(#,##0.00)");  // Not exactly
    NUMBERFORMATS.put(0x2c,"$#,##0.00;($#,##0.00)");  // Not exactly
    NUMBERFORMATS.put(0x30,"##0.0E+0");
  }


  /**
   * Parse a workbook
   */
  private boolean parseWorkbook(Buf data, final ParseWriter dout) {
    int pos = 0;

    int code = data.get2(pos);
    int length = data.get2(pos+2);
    int version = data.get2(pos+4);
    int substreamType = data.get2(pos+6);

    _version = version;

    if( version != SPREADSHEET_EXCEL_READER_BIFF8 &&
        version != SPREADSHEET_EXCEL_READER_BIFF7 )
      return false;

    if( substreamType != SPREADSHEET_EXCEL_READER_WORKBOOKGLOBALS )
      return false;

    pos += length + 4;
    code = data.get2(pos);
    length = data.get2(pos+2);

    while( code != SPREADSHEET_EXCEL_READER_TYPE_EOF ) {
      switch( code ) {
      case SPREADSHEET_EXCEL_READER_TYPE_SST: {
        int spos = pos + 4;
        int limitpos = spos + length;
        int uniqueStrings = data.get4(spos+4);
        spos += 8;
        for( int i = 0; i < uniqueStrings; i++ ) {
          // Read in the number of characters
          if (spos == limitpos) {
            int conlength = data.get2(spos+2);
            spos += 4;
            limitpos = spos + conlength;
          }
          int numChars = data.get2(spos);
          spos += 2;
          int optionFlags = data.get1(spos);
          spos++;
          boolean asciiEncoding = ((optionFlags & 0x01) == 0);
          boolean extendedString = ( (optionFlags & 0x04) != 0);

          // See if string contains formatting information
          boolean richString = ( (optionFlags & 0x08) != 0);

          int formattingRuns=0;
          if( richString )      // Read in the crun
            formattingRuns = data.get2((spos+=2)-2);

          int extendedRunLength=0;
          if( extendedString )  // Read in cchExtRst
            extendedRunLength = data.get4((spos+=4)-4);

          String retstr = null;
          int len = (asciiEncoding)? numChars : numChars*2;
          if( spos + len < limitpos ) {
            retstr = data.getStr((spos+=len)-len, len);
          }
          else {
            // found continue
            retstr = data.getStr(spos, limitpos - spos);
            int bytesRead = limitpos - spos;
            int charsLeft = numChars - ((asciiEncoding) ? bytesRead : (bytesRead / 2));
            spos = limitpos;

            while (charsLeft > 0) {
              int opcode = data.get2(spos);
              int conlength = data.get2(spos+2);
              if( opcode != 0x3c ) return false;
              spos += 4;
              limitpos = spos + conlength;
              int option = data.get1(spos);
              spos += 1;
      //        if (asciiEncoding && (option == 0)) {
      //          len = min(charsLeft, limitpos - spos); // min(charsLeft, conlength);
      //          retstr .= substr(data, spos, len);
      //          charsLeft -= len;
      //          asciiEncoding = true;
      //        }
      //        elseif (!asciiEncoding && (option != 0)) {
      //          len = min(charsLeft * 2, limitpos - spos); // min(charsLeft, conlength);
      //          retstr .= substr(data, spos, len);
      //          charsLeft -= len/2;
      //          asciiEncoding = false;
      //        }
      //        elseif (!asciiEncoding && (option == 0)) {
      //          // Bummer - the string starts off as Unicode, but after the
      //          // continuation it is in straightforward ASCII encoding
      //          len = min(charsLeft, limitpos - spos); // min(charsLeft, conlength);
      //          for (j = 0; j < len; j++) {
      //            retstr .= data[spos + j].chr(0);
      //          }
      //          charsLeft -= len;
      //          asciiEncoding = false;
      //        }
      //        else{
      //          newstr = '';
      //          for (j = 0; j < strlen(retstr); j++) {
      //            newstr = retstr[j].chr(0);
      //          }
      //          retstr = newstr;
      //          len = min(charsLeft * 2, limitpos - spos); // min(charsLeft, conlength);
      //          retstr .= substr(data, spos, len);
      //          charsLeft -= len/2;
      //          asciiEncoding = false;
      //        }
      //        spos += len;
              throw H2O.unimpl();
            }
          }
          retstr = (asciiEncoding) ? retstr : __encodeUTF16(retstr);

          if (richString) spos += 4 * formattingRuns;
          // For extended strings, skip over the extended string data
          if (extendedString) spos += extendedRunLength;
          _sst.add(retstr);
        }
        break;
      }
      case SPREADSHEET_EXCEL_READER_TYPE_FILEPASS:
        return false;
      case SPREADSHEET_EXCEL_READER_TYPE_NAME:
        break;
      case SPREADSHEET_EXCEL_READER_TYPE_FORMAT: {
        String formatString = version == SPREADSHEET_EXCEL_READER_BIFF8
          ? data.getStr(pos+9, data.get2(pos+6)*(data.get1(pos+8) == 0 ? 1 : 2))
          : data.getStr(pos+7, data.get1(pos+6)*2);
        int indexCode = data.get2(pos+4);
        while( indexCode >= _formatRecords.length ) 
          _formatRecords = Arrays.copyOf(_formatRecords,_formatRecords.length<<1);
        _formatRecords[indexCode] = formatString;
        break;
      }
      case SPREADSHEET_EXCEL_READER_TYPE_FONT: 
        break; // While the original php file parsed the font here, H2O just wants the data
      case SPREADSHEET_EXCEL_READER_TYPE_PALETTE:
        break; // While the original php file parsed the color palaette info here, H2O just wants the data
      case SPREADSHEET_EXCEL_READER_TYPE_XF: {
        // While the original php file parsed the extensive formatting info
        // here, H2O just wants the data.  Limit to figuring out if excel thinks
        // this is a date-formatted field or not
        int indexCode = data.get2(pos+6);
        XF.Type t=null;
        if( DATEFORMATS.containsKey(indexCode) )
          t = XF.Type.Date;
        else if( NUMBERFORMATS.containsKey(indexCode) )
          t = XF.Type.Number;
        else if( indexCode < _formatRecords.length && _formatRecords[indexCode] != null )
          t = XF.Type.Other;
        _xfRecords.add(new XF(indexCode,t));
        break; 
      }
      case SPREADSHEET_EXCEL_READER_TYPE_NINETEENFOUR:
        _nineteenFour = data.get1(pos+4) == 1;
        break;
      case SPREADSHEET_EXCEL_READER_TYPE_BOUNDSHEET:
        int recOffset = data.get4(pos+4);
        int recLength = data.get1(pos+10);
        String recName = version == SPREADSHEET_EXCEL_READER_BIFF8
          ? data.getStr(pos+12, recLength*(data.get1(pos+11) == 0 ? 1 : 2))
          : data.getStr(pos+11, recLength);
        _boundsheets.add(new Sheet(data,dout,recName,recOffset));
        break;
      default:
        // nothing; ignore this block typed
      }
      
      pos += length + 4;
      code = data.get2(pos);
      length = data.get2(pos+2);
    }

    // Parse all Sheets, although honestly H2O probably only wants the 1st sheet
    for( Sheet sheet : _boundsheets )
      sheet.parse();
    return true;
  }

  // ------------------------------
  // A single Excel Sheet
  private class Sheet { 
    final String _name;
    final Buf _data;
    final int _offset;
    final ParseWriter _dout;
    
    int _numRows, _numCols;
    String[] _labels;
    int _currow = 0;
    double[] _ds;

    Sheet( Buf data, ParseWriter dout, String name, int offset ) { _data = data; _dout = dout; _name = name; _offset = offset; }

    // Get the next row spec - and thus cleanup the prior row
    int row(int spos) {
      int row = _data.get2(spos);
      if( row < _currow ) throw new RuntimeException("XLS file but rows running backwards");
      return doRow(row);
    }
    int doRow(int row) {
      // Once we're done with row 0, look at the collection of Strings on this
      // row.  If all columns have a String, declare it a label row.  Else,
      // inject the partial Strings as categoricals.
      if( row > _currow && _currow == 0 ) {      // Moving off of row 0
        boolean header=true;
        for( String s : _labels ) header &= (s!=null); // All strings?
        if( header ) {          // It's a header row
          _dout.setColumnNames(_labels.clone());
          Arrays.fill(_labels,null); // Dont reuse them labels as categoricals
          _currow=1;                 // Done with this row
        }
      }
      // Advance to the next row
      while( _currow < row ) { 
        _currow++;              // Next row internally
        // Forward collected row to _dout.  
        for( int i=0; i<_ds.length; i++ ) {
          if( _labels[i] != null ) { _dout.addStrCol(i,new BufferedString(_labels[i])); _labels[i] = null; }
          else { _dout.addNumCol(i,_ds[i]); _ds[i] = Double.NaN; }
        }
        _dout.newLine();        // And advance dout a line
      }
      return row;
    }

    boolean parse() {
      // read BOF
      int spos = _offset;
      int code = _data.get2(spos);
      int length = _data.get2(spos+2);
      int version = _data.get2(spos + 4);
      if( (version != SPREADSHEET_EXCEL_READER_BIFF8) && (version != SPREADSHEET_EXCEL_READER_BIFF7) )
        return false;

      int substreamType = _data.get2(spos + 6);
      if( substreamType != SPREADSHEET_EXCEL_READER_WORKSHEET )
        return false;
      spos += length + 4;
      String recType = null;

      while(true) {
        code = _data.get1(spos);
        if( code != SPREADSHEET_EXCEL_READER_TYPE_EOF) {
          code = _data.get2(spos);
          length = _data.get2(spos+2);
          recType = null;
          spos += 4;
        }

        switch( code ) {
        case SPREADSHEET_EXCEL_READER_TYPE_DIMENSION:
          if( _numRows == 0 && _numCols == 0 ) {
            if( length == 10 || version == SPREADSHEET_EXCEL_READER_BIFF7 ) {
              _numRows = _data.get2(spos+ 2);
              _numCols = _data.get2(spos+ 6);
            } else {
              _numRows = _data.get2(spos+ 4);
              _numCols = _data.get2(spos+10);
            }
            _labels = new String[_numCols];
            _ds = new double[_numCols];
            Arrays.fill(_ds,Double.NaN);
          }
          break;
        case SPREADSHEET_EXCEL_READER_TYPE_MERGEDCELLS: break; // While the original php file parsed merged-cells here, H2O just wants the _data
        case SPREADSHEET_EXCEL_READER_TYPE_RK:
        case SPREADSHEET_EXCEL_READER_TYPE_RK2: {
          int row = row(spos);
          int col = _data.get2(spos+2);
          double d  = _GetIEEE754(_data.get4(spos+6));
          if( isDate(_data, spos) ) throw H2O.unimpl();
          _ds[col] = d;
          break;
        }
        case SPREADSHEET_EXCEL_READER_TYPE_LABELSST: {
          int row = row(spos);
          int col = _data.get2(spos+2);
          int index = _data.get4(spos+6);
          _labels[col] = _sst.get(index); // Set label
          break;
        }
        case SPREADSHEET_EXCEL_READER_TYPE_MULRK: {
          int row     = row(spos);
          int colFirst= _data.get2(spos+2);
          int colLast = _data.get2(spos+length-2);
          int columns  = colLast - colFirst + 1;
          int tmppos = spos+4;
          for( int i = 0; i < columns; i++ ) {
            double numValue = _GetIEEE754(_data.get4(tmppos + 2));
            if( isDate( _data, tmppos-4) ) throw H2O.unimpl();
            tmppos += 6;
            _ds[colFirst+i] = numValue;
          }
          break;
        }
        case SPREADSHEET_EXCEL_READER_TYPE_NUMBER: {
          int row = row(spos);
          int col = _data.get2(spos+2);
          double d = _data.get8d(spos+6);
          if( isDate(_data,spos) ) throw H2O.unimpl();
          _ds[col] = d;
          break;
        }
        case SPREADSHEET_EXCEL_READER_TYPE_MULBLANK: {
          int row = row(spos);
          int col = _data.get2(spos+2);
          int cols= (length / 2) - 3;
          for( int c = 0; c < cols; c++ ) {
            if( isDate( _data, spos+(c*2)) ) throw H2O.unimpl();
            _ds[col+c] = 0;
          }
          break;
        }

        case SPREADSHEET_EXCEL_READER_TYPE_FORMULA:
        case SPREADSHEET_EXCEL_READER_TYPE_FORMULA2: throw H2O.unimpl();
          //row	= ord(_data[spos]) | ord(_data[spos+1])<<8;
          //column = ord(_data[spos+2]) | ord(_data[spos+3])<<8;
          //if ((ord(_data[spos+6])==0) && (ord(_data[spos+12])==255) && (ord(_data[spos+13])==255)) {
          //  //String formula. Result follows in a STRING record
          //  // This row/col are stored to be referenced in that record
          //  // http://code.google.com/p/php-excel-reader/issues/detail?id=4
          //  previousRow = row;
          //  previousCol = column;
          //} elseif ((ord(_data[spos+6])==1) && (ord(_data[spos+12])==255) && (ord(_data[spos+13])==255)) {
          //  //Boolean formula. Result is in +2; 0=false,1=true
          //  // http://code.google.com/p/php-excel-reader/issues/detail?id=4
          //  if (ord(this->_data[spos+8])==1) {
          //    this->addcell(row, column, "TRUE");
          //  } else {
          //    this->addcell(row, column, "FALSE");
          //  }
          //} elseif ((ord(_data[spos+6])==2) && (ord(_data[spos+12])==255) && (ord(_data[spos+13])==255)) {
          //  //Error formula. Error code is in +2;
          //} elseif ((ord(_data[spos+6])==3) && (ord(_data[spos+12])==255) && (ord(_data[spos+13])==255)) {
          //  //Formula result is a null string.
          //  this->addcell(row, column, '');
          //} else {
          //  // result is a number, so first 14 bytes are just like a _NUMBER record
          //  tmp = unpack("ddouble", substr(_data, spos + 6, 8)); // It machine machine dependent
          //  if (this->isDate(spos)) {
          //    numValue = tmp['double'];
          //  }
          //  else {
          //    numValue = this->createNumber(spos);
          //  }
          //  info = this->_getCellDetails(spos,numValue,column);
          //  this->addcell(row, column, info['string'], info);
          //}
          //break;
        case SPREADSHEET_EXCEL_READER_TYPE_BOOLERR: throw H2O.unimpl();
          //row	= ord(_data[spos]) | ord(_data[spos+1])<<8;
          //column = ord(_data[spos+2]) | ord(_data[spos+3])<<8;
          //string = ord(_data[spos+6]);
          //this->addcell(row, column, string);
          //break;
        case SPREADSHEET_EXCEL_READER_TYPE_STRING: throw H2O.unimpl();
          //// http://code.google.com/p/php-excel-reader/issues/detail?id=4
          //if (version == SPREADSHEET_EXCEL_READER_BIFF8){
          //  // Unicode 16 string, like an SST record
          //  xpos = spos;
          //  numChars =ord(_data[xpos]) | (ord(_data[xpos+1]) << 8);
          //  xpos += 2;
          //  optionFlags =ord(_data[xpos]);
          //  xpos++;
          //  asciiEncoding = ((optionFlags &0x01) == 0) ;
          //  extendedString = ((optionFlags & 0x04) != 0);
          //  // See if string contains formatting information
          //  richString = ((optionFlags & 0x08) != 0);
          //  if (richString) {
          //    // Read in the crun
          //    formattingRuns =ord(_data[xpos]) | (ord(_data[xpos+1]) << 8);
          //    xpos += 2;
          //  }
          //  if (extendedString) {
          //    // Read in cchExtRst
          //    extendedRunLength =this->_GetInt4d(this->_data, xpos);
          //    xpos += 4;
          //  }
          //  len = (asciiEncoding)?numChars : numChars*2;
          //  retstr =substr(_data, xpos, len);
          //  xpos += len;
          //  retstr = (asciiEncoding)? retstr : this->_encodeUTF16(retstr);
          //}
          //elseif (version == SPREADSHEET_EXCEL_READER_BIFF7){
          //  // Simple byte string
          //  xpos = spos;
          //  numChars =ord(_data[xpos]) | (ord(_data[xpos+1]) << 8);
          //  xpos += 2;
          //  retstr =substr(_data, xpos, numChars);
          //}
          //this->addcell(previousRow, previousCol, retstr);
          //break;
        case SPREADSHEET_EXCEL_READER_TYPE_ROW: break; // While the original php file parsed the row info here, H2O just wants the _data
        case SPREADSHEET_EXCEL_READER_TYPE_DBCELL:
          break;
        case SPREADSHEET_EXCEL_READER_TYPE_LABEL: throw H2O.unimpl();
          //row	= ord(_data[spos]) | ord(_data[spos+1])<<8;
          //column = ord(_data[spos+2]) | ord(_data[spos+3])<<8;
          //this->addcell(row, column, substr(_data, spos + 8, ord(_data[spos + 6]) | ord(_data[spos + 7])<<8));
          //break;
        case SPREADSHEET_EXCEL_READER_TYPE_EOF:
          // Push out the final row
          doRow(_currow+1);
          return true;
        case SPREADSHEET_EXCEL_READER_TYPE_HYPER: throw H2O.unimpl();
          ////  Only handle hyperlinks to a URL
          //row	= ord(this->_data[spos]) | ord(this->_data[spos+1])<<8;
          //row2   = ord(this->_data[spos+2]) | ord(this->_data[spos+3])<<8;
          //column = ord(this->_data[spos+4]) | ord(this->_data[spos+5])<<8;
          //column2 = ord(this->_data[spos+6]) | ord(this->_data[spos+7])<<8;
          //linkData = Array();
          //flags = ord(this->_data[spos + 28]);
          //udesc = "";
          //ulink = "";
          //uloc = 32;
          //linkData['flags'] = flags;
          //if ((flags & 1) > 0 ) {   // is a type we understand
          //  //  is there a description ?
          //  if ((flags & 0x14) == 0x14 ) {   // has a description
          //    uloc += 4;
          //    descLen = ord(this->_data[spos + 32]) | ord(this->_data[spos + 33]) << 8;
          //    udesc = substr(this->_data, spos + uloc, descLen * 2);
          //    uloc += 2 * descLen;
          //  }
          //  ulink = this->read16bitstring(this->_data, spos + uloc + 20);
          //  if (udesc == "") {
          //    udesc = ulink;
          //  }
          //}
          //linkData['desc'] = udesc;
          //linkData['link'] = this->_encodeUTF16(ulink);
          //for (r=row; r<=row2; r++) { 
          //  for (c=column; c<=column2; c++) {
          //    this['cellsInfo'][r+1][c+1]['hyperlink'] = linkData;
          //  }
          //}
          //break;
        case SPREADSHEET_EXCEL_READER_TYPE_DEFCOLWIDTH: break; // Set default column width
        case SPREADSHEET_EXCEL_READER_TYPE_STANDARDWIDTH: break; // While the original php file parsed the standard width here, H2O just wants the _data
        case SPREADSHEET_EXCEL_READER_TYPE_COLINFO: break; // While the original php file parsed the column info here, H2O just wants the _data

        default:
          break;
        }
        spos += length;
      }
    }
  }

  boolean isDate( Buf data, int spos ) {
    int xfindex = data.get2(spos+4);
    return _xfRecords.get(xfindex)._type == XF.Type.Date;
  }

  static double _GetIEEE754(long rknum) {
    double value;
    if( (rknum & 0x02) != 0) {
      value = rknum >> 2;
    } else {
      //mmp
      // I got my info on IEEE754 encoding from
      // http://research.microsoft.com/~hollasch/cgindex/coding/ieeefloat.html
      // The RK format calls for using only the most significant 30 bits of the
      // 64 bit floating point value. The other 34 bits are assumed to be 0
      // So, we use the upper 30 bits of rknum as follows...
      int exp = (int)((rknum & 0x7ff00000L) >> 20);
      long mantissa = (0x100000 | (rknum & 0x000ffffc));
      value = mantissa / Math.pow( 2 , (20- (exp - 1023)));
      if( ((rknum & 0x80000000) >> 31) != 0 ) value *= -1;
      //end of changes by mmp
    }
    if( (rknum & 0x01) != 0 )
      value /= 100;
    return value;
  }


  // Ignore all encodings
  private String __encodeUTF16( String s ) { return s; }

}