package water.parser; import java.io.*; import java.util.Arrays; import java.util.HashMap; import java.util.ArrayList; import water.Key; import water.H2O; import water.util.UnsafeUtils; import static water.parser.DefaultParserProviders.XLS_INFO; class XlsParser extends Parser { XlsParser( ParseSetup ps, Key jobKey ) { super(ps, jobKey); } @Override protected ParseWriter parseChunk(int cidx, final ParseReader din, final ParseWriter dout) { throw H2O.unimpl(); } // A Stream, might be a Zip stream private InputStream _is; // The unpacked data. We expect we can fully hold the unzipped data. private byte[] _buf; private int _lim; // What was read so-far // Simple offset / lim over the underlying buffer private class Buf { final byte[] _buf; byte[] _bbuf; int _off, _lim; Buf( byte[] buf, int off, int size ) throws IOException { _buf = _bbuf = buf; _off = off; _lim = off+size; readAtLeast(_lim); } Buf( Buf B, int off, int size ) { _buf = _bbuf = B._bbuf; _off = off; _lim = off+size; assert _lim <= _buf.length; } void concat( int off, int size ) throws IOException { readAtLeast(off+size); if( _off == _lim ) { // Empty Buf, so concat is really assign _off = off; _lim = off+size; return; } if( off == _lim ) { // Adjacent, so just extend _lim += size; return; } _bbuf = Arrays.copyOfRange(_bbuf,_off,_lim+size); _lim = _lim-_off+size; _off = 0; System.arraycopy(_buf,off,_bbuf,_lim-size,size); } char get1(int pos ) { assert _off+pos+1<_lim; return (char)_bbuf[_off+pos]; } int get2( int pos ) { assert _off+pos+2<_lim; return UnsafeUtils.get2(_bbuf, _off + pos); } int get4( int pos ) { assert _off+pos+4<_lim; return UnsafeUtils.get4(_bbuf,_off+pos); } double get8d( int pos ) { assert _off+pos+8<_lim; return UnsafeUtils.get8d(_bbuf,_off+pos); } String getStr( int pos, int len ) { return new String(_bbuf,_off+pos,len); } } // Read & keep in _buf from the unpacked stream at least 'lim' bytes. // Toss a range-check if the stream runs dry too soon. private void readAtLeast(int lim) throws IOException{ if( lim <= _lim ) return; // Already read at least if( _buf == null ) _buf = new byte[0]; if( lim > _buf.length ) { // Need to grow buffer int oldlen = _buf.length, newlen = oldlen; if( newlen==0 ) newlen=1024; while( newlen < lim ) newlen<<=1; _buf = Arrays.copyOf(_buf,newlen); } // Now read/unzip until lim int x; while( _lim < lim && (x = _is.read(_buf,_lim,_buf.length-_lim)) != -1 ) _lim += x; if( _lim < lim ) throw new java.lang.ArrayIndexOutOfBoundsException("not an XLS file: reading at "+lim+" but file is only "+_lim+" bytes"); } // Wrapper to fetch an int at a random offset private int get4( int pos ) throws IOException { readAtLeast(pos+4); return UnsafeUtils.get4(_buf,pos); } /** Try to parse the bytes as XLS format */ public static ParseSetup guessSetup( byte[] bytes ) { XlsParser p = new XlsParser(new ParseSetup(XLS_INFO, ParseSetup.GUESS_SEP, false, ParseSetup.GUESS_HEADER, ParseSetup.GUESS_COL_CNT, null, null, null, null, null), null); p._buf = bytes; // No need to copy already-unpacked data; just use it directly p._lim = bytes.length; PreviewParseWriter dout = new PreviewParseWriter(); try{ p.streamParse(new ByteArrayInputStream(bytes), dout); } catch(IOException e) { throw new RuntimeException(e); } if (dout._ncols > 0 && dout._nlines > 0 && dout._nlines > dout._invalidLines) return new ParseSetup(XLS_INFO, ParseSetup.GUESS_SEP, false, dout.colNames()==null?ParseSetup.NO_HEADER:ParseSetup.HAS_HEADER,dout._ncols, dout.colNames(), dout.guessTypes(),null,null,dout._data); else throw new ParseDataset.H2OParseException("Could not parse file as an XLS file."); } /** Ported to Java from excel_reader2.php. * Found at: http://code.google.com/p/php-excel-reader/downloads/detail?name=php-excel-reader-2.21.zip&can=2&q= * * Originally developed by Vadim Tkachenko under the name PHPExcelReader. * (http://sourceforge.net/projects/phpexcelreader) * Based on the Java version by Andy Khan (http://www.andykhan.com). Now * maintained by David Sanders. Reads only Biff 7 and Biff 8 formats. * * PHP versions 4 and 5 * * LICENSE: This source file is subject to version 3.0 of the PHP license * that is available through the world-wide-web at the following URI: * http://www.php.net/license/3_0.txt. If you did not receive a copy of * the PHP License and are unable to obtain it through the web, please * send a note to license@php.net so we can mail you a copy immediately. */ private static final int NUM_BIG_BLOCK_DEPOT_BLOCKS_POS = 0x2c; private static final int SMALL_BLOCK_DEPOT_BLOCK_POS = 0x3c; private static final int ROOT_START_BLOCK_POS = 0x30; private static final int BIG_BLOCK_SIZE = 0x200; private static final int SMALL_BLOCK_SIZE = 0x40; private static final int EXTENSION_BLOCK_POS = 0x44; private static final int NUM_EXTENSION_BLOCK_POS = 0x48; private static final int PROPERTY_STORAGE_BLOCK_SIZE = 0x80; private static final int BIG_BLOCK_DEPOT_BLOCKS_POS = 0x4c; private static final int SMALL_BLOCK_THRESHOLD = 0x1000; // property storage offsets private static final int SIZE_OF_NAME_POS = 0x40; private static final int TYPE_POS = 0x42; private static final int START_BLOCK_POS = 0x74; private static final int SIZE_POS = 0x78; private static final byte[] IDENTIFIER_OLE = new byte[] { (byte)0xd0,(byte)0xcf,(byte)0x11,(byte)0xe0,(byte)0xa1,(byte)0xb1,(byte)0x1a,(byte)0xe1 }; // Breakdown of the OLE structure private int _numBigBlockDepotBlocks; private int _sbdStartBlock; private int _rootStartBlock; private int _extensionBlock; private int _numExtensionBlocks; private int[] _bigBlockChain; private int[] _smallBlockChain; private ArrayList<Props> _props = new ArrayList<>(); private static class Props { final String _name; final int _type, _startBlock, _size; Props( String name, int type, int startBlock, int size ) { _name = name; _type = type; _startBlock = startBlock; _size = size; } } private Props _wrkbook, _rootentry; @Override public ParseWriter streamParse( final InputStream is, final ParseWriter dout) throws IOException { _is = is; // Check for magic first readAtLeast(IDENTIFIER_OLE.length); for( int i=0; i<IDENTIFIER_OLE.length; i++ ) if( _buf[i] != IDENTIFIER_OLE[i] ) throw new ParseDataset.H2OParseException("Not a valid XLS file, lacks correct starting bits (aka magic number)."); _numBigBlockDepotBlocks = get4(NUM_BIG_BLOCK_DEPOT_BLOCKS_POS); _sbdStartBlock = get4(SMALL_BLOCK_DEPOT_BLOCK_POS); _rootStartBlock = get4(ROOT_START_BLOCK_POS); _extensionBlock = get4(EXTENSION_BLOCK_POS); _numExtensionBlocks = get4(NUM_EXTENSION_BLOCK_POS); int pos = BIG_BLOCK_DEPOT_BLOCKS_POS; int bbdBlocks = _numExtensionBlocks == 0 ? _numBigBlockDepotBlocks : (BIG_BLOCK_SIZE - BIG_BLOCK_DEPOT_BLOCKS_POS)/4; final int[] bigBlockDepotBlocks = new int[bbdBlocks]; for( int i = 0; i < bbdBlocks; i++ ) bigBlockDepotBlocks[i] = get4((pos+=4)-4); for( int j = 0; j < _numExtensionBlocks; j++ ) { pos = (_extensionBlock + 1) * BIG_BLOCK_SIZE; final int blocksToRead = Math.min(_numBigBlockDepotBlocks - bbdBlocks, BIG_BLOCK_SIZE / 4 - 1); for( int i = bbdBlocks; i < bbdBlocks + blocksToRead; i++ ) bigBlockDepotBlocks[i] = get4((pos+=4)-4); bbdBlocks += blocksToRead; if( bbdBlocks < _numBigBlockDepotBlocks ) _extensionBlock = get4(pos); } // readBigBlockDepot int index = 0; _bigBlockChain = new int[1]; for( int i = 0; i < _numBigBlockDepotBlocks; i++ ) { pos = (bigBlockDepotBlocks[i] + 1) * BIG_BLOCK_SIZE; for( int j = 0 ; j < BIG_BLOCK_SIZE / 4; j++ ) { _bigBlockChain[index++] = get4((pos+=4)-4); if( index==_bigBlockChain.length ) _bigBlockChain = Arrays.copyOf(_bigBlockChain,index<<1); } } // readSmallBlockDepot(); index = 0; int sbdBlock = _sbdStartBlock; int[] smallBlockChain = new int[1]; while( sbdBlock != -2 ) { pos = (sbdBlock + 1) * BIG_BLOCK_SIZE; for( int j = 0; j < BIG_BLOCK_SIZE / 4; j++ ) { smallBlockChain[index++] = get4((pos+=4)-4); if( index==smallBlockChain.length ) smallBlockChain = Arrays.copyOf(smallBlockChain,index<<1); } sbdBlock = _bigBlockChain[sbdBlock]; } // Read workbook & root entries __readPropertySets(__readData(_rootStartBlock)); // Read the workbook - this holds all the csv data Buf data = getWorkBook(); // Parse the workbook boolean res = parseWorkbook(data,dout); if( !res ) throw new IOException("not an XLS file"); return dout; } private Buf __readData(int block) throws IOException { Buf data = new Buf(_buf,0,0); while( block != -2 ) { int pos = (block + 1) * BIG_BLOCK_SIZE; data.concat(pos, BIG_BLOCK_SIZE); block = _bigBlockChain[block]; } return data; } // Find the workbook & root entries private void __readPropertySets(Buf entry) { int offset = 0; while( offset < entry._lim ) { Buf d = new Buf(entry, offset, PROPERTY_STORAGE_BLOCK_SIZE); int nameSize = d.get2(SIZE_OF_NAME_POS); int type = d._bbuf[TYPE_POS]; int startBlock = d.get4(START_BLOCK_POS); int size = d.get4(SIZE_POS); String name = ""; for( int i = 0; i < nameSize ; i+=2 ) name += (char)d.get2(i); name = name.replaceAll("\0", ""); // remove trailing nul (C string?) Props p = new Props(name,type,startBlock,size); _props.add(p); if( name.equalsIgnoreCase("workbook") || name.equalsIgnoreCase("book") ) _wrkbook = p; if( name.equals("Root Entry") ) _rootentry = p; offset += PROPERTY_STORAGE_BLOCK_SIZE; } } private Buf getWorkBook() throws IOException { if( _wrkbook._size < SMALL_BLOCK_THRESHOLD ) { Buf rootdata = __readData(_rootentry._startBlock); Buf streamData = new Buf(rootdata,0,0); int block = _wrkbook._startBlock; while( block != -2 ) { int pos = block * SMALL_BLOCK_SIZE; streamData.concat(pos, SMALL_BLOCK_SIZE); block = _smallBlockChain[block]; } return streamData; } else { int numBlocks = _wrkbook._size / BIG_BLOCK_SIZE; if( _wrkbook._size % BIG_BLOCK_SIZE != 0 ) numBlocks++; Buf streamData = new Buf(_buf,0,0); if( numBlocks == 0 ) return streamData; int block = _wrkbook._startBlock; while( block != -2 ) { int pos = (block + 1) * BIG_BLOCK_SIZE; streamData.concat(pos, BIG_BLOCK_SIZE); block = _bigBlockChain[block]; } return streamData; } } private static final int SPREADSHEET_EXCEL_READER_BIFF8 = 0x600; private static final int SPREADSHEET_EXCEL_READER_BIFF7 = 0x500; private static final int SPREADSHEET_EXCEL_READER_WORKBOOKGLOBALS = 0x5; private static final int SPREADSHEET_EXCEL_READER_WORKSHEET = 0x10; private static final int SPREADSHEET_EXCEL_READER_TYPE_BOF = 0x809; private static final int SPREADSHEET_EXCEL_READER_TYPE_EOF = 0x0a; private static final int SPREADSHEET_EXCEL_READER_TYPE_BOUNDSHEET = 0x85; private static final int SPREADSHEET_EXCEL_READER_TYPE_DIMENSION = 0x200; private static final int SPREADSHEET_EXCEL_READER_TYPE_ROW = 0x208; private static final int SPREADSHEET_EXCEL_READER_TYPE_DBCELL = 0xd7; private static final int SPREADSHEET_EXCEL_READER_TYPE_FILEPASS = 0x2f; private static final int SPREADSHEET_EXCEL_READER_TYPE_NOTE = 0x1c; private static final int SPREADSHEET_EXCEL_READER_TYPE_TXO = 0x1b6; private static final int SPREADSHEET_EXCEL_READER_TYPE_RK = 0x7e; private static final int SPREADSHEET_EXCEL_READER_TYPE_RK2 = 0x27e; private static final int SPREADSHEET_EXCEL_READER_TYPE_MULRK = 0xbd; private static final int SPREADSHEET_EXCEL_READER_TYPE_MULBLANK = 0xbe; private static final int SPREADSHEET_EXCEL_READER_TYPE_INDEX = 0x20b; private static final int SPREADSHEET_EXCEL_READER_TYPE_SST = 0xfc; private static final int SPREADSHEET_EXCEL_READER_TYPE_EXTSST = 0xff; private static final int SPREADSHEET_EXCEL_READER_TYPE_CONTINUE = 0x3c; private static final int SPREADSHEET_EXCEL_READER_TYPE_LABEL = 0x204; private static final int SPREADSHEET_EXCEL_READER_TYPE_LABELSST = 0xfd; private static final int SPREADSHEET_EXCEL_READER_TYPE_NUMBER = 0x203; private static final int SPREADSHEET_EXCEL_READER_TYPE_NAME = 0x18; private static final int SPREADSHEET_EXCEL_READER_TYPE_ARRAY = 0x221; private static final int SPREADSHEET_EXCEL_READER_TYPE_STRING = 0x207; private static final int SPREADSHEET_EXCEL_READER_TYPE_FORMULA = 0x406; private static final int SPREADSHEET_EXCEL_READER_TYPE_FORMULA2 = 0x6; private static final int SPREADSHEET_EXCEL_READER_TYPE_FORMAT = 0x41e; private static final int SPREADSHEET_EXCEL_READER_TYPE_XF = 0xe0; private static final int SPREADSHEET_EXCEL_READER_TYPE_BOOLERR = 0x205; private static final int SPREADSHEET_EXCEL_READER_TYPE_FONT = 0x0031; private static final int SPREADSHEET_EXCEL_READER_TYPE_PALETTE = 0x0092; private static final int SPREADSHEET_EXCEL_READER_TYPE_UNKNOWN = 0xffff; private static final int SPREADSHEET_EXCEL_READER_TYPE_NINETEENFOUR = 0x22; private static final int SPREADSHEET_EXCEL_READER_TYPE_MERGEDCELLS = 0xE5; private static final int SPREADSHEET_EXCEL_READER_UTCOFFSETDAYS = 25569; private static final int SPREADSHEET_EXCEL_READER_UTCOFFSETDAYS1904 = 24107; private static final int SPREADSHEET_EXCEL_READER_MSINADAY = 86400; private static final int SPREADSHEET_EXCEL_READER_TYPE_HYPER = 0x01b8; private static final int SPREADSHEET_EXCEL_READER_TYPE_COLINFO = 0x7d; private static final int SPREADSHEET_EXCEL_READER_TYPE_DEFCOLWIDTH = 0x55; private static final int SPREADSHEET_EXCEL_READER_TYPE_STANDARDWIDTH = 0x99; private static final String SPREADSHEET_EXCEL_READER_DEF_NUM_FORMAT = "%s"; // Excel spreadsheet specific stuff private int _version; private boolean _nineteenFour; private String[] _formatRecords = new String[1]; private ArrayList<String> _sst = new ArrayList<>(); private ArrayList<Sheet> _boundsheets = new ArrayList<>(); private static class XF { final int _indexCode; enum Type { Date, Number, Other } final Type _type; XF( int code, Type type ) { _indexCode = code; _type = type; } } private ArrayList<XF> _xfRecords = new ArrayList<>(); /** List of default date formats used by Excel */ private static HashMap<Integer,String> DATEFORMATS = new HashMap<>(); static { DATEFORMATS.put(0xe,"m/d/Y"); DATEFORMATS.put(0xf,"M-d-Y"); DATEFORMATS.put(0x10,"d-M"); DATEFORMATS.put(0x11,"M-Y"); DATEFORMATS.put(0x12,"h:i a"); DATEFORMATS.put(0x13,"h:i:s a"); DATEFORMATS.put(0x14,"H:i"); DATEFORMATS.put(0x15,"H:i:s"); DATEFORMATS.put(0x16,"d/m/Y H:i"); DATEFORMATS.put(0x2d,"i:s"); DATEFORMATS.put(0x2e,"H:i:s"); DATEFORMATS.put(0x2f,"i:s.S"); } /** Default number formats used by Excel */ private static HashMap<Integer,String> NUMBERFORMATS = new HashMap<>(); static { NUMBERFORMATS.put(0x1 ,"0"); NUMBERFORMATS.put(0x2 ,"0.00"); NUMBERFORMATS.put(0x3 ,"#,##0"); NUMBERFORMATS.put(0x4 ,"#,##0.00"); NUMBERFORMATS.put(0x5 ,"$#,##0;($#,##0)"); NUMBERFORMATS.put(0x6 ,"$#,##0;[Red]($#,##0)"); NUMBERFORMATS.put(0x7 ,"$#,##0.00;($#,##0.00)"); NUMBERFORMATS.put(0x8 ,"$#,##0.00;[Red]($#,##0.00)"); NUMBERFORMATS.put(0x9 ,"0%"); NUMBERFORMATS.put(0xa ,"0.00%"); NUMBERFORMATS.put(0xb ,"0.00E+00"); NUMBERFORMATS.put(0x25,"#,##0;(#,##0)"); NUMBERFORMATS.put(0x26,"#,##0;[Red](#,##0)"); NUMBERFORMATS.put(0x27,"#,##0.00;(#,##0.00)"); NUMBERFORMATS.put(0x28,"#,##0.00;[Red](#,##0.00)"); NUMBERFORMATS.put(0x29,"#,##0;(#,##0)"); // Not exactly NUMBERFORMATS.put(0x2a,"$#,##0;($#,##0)"); // Not exactly NUMBERFORMATS.put(0x2b,"#,##0.00;(#,##0.00)"); // Not exactly NUMBERFORMATS.put(0x2c,"$#,##0.00;($#,##0.00)"); // Not exactly NUMBERFORMATS.put(0x30,"##0.0E+0"); } /** * Parse a workbook */ private boolean parseWorkbook(Buf data, final ParseWriter dout) { int pos = 0; int code = data.get2(pos); int length = data.get2(pos+2); int version = data.get2(pos+4); int substreamType = data.get2(pos+6); _version = version; if( version != SPREADSHEET_EXCEL_READER_BIFF8 && version != SPREADSHEET_EXCEL_READER_BIFF7 ) return false; if( substreamType != SPREADSHEET_EXCEL_READER_WORKBOOKGLOBALS ) return false; pos += length + 4; code = data.get2(pos); length = data.get2(pos+2); while( code != SPREADSHEET_EXCEL_READER_TYPE_EOF ) { switch( code ) { case SPREADSHEET_EXCEL_READER_TYPE_SST: { int spos = pos + 4; int limitpos = spos + length; int uniqueStrings = data.get4(spos+4); spos += 8; for( int i = 0; i < uniqueStrings; i++ ) { // Read in the number of characters if (spos == limitpos) { int conlength = data.get2(spos+2); spos += 4; limitpos = spos + conlength; } int numChars = data.get2(spos); spos += 2; int optionFlags = data.get1(spos); spos++; boolean asciiEncoding = ((optionFlags & 0x01) == 0); boolean extendedString = ( (optionFlags & 0x04) != 0); // See if string contains formatting information boolean richString = ( (optionFlags & 0x08) != 0); int formattingRuns=0; if( richString ) // Read in the crun formattingRuns = data.get2((spos+=2)-2); int extendedRunLength=0; if( extendedString ) // Read in cchExtRst extendedRunLength = data.get4((spos+=4)-4); String retstr = null; int len = (asciiEncoding)? numChars : numChars*2; if( spos + len < limitpos ) { retstr = data.getStr((spos+=len)-len, len); } else { // found continue retstr = data.getStr(spos, limitpos - spos); int bytesRead = limitpos - spos; int charsLeft = numChars - ((asciiEncoding) ? bytesRead : (bytesRead / 2)); spos = limitpos; while (charsLeft > 0) { int opcode = data.get2(spos); int conlength = data.get2(spos+2); if( opcode != 0x3c ) return false; spos += 4; limitpos = spos + conlength; int option = data.get1(spos); spos += 1; // if (asciiEncoding && (option == 0)) { // len = min(charsLeft, limitpos - spos); // min(charsLeft, conlength); // retstr .= substr(data, spos, len); // charsLeft -= len; // asciiEncoding = true; // } // elseif (!asciiEncoding && (option != 0)) { // len = min(charsLeft * 2, limitpos - spos); // min(charsLeft, conlength); // retstr .= substr(data, spos, len); // charsLeft -= len/2; // asciiEncoding = false; // } // elseif (!asciiEncoding && (option == 0)) { // // Bummer - the string starts off as Unicode, but after the // // continuation it is in straightforward ASCII encoding // len = min(charsLeft, limitpos - spos); // min(charsLeft, conlength); // for (j = 0; j < len; j++) { // retstr .= data[spos + j].chr(0); // } // charsLeft -= len; // asciiEncoding = false; // } // else{ // newstr = ''; // for (j = 0; j < strlen(retstr); j++) { // newstr = retstr[j].chr(0); // } // retstr = newstr; // len = min(charsLeft * 2, limitpos - spos); // min(charsLeft, conlength); // retstr .= substr(data, spos, len); // charsLeft -= len/2; // asciiEncoding = false; // } // spos += len; throw H2O.unimpl(); } } retstr = (asciiEncoding) ? retstr : __encodeUTF16(retstr); if (richString) spos += 4 * formattingRuns; // For extended strings, skip over the extended string data if (extendedString) spos += extendedRunLength; _sst.add(retstr); } break; } case SPREADSHEET_EXCEL_READER_TYPE_FILEPASS: return false; case SPREADSHEET_EXCEL_READER_TYPE_NAME: break; case SPREADSHEET_EXCEL_READER_TYPE_FORMAT: { String formatString = version == SPREADSHEET_EXCEL_READER_BIFF8 ? data.getStr(pos+9, data.get2(pos+6)*(data.get1(pos+8) == 0 ? 1 : 2)) : data.getStr(pos+7, data.get1(pos+6)*2); int indexCode = data.get2(pos+4); while( indexCode >= _formatRecords.length ) _formatRecords = Arrays.copyOf(_formatRecords,_formatRecords.length<<1); _formatRecords[indexCode] = formatString; break; } case SPREADSHEET_EXCEL_READER_TYPE_FONT: break; // While the original php file parsed the font here, H2O just wants the data case SPREADSHEET_EXCEL_READER_TYPE_PALETTE: break; // While the original php file parsed the color palaette info here, H2O just wants the data case SPREADSHEET_EXCEL_READER_TYPE_XF: { // While the original php file parsed the extensive formatting info // here, H2O just wants the data. Limit to figuring out if excel thinks // this is a date-formatted field or not int indexCode = data.get2(pos+6); XF.Type t=null; if( DATEFORMATS.containsKey(indexCode) ) t = XF.Type.Date; else if( NUMBERFORMATS.containsKey(indexCode) ) t = XF.Type.Number; else if( indexCode < _formatRecords.length && _formatRecords[indexCode] != null ) t = XF.Type.Other; _xfRecords.add(new XF(indexCode,t)); break; } case SPREADSHEET_EXCEL_READER_TYPE_NINETEENFOUR: _nineteenFour = data.get1(pos+4) == 1; break; case SPREADSHEET_EXCEL_READER_TYPE_BOUNDSHEET: int recOffset = data.get4(pos+4); int recLength = data.get1(pos+10); String recName = version == SPREADSHEET_EXCEL_READER_BIFF8 ? data.getStr(pos+12, recLength*(data.get1(pos+11) == 0 ? 1 : 2)) : data.getStr(pos+11, recLength); _boundsheets.add(new Sheet(data,dout,recName,recOffset)); break; default: // nothing; ignore this block typed } pos += length + 4; code = data.get2(pos); length = data.get2(pos+2); } // Parse all Sheets, although honestly H2O probably only wants the 1st sheet for( Sheet sheet : _boundsheets ) sheet.parse(); return true; } // ------------------------------ // A single Excel Sheet private class Sheet { final String _name; final Buf _data; final int _offset; final ParseWriter _dout; int _numRows, _numCols; String[] _labels; int _currow = 0; double[] _ds; Sheet( Buf data, ParseWriter dout, String name, int offset ) { _data = data; _dout = dout; _name = name; _offset = offset; } // Get the next row spec - and thus cleanup the prior row int row(int spos) { int row = _data.get2(spos); if( row < _currow ) throw new RuntimeException("XLS file but rows running backwards"); return doRow(row); } int doRow(int row) { // Once we're done with row 0, look at the collection of Strings on this // row. If all columns have a String, declare it a label row. Else, // inject the partial Strings as categoricals. if( row > _currow && _currow == 0 ) { // Moving off of row 0 boolean header=true; for( String s : _labels ) header &= (s!=null); // All strings? if( header ) { // It's a header row _dout.setColumnNames(_labels.clone()); Arrays.fill(_labels,null); // Dont reuse them labels as categoricals _currow=1; // Done with this row } } // Advance to the next row while( _currow < row ) { _currow++; // Next row internally // Forward collected row to _dout. for( int i=0; i<_ds.length; i++ ) { if( _labels[i] != null ) { _dout.addStrCol(i,new BufferedString(_labels[i])); _labels[i] = null; } else { _dout.addNumCol(i,_ds[i]); _ds[i] = Double.NaN; } } _dout.newLine(); // And advance dout a line } return row; } boolean parse() { // read BOF int spos = _offset; int code = _data.get2(spos); int length = _data.get2(spos+2); int version = _data.get2(spos + 4); if( (version != SPREADSHEET_EXCEL_READER_BIFF8) && (version != SPREADSHEET_EXCEL_READER_BIFF7) ) return false; int substreamType = _data.get2(spos + 6); if( substreamType != SPREADSHEET_EXCEL_READER_WORKSHEET ) return false; spos += length + 4; String recType = null; while(true) { code = _data.get1(spos); if( code != SPREADSHEET_EXCEL_READER_TYPE_EOF) { code = _data.get2(spos); length = _data.get2(spos+2); recType = null; spos += 4; } switch( code ) { case SPREADSHEET_EXCEL_READER_TYPE_DIMENSION: if( _numRows == 0 && _numCols == 0 ) { if( length == 10 || version == SPREADSHEET_EXCEL_READER_BIFF7 ) { _numRows = _data.get2(spos+ 2); _numCols = _data.get2(spos+ 6); } else { _numRows = _data.get2(spos+ 4); _numCols = _data.get2(spos+10); } _labels = new String[_numCols]; _ds = new double[_numCols]; Arrays.fill(_ds,Double.NaN); } break; case SPREADSHEET_EXCEL_READER_TYPE_MERGEDCELLS: break; // While the original php file parsed merged-cells here, H2O just wants the _data case SPREADSHEET_EXCEL_READER_TYPE_RK: case SPREADSHEET_EXCEL_READER_TYPE_RK2: { int row = row(spos); int col = _data.get2(spos+2); double d = _GetIEEE754(_data.get4(spos+6)); if( isDate(_data, spos) ) throw H2O.unimpl(); _ds[col] = d; break; } case SPREADSHEET_EXCEL_READER_TYPE_LABELSST: { int row = row(spos); int col = _data.get2(spos+2); int index = _data.get4(spos+6); _labels[col] = _sst.get(index); // Set label break; } case SPREADSHEET_EXCEL_READER_TYPE_MULRK: { int row = row(spos); int colFirst= _data.get2(spos+2); int colLast = _data.get2(spos+length-2); int columns = colLast - colFirst + 1; int tmppos = spos+4; for( int i = 0; i < columns; i++ ) { double numValue = _GetIEEE754(_data.get4(tmppos + 2)); if( isDate( _data, tmppos-4) ) throw H2O.unimpl(); tmppos += 6; _ds[colFirst+i] = numValue; } break; } case SPREADSHEET_EXCEL_READER_TYPE_NUMBER: { int row = row(spos); int col = _data.get2(spos+2); double d = _data.get8d(spos+6); if( isDate(_data,spos) ) throw H2O.unimpl(); _ds[col] = d; break; } case SPREADSHEET_EXCEL_READER_TYPE_MULBLANK: { int row = row(spos); int col = _data.get2(spos+2); int cols= (length / 2) - 3; for( int c = 0; c < cols; c++ ) { if( isDate( _data, spos+(c*2)) ) throw H2O.unimpl(); _ds[col+c] = 0; } break; } case SPREADSHEET_EXCEL_READER_TYPE_FORMULA: case SPREADSHEET_EXCEL_READER_TYPE_FORMULA2: throw H2O.unimpl(); //row = ord(_data[spos]) | ord(_data[spos+1])<<8; //column = ord(_data[spos+2]) | ord(_data[spos+3])<<8; //if ((ord(_data[spos+6])==0) && (ord(_data[spos+12])==255) && (ord(_data[spos+13])==255)) { // //String formula. Result follows in a STRING record // // This row/col are stored to be referenced in that record // // http://code.google.com/p/php-excel-reader/issues/detail?id=4 // previousRow = row; // previousCol = column; //} elseif ((ord(_data[spos+6])==1) && (ord(_data[spos+12])==255) && (ord(_data[spos+13])==255)) { // //Boolean formula. Result is in +2; 0=false,1=true // // http://code.google.com/p/php-excel-reader/issues/detail?id=4 // if (ord(this->_data[spos+8])==1) { // this->addcell(row, column, "TRUE"); // } else { // this->addcell(row, column, "FALSE"); // } //} elseif ((ord(_data[spos+6])==2) && (ord(_data[spos+12])==255) && (ord(_data[spos+13])==255)) { // //Error formula. Error code is in +2; //} elseif ((ord(_data[spos+6])==3) && (ord(_data[spos+12])==255) && (ord(_data[spos+13])==255)) { // //Formula result is a null string. // this->addcell(row, column, ''); //} else { // // result is a number, so first 14 bytes are just like a _NUMBER record // tmp = unpack("ddouble", substr(_data, spos + 6, 8)); // It machine machine dependent // if (this->isDate(spos)) { // numValue = tmp['double']; // } // else { // numValue = this->createNumber(spos); // } // info = this->_getCellDetails(spos,numValue,column); // this->addcell(row, column, info['string'], info); //} //break; case SPREADSHEET_EXCEL_READER_TYPE_BOOLERR: throw H2O.unimpl(); //row = ord(_data[spos]) | ord(_data[spos+1])<<8; //column = ord(_data[spos+2]) | ord(_data[spos+3])<<8; //string = ord(_data[spos+6]); //this->addcell(row, column, string); //break; case SPREADSHEET_EXCEL_READER_TYPE_STRING: throw H2O.unimpl(); //// http://code.google.com/p/php-excel-reader/issues/detail?id=4 //if (version == SPREADSHEET_EXCEL_READER_BIFF8){ // // Unicode 16 string, like an SST record // xpos = spos; // numChars =ord(_data[xpos]) | (ord(_data[xpos+1]) << 8); // xpos += 2; // optionFlags =ord(_data[xpos]); // xpos++; // asciiEncoding = ((optionFlags &0x01) == 0) ; // extendedString = ((optionFlags & 0x04) != 0); // // See if string contains formatting information // richString = ((optionFlags & 0x08) != 0); // if (richString) { // // Read in the crun // formattingRuns =ord(_data[xpos]) | (ord(_data[xpos+1]) << 8); // xpos += 2; // } // if (extendedString) { // // Read in cchExtRst // extendedRunLength =this->_GetInt4d(this->_data, xpos); // xpos += 4; // } // len = (asciiEncoding)?numChars : numChars*2; // retstr =substr(_data, xpos, len); // xpos += len; // retstr = (asciiEncoding)? retstr : this->_encodeUTF16(retstr); //} //elseif (version == SPREADSHEET_EXCEL_READER_BIFF7){ // // Simple byte string // xpos = spos; // numChars =ord(_data[xpos]) | (ord(_data[xpos+1]) << 8); // xpos += 2; // retstr =substr(_data, xpos, numChars); //} //this->addcell(previousRow, previousCol, retstr); //break; case SPREADSHEET_EXCEL_READER_TYPE_ROW: break; // While the original php file parsed the row info here, H2O just wants the _data case SPREADSHEET_EXCEL_READER_TYPE_DBCELL: break; case SPREADSHEET_EXCEL_READER_TYPE_LABEL: throw H2O.unimpl(); //row = ord(_data[spos]) | ord(_data[spos+1])<<8; //column = ord(_data[spos+2]) | ord(_data[spos+3])<<8; //this->addcell(row, column, substr(_data, spos + 8, ord(_data[spos + 6]) | ord(_data[spos + 7])<<8)); //break; case SPREADSHEET_EXCEL_READER_TYPE_EOF: // Push out the final row doRow(_currow+1); return true; case SPREADSHEET_EXCEL_READER_TYPE_HYPER: throw H2O.unimpl(); //// Only handle hyperlinks to a URL //row = ord(this->_data[spos]) | ord(this->_data[spos+1])<<8; //row2 = ord(this->_data[spos+2]) | ord(this->_data[spos+3])<<8; //column = ord(this->_data[spos+4]) | ord(this->_data[spos+5])<<8; //column2 = ord(this->_data[spos+6]) | ord(this->_data[spos+7])<<8; //linkData = Array(); //flags = ord(this->_data[spos + 28]); //udesc = ""; //ulink = ""; //uloc = 32; //linkData['flags'] = flags; //if ((flags & 1) > 0 ) { // is a type we understand // // is there a description ? // if ((flags & 0x14) == 0x14 ) { // has a description // uloc += 4; // descLen = ord(this->_data[spos + 32]) | ord(this->_data[spos + 33]) << 8; // udesc = substr(this->_data, spos + uloc, descLen * 2); // uloc += 2 * descLen; // } // ulink = this->read16bitstring(this->_data, spos + uloc + 20); // if (udesc == "") { // udesc = ulink; // } //} //linkData['desc'] = udesc; //linkData['link'] = this->_encodeUTF16(ulink); //for (r=row; r<=row2; r++) { // for (c=column; c<=column2; c++) { // this['cellsInfo'][r+1][c+1]['hyperlink'] = linkData; // } //} //break; case SPREADSHEET_EXCEL_READER_TYPE_DEFCOLWIDTH: break; // Set default column width case SPREADSHEET_EXCEL_READER_TYPE_STANDARDWIDTH: break; // While the original php file parsed the standard width here, H2O just wants the _data case SPREADSHEET_EXCEL_READER_TYPE_COLINFO: break; // While the original php file parsed the column info here, H2O just wants the _data default: break; } spos += length; } } } boolean isDate( Buf data, int spos ) { int xfindex = data.get2(spos+4); return _xfRecords.get(xfindex)._type == XF.Type.Date; } static double _GetIEEE754(long rknum) { double value; if( (rknum & 0x02) != 0) { value = rknum >> 2; } else { //mmp // I got my info on IEEE754 encoding from // http://research.microsoft.com/~hollasch/cgindex/coding/ieeefloat.html // The RK format calls for using only the most significant 30 bits of the // 64 bit floating point value. The other 34 bits are assumed to be 0 // So, we use the upper 30 bits of rknum as follows... int exp = (int)((rknum & 0x7ff00000L) >> 20); long mantissa = (0x100000 | (rknum & 0x000ffffc)); value = mantissa / Math.pow( 2 , (20- (exp - 1023))); if( ((rknum & 0x80000000) >> 31) != 0 ) value *= -1; //end of changes by mmp } if( (rknum & 0x01) != 0 ) value /= 100; return value; } // Ignore all encodings private String __encodeUTF16( String s ) { return s; } }