package water.parser; import java.io.*; import java.util.ArrayList; import org.apache.poi.hssf.eventusermodel.*; import org.apache.poi.hssf.eventusermodel.dummyrecord.LastCellOfRowDummyRecord; import org.apache.poi.hssf.eventusermodel.dummyrecord.MissingCellDummyRecord; import org.apache.poi.hssf.record.*; import org.apache.poi.poifs.filesystem.POIFSFileSystem; import water.util.Log; import water.util.Log.Tag.Sys; public class XlsParser extends CustomParser implements HSSFListener { private transient POIFSFileSystem _fs; private transient FormatTrackingHSSFListener _formatListener; private transient final ValueString _str = new ValueString(); private transient CustomParser.DataOut _dout; public XlsParser(){super(new ParserSetup(ParserType.XLS,CsvParser.AUTO_SEP,0,false,null,false));} public XlsParser(CustomParser.ParserSetup setup){super(null);} public XlsParser clone(){return new XlsParser(_setup);} @Override public DataOut streamParse( final InputStream is, final DataOut dout) throws Exception { _dout = dout; _firstRow = true; try { _fs = new POIFSFileSystem(is); MissingRecordAwareHSSFListener listener = new MissingRecordAwareHSSFListener(this); _formatListener = new FormatTrackingHSSFListener(listener); HSSFEventFactory factory = new HSSFEventFactory(); HSSFRequest request = new HSSFRequest(); request.addListenerForAllRecords(_formatListener); factory.processWorkbookEvents(request, _fs); } finally { try { is.close(); } catch (IOException e) { } } return dout; } /** * Try to parse the bits as svm light format, return SVMParser instance if the input is in svm light format, null otherwise. * @param bits * @return SVMLightPArser instance or null */ public static PSetupGuess guessSetup(byte [] bits){ InputStream is = new ByteArrayInputStream(bits); XlsParser p = new XlsParser(); CustomInspectDataOut dout = new CustomInspectDataOut(); try{p.streamParse(is, dout);}catch(Exception e){} return new PSetupGuess(new ParserSetup(ParserType.XLS,CsvParser.AUTO_SEP,dout._ncols, dout._header,dout._header?dout.data()[0]:null,false),dout._nlines,dout._invalidLines,dout.data(),dout._nlines > dout._invalidLines,null); } transient ArrayList<String> _columnNames = new ArrayList(); boolean _firstRow; @Override public void processRecord(Record record) { int curCol = -1; double curNum = Double.NaN; ValueString curStr = null; switch( record.getSid() ) { case BoundSheetRecord.sid: case BOFRecord.sid: // we just run together multiple sheets break; case SSTRecord.sid: _sstRecord = (SSTRecord) record; break; case BlankRecord.sid: BlankRecord brec = (BlankRecord) record; curCol = brec.getColumn(); curStr = _str.setTo(""); break; case BoolErrRecord.sid: BoolErrRecord berec = (BoolErrRecord) record; curCol = berec.getColumn(); curStr = _str.setTo(""); break; case FormulaRecord.sid: FormulaRecord frec = (FormulaRecord) record; curCol = frec.getColumn(); curNum = frec.getValue(); if( Double.isNaN(curNum) ) { // Formula result is a string // This is stored in the next record _outputNextStringRecord = true; _nextCol = frec.getColumn(); } break; case StringRecord.sid: if( _outputNextStringRecord ) { // String for formula StringRecord srec = (StringRecord) record; curStr = _str.setTo(srec.getString()); curCol = _nextCol; _outputNextStringRecord = false; } break; case LabelRecord.sid: LabelRecord lrec = (LabelRecord) record; curCol = lrec.getColumn(); curStr = _str.setTo(lrec.getValue()); break; case LabelSSTRecord.sid: LabelSSTRecord lsrec = (LabelSSTRecord) record; if( _sstRecord == null ) { Log.warn(Sys.EXCEL,"[ExcelParser] Missing SST record"); } else { curCol = lsrec.getColumn(); curStr = _str.setTo(_sstRecord.getString(lsrec.getSSTIndex()).toString()); } break; case NoteRecord.sid: Log.warn(Sys.EXCEL,"Warning cell notes are unsupported"); break; case NumberRecord.sid: NumberRecord numrec = (NumberRecord) record; curCol = numrec.getColumn(); curNum = numrec.getValue(); break; case RKRecord.sid: Log.warn(Sys.EXCEL,"Warning RK records are unsupported"); break; default: break; } // Handle missing column if( record instanceof MissingCellDummyRecord ) { MissingCellDummyRecord mc = (MissingCellDummyRecord) record; curCol = mc.getColumn(); curNum = Double.NaN; } // Handle end of row if( record instanceof LastCellOfRowDummyRecord ) { if (_firstRow) { _firstRow = false; String[] arr = new String[_columnNames.size()]; arr = _columnNames.toArray(arr); _dout.setColumnNames(arr); } else { _dout.newLine(); curCol = -1; } } if (curCol == -1) return; if (_firstRow) { _columnNames.add(curStr == null ? ("C" + (curCol+1)) : curStr.toString()); } else { if (curStr == null) if (Double.isNaN(curNum)) _dout.addInvalidCol(curCol); else _dout.addNumCol(curCol, curNum); else _dout.addStrCol(curCol, curStr); } } private transient SSTRecord _sstRecord; private int _nextCol; private boolean _outputNextStringRecord; @Override public boolean isCompatible(CustomParser p) { return p instanceof XlsParser; } }