/*! ****************************************************************************** * * Pentaho Data Integration * * Copyright (C) 2002-2015 by Pentaho : http://www.pentaho.com * ******************************************************************************* * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * ******************************************************************************/ /** * Author = Shailesh Ahuja */ package org.pentaho.di.trans.steps.excelinput.staxpoi; import java.io.IOException; import java.io.InputStream; import java.util.ArrayList; import java.util.List; import java.util.TimeZone; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamConstants; import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import org.apache.commons.lang.StringUtils; import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.ss.usermodel.DateUtil; import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.model.SharedStringsTable; import org.apache.poi.xssf.model.StylesTable; import org.apache.poi.xssf.usermodel.XSSFRichTextString; import org.openxmlformats.schemas.spreadsheetml.x2006.main.CTXf; import org.pentaho.di.core.spreadsheet.KCell; import org.pentaho.di.core.spreadsheet.KCellType; import org.pentaho.di.core.spreadsheet.KSheet; /** * Streaming reader for XLSX sheets.<br> * Rows should only be accessed sequentially: random access will severely impact performance.<br> */ public class StaxPoiSheet implements KSheet { // set to UTC for coherence with PoiSheet; private static final TimeZone DATE_TZ = TimeZone.getTimeZone( "UTC" ); private final String sheetName; private final String sheetId; private final XSSFReader xssfReader; private InputStream sheetStream; private XMLStreamReader sheetReader; // hold the pointer to the current row so that access to the next row in the stream is quick and easy private int currentRow; private List<String> headerRow; private int numRows; private int numCols; // 1-based first non-empty row private int firstRow; private KCell[] currentRowCells; // full shared strings table private SharedStringsTable sst; // custom styles private StylesTable styles; public StaxPoiSheet( XSSFReader reader, String sheetName, String sheetID ) throws InvalidFormatException, IOException, XMLStreamException { this.sheetName = sheetName; xssfReader = reader; sheetId = sheetID; sst = reader.getSharedStringsTable(); styles = reader.getStylesTable(); sheetStream = reader.getSheet( sheetID ); XMLInputFactory factory = XMLInputFactory.newInstance(); sheetReader = factory.createXMLStreamReader( sheetStream ); headerRow = new ArrayList<String>(); while ( sheetReader.hasNext() ) { int event = sheetReader.next(); if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "dimension" ) ) { String dim = sheetReader.getAttributeValue( null, "ref" ); // empty sheets have dimension with no range if ( StringUtils.contains( dim, ':' ) ) { dim = dim.split( ":" )[1]; numRows = StaxUtil.extractRowNumber( dim ); numCols = StaxUtil.extractColumnNumber( dim ); } } if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "row" ) ) { currentRow = Integer.parseInt( sheetReader.getAttributeValue( null, "r" ) ); firstRow = currentRow; // calculate the number of columns in the header row while ( sheetReader.hasNext() ) { event = sheetReader.next(); if ( event == XMLStreamConstants.END_ELEMENT && sheetReader.getLocalName().equals( "row" ) ) { // if the row has ended, break the inner while loop break; } if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "c" ) ) { String attributeValue = sheetReader.getAttributeValue( null, "t" ); if ( attributeValue != null && attributeValue.equals( "s" ) ) { // only if the type of the cell is string, we continue while ( sheetReader.hasNext() ) { event = sheetReader.next(); if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "v" ) ) { int idx = Integer.parseInt( sheetReader.getElementText() ); String content = new XSSFRichTextString( sst.getEntryAt( idx ) ).toString(); headerRow.add( content ); break; } } } else { break; } } } // we have parsed the header row break; } } } @Override public KCell[] getRow( int rownr ) { // xlsx raw row numbers are 1-based index, KSheet is 0-based if ( rownr < 0 || rownr >= numRows ) { // KSheet requires out of bounds here throw new ArrayIndexOutOfBoundsException( rownr ); } if ( rownr + 1 < firstRow ) { // before first non-empty row return new KCell[0]; } if ( rownr > 0 && currentRow == rownr + 1 ) { return currentRowCells; } try { if ( currentRow >= rownr + 1 ) { // allow random access per api despite performance hit resetSheetReader(); } while ( sheetReader.hasNext() ) { int event = sheetReader.next(); if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "row" ) ) { String rowIndicator = sheetReader.getAttributeValue( null, "r" ); currentRow = Integer.parseInt( rowIndicator ); if ( currentRow < rownr + 1 ) { continue; } currentRowCells = parseRow(); return currentRowCells; } } } catch ( Exception e ) { throw new RuntimeException( e ); } numRows = currentRow; return new KCell[] {}; } private KCell[] parseRow() throws XMLStreamException { KCell[] cells = new StaxPoiCell[numCols]; for ( int i = 0; i < numCols; i++ ) { // go to the "c" cell tag while ( sheetReader.hasNext() ) { int event = sheetReader.next(); if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "c" ) ) { break; } if ( event == XMLStreamConstants.END_ELEMENT && sheetReader.getLocalName().equals( "row" ) ) { // premature end of row, returning what we have return cells; } } String cellLocation = sheetReader.getAttributeValue( null, "r" ); int columnIndex = StaxUtil.extractColumnNumber( cellLocation ) - 1; String cellType = sheetReader.getAttributeValue( null, "t" ); String cellStyle = sheetReader.getAttributeValue( null, "s" ); boolean isFormula = false; String content = null; // get value tag while ( sheetReader.hasNext() ) { int event = sheetReader.next(); if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "v" ) ) { // read content as string if ( cellType != null && cellType.equals( "s" ) ) { int idx = Integer.parseInt( sheetReader.getElementText() ); content = new XSSFRichTextString( sst.getEntryAt( idx ) ).toString(); } else { content = sheetReader.getElementText(); } } if ( event == XMLStreamConstants.START_ELEMENT && sheetReader.getLocalName().equals( "f" ) ) { isFormula = true; } if ( event == XMLStreamConstants.END_ELEMENT && sheetReader.getLocalName().equals( "c" ) ) { break; } } if ( content != null ) { KCellType kcType = getCellType( cellType, cellStyle, isFormula ); cells[columnIndex] = new StaxPoiCell( parseValue( kcType, content ), kcType, currentRow ); } // else let cell be null } return cells; } @Override public String getName() { return sheetName; } @Override public int getRows() { return numRows; } @Override public KCell getCell( int colnr, int rownr ) { if ( rownr == 0 && colnr < headerRow.size() ) { // only possible to return header return new StaxPoiCell( headerRow.get( colnr ), rownr ); } // if random access this will be very expensive KCell[] row = getRow( rownr ); if ( row != null && rownr < row.length ) { return row[colnr]; } return null; } private KCellType getCellType( String cellType, String cellStyle, boolean isFormula ) { // numeric type can be implicit or 'n' if ( cellType == null || cellType.equals( "n" ) ) { // the only difference between date and numeric is the cell format if ( isDateCell( cellStyle ) ) { return isFormula ? KCellType.DATE_FORMULA : KCellType.DATE; } return isFormula ? KCellType.NUMBER_FORMULA : KCellType.NUMBER; } switch ( cellType ) { case "s": return KCellType.LABEL; case "b": return isFormula ? KCellType.BOOLEAN_FORMULA : KCellType.BOOLEAN; case "e": // error return KCellType.EMPTY; case "str": default: return KCellType.STRING_FORMULA; } } private boolean isDateCell( String cellStyle ) { if ( cellStyle != null ) { int styleIdx = Integer.parseInt( cellStyle ); CTXf cellXf = styles.getCellXfAt( styleIdx ); if ( cellXf != null ) { // need id for builtin types, format if custom int formatId = (int) cellXf.getNumFmtId(); String format = styles.getNumberFormatAt( formatId ); return DateUtil.isADateFormat( formatId, format ); } } return false; } private Object parseValue( KCellType type, String vContent ) { if ( vContent == null ) { return null; } try { switch ( type ) { case NUMBER: case NUMBER_FORMULA: return Double.parseDouble( vContent ); case BOOLEAN: case BOOLEAN_FORMULA: return vContent.equals( "1" ); case DATE: case DATE_FORMULA: Double xlDate = Double.parseDouble( vContent ); return DateUtil.getJavaDate( xlDate, DATE_TZ ); case LABEL: case STRING_FORMULA: case EMPTY: default: return vContent; } } catch ( Exception e ) { return vContent; } } private void resetSheetReader() throws IOException, XMLStreamException, InvalidFormatException { sheetReader.close(); sheetStream.close(); sheetStream = xssfReader.getSheet( sheetId ); XMLInputFactory factory = XMLInputFactory.newInstance(); sheetReader = factory.createXMLStreamReader( sheetStream ); } public void close() throws IOException, XMLStreamException { sheetReader.close(); sheetStream.close(); } }