/* * (C) Copyright IBM Corp. 2010 * * LICENSE: Eclipse Public License v1.0 * http://www.eclipse.org/legal/epl-v10.html */ package com.ibm.db2j; import java.io.FileInputStream; import java.io.InputStream; import java.sql.ResultSet; import java.sql.ResultSetMetaData; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; import org.apache.derby.iapi.error.StandardException; import org.apache.derby.iapi.store.access.Qualifier; import org.apache.derby.iapi.types.DataValueDescriptor; import org.apache.derby.vti.IFastPath; import org.apache.derby.vti.VTICosting; import org.apache.derby.vti.VTIEnvironment; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.DateUtil; import org.apache.poi.ss.usermodel.FormulaEvaluator; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.apache.poi.ss.usermodel.WorkbookFactory; import org.apache.poi.ss.util.CellReference; import com.ibm.gaiandb.GaianChildVTI; import com.ibm.gaiandb.GaianResultSetMetaData; import com.ibm.gaiandb.Logger; import com.ibm.gaiandb.RowsFilter; import com.ibm.gaiandb.Util; import com.ibm.gaiandb.diags.GDBMessages; /** * Derby VTI for excel spreadsheets. * * Are not yet implemented : * * - physical/logical column mapping * - scrollability * * @author lengelle */ public class GExcel extends VTI60 implements VTICosting, IFastPath, GaianChildVTI { // Use PROPRIETARY notice if class contains a main() method, otherwise use // COPYRIGHT notice. public static final String COPYRIGHT_NOTICE = "(c) Copyright IBM Corp. 2008"; private static final Logger logger = new Logger( "GExcel", 30 ); private static final int CELL_WITH_NO_TYPE = -232; // Why did I choose -232 ? Honestly, I don't know.. private static final int DATE_TYPE = 233; // Excel Poi does not have a specific type for dates private static final char ARG_SEPARATOR = ','; private static String DEFAULT_COLUMN_LABEL = "COLUMN"; private Workbook workbook; private FormulaEvaluator evaluator; private Sheet sheet; private InputStream inputStream; private int numberOfColumns; private List<Integer> columnIndexes; private int[] columnTypes; private List<String> columnNames; private Qualifier[][] qualifiers; private Row currentRow; private ResultSetMetaData rsmd; // private CellReference firstCell; // private CellReference lastCell; private int firstColumnIndex; private int lastColumnIndex; private int firstRowIndex; private int lastRowIndex; private boolean stopScanOnFirstEmptyRow = false; private boolean firstRowIsMetaData; /** * Receives in parameter the argument from gaiandb configuration file. * * @param args * @throws SQLException */ public GExcel( String args ) throws SQLException { super(); //System.out.println( "*** constructeur 1 parameter : "+args ); String[] splitArgs = Util.splitByTrimmedDelimiter( args, ARG_SEPARATOR ); try { if ( splitArgs.length == 2 ) { initialize( splitArgs[0], splitArgs[1], null, null, true ); } else if ( splitArgs.length == 3 ) { initialize( splitArgs[0], splitArgs[1], null, null, Boolean.parseBoolean( splitArgs[2] ) ); } else if ( splitArgs.length == 4 ) { initialize( splitArgs[0], splitArgs[1], splitArgs[2], splitArgs[3], true ); } else if ( splitArgs.length == 5 ) { initialize( splitArgs[0], splitArgs[1], splitArgs[2], splitArgs[3], Boolean.parseBoolean( splitArgs[4] ) ); } else { throw new SQLException( "Unsupported number of parameters for GExcel() invocation: " + splitArgs.length); } } catch ( Exception e ) { logger.logInfo("Unable to initialise GExcel: " + e); throw new SQLException( e ); } } // ------------------ VTICosting methods ----------------------------------------------- public double getEstimatedCostPerInstantiation( VTIEnvironment arg0 ) throws SQLException { //System.out.println( "*** getEstimatedCostPerInstantiation" ); return 0; } public double getEstimatedRowCount( VTIEnvironment arg0 ) throws SQLException { //System.out.println( "*** getEstimatedRowCount: " + (lastRowIndex - firstRowIndex) ); return lastRowIndex - firstRowIndex; } public boolean supportsMultipleInstantiations( VTIEnvironment arg0 ) throws SQLException { //System.out.println( "*** supportsMultipleInstantiations" ); return false; } // ------------------ IFastPath methods ----------------------------------------------- public void currentRow( ResultSet arg0, DataValueDescriptor[] arg1 ) throws StandardException, SQLException { //System.out.println( "*** currentRow" ); } // Called if GExcel is invoked directly in SQL public boolean executeAsFastPath() throws StandardException, SQLException { //System.out.println( "*** executeAsFastPath" ); reinitialise(); // checkSheetTypeConsistency( columnIndexes ); return true; } public int nextRow( DataValueDescriptor[] arg0 ) throws StandardException, SQLException { //System.out.println( "*** nextRow" ); int result = createNextRow( sheet, arg0); if ( result==GOT_ROW && qualifiers!=null ) { boolean areQualifiersMet = RowsFilter.testQualifiers( arg0, qualifiers ); while ( result==GOT_ROW && !areQualifiersMet ) { result = createNextRow( sheet, arg0); areQualifiersMet = RowsFilter.testQualifiers( arg0, qualifiers ); } } return result; } public void rowsDone() throws StandardException, SQLException { //System.out.println( "*** rowsDone" ); } // ------------------ GaianChildVTI methods ----------------------------------------------- public boolean fetchNextRow( DataValueDescriptor[] row ) throws Exception { return IFastPath.GOT_ROW == nextRow(row); } public int getRowCount() throws Exception { //System.out.println( "*** getRowCount" ); return 0; } public boolean isScrollable() { //System.out.println( "*** isScrollable" ); return false; } public void setArgs( String[] args ) throws Exception { //System.out.println( "*** setArgs : "+args[0] ); } public void setExtractConditions( Qualifier[][] qualifiers, int[] projectedColumns, int[] physicalColumnsMapping ) throws Exception { //System.out.println( "*** setExtractConditions" ); this.qualifiers = qualifiers; // System.out.println("Cols involved: " + Util.intArrayAsString(projectedColumns)); } // ------------------ From extend methods ----------------------------------------------- public ResultSetMetaData getMetaData() throws SQLException { //System.out.println( "*** getMetaData" ); reinitialise(); rsmd = createStringTypeMetaData(); logger.logInfo("GExcel columnIndexes: " + columnIndexes ); return rsmd; } // Use the following method from the GaianDB GExcel API to do extra column type inferring public ResultSetMetaData getMetaDataByInferringTypes() throws SQLException { //System.out.println( "*** getMetaData" ); reinitialise(); rsmd = checkSheetTypeConsistency( columnIndexes ) ? createMetaData() : createStringTypeMetaData(); return rsmd; } @Override public boolean reinitialise() { findColumns( sheet ); columnTypes = new int[ columnIndexes.size() ]; for ( int i=0; i<columnTypes.length; ++i ) columnTypes[i] = CELL_WITH_NO_TYPE; currentRow = null; // also need to take into account + reinitialise based on whether we have new values for: // 1) sheet name, 2) first cell index, 3) last cell index and 4) flag for interpretFirstLineAsMetaData return false; // isPoolable? i.e. ready for re-use? not until comment above is addressed... test poolability with: Test_setDsExcel } public void close() throws SQLException { try { logger.logInfo( "*** Closing GExcel Spreadsheet data source wrapper" ); inputStream.close(); inputStream=null; if ( columnIndexes!=null ) { columnIndexes.clear(); columnIndexes = null; } if ( columnNames!=null ) { columnNames.clear(); columnNames = null; } qualifiers = null; columnTypes = null; currentRow = null; rsmd = null; evaluator = null; sheet = null; workbook = null; } catch( Exception e ) { logger.logWarning( GDBMessages.DSWRAPPER_GEXCEL_CLOSE_ERROR, "*** Failed to close GExcel Spreadsheet data source wrapper: " + e ); } } public boolean isBeforeFirst() { return null == currentRow; } public ResultSet executeQuery(java.lang.String sql) { //System.out.println( "*** executeQuery(sql)" ); return null; } public ResultSet executeQuery() { //System.out.println( "*** executeQuery()" ); return null; } public ResultSet getResultSet() { //System.out.println( "*** getResultSet" ); return null; } // ------------------ Spreadsheet methods ----------------------------------------------- /** * Initialize the attributes : * * - inputStream * - workbook * - evaluator * - sheet * - firstRowIsMetaData * * - firstColumnIndex * - firstRowIndex * - lastColumnIndex * - lastRowIndex * * @param fileName * @param spreadsheetName * @param firstCellRange * @param lastCellRange * @param interpretFirstLineAsMetaData * @throws SQLException */ public void initialize( String fileName, String spreadsheetName, String firstCellRange, String lastCellRange, boolean interpretFirstLineAsMetaData ) throws SQLException { try { inputStream = new FileInputStream( fileName ); workbook = WorkbookFactory.create( inputStream ); evaluator = workbook.getCreationHelper().createFormulaEvaluator(); sheet = findSpreadsheet( workbook, spreadsheetName ); firstRowIsMetaData = interpretFirstLineAsMetaData; if ( firstCellRange!=null && lastCellRange!=null ) { CellReference firstCell = new CellReference( firstCellRange ); // Deduce last row number if it was not specified if ( lastCellRange.matches("[a-zA-Z]+") ) { lastCellRange += (sheet.getLastRowNum() + 1); //Note: getLastRowNum is 0-based stopScanOnFirstEmptyRow = true; logger.logInfo("Deduced last row in Excel table: " + lastCellRange + " - but scans will end on first empty row"); } CellReference lastCell = new CellReference( lastCellRange ); firstColumnIndex = firstCell.getCol(); firstRowIndex = firstCell.getRow(); // + (firstRowIsMetaData?1:0); lastColumnIndex = lastCell.getCol(); lastRowIndex = lastCell.getRow(); } else { Row firstRow = locateFirstRow( sheet ); if ( firstRow==null ) { throw new SQLException( "Empty spreadsheet !" ); } firstRowIndex = firstRow.getRowNum(); // + (firstRowIsMetaData?1:0); lastRowIndex = sheet.getLastRowNum(); firstColumnIndex = firstRow.getFirstCellNum(); //Note: getFirstCellNum is 0-based lastColumnIndex = firstRow.getLastCellNum() - 1; //Note: getLastCellNum is 1-based } //System.out.println("sheet: " + sheet.getSheetName() + ", firstcolindex: " + firstColumnIndex + ", lastcolindex: " + lastColumnIndex + ", firstrowindex: " + firstRowIndex + ", lastrowindex: " + lastRowIndex); } catch( Exception e ) { throw new SQLException( e.getMessage() ); } } /** * Put the next row in the dvd row given in parameter. * Return SCAN_COMPLETED if there is no more row in the spreadsheet, or GOT_ROW if a row was successfully put in the dvd row. * * Uses the attribute currentRow to save the previous row fetched. * * @param sheet * @param dvdr * @param numberOfLogicalColumnsInvolved * @param columnIndexes * @return SCAN_COMPLETED or GOT_ROW * @throws SQLException */ private int createNextRow( Sheet sheet, DataValueDescriptor[] dvdr) { boolean gotData = false; /* * Find the next row to return. * * currentRow should currently point to the last row returned. * If that's null, then start from first row. * Else, search for the next non-empty row (until we hit the end of the prescribed range). */ if ( currentRow == null ) currentRow = sheet.getRow( firstRowIndex + ( firstRowIsMetaData ? 1 : 0 ) ); else { int nextRowIndex = currentRow.getRowNum()+1; currentRow = null; if(stopScanOnFirstEmptyRow) { currentRow = sheet.getRow( nextRowIndex ); } else { while ( currentRow==null && nextRowIndex<=lastRowIndex ) { currentRow = sheet.getRow( nextRowIndex ); nextRowIndex++; } } } /* * If we've run out of spreadsheet (currentRow == null) or gone out of the prescribed range, * then scan complete - return that. */ if ( currentRow==null || currentRow.getRowNum() > lastRowIndex ) { return SCAN_COMPLETED; } /* * Get the offset of the first column in the spreadsheet. * Note: this is used when iterating below, so that we can correctly relate * the actual column in the spreadsheet to the correct 'column' in the * DataValueDescriptor [] representing the row. */ int columnOffset = firstColumnIndex; //Figure out how many columns there are int numberOfColumns = lastColumnIndex - firstColumnIndex + 1; for(int i = 0; i < numberOfColumns; i++) { /* * Note: i is used to refer to the index of the DataValueDescriptor which represents * the actual spreadsheet column (at i + columnOffset) in the DataValueDescriptor[] * representing this row. */ Cell cell = currentRow.getCell(i + columnOffset); if(cell == null) { dvdr[i].setToNull(); } else { try { int cellValueType = cell.getCellType(); if (cellValueType == Cell.CELL_TYPE_FORMULA) cellValueType = cell.getCachedFormulaResultType(); switch( cellValueType ) { case Cell.CELL_TYPE_STRING: dvdr[i].setValue( cell.getStringCellValue() ); break; case Cell.CELL_TYPE_NUMERIC: if ( DateUtil.isCellDateFormatted( cell ) ) dvdr[i].setValue( new java.sql.Date( cell.getDateCellValue().getTime() ) ); else { cell.setCellType(Cell.CELL_TYPE_STRING); dvdr[i].setValue( cell.getStringCellValue() ); } break; case Cell.CELL_TYPE_BOOLEAN: dvdr[i].setValue( cell.getBooleanCellValue() ); break; default: dvdr[i].setToNull(); break; } //If a cell has data that is not null - then flag that we actually have data to return if ( !dvdr[i].isNull() ) gotData = true; } catch ( Exception e ) { dvdr[i].setToNull(); logger.logWarning( GDBMessages.DSWRAPPER_GEXCEL_MAP_LT_ERROR, "Excel cell [spreadsheet "+sheet.getSheetName()+"; row "+cell.getRow().getRowNum()+"; column "+cell.getColumnIndex()+"; value "+cell+"] could not be mapped into the logical table because of the column logical type: " + e); } } } if ( !gotData && stopScanOnFirstEmptyRow ) { logger.logInfo("Ending GExcel table scan on first empty row (as no row limit was specified in the ending cell config constraint)"); return SCAN_COMPLETED; } return GOT_ROW; } /** * Creates and returns the ResultSetMetaData object using the fields : * - columnNames * - columnTypes * * The method maps excel types into SQL types. * * @return the ResultSetMetaData object * @throws SQLException */ private ResultSetMetaData createMetaData() throws SQLException { try { StringBuffer metaData = new StringBuffer(); assert( columnNames.size()==columnTypes.length ); for ( int i=0; i<columnTypes.length; ++i ) { if(i>0) metaData.append( ", "); metaData.append( columnNames.get( i ) ); metaData.append( ' ' ); metaData.append( getStringSqlTypeFromSpreadsheetType( columnTypes[i] ) ); } return new GaianResultSetMetaData( metaData.toString() ); } catch( Exception e ) { throw new SQLException( "Problem occurs while creating the Meta-data: " + e, e ); } } /** * Creates and returns the ResultSetMetaData object using the fields : * - columnNames * * All the SQL types are defined as VARCHAR with this method. * * @return the ResultSetMetaData object * @throws SQLException */ private ResultSetMetaData createStringTypeMetaData() throws SQLException { try { StringBuffer metaData = new StringBuffer(); metaData.append( columnNames.get( 0 ) ); metaData.append( ' ' ); metaData.append( getStringSqlTypeFromSpreadsheetType( Cell.CELL_TYPE_STRING ) ); for ( int i=1; i<columnNames.size(); ++i ) { metaData.append( ", " ); metaData.append( columnNames.get( i ) ); metaData.append( ' ' ); metaData.append( getStringSqlTypeFromSpreadsheetType( Cell.CELL_TYPE_STRING ) ); } return new GaianResultSetMetaData( metaData.toString() ); } catch( Exception e ) { throw new SQLException( "Problem occured while creating the Meta-data: " + e ); } } /** * Returns a string containing the SQL type definition of the excel type given in parameter. * * @param excelType * @return the SQL type definition * @throws SQLException */ private String getStringSqlTypeFromSpreadsheetType( int excelType ) throws SQLException { switch( excelType ) { case Cell.CELL_TYPE_STRING: return "VARCHAR(50)"; case Cell.CELL_TYPE_NUMERIC: return "INT"; case Cell.CELL_TYPE_BOOLEAN: return "BOOLEAN"; case Cell.CELL_TYPE_FORMULA: return "VARCHAR(50)"; case DATE_TYPE: return "DATE"; case CELL_WITH_NO_TYPE: return "VARCHAR(50)"; default: throw new SQLException( "Unknow type detected !" ); } } /** * Finds the spreadsheet defined by the name given in parameter and return it if found. * Else return null. * * @param workbook * @param spreadsheetName * @return the spreadsheet * @throws SQLException */ private Sheet findSpreadsheet( Workbook workbook, String spreadsheetName ) throws SQLException { boolean sheetFound = false; Sheet sheetTmp = null; for ( int i=0; i<workbook.getNumberOfSheets() && !sheetFound; ++i ) { sheetTmp = workbook.getSheetAt( i ); if ( sheetTmp.getSheetName().equals( spreadsheetName ) ) sheetFound = true; } if ( sheetFound==false && sheet==null ) { throw new SQLException( "The file does not contain a spreadsheet named : "+spreadsheetName ); } return sheetTmp; } /** * Locates and return the first row of the the spreadsheet. * * @param sheet * @return the first row */ private Row locateFirstRow( Sheet sheet ) { for ( Row row : sheet ) { for ( Cell cell : row ) { switch( cell.getCellType() ) { case Cell.CELL_TYPE_STRING: case Cell.CELL_TYPE_NUMERIC: case Cell.CELL_TYPE_BOOLEAN: case Cell.CELL_TYPE_FORMULA: return row; default: break; } } } return null; } /** * looks for the column definition and initializes the following attributes : * * - numberOfColumns * - columnIndexes * - columnNames * * If a column which contains no values is ignored. * * If firstRowIsMetaData is true, the column names will be extract from the first row of the spreadsheet. * Else, they will be automatically generated : COLUMN1, COLUMN2... * * @param sheet */ private void findColumns( Sheet sheet ) { numberOfColumns = 0; columnIndexes = new ArrayList<Integer>(); columnNames = new ArrayList<String>(); Row firstRow = sheet.getRow( firstRowIndex ); int columnLabelIndex = 1; if ( firstRowIsMetaData ) { //For each column for ( int i=firstColumnIndex; i<=lastColumnIndex; ++i ) { //Get the first cell in the column Cell cell = firstRow.getCell( i, Row.CREATE_NULL_AS_BLANK ); columnIndexes.add( cell.getColumnIndex() ); int cellType = cell.getCellType(); if ( Cell.CELL_TYPE_FORMULA == cellType ) { cellType = cell.getCachedFormulaResultType(); // System.out.println("cell type is now getCachedFormulaResultType() = " + cellType ); } //Build the column names depending on it's type switch( cell.getCellType() ) { case Cell.CELL_TYPE_STRING: // case Cell.CELL_TYPE_FORMULA: // DO NOT USE: getCellFormula() !!! // System.out.println("cell type string" ); // Note: Javadoc on method getStringCellValue() states: // "get the value of the cell as a string - for numeric cells we throw an exception. For blank cells we return an empty string. // For formulaCells that are not string Formulas, we throw an exception" ++numberOfColumns; columnNames.add( cell.getStringCellValue().replaceAll("[\\ ]", "_") ); // Note we should not have to do this in future... once defect is fixed break; case Cell.CELL_TYPE_NUMERIC: // System.out.println("cell type numeric " + // ( DateUtil.isCellDateFormatted( cell ) ? "date: " + cell.getDateCellValue().toString() : "num: " + cell.getNumericCellValue() ) ); ++numberOfColumns; columnNames.add( DateUtil.isCellDateFormatted( cell ) ? cell.getDateCellValue().toString() : "" + cell.getNumericCellValue() ); break; case Cell.CELL_TYPE_BOOLEAN: // System.out.println("cell type boolean" ); ++numberOfColumns; columnNames.add( "" + cell.getBooleanCellValue() ); break; default: // System.out.println("cell type default" ); ++numberOfColumns; columnNames.add( DEFAULT_COLUMN_LABEL+""+columnLabelIndex ); break; } columnLabelIndex++; } } else { //For each column for ( int i=firstColumnIndex; i<=lastColumnIndex; ++i ) { //Get the first cell in the column Cell cell = firstRow.getCell( i, Row.CREATE_NULL_AS_BLANK ); columnIndexes.add( cell.getColumnIndex() ); columnNames.add( DEFAULT_COLUMN_LABEL+""+columnLabelIndex++ ); } } } /** * This method checks if the spreadsheet is well typed. * This means that all values in each column have the same excel type. * * Returns true if the spreadsheet is well typed, else returns false. * * In addition, this method tries to deduce the excel types from each columns and initializes the attribute : columnTypes. * If a column is empty, then its type is CELL_WITH_NO_TYPE. * The attribute columnTypes must only be used if the spreadsheet is well typed. In the other cases, columnTypes is not significant. * * @param columnIndexes * @return returns true if the spreadsheet is well typed, else returns false */ private boolean checkSheetTypeConsistency( List<Integer> columnIndexes ) { boolean isConsistent = true; int firstRow = firstRowIndex; if ( firstRowIsMetaData ) { ++firstRow; } Row currentRow; Cell cell; int index; for ( int i=firstRow; i<=lastRowIndex; ++i ) { currentRow = sheet.getRow( i ); if ( currentRow!=null ) { index = 0; for ( int j=firstColumnIndex; j<=lastColumnIndex; ++j ) { cell = currentRow.getCell( j, Row.CREATE_NULL_AS_BLANK ); if ( cell != null ) { // logger.logInfo("Checking non-null cell: " + cell); int cellType = -1; try { cellType = evaluator.evaluateInCell(cell).getCellType(); } catch( Exception e ) { logger.logWarning(GDBMessages.DSWRAPPER_GEXCEL_CELL_TYPE_EVALUATION_FAILURE, "Unable to evaluate type for cell at row " + i + " col " + j + ": " + cell); isConsistent = false; continue; } switch( cellType ) { case Cell.CELL_TYPE_STRING: isConsistent = checkSheetConsistencySubMethod( cell, index, isConsistent ); ++index; break; case Cell.CELL_TYPE_NUMERIC: isConsistent = checkSheetConsistencySubMethod( cell, index, isConsistent ); ++index; break; case Cell.CELL_TYPE_BOOLEAN: isConsistent = checkSheetConsistencySubMethod( cell, index, isConsistent ); ++index; break; default: if ( index<columnIndexes.size() && columnIndexes.get( index )==cell.getColumnIndex() ) { // The cell is null for this column ++index; } break; } } } } } return isConsistent; } private boolean checkSheetConsistencySubMethod( Cell cell, int index, boolean isConsistent ) { if ( index<columnTypes.length ) { boolean isADate = ( cell.getCellType()==Cell.CELL_TYPE_NUMERIC && DateUtil.isCellDateFormatted( cell ) ); if ( columnTypes[index]==CELL_WITH_NO_TYPE ) { if ( isADate ) { // Specific case where the cell is a date columnTypes[index]=DATE_TYPE; } else { columnTypes[index]=cell.getCellType(); } } else if ( isADate && columnTypes[index]!=DATE_TYPE ) { // Specific case where the cell is a date return false; } else if ( !isADate && ( columnTypes[index]!=cell.getCellType() ) ) { return false; } } return isConsistent; } }