/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio.model.xlsx;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.xml.sax.Attributes;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.nio.model.ExcelResultSetConfiguration;
import com.rapidminer.operator.nio.model.xlsx.XlsxResultSet.XlsxReadMode;
import com.rapidminer.operator.nio.model.xlsx.XlsxUtilities.XlsxCellCoordinates;
/**
* StAX parser for XLSX worksheet meta data.
*
* FIXME adapt It opens the selected worksheet XML file and looks for the 'dimension' element to
* extract the worksheet cell ranges. In case the 'dimension' element cannot be found or has invalid
* content it has to perform a full scan of the worksheet XML to extract the cell range which can -
* depending on the size of the provided Excel file - can take some time but is necessary for
* pyramid like structured Excel files.
*
* @author Nils Woehler
* @since 6.3.0
*/
public class XlsxSheetMetaDataParser {
/**
* This element specifies the used range of the worksheet. It specifies the row and column
* bounds of used cells in the worksheet. This is optional and is not required. Used cells
* include cells with formulas, text content, and cell formatting. When an entire column is
* formatted, only the first cell in that column is considered used.
*
* @see ECMA-376, 4th Edition, 18.3.1.35 Worksheet dimensions (p. 1617). This tag is optional.
*/
public static final String TAG_DIMENSION = "dimension";
/**
* The row and column bounds of all cells in this worksheet. Corresponds to the range that would
* contain all c elements written under sheetData. Does not support whole column or whole row
* reference notation.
*
* @see ECMA-376, 4th Edition, 18.3.1.35 Worksheet dimensions (p. 1617)
*/
public static final String ATT_DIMENSION_REF = "ref";
/**
* Optimization only, and not required. Specifies the range of non-empty columns (in the format
* X:Y) for the block of rows to which the current row belongs. To achieve the optimization,
* span attribute values in a single block should be the same.
*
* @see ECMA-376, 4th Edition, 18.3.1.73 Row (p. 1670)
*/
public static final String ATT_SPANS = "spans";
/**
* Currently, as of 16.01.2015, XLSX supports up to 104.857.76 rows per sheet.
*/
public static final int MAXIMUM_XLSX_ROW_INDEX = 104_857_75;
/**
* Currently, as of 16.01.2015, XLSX supports up to 16.384 columns per sheet.
*/
public static final int MAXIMUM_XLSX_COLUMN_INDEX = 16_383;
private final File xlsxFile;
private final XMLInputFactory xmlInputFactory;
private final String workbookZipEntryPath;
public XlsxSheetMetaDataParser(File xlsxFile, String workbookZipEntryPath, XMLInputFactory xmlInputFactory) {
this.xlsxFile = xlsxFile;
this.workbookZipEntryPath = workbookZipEntryPath;
this.xmlInputFactory = xmlInputFactory;
}
/**
* Parses the XLSX worksheet file to retrieve sheet meta data.
*
* @param configuration
* the result set configuration
* @param readMode
*
* @return the parsed sheet meta data
*
* @throws XMLStreamException
* if there is an error processing the underlying XML source
* @throws IOException
* if an I/O error has occurred
* @throws UserError
* in case something is configured wrong (e.g. wrong cell parsing range)
*/
public XlsxSheetMetaData parseMetaData(Operator callingOperator, ExcelResultSetConfiguration configuration,
XlsxReadMode readMode) throws XMLStreamException, IOException, UserError {
// use 0 as first row index in case the whole column is specified
// offset via -1 for row number
int firstRowIndex = Math.max(configuration.getRowOffset(), 0);
int firstColumnIndex = configuration.getColumnOffset();
int userSpecifiedLastRow = configuration.getRowLast();
int userSpecifiedLastColumn = configuration.getColumnLast();
// Check configured range
if (firstColumnIndex > userSpecifiedLastColumn || firstRowIndex > userSpecifiedLastRow || firstColumnIndex < 0
|| firstRowIndex < 0) {
throw new UserError(callingOperator, 223, convertOffsetToHumanReadableFormat(firstColumnIndex, firstRowIndex,
userSpecifiedLastRow, userSpecifiedLastColumn));
}
if (readMode != XlsxReadMode.WIZARD_WORKPANE) {
// If the user has specified an end range just return the specified range
if (userSpecifiedLastColumn != Integer.MAX_VALUE) {
return new XlsxSheetMetaData(firstColumnIndex, firstRowIndex, userSpecifiedLastColumn,
Math.min(userSpecifiedLastRow, MAXIMUM_XLSX_ROW_INDEX));
}
} else {
firstRowIndex = 0;
firstColumnIndex = 0;
}
// Otherwise parse work sheet to obtain meta data
boolean isRowWithoutSpan = false;
int maximumColumn = -1;
XMLStreamReader reader = null;
try (ZipFile xlsxZipFile = new ZipFile(xlsxFile)) {
ZipEntry workbookZipEntry = xlsxZipFile.getEntry(workbookZipEntryPath);
if (workbookZipEntry == null) {
throw new FileNotFoundException("Selected workbook is missing in XLSX file. Path: " + workbookZipEntryPath);
}
reader = xmlInputFactory.createXMLStreamReader(xlsxZipFile.getInputStream(workbookZipEntry));
while (reader.hasNext()) {
int eventCode = reader.next();
if (eventCode == XMLStreamReader.START_ELEMENT) {
Attributes attributes = XlsxUtilities.getAttributes(reader);
switch (reader.getLocalName()) {
case TAG_DIMENSION:
String dimension = attributes.getValue(ATT_DIMENSION_REF);
/*
* Continue with next tags and parse all rows if dimension does not have
* a "ref" attribute
*/
if (dimension == null || dimension.isEmpty()) {
continue;
}
int dimColonIndex = dimension.indexOf(':');
/*
* If dimension does not contain a colon (e.g. when generated by
* RapidMiner Studio) it is invalid and should not be evaluated,
* otherwise try to parse the maximum cell range.
*/
if (dimColonIndex != -1) {
String maxCellRange = dimension.substring(dimColonIndex + 1);
try {
XlsxCellCoordinates cellRange = XlsxUtilities.convertCellRefToCoordinates(maxCellRange);
return new XlsxSheetMetaData(firstColumnIndex, firstRowIndex, cellRange.columnNumber,
/*
* Always return the maximum Integer value for the last
* row index to be able to read the whole excel file in
* case the internal format is broken (e.g. files
* created by Libre Office)
*/
Integer.MAX_VALUE);
} catch (IllegalArgumentException e) {
// ignore malformed cell reference and continue parsing whole
// file
}
}
/*
* If we haven't found the dimension yet after parsing the dimension
* element we have to go over all rows and count the rows and columns
* one by one.
*/
break;
case XlsxUtilities.TAG_ROW:
// Check if row contains "spans" attribute
String spans = attributes.getValue(ATT_SPANS);
if (spans != null) {
/*
* In case it is present extract the maximum column range of this
* row if a colon character can be found
*/
int rowColonIndex = spans.indexOf(':');
if (rowColonIndex != -1) {
/*
* We need to subtract 1 as Excel stores columns with a 1 based
* index.
*/
int maxColumn = Integer.parseInt(spans.substring(rowColonIndex + 1, spans.length())) - 1;
if (maxColumn > maximumColumn) {
maximumColumn = maxColumn;
}
}
} else {
isRowWithoutSpan = true;
}
break;
case XlsxUtilities.TAG_CELL:
// only parse cells in case the row does not contain a "spans" attribute
if (isRowWithoutSpan) {
XlsxCellCoordinates columnAndRowIndices = XlsxUtilities
.convertCellRefToCoordinates(attributes.getValue(XlsxUtilities.TAG_CELL_REFERENCE));
if (columnAndRowIndices.columnNumber > maximumColumn) {
maximumColumn = columnAndRowIndices.columnNumber;
}
}
break;
default:
// ignore other local names
break;
}
} else if (eventCode == XMLStreamReader.END_ELEMENT) {
if (reader.getLocalName().equals(XlsxUtilities.TAG_ROW)) {
// Closing row element -> Row completed
isRowWithoutSpan = false;
}
}
}
} finally {
if (reader != null) {
reader.close();
}
}
/*
* Always return the maximum Integer value for the last row index to be able to read the
* whole excel file in case the internal format is broken (e.g. files created by Libre
* Office)
*/
return new XlsxSheetMetaData(firstColumnIndex, firstRowIndex, maximumColumn, Integer.MAX_VALUE);
}
/**
* Converts the provided row and column offset and last column and row indices to a human
* readable Excel format so we can display a proper error message.
*/
private String convertOffsetToHumanReadableFormat(int columnOffset, int confRowOffset, int columnLast, int rowLast) {
StringBuilder rangeBuilder = new StringBuilder();
if (columnOffset >= 0) {
rangeBuilder.append(XlsxUtilities.convertToColumnName(columnOffset));
}
if (confRowOffset >= 0) {
rangeBuilder.append(confRowOffset + 1);
}
boolean colonAdded = false;
if (rowLast == -1 || columnLast == -1) {
rangeBuilder.append(":");
colonAdded = true;
}
if (columnLast != Integer.MAX_VALUE && columnLast != -1) {
if (!colonAdded) {
rangeBuilder.append(":");
colonAdded = true;
}
rangeBuilder.append(XlsxUtilities.convertToColumnName(columnLast));
}
if (rowLast != Integer.MAX_VALUE && rowLast != -1) {
if (!colonAdded) {
rangeBuilder.append(":");
colonAdded = true;
}
rangeBuilder.append(rowLast + 1);
}
return rangeBuilder.toString();
}
}