/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio.model.xlsx;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.apache.tika.io.CountingInputStream;
import org.xml.sax.Attributes;
import com.rapidminer.operator.nio.model.ParseException;
import com.rapidminer.operator.nio.model.ParsingError;
import com.rapidminer.operator.nio.model.ParsingError.ErrorCode;
import com.rapidminer.operator.nio.model.xlsx.XlsxResultSet.XlsxReadMode;
import com.rapidminer.operator.nio.model.xlsx.XlsxUtilities.XlsxCell;
/**
* A StAX parser for XLSX worksheets based on specifications from ECMA-376, 4th Edition.
*
* @author Nils Woehler
* @since 6.3.0
*/
class XlsxSheetContentParser implements AutoCloseable {
/**
* An enumeration representing the cell's data type.
*
* @see ECMA-376, 4th Edition, 18.3.1.4 Cell (p. 1589)
*/
private static final String ATT_CELL_TYPE = "t";
/**
* The index of this cell's style. Style records are stored in the Styles Part.
*
* @see ECMA-376, 4th Edition, 18.3.1.4 Cell (p. 1589)
*/
private static final String ATT_CELL_STYLE = "s";
/**
* This element expresses the value contained in a cell. If the cell contains a string, then
* this value is an index into the shared string table, pointing to the actual string value.
* Otherwise, the value of the cell is expressed directly in this element.
*
* Cells containing formulas express the last calculated result of the formula in this element.
* For applications not wanting to implement the shared string table, an 'inline string' can be
* expressed in an <is> element under <c> (instead of a <v> element under <c>),in the same way a
* string would be expressed in the shared string table.
*
* [Example: In this example, cell B4 contains the number "360", cell C4 contains the local date
* and time 22 November 1976, 08:30, and cell C5 contains the 1900 date system serial date-time
* for the date-time in cell C4.
*
* <pre>
* {@code
* <c r="B4">
* <v>360</v>
* </c>
* <c r="C4" t="d">
* <v>1976-11-22T08:30</v>
* </c>
* <c r="C5">
* <f>C4</f>
* <v>28086.3541666667</v>
* </c>
* }
* </pre>
*
* end example]
*
* @see ECMA-376, 4th Edition, 18.3.1.96 Cell Value (pp. 1699 ff.)
*/
private static final String TAG_VALUE = "v";
/**
* This element allows for strings to be expressed directly in the cell definition instead of
* implementing the shared string table.<br/>
* [Example:
*
* <pre>
* {@code
* <c r="A1">
* <is>
* <t>String</t>
* </is>
* </c>
* }
* </pre>
*
* end example]
*
* @see ECMA-376, 4th Edition, 18.3.1.53 Rich Text Inline (pp. 1648 ff.)
*/
private static final String TAG_INLINE_STRING = "is";
/** The XLSX file itself */
private final File xlsxFile;
/** The path to the worksheet within the XLSX zip file */
private final String workbookZipEntryPath;
/** Strings shared in multiple sheets */
private final String[] sharedStrings;
/** Number formats defined for XLSX cells */
private final XlsxNumberFormats numberFormats;
/** The ZipFile object used to open the {@link #reader} */
private ZipFile xlsxZipFile;
/** Reads XML stream */
private XMLStreamReader reader;
/** CountingInputStream that is used to determine the operator progress */
private CountingInputStream cis;
/** The current row content. */
private XlsxCell[] currentRowContent;
/**
* Keeps track of the current {@code 0-based} row index that was extracted from the XLSX sheet.
* Is {@code -1} if no row was parsed yet.
*/
private int currentRowIndex;
/** Cache for the next row that contains element */
private XlsxCell[] nextRowWithContent;
/**
* The index of the row that has already been parsed ({@code 0-based}). Is {@code -1} if no row
* has been parsed yet. <br/>
*/
private int parsedRowIndex;
/** A flag which represents if more rows with content are available */
private boolean hasMoreContent;
/**
* The parsed sheet meta data.
*/
private XlsxSheetMetaData sheetMetaData;
/**
* An array which stores information about the content of a column.
*/
private final boolean[] emptyColumn;
/**
* Flag that defines whether the first empty rows should be skipped.
*/
private boolean isUseFirstRowAsNames;
/** The encoding used to read the XML stream */
private final Charset encoding;
/**
* Constructs object by saving the shared strings and instantiating a XML reader.
*
* @param xlsxFile
* The xlsx file itself
* @param workbookZipEntryPath
* the path of the workbook Zip entry
* @param sharedStrings
* A ordered list of shared strings to generate complete cell values.
* @param numberFormats
* the parsed XLSX number formats
* @param maximumCellRange
* the maximum cell that should be parsed
* @param columnOffset
* the offset of the first column to use.
* @throws XMLStreamException
* On errors creating a XML stream reader.
* @throws IOException
* in case opening the workbook does not work
*/
public XlsxSheetContentParser(File xlsxFile, String workbookZipEntryPath, String[] sharedStrings,
XlsxNumberFormats numberFormats, XlsxSheetMetaData sheetMetaData, XMLInputFactory factory, Charset encoding)
throws XMLStreamException, IOException {
this.xlsxFile = xlsxFile;
this.workbookZipEntryPath = workbookZipEntryPath;
this.sharedStrings = sharedStrings;
this.numberFormats = numberFormats;
this.sheetMetaData = sheetMetaData;
this.encoding = encoding;
this.emptyColumn = new boolean[sheetMetaData.getNumberOfColumns()];
Arrays.fill(emptyColumn, true);
reset(factory);
}
/**
* Continues parsing the XML stream until a complete row is found.
*
* @param readMode
* defines whether reading is done from an operator which means the parameters should
* be obeyed or if reading is done from a Wizard which means all data should be read
* @throws XMLStreamException
* If there is an error processing the underlying XML source.
* @throws ParseException
* If there is an error on parsing a single XML item.
*/
public void next(XlsxReadMode readMode) throws XMLStreamException, ParseException {
// If reading from an operator or wizard preview skip rows up to row index before actual
// first row that should be parsed
if (readMode != XlsxReadMode.WIZARD_WORKPANE) {
skipToStartRow();
}
// If no row has been parsed yet ..
if (nextRowWithContent == null) {
// parse next row with content
this.nextRowWithContent = parseNextRowWithContent();
}
// Increase current row index
++currentRowIndex;
/*
* The current row is empty if the row index is smaller than the index of the next parsed
* row.
*/
boolean isRowEmpty = currentRowIndex < parsedRowIndex;
// Check for first imported row whether empty rows should be skipped
if (currentRowContent == null) {
assignNextCurrentRow(isRowEmpty && (!isUseFirstRowAsNames || readMode == XlsxReadMode.WIZARD_PREVIEW));
} else {
// All other empty rows will be added
assignNextCurrentRow(isRowEmpty);
}
}
/**
* Assigns the next current row by checking whether an empty row or the next parsed row with
* content should be assigned. In case the next row with content was assigned as current row,
* the next row with content is parsed subsequently.
*
* @param assignEmptyRow
* defines whether an empty row or the next row with content should be assigned
* @throws XMLStreamException
* If there is an error processing the underlying XML source.
* @throws ParseException
* If there is an error on parsing a single XML item.
*/
private void assignNextCurrentRow(boolean assignEmptyRow) throws ParseException, XMLStreamException {
if (assignEmptyRow) {
// If current row is empty initialize a new (empty) array for current row
this.currentRowContent = new XlsxCell[sheetMetaData.getNumberOfColumns()];
} else {
// ... otherwise assign next row with content to current row and parse next row with
// content
this.currentRowContent = nextRowWithContent;
this.currentRowIndex = parsedRowIndex;
this.nextRowWithContent = parseNextRowWithContent();
}
}
/**
* Parses next XLSX rows until a row with content was found.
*
* @return if the next row with content or an empty row in case no row with content was found
* anymore
* @throws XMLStreamException
* If there is an error processing the underlying XML source.
* @throws ParseException
* If there is an error on parsing a single XML item.
*/
private XlsxCell[] parseNextRowWithContent() throws ParseException, XMLStreamException {
boolean isRowWithContent = false;
boolean isValue = false;
boolean contentFound = false;
int columnIndex = 0;
int numberOfColumns = sheetMetaData.getNumberOfColumns();
XlsxCell[] nextRowWithContent = new XlsxCell[numberOfColumns];
while (!isRowWithContent && reader.hasNext()) {
// Parse the next element
switch (reader.next()) {
case XMLStreamReader.START_ELEMENT:
String startLocalName = reader.getLocalName();
if (startLocalName.equals(XlsxUtilities.TAG_ROW)) {
// We need to subtract 1 as XLSX indices start with 1
String indexValue = XlsxUtilities.getAttributes(reader).getValue(XlsxUtilities.TAG_ROW_INDEX);
try {
parsedRowIndex = Integer.parseInt(indexValue) - 1;
} catch (NumberFormatException e) {
throw new ParseException(
new ParsingError(parsedRowIndex, columnIndex, ErrorCode.FILE_SYNTAX_ERROR, indexValue));
}
} else if (startLocalName.equals(XlsxUtilities.TAG_CELL)) {
Attributes attributes = XlsxUtilities.getAttributes(reader);
// Update column index
String cellReference = attributes.getValue(XlsxUtilities.TAG_CELL_REFERENCE);
try {
columnIndex = sheetMetaData
.mapColumnIndex(XlsxUtilities.convertCellRefToCoordinates(cellReference).columnNumber);
} catch (IllegalArgumentException e) {
throw new ParseException(new ParsingError(parsedRowIndex, columnIndex,
ParsingError.ErrorCode.FILE_SYNTAX_ERROR, cellReference));
}
// Check if the current column should be skipped
if (sheetMetaData.isSkipColumn(columnIndex)) {
break;
}
// Parse cell type and create new XLSX cell
String cellType = attributes.getValue(ATT_CELL_TYPE);
String cellStyle = attributes.getValue(ATT_CELL_STYLE);
XlsxCellType type = XlsxCellType.getCellType(cellType, numberFormats, cellStyle);
if (type != null) {
nextRowWithContent[columnIndex] = new XlsxCell(type);
} else {
throw new ParseException(new ParsingError(parsedRowIndex, columnIndex,
ParsingError.ErrorCode.FILE_SYNTAX_ERROR, cellType));
}
} else if (startLocalName.equals(TAG_VALUE) || startLocalName.equals(TAG_INLINE_STRING)) {
// Information for value parsing
isValue = true;
}
break;
case XMLStreamReader.END_ELEMENT:
String endLocalName = reader.getLocalName();
if (endLocalName.equals(XlsxUtilities.TAG_ROW)) {
// At least one value has been parsed for current row
// so stop parsing any further
isRowWithContent = contentFound;
} else if (endLocalName.equals(TAG_VALUE) || endLocalName.equals(TAG_INLINE_STRING)) {
// Text part ends
isValue = false;
}
break;
case XMLStreamReader.CHARACTERS:
if (!sheetMetaData.isSkipColumn(columnIndex) && isValue) {
String text = reader.getText();
if (nextRowWithContent[columnIndex].cellType.equals(XlsxCellType.SHARED_STRING)) {
nextRowWithContent[columnIndex].value = sharedStrings[Integer.parseInt(text)];
} else {
nextRowWithContent[columnIndex].value = text;
}
emptyColumn[columnIndex] = false;
contentFound = true;
}
break;
case XMLStreamReader.END_DOCUMENT:
// end of document was reached but no new row with content was found
this.hasMoreContent = false;
break;
default:
// ignore other events
break;
}
}
return nextRowWithContent;
}
/**
* Skips all rows before the selected starting row.
*
* @throws XMLStreamException
* If there is an error processing the underlying XML source.
*/
private void skipToStartRow() throws XMLStreamException {
int rowIndexBeforeFirstRow = sheetMetaData.getFirstRowIndex() - 1;
boolean skipRows = currentRowIndex < rowIndexBeforeFirstRow;
while (skipRows && reader.hasNext()) {
int nextEvent = reader.next();
if (nextEvent == XMLStreamReader.START_ELEMENT) {
// If a row was found..
if (reader.getLocalName().equals(XlsxUtilities.TAG_ROW)) {
// ... parse the row index (subtract 1 as XLSX indices start with 1)
parsedRowIndex = Integer
.parseInt(XlsxUtilities.getAttributes(reader).getValue(XlsxUtilities.TAG_ROW_INDEX)) - 1;
// Check whether the parsed index is already
// behind the desired end index (which means that all rows before were
// empty)
if (parsedRowIndex > rowIndexBeforeFirstRow) {
currentRowIndex = rowIndexBeforeFirstRow;
skipRows = false;
} else {
// Update the current row index if parsed index is still
// before first row to read
currentRowIndex = parsedRowIndex;
}
}
} else if (nextEvent == XMLStreamReader.END_ELEMENT && reader.getLocalName().equals(XlsxUtilities.TAG_ROW)) {
// parse until end of row if we haven't parsed too far yet
skipRows = currentRowIndex < rowIndexBeforeFirstRow;
}
}
}
/**
* Returns the {@code 0-based} index of the current row, which was extracted from the worksheet.
*
* @return the {@code 0} based index of the current parsed row. Returns {@code -1} in case no
* row has been parsed yet. It is increased each time calling {@link #next()} and is
* reset to {@code -1} in case {@link #reset(XMLInputFactory)} is called.
*/
int getCurrentRowIndex() {
return currentRowIndex;
}
/**
* @return the content of the current parsed row. Returns <code>null</code> in case no row has
* been parsed yet.
*/
XlsxCell[] getRowContent() {
return currentRowContent;
}
@Override
public void close() throws XMLStreamException, IOException {
// Close the XMLStreamReader
if (reader != null) {
reader.close();
}
// close Zip file and its open InputStreams
if (xlsxZipFile != null) {
xlsxZipFile.close();
}
}
/**
* @return <code>true</code> in case the {@link XMLStreamReader} has more rows with content
* available.
*/
boolean hasNext() {
return hasMoreContent && sheetMetaData.getLastRowIndex() >= 0 && currentRowIndex < sheetMetaData.getLastRowIndex();
}
/**
* Closes the current open {@link XMLStreamReader} and creates a new one which starts the
* reading process at the first row. It is assumed the the XLSX content and operator
* configuration remain the same.
*
* @param factory
* the {@link XMLInputFactory} that should be used to open the
* {@link XMLStreamReader}.
*
* @throws IOException
* if an I/O error has occurred
* @throws XMLStreamException
* if there are errors freeing associated XML reader resources or creating a new XML
* reader
*/
void reset(XMLInputFactory xmlFactory) throws IOException, XMLStreamException {
// close open file and reader object
close();
// create new file and stream reader objects
xlsxZipFile = new ZipFile(xlsxFile);
ZipEntry workbookZipEntry = xlsxZipFile.getEntry(workbookZipEntryPath);
if (workbookZipEntry == null) {
throw new FileNotFoundException(
"XLSX file is malformed. Reason: Selected workbook is missing in XLSX file. Path: "
+ workbookZipEntryPath);
}
InputStream inputStream = xlsxZipFile.getInputStream(workbookZipEntry);
cis = new CountingInputStream(inputStream);
reader = xmlFactory.createXMLStreamReader(new InputStreamReader(cis, encoding));
// reset other variables
currentRowIndex = -1;
parsedRowIndex = -1;
currentRowContent = null;
nextRowWithContent = null;
hasMoreContent = true;
Arrays.fill(emptyColumn, true);
}
/**
* @return an array which stores if an column was empty during parsing. You should only call
* this method iff {@link #hasNext()} returns <code>false</code>.
*/
boolean[] getEmptyColumns() {
return emptyColumn;
}
/**
* @param parameterAsBoolean
* defines whether the first row should be used as names. If set to <code>true</code>
* the worksheet parser will skip all beginning empty rows until the first row with
* content was found.
*/
void setUseFirstRowAsNames(boolean isFirstRowAsNames) {
this.isUseFirstRowAsNames = isFirstRowAsNames;
}
/**
*
* @return The total size of the entry data, which can be used to determine the total operator
* progress
*/
long getTotalSize() {
return xlsxZipFile.getEntry(workbookZipEntryPath).getSize();
}
/**
*
* @return The current position in the entry data, which can be used to determine the current
* operator progress
*/
long getCurrentPosition() {
return cis.getByteCount();
}
}