/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio.model.xlsx;
import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.text.DateFormat;
import java.text.DecimalFormat;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.TimeZone;
import java.util.logging.Level;
import java.util.zip.ZipFile;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import org.apache.poi.ss.usermodel.DateUtil;
import org.xml.sax.SAXException;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.ProcessStoppedException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.nio.model.ColumnMetaData;
import com.rapidminer.operator.nio.model.DataResultSet;
import com.rapidminer.operator.nio.model.DateFormatProvider;
import com.rapidminer.operator.nio.model.ExcelResultSetConfiguration;
import com.rapidminer.operator.nio.model.ParseException;
import com.rapidminer.operator.nio.model.ParsingError;
import com.rapidminer.operator.nio.model.xlsx.XlsxUtilities.XlsxCell;
import com.rapidminer.operator.nio.model.xlsx.XlsxWorkbookParser.XlsxWorkbook;
import com.rapidminer.operator.nio.model.xlsx.XlsxWorkbookRelationParser.XlsxWorkbookRel;
import com.rapidminer.tools.I18N;
import com.rapidminer.tools.LogService;
import com.rapidminer.tools.ProgressListener;
/**
* A DataResultSet for an Excel 2007 files (XLSX). It uses StAX parsers based on the file format
* definitions of ECMA-376, 4th Edition to parse configuration and worksheet content from XML files
* stored within a XLSX file.
*
* @author Nils Woehler
* @since 6.3.0
*/
public class XlsxResultSet implements DataResultSet {
/**
* Defines whether the Excel file is read by the operator or by the Wizard.
*/
public static enum XlsxReadMode {
WIZARD_WORKPANE, WIZARD_PREVIEW, OPERATOR,
/**
* Specifies that the {@link XlsxResultSet} was created to display preview content in the
* sheet selection step of the new data import dialog. If used the
* {@link XlsxSheetTableModel} will load a data preview instead of the full sheet content.
*/
WIZARD_SHEET_SELECTION
}
/**
* The factory used to create XML StAX streams.
*/
private static final XMLInputFactory XML_STREAM_FACTORY = XMLInputFactory.newFactory();
static {
XML_STREAM_FACTORY.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false);
XML_STREAM_FACTORY.setProperty(XMLInputFactory.IS_VALIDATING, false);
XML_STREAM_FACTORY.setProperty(XMLInputFactory.IS_COALESCING, true);
// Re-enable once we can use Aalto or Woodstox parser
// XML_STREAM_FACTORY.setProperty(XMLInputFactory2.P_LAZY_PARSING, true);
// XML_STREAM_FACTORY.setProperty(XMLInputFactory2.P_INTERN_NAMES, true);
// XML_STREAM_FACTORY.setProperty(XMLInputFactory2.P_INTERN_NS_URIS, true);
// XML_STREAM_FACTORY.setProperty(XMLInputFactory2.P_PRESERVE_LOCATION, false);
// XML_STREAM_FACTORY.setProperty(XMLInputFactory2.P_REPORT_PROLOG_WHITESPACE, false);
}
/**
* Returned value in case boolean is stored as 0 in XLSX file
*/
private static final String BOOLEAN_FALSE = "false";
/**
* Returned value in case boolean is stored as 1 in XLSX file
*/
private static final String BOOLEAN_TRUE = "true";
/**
* Format used to format numbers that should be used as text (nominal) values. This will prevent
* scientific notation like '1.12123E12'. Instead it will return '1121230000000'. Numbers with
* fractions will be display with up to 30 fraction digits (specified by the number of # after
* the dot).
*/
private final DecimalFormat decimalFormat = new DecimalFormat("#.##############################");
/** Configuration, set by constructor */
private final ExcelResultSetConfiguration configuration;
/** The StAX parser which parses the worksheet content. */
private final XlsxSheetContentParser worksheetParser;
/**
* The {@link DateFormatProvider} used to parse cells that contain date entries
*/
private final DateFormatProvider dateFormatProvider;
/**
* The parsed sheet meta data which, e.g., contains information about the cell range to parse.
*/
private final XlsxSheetMetaData sheetMetaData;
/** Defines whether reading is done from the Read Excel operator or from a Wizard */
private final XlsxReadMode readMode;
/** The content of the parsed workbook file */
private final XlsxWorkbook xlsxWorkbook;
private Operator operator = null;
private long multiplier;
private int progressCounter = 0;
/**
* Configures the Excel result set with the provided configuration object. Also parses multiple
* XML configuration files included in the XLSX file and creates the worksheet parser.
*
* @param callingOperator
* the calling operator. <code>null</code> is allowed in case the class isn't created
* from within an operator.
* @param configuration
* the result set configuration
* @param provider
* a {@link DateFormatProvider}, can be {@code null} in which case the date format is
* fixed by the current value of {@link configuration#getDatePattern()}
* @throws UserError
* in case something is configured in a wrong way so that the XLSX file cannot be
* parsed
*/
public XlsxResultSet(Operator callingOperator, final ExcelResultSetConfiguration configuration, int sheetIndex,
XlsxReadMode readMode, final DateFormatProvider provider) throws UserError {
// Check file presence
if (configuration.getFile() == null) {
throw new UserError(callingOperator, "file_consumer.no_file_defined");
}
try {
File xlsxFile = configuration.getFile();
XlsxWorkbookRel workbookRelations;
try (ZipFile zipFile = new ZipFile(xlsxFile)) {
// Parse workbook XML which contains a list of sheets with name, rId, sheetId
xlsxWorkbook = new XlsxWorkbookParser().parseZipEntry(zipFile);
// Parse workbook relations XML which contains the path of shared strings
// and the mapping of relationship IDs and paths of worksheets
XlsxWorkbookRelationParser xlsxWorkbookRelHandler = new XlsxWorkbookRelationParser(callingOperator, zipFile,
xlsxWorkbook.xlsxWorkbookSheets, sheetIndex);
workbookRelations = xlsxWorkbookRelHandler.parseZipEntry(zipFile);
}
this.sheetMetaData = new XlsxSheetMetaDataParser(xlsxFile, workbookRelations.worksheetsPath, XML_STREAM_FACTORY)
.parseMetaData(callingOperator, configuration, readMode);
// Check if sheet is empty.
// Wizards should also be able to show empty sheets so also check if we are running from
// an operator.
if (readMode == XlsxReadMode.OPERATOR
&& (sheetMetaData.getLastColumnIndex() < 0 || sheetMetaData.getLastRowIndex() < 0)) {
throw new UserError(callingOperator, 404);
}
// Do not use encoding from ExcelResultSetConfiguration but always use UTF-8
// as UTF-8 is default XLSX encoding: https://msdn.microsoft.com/en-us/library/bb507946
Charset encoding = StandardCharsets.UTF_8;
// Parse shared strings file (only if it exists)
String[] sharedStrings = new String[0];
if (workbookRelations.sharedStringsPath != null) {
sharedStrings = new XlsxSharedStringsParser(xlsxFile, workbookRelations.sharedStringsPath,
XML_STREAM_FACTORY).parseSharedStrings(callingOperator, encoding);
}
// Parse styles file (only if it exists)
XlsxNumberFormats numberFormats = null;
if (workbookRelations.stylesPath != null) {
numberFormats = new XlsxNumberFormatParser(xlsxFile, workbookRelations.stylesPath, XML_STREAM_FACTORY)
.parseNumberFormats();
}
// initialize worksheet parser
this.worksheetParser = new XlsxSheetContentParser(xlsxFile, workbookRelations.worksheetsPath, sharedStrings,
numberFormats, sheetMetaData, XML_STREAM_FACTORY, encoding);
} catch (IOException | XMLStreamException e) {
throw new UserError(callingOperator, e, 321, configuration.getFile(), e.getMessage());
} catch (ParserConfigurationException | SAXException e) {
throw new UserError(callingOperator, e, 401, e.getMessage());
}
this.readMode = readMode;
this.configuration = configuration;
final String timezone = configuration.getTimezone();
if (provider != null) {
if (timezone != null) {
this.dateFormatProvider = new DateFormatProvider() {
@Override
public DateFormat geDateFormat() {
DateFormat format = provider.geDateFormat();
format.setTimeZone(TimeZone.getTimeZone(timezone));
return null;
}
};
} else {
this.dateFormatProvider = provider;
}
} else {
String datePattern = configuration.getDatePattern();
final DateFormat dateFormat = new SimpleDateFormat(datePattern == null ? "" : datePattern);
if (timezone != null) {
dateFormat.setTimeZone(TimeZone.getTimeZone(timezone));
}
this.dateFormatProvider = new DateFormatProvider() {
@Override
public DateFormat geDateFormat() {
return dateFormat;
}
};
}
if (callingOperator != null) {
callingOperator.getProgress().setCheckForStop(false);
callingOperator.getProgress().setTotal(100);
progressCounter = 0;
multiplier = worksheetParser.getTotalSize() / 100L;
operator = callingOperator;
}
}
/**
* Configures the Excel result set with the provided configuration object. Also parses multiple
* XML configuration files included in the XLSX file and creates the worksheet parser.
*
* @param callingOperator
* the calling operator. <code>null</code> is allowed in case the class isn't created
* from within an operator.
* @param configuration
* the result set configuration
* @throws UserError
* in case something is configured in a wrong way so that the XLSX file cannot be
* parsed
*/
public XlsxResultSet(Operator callingOperator, ExcelResultSetConfiguration configuration, int sheetIndex,
XlsxReadMode readMode) throws UserError {
this(callingOperator, configuration, sheetIndex, readMode, null);
}
@Override
public String[] getColumnNames() {
return sheetMetaData.getColumnNames(configuration.isEmulatingOldNames());
}
@Override
public int getCurrentRow() {
return worksheetParser.getCurrentRowIndex();
}
@Override
public boolean isMissing(int columnIndex) {
String value = getValue(columnIndex);
XlsxCellType cellType = getCellType(columnIndex);
if (value == null || value.trim().isEmpty() || XlsxCellType.ERROR.equals(cellType)) {
return true;
} else {
return false;
}
}
@Override
public Date getDate(int columnIndex) throws ParseException {
String dateValue = getValue(columnIndex);
if (dateValue == null) {
return null;
}
switch (getCellType(columnIndex)) {
case NUMBER:
case DATE:
// XLSX stores dates as double values
double dateAsDouble = Double.parseDouble(dateValue);
// Use POI methods to convert value to Date java object
if (DateUtil.isValidExcelDate(dateAsDouble)) {
return DateUtil.getJavaDate(dateAsDouble, xlsxWorkbook.isDate1904);
} else {
throw new ParseException(new ParsingError(getCurrentRow() + 1, columnIndex,
ParsingError.ErrorCode.UNPARSEABLE_DATE, dateValue));
}
case INLINE_STRING:
case SHARED_STRING:
case STRING:
// In case a date is stored as String, we try to parse it here
String dateString = dateValue;
try {
return dateFormatProvider.geDateFormat().parse(dateString);
} catch (java.text.ParseException e) {
throw new ParseException(new ParsingError(getCurrentRow() + 1, columnIndex,
ParsingError.ErrorCode.UNPARSEABLE_DATE, dateString));
}
default:
throw new ParseException(new ParsingError(getCurrentRow() + 1, columnIndex,
ParsingError.ErrorCode.UNPARSEABLE_DATE, dateValue));
}
}
@Override
public ValueType getNativeValueType(int columnIndex) throws ParseException {
XlsxCellType cellType = getCellType(columnIndex);
if (cellType == null) {
return ValueType.EMPTY;
}
switch (cellType) {
case DATE:
return ValueType.DATE;
case ERROR:
return ValueType.EMPTY;
case NUMBER:
return ValueType.NUMBER;
case BOOLEAN:
case INLINE_STRING:
case SHARED_STRING:
case STRING:
default:
return ValueType.STRING;
}
}
@Override
public Number getNumber(int columnIndex) throws ParseException {
String numberValue = getValue(columnIndex);
if (numberValue == null) {
return null;
}
try {
return Double.valueOf(numberValue);
} catch (NumberFormatException e) {
throw new ParseException(new ParsingError(getCurrentRow() + 1, columnIndex,
ParsingError.ErrorCode.UNPARSEABLE_REAL, numberValue));
}
}
/**
* @param columnIndex
* the index of the column
* @return the value as String
*/
private String getValue(int columnIndex) {
XlsxCell xlsxCell = getXlsxCell(columnIndex);
return xlsxCell == null ? null : xlsxCell.value;
}
/**
* @return the {@link XlsxCell} for the specified column
*/
private XlsxCell getXlsxCell(int columnIndex) {
XlsxCell[] rowContent = worksheetParser.getRowContent();
if (rowContent != null) {
return rowContent[columnIndex];
}
return null;
}
/**
* @param columnIndex
* the index of the cell in the current row
* @return the {@link XlsxCellType} of the cell or <code>null</code> if the cell is empty
*/
private XlsxCellType getCellType(int columnIndex) {
XlsxCell xlsxCell = getXlsxCell(columnIndex);
return xlsxCell == null ? null : xlsxCell.cellType;
}
@Override
public int getNumberOfColumns() {
return sheetMetaData.getNumberOfColumns();
}
@Override
public String getString(int columnIndex) throws ParseException {
String value = getValue(columnIndex);
XlsxCellType cellType = getCellType(columnIndex);
if (cellType == null || value == null) {
return null;
}
switch (cellType) {
case NUMBER:
return decimalFormat.format(new BigDecimal(value));
case BOOLEAN:
if (Integer.parseInt(value) == 1) {
return BOOLEAN_TRUE;
} else {
return BOOLEAN_FALSE;
}
default:
return value;
}
}
@Override
public int[] getValueTypes() {
// return an array full of zeros as we cannot determine the value types of each columns here
return new int[getNumberOfColumns()];
}
/**
* @return the number of all rows available by the specified worksheet (also includes empty rows
* at the end of the file) or -1 if number is unknown
*/
public int getNumberOfRows() {
return sheetMetaData.getNumberOfRows();
}
/**
* @param columnMetaDatas
* meta data defined by the operator
* @return all names of columns that have been found empty after parsing the XLSX file
*/
public List<String> getEmptyColumnNames(ColumnMetaData[] columnMetaDatas) {
List<String> toRemove = new LinkedList<>();
String[] columnNames = getColumnNames();
boolean[] emptyColumns = worksheetParser.getEmptyColumns();
// For all columns a meta data was defined so do not remove any column
if (columnNames.length == columnMetaDatas.length) {
return Collections.emptyList();
}
// We can safely assume that empty columns will always have their Excel column name
for (int i = 0; i < emptyColumns.length; i++) {
boolean empty = emptyColumns[i];
ColumnMetaData cmd = null;
if (i < columnMetaDatas.length) {
cmd = columnMetaDatas[i];
}
if (empty && cmd == null) {
toRemove.add(columnNames[i]);
}
}
return toRemove;
}
@Override
public boolean hasNext() {
return worksheetParser.hasNext();
}
@Override
public void next(ProgressListener listener) throws OperatorException {
try {
worksheetParser.next(readMode);
} catch (XMLStreamException | ParseException e) {
throw new UserError(null, e, 321, configuration.getFile(), e.getMessage());
}
if (listener != null) {
listener.setCompleted(getCurrentRow());
}
if (operator != null && ++progressCounter % 100 == 0) {
try {
operator.getProgress().setCompleted((int) (worksheetParser.getCurrentPosition() / multiplier));
} catch (ProcessStoppedException e) {
// Will not happen, because check for stop is deactivated
}
}
}
@Override
public void reset(ProgressListener listener) throws OperatorException {
try {
worksheetParser.reset(XML_STREAM_FACTORY);
} catch (IOException | XMLStreamException e) {
throw new UserError(null, e, 321, configuration.getFile(), e.getMessage());
}
// Update listener
if (listener != null) {
int numberOfRows = getNumberOfRows();
listener.setTotal(numberOfRows == -1 ? XlsxSheetMetaDataParser.MAXIMUM_XLSX_ROW_INDEX + 1 : numberOfRows);
listener.setCompleted(0);
}
// Update progress
if (operator != null) {
operator.getProgress().setTotal(100);
progressCounter = 0;
multiplier = worksheetParser.getTotalSize() / 100L;
}
}
@Override
public void close() {
try {
if (worksheetParser != null) {
worksheetParser.close();
}
} catch (XMLStreamException | IOException e) {
LogService.getRoot().log(Level.WARNING,
I18N.getMessage(LogService.getRoot().getResourceBundle(),
"com.rapidminer.operator.nio.model.ExcelResultSetConfiguration.close_workbook_error",
e.getMessage()),
e);
}
}
/**
* @param parameterAsBoolean
* defines whether the first row should be used as names. If set to <code>true</code>
* the worksheet parser will skip all beginning empty rows until the first row with
* content was found.
*/
public void setUseFirstRowAsNames(boolean isFirstRowAsNames) {
this.worksheetParser.setUseFirstRowAsNames(isFirstRowAsNames);
}
}