/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.nio.model.xlsx; import java.io.File; import java.io.IOException; import java.math.BigDecimal; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.text.DateFormat; import java.text.DecimalFormat; import java.text.SimpleDateFormat; import java.util.Collections; import java.util.Date; import java.util.LinkedList; import java.util.List; import java.util.TimeZone; import java.util.logging.Level; import java.util.zip.ZipFile; import javax.xml.parsers.ParserConfigurationException; import javax.xml.stream.XMLInputFactory; import javax.xml.stream.XMLStreamException; import org.apache.poi.ss.usermodel.DateUtil; import org.xml.sax.SAXException; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.ProcessStoppedException; import com.rapidminer.operator.UserError; import com.rapidminer.operator.nio.model.ColumnMetaData; import com.rapidminer.operator.nio.model.DataResultSet; import com.rapidminer.operator.nio.model.DateFormatProvider; import com.rapidminer.operator.nio.model.ExcelResultSetConfiguration; import com.rapidminer.operator.nio.model.ParseException; import com.rapidminer.operator.nio.model.ParsingError; import com.rapidminer.operator.nio.model.xlsx.XlsxUtilities.XlsxCell; import com.rapidminer.operator.nio.model.xlsx.XlsxWorkbookParser.XlsxWorkbook; import com.rapidminer.operator.nio.model.xlsx.XlsxWorkbookRelationParser.XlsxWorkbookRel; import com.rapidminer.tools.I18N; import com.rapidminer.tools.LogService; import com.rapidminer.tools.ProgressListener; /** * A DataResultSet for an Excel 2007 files (XLSX). It uses StAX parsers based on the file format * definitions of ECMA-376, 4th Edition to parse configuration and worksheet content from XML files * stored within a XLSX file. * * @author Nils Woehler * @since 6.3.0 */ public class XlsxResultSet implements DataResultSet { /** * Defines whether the Excel file is read by the operator or by the Wizard. */ public static enum XlsxReadMode { WIZARD_WORKPANE, WIZARD_PREVIEW, OPERATOR, /** * Specifies that the {@link XlsxResultSet} was created to display preview content in the * sheet selection step of the new data import dialog. If used the * {@link XlsxSheetTableModel} will load a data preview instead of the full sheet content. */ WIZARD_SHEET_SELECTION } /** * The factory used to create XML StAX streams. */ private static final XMLInputFactory XML_STREAM_FACTORY = XMLInputFactory.newFactory(); static { XML_STREAM_FACTORY.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, false); XML_STREAM_FACTORY.setProperty(XMLInputFactory.IS_VALIDATING, false); XML_STREAM_FACTORY.setProperty(XMLInputFactory.IS_COALESCING, true); // Re-enable once we can use Aalto or Woodstox parser // XML_STREAM_FACTORY.setProperty(XMLInputFactory2.P_LAZY_PARSING, true); // XML_STREAM_FACTORY.setProperty(XMLInputFactory2.P_INTERN_NAMES, true); // XML_STREAM_FACTORY.setProperty(XMLInputFactory2.P_INTERN_NS_URIS, true); // XML_STREAM_FACTORY.setProperty(XMLInputFactory2.P_PRESERVE_LOCATION, false); // XML_STREAM_FACTORY.setProperty(XMLInputFactory2.P_REPORT_PROLOG_WHITESPACE, false); } /** * Returned value in case boolean is stored as 0 in XLSX file */ private static final String BOOLEAN_FALSE = "false"; /** * Returned value in case boolean is stored as 1 in XLSX file */ private static final String BOOLEAN_TRUE = "true"; /** * Format used to format numbers that should be used as text (nominal) values. This will prevent * scientific notation like '1.12123E12'. Instead it will return '1121230000000'. Numbers with * fractions will be display with up to 30 fraction digits (specified by the number of # after * the dot). */ private final DecimalFormat decimalFormat = new DecimalFormat("#.##############################"); /** Configuration, set by constructor */ private final ExcelResultSetConfiguration configuration; /** The StAX parser which parses the worksheet content. */ private final XlsxSheetContentParser worksheetParser; /** * The {@link DateFormatProvider} used to parse cells that contain date entries */ private final DateFormatProvider dateFormatProvider; /** * The parsed sheet meta data which, e.g., contains information about the cell range to parse. */ private final XlsxSheetMetaData sheetMetaData; /** Defines whether reading is done from the Read Excel operator or from a Wizard */ private final XlsxReadMode readMode; /** The content of the parsed workbook file */ private final XlsxWorkbook xlsxWorkbook; private Operator operator = null; private long multiplier; private int progressCounter = 0; /** * Configures the Excel result set with the provided configuration object. Also parses multiple * XML configuration files included in the XLSX file and creates the worksheet parser. * * @param callingOperator * the calling operator. <code>null</code> is allowed in case the class isn't created * from within an operator. * @param configuration * the result set configuration * @param provider * a {@link DateFormatProvider}, can be {@code null} in which case the date format is * fixed by the current value of {@link configuration#getDatePattern()} * @throws UserError * in case something is configured in a wrong way so that the XLSX file cannot be * parsed */ public XlsxResultSet(Operator callingOperator, final ExcelResultSetConfiguration configuration, int sheetIndex, XlsxReadMode readMode, final DateFormatProvider provider) throws UserError { // Check file presence if (configuration.getFile() == null) { throw new UserError(callingOperator, "file_consumer.no_file_defined"); } try { File xlsxFile = configuration.getFile(); XlsxWorkbookRel workbookRelations; try (ZipFile zipFile = new ZipFile(xlsxFile)) { // Parse workbook XML which contains a list of sheets with name, rId, sheetId xlsxWorkbook = new XlsxWorkbookParser().parseZipEntry(zipFile); // Parse workbook relations XML which contains the path of shared strings // and the mapping of relationship IDs and paths of worksheets XlsxWorkbookRelationParser xlsxWorkbookRelHandler = new XlsxWorkbookRelationParser(callingOperator, zipFile, xlsxWorkbook.xlsxWorkbookSheets, sheetIndex); workbookRelations = xlsxWorkbookRelHandler.parseZipEntry(zipFile); } this.sheetMetaData = new XlsxSheetMetaDataParser(xlsxFile, workbookRelations.worksheetsPath, XML_STREAM_FACTORY) .parseMetaData(callingOperator, configuration, readMode); // Check if sheet is empty. // Wizards should also be able to show empty sheets so also check if we are running from // an operator. if (readMode == XlsxReadMode.OPERATOR && (sheetMetaData.getLastColumnIndex() < 0 || sheetMetaData.getLastRowIndex() < 0)) { throw new UserError(callingOperator, 404); } // Do not use encoding from ExcelResultSetConfiguration but always use UTF-8 // as UTF-8 is default XLSX encoding: https://msdn.microsoft.com/en-us/library/bb507946 Charset encoding = StandardCharsets.UTF_8; // Parse shared strings file (only if it exists) String[] sharedStrings = new String[0]; if (workbookRelations.sharedStringsPath != null) { sharedStrings = new XlsxSharedStringsParser(xlsxFile, workbookRelations.sharedStringsPath, XML_STREAM_FACTORY).parseSharedStrings(callingOperator, encoding); } // Parse styles file (only if it exists) XlsxNumberFormats numberFormats = null; if (workbookRelations.stylesPath != null) { numberFormats = new XlsxNumberFormatParser(xlsxFile, workbookRelations.stylesPath, XML_STREAM_FACTORY) .parseNumberFormats(); } // initialize worksheet parser this.worksheetParser = new XlsxSheetContentParser(xlsxFile, workbookRelations.worksheetsPath, sharedStrings, numberFormats, sheetMetaData, XML_STREAM_FACTORY, encoding); } catch (IOException | XMLStreamException e) { throw new UserError(callingOperator, e, 321, configuration.getFile(), e.getMessage()); } catch (ParserConfigurationException | SAXException e) { throw new UserError(callingOperator, e, 401, e.getMessage()); } this.readMode = readMode; this.configuration = configuration; final String timezone = configuration.getTimezone(); if (provider != null) { if (timezone != null) { this.dateFormatProvider = new DateFormatProvider() { @Override public DateFormat geDateFormat() { DateFormat format = provider.geDateFormat(); format.setTimeZone(TimeZone.getTimeZone(timezone)); return null; } }; } else { this.dateFormatProvider = provider; } } else { String datePattern = configuration.getDatePattern(); final DateFormat dateFormat = new SimpleDateFormat(datePattern == null ? "" : datePattern); if (timezone != null) { dateFormat.setTimeZone(TimeZone.getTimeZone(timezone)); } this.dateFormatProvider = new DateFormatProvider() { @Override public DateFormat geDateFormat() { return dateFormat; } }; } if (callingOperator != null) { callingOperator.getProgress().setCheckForStop(false); callingOperator.getProgress().setTotal(100); progressCounter = 0; multiplier = worksheetParser.getTotalSize() / 100L; operator = callingOperator; } } /** * Configures the Excel result set with the provided configuration object. Also parses multiple * XML configuration files included in the XLSX file and creates the worksheet parser. * * @param callingOperator * the calling operator. <code>null</code> is allowed in case the class isn't created * from within an operator. * @param configuration * the result set configuration * @throws UserError * in case something is configured in a wrong way so that the XLSX file cannot be * parsed */ public XlsxResultSet(Operator callingOperator, ExcelResultSetConfiguration configuration, int sheetIndex, XlsxReadMode readMode) throws UserError { this(callingOperator, configuration, sheetIndex, readMode, null); } @Override public String[] getColumnNames() { return sheetMetaData.getColumnNames(configuration.isEmulatingOldNames()); } @Override public int getCurrentRow() { return worksheetParser.getCurrentRowIndex(); } @Override public boolean isMissing(int columnIndex) { String value = getValue(columnIndex); XlsxCellType cellType = getCellType(columnIndex); if (value == null || value.trim().isEmpty() || XlsxCellType.ERROR.equals(cellType)) { return true; } else { return false; } } @Override public Date getDate(int columnIndex) throws ParseException { String dateValue = getValue(columnIndex); if (dateValue == null) { return null; } switch (getCellType(columnIndex)) { case NUMBER: case DATE: // XLSX stores dates as double values double dateAsDouble = Double.parseDouble(dateValue); // Use POI methods to convert value to Date java object if (DateUtil.isValidExcelDate(dateAsDouble)) { return DateUtil.getJavaDate(dateAsDouble, xlsxWorkbook.isDate1904); } else { throw new ParseException(new ParsingError(getCurrentRow() + 1, columnIndex, ParsingError.ErrorCode.UNPARSEABLE_DATE, dateValue)); } case INLINE_STRING: case SHARED_STRING: case STRING: // In case a date is stored as String, we try to parse it here String dateString = dateValue; try { return dateFormatProvider.geDateFormat().parse(dateString); } catch (java.text.ParseException e) { throw new ParseException(new ParsingError(getCurrentRow() + 1, columnIndex, ParsingError.ErrorCode.UNPARSEABLE_DATE, dateString)); } default: throw new ParseException(new ParsingError(getCurrentRow() + 1, columnIndex, ParsingError.ErrorCode.UNPARSEABLE_DATE, dateValue)); } } @Override public ValueType getNativeValueType(int columnIndex) throws ParseException { XlsxCellType cellType = getCellType(columnIndex); if (cellType == null) { return ValueType.EMPTY; } switch (cellType) { case DATE: return ValueType.DATE; case ERROR: return ValueType.EMPTY; case NUMBER: return ValueType.NUMBER; case BOOLEAN: case INLINE_STRING: case SHARED_STRING: case STRING: default: return ValueType.STRING; } } @Override public Number getNumber(int columnIndex) throws ParseException { String numberValue = getValue(columnIndex); if (numberValue == null) { return null; } try { return Double.valueOf(numberValue); } catch (NumberFormatException e) { throw new ParseException(new ParsingError(getCurrentRow() + 1, columnIndex, ParsingError.ErrorCode.UNPARSEABLE_REAL, numberValue)); } } /** * @param columnIndex * the index of the column * @return the value as String */ private String getValue(int columnIndex) { XlsxCell xlsxCell = getXlsxCell(columnIndex); return xlsxCell == null ? null : xlsxCell.value; } /** * @return the {@link XlsxCell} for the specified column */ private XlsxCell getXlsxCell(int columnIndex) { XlsxCell[] rowContent = worksheetParser.getRowContent(); if (rowContent != null) { return rowContent[columnIndex]; } return null; } /** * @param columnIndex * the index of the cell in the current row * @return the {@link XlsxCellType} of the cell or <code>null</code> if the cell is empty */ private XlsxCellType getCellType(int columnIndex) { XlsxCell xlsxCell = getXlsxCell(columnIndex); return xlsxCell == null ? null : xlsxCell.cellType; } @Override public int getNumberOfColumns() { return sheetMetaData.getNumberOfColumns(); } @Override public String getString(int columnIndex) throws ParseException { String value = getValue(columnIndex); XlsxCellType cellType = getCellType(columnIndex); if (cellType == null || value == null) { return null; } switch (cellType) { case NUMBER: return decimalFormat.format(new BigDecimal(value)); case BOOLEAN: if (Integer.parseInt(value) == 1) { return BOOLEAN_TRUE; } else { return BOOLEAN_FALSE; } default: return value; } } @Override public int[] getValueTypes() { // return an array full of zeros as we cannot determine the value types of each columns here return new int[getNumberOfColumns()]; } /** * @return the number of all rows available by the specified worksheet (also includes empty rows * at the end of the file) or -1 if number is unknown */ public int getNumberOfRows() { return sheetMetaData.getNumberOfRows(); } /** * @param columnMetaDatas * meta data defined by the operator * @return all names of columns that have been found empty after parsing the XLSX file */ public List<String> getEmptyColumnNames(ColumnMetaData[] columnMetaDatas) { List<String> toRemove = new LinkedList<>(); String[] columnNames = getColumnNames(); boolean[] emptyColumns = worksheetParser.getEmptyColumns(); // For all columns a meta data was defined so do not remove any column if (columnNames.length == columnMetaDatas.length) { return Collections.emptyList(); } // We can safely assume that empty columns will always have their Excel column name for (int i = 0; i < emptyColumns.length; i++) { boolean empty = emptyColumns[i]; ColumnMetaData cmd = null; if (i < columnMetaDatas.length) { cmd = columnMetaDatas[i]; } if (empty && cmd == null) { toRemove.add(columnNames[i]); } } return toRemove; } @Override public boolean hasNext() { return worksheetParser.hasNext(); } @Override public void next(ProgressListener listener) throws OperatorException { try { worksheetParser.next(readMode); } catch (XMLStreamException | ParseException e) { throw new UserError(null, e, 321, configuration.getFile(), e.getMessage()); } if (listener != null) { listener.setCompleted(getCurrentRow()); } if (operator != null && ++progressCounter % 100 == 0) { try { operator.getProgress().setCompleted((int) (worksheetParser.getCurrentPosition() / multiplier)); } catch (ProcessStoppedException e) { // Will not happen, because check for stop is deactivated } } } @Override public void reset(ProgressListener listener) throws OperatorException { try { worksheetParser.reset(XML_STREAM_FACTORY); } catch (IOException | XMLStreamException e) { throw new UserError(null, e, 321, configuration.getFile(), e.getMessage()); } // Update listener if (listener != null) { int numberOfRows = getNumberOfRows(); listener.setTotal(numberOfRows == -1 ? XlsxSheetMetaDataParser.MAXIMUM_XLSX_ROW_INDEX + 1 : numberOfRows); listener.setCompleted(0); } // Update progress if (operator != null) { operator.getProgress().setTotal(100); progressCounter = 0; multiplier = worksheetParser.getTotalSize() / 100L; } } @Override public void close() { try { if (worksheetParser != null) { worksheetParser.close(); } } catch (XMLStreamException | IOException e) { LogService.getRoot().log(Level.WARNING, I18N.getMessage(LogService.getRoot().getResourceBundle(), "com.rapidminer.operator.nio.model.ExcelResultSetConfiguration.close_workbook_error", e.getMessage()), e); } } /** * @param parameterAsBoolean * defines whether the first row should be used as names. If set to <code>true</code> * the worksheet parser will skip all beginning empty rows until the first row with * content was found. */ public void setUseFirstRowAsNames(boolean isFirstRowAsNames) { this.worksheetParser.setUseFirstRowAsNames(isFirstRowAsNames); } }