/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 *
 * Complete list of developers available at our web site:
 *
 * http://rapidminer.com
 *
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
 */
package com.rapidminer.operator.nio.model.xlsx;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.xml.sax.Attributes;


/**
 * StAX parser that extracts number formats from the XLSX styles file.
 *
 * @see "ECMA-376, 4th Edition, 18.8 Styles (pp. 1744 ff.)"
 *
 * @author Nils Woehler
 * @since 6.3.0
 */
public class XlsxNumberFormatParser {

    /**
     * This element defines the number formats in this workbook, consisting of a sequence of numFmt
     * records, where each numFmt record defines a particular number format, indicating how to
     * format and render the numeric value of a cell.
     *
     * @see "ECMA-376, 4th Edition, subclause 18.8.31"
     **/
    private static final String TAG_NUMBER_FORMAT = "numFmt";

    /** Attribute of a numFmt element that holds the number format ID. */
    private static final String ATT_NUM_FORM_ID = "numFmtId";

    /** Attribute of a numFmt element that holds the actual number format code. */
    private static final String ATT_FORMAT_CODE = "formatCode";

    /**
     * This element contains the master formatting records (xf) which define the formatting applied
     * to cells in this workbook. These records are the starting point for determining the
     * formatting for a cell. Cells in the Sheet Part reference the xf records by zero-based index.
     * <br>
     * <br>
     * A cell can have both direct formatting (e.g., bold) and a cell style (e.g., Explanatory)
     * applied to it. Therefore, both the cell style xf records and cell xf records shall be read to
     * understand the full set of formatting applied to a cell.
     *
     * @see "ECMA-376, 4th Edition, subclause 18.8.10"
     */
    private static final String TAG_CELL_FORMATS = "cellXfs";

    /**
     * A single xf element describes all of the formatting for a cell.
     *
     * @see "ECMA-376, 4th Edition, subclause 18.8.45"
     */
    private static final String TAG_FORMAT = "xf";

    /** Attribute of the cellXfs element that holds the amount of cell formats it contains. */
    private static final String ATT_COUNT = "count";

    /**
     * Attribute of an xf element that holds the ID of the number format (numFmt) record used by
     * this cell format.
     */
    private static final String ATT_NUMBER_FORMAT_ID = "numFmtId";

    /** The XLSX file */
    private final File xlsxFile;

    /** The factory used to create {@link XMLStreamReader} */
    private final XMLInputFactory xmlFactory;

    /** The path of the styles file */
    private final String stylesPath;

    /**
     * Creates a parser for the styles part of the given XLSX file.
     *
     * @param xlsxFile
     *            the XLSX file (a Zip archive) to read from
     * @param stylesPath
     *            the path of the styles file; it is appended to
     *            {@link XlsxUtilities#XLSX_PATH_PREFIX} to look up the Zip entry
     * @param xmlFactory
     *            the factory used to create the {@link XMLStreamReader}
     */
    public XlsxNumberFormatParser(File xlsxFile, String stylesPath, XMLInputFactory xmlFactory) {
        this.xlsxFile = xlsxFile;
        this.stylesPath = stylesPath;
        this.xmlFactory = xmlFactory;
    }

    /**
     * Parses the XLSX styles XML file (with UTF-8 encoding) and returns the parsed number formats.
     *
     * @return the number formats stored within a {@link XlsxNumberFormats} object or {@code null}
     *         in case the XLSX file does not contain a styles entry
     * @throws IOException
     *             in case the XLSX file or the styles Zip entry cannot be opened
     * @throws XMLStreamException
     *             in case the {@link XMLInputFactory} cannot create a {@link XMLStreamReader} or
     *             the styles XML cannot be read
     * @throws NumberFormatException
     *             in case a numeric attribute (number format ID or cell format count) is missing
     *             or is not a valid integer
     * @throws XlsxException
     *             in case the styles XML content is invalid (NOTE(review): not thrown directly by
     *             this class; presumably raised by {@link XlsxUtilities} - confirm)
     */
    public XlsxNumberFormats parseNumberFormats() throws XMLStreamException, IOException {
        try (ZipFile zipFile = new ZipFile(xlsxFile)) {
            ZipEntry zipEntry = zipFile.getEntry(XlsxUtilities.XLSX_PATH_PREFIX + stylesPath);
            if (zipEntry == null) {
                // no styles defined
                return null;
            }

            try (InputStream inputStream = zipFile.getInputStream(zipEntry);
                    InputStreamReader streamReader = new InputStreamReader(inputStream, StandardCharsets.UTF_8)) {
                XMLStreamReader reader = xmlFactory.createXMLStreamReader(streamReader);
                try {
                    return readNumberFormats(reader);
                } finally {
                    // XMLStreamReader is not AutoCloseable and does not close its source, so it is
                    // released here, while the Zip entry stream is still open
                    reader.close();
                }
            }
        }
    }

    /**
     * Consumes all events of the given styles XML reader and collects the number formats (numFmt
     * elements) as well as the number format IDs referenced by the cell formats (xf elements
     * nested in cellXfs).
     */
    private XlsxNumberFormats readNumberFormats(XMLStreamReader reader) throws XMLStreamException {
        XlsxNumberFormats xlsxNumberFormats = new XlsxNumberFormats();

        // xf elements are only evaluated while inside the cellXfs element
        boolean isCellFormats = false;
        int cellFormatIndex = 0;

        while (reader.hasNext()) {
            switch (reader.next()) {
                case XMLStreamReader.START_ELEMENT: {
                    String localName = reader.getLocalName();
                    if (TAG_NUMBER_FORMAT.equals(localName)) {
                        Attributes attributes = XlsxUtilities.getAttributes(reader);
                        xlsxNumberFormats.addNumberFormat(
                                parseIntAttribute(attributes, TAG_NUMBER_FORMAT, ATT_NUM_FORM_ID),
                                attributes.getValue(ATT_FORMAT_CODE));
                    } else if (TAG_CELL_FORMATS.equals(localName)) {
                        isCellFormats = true;

                        // reserve space for all defined cell formats
                        xlsxNumberFormats.initializeCellNumberFormatIds(
                                parseIntAttribute(XlsxUtilities.getAttributes(reader), TAG_CELL_FORMATS, ATT_COUNT));
                    } else if (isCellFormats && TAG_FORMAT.equals(localName)) {
                        xlsxNumberFormats.setCellNumberFormatId(cellFormatIndex,
                                parseIntAttribute(XlsxUtilities.getAttributes(reader), TAG_FORMAT, ATT_NUMBER_FORMAT_ID));
                        ++cellFormatIndex;
                    }
                    break;
                }
                case XMLStreamReader.END_ELEMENT:
                    if (TAG_CELL_FORMATS.equals(reader.getLocalName())) {
                        isCellFormats = false;
                    }
                    break;
                default:
                    // ignore other cases
                    break;
            }
        }
        return xlsxNumberFormats;
    }

    /**
     * Reads the given attribute as an integer. A missing attribute is reported with a message that
     * names the element and the attribute instead of the bare parse error for a {@code null}
     * value.
     */
    private static int parseIntAttribute(Attributes attributes, String elementName, String attributeName) {
        String value = attributes.getValue(attributeName);
        if (value == null) {
            throw new NumberFormatException("Missing attribute '" + attributeName + "' on element '" + elementName
                    + "' in XLSX styles file");
        }
        return Integer.parseInt(value);
    }
}