/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.nio.model.xlsx; import java.util.Locale; import javax.xml.stream.XMLStreamReader; import org.xml.sax.Attributes; import org.xml.sax.helpers.AttributesImpl; import com.rapidminer.operator.nio.ImportWizardUtils; /** * A class that contains utility methods for parsing XLSX files. * * @author Nils Woehler * @since 6.3.0 */ public final class XlsxUtilities { /** * The minimum limit for number of row shown in the new data access sheet selection panel * preview table. */ private static final int SHEET_SELECTION_ROW_NUMBER_LIMIT = 5_000; /** @see ECMA-376, 4th Edition, 18.3.1.4 Cell (p. 1599), 18.18.7 Cell Reference (p. 2432) */ public static final String TAG_CELL_REFERENCE = "r"; /** * Row index. Indicates to which row in the sheet this <row> definition corresponds. * * @see ECMA-376, 4th Edition, 18.3.1.73 Row (p. 1670) */ public static final String TAG_ROW_INDEX = "r"; /** * The element expresses information about an entire row of a worksheet, and contains all cell * definitions for a particular row in the worksheet. * * [Example: This row expresses information about row 2 in the worksheet, and contains 3 cell * definitions. * * <pre> * {@code * <row r="2" spans="2:12"> * <c r="C2" s="1"> * <f>PMT(B3/12,B4,-B5)</f> * <v>672.68336574300008</v> * </c> * <c r="D2"> * <v>180</v> * </c> * <c r="E2"> * <v>360</v> * </c> * </row> * } * </pre> * * @see ECMA-376, 4th Edition, 18.3.1.73 Row (pp. 1667 ff.) */ public static final String TAG_ROW = "row"; /** * This collection represents a cell in the worksheet. Information about the cell's location * (reference), value, data type, formatting, and formula is expressed here. <br/> * <br/> * [Example: This example shows the information stored for a cell whose address in the grid is * C6, whose style index is '6', and whose value metadata index is '15'. The cell contains a * formula as well as a calculated result of that formula. * * <pre> * {@code * <c r="C6" s="1" vm="15"> * <f>CUBEVALUE("xlextdat9 Adventure Works",C$5,$A6)</f> * <v>2838512.355</v> * </c> * } * </pre> * * end example] <br/> * <br/> * / While a cell can have a formula element f and a value element v, when the cell's type t is * inlineStr then only the element is is allowed as a child element. <br/> * <br/> * [Example: Here is an example of expressing a string in the cell rather than using the shared * string table. * * <pre> * {@code * <row r="1" spans="1:1"> * <c r="A1" t="inlineStr"> * <is><t>This is inline string example</t></is> * </c> * </row> * } * </pre> * * end example] * * @see ECMA-376, 4th Edition, 18.3.1.4 Cell (pp. 1588 ff.) */ public static final String TAG_CELL = "c"; /** Default prefix of paths defined inside XLSX */ public static final String XLSX_PATH_PREFIX = "xl/"; /** * Container class that contains Excel cell coordinates. */ public static final class XlsxCellCoordinates { /** * Value used to specify that the coordinates reference a column only. */ public static final int NO_ROW_NUMBER = -1; /** {@code 0} based column number */ public int columnNumber; /** * {@code 0} based row number or {@value #NO_ROW_NUMBER} in case only a column is specified. */ public int rowNumber; /** * Creates an {@link XlsxCellCoordinates} instance with only the column number defined. The * row number is set to {@value #NO_ROW_NUMBER}. * * @param columnNumber * the column number */ public XlsxCellCoordinates(int columnNumber) { this(columnNumber, NO_ROW_NUMBER); } /** * Constructs a new {@link XlsxCellCoordinates} instance with both the column and the row * number defined. * * @param columnNumber * the column number * @param rowNumber * the row number */ public XlsxCellCoordinates(int columnNumber, int rowNumber) { this.columnNumber = columnNumber; this.rowNumber = rowNumber; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + columnNumber; result = prime * result + rowNumber; return result; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } XlsxCellCoordinates other = (XlsxCellCoordinates) obj; if (columnNumber != other.columnNumber) { return false; } if (rowNumber != other.rowNumber) { return false; } return true; } @Override public String toString() { return "XlsxCellCoordinates [columnNumber=" + columnNumber + ", rowNumber=" + rowNumber + "]"; } } /** * A simple POJO object that represents a XSLX cell which consist of a {@link XlsxCellType} and * a String value. */ static final class XlsxCell { final XlsxCellType cellType; String value; XlsxCell(XlsxCellType cellType) { this.cellType = cellType; } @Override public int hashCode() { final int prime = 31; int result = 1; result = prime * result + (cellType == null ? 0 : cellType.hashCode()); result = prime * result + (value == null ? 0 : value.hashCode()); return result; } @Override public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } XlsxCell other = (XlsxCell) obj; if (cellType != other.cellType) { return false; } if (value == null) { if (other.value != null) { return false; } } else if (!value.equals(other.value)) { return false; } return true; } @Override public String toString() { return "XlsxCell [cellType=" + cellType + ", value=" + value + "]"; } } private XlsxUtilities() { throw new IllegalStateException("Cannot instantiate utility class"); } /** * Convert given Excel column name to column index, eg. 'A=0', 'AA=26' * * @param columnName * the column name * @return {@code 0} based index of the column */ public static int convertToColumnIndex(String columnName) { // ensure provided String is upper case columnName = columnName.toUpperCase(Locale.ENGLISH); int index = 0; for (int i = 0; i < columnName.length(); i++) { index *= 26; index += columnName.charAt(i) - 'A' + 1; } return index - 1; } /** * Convert given Excel column index to column name, eg. '0=A', '26=AA' * * @param columnIndex * the {@code 0} based column index * @return the column name */ public static String convertToColumnName(int index) { if (index < 0) { throw new IllegalArgumentException("Indices below 0 are not allowed"); } StringBuilder sb = new StringBuilder(); // increase by 1 as algorithm expects a 1 as starting point index++; while (index-- > 0) { sb.append((char) ('A' + index % 26)); index /= 26; } return sb.reverse().toString(); } /** * Converts a cell reference String of format 'A12' to a pair that contains the column number * (starting with {@code 0} for the first column) and the row number (starting with {@code 0} * for the first row). * * @param cellReference * the cell reference string that starts with letters and ends with digits (e.g. * 'AA123') * @return the pair with column number as first (starts with {@code 0}) and row number as second * content (starts with {@code 0}, is {@link XlsxCellCoordinates#NO_ROW_NUMBER} in case * no row number is defined) */ public static XlsxCellCoordinates convertCellRefToCoordinates(String cellReference) { for (int i = 0; i < cellReference.length(); i++) { if (Character.isDigit(cellReference.charAt(i))) { if (i == 0) { throw new IllegalArgumentException( "The provided cell reference does not contain any letters (" + cellReference + ")"); } int columnNumber = XlsxUtilities.convertToColumnIndex(cellReference.substring(0, i)); int rowNumber = Integer.parseInt(cellReference.substring(i, cellReference.length())) - 1; return new XlsxCellCoordinates(columnNumber, rowNumber); } } // no digits specified -> return coordinates without row number return new XlsxCellCoordinates(XlsxUtilities.convertToColumnIndex(cellReference)); } /** * Generates a list of attributes of the current XML item. * * The generated attributes are instantiated without namespace URI. * * @param reader * the {@link XMLStreamReader} to use * * @return The available attributes */ static Attributes getAttributes(XMLStreamReader reader) { AttributesImpl attributes = new AttributesImpl(); for (int i = 0; i < reader.getAttributeCount(); i++) { attributes.addAttribute("", reader.getAttributeLocalName(i), reader.getAttributeName(i).toString(), reader.getAttributeType(i), reader.getAttributeValue(i)); } return attributes; } /** * @return the row limit for the read mode WIZARD_SHEET_SELECTION */ public static int getSheetSelectionLength() { return Math.max(ImportWizardUtils.getPreviewLength(), SHEET_SELECTION_ROW_NUMBER_LIMIT); } }