XlsxSheetMetaDataParser.java example

Explorer
rapidminer-studio-master
- doc
  - doc
- src
/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 * 
 * Complete list of developers available at our web site:
 * 
 * http://rapidminer.com
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio.model.xlsx;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.xml.sax.Attributes;

import com.rapidminer.operator.Operator;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.nio.model.ExcelResultSetConfiguration;
import com.rapidminer.operator.nio.model.xlsx.XlsxResultSet.XlsxReadMode;
import com.rapidminer.operator.nio.model.xlsx.XlsxUtilities.XlsxCellCoordinates;


/**
 * StAX parser for XLSX worksheet meta data.
 *
 * FIXME adapt It opens the selected worksheet XML file and looks for the 'dimension' element to
 * extract the worksheet cell ranges. In case the 'dimension' element cannot be found or has invalid
 * content it has to perform a full scan of the worksheet XML to extract the cell range which can -
 * depending on the size of the provided Excel file - can take some time but is necessary for
 * pyramid like structured Excel files.
 *
 * @author Nils Woehler
 * @since 6.3.0
 */
public class XlsxSheetMetaDataParser {

	/**
	 * This element specifies the used range of the worksheet. It specifies the row and column
	 * bounds of used cells in the worksheet. This is optional and is not required. Used cells
	 * include cells with formulas, text content, and cell formatting. When an entire column is
	 * formatted, only the first cell in that column is considered used.
	 *
	 * @see ECMA-376, 4th Edition, 18.3.1.35 Worksheet dimensions (p. 1617). This tag is optional.
	 */
	public static final String TAG_DIMENSION = "dimension";

	/**
	 * The row and column bounds of all cells in this worksheet. Corresponds to the range that would
	 * contain all c elements written under sheetData. Does not support whole column or whole row
	 * reference notation.
	 *
	 * @see ECMA-376, 4th Edition, 18.3.1.35 Worksheet dimensions (p. 1617)
	 */
	public static final String ATT_DIMENSION_REF = "ref";

	/**
	 * Optimization only, and not required. Specifies the range of non-empty columns (in the format
	 * X:Y) for the block of rows to which the current row belongs. To achieve the optimization,
	 * span attribute values in a single block should be the same.
	 *
	 * @see ECMA-376, 4th Edition, 18.3.1.73 Row (p. 1670)
	 */
	public static final String ATT_SPANS = "spans";

	/**
	 * Currently, as of 16.01.2015, XLSX supports up to 104.857.76 rows per sheet.
	 */
	public static final int MAXIMUM_XLSX_ROW_INDEX = 104_857_75;

	/**
	 * Currently, as of 16.01.2015, XLSX supports up to 16.384 columns per sheet.
	 */
	public static final int MAXIMUM_XLSX_COLUMN_INDEX = 16_383;

	private final File xlsxFile;
	private final XMLInputFactory xmlInputFactory;
	private final String workbookZipEntryPath;

	public XlsxSheetMetaDataParser(File xlsxFile, String workbookZipEntryPath, XMLInputFactory xmlInputFactory) {
		this.xlsxFile = xlsxFile;
		this.workbookZipEntryPath = workbookZipEntryPath;
		this.xmlInputFactory = xmlInputFactory;
	}

	/**
	 * Parses the XLSX worksheet file to retrieve sheet meta data.
	 *
	 * @param configuration
	 *            the result set configuration
	 * @param readMode
	 *
	 * @return the parsed sheet meta data
	 *
	 * @throws XMLStreamException
	 *             if there is an error processing the underlying XML source
	 * @throws IOException
	 *             if an I/O error has occurred
	 * @throws UserError
	 *             in case something is configured wrong (e.g. wrong cell parsing range)
	 */
	public XlsxSheetMetaData parseMetaData(Operator callingOperator, ExcelResultSetConfiguration configuration,
			XlsxReadMode readMode) throws XMLStreamException, IOException, UserError {

		// use 0 as first row index in case the whole column is specified
		// offset via -1 for row number
		int firstRowIndex = Math.max(configuration.getRowOffset(), 0);
		int firstColumnIndex = configuration.getColumnOffset();
		int userSpecifiedLastRow = configuration.getRowLast();
		int userSpecifiedLastColumn = configuration.getColumnLast();

		// Check configured range
		if (firstColumnIndex > userSpecifiedLastColumn || firstRowIndex > userSpecifiedLastRow || firstColumnIndex < 0
				|| firstRowIndex < 0) {
			throw new UserError(callingOperator, 223, convertOffsetToHumanReadableFormat(firstColumnIndex, firstRowIndex,
					userSpecifiedLastRow, userSpecifiedLastColumn));
		}

		if (readMode != XlsxReadMode.WIZARD_WORKPANE) {
			// If the user has specified an end range just return the specified range
			if (userSpecifiedLastColumn != Integer.MAX_VALUE) {
				return new XlsxSheetMetaData(firstColumnIndex, firstRowIndex, userSpecifiedLastColumn,
						Math.min(userSpecifiedLastRow, MAXIMUM_XLSX_ROW_INDEX));
			}
		} else {
			firstRowIndex = 0;
			firstColumnIndex = 0;
		}

		// Otherwise parse work sheet to obtain meta data
		boolean isRowWithoutSpan = false;
		int maximumColumn = -1;

		XMLStreamReader reader = null;
		try (ZipFile xlsxZipFile = new ZipFile(xlsxFile)) {
			ZipEntry workbookZipEntry = xlsxZipFile.getEntry(workbookZipEntryPath);
			if (workbookZipEntry == null) {
				throw new FileNotFoundException("Selected workbook is missing in XLSX file. Path: " + workbookZipEntryPath);
			}
			reader = xmlInputFactory.createXMLStreamReader(xlsxZipFile.getInputStream(workbookZipEntry));

			while (reader.hasNext()) {
				int eventCode = reader.next();
				if (eventCode == XMLStreamReader.START_ELEMENT) {
					Attributes attributes = XlsxUtilities.getAttributes(reader);
					switch (reader.getLocalName()) {
						case TAG_DIMENSION:
							String dimension = attributes.getValue(ATT_DIMENSION_REF);

							/*
							 * Continue with next tags and parse all rows if dimension does not have
							 * a "ref" attribute
							 */
							if (dimension == null || dimension.isEmpty()) {
								continue;
							}

							int dimColonIndex = dimension.indexOf(':');

							/*
							 * If dimension does not contain a colon (e.g. when generated by
							 * RapidMiner Studio) it is invalid and should not be evaluated,
							 * otherwise try to parse the maximum cell range.
							 */
							if (dimColonIndex != -1) {
								String maxCellRange = dimension.substring(dimColonIndex + 1);
								try {
									XlsxCellCoordinates cellRange = XlsxUtilities.convertCellRefToCoordinates(maxCellRange);
									return new XlsxSheetMetaData(firstColumnIndex, firstRowIndex, cellRange.columnNumber,
											/*
											 * Always return the maximum Integer value for the last
											 * row index to be able to read the whole excel file in
											 * case the internal format is broken (e.g. files
											 * created by Libre Office)
											 */
											Integer.MAX_VALUE);
								} catch (IllegalArgumentException e) {
									// ignore malformed cell reference and continue parsing whole
									// file
								}
							}

							/*
							 * If we haven't found the dimension yet after parsing the dimension
							 * element we have to go over all rows and count the rows and columns
							 * one by one.
							 */
							break;
						case XlsxUtilities.TAG_ROW:
							// Check if row contains "spans" attribute
							String spans = attributes.getValue(ATT_SPANS);
							if (spans != null) {
								/*
								 * In case it is present extract the maximum column range of this
								 * row if a colon character can be found
								 */
								int rowColonIndex = spans.indexOf(':');
								if (rowColonIndex != -1) {
									/*
									 * We need to subtract 1 as Excel stores columns with a 1 based
									 * index.
									 */
									int maxColumn = Integer.parseInt(spans.substring(rowColonIndex + 1, spans.length())) - 1;
									if (maxColumn > maximumColumn) {
										maximumColumn = maxColumn;
									}
								}
							} else {
								isRowWithoutSpan = true;
							}
							break;
						case XlsxUtilities.TAG_CELL:
							// only parse cells in case the row does not contain a "spans" attribute
							if (isRowWithoutSpan) {
								XlsxCellCoordinates columnAndRowIndices = XlsxUtilities
										.convertCellRefToCoordinates(attributes.getValue(XlsxUtilities.TAG_CELL_REFERENCE));
								if (columnAndRowIndices.columnNumber > maximumColumn) {
									maximumColumn = columnAndRowIndices.columnNumber;
								}
							}
							break;
						default:
							// ignore other local names
							break;
					}
				} else if (eventCode == XMLStreamReader.END_ELEMENT) {
					if (reader.getLocalName().equals(XlsxUtilities.TAG_ROW)) {
						// Closing row element -> Row completed
						isRowWithoutSpan = false;
					}
				}
			}
		} finally {
			if (reader != null) {
				reader.close();
			}
		}

		/*
		 * Always return the maximum Integer value for the last row index to be able to read the
		 * whole excel file in case the internal format is broken (e.g. files created by Libre
		 * Office)
		 */
		return new XlsxSheetMetaData(firstColumnIndex, firstRowIndex, maximumColumn, Integer.MAX_VALUE);
	}

	/**
	 * Converts the provided row and column offset and last column and row indices to a human
	 * readable Excel format so we can display a proper error message.
	 */
	private String convertOffsetToHumanReadableFormat(int columnOffset, int confRowOffset, int columnLast, int rowLast) {
		StringBuilder rangeBuilder = new StringBuilder();
		if (columnOffset >= 0) {
			rangeBuilder.append(XlsxUtilities.convertToColumnName(columnOffset));
		}
		if (confRowOffset >= 0) {
			rangeBuilder.append(confRowOffset + 1);
		}
		boolean colonAdded = false;
		if (rowLast == -1 || columnLast == -1) {
			rangeBuilder.append(":");
			colonAdded = true;
		}

		if (columnLast != Integer.MAX_VALUE && columnLast != -1) {
			if (!colonAdded) {
				rangeBuilder.append(":");
				colonAdded = true;
			}
			rangeBuilder.append(XlsxUtilities.convertToColumnName(columnLast));
		}

		if (rowLast != Integer.MAX_VALUE && rowLast != -1) {
			if (!colonAdded) {
				rangeBuilder.append(":");
				colonAdded = true;
			}
			rangeBuilder.append(rowLast + 1);
		}
		return rangeBuilder.toString();
	}
}