XlsxSheetContentParser.java example

Explorer
rapidminer-studio-master
- doc
  - doc
- src
/**
 * Copyright (C) 2001-2017 by RapidMiner and the contributors
 * 
 * Complete list of developers available at our web site:
 * 
 * http://rapidminer.com
 * 
 * This program is free software: you can redistribute it and/or modify it under the terms of the
 * GNU Affero General Public License as published by the Free Software Foundation, either version 3
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Affero General Public License for more details.
 * 
 * You should have received a copy of the GNU Affero General Public License along with this program.
 * If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio.model.xlsx;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;

import org.apache.tika.io.CountingInputStream;
import org.xml.sax.Attributes;

import com.rapidminer.operator.nio.model.ParseException;
import com.rapidminer.operator.nio.model.ParsingError;
import com.rapidminer.operator.nio.model.ParsingError.ErrorCode;
import com.rapidminer.operator.nio.model.xlsx.XlsxResultSet.XlsxReadMode;
import com.rapidminer.operator.nio.model.xlsx.XlsxUtilities.XlsxCell;


/**
 * A StAX parser for XLSX worksheets based on specifications from ECMA-376, 4th Edition.
 *
 * @author Nils Woehler
 * @since 6.3.0
 */
class XlsxSheetContentParser implements AutoCloseable {

	/**
	 * An enumeration representing the cell's data type.
	 *
	 * @see ECMA-376, 4th Edition, 18.3.1.4 Cell (p. 1589)
	 */
	private static final String ATT_CELL_TYPE = "t";

	/**
	 * The index of this cell's style. Style records are stored in the Styles Part.
	 *
	 * @see ECMA-376, 4th Edition, 18.3.1.4 Cell (p. 1589)
	 */
	private static final String ATT_CELL_STYLE = "s";

	/**
	 * This element expresses the value contained in a cell. If the cell contains a string, then
	 * this value is an index into the shared string table, pointing to the actual string value.
	 * Otherwise, the value of the cell is expressed directly in this element.
	 *
	 * Cells containing formulas express the last calculated result of the formula in this element.
	 * For applications not wanting to implement the shared string table, an 'inline string' can be
	 * expressed in an <is> element under <c> (instead of a <v> element under <c>),in the same way a
	 * string would be expressed in the shared string table.
	 *
	 * [Example: In this example, cell B4 contains the number "360", cell C4 contains the local date
	 * and time 22 November 1976, 08:30, and cell C5 contains the 1900 date system serial date-time
	 * for the date-time in cell C4.
	 *
	 * <pre>
	 * {@code
	 * <c r="B4">
	 * 	<v>360</v>
	 * </c>
	 * <c r="C4" t="d">
	 * 	<v>1976-11-22T08:30</v>
	 * </c>
	 * <c r="C5">
	 * 	<f>C4</f>
	 * 	<v>28086.3541666667</v>
	 * </c>
	 * }
	 * </pre>
	 *
	 * end example]
	 *
	 * @see ECMA-376, 4th Edition, 18.3.1.96 Cell Value (pp. 1699 ff.)
	 */
	private static final String TAG_VALUE = "v";

	/**
	 * This element allows for strings to be expressed directly in the cell definition instead of
	 * implementing the shared string table.<br/>
	 * [Example:
	 *
	 * <pre>
	 * {@code
	 * <c r="A1">
	 *  <is>
	 *   <t>String</t>
	 *  </is>
	 * </c>
	 * }
	 * </pre>
	 *
	 * end example]
	 *
	 * @see ECMA-376, 4th Edition, 18.3.1.53 Rich Text Inline (pp. 1648 ff.)
	 */
	private static final String TAG_INLINE_STRING = "is";

	/** The XLSX file itself */
	private final File xlsxFile;

	/** The path to the worksheet within the XLSX zip file */
	private final String workbookZipEntryPath;

	/** Strings shared in multiple sheets */
	private final String[] sharedStrings;

	/** Number formats defined for XLSX cells */
	private final XlsxNumberFormats numberFormats;

	/** The ZipFile object used to open the {@link #reader} */
	private ZipFile xlsxZipFile;

	/** Reads XML stream */
	private XMLStreamReader reader;

	/** CountingInputStream that is used to determine the operator progress */
	private CountingInputStream cis;

	/** The current row content. */
	private XlsxCell[] currentRowContent;

	/**
	 * Keeps track of the current {@code 0-based} row index that was extracted from the XLSX sheet.
	 * Is {@code -1} if no row was parsed yet.
	 */
	private int currentRowIndex;

	/** Cache for the next row that contains element */
	private XlsxCell[] nextRowWithContent;

	/**
	 * The index of the row that has already been parsed ({@code 0-based}). Is {@code -1} if no row
	 * has been parsed yet. <br/>
	 */
	private int parsedRowIndex;

	/** A flag which represents if more rows with content are available */
	private boolean hasMoreContent;

	/**
	 * The parsed sheet meta data.
	 */
	private XlsxSheetMetaData sheetMetaData;

	/**
	 * An array which stores information about the content of a column.
	 */
	private final boolean[] emptyColumn;

	/**
	 * Flag that defines whether the first empty rows should be skipped.
	 */
	private boolean isUseFirstRowAsNames;

	/** The encoding used to read the XML stream */
	private final Charset encoding;

	/**
	 * Constructs object by saving the shared strings and instantiating a XML reader.
	 *
	 * @param xlsxFile
	 *            The xlsx file itself
	 * @param workbookZipEntryPath
	 *            the path of the workbook Zip entry
	 * @param sharedStrings
	 *            A ordered list of shared strings to generate complete cell values.
	 * @param numberFormats
	 *            the parsed XLSX number formats
	 * @param maximumCellRange
	 *            the maximum cell that should be parsed
	 * @param columnOffset
	 *            the offset of the first column to use.
	 * @throws XMLStreamException
	 *             On errors creating a XML stream reader.
	 * @throws IOException
	 *             in case opening the workbook does not work
	 */
	public XlsxSheetContentParser(File xlsxFile, String workbookZipEntryPath, String[] sharedStrings,
			XlsxNumberFormats numberFormats, XlsxSheetMetaData sheetMetaData, XMLInputFactory factory, Charset encoding)
			throws XMLStreamException, IOException {
		this.xlsxFile = xlsxFile;
		this.workbookZipEntryPath = workbookZipEntryPath;
		this.sharedStrings = sharedStrings;
		this.numberFormats = numberFormats;
		this.sheetMetaData = sheetMetaData;
		this.encoding = encoding;
		this.emptyColumn = new boolean[sheetMetaData.getNumberOfColumns()];
		Arrays.fill(emptyColumn, true);
		reset(factory);
	}

	/**
	 * Continues parsing the XML stream until a complete row is found.
	 *
	 * @param readMode
	 *            defines whether reading is done from an operator which means the parameters should
	 *            be obeyed or if reading is done from a Wizard which means all data should be read
	 * @throws XMLStreamException
	 *             If there is an error processing the underlying XML source.
	 * @throws ParseException
	 *             If there is an error on parsing a single XML item.
	 */
	public void next(XlsxReadMode readMode) throws XMLStreamException, ParseException {

		// If reading from an operator or wizard preview skip rows up to row index before actual
		// first row that should be parsed
		if (readMode != XlsxReadMode.WIZARD_WORKPANE) {
			skipToStartRow();
		}

		// If no row has been parsed yet ..
		if (nextRowWithContent == null) {
			// parse next row with content
			this.nextRowWithContent = parseNextRowWithContent();
		}

		// Increase current row index
		++currentRowIndex;

		/*
		 * The current row is empty if the row index is smaller than the index of the next parsed
		 * row.
		 */
		boolean isRowEmpty = currentRowIndex < parsedRowIndex;

		// Check for first imported row whether empty rows should be skipped
		if (currentRowContent == null) {
			assignNextCurrentRow(isRowEmpty && (!isUseFirstRowAsNames || readMode == XlsxReadMode.WIZARD_PREVIEW));
		} else {
			// All other empty rows will be added
			assignNextCurrentRow(isRowEmpty);
		}

	}

	/**
	 * Assigns the next current row by checking whether an empty row or the next parsed row with
	 * content should be assigned. In case the next row with content was assigned as current row,
	 * the next row with content is parsed subsequently.
	 *
	 * @param assignEmptyRow
	 *            defines whether an empty row or the next row with content should be assigned
	 * @throws XMLStreamException
	 *             If there is an error processing the underlying XML source.
	 * @throws ParseException
	 *             If there is an error on parsing a single XML item.
	 */
	private void assignNextCurrentRow(boolean assignEmptyRow) throws ParseException, XMLStreamException {
		if (assignEmptyRow) {
			// If current row is empty initialize a new (empty) array for current row
			this.currentRowContent = new XlsxCell[sheetMetaData.getNumberOfColumns()];
		} else {
			// ... otherwise assign next row with content to current row and parse next row with
			// content
			this.currentRowContent = nextRowWithContent;
			this.currentRowIndex = parsedRowIndex;
			this.nextRowWithContent = parseNextRowWithContent();
		}
	}

	/**
	 * Parses next XLSX rows until a row with content was found.
	 *
	 * @return if the next row with content or an empty row in case no row with content was found
	 *         anymore
	 * @throws XMLStreamException
	 *             If there is an error processing the underlying XML source.
	 * @throws ParseException
	 *             If there is an error on parsing a single XML item.
	 */
	private XlsxCell[] parseNextRowWithContent() throws ParseException, XMLStreamException {

		boolean isRowWithContent = false;
		boolean isValue = false;
		boolean contentFound = false;
		int columnIndex = 0;

		int numberOfColumns = sheetMetaData.getNumberOfColumns();

		XlsxCell[] nextRowWithContent = new XlsxCell[numberOfColumns];
		while (!isRowWithContent && reader.hasNext()) {

			// Parse the next element
			switch (reader.next()) {
				case XMLStreamReader.START_ELEMENT:
					String startLocalName = reader.getLocalName();
					if (startLocalName.equals(XlsxUtilities.TAG_ROW)) {
						// We need to subtract 1 as XLSX indices start with 1
						String indexValue = XlsxUtilities.getAttributes(reader).getValue(XlsxUtilities.TAG_ROW_INDEX);
						try {
							parsedRowIndex = Integer.parseInt(indexValue) - 1;
						} catch (NumberFormatException e) {
							throw new ParseException(
									new ParsingError(parsedRowIndex, columnIndex, ErrorCode.FILE_SYNTAX_ERROR, indexValue));
						}
					} else if (startLocalName.equals(XlsxUtilities.TAG_CELL)) {
						Attributes attributes = XlsxUtilities.getAttributes(reader);

						// Update column index
						String cellReference = attributes.getValue(XlsxUtilities.TAG_CELL_REFERENCE);
						try {
							columnIndex = sheetMetaData
									.mapColumnIndex(XlsxUtilities.convertCellRefToCoordinates(cellReference).columnNumber);
						} catch (IllegalArgumentException e) {
							throw new ParseException(new ParsingError(parsedRowIndex, columnIndex,
									ParsingError.ErrorCode.FILE_SYNTAX_ERROR, cellReference));
						}

						// Check if the current column should be skipped
						if (sheetMetaData.isSkipColumn(columnIndex)) {
							break;
						}

						// Parse cell type and create new XLSX cell
						String cellType = attributes.getValue(ATT_CELL_TYPE);
						String cellStyle = attributes.getValue(ATT_CELL_STYLE);
						XlsxCellType type = XlsxCellType.getCellType(cellType, numberFormats, cellStyle);
						if (type != null) {
							nextRowWithContent[columnIndex] = new XlsxCell(type);
						} else {
							throw new ParseException(new ParsingError(parsedRowIndex, columnIndex,
									ParsingError.ErrorCode.FILE_SYNTAX_ERROR, cellType));
						}
					} else if (startLocalName.equals(TAG_VALUE) || startLocalName.equals(TAG_INLINE_STRING)) {
						// Information for value parsing
						isValue = true;
					}
					break;
				case XMLStreamReader.END_ELEMENT:
					String endLocalName = reader.getLocalName();
					if (endLocalName.equals(XlsxUtilities.TAG_ROW)) {
						// At least one value has been parsed for current row
						// so stop parsing any further
						isRowWithContent = contentFound;
					} else if (endLocalName.equals(TAG_VALUE) || endLocalName.equals(TAG_INLINE_STRING)) {
						// Text part ends
						isValue = false;
					}
					break;
				case XMLStreamReader.CHARACTERS:
					if (!sheetMetaData.isSkipColumn(columnIndex) && isValue) {
						String text = reader.getText();
						if (nextRowWithContent[columnIndex].cellType.equals(XlsxCellType.SHARED_STRING)) {
							nextRowWithContent[columnIndex].value = sharedStrings[Integer.parseInt(text)];
						} else {
							nextRowWithContent[columnIndex].value = text;
						}
						emptyColumn[columnIndex] = false;
						contentFound = true;
					}
					break;
				case XMLStreamReader.END_DOCUMENT:
					// end of document was reached but no new row with content was found
					this.hasMoreContent = false;
					break;
				default:
					// ignore other events
					break;
			}
		}
		return nextRowWithContent;
	}

	/**
	 * Skips all rows before the selected starting row.
	 *
	 * @throws XMLStreamException
	 *             If there is an error processing the underlying XML source.
	 */
	private void skipToStartRow() throws XMLStreamException {
		int rowIndexBeforeFirstRow = sheetMetaData.getFirstRowIndex() - 1;
		boolean skipRows = currentRowIndex < rowIndexBeforeFirstRow;
		while (skipRows && reader.hasNext()) {
			int nextEvent = reader.next();
			if (nextEvent == XMLStreamReader.START_ELEMENT) {
				// If a row was found..
				if (reader.getLocalName().equals(XlsxUtilities.TAG_ROW)) {
					// ... parse the row index (subtract 1 as XLSX indices start with 1)
					parsedRowIndex = Integer
							.parseInt(XlsxUtilities.getAttributes(reader).getValue(XlsxUtilities.TAG_ROW_INDEX)) - 1;

					// Check whether the parsed index is already
					// behind the desired end index (which means that all rows before were
					// empty)
					if (parsedRowIndex > rowIndexBeforeFirstRow) {
						currentRowIndex = rowIndexBeforeFirstRow;
						skipRows = false;
					} else {
						// Update the current row index if parsed index is still
						// before first row to read
						currentRowIndex = parsedRowIndex;
					}
				}
			} else if (nextEvent == XMLStreamReader.END_ELEMENT && reader.getLocalName().equals(XlsxUtilities.TAG_ROW)) {
				// parse until end of row if we haven't parsed too far yet
				skipRows = currentRowIndex < rowIndexBeforeFirstRow;
			}
		}
	}

	/**
	 * Returns the {@code 0-based} index of the current row, which was extracted from the worksheet.
	 *
	 * @return the {@code 0} based index of the current parsed row. Returns {@code -1} in case no
	 *         row has been parsed yet. It is increased each time calling {@link #next()} and is
	 *         reset to {@code -1} in case {@link #reset(XMLInputFactory)} is called.
	 */
	int getCurrentRowIndex() {
		return currentRowIndex;
	}

	/**
	 * @return the content of the current parsed row. Returns <code>null</code> in case no row has
	 *         been parsed yet.
	 */
	XlsxCell[] getRowContent() {
		return currentRowContent;
	}

	@Override
	public void close() throws XMLStreamException, IOException {

		// Close the XMLStreamReader
		if (reader != null) {
			reader.close();
		}

		// close Zip file and its open InputStreams
		if (xlsxZipFile != null) {
			xlsxZipFile.close();
		}
	}

	/**
	 * @return <code>true</code> in case the {@link XMLStreamReader} has more rows with content
	 *         available.
	 */
	boolean hasNext() {
		return hasMoreContent && sheetMetaData.getLastRowIndex() >= 0 && currentRowIndex < sheetMetaData.getLastRowIndex();
	}

	/**
	 * Closes the current open {@link XMLStreamReader} and creates a new one which starts the
	 * reading process at the first row. It is assumed the the XLSX content and operator
	 * configuration remain the same.
	 *
	 * @param factory
	 *            the {@link XMLInputFactory} that should be used to open the
	 *            {@link XMLStreamReader}.
	 *
	 * @throws IOException
	 *             if an I/O error has occurred
	 * @throws XMLStreamException
	 *             if there are errors freeing associated XML reader resources or creating a new XML
	 *             reader
	 */
	void reset(XMLInputFactory xmlFactory) throws IOException, XMLStreamException {

		// close open file and reader object
		close();

		// create new file and stream reader objects
		xlsxZipFile = new ZipFile(xlsxFile);
		ZipEntry workbookZipEntry = xlsxZipFile.getEntry(workbookZipEntryPath);
		if (workbookZipEntry == null) {
			throw new FileNotFoundException(
					"XLSX file is malformed. Reason: Selected workbook is missing in XLSX file. Path: "
							+ workbookZipEntryPath);
		}
		InputStream inputStream = xlsxZipFile.getInputStream(workbookZipEntry);
		cis = new CountingInputStream(inputStream);
		reader = xmlFactory.createXMLStreamReader(new InputStreamReader(cis, encoding));

		// reset other variables
		currentRowIndex = -1;
		parsedRowIndex = -1;
		currentRowContent = null;
		nextRowWithContent = null;
		hasMoreContent = true;
		Arrays.fill(emptyColumn, true);
	}

	/**
	 * @return an array which stores if an column was empty during parsing. You should only call
	 *         this method iff {@link #hasNext()} returns <code>false</code>.
	 */
	boolean[] getEmptyColumns() {
		return emptyColumn;
	}

	/**
	 * @param parameterAsBoolean
	 *            defines whether the first row should be used as names. If set to <code>true</code>
	 *            the worksheet parser will skip all beginning empty rows until the first row with
	 *            content was found.
	 */
	void setUseFirstRowAsNames(boolean isFirstRowAsNames) {
		this.isUseFirstRowAsNames = isFirstRowAsNames;
	}

	/**
	 *
	 * @return The total size of the entry data, which can be used to determine the total operator
	 *         progress
	 */
	long getTotalSize() {
		return xlsxZipFile.getEntry(workbookZipEntryPath).getSize();
	}

	/**
	 *
	 * @return The current position in the entry data, which can be used to determine the current
	 *         operator progress
	 */
	long getCurrentPosition() {
		return cis.getByteCount();
	}

}