/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio.model.xlsx;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import org.xml.sax.Attributes;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.UserError;
/**
* StAX parser for XLSX Shared String Table.
*
* @see ECMA-376, 4th Edition, 18.4 Shared String Table (pp. 1709 ff.)
*
* @author Nils Woehler
* @since 6.3.0
*/
public class XlsxSharedStringsParser {
/**
* ECMA-376: 'optional unless uniqueCount is used'
*
* An integer representing the total count of strings in the workbook. This count does not
* include any numbers, it counts only the total of text strings in the workbook.
*
* This attribute is optional unless uniqueCount is used, in which case it is required.
*
* The possible values for this attribute are defined by the W3C XML Schema unsignedInt
* datatype.
*
* */
private static final String ATT_SHARED_STRING_TABLE_COUNT = "count";
/**
* ECMA-376: 'optional unless count is used'
*
* An integer representing the total count of unique strings in the Shared String Table. A
* string is unique even if it is a copy of another string, but has different formatting applied
* at the character level.
*
* [Example:
*
* World, [italic]World[italic], and World.
*
* The count would be 3, and the uniqueCount would be 2. Only one entry for "World" would show
* in the table because it is the same string, just with different formatting applied at the
* cell level (i.e., applied to the entire string in the cell). The "World" string would get a
* separate unique entry in the shared string table because it has different formatting applied
* to specific characters.
*
* end example]
*
* This attribute is optional unless count is used, in which case it is required.
*
* The possible values for this attribute are defined by the W3C XML Schema unsignedInt
* datatype.
*
* */
private static final String ATT_SHARED_STRING_TABLE_UNIQUE_COUNT = "uniqueCount";
/**
* Shared String Table top level element.
*
* @see ECMA-376, 4th Edition, subclause 18.4.9
*/
private static final String TAG_SHARED_STRING_TABLE = "sst";
/**
* String Item child element.
*
* @see ECMA-376, 4th Edition, subclause 18.4.8
*/
private static final String TAG_STRING_ITEM = "si";
/**
* This element represents the text content shown as part of a string.
*
* The possible values for this element are defined by the ST_Xstring simple type (22.9.2.19).
*
* @see ECMA-376, 4th Edition, subclause 18.4.12
*/
private static final String TAG_TEXT = "t";
/** The XLSX file */
private final File xlsxFile;
/** The factory used to create {@link XMLStreamReader} */
private final XMLInputFactory xmlFactory;
/**
* The path of the shared strings file.
*/
private final String sharedStringsFilePath;
public XlsxSharedStringsParser(File xlsxFile, String sharedStringsFilePath, XMLInputFactory xmlFactory) {
this.xlsxFile = xlsxFile;
this.sharedStringsFilePath = sharedStringsFilePath;
this.xmlFactory = xmlFactory;
}
/**
* Parses the XLSX shared strings XML file and returns the parsed Strings as an array.
*
* @return the parsed shared strings as an array
* @throws IOException
* in case the Shared Strings Zip entry cannot be opened
* @throws XMLStreamException
* in case the {@link XMLInputFactory} cannot create a {@link XMLStreamReader}
* @throws UserError
* in case the shared string content is malformed
* @throws XlsxException
* in case the shared string XML content is invalid
*/
public String[] parseSharedStrings(Operator op, Charset encoding) throws XMLStreamException, IOException, UserError {
boolean isCurrentTagText = false;
int numberOfItems = 0;
int stringItemCounter = 0;
String[] xlsxSharedStrings = null;
XMLStreamReader reader = null;
try (ZipFile zipFile = new ZipFile(xlsxFile)) {
ZipEntry zipEntry = zipFile.getEntry(XlsxUtilities.XLSX_PATH_PREFIX + sharedStringsFilePath);
if (zipEntry == null) {
// no shared strings defined
return new String[0];
}
InputStream inputStream = zipFile.getInputStream(zipEntry);
reader = xmlFactory.createXMLStreamReader(new InputStreamReader(inputStream, encoding));
while (reader.hasNext()) {
switch (reader.next()) {
case XMLStreamReader.START_ELEMENT:
Attributes attributes = XlsxUtilities.getAttributes(reader);
if (reader.getLocalName().equals(TAG_SHARED_STRING_TABLE)) {
// retrieve uniqueCount values
String uniqueCount = attributes.getValue(ATT_SHARED_STRING_TABLE_UNIQUE_COUNT);
if (uniqueCount != null) {
// in case uniqueCount is set use it as counter
numberOfItems = Integer.parseInt(uniqueCount);
} else {
String count = attributes.getValue(ATT_SHARED_STRING_TABLE_COUNT);
// in case only count is set, use count
if (count != null) {
numberOfItems = Integer.parseInt(count);
}
}
// initialize String array
xlsxSharedStrings = new String[numberOfItems];
} else if (reader.getLocalName().equals(TAG_TEXT)) {
// we ignore formatting stored within the Shared Table XML because we
// are only looking for the actual text
isCurrentTagText = true;
}
break;
case XMLStreamReader.END_ELEMENT:
if (reader.getLocalName().equals(TAG_STRING_ITEM)) {
stringItemCounter++;
} else if (reader.getLocalName().equals(TAG_TEXT)) {
isCurrentTagText = false;
}
break;
case XMLStreamReader.CHARACTERS:
if (isCurrentTagText) {
if (xlsxSharedStrings[stringItemCounter] == null) {
// no text found yet for current TAG_STRING_ITEM
xlsxSharedStrings[stringItemCounter] = reader.getText();
} else {
// append new text to other text for current TAG_STRING_ITEM
xlsxSharedStrings[stringItemCounter] += reader.getText();
}
}
break;
case XMLStreamReader.END_DOCUMENT:
// Final check of correctness of logic
if (stringItemCounter != numberOfItems) {
throw new UserError(op, "xlsx_content_malformed");
}
break;
default:
// ignore other cases
break;
}
}
} finally {
if (reader != null) {
reader.close();
}
}
return xlsxSharedStrings;
}
}