/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio.model;
import static com.rapidminer.operator.nio.ExcelExampleSource.PARAMETER_SHEET_NUMBER;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Map;
import java.util.zip.ZipFile;
import javax.swing.table.AbstractTableModel;
import javax.swing.table.TableModel;
import javax.xml.parsers.ParserConfigurationException;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.xml.sax.SAXException;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.nio.ExcelExampleSource;
import com.rapidminer.operator.nio.ExcelSheetTableModel;
import com.rapidminer.operator.nio.model.xlsx.XlsxResultSet;
import com.rapidminer.operator.nio.model.xlsx.XlsxResultSet.XlsxReadMode;
import com.rapidminer.operator.nio.model.xlsx.XlsxSheetTableModel;
import com.rapidminer.operator.nio.model.xlsx.XlsxWorkbookParser;
import com.rapidminer.operator.nio.model.xlsx.XlsxWorkbookParser.XlsxWorkbook;
import com.rapidminer.operator.ports.metadata.ExampleSetMetaData;
import com.rapidminer.parameter.UndefinedParameterError;
import com.rapidminer.tools.ProgressListener;
import com.rapidminer.tools.Tools;
import com.rapidminer.tools.io.Encoding;
import jxl.Workbook;
import jxl.WorkbookSettings;
import jxl.read.biff.BiffException;
/**
* A class holding information about configuration of the Excel Result Set
*
* @author Sebastian Land, Marco Boeck, Nils Woehler
*/
public class ExcelResultSetConfiguration implements DataResultSetFactory {
private static final String XLS_FILE_ENDING = ".xls";
private static final String XLSX_FILE_ENDING = ".xlsx";
private int rowOffset = -1;
private int columnOffset = -1;
private int rowLast = Integer.MAX_VALUE;
private int columnLast = Integer.MAX_VALUE;
/** Numbering starts at 0. */
private int sheet = -1;
private Charset encoding;
private jxl.Workbook workbookJXL;
private File workbookFile;
private boolean isEmulatingOldNames;
private String timezone;
private String datePattern;
/**
* This constructor must read in all settings from the parameters of the given operator.
*
* @throws OperatorException
*/
public ExcelResultSetConfiguration(ExcelExampleSource excelExampleSource) throws OperatorException {
if (excelExampleSource.isParameterSet(ExcelExampleSource.PARAMETER_IMPORTED_CELL_RANGE)) {
parseExcelRange(excelExampleSource.getParameterAsString(ExcelExampleSource.PARAMETER_IMPORTED_CELL_RANGE));
}
if (excelExampleSource.isParameterSet(PARAMETER_SHEET_NUMBER)) {
this.sheet = excelExampleSource.getParameterAsInt(PARAMETER_SHEET_NUMBER) - 1;
}
if (excelExampleSource.isFileSpecified()) {
this.workbookFile = excelExampleSource.getSelectedFile();
} else {
String excelParamter;
try {
excelParamter = excelExampleSource.getParameter(ExcelExampleSource.PARAMETER_EXCEL_FILE);
} catch (UndefinedParameterError e) {
excelParamter = null;
}
if (excelParamter != null && !"".equals(excelParamter)) {
File excelFile = new File(excelParamter);
if (excelFile.exists()) {
this.workbookFile = excelFile;
}
}
}
if (excelExampleSource.isParameterSet(AbstractDataResultSetReader.PARAMETER_DATE_FORMAT)) {
datePattern = excelExampleSource.getParameterAsString(AbstractDataResultSetReader.PARAMETER_DATE_FORMAT);
}
if (excelExampleSource.isParameterSet(AbstractDataResultSetReader.PARAMETER_TIME_ZONE)) {
timezone = excelExampleSource.getParameterAsString(AbstractDataResultSetReader.PARAMETER_TIME_ZONE);
}
encoding = Encoding.getEncoding(excelExampleSource);
isEmulatingOldNames = excelExampleSource.getCompatibilityLevel()
.isAtMost(ExcelExampleSource.CHANGE_5_0_11_NAME_SCHEMA);
}
/**
* This will create a completely empty result set configuration
*/
public ExcelResultSetConfiguration() {}
/**
* @return the 0-based row offset
*/
public int getRowOffset() {
return rowOffset;
}
/**
* @return the 0-based column offset
*/
public int getColumnOffset() {
return columnOffset;
}
/**
* Returns if there is an active workbook.
*/
public boolean hasWorkbook() {
return workbookJXL != null;
}
/**
* Creates an excel table model (either {@link ExcelSheetTableModel} or
* {@link Excel2007SheetTableModel}, depending on file).
*
* @param sheetIndex
* the index of the sheet (0-based)
* @param readMode
* the read mode for {@link XlsxSheetTableModel} creation. It defines whether only a
* preview or the whole sheet content will be loaded
* @param progressListener
* the progress listener to report progress to
* @return
* @throws BiffException
* @throws IOException
* @throws InvalidFormatException
*/
public AbstractTableModel createExcelTableModel(int sheetIndex, XlsxReadMode readMode, ProgressListener progressListener)
throws BiffException, IOException, InvalidFormatException, OperatorException, ParseException {
if (getFile().getAbsolutePath().endsWith(XLSX_FILE_ENDING)) {
// excel 2007 file
return new XlsxSheetTableModel(this, sheetIndex, readMode, getFile().getAbsolutePath(), progressListener);
} else {
// excel pre 2007 file
if (workbookJXL == null) {
createWorkbookJXL();
}
progressListener.setCompleted(50);
return new ExcelSheetTableModel(workbookJXL.getSheet(sheetIndex));
}
}
/**
* Returns the number of sheets in the excel file
*
* @return
* @throws IOException
* @throws BiffException
* @throws InvalidFormatException
*/
public int getNumberOfSheets() throws BiffException, IOException, InvalidFormatException, UserError {
if (getFile().getAbsolutePath().endsWith(XLSX_FILE_ENDING)) {
// excel 2007 file
try (ZipFile zipFile = new ZipFile(getFile().getAbsolutePath())) {
try {
return getNumberOfSheets(parseWorkbook(zipFile));
} catch (ParserConfigurationException | SAXException e) {
throw new UserError(null, e, "xlsx_content_malformed");
}
}
} else {
// excel pre 2007 file
if (workbookJXL == null) {
createWorkbookJXL();
}
return workbookJXL.getNumberOfSheets();
}
}
private int getNumberOfSheets(XlsxWorkbook workbook) {
return workbook.xlsxWorkbookSheets.size();
}
private XlsxWorkbook parseWorkbook(ZipFile zipFile)
throws UserError, IOException, ParserConfigurationException, SAXException {
return new XlsxWorkbookParser().parseZipEntry(zipFile);
}
/**
* Returns the names of all sheets in the excel file
*
* @return
* @throws IOException
* @throws BiffException
* @throws InvalidFormatException
*/
public String[] getSheetNames() throws BiffException, IOException, InvalidFormatException, UserError {
if (getFile().getAbsolutePath().endsWith(XLSX_FILE_ENDING)) {
// excel 2007 file
try (ZipFile zipFile = new ZipFile(getFile().getAbsolutePath())) {
XlsxWorkbook workbook;
try {
workbook = parseWorkbook(zipFile);
String[] sheetNames = new String[getNumberOfSheets(workbook)];
for (int i = 0; i < getNumberOfSheets(); i++) {
sheetNames[i] = workbook.xlsxWorkbookSheets.get(i).name;
}
return sheetNames;
} catch (ParserConfigurationException | SAXException e) {
throw new UserError(null, e, "xlsx_content_malformed");
}
}
} else {
// excel pre 2007 file
if (workbookJXL == null) {
createWorkbookJXL();
}
return workbookJXL.getSheetNames();
}
}
/**
* Returns the encoding for this configuration.
*
* @return
*/
public Charset getEncoding() {
return this.encoding;
}
/**
* @param encoding
* the new encoding
*/
public void setEncoding(Charset encoding) {
this.encoding = encoding;
}
/**
* This returns the file of the referenced excel file
*/
public File getFile() {
return workbookFile;
}
/**
* This will set the workbook file. It will assure that an existing preopened workbook will be
* closed if files differ.
*/
public void setWorkbookFile(File selectedFile) {
if (selectedFile.equals(this.workbookFile)) {
return;
}
if (workbookJXL != null) {
workbookJXL.close();
workbookJXL = null;
}
workbookFile = selectedFile;
rowOffset = 0;
columnOffset = 0;
rowLast = Integer.MAX_VALUE;
columnLast = Integer.MAX_VALUE;
sheet = 0;
}
/**
* @return the index of the last row to import. The index is 0 based. In case it is
* {@link Integer#MAX_VALUE} all rows with content should be imported.
*/
public int getRowLast() {
return rowLast;
}
public void setRowLast(int rowLast) {
this.rowLast = rowLast;
}
/**
* @return the index of the last column to import. The index is 0 based. In case it is
* {@link Integer#MAX_VALUE} all columns with content should be imported.
*/
public int getColumnLast() {
return columnLast;
}
public void setColumnLast(int columnLast) {
this.columnLast = columnLast;
}
public int getSheet() {
return sheet;
}
public void setSheet(int sheet) {
this.sheet = sheet;
}
public void setRowOffset(int rowOffset) {
this.rowOffset = rowOffset;
}
public void setColumnOffset(int columnOffset) {
this.columnOffset = columnOffset;
}
@Override
public DataResultSet makeDataResultSet(Operator operator) throws OperatorException {
return makeDataResultSet(operator, XlsxReadMode.OPERATOR);
}
/**
* Creates a {@link DataResultSet} based on the current configuration and the provided
* {@link XlsxReadMode}.
*
* @param operator
* the operator to create the {@link DataResultSet} for. Might be {@code null} in
* case no operator is available
* @param readMode
* the read mode
* @param provider
* a {@link DateFormatProvider}, can be {@code null} in which case the date format is
* fixed by the current value of {@link configuration#getDatePattern()}
* @return the created {@link DataResultSet}
* @throws OperatorException
* in case the creation fails because of an invalid configuration
*/
public DataResultSet makeDataResultSet(Operator operator, XlsxReadMode readMode, DateFormatProvider provider)
throws OperatorException {
File file = getFile();
if (file == null) {
throw new UndefinedParameterError(ExcelExampleSource.PARAMETER_EXCEL_FILE, operator);
}
String absolutePath = file.getAbsolutePath();
DataResultSet resultSet;
if (absolutePath.endsWith(XLSX_FILE_ENDING)) {
resultSet = createExcel2007ResultSet(operator, readMode, provider);
} else if (absolutePath.endsWith(XLS_FILE_ENDING)) {
// excel pre 2007 file
resultSet = new ExcelResultSet(operator, this, provider);
} else {
// we might also get a file object that has neither .xlsx nor .xls as file ending,
// so we have no choice but to try and open the file with the pre 2007 JXL lib to
// see if it works. If it does not work, it's an excel 2007 file.
try {
Workbook.getWorkbook(file);
resultSet = new ExcelResultSet(operator, this, provider);
} catch (Exception e) {
resultSet = createExcel2007ResultSet(operator, readMode, provider);
}
}
return resultSet;
}
/**
* Creates a {@link DataResultSet} based on the current configuration and the provided
* {@link XlsxReadMode}.
*
* @param operator
* the operator to create the {@link DataResultSet} for. Might be {@code null} in
* case no operator is available
* @param readMode
* the read mode
* @return the created {@link DataResultSet}
* @throws OperatorException
* in case the creation fails because of an invalid configuration
*/
public DataResultSet makeDataResultSet(Operator operator, XlsxReadMode readMode) throws OperatorException {
return makeDataResultSet(operator, readMode, null);
}
/**
* Creates a new XLSX DataResultSet for the specified operator.
*
* @param operator
* the operator which is used as error source in case something goes wrong
* @param readMode
* the read mode which should be used to read the file. The read mode defines how
* many lines should actually be read.
* @return the new XLSX DataResultSet
*/
@SuppressWarnings("deprecation")
private DataResultSet createExcel2007ResultSet(Operator operator, XlsxReadMode readMode, DateFormatProvider provider)
throws OperatorException {
if (operator == null || operator.getCompatibilityLevel().isAbove(ExcelExampleSource.CHANGE_6_2_0_OLD_XLSX_IMPORT)) {
return createXLSXResultSet(operator, readMode, provider);
} else {
return new Excel2007ResultSet(operator, this);
}
}
private XlsxResultSet createXLSXResultSet(Operator operator, XlsxReadMode readMode, DateFormatProvider provider)
throws UserError {
return new XlsxResultSet(operator, this, getSheet(), readMode, provider);
}
/**
* See class comment on {@link ExcelSheetTableModel} for a comment why that class is not used
* here. In fact we are using a {@link DefaultPreview} here as well.
*/
@Override
public TableModel makePreviewTableModel(ProgressListener listener) throws OperatorException {
File file = getFile();
if (file == null) {
throw new UserError(null, 205, ExcelExampleSource.PARAMETER_EXCEL_FILE, "");
}
String absolutePath = file.getAbsolutePath();
if (absolutePath.endsWith(XLSX_FILE_ENDING)) {
try {
return createExcelTableModel(getSheet(), XlsxReadMode.WIZARD_PREVIEW, listener);
} catch (BiffException | InvalidFormatException | IOException | ParseException e) {
throw new UserError(null, e, "xlsx_content_malformed");
}
} else {
try (final DataResultSet resultSet = makeDataResultSet(null)) {
return new DefaultPreview(resultSet, listener);
} catch (ParseException e) {
throw new UserError(null, 302, file.getPath(), e.getMessage());
}
}
}
public void closeWorkbook() {
if (workbookJXL != null) {
workbookJXL.close();
workbookJXL = null;
}
}
@Override
public void setParameters(AbstractDataResultSetReader source) {
String range = Tools.getExcelColumnName(columnOffset) + (rowOffset + 1);
// only add end range to cell range parameter if user has specified it explicitly
if (Integer.MAX_VALUE != columnLast && Integer.MAX_VALUE != rowOffset) {
range += ":" + Tools.getExcelColumnName(columnLast) + (rowLast + 1);
}
source.setParameter(ExcelExampleSource.PARAMETER_IMPORTED_CELL_RANGE, range);
source.setParameter(PARAMETER_SHEET_NUMBER, String.valueOf(sheet + 1));
source.setParameter(ExcelExampleSource.PARAMETER_EXCEL_FILE, workbookFile.getAbsolutePath());
}
public void parseExcelRange(String range) throws OperatorException {
String[] split = range.split(":", 2);
try {
int[] topLeft = parseExcelCell(split[0]);
columnOffset = topLeft[0];
rowOffset = topLeft[1];
if (split.length < 2) {
rowLast = Integer.MAX_VALUE;
columnLast = Integer.MAX_VALUE;
} else {
int[] bottomRight = parseExcelCell(split[1]);
columnLast = bottomRight[0];
rowLast = bottomRight[1];
}
} catch (OperatorException e) {
throw new UserError(null, e, 223, range);
}
}
private static int[] parseExcelCell(String string) throws OperatorException {
int i = 0;
int column = 0;
int row = 0;
while (i < string.length() && Character.isLetter(string.charAt(i))) {
char c = string.charAt(i);
c = Character.toUpperCase(c);
if (c < 'A' || c > 'Z') {
throw new UserError(null, 224, string);
}
column *= 26;
column += c - 'A' + 1;
i++;
}
if (i < string.length()) { // at least one digit left
String columnStr = string.substring(i);
try {
row = Integer.parseInt(columnStr);
} catch (NumberFormatException e) {
throw new UserError(null, 224, string);
}
}
return new int[] { column - 1, row - 1 };
}
@Override
public String getResourceName() {
return workbookFile.getAbsolutePath();
}
@Override
public ExampleSetMetaData makeMetaData() {
final ExampleSetMetaData result = new ExampleSetMetaData();
if (rowLast != Integer.MAX_VALUE) {
result.setNumberOfExamples(rowLast - rowOffset + 1);
}
return result;
}
/**
* This returns whether the old naming style should be kept from prior to 5.1.000 versions.
*/
public boolean isEmulatingOldNames() {
return isEmulatingOldNames;
}
@Override
public void close() {
if (workbookJXL != null) {
workbookJXL.close();
}
}
/**
* Creates the JXL workbook.
*
* @throws BiffException
* @throws IOException
*/
private void createWorkbookJXL() throws BiffException, IOException {
File file = getFile();
WorkbookSettings workbookSettings = new WorkbookSettings();
if (encoding != null) {
workbookSettings.setEncoding(encoding.name());
}
workbookJXL = Workbook.getWorkbook(file, workbookSettings);
}
/**
* @return the timezone
*/
public String getTimezone() {
return this.timezone;
}
/**
* @return the datePattern
*/
public String getDatePattern() {
return this.datePattern;
}
/**
* @param timezone
* the timezone to set
*/
public void setTimezone(String timezone) {
this.timezone = timezone;
}
/**
* @param datePattern
* the datePattern to set
*/
public void setDatePattern(String datePattern) {
this.datePattern = datePattern;
}
/**
* Write the parameters of the {@link ExcelResultSetConfiguration} into a map. Each parameter
* value is written as a string value.
*
* @param parameters
* the map to store the parameter to
*/
public void storeConfiguration(Map<String, String> parameters) {
File file = getFile();
parameters.put("excel.fileLocation", file != null ? file.toString() : "");
parameters.put("excel.sheet", String.valueOf(getSheet()));
parameters.put("excel.rowOffset", String.valueOf(getRowOffset()));
parameters.put("excel.rowLast", String.valueOf(getRowLast()));
parameters.put("excel.columnOffset", String.valueOf(getColumnOffset()));
parameters.put("excel.columnLast", String.valueOf(getColumnLast()));
}
}