/**
* Copyright (C) 2001-2017 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify it under the terms of the
* GNU Affero General Public License as published by the Free Software Foundation, either version 3
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
* even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License along with this program.
* If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio;
import java.text.NumberFormat;
import java.util.Arrays;
import java.util.LinkedList;
import java.util.List;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.OperatorVersion;
import com.rapidminer.operator.io.AbstractReader;
import com.rapidminer.operator.nio.model.AbstractDataResultSetReader;
import com.rapidminer.operator.nio.model.DataResultSet;
import com.rapidminer.operator.nio.model.DataResultSetFactory;
import com.rapidminer.operator.nio.model.DataResultSetTranslationConfiguration;
import com.rapidminer.operator.nio.model.ExcelResultSetConfiguration;
import com.rapidminer.operator.nio.model.xlsx.XlsxResultSet;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeConfiguration;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.tools.io.Encoding;
/**
*
* <p>
* This operator can be used to load data from Microsoft Excel spreadsheets. This operator is able
* to reads data from Excel 95, 97, 2000, XP, and 2003. The user has to define which of the
* spreadsheets in the workbook should be used as data table. The table must have a format so that
* each line is an example and each column represents an attribute. Please note that the first line
* might be used for attribute names which can be indicated by a parameter.
* </p>
*
* <p>
* The data table can be placed anywhere on the sheet and is allowed to contain arbitrary formatting
* instructions, empty rows, and empty columns. Missing data values are indicated by empty cells or
* by cells containing only "?".
* </p>
*
* @author Ingo Mierswa, Tobias Malbrecht, Sebastian Loh, Sebastian Land, Marco Boeck
*/
public class ExcelExampleSource extends AbstractDataResultSetReader {
private static final String XLSX = "xlsx";
private static final String XLS = "xls";
public static final OperatorVersion CHANGE_5_0_4 = new OperatorVersion(5, 0, 4);
public static final OperatorVersion CHANGE_5_0_11_NAME_SCHEMA = new OperatorVersion(5, 0, 11);
/** Last version that used the old POI XLSX import */
public static final OperatorVersion CHANGE_6_2_0_OLD_XLSX_IMPORT = new OperatorVersion(6, 2, 0);
/**
* The parameter name for "The Excel spreadsheet file which should be loaded."
*/
public static final String PARAMETER_EXCEL_FILE = "excel_file";
/**
* The parameter name for "The number of the sheet which should be imported."
*/
public static final String PARAMETER_SHEET_NUMBER = "sheet_number";
/**
* The parameter name for "Indicates which column should be used for the label attribute
* (0: no label)"
*/
public static final String PARAMETER_LABEL_COLUMN = "label_column";
/**
* The parameter name for "Indicates which column should be used for the Id attribute (0:
* no id)"
*/
public static final String PARAMETER_ID_COLUMN = "id_column";
public static final String PARAMETER_CREATE_LABEL = "create_label";
public static final String PARAMETER_CREATE_ID = "create_id";
public static final String PARAMETER_IMPORTED_CELL_RANGE = "imported_cell_range";
static {
AbstractReader.registerReaderDescription(new ReaderDescription(XLS, ExcelExampleSource.class, PARAMETER_EXCEL_FILE));
AbstractReader
.registerReaderDescription(new ReaderDescription(XLSX, ExcelExampleSource.class, PARAMETER_EXCEL_FILE));
}
public ExcelExampleSource(final OperatorDescription description) {
super(description);
}
@Override
protected ExampleSet transformDataResultSet(DataResultSet dataResultSet) throws OperatorException {
ExampleSet exampleSet;
if (getCompatibilityLevel().isAbove(CHANGE_6_2_0_OLD_XLSX_IMPORT) && dataResultSet instanceof XlsxResultSet) {
XlsxResultSet xlsxResultSet = (XlsxResultSet) dataResultSet;
xlsxResultSet.setUseFirstRowAsNames(getParameterAsBoolean(PARAMETER_FIRST_ROW_AS_NAMES));
exampleSet = super.transformDataResultSet(dataResultSet);
// Remove attributes if they are empty and no meta data is defined
for (String attrToRemove : xlsxResultSet.getEmptyColumnNames(DataResultSetTranslationConfiguration
.readColumnMetaData(this))) {
Attribute toRemove = exampleSet.getAttributes().get(attrToRemove);
if (toRemove != null) {
exampleSet.getAttributes().remove(toRemove);
}
}
} else {
exampleSet = super.transformDataResultSet(dataResultSet);
}
return exampleSet;
}
@Override
protected DataResultSetFactory getDataResultSetFactory() throws OperatorException {
return new ExcelResultSetConfiguration(this);
}
@Override
protected NumberFormat getNumberFormat() throws OperatorException {
return null;
}
@Override
protected String getFileParameterName() {
return PARAMETER_EXCEL_FILE;
}
@Override
protected String getFileExtension() {
return XLS;
}
/** Returns the allowed file extensions. */
@Override
protected String[] getFileExtensions() {
return new String[] { XLSX, XLS };
}
@Override
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = new LinkedList<>();
ParameterType type = new ParameterTypeConfiguration(ExcelExampleSourceConfigurationWizardCreator.class, this);
type.setExpert(false);
types.add(type);
types.add(makeFileParameterType());
types.add(new ParameterTypeInt(PARAMETER_SHEET_NUMBER, "The number of the sheet which should be imported.", 1,
Integer.MAX_VALUE, 1, false));
types.add(new ParameterTypeString(PARAMETER_IMPORTED_CELL_RANGE,
"Cells to import, in Excel notation, e.g. B2:D25 or B2 for an open interval.", "A1"));
types.addAll(Encoding.getParameterTypes(this));
types.addAll(super.getParameterTypes());
return types;
}
@Override
public OperatorVersion[] getIncompatibleVersionChanges() {
OperatorVersion[] changes = super.getIncompatibleVersionChanges();
changes = Arrays.copyOf(changes, changes.length + 3);
changes[changes.length - 3] = CHANGE_5_0_4;
changes[changes.length - 2] = CHANGE_5_0_11_NAME_SCHEMA;
changes[changes.length - 1] = CHANGE_6_2_0_OLD_XLSX_IMPORT;
return changes;
}
}