/*
* RapidMiner
*
* Copyright (C) 2001-2014 by RapidMiner and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapidminer.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.nio;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Date;
import java.util.List;
import jxl.Cell;
import jxl.CellType;
import jxl.DateCell;
import jxl.NumberCell;
import jxl.Sheet;
import jxl.Workbook;
import jxl.WorkbookSettings;
import jxl.biff.EmptyCell;
import jxl.format.CellFormat;
import jxl.read.biff.BiffException;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DoubleArrayDataRow;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.Annotations;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.operator.io.AbstractExampleSource;
import com.rapidminer.operator.nio.file.FileInputPortHandler;
import com.rapidminer.operator.nio.file.FileObject;
import com.rapidminer.operator.ports.InputPort;
import com.rapidminer.operator.ports.Port;
import com.rapidminer.operator.ports.metadata.MetaData;
import com.rapidminer.operator.ports.metadata.SimplePrecondition;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.PortProvider;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.Tools;
import com.rapidminer.tools.io.Encoding;
/**
*
* <p>
* This operator can be used to load data from Microsoft Excel spreadsheets.
* This operator is able to reads data from Excel 95, 97, 2000, XP, and 2003.
* The user has to define which of the spreadsheets in the workbook should be
* used as data table. The table must have a format so that each line is an
* example and each column represents an attribute. Please note that the first
* line might be used for attribute names which can be indicated by a parameter.
* </p>
*
* <p>
* The data table can be placed anywhere on the sheet and is allowed to contain
* arbitrary formatting instructions, empty rows, and empty columns. Missing
* data values are indicated by empty cells or by cells containing only
* "?".
* </p>
*
* @author Ingo Mierswa
*/
public class ExcelFormatExampleSource extends AbstractExampleSource {
/**
* The parameter name for "The Excel spreadsheet file which should be
* loaded."
*/
public static final String PARAMETER_EXCEL_FILE = "excel_file";
/**
* The parameter name for "The number of the sheet which should be
* imported."
*/
public static final String PARAMETER_SHEET_NUMBER = "sheet_number";
private InputPort fileInputPort = getInputPorts().createPort("file");
public ExcelFormatExampleSource(OperatorDescription description) {
super(description);
fileInputPort.addPrecondition(new SimplePrecondition(fileInputPort,
new MetaData(FileObject.class)) {
@Override
protected boolean isMandatory() {
return false;
}
});
}
@Override
public ExampleSet createExampleSet() throws OperatorException {
// reading configuration
String sourceAnnotation = null;
InputStream inputStream = null;
if (!fileInputPort.isConnected()) {
File inputFile = getParameterAsFile(PARAMETER_EXCEL_FILE);
try {
inputStream = new FileInputStream(inputFile);
} catch (FileNotFoundException e) {
throw new UserError(this, 302, inputFile.getPath(), e
.getMessage());
}
sourceAnnotation = inputFile.getPath();
} else {
IOObject fileObject = fileInputPort.getDataOrNull(IOObject.class);
if (fileObject != null) {
inputStream = fileInputPort.getData(FileObject.class)
.openStream();
sourceAnnotation = fileObject.getAnnotations().getAnnotation(
Annotations.KEY_SOURCE);
} else {
throw new UserError(this, 302,
"no data specified at input port");
}
}
// load the excelWorkbook if it is not set
Workbook workbook = null;
try {
Charset encoding = Encoding.getEncoding(this);
WorkbookSettings workbookSettings = new WorkbookSettings();
workbookSettings.setEncoding(encoding.name());
workbook = Workbook.getWorkbook(inputStream, workbookSettings);
} catch (IOException e) {
throw new UserError(this, 302, sourceAnnotation, e.getMessage());
} catch (BiffException e) {
throw new UserError(this, 302, sourceAnnotation, e.getMessage());
}
int sheetNumber = getParameterAsInt(PARAMETER_SHEET_NUMBER) - 1;
Sheet sheet = null;
try {
sheet = workbook.getSheet(sheetNumber);
} catch (IndexOutOfBoundsException e) {
throw new UserError(this, 953, sheetNumber);
}
int totalNumberOfColumns = sheet.getColumns();
int totalNumberOfRows = sheet.getRows();
if (totalNumberOfColumns < 0 || totalNumberOfRows < 0)
throw new UserError(this, 404);
boolean[] emptyColumns = new boolean[totalNumberOfColumns];
boolean[] emptyRows = new boolean[totalNumberOfRows];
// filling offsets
Arrays.fill(emptyColumns, true);
Arrays.fill(emptyRows, true);
// determine offsets and emptiness
boolean foundAny = false;
for (int r = 0; r < totalNumberOfRows; r++) {
for (int c = 0; c < totalNumberOfColumns; c++) {
if (emptyRows[r] || emptyColumns[c]) {
final Cell cell = sheet.getCell(c, r);
if (!(cell instanceof EmptyCell)
&& cell.getType() != CellType.EMPTY
&& !"".equals(cell.getContents().trim())) {
foundAny = true;
emptyRows[r] = false;
emptyColumns[c] = false;
}
}
}
}
if (!foundAny) {
throw new UserError(this, 302, sourceAnnotation,
"spreadsheet seems to be empty");
}
// retrieve attribute names: first count columns
int numberOfAttributes = 0;
for (int i = 0; i < totalNumberOfColumns; i++) {
if (!emptyColumns[i])
numberOfAttributes++;
}
// retrieve or generate attribute names
String[] attributeNames = new String[numberOfAttributes * 2];
int counter = 0;
for (int i = 0; i < totalNumberOfColumns; i++) {
if (!emptyColumns[i]) {
String currentName = Tools.getExcelColumnName(i);
attributeNames[counter] = currentName;
attributeNames[counter + numberOfAttributes] = currentName
+ "_format";
counter++;
}
}
// create example table
Attribute[] attributes = new Attribute[attributeNames.length];
for (int i = 0; i < attributeNames.length; i++) {
attributes[i] = AttributeFactory.createAttribute(attributeNames[i],
Ontology.NOMINAL);
}
MemoryExampleTable table = new MemoryExampleTable(attributes);
for (int r = 0; r < totalNumberOfRows; r++) {
if (emptyRows[r])
continue;
int currentC = 0;
double[] data = new double[attributes.length];
for (int c = 0; c < totalNumberOfColumns; c++) {
if (emptyColumns[c])
continue;
final Cell cell = sheet.getCell(c, r);
if (cell instanceof EmptyCell
|| cell.getType() == CellType.EMPTY
|| cell.getType() == CellType.ERROR
|| cell.getType() == CellType.FORMULA_ERROR
|| cell.getContents() == null
|| "".equals(cell.getContents().trim())) {
data[currentC] = Double.NaN;
data[currentC + numberOfAttributes] = Double.NaN;
} else {
final CellType type = cell.getType();
if ((type == CellType.NUMBER)
|| (type == CellType.NUMBER_FORMULA)) {
final double value = ((NumberCell) cell).getValue();
data[currentC] = attributes[currentC].getMapping()
.mapString(value + "");
} else if ((type == CellType.DATE)
|| (type == CellType.DATE_FORMULA)) {
Date date = ((DateCell) cell).getDate();
data[currentC] = attributes[currentC].getMapping()
.mapString(Tools.formatDateTime(date));
} else {
data[currentC] = attributes[currentC].getMapping()
.mapString(cell.getContents());
}
final CellFormat cellFormat = cell.getCellFormat();
StringBuffer formatInfo = new StringBuffer();
formatInfo.append("background: "
+ cellFormat.getBackgroundColour().getDescription()
+ "; ");
formatInfo.append("pattern: "
+ cellFormat.getPattern().getDescription() + "; ");
formatInfo.append("foreground: "
+ cellFormat.getFont().getColour().getDescription()
+ "; ");
formatInfo.append("font_name: "
+ cellFormat.getFont().getName() + "; ");
formatInfo.append("font_bold_weight: "
+ cellFormat.getFont().getBoldWeight() + "; ");
formatInfo.append("font_size: "
+ cellFormat.getFont().getPointSize() + "; ");
formatInfo.append("font_italic: "
+ cellFormat.getFont().isItalic() + "; ");
formatInfo.append("font_struckout: "
+ cellFormat.getFont().isStruckout() + ";");
data[currentC + numberOfAttributes] = attributes[currentC
+ numberOfAttributes].getMapping().mapString(
formatInfo.toString());
}
currentC++;
}
table.addDataRow(new DoubleArrayDataRow(data));
}
ExampleSet exampleSet = table.createExampleSet();
if (sourceAnnotation != null) {
exampleSet.getAnnotations().setAnnotation(Annotations.KEY_SOURCE,
sourceAnnotation);
}
return exampleSet;
}
@Override
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = super.getParameterTypes();
/*
* ParameterType fileParam = new ParameterTypeFile(PARAMETER_EXCEL_FILE,
* "Name of the excel file to read the data from.", "xls", false);
* fileParam.registerDependencyCondition(new
* InputPortNotConnectedCondition(this, new PortProvider() {
*
* @Override public Port getPort() { return fileInputPort; } }, true));
* types.add(fileParam);
*/
types.add(FileInputPortHandler.makeFileParameterType(this,
PARAMETER_EXCEL_FILE, "Name of the file to read the data from.", "xls", new PortProvider() {
@Override
public Port getPort() {
return fileInputPort;
}
}));
types.add(new ParameterTypeInt(PARAMETER_SHEET_NUMBER,
"The number of the sheet which should be imported.", 1,
Integer.MAX_VALUE, 1, false));
types.addAll(Encoding.getParameterTypes(this));
return types;
}
}