/*
* RapidMiner
*
* Copyright (C) 2001-2008 by Rapid-I and the contributors
*
* Complete list of developers available at our web site:
*
* http://rapid-i.com
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see http://www.gnu.org/licenses/.
*/
package com.rapidminer.operator.io;
import java.io.File;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import com.rapidminer.example.Attribute;
import com.rapidminer.example.Attributes;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.example.table.AttributeFactory;
import com.rapidminer.example.table.DataRow;
import com.rapidminer.example.table.DataRowFactory;
import com.rapidminer.example.table.MemoryExampleTable;
import com.rapidminer.operator.IOObject;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.UserError;
import com.rapidminer.parameter.ParameterType;
import com.rapidminer.parameter.ParameterTypeBoolean;
import com.rapidminer.parameter.ParameterTypeCategory;
import com.rapidminer.parameter.ParameterTypeFile;
import com.rapidminer.parameter.ParameterTypeInt;
import com.rapidminer.parameter.ParameterTypeString;
import com.rapidminer.tools.Ontology;
import com.rapidminer.tools.att.AttributeDataSourceCreator;
/**
* <p>This operator can be used to load data from Microsoft Excel spreadsheets.
* This operator is able to reads data from Excel 95, 97, 2000, XP, and 2003.
* The user has to define which of the spreadsheets in the workbook should be
* used as data table. The table must have a format so that each line is an example
* and each column represents an attribute. Please note that the first line might
* be used for attribute names which can be indicated by a parameter.</p>
*
* <p>The data table can be placed anywhere on the sheet and is allowed to
* contain arbitrary formatting instructions, empty rows, and empty columns. Missing data values
* are indicated by empty cells or by cells containing only "?".</p>
*
* @author Ingo Mierswa
* @version $Id: ExcelExampleSource.java,v 1.8 2008/07/07 07:06:38 ingomierswa Exp $
*/
public class ExcelExampleSource extends Operator {
/** The parameter name for "The Excel spreadsheet file which should be loaded." */
public static final String PARAMETER_EXCEL_FILE = "excel_file";
/** The parameter name for "The number of the sheet which should be imported." */
public static final String PARAMETER_SHEET_NUMBER = "sheet_number";
/** The parameter name for "Indicates if the first row should be used for the attribute names." */
public static final String PARAMETER_FIRST_ROW_AS_NAMES = "first_row_as_names";
/** The parameter name for "Indicates which column should be used for the label attribute (0: no label)" */
public static final String PARAMETER_LABEL_COLUMN = "label_column";
/** The parameter name for "Indicates which column should be used for the Id attribute (0: no id)" */
public static final String PARAMETER_ID_COLUMN = "id_column";
/** The parameter name for "Determines, how the data is represented internally." */
public static final String PARAMETER_DATAMANAGEMENT = "datamanagement";
/** The parameter name for "Character that is used as decimal point." */
public static final String PARAMETER_DECIMAL_POINT_CHARACTER = "decimal_point_character";
public ExcelExampleSource(OperatorDescription description) {
super(description);
}
public IOObject[] apply() throws OperatorException {
File file = getParameterAsFile(PARAMETER_EXCEL_FILE);
Workbook workbook = null;
try {
workbook = Workbook.getWorkbook(file);
} catch (Exception e) {
throw new UserError(this, 302, file, e.getMessage());
}
Sheet sheet = workbook.getSheet(getParameterAsInt(PARAMETER_SHEET_NUMBER));
int numberOfColumns = sheet.getColumns();
int numberOfRows = sheet.getRows();
// determine offsets
int rowOffset = 0;
int columnOffset = 0;
boolean contentFound = false;
for (int r = 0; r < numberOfRows; r++) {
for (int c = 0; c < numberOfColumns; c++) {
Cell cell = sheet.getCell(c, r);
String content = cell.getContents();
if ((content != null) && (content.trim().length() > 0)) {
columnOffset = c;
contentFound = true;
break;
}
}
if (contentFound) {
rowOffset = r;
break;
}
}
if (!contentFound) {
throw new UserError(this, 302, file, "spreadsheet seems to be empty");
}
// determine empty rows
SortedSet<Integer> emptyRows = new TreeSet<Integer>();
for (int r = rowOffset; r < numberOfRows; r++) {
boolean rowEmpty = true;
for (int c = columnOffset; c < numberOfColumns; c++) {
Cell cell = sheet.getCell(c, r);
String content = cell.getContents();
if ((content != null) && (content.trim().length() > 0)) {
rowEmpty = false;
break;
}
}
if (rowEmpty) {
emptyRows.add(r);
}
}
// determine empty columns
SortedSet<Integer> emptyColumns = new TreeSet<Integer>();
for (int c = columnOffset; c < numberOfColumns; c++) {
boolean columnEmpty = true;
for (int r = rowOffset; r < numberOfRows; r++) {
Cell cell = sheet.getCell(c, r);
String content = cell.getContents();
if ((content != null) && (content.trim().length() > 0)) {
columnEmpty = false;
break;
}
}
if (columnEmpty) {
emptyColumns.add(c);
}
}
// attribute names
String[] attributeNames = new String[numberOfColumns - columnOffset - emptyColumns.size()];
if (getParameterAsBoolean(PARAMETER_FIRST_ROW_AS_NAMES)) {
int columnCounter = 0;
for (int c = columnOffset; c < numberOfColumns; c++) {
// skip empty columns
if (emptyColumns.contains(c))
continue;
Cell cell = sheet.getCell(c, rowOffset);
attributeNames[columnCounter++] = cell.getContents();
}
} else {
for (int c = 0; c < numberOfColumns - columnOffset - emptyColumns.size(); c++) {
attributeNames[c] = file.getName() + " (" + (c + 1) + ")";
}
}
// attribute value types
char decimalPointCharacter = getParameterAsString(PARAMETER_DECIMAL_POINT_CHARACTER).charAt(0);
int[] valueTypes = new int[numberOfColumns - columnOffset - emptyColumns.size()];
for (int i = 0; i < valueTypes.length; i++)
valueTypes[i] = Ontology.INTEGER;
for (int r = rowOffset; r < numberOfRows; r++) {
// skip name row
if ((r == rowOffset) && getParameterAsBoolean(PARAMETER_FIRST_ROW_AS_NAMES))
continue;
// skip empty rows
if (emptyRows.contains(r))
continue;
String[] row = new String[numberOfColumns - columnOffset - emptyColumns.size()];
int columnCounter = 0;
for (int c = columnOffset; c < numberOfColumns; c++) {
// skip empty columns
if (emptyColumns.contains(c))
continue;
Cell cell = sheet.getCell(c, r);
row[columnCounter] = cell.getContents();
if ((row[columnCounter] == null) || (row[columnCounter].trim().length() == 0))
row[columnCounter] = "?";
columnCounter++;
}
AttributeDataSourceCreator.guessValueTypes(row, valueTypes, decimalPointCharacter);
}
// create attributes
List<Attribute> attributes = new LinkedList<Attribute>();
for (int c = 0; c < numberOfColumns - columnOffset - emptyColumns.size(); c++) {
attributes.add(AttributeFactory.createAttribute(attributeNames[c], valueTypes[c]));
}
// create and fill table
MemoryExampleTable table = new MemoryExampleTable(attributes);
DataRowFactory dataRowFactory = new DataRowFactory(getParameterAsInt(PARAMETER_DATAMANAGEMENT), decimalPointCharacter);
Attribute[] attributeArray = new Attribute[attributes.size()];
attributes.toArray(attributeArray);
for (int r = rowOffset; r < numberOfRows; r++) {
// skip name row
if ((r == rowOffset) && getParameterAsBoolean(PARAMETER_FIRST_ROW_AS_NAMES))
continue;
// skip empty rows
if (emptyRows.contains(r))
continue;
String[] row = new String[numberOfColumns - columnOffset - emptyColumns.size()];
int columnCounter = 0;
for (int c = columnOffset; c < numberOfColumns; c++) {
// skip empty columns
if (emptyColumns.contains(c))
continue;
Cell cell = sheet.getCell(c, r);
row[columnCounter] = cell.getContents();
if ((row[columnCounter] == null) || (row[columnCounter].trim().length() == 0))
row[columnCounter] = "?";
columnCounter++;
}
DataRow dataRow = dataRowFactory.create(row, attributeArray);
table.addDataRow(dataRow);
checkForStop();
}
// special attributes
Attribute labelAttribute = null;
int labelColumn = getParameterAsInt(PARAMETER_LABEL_COLUMN);
if (labelColumn != 0) {
if (labelColumn >= attributes.size() + 1)
throw new UserError(this, 111, "label_column = " + labelColumn);
labelAttribute = attributeArray[labelColumn - 1];
}
Attribute idAttribute = null;
int idColumn = getParameterAsInt(PARAMETER_ID_COLUMN);
if (idColumn != 0) {
if (idColumn >= attributes.size() + 1)
throw new UserError(this, 111, "id_column = " + idColumn);
idAttribute = attributeArray[idColumn - 1];
}
Map<Attribute, String> specialMap = new HashMap<Attribute, String>();
specialMap.put(labelAttribute, Attributes.LABEL_NAME);
specialMap.put(idAttribute, Attributes.ID_NAME);
ExampleSet exampleSet = table.createExampleSet(specialMap);
return new IOObject[] { exampleSet };
}
public Class<?>[] getInputClasses() {
return new Class[0];
}
public Class<?>[] getOutputClasses() {
return new Class[] { ExampleSet.class };
}
public List<ParameterType> getParameterTypes() {
List<ParameterType> types = super.getParameterTypes();
types.add(new ParameterTypeFile(PARAMETER_EXCEL_FILE, "The Excel spreadsheet file which should be loaded.", "xls", false));
types.add(new ParameterTypeInt(PARAMETER_SHEET_NUMBER, "The number of the sheet which should be imported.", 0, Integer.MAX_VALUE, 0));
ParameterType type = new ParameterTypeBoolean(PARAMETER_FIRST_ROW_AS_NAMES, "Indicates if the first row should be used for the attribute names.", false);
type.setExpert(false);
types.add(type);
type = new ParameterTypeInt(PARAMETER_LABEL_COLUMN, "Indicates which column should be used for the label attribute (0: no label)", 0, Integer.MAX_VALUE, 0);
type.setExpert(false);
types.add(type);
types.add(new ParameterTypeInt(PARAMETER_ID_COLUMN, "Indicates which column should be used for the Id attribute (0: no id)", 0, Integer.MAX_VALUE, 0));
types.add(new ParameterTypeString(PARAMETER_DECIMAL_POINT_CHARACTER, "Character that is used as decimal point.", "."));
types.add(new ParameterTypeCategory(PARAMETER_DATAMANAGEMENT, "Determines, how the data is represented internally.", DataRowFactory.TYPE_NAMES, DataRowFactory.TYPE_DOUBLE_ARRAY));
return types;
}
}