/* * RapidMiner * * Copyright (C) 2001-2011 by Rapid-I and the contributors * * Complete list of developers available at our web site: * * http://rapid-i.com * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU Affero General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License * along with this program. If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.io; import java.io.IOException; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedSet; import java.util.TimeZone; import java.util.TreeSet; import jxl.Cell; import jxl.CellType; import jxl.DateCell; import jxl.NumberCell; import jxl.Sheet; import jxl.Workbook; import jxl.read.biff.BiffException; import com.rapidminer.gui.tools.dialogs.wizards.dataimport.excel.ExcelImportWizard; import com.rapidminer.operator.Annotations; import com.rapidminer.operator.OperatorDescription; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.UserError; import com.rapidminer.parameter.ParameterType; import com.rapidminer.parameter.ParameterTypeBoolean; import com.rapidminer.parameter.ParameterTypeCategory; import com.rapidminer.parameter.ParameterTypeConfiguration; import com.rapidminer.parameter.ParameterTypeFile; import com.rapidminer.parameter.ParameterTypeInt; import com.rapidminer.parameter.ParameterTypeList; import com.rapidminer.parameter.UndefinedParameterError; /** * * <p> * This operator can be used to load data from Microsoft Excel spreadsheets. * This operator is able to reads data from Excel 95, 97, 2000, XP, and 2003. * The user has to define which of the spreadsheets in the workbook should be * used as data table. The table must have a format so that each line is an * example and each column represents an attribute. Please note that the first * line might be used for attribute names which can be indicated by a parameter. * </p> * * <p> * The data table can be placed anywhere on the sheet and is allowed to contain * arbitrary formatting instructions, empty rows, and empty columns. Missing * data values are indicated by empty cells or by cells containing only * "?". * </p> * * @author Ingo Mierswa, Tobias Malbrecht, Sebastian Loh */ public class ExcelExampleSource extends AbstractDataReader { /** Pseudo-annotation to be used for attribute names. */ public static final String ANNOTATION_NAME = Annotations.ANNOTATION_NAME; /** * Pseudo-annotation to be used for original attribute names which are read * from the ANNOTATION_NAME line and later were changed by the user */ public static final String ANNOTATION_NAMES_FROM_DOCUMENT = "Name from document"; /** * The parameter name for "The Excel spreadsheet file which should be * loaded." */ public static final String PARAMETER_EXCEL_FILE = "excel_file"; static { AbstractReader.registerReaderDescription(new ReaderDescription("xls", ExcelExampleSource.class, PARAMETER_EXCEL_FILE)); } /** * The parameter name for "The number of the sheet which should be * imported." */ public static final String PARAMETER_SHEET_NUMBER = "sheet_number"; /** * The parameter name for "Indicates if the first row should be used * for the attribute names." */ public static final String PARAMETER_FIRST_ROW_AS_NAMES = "first_row_as_names"; /** * The parameter name for "Indicates which column should be used for * the label attribute (0: no label)" */ public static final String PARAMETER_LABEL_COLUMN = "label_column"; /** * The parameter name for "Indicates which column should be used for * the Id attribute (0: no id)" */ public static final String PARAMETER_ID_COLUMN = "id_column"; /** * The parameter name for "Determines, how the data is represented * internally." */ public static final String PARAMETER_DATAMANAGEMENT = "datamanagement"; public static final String PARAMETER_COLUMN_OFFSET = "column_offset"; public static final String PARAMETER_ROW_OFFSET = "row_offset"; public static final String PARAMETER_CREATE_LABEL = "create_label"; public static final String PARAMETER_CREATE_ID = "create_id"; public static final String PARAMETER_ANNOTATIONS = "annotations"; private Workbook workbook = null; private boolean keepWorkbookOpen = false; private ExcelDataSet cacheDataSet = null; private boolean skipAnnotationRows = false; private int nameRowF = -1; private Map<Integer, String> annotationsMap; private Set<Integer> annotationRows; public ExcelExampleSource(OperatorDescription description) { super(description); getParameters().addObserver(new CacheResetParameterObserver(PARAMETER_EXCEL_FILE), false); } // @Override // public ExampleSet createExampleSet() throws OperatorException { //// if (getParameterAsFile(PARAMETER_CACHED_EXCEL_FILE).equals(getParameterAsFile(PARAMETER_EXCEL_FILE)) && attributeNamesDefinedByUser()) { // if (attributeNamesDefinedByUser()){ // loadMetaDataFromParameters(); // } // return super.createExampleSet(); // } @Override public void writeMetaDataInParameter() { try { // clean up annotations in case attribute names are defined manually List<String[]> annotations = getParameterList(PARAMETER_ANNOTATIONS); List<String[]> cleanedAnnotations = new LinkedList<String[]>(); // annotation name needs to be changed into // ANNOTATION_NAMES_FROM_DOCUMENT since the reader shall // not overwrite the user defined attribute name by names which // might occur in the name row. for (String[] pair : annotations) { if (!pair[1].equals(Annotations.ANNOTATION_NAME)) { cleanedAnnotations.add(pair); } else { pair[1] = ExcelExampleSource.ANNOTATION_NAMES_FROM_DOCUMENT; cleanedAnnotations.add(pair); } } setParameter(PARAMETER_ANNOTATIONS, ParameterTypeList.transformList2String(cleanedAnnotations)); // setParameter(ExcelExampleSource.PARAMETER_CACHED_EXCEL_FILE, getParameter(PARAMETER_EXCEL_FILE)); } catch (UndefinedParameterError e) { // TODO Auto-generated catch block e.printStackTrace(); } super.writeMetaDataInParameter(); } /** * inform the excel reader to keep the workbook open due to performance * reasons. The workbook has to be closed then manually. * * @see ExcelExampleSource#closeWorkbook() */ public void keepWorkbookOpen() { keepWorkbookOpen = true; } /** * closes the the excel workbook */ public void closeWorkbook() { if (workbook != null) { workbook.close(); } } public void resetWorkbook() { closeWorkbook(); workbook = null; cacheDataSet = null; } public void skipNameAnnotationRow(boolean flag) { skipAnnotationRows = flag; } @Override protected ExcelDataSet getDataSet() throws OperatorException { if (cacheDataSet != null) { cacheDataSet.setCurrentRow(0); return cacheDataSet; } List<String[]> allAnnotations = getParameterList(PARAMETER_ANNOTATIONS); annotationRows = new HashSet<Integer>(); annotationsMap = new HashMap<Integer, String>(); boolean nameFound = false; int lastAnnotatedRow = -1; int nameRow = -1; for (String[] pair : allAnnotations) { try { final int row = Integer.parseInt(pair[0]); if (row > lastAnnotatedRow) { lastAnnotatedRow = row; } annotationsMap.put(row, pair[1]); if (Annotations.ANNOTATION_NAME.equals(pair[1])) { nameFound = true; nameRow = row; } annotationRows.add(row); } catch (NumberFormatException e) { throw new OperatorException("row_number entries in parameter list " + PARAMETER_ANNOTATIONS + " must be integers.", e); } } if (nameFound && getParameterAsBoolean(PARAMETER_FIRST_ROW_AS_NAMES)) { throw new OperatorException("If " + PARAMETER_FIRST_ROW_AS_NAMES + " is set to true, you cannot use " + Annotations.ANNOTATION_NAME + " entries in parameter list " + PARAMETER_ANNOTATIONS + "."); } if (getParameterAsBoolean(PARAMETER_FIRST_ROW_AS_NAMES)) { annotationsMap.put(0, Annotations.ANNOTATION_NAME); annotationRows.add(0); nameRow = 0; } // final int lastAnnotatedRowF = lastAnnotatedRow + 1; //+1 since the // last annotated row itself must be ignored nameRowF = nameRow; return new ExcelDataSet(); } @Override public List<ParameterType> getParameterTypes() { List<ParameterType> types = new LinkedList<ParameterType>(); ParameterType type = new ParameterTypeConfiguration(ExcelImportWizard.ExcelExampleSourceConfigurationWizardCreator.class, this); type.setExpert(false); types.add(type); types.add(new ParameterTypeFile(PARAMETER_EXCEL_FILE, "Name of the excel file to read the data from.", "xls", false)); types.add(new ParameterTypeInt(PARAMETER_SHEET_NUMBER, "The number of the sheet which should be imported.", 1, Integer.MAX_VALUE, 1, false)); types.add(new ParameterTypeInt(PARAMETER_ROW_OFFSET, "The number of rows to skip at top of sheet as they contain no usable data.", 0, 65535, 0, true)); types.add(new ParameterTypeInt(PARAMETER_COLUMN_OFFSET, "The number of columns to skip at left side of sheet as they contain no usable data.", 0, 255, 0, true)); types.add(new ParameterTypeBoolean(PARAMETER_FIRST_ROW_AS_NAMES, "Indicates if the first row should be used for the attribute names.", true, true)); List<String> annotations = new LinkedList<String>(); annotations.add(Annotations.ANNOTATION_NAME); annotations.addAll(Arrays.asList(Annotations.ALL_KEYS_ATTRIBUTE)); types.add(new ParameterTypeList(PARAMETER_ANNOTATIONS, "Maps row numbers to annotation names.", new ParameterTypeInt("row_number", "Row number which contains an annotation", 0, Integer.MAX_VALUE), new ParameterTypeCategory("annotation", "Name of the annotation to assign this row.", annotations.toArray(new String[annotations.size()]), 0))); types.addAll(super.getParameterTypes()); return types; } /** * * Dedicated DataSet class for ExcelDataSets. It is need too cache the the * DataSet in the {@link ExcelExampleSource#getDataSet()} method. * * @author Sebastian Loh (22.06.2010) * */ private class ExcelDataSet extends DataSet { private Sheet sheet = null; private Cell[] cells = null; private SortedSet<Integer> emptyRows = new TreeSet<Integer>(); private SortedSet<Integer> emptyColumns = new TreeSet<Integer>(); private int rowOffset = 0; private int columnOffset = 0; private int numberOfRows = 0; private int numberOfColumns = 0; private int currentRow; public ExcelDataSet() throws OperatorException { rowOffset = getParameterAsInt(PARAMETER_ROW_OFFSET); columnOffset = getParameterAsInt(PARAMETER_COLUMN_OFFSET); currentRow = rowOffset; // load the excelWorkbook if it is not set if (workbook == null || true) { try { workbook = Workbook.getWorkbook(getParameterAsInputStream(PARAMETER_EXCEL_FILE)); } catch (IOException e) { throw new UserError(ExcelExampleSource.this, 302, getParameter(PARAMETER_EXCEL_FILE), e.getMessage()); } catch (BiffException e) { throw new UserError(ExcelExampleSource.this, 302, getParameter(PARAMETER_EXCEL_FILE), e.getMessage()); } } try{ sheet = workbook.getSheet(getParameterAsInt(PARAMETER_SHEET_NUMBER) - 1); } catch (IndexOutOfBoundsException e) { throw new UserError(ExcelExampleSource.this, 953, getParameter(PARAMETER_SHEET_NUMBER)); } numberOfColumns = sheet.getColumns(); numberOfRows = sheet.getRows(); // TODO unifiy offset and emptiness checks in one loop // determine offsets boolean contentFound = false; for (int r = rowOffset; r < numberOfRows; r++) { for (int c = columnOffset; c < numberOfColumns; c++) { if (sheet.getCell(c, r).getType() != CellType.EMPTY && !"".equals(sheet.getCell(c, r).getContents().trim())) { columnOffset = c; contentFound = true; break; } } if (contentFound) { rowOffset = r; break; } } if (!contentFound) { throw new UserError(ExcelExampleSource.this, 302, getParameter(PARAMETER_EXCEL_FILE), "spreadsheet seems to be empty"); } // determine empty rows for (int r = rowOffset; r < numberOfRows; r++) { boolean rowEmpty = true; for (int c = columnOffset; c < numberOfColumns; c++) { if (sheet.getCell(c, r).getType() != CellType.EMPTY && !"".equals(sheet.getCell(c, r).getContents().trim())) { rowEmpty = false; break; } } if (rowEmpty) { emptyRows.add(r); } } // determine empty columns for (int c = columnOffset; c < numberOfColumns; c++) { boolean columnEmpty = true; for (int r = rowOffset; r < numberOfRows; r++) { if (sheet.getCell(c, r).getType() != CellType.EMPTY && !"".equals(sheet.getCell(c, r).getContents().trim())) { columnEmpty = false; break; } } if (columnEmpty) { emptyColumns.add(c); } } // set attribute names if (nameRowF != -1) { String[] attributeNames = new String[numberOfColumns - columnOffset - emptyColumns.size()]; int columnCounter = 0; for (int c = columnOffset; c < numberOfColumns; c++) { // skip empty columns if (!emptyColumns.contains(c)) { Cell cell = sheet.getCell(c, rowOffset + nameRowF); attributeNames[columnCounter++] = cell.getContents(); } } setAttributeNames(attributeNames); currentRow++; } // Annotations int columnCounter = 0; Annotations[] annotations = new Annotations[numberOfColumns - columnOffset - emptyColumns.size()]; boolean foundAnnotations = false; for (int c = columnOffset; c < numberOfColumns; c++) { // skip empty columns if (emptyColumns.contains(c)) continue; annotations[columnCounter] = new Annotations(); for (Map.Entry<Integer, String> entry : annotationsMap.entrySet()) { if (Annotations.ANNOTATION_NAME.equals(entry.getValue())) { continue; } else { Cell cell = sheet.getCell(c, rowOffset + entry.getKey()); annotations[columnCounter].put(entry.getValue(), cell.getContents()); foundAnnotations = true; } } columnCounter++; } if (foundAnnotations) setAnnotations(annotations); } @Override public int getNumberOfColumnsInCurrentRow() { return numberOfColumns - columnOffset - emptyColumns.size(); } @Override public boolean isMissing(int columnIndex) { return cells[columnIndex].getType() == CellType.EMPTY || cells[columnIndex].getType() == CellType.ERROR || cells[columnIndex].getType() == CellType.FORMULA_ERROR || cells[columnIndex].getContents() == null || "".equals(cells[columnIndex].getContents().trim()); } @Override public Number getNumber(int columnIndex) { try { if (cells[columnIndex].getType() == CellType.NUMBER) return Double.valueOf(((NumberCell) cells[columnIndex]).getValue()); else return Double.valueOf(cells[columnIndex].getContents()); } catch (ClassCastException e) { } catch (NumberFormatException e) {} return null; } @Override public Date getDate(int columnIndex) { try { Date date = ((DateCell) cells[columnIndex]).getDate(); if (date == null) { return null; } int offset = TimeZone.getDefault().getOffset(date.getTime()); return new Date(date.getTime() - offset); } catch (ClassCastException e) { } return null; } @Override public String getString(int columnIndex) { return cells[columnIndex].getContents(); } @Override public boolean next() { while ((emptyRows.contains(currentRow) || (skipAnnotationRows && annotationRows.contains(currentRow))) && currentRow < numberOfRows) { currentRow++; } if (currentRow >= numberOfRows) { return false; } cells = new Cell[numberOfColumns - columnOffset - emptyColumns.size()]; int columnCounter = 0; for (int c = columnOffset; c < numberOfColumns; c++) { if (emptyColumns.contains(c)) { continue; } // sheet.getRow(currentRow); cells[columnCounter] = sheet.getCell(c, currentRow); columnCounter++; } currentRow++; return true; } @Override public void close() throws OperatorException { if (!keepWorkbookOpen) { workbook.close(); } } public void setCurrentRow(int currentRow) { this.currentRow = currentRow; } } }