/** * Copyright (C) 2001-2017 by RapidMiner and the contributors * * Complete list of developers available at our web site: * * http://rapidminer.com * * This program is free software: you can redistribute it and/or modify it under the terms of the * GNU Affero General Public License as published by the Free Software Foundation, either version 3 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Affero General Public License for more details. * * You should have received a copy of the GNU Affero General Public License along with this program. * If not, see http://www.gnu.org/licenses/. */ package com.rapidminer.operator.nio.model; import java.io.File; import java.io.IOException; import java.text.DateFormat; import java.text.SimpleDateFormat; import java.util.Arrays; import java.util.Date; import java.util.LinkedList; import java.util.List; import java.util.NoSuchElementException; import java.util.TimeZone; import com.rapidminer.operator.Operator; import com.rapidminer.operator.OperatorException; import com.rapidminer.operator.ProcessStoppedException; import com.rapidminer.operator.UserError; import com.rapidminer.tools.ProgressListener; import com.rapidminer.tools.Tools; import jxl.Cell; import jxl.CellType; import jxl.DateCell; import jxl.NumberCell; import jxl.Sheet; import jxl.Workbook; import jxl.WorkbookSettings; import jxl.read.biff.BiffException; /** * A DataResultSet for an Excel File. * * @author Sebastian Land, Marco Boeck * */ public class ExcelResultSet implements DataResultSet { private Sheet sheet = null; private boolean[] emptyRows; private boolean[] emptyColumns; private int rowOffset = 0; private int columnOffset = 0; private int totalNumberOfRows = 0; private int totalNumberOfColumns = 0; private int currentRow; private Cell[] currentRowCells; private Workbook workbook; // private ExcelResultSetConfiguration configuration; private String[] attributeNames; private final DateFormatProvider dateFormatProvider; private Operator operator = null; /** * The constructor to build an ExcelResultSet from the given configuration. The calling operator * might be null. It is only needed for error handling. */ public ExcelResultSet(Operator callingOperator, final ExcelResultSetConfiguration configuration, final DateFormatProvider dateFormatProvider) throws OperatorException { // reading configuration columnOffset = configuration.getColumnOffset(); rowOffset = Math.max(configuration.getRowOffset(), 0); currentRow = configuration.getRowOffset() - 1; // check range if (columnOffset > configuration.getColumnLast() || rowOffset > configuration.getRowLast() || columnOffset < 0 || rowOffset < 0) { throw new UserError(callingOperator, 223, Tools.getExcelColumnName(columnOffset) + rowOffset + ":" + Tools.getExcelColumnName(configuration.getColumnLast()) + configuration.getRowLast()); } // check file presence if (configuration.getFile() == null) { throw new UserError(callingOperator, "file_consumer.no_file_defined"); } // load the excelWorkbook if it is not set try { File file = configuration.getFile(); WorkbookSettings workbookSettings = new WorkbookSettings(); if (configuration.getEncoding() != null) { workbookSettings.setEncoding(configuration.getEncoding().name()); } workbook = Workbook.getWorkbook(file, workbookSettings); } catch (IOException e) { throw new UserError(callingOperator, 302, configuration.getFile().getPath(), e.getMessage()); } catch (BiffException e) { throw new UserError(callingOperator, 302, configuration.getFile().getPath(), e.getMessage()); } try { sheet = workbook.getSheet(configuration.getSheet()); } catch (IndexOutOfBoundsException | IllegalArgumentException e) { throw new UserError(callingOperator, 953, configuration.getSheet() + 1); } totalNumberOfColumns = Math.min(configuration.getColumnLast(), sheet.getColumns() - 1) - columnOffset + 1; totalNumberOfRows = Math.min(configuration.getRowLast(), sheet.getRows() - 1) - rowOffset + 1; if (totalNumberOfColumns < 0 || totalNumberOfRows < 0) { throw new UserError(callingOperator, 404); } emptyColumns = new boolean[totalNumberOfColumns]; emptyRows = new boolean[totalNumberOfRows]; // filling offsets Arrays.fill(emptyColumns, true); Arrays.fill(emptyRows, true); // determine offsets and emptiness boolean foundAny = false; for (int r = 0; r < totalNumberOfRows; r++) { for (int c = 0; c < totalNumberOfColumns; c++) { if (emptyRows[r] || emptyColumns[c]) { final Cell cell = sheet.getCell(c + columnOffset, r + rowOffset); if (cell.getType() != CellType.EMPTY && !"".equals(cell.getContents().trim())) { foundAny = true; emptyRows[r] = false; emptyColumns[c] = false; } } } } if (!foundAny) { throw new UserError(callingOperator, 302, configuration.getFile().getPath(), "spreadsheet seems to be empty"); } // retrieve attribute names: first count columns int numberOfAttributes = 0; List<Integer> nonEmptyColumnsList = new LinkedList<>(); for (int i = 0; i < totalNumberOfColumns; i++) { if (!emptyColumns[i]) { numberOfAttributes++; nonEmptyColumnsList.add(i); } } // retrieve or generate attribute names attributeNames = new String[nonEmptyColumnsList.size()]; if (!configuration.isEmulatingOldNames()) { for (int i = 0; i < numberOfAttributes; i++) { attributeNames[i] = Tools.getExcelColumnName(nonEmptyColumnsList.get(i)); } } else { // emulate old 5.0.x style for (int i = 0; i < numberOfAttributes; i++) { if (!emptyColumns[i]) { attributeNames[i] = "attribute_" + i; } } } final String timezone = configuration.getTimezone(); if (dateFormatProvider != null) { if (timezone != null) { this.dateFormatProvider = new DateFormatProvider() { @Override public DateFormat geDateFormat() { DateFormat format = dateFormatProvider.geDateFormat(); format.setTimeZone(TimeZone.getTimeZone(timezone)); return format; } }; } else { this.dateFormatProvider = dateFormatProvider; } } else { String datePattern = configuration.getDatePattern(); final DateFormat dateFormat = datePattern == null ? new SimpleDateFormat() : new SimpleDateFormat(datePattern); if (timezone != null) { dateFormat.setTimeZone(TimeZone.getTimeZone(timezone)); } this.dateFormatProvider = new DateFormatProvider() { @Override public DateFormat geDateFormat() { return dateFormat; } }; } if (callingOperator != null) { callingOperator.getProgress().setCheckForStop(false); callingOperator.getProgress().setTotal(totalNumberOfRows); operator = callingOperator; } } @Override public void reset(ProgressListener listener) { currentRow = rowOffset - 1; if (listener != null) { listener.setTotal(totalNumberOfRows); listener.setCompleted(0); } if (operator != null) { operator.getProgress().setTotal(totalNumberOfRows); } } @Override public boolean hasNext() { int nextRow = currentRow + 1; while (nextRow < totalNumberOfRows + rowOffset && emptyRows[nextRow - rowOffset]) { nextRow++; } return nextRow < totalNumberOfRows + rowOffset; } @Override public void next(ProgressListener listener) { currentRow++; while (currentRow < totalNumberOfRows + rowOffset && emptyRows[currentRow - rowOffset]) { currentRow++; } if (currentRow >= totalNumberOfRows + rowOffset) { throw new NoSuchElementException("No further row in excel sheet."); } currentRowCells = new Cell[attributeNames.length]; int columnCounter = 0; for (int c = 0; c < totalNumberOfColumns; c++) { if (!emptyColumns[c]) { currentRowCells[columnCounter] = sheet.getCell(c + columnOffset, currentRow); columnCounter++; } } // notifying progress listener if (listener != null) { listener.setCompleted(currentRow); } if (operator != null) { try { if (currentRow % 100 == 0) { operator.getProgress().setCompleted(currentRow); } } catch (ProcessStoppedException e) { // Will not happen, because check for stop is deactivated } } } @Override public void close() throws OperatorException { // configuration.closeWorkbook(); } @Override public int getNumberOfColumns() { return attributeNames.length; } @Override public String[] getColumnNames() { return attributeNames; } @Override public boolean isMissing(int columnIndex) { Cell cell = getCurrentCell(columnIndex); return cell.getType() == CellType.EMPTY || cell.getType() == CellType.ERROR || cell.getType() == CellType.FORMULA_ERROR || cell.getContents() == null || "".equals(cell.getContents().trim()); } private Cell getCurrentCell(int index) { // return currentRowCells[index + columnOffset]; return currentRowCells[index]; } @Override public Number getNumber(int columnIndex) throws ParseException { final Cell cell = getCurrentCell(columnIndex); if (cell.getType() == CellType.NUMBER || cell.getType() == CellType.NUMBER_FORMULA) { final double value = ((NumberCell) cell).getValue(); return Double.valueOf(value); } else { String valueString = cell.getContents(); try { return Double.valueOf(valueString); } catch (NumberFormatException e) { throw new ParseException( new ParsingError(currentRow, columnIndex, ParsingError.ErrorCode.UNPARSEABLE_REAL, valueString)); } } } @Override public Date getDate(int columnIndex) throws ParseException { final Cell cell = getCurrentCell(columnIndex); if (cell.getType() == CellType.DATE || cell.getType() == CellType.DATE_FORMULA) { Date date = ((DateCell) cell).getDate(); // hack to get actual date written in excel sheet. converts date to UTC int offset = TimeZone.getDefault().getOffset(date.getTime()); date.setTime(date.getTime() - offset); return date; } else { String valueString = cell.getContents(); try { return dateFormatProvider.geDateFormat().parse(valueString); } catch (java.text.ParseException e) { throw new ParseException( new ParsingError(currentRow, columnIndex, ParsingError.ErrorCode.UNPARSEABLE_DATE, valueString)); } } } @Override public String getString(int columnIndex) { return getCurrentCell(columnIndex).getContents(); } @Override public int[] getValueTypes() { return new int[this.attributeNames.length]; } @Override public ValueType getNativeValueType(int columnIndex) throws ParseException { final CellType type = getCurrentCell(columnIndex).getType(); if (type == CellType.EMPTY) { return ValueType.EMPTY; } else if (type == CellType.NUMBER || type == CellType.NUMBER_FORMULA) { return ValueType.NUMBER; } else if (type == CellType.DATE || type == CellType.DATE_FORMULA) { return ValueType.DATE; } else { return ValueType.STRING; } } @Override public int getCurrentRow() { return currentRow; } }